pbdagcon-0.3+20161121+ds/0000755000175000017500000000000013026414603013017 5ustar afifafifpbdagcon-0.3+20161121+ds/test/0000755000175000017500000000000013026414536014003 5ustar afifafifpbdagcon-0.3+20161121+ds/test/cpp/0000755000175000017500000000000013026414602014557 5ustar afifafifpbdagcon-0.3+20161121+ds/test/cpp/SimpleAlignerTest.cpp0000644000175000017500000000113613026414537020666 0ustar afifafif#include #include #include #include #include "Alignment.hpp" #include "SimpleAligner.hpp" TEST(SimpleAligner, align) { SimpleAligner sa; dagcon::Alignment a; a.id = "test"; a.start = 765; a.end = 826; a.tlen = 2092; a.strand = '-'; a.tstr = "ACAGAGATGCAAGGTAAAGTACAATTGAAAAACTAACCTCTTCCAGCGAGACTTATAGCGA"; a.qstr = "ACAGAAGATGAAGGTAAATACAATGAAAAAACTACCTCGGTTCCAGCGAGAACTATAGCGA"; sa.align(a); EXPECT_EQ("TCGCTATAAGT-CTCGCTGGAA--GAGGTTAGTTTTT-CAATTGTACTTTACCTTGCATCT-CTGT", a.tstr); EXPECT_EQ(1267, a.start); } pbdagcon-0.3+20161121+ds/test/cpp/AlignmentTest.cpp0000644000175000017500000000614313026414536020053 0ustar afifafif#include #include #include #include #include #include #include "Alignment.hpp" using namespace dagcon; std::string dataDir() { char const* val = getenv("PBDAGCON_TEST_DATA_DIR"); if (!val || !*val) { return "."; } return val; } TEST(Alignment, Normalize) { Alignment a, b; a.start = 1; // test mismatch b.qstr = "CAC"; b.tstr = "CGC"; b = normalizeGaps(b); EXPECT_EQ("C-AC", b.qstr); EXPECT_EQ("CG-C", b.tstr); // test shifting gaps on read b.qstr = "-C--CGT"; b.tstr = "CCGAC-T"; b = normalizeGaps(b); EXPECT_EQ("CCG--T", b.qstr); EXPECT_EQ("CCGACT", b.tstr); // another gap reference push a.tstr = "ATATTA---GGC"; a.qstr = "ATAT-AGCCGGC"; b = a; b = normalizeGaps(b); // query remains unchanged EXPECT_EQ(a.qstr, b.qstr); // target shifts the G bases to the left EXPECT_EQ("ATATTAG--G-C", b.tstr); } TEST(Alignment, ParseBasic) { std::string fn = dataDir() + "basic.m5"; std::ifstream file(fn.c_str()); ASSERT_TRUE(file.good()); Alignment aln; file >> aln; 
EXPECT_EQ(1, aln.start); EXPECT_EQ("CAC", aln.qstr); EXPECT_EQ("CGC", aln.tstr); file >> aln; EXPECT_EQ(1, aln.start); EXPECT_EQ("AATTGGCC", aln.qstr); EXPECT_EQ("GGCCAATT", aln.tstr); } TEST(Alignment, ParseQuery) { std::string fn = dataDir() + "parsequery.m5"; std::ifstream file(fn.c_str()); ASSERT_TRUE(file.good()); Alignment aln; Alignment::groupByTarget = false; file >> aln; EXPECT_EQ("m130110_062238_00114_c100480560100000001823075906281381_s1_p0/311/1102_3151", aln.id); EXPECT_EQ(2049, aln.tlen); EXPECT_EQ(8, aln.start); EXPECT_EQ("CTGCATGCT", aln.tstr.substr(0,9)); EXPECT_EQ("CTGCA--CT", aln.qstr.substr(0,9)); } TEST(Alignment, Trim) { std::string const t = "ACG-TCA-GCA"; std::string const q = "AC-C-C-T---"; { dagcon::Alignment aln; aln.tstr = t; aln.qstr = q; aln.start = 1; aln.strand = '-'; trimAln(aln, 0); EXPECT_EQ(1, aln.start); EXPECT_EQ(t, aln.tstr); EXPECT_EQ(q, aln.qstr); } { dagcon::Alignment aln; aln.tstr = t; aln.qstr = q; aln.start = 1; aln.strand = '-'; trimAln(aln, 3); EXPECT_EQ(4, aln.start); EXPECT_EQ("-TCA-", aln.tstr); EXPECT_EQ("C-C-T", aln.qstr); } { dagcon::Alignment aln; aln.tstr = t; aln.qstr = q; aln.start = 1; aln.strand = '-'; trimAln(aln, 4); EXPECT_EQ(5, aln.start); EXPECT_EQ("C", aln.tstr); EXPECT_EQ("C", aln.qstr); } { dagcon::Alignment aln; aln.tstr = t; aln.qstr = q; aln.start = 1; aln.strand = '-'; trimAln(aln, 5); EXPECT_EQ(6, aln.start); EXPECT_EQ("", aln.tstr); EXPECT_EQ("", aln.qstr); } { dagcon::Alignment aln; aln.tstr = t; aln.qstr = q; aln.start = 1; aln.strand = '-'; trimAln(aln, 500); // EXPECT_EQ(1 + 9, aln.start); // start could be anything, really EXPECT_EQ("", aln.tstr); EXPECT_EQ("", aln.qstr); } } pbdagcon-0.3+20161121+ds/test/cpp/parsequery.m50000644000175000017500000001271513026414537017237 0ustar afifafifm130110_062238_00114_c100480560100000001823075906281381_s1_p0/311/1102_3151/0_2049 2049 7 1640 + m130110_062238_00114_c100480560100000001823075906281381_s1_p0/311/1102_3151 2049 493 2044 - -3997 1303 93 237 
155 0 CTGCATGCT-GTGACTCTGGATAGAGTTTTCTGCCGCATGTTAATTGTAATCAAGACCGTTTAGTAAGTTCTTATCGATTATCATTACGTTTC-ATCG-C-T-ATTATCGTGGTTTTGGTAGGTCAATGGTGTTCAGGTGTGCATTGTTGTTGATGTATTTCTGCAGTATCGATTTCTGTTCTGTATCGCTGGCAT-TGTTG-TCATCAATAGG-AGATGGTAGTC--TGGTTTTTG-----------T-TAATAGATTATACGC-TGTGATAATGCTGAATTTCGCGTCTCTTCATACGCGATTGCCAGATGTTCTTGTAGTG-CAGATGTTATTTGACCTTGTATCTCTTTCAAACATCGGG-TGAGTATTATCCTTACTGTTCTTTTACATATTACATTTTGCTGATACTCGTTTAGCTTGAAACGTACTATTATCA-TGCCTAAGGAGTTTTATTAAAT--TAGTATC-A-TGAGTTTAGTAGTTCTGATG-CATATA-AGATTTGGTGGGTATTCAATGTTGTGCAGAT-CCGGTGTCTTGTCTCTCATTGCTAGACCATCTACGTAAGGTGTTTATTGTATTGGAT-AA-GGTATATGTATTTATTTAAGTCGGCATACAAATTATTTCTCATGTATGGTTTTATAAGTTCTATTATTATTATTGTTACATAATCTTCCTCATTCTGTCAATGTGCGAGAA-AT-ACTGATCACCGTGGCAAAA-TTATTATCACGAGTACGGTGGAAACGTATACTATGCCTCTCCTTTTTTTGTTAAAACAAAACATATAGATGATTAAACACAATATTAC-TACAC-ATCTCGCACTCGCGGGGATTTATTTATCT-G--AAC-TCGCTACGGCGGGTTTTG-TTTTATGGAGATGAT-AAAGTTGCA-CT-TCCGAGTCA--CAGGAAAAAAT-GGAATGGGAGAGCCCATC---TCAACAGAGTTA-CGAAGCGGAGAACCATCAACGACTGCTACGACCACTGG-AT-GATAACTGGGCGCAGATAGCA-CATGCAAACGTAACCAATATTCGAATTGAAGAGACTGAAAGAACACCAAGCCGCCTGAATGGC-GTTTTTTCTTGCGTGTACATCTCGCGCGACGACATCTCCCGAAGGCGCG-AGAGCGGCAGCGC-AGCGAGC-ACGCGCGACGTCCTCGC-C---AT-T--AC-CAAGAAAAAACCGCCA-TCAGGCGCCTTGGTGTTCTTTCAG--TCTTC-ATTCGAATATTGGTTACGTCTGCATGATGCTATCTGCGCCCA--TATCCATACCAGT-GTCGTAGCAGTCGTTGAT-G-TCTCCGC-TCGATAACTC-GTTGACCCGCT-GG--CT--CACTCCGA--TTCT-CTGCATGACTC-GACAGCTGC-ATTTTGATCATCTCCATAAAAAC-CAACCC-CCGTA-CGACGTTATCCAGA-AAATAACT-CCCGCGAGTGCGAGATTGT-TATGTAATATTG-GGTTAATCATCTATATG-TTTG---T-AC----AGA-GAG-GGCA-A-TAT-CGTTTCCACCGTACTCGTGATAATAACTTTTG-CACGGT-ATCAGTCATCTTCTCGCACATTG-CA-AACG-GG--GATT-T-T--C--T--TCAT--TAG-ACTT-TAAAAGC-TTCAT--G-AAT-ATTTGTATGCCGACCT--AT--AT-C-TAT-CCATTCAT-C--TAC-ATAAACACCTT-CGT-GAT-GTCT-GC-ATG-GAGACAAGACACCGGCA-CT-CACAACATTGA 
|||||**||*|||||*|||||*||||*|||||||*|||*|*****|*|*||******|*****|*||***|**|**||****|*|||*||**|*||||*|*|*||||*|***|**|*|*||**|**|*||*||**|*|*||*||*||**|||*|*|*|****|||||****|*****||****||||**|||||||*||*||*|*|*|**|*||*|**|*|*|*||***||****||***********|*|||*|||*||*||||*|||||*||||*||||||||||||*||||||*|*||||*||*||||*|***||||||||*||||||**|**|||****|||***|*|||*|||||||||*||*|||||*|*||||||||||||*||||||**|||**||||||||||*||||||||*||*||||*||*|*||*||*||*|*|||||||||*|*|*|||***||||||*|*||||*||||*||*|||||*|*||*|*|*||||||*||*|*||*|||||||||||*||*|*|||||||||||||||*||*|||*|||*||||*|||*||||||||||*||||***|||*||*||*|||*|*|**||**||*|||||||||||||*||**|*|||||*|*|*|||||*|||*||||**||*|**|**|**|*|*||||**||*|*||*||*|||||||||||||*||*||||||*|||||*||||||*||||||||||||||||||||||||||*|||*|*||||*||*|****|*||*|***||*||||||||||||||||*|||*|||||||||*||*||*|||||||||||||*||||*||||||*|||*|**|||*|||*|||||*|||*||*|*|||||||||||||||*|||**||||*||*|*|||||||**||*|**|*|||*||*|***|*|||*|||*|***|||||*||||||*||*|||||||*|*||||||||||||||||||*||||||*||*|||*|*|||||||||||||||*||||||*||||||||||||||||||*||*|*|||||||||||||||||||*||||||*|||||*||||||||||**|*|||*|**|*|*||||*|||*||*|*||***||*||**|*||*||*||*|**|||**|**||*|*||*|||*||||*|***||*|**||*||||||||||*|||||*|||||||*||||||||||||||||**|||||*|||||||||||||||||||*||||||*|||||||||||||||**|||*|||*|||||*||||||||||||||||||*|*|||||||*|||*||||||*|||||***|*|*||**||**||*|||*|**||*|*|||**||||||*||*||*|||*|*|||*|||||||||||*|||||**|||||*|||||*|||*|*|*|*||||*||||||*|*|||||||||||||||*|||*||*|||||||||*|*|||||||||||||||*||||***|*||****|*|*|||*||||*|*|||*||||||||||||||||||||||||||*|||||*||||||*||||||*||*|||||||||||||*||*||*|*||**||||*|*|**|**|**|*||**|||*||||*|||||*|*|*|||**|*|||*||||||||||||||*|**||**||*|*|||*||*||*||*|**|||*|||||||||||*|||*|||*||||*||*|||*|||||||||||||||*|*||*||||||||||| 
CTGCA--CTGGTGAC-CTGGA-AGAG-TTTCTGCGGCA-G-----T-T-AT------C-----G-AA---C--A-AGA--CCCGTTA-GTAACTATCGACATCATTA-C---GCATCGCTA-TTTTACGGGGTGGA-G-GT-CAATG-GGTTCAGGGA----TGCAG----G-----TG---AGTAT--CTGGCATATGATGTTGA-CGCT-GGCA-TTCGCA-TCAAAGGAGAGTGAGGATCGGTTTTGTAA-AGA-TA-ACGCTTGTGA-AATG-TGAATTTCGCGTGTCTTCACA-GCGA-TGGCAGA-G---TTGTAGTGTCAGATG--A--TGA----GTA---C-TTC-AACATCGGGTTG-GTATT-T-CTTACTGTTCTTCTACATA-AACA--TTGCTGATAC-CGTTTAGC-TG-AACG-AC-A-TA-CATTG-C-AAGGAGTTTCA-T-AATGAGAGTATCAATTGAG-TTAG-AG-TCTGA-GCCA-A-ACAGATTT-GTCGTTA-TCAATGTTGTG-AG-TGCCGGTGTCTTGTCTC-CA-TGC-AGA-CATC-ACG-AAGGTGTTTA-TGTA---GATGAATGG-ATA-G-A--TA--TAGGTCGGCATACAAA-TA--T-TCATG-AAGCTTTTA-AAG-TCTA--ATGA--A--G--A-A-AATC--CC-CGTT-TG-CAATGTGCGAGAAGATGACTGAT-ACCGT-GCAAAAGTTATTATCACGAGTACGGTGGAAACG-ATA-T-TGCC-CT-C----TCTG-T---AC-AAACATATAGATGATT-AACCCAATATTACATA-ACAATCTCGCACTCGC-GGGAGTTATTT-TCTGGATAACGTCG-TACGG-GGG-TTGGTTTTTATGGAGATGATCAAA-ATGCAGCTGT-CGAGTCATGCA-G--AGAATCGG-A---GTGAG-CCAGCGGGTCAAC-GAGTTATCG-AGCGGAG-A-CATCAACGACTGCTACGA-CACTGGTATGGAT-A-TGGGCGCAGATAGCATCATGCAGACGTAACCAATATTCGAA-TG-A-AGACTGAAAGAACACCAAGGCGCCTG-ATGGCGGTTTTTTCTT--G-GTA-A--T-G-GCGAGGACGTCGCGCG--TGCTCGCTGCGCTGCCGCTCTCGCG-CCTTCGGGAGATGTCGTCGCGCGAGATGTACACGCAAGAAAAAA-CGCCATTCAGGCGGCTTGGTGTTCTTTCAGTCTCTTCAATTCGAATATTGGTTACGTTTGCATG-TGCTATCTGCGCCCAGTTAT-CAT-CCAGTGGTCGTAGCAGTCGTTGATGGTTCTCCGCTTCG-TAACTCTGTTGA---GATGGGCTCTCCCATTCC-ATTTTTTCCTG--TGACTCGGA-AG-TGCAACTTT-ATCATCTCCAT-AAAACAAAACCCGCCGTAGCGA-G-T-T-CAGATAAATAAATCCCCGCGAGTGCGAGA-TGTGTA-GTAATATTGTGTTTAATCATCTATATGTTTTGTTTTAACAAAAAAAGGAGAGGCATAGTATACGTTTCCACCGTACTCGTGATAATAA-TTTTGCCACGGTGATCAGT-AT-TTCTCGCACATTGACAGAATGAGGAAGATTATGTAACAATAATAATAATAGAACTTATAAAACCATACATGAGAAATAATTTGTATGCCGACTTAAATAAATACATATACC-TT-ATCCAATACAATAAACACCTTACGTAGATGGTCTAGCAATGAGAGACAAGACACCGG-ATCTGCACAACATTGA pbdagcon-0.3+20161121+ds/test/cpp/AlnGraphBoostTest.cpp0000644000175000017500000000253213026414537020637 0ustar afifafif#include #include #include #include #include #include #include #include "Alignment.hpp" 
#include "AlnGraphBoost.hpp" TEST(AlnGraphBoostTest, RawConsensus) { std::string backbone = "ATATTAGGC"; AlnGraphBoost ag(backbone); std::unique_ptr algs(new dagcon::Alignment[5]); algs[0].tstr = "ATATTA---GGC"; algs[0].qstr = "ATAT-AGCCGGC"; algs[1].tstr = "ATATTA-GGC"; algs[1].qstr = "ATAT-ACGGC"; algs[2].tstr = "AT-ATTA--GGC"; algs[2].qstr = "ATCAT--CCGGC"; algs[3].tstr = "ATATTA--G-GC"; algs[3].qstr = "ATAT-ACCGAG-"; algs[4].tstr = "ATATTA---GGC"; algs[4].qstr = "ATAT-AGCCGGC"; for(int i=0; i < 5; i++) { dagcon::Alignment& ra = algs[i]; ra.id = "target"; ra.tlen = 9; ra.start = 1; } ag.addAln(algs[0]); ag.addAln(algs[1]); ag.addAln(algs[2]); ag.addAln(algs[3]); ag.addAln(algs[4]); ag.mergeNodes(); std::string expected = "ATATAGCCGGC"; const std::string actual = ag.consensus(); EXPECT_EQ(expected, actual); } TEST(AlnGraphBoostTest, DanglingNodes) { AlnGraphBoost ag(12); dagcon::Alignment a; a.tstr = "C-GCGGA-T-G-"; a.qstr = "CCGCGG-G-A-T"; ag.addAln(a); EXPECT_FALSE(ag.danglingNodes()); } pbdagcon-0.3+20161121+ds/test/cpp/makefile0000644000175000017500000000621113026414537016266 0ustar afifafif.PHONY: all check test_target_hit test_alngraph test_alignment test_simple_aligner all: THISDIR:=$(dir $(lastword ${MAKEFILE_LIST})) -include ${CURDIR}/../../defines.mk SRCDIR := ${THISDIR} INCDIRS := . 
\ ${SRCDIR} \ ${SRCDIR}/../../src/cpp \ ${DAZZ_DB_INCLUDE} \ ${DALIGNER_INCLUDE} \ ${LIBBLASR_INCLUDE} \ ${LIBPBDATA_INCLUDE} \ ${LIBPBIHDF_INCLUDE} \ ${PBBAM_INCLUDE} \ ${HDF5_INCLUDE} \ ${HTSLIB_INCLUDE} \ ${BOOST_INCLUDE} \ ${GTEST_INCLUDE} \ third-party LIBDIRS := \ ${LIBBLASR_LIB} \ ${LIBPBDATA_LIB} \ ${LIBPBIHDF_LIB} \ ${PBBAM_LIB} \ ${HDF5_LIB} \ ${HTSLIB_LIB} \ ${GCC_LIB} \ ${ZLIB_LIB} PTHREAD_LIBFLAGS:=-lpthread LDLIBS+= \ ${LIBBLASR_LIBFLAGS} \ ${LIBPBDATA_LIBFLAGS} \ ${LIBPBIHDF_LIBFLAGS} \ ${PBBAM_LIBFLAGS} \ ${HDF5_LIBFLAGS} \ ${HTSLIB_LIBFLAGS} \ ${ZLIB_LIBFLAGS} \ ${PTHREAD_LIBFLAGS} \ ${DL_LIBFLAGS} CPPFLAGS+=$(patsubst %,-I%,${INCDIRS}) LDFLAGS+=$(patsubst %,-L %,${LIBDIRS}) # For fused-src, gtest-all.cc includes gtest/*. # For non-fused, gtest-all.cc includes src/gtest-*. # So we add -I ${GTEST_SRC}/.. for the latter case. GTEST_CPPFLAGS += -isystem $(GTEST_INCLUDE) -isystem ${GTEST_SRC}/.. GTEST_CXXFLAGS += -g -Wall -Wextra -pthread CXXFLAGS := -O3 -std=c++11 GTEST_OBJECTS := gtest-all.o gtest_main.o DAZCON_OBJECTS := DB.o QV.o align.o Alignment.o \ DazAlnProvider.o PBDAGCON_OBJECTS := AlnGraphBoost.o Alignment.o \ SimpleAligner.o pbdagcon_testexes := test-alngraph \ test-alignment \ test-simple_aligner \ ${null} dazcon_testexes := test-target_hit \ test-target \ ${null} # Remove the test from the broken_testexes variable, once fixed. 
broken_testexes := test-target \ ${null} testexes := ${pbdagcon_testexes} ${dazcon_testexes} # Remove broken tests exes from the testexes list testexes := $(filter-out ${broken_testexes},${testexes}) empty:= space:=${empty} ${empty} ldp+=$(subst ${space},:,${LIBDIRS}):${LD_LIBRARY_PATH} #export LD_LIBRARY_PATH vpath %.c ${THISDIR}/../../src/cpp vpath %.cpp ${THISDIR}/../../src/cpp vpath %.cc ${GTEST_SRC} vpath align.c ${DALIGNER_SRC} vpath DB.c ${DAZZ_DB_SRC} vpath QV.c ${DAZZ_DB_SRC} #VALGRIND?=/mnt/software/v/valgrind/3.10.1/bin/valgrind --leak-check=full BUILDMSG = "=== Building $@ ===" ${GTEST_OBJECTS}: CXXFLAGS+=${GTEST_CXXFLAGS} ${GTEST_OBJECTS}: CPPFLAGS+=${GTEST_CPPFLAGS} all: check check: gtest-run gtest-run: $(testexes:%=%-gtestrun) gtest-build: ${testexes} xml: mkdir xml %-gtestrun: % | xml LD_LIBRARY_PATH=${ldp} PBDAGCON_TEST_DATA_DIR=${THISDIR} ${VALGRIND} ./$< --gtest_output=xml:xml/${ #include TEST(TargetHitTest, single_overlap_perfect) { Record rec; Overlap ovl; rec.ovl = ovl; Path path; path.abpos = 27; path.aepos = 7013; path.bbpos = 1231; path.bepos = 8217; path.diffs = 0; rec.ovl.path = path; TargetHit th; th.add(rec); th.computeOvlScore(); EXPECT_FLOAT_EQ(6986, th.ovlScore); } TEST(TargetHitTest, single_overlap_inaccurate) { Record rec; Overlap ovl; rec.ovl = ovl; Path path; path.abpos = 10; path.aepos = 5000; path.bbpos = 2000; path.bepos = 6000; path.diffs = 230; rec.ovl.path = path; TargetHit th; th.add(rec); th.computeOvlScore(); EXPECT_FLOAT_EQ(3770, th.ovlScore); } TEST(TargetHitTest, multi_overlap_inaccurate) { Record r1; Overlap o1; r1.ovl = o1; Path p1; p1.abpos = 10; p1.aepos = 5000; p1.bbpos = 2000; p1.bepos = 6000; p1.diffs = 230; r1.ovl.path = p1; Record r2; Overlap o2; r2.ovl = o2; Path p2; p2.abpos = 5005; p2.aepos = 7005; p2.bbpos = 6001; p2.bepos = 7995; p2.diffs = 53; r2.ovl.path = p2; TargetHit th; th.add(r1); th.computeOvlScore(); EXPECT_FLOAT_EQ(3770, th.ovlScore); th.add(r2); th.computeOvlScore(); EXPECT_FLOAT_EQ(4721, 
th.ovlScore); } pbdagcon-0.3+20161121+ds/test/cpp/basic.m50000644000175000017500000000017613026414537016116 0ustar afifafifid 3 0 3 + ref 3 0 3 + -40645 8129 0 0 0 254 CAC |-| CGC id 3 0 3 + ref 3 0 3 - -40645 8129 0 0 0 254 GGCCAATT |-| AATTGGCC pbdagcon-0.3+20161121+ds/test/cpp/TargetTest.cpp0000644000175000017500000000066713026414537017371 0ustar afifafif#include #include TEST(TargetTest, add_record) { Record r1, r2, r3; Overlap o1, o2, o3; o1.aread = 1; o1.bread = 3; o1.flags = 0; o2.aread = 1; o2.bread = 3; o2.flags = 0; o3.aread = 2; o3.bread = 3; o3.flags = 1; r1.ovl = o1; r2.ovl = o2; r3.ovl = o3; Target t; t.addRecord(r1); t.addRecord(r2); t.addRecord(r3); EXPECT_EQ(2, t.hits.size()); } pbdagcon-0.3+20161121+ds/README.md0000644000175000017500000001227613026414536014313 0ustar afifafif[![Build Status](https://travis-ci.org/pbjd/pbdagcon.svg?branch=master)](https://travis-ci.org/pbjd/pbdagcon) What is pbdagcon? ================= pbdagcon is a tool that implements DAGCon (Directed Acyclic Graph Consensus) which is a sequence consensus algorithm based on using directed acyclic graphs to encode multiple sequence alignment. It uses the alignment information from blasr to align sequence reads to a "backbone" sequence. Based on the underlying alignment directed acyclic graph (DAG), it will be able to use the new information from the reads to find the discrepancies between the reads and the "backbone" sequences. A dynamic programming process is then applied to the DAG to find the optimum sequence of bases as the consensus. The new consensus can be used as a new backbone sequence to iteratively improve the consensus quality. While the code is developed for processing PacBio(TM) raw sequence data, the algorithm can be used for general consensus purpose. Currently, it only takes FASTA input. For shorter read sequences, one might need to adjust the blasr alignment parameters to get the alignment string properly. 
The code and the underlying graphical data structure have been used for some algorithm development prototyping including phasing reads, pre-assembly and a workaround to generate consensus from intermediate Celera Assembler outputs. The initial graphical algorithm was a pure Python implementation. Cython was then used to speed it up. Check out the example/ directory to see how to use it. This code is released under the assumption it will help the community to adopt the PacBio data and make interesting science projects possible and more feasible. It is not an official software release from the PacBio(TM) software developing organization. Building ======== The following are instructions on how to build the C++ pbdagcon executable. The code now depends on C++11 features, in particular std::thread, std::move. GCC 4.8.1 or higher is known to work. This project requires that you have boost headers available. You can either supply them yourself or the Makefile will obtain them for you from the internet. ### Compile/Check (pbdagcon) ```sh # First, configure your build. (You can look at `defines.mk` and # `blasr_libcpp/defines.mk` to diagnose any problems.) ./configure.py --boost --gtest --sub --no-pbbam # Then, fetch and build the relevant portions of the blasr_libcpp # submodule make init-submodule # build pbdagcon executable (Makefile fetches boost headers) make # build and run unit tests make check # usage cd src/cpp ./pbdagcon --help ``` Running ======= ### Use Case: Generating consensus from BLASR alignments The most basic use case is generating a consensus from a set of alignments using the pbdagcon executable directly. At the most basic level, pbdagcon takes information from BLASR alignments sorted by target and generates fasta-formatted corrected target sequences. The alignments from BLASR can be formatted with either *-m 4* or *-m 5*. 
For *-m 4* format, the alignments must be run through a format adapter, *[m4topre.py][]*, in order to generate suitable input to *pbdagcon*. The following example shows the simplest way to generate a consensus for one target using BLASR *-m 5* alignments as input. ```sh blasr queries.fasta target.fasta -bestn 1 -m 5 -out mapped.m5 pbdagcon mapped.m5 > consensus.fasta ``` ### Use Case: Generating corrected reads from daligner alignments Support for generating consensus from daligner output has been added in the form of a new executable *dazcon*. Note that it is sensitive to the version of daligner used and may crash if given inputs generated by versions other than what is referenced in the submodules. ```sh dazcon -ox -j 4 -s subreads.db -a subreads.las > corrected.fasta ``` ### Use Case: HGAP correction of PacBio reads This use case walks through how one could use pbdagcon to correct PacBio reads. This example demonstrates how correction is performed in PacBio's "Hierarchical Genome Assembly Process" (HGAP) workflow. HGAP uses BLASR *-m 4* output. This example makes use of the *[filterm4.py][]* and *[m4topre.py][]* scripts: ```sh # First filter the m4 file to help remove chimeras filterm4.py mapped.m4 > mapped.m4.filt # Next run the m4 adapter script, generating 'pre-alignments' m4topre.py mapped.m4.filt mapped.m4.filt reads.fasta 24 > mapped.pre # Finally, correct using pbdagcon, typically using multiple consensus # threads. pbdagcon -j 4 -a mapped.pre > corrected.fasta ``` The *[pbdagcon_wf.sh][]* script automates this workflow. 
----------------------------------------------------------------------------- [m4topre.py]: src/m4topre.py 'code' [filterm4.py]: src/filterm4.py 'code' [pbdagcon_wf.sh]: src/cpp/pbdagcon_wf.sh 'code' pbdagcon-0.3+20161121+ds/DALIGNER/0000755000175000017500000000000013026414602014203 5ustar afifafifpbdagcon-0.3+20161121+ds/DALIGNER/GNUmakefile0000644000175000017500000000136313026414545016266 0ustar afifafifTHISDIR:=$(abspath $(dir $(realpath $(lastword ${MAKEFILE_LIST})))) LIBDIRS?=${THISDIR}/../DAZZ_DB CFLAGS+= -O3 -Wall -Wextra -fno-strict-aliasing -Wno-unused-result CPPFLAGS+= -MMD -MP -I${THISDIR}/../DAZZ_DB LDLIBS+= -ldazzdb -lm -lpthread LDFLAGS+= $(patsubst %,-L%,${LIBDIRS}) MOST = daligner HPCdaligner HPCmapper LAsort LAmerge LAsplit LAcat LAshow LAcheck LA4Falcon LA4Ice DB2Falcon ALL:=${MOST} daligner_p vpath %.c ${THISDIR} vpath %.a ${THISDIR}/../DAZZ_DB %: %.c all: ${ALL} daligner: filter.o daligner_p: filter_p.o ${ALL}: align.o install: cp -f ${ALL} ${PREFIX}/bin clean: rm -f ${ALL} rm -f ${DEPS} rm -fr *.dSYM *.o .PHONY: clean all SRCS:=$(notdir $(wildcard ${THISDIR}/*.c)) #DEPS:=$(patsubst %.c,%.d,${SRCS}) #-include ${DEPS} pbdagcon-0.3+20161121+ds/DALIGNER/DB2Falcon.c0000644000175000017500000000575413026414545016062 0ustar afifafif/******************************************************************************************** * * Recreate all the .fasta files that have been loaded into a specified database. 
* * Author: Gene Myers * Date : May 2014 * ********************************************************************************************/ #include #include #include #include "DB.h" static char *Usage = "[-U] [-w] "; int main(int argc, char *argv[]) { HITS_DB _db, *db = &_db; FILE *dbfile; int nfiles; int UPPER, WIDTH; // Process arguments { int i, j, k; int flags[128]; char *eptr; ARG_INIT("DB2fasta") WIDTH = 80; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("U") break; case 'w': ARG_NON_NEGATIVE(WIDTH,"Line width") break; } else argv[j++] = argv[i]; argc = j; UPPER = 1 + flags['U']; if (argc != 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // Open db and also db image file (dbfile) if (Open_DB(argv[1],db)) { fprintf(stderr,"%s: Database %s.db could not be opened\n",Prog_Name,argv[1]); exit (1); } if (db->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s.db\n",Prog_Name,argv[1]); exit (1); } Trim_DB(db); { char *pwd, *root; pwd = PathTo(argv[1]); root = Root(argv[1],".db"); dbfile = Fopen(Catenate(pwd,"/",root,".db"),"r"); free(pwd); free(root); if (dbfile == NULL) exit (1); } // nfiles = # of files in data base fscanf(dbfile,DB_NFILE,&nfiles); // For each file do: { HITS_READ *reads; char *read; int f, first; FILE *ofile; reads = db->reads; read = New_Read_Buffer(db); first = 0; if ((ofile = Fopen(Catenate(".","/","preads4falcon",".fasta"),"w")) == NULL) exit (1); for (f = 0; f < nfiles; f++) { int i, last; char prolog[MAX_NAME], fname[MAX_NAME]; // Scan db image file line, create .fasta file for writing fscanf(dbfile,DB_FDATA,&last,fname,prolog); // For the relevant range of reads, write each to the file // recreating the original headers with the index meta-data about each read for (i = first; i < last && i < db->nreads; i++) { int j, len; HITS_READ *r; r = reads + i; len = r->rlen; fprintf(ofile,">%09lld", (long long int) i); fprintf(ofile,"\n"); Load_Read(db,i,read,UPPER); 
for (j = 0; j+WIDTH < len; j += WIDTH) fprintf(ofile,"%.*s\n",WIDTH,read+j); if (j < len) fprintf(ofile,"%s\n",read+j); } first = last; } fclose(ofile); } fclose(dbfile); Close_DB(db); exit (0); } pbdagcon-0.3+20161121+ds/DALIGNER/LAshow.c0000644000175000017500000004053413026414545015560 0ustar afifafif/******************************************************************************************* * * Utility for displaying the overlaps in a .las file in a variety of ways including * a minimal listing of intervals, a cartoon, and a full out alignment. * * Author: Gene Myers * Creation: July 2013 * Last Mod: Jan 2015 * *******************************************************************************************/ #include #include #include #include #include #include #include #include #include "DB.h" #include "align.h" static char *Usage[] = { "[-caroUF] [-i] [-w] [-b] ", " [ ] [ | ... ]" }; #define LAST_READ_SYMBOL '$' static int ORDER(const void *l, const void *r) { int x = *((int *) l); int y = *((int *) r); return (x-y); } int main(int argc, char *argv[]) { HITS_DB _db1, *db1 = &_db1; HITS_DB _db2, *db2 = &_db2; Overlap _ovl, *ovl = &_ovl; Alignment _aln, *aln = &_aln; FILE *input; int sameDB; int64 novl; int tspace, tbytes, small; int reps, *pts; int input_pts; int ALIGN, CARTOON, REFERENCE, OVERLAP; int FLIP, MAP; int INDENT, WIDTH, BORDER, UPPERCASE; int ISTWO; // Process options { int i, j, k; int flags[128]; char *eptr; ARG_INIT("LAshow") INDENT = 4; WIDTH = 100; BORDER = 10; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("caroUFM") break; case 'i': ARG_NON_NEGATIVE(INDENT,"Indent") break; case 'w': ARG_POSITIVE(WIDTH,"Alignment width") break; case 'b': ARG_NON_NEGATIVE(BORDER,"Alignment border") break; } else argv[j++] = argv[i]; argc = j; CARTOON = flags['c']; ALIGN = flags['a']; REFERENCE = flags['r']; OVERLAP = flags['o']; UPPERCASE = flags['U']; FLIP = flags['F']; MAP = flags['M']; if (argc <= 2) { 
fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]); exit (1); } } // Open trimmed DB or DB pair { int status; char *pwd, *root; FILE *input; struct stat stat1, stat2; ISTWO = 0; status = Open_DB(argv[1],db1); if (status < 0) exit (1); if (db1->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } sameDB = 1; if (argc > 3) { pwd = PathTo(argv[3]); root = Root(argv[3],".las"); if ((input = fopen(Catenate(pwd,"/",root,".las"),"r")) != NULL) { ISTWO = 1; fclose(input); status = Open_DB(argv[2],db2); if (status < 0) exit (1); if (db2->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[2]); exit (1); } stat(Catenate(db1->path,"","",".idx"),&stat1); stat(Catenate(db2->path,"","",".idx"),&stat2); if (stat1.st_ino != stat2.st_ino) sameDB = 0; Trim_DB(db2); } else db2 = db1; free(root); free(pwd); } else db2 = db1; Trim_DB(db1); } // Process read index arguments into a sorted list of read ranges input_pts = 0; if (argc == ISTWO+4) { if (argv[ISTWO+3][0] != LAST_READ_SYMBOL || argv[ISTWO+3][1] != '\0') { char *eptr, *fptr; int b, e; b = strtol(argv[ISTWO+3],&eptr,10); if (eptr > argv[ISTWO+3] && b > 0) { if (*eptr == '-') { if (eptr[1] != LAST_READ_SYMBOL || eptr[2] != '\0') { e = strtol(eptr+1,&fptr,10); input_pts = (fptr <= eptr+1 || *fptr != '\0' || e <= 0); } } else input_pts = (*eptr != '\0'); } else input_pts = 1; } } if (input_pts) { int v, x; FILE *input; input = Fopen(argv[ISTWO+3],"r"); if (input == NULL) exit (1); reps = 0; while ((v = fscanf(input," %d",&x)) != EOF) if (v == 0) { /* name the read-index file actually being parsed (argv[ISTWO+3]), not argv[2] */ fprintf(stderr,"%s: %d'th item of input file %s is not an integer\n", Prog_Name,reps+1,argv[ISTWO+3]); exit (1); } else reps += 1; reps *= 2; pts = (int *) Malloc(sizeof(int)*reps,"Allocating read parameters"); if (pts == NULL) exit (1); rewind(input); for (v = 0; v < reps; v += 2) { fscanf(input," %d",&x); pts[v] = pts[v+1] = x; } fclose(input); } else { 
pts = (int *) Malloc(sizeof(int)*2*argc,"Allocating read parameters"); if (pts == NULL) exit (1); reps = 0; if (argc > 3+ISTWO) { int c, b, e; char *eptr, *fptr; for (c = 3+ISTWO; c < argc; c++) { if (argv[c][0] == LAST_READ_SYMBOL) { b = db1->nreads; eptr = argv[c]+1; } else b = strtol(argv[c],&eptr,10); if (eptr > argv[c]) { if (b <= 0) { fprintf(stderr,"%s: %d is not a valid index\n",Prog_Name,b); exit (1); } if (*eptr == '\0') { pts[reps++] = b; pts[reps++] = b; continue; } else if (*eptr == '-') { if (eptr[1] == LAST_READ_SYMBOL) { e = INT32_MAX; fptr = eptr+2; } else e = strtol(eptr+1,&fptr,10); if (fptr > eptr+1 && *fptr == 0 && e > 0) { pts[reps++] = b; pts[reps++] = e; if (b > e) { fprintf(stderr,"%s: Empty range '%s'\n",Prog_Name,argv[c]); exit (1); } continue; } } } fprintf(stderr,"%s: argument '%s' is not an integer range\n",Prog_Name,argv[c]); exit (1); } qsort(pts,reps/2,sizeof(int64),ORDER); b = 0; for (c = 0; c < reps; c += 2) if (b > 0 && pts[b-1] >= pts[c]-1) { if (pts[c+1] > pts[b-1]) pts[b-1] = pts[c+1]; } else { pts[b++] = pts[c]; pts[b++] = pts[c+1]; } pts[b++] = INT32_MAX; reps = b; } else { pts[reps++] = 1; pts[reps++] = INT32_MAX; } } // Initiate file reading and read (novl, tspace) header { char *over, *pwd, *root; pwd = PathTo(argv[2+ISTWO]); root = Root(argv[2+ISTWO],".las"); over = Catenate(pwd,"/",root,".las"); input = Fopen(over,"r"); if (input == NULL) exit (1); if (fread(&novl,sizeof(int64),1,input) != 1) SYSTEM_ERROR if (fread(&tspace,sizeof(int),1,input) != 1) SYSTEM_ERROR if (tspace <= 0) { fprintf(stderr,"%s: Garbage .las file, trace spacing <= 0 !\n",Prog_Name); exit (1); } if (tspace <= TRACE_XOVR) { small = 1; tbytes = sizeof(uint8); } else { small = 0; tbytes = sizeof(uint16); } printf("\n%s: ",root); Print_Number(novl,0,stdout); printf(" records\n"); free(pwd); free(root); } // Read the file and display selected records { int j; uint16 *trace; Work_Data *work; int tmax; int in, npt, idx, ar; int64 tps; char *abuffer, 
*bbuffer; int ar_wide, br_wide; int ai_wide, bi_wide; int mn_wide, mx_wide; int tp_wide; int blast, match, seen, lhalf, rhalf; aln->path = &(ovl->path); if (ALIGN || REFERENCE) { work = New_Work_Data(); abuffer = New_Read_Buffer(db1); bbuffer = New_Read_Buffer(db2); } else { abuffer = NULL; bbuffer = NULL; work = NULL; } tmax = 1000; trace = (uint16 *) Malloc(sizeof(uint16)*tmax,"Allocating trace vector"); if (trace == NULL) exit (1); in = 0; npt = pts[0]; idx = 1; ar_wide = Number_Digits((int64) db1->nreads); br_wide = Number_Digits((int64) db2->nreads); ai_wide = Number_Digits((int64) db1->maxlen); bi_wide = Number_Digits((int64) db2->maxlen); if (db1->maxlen < db2->maxlen) { mn_wide = ai_wide; mx_wide = bi_wide; tp_wide = Number_Digits((int64) db1->maxlen/tspace+2); } else { mn_wide = bi_wide; mx_wide = ai_wide; tp_wide = Number_Digits((int64) db2->maxlen/tspace+2); } ar_wide += (ar_wide-1)/3; br_wide += (br_wide-1)/3; ai_wide += (ai_wide-1)/3; bi_wide += (bi_wide-1)/3; mn_wide += (mn_wide-1)/3; tp_wide += (tp_wide-1)/3; if (FLIP) { int x; x = ar_wide; ar_wide = br_wide; br_wide = x; x = ai_wide; ai_wide = bi_wide; bi_wide = x; } // For each record do blast = -1; match = 0; seen = 0; lhalf = rhalf = 0; for (j = 0; j < novl; j++) // Read it in { Read_Overlap(input,ovl); if (ovl->path.tlen > tmax) { /* grow to 1.2x the needed length plus slack; the cast must cover the whole product, otherwise (int) 1.2 truncates to 1 and no headroom is added */ tmax = ((int) (1.2*ovl->path.tlen)) + 100; trace = (uint16 *) Realloc(trace,sizeof(uint16)*tmax,"Allocating trace vector"); if (trace == NULL) exit (1); } ovl->path.trace = (void *) trace; Read_Trace(input,ovl,tbytes); // Determine if it should be displayed ar = ovl->aread+1; if (in) { while (ar > npt) { npt = pts[idx++]; if (ar < npt) { in = 0; break; } npt = pts[idx++]; } } else { while (ar >= npt) { npt = pts[idx++]; if (ar <= npt) { in = 1; break; } npt = pts[idx++]; } } if (!in) continue; // If -o check display only overlaps aln->alen = db1->reads[ovl->aread].rlen; aln->blen = db2->reads[ovl->bread].rlen; aln->flags = ovl->flags; tps = ovl->path.tlen/2; if (OVERLAP) 
{ if (ovl->path.abpos != 0 && ovl->path.bbpos != 0) continue; if (ovl->path.aepos != aln->alen && ovl->path.bepos != aln->blen) continue; } // If -M option then check the completeness of the implied mapping if (MAP) { while (ovl->bread != blast) { if (!match && seen && !(lhalf && rhalf)) { printf("Missing "); Print_Number((int64) blast+1,br_wide+1,stdout); printf(" %d ->%lld\n",db2->reads[blast].rlen,db2->reads[blast].coff); } match = 0; seen = 0; lhalf = rhalf = 0; blast += 1; } seen = 1; if (ovl->path.abpos == 0) rhalf = 1; if (ovl->path.aepos == aln->alen) lhalf = 1; if (ovl->path.bbpos != 0 || ovl->path.bepos != aln->blen) continue; match = 1; } // Display it if (ALIGN || CARTOON || REFERENCE) printf("\n"); if (FLIP) { Flip_Alignment(aln,0); Print_Number((int64) ovl->bread+1,ar_wide+1,stdout); printf(" "); Print_Number((int64) ovl->aread+1,br_wide+1,stdout); } else { Print_Number((int64) ovl->aread+1,ar_wide+1,stdout); printf(" "); Print_Number((int64) ovl->bread+1,br_wide+1,stdout); } if (COMP(ovl->flags)) printf(" c"); else printf(" n"); printf(" ["); Print_Number((int64) ovl->path.abpos,ai_wide,stdout); printf(".."); Print_Number((int64) ovl->path.aepos,ai_wide,stdout); printf("] x ["); Print_Number((int64) ovl->path.bbpos,bi_wide,stdout); printf(".."); Print_Number((int64) ovl->path.bepos,bi_wide,stdout); printf("]"); if (ALIGN || CARTOON || REFERENCE) { if (ALIGN || REFERENCE) { char *aseq, *bseq; int amin, amax; int bmin, bmax; int self; if (FLIP) Flip_Alignment(aln,0); if (small) Decompress_TraceTo16(ovl); self = sameDB && (ovl->aread == ovl->bread) && !COMP(ovl->flags); amin = ovl->path.abpos - BORDER; if (amin < 0) amin = 0; amax = ovl->path.aepos + BORDER; if (amax > aln->alen) amax = aln->alen; if (COMP(aln->flags)) { bmin = (aln->blen-ovl->path.bepos) - BORDER; if (bmin < 0) bmin = 0; bmax = (aln->blen-ovl->path.bbpos) + BORDER; if (bmax > aln->blen) bmax = aln->blen; } else { bmin = ovl->path.bbpos - BORDER; if (bmin < 0) bmin = 0; bmax = 
ovl->path.bepos + BORDER; if (bmax > aln->blen) bmax = aln->blen; if (self) { if (bmin < amin) amin = bmin; if (bmax > amax) amax = bmax; } } aseq = Load_Subread(db1,ovl->aread,amin,amax,abuffer,0); if (!self) bseq = Load_Subread(db2,ovl->bread,bmin,bmax,bbuffer,0); else bseq = aseq; aln->aseq = aseq - amin; if (COMP(aln->flags)) { Complement_Seq(bseq,bmax-bmin); aln->bseq = bseq - (aln->blen - bmax); } else if (self) aln->bseq = aln->aseq; else aln->bseq = bseq - bmin; Compute_Trace_PTS(aln,work,tspace,GREEDIEST); if (FLIP) { if (COMP(aln->flags)) { Complement_Seq(aseq,amax-amin); Complement_Seq(bseq,bmax-bmin); aln->aseq = aseq - (aln->alen - amax); aln->bseq = bseq - bmin; } Flip_Alignment(aln,1); } } if (CARTOON) { printf(" ("); Print_Number(tps,tp_wide,stdout); printf(" trace pts)\n\n"); Alignment_Cartoon(stdout,aln,INDENT,mx_wide); } else { printf(" : = "); Print_Number((int64) ovl->path.diffs,mn_wide,stdout); printf(" diffs ("); Print_Number(tps,tp_wide,stdout); printf(" trace pts)\n"); } if (REFERENCE) Print_Reference(stdout,aln,work,INDENT,WIDTH,BORDER,UPPERCASE,mx_wide); if (ALIGN) Print_Alignment(stdout,aln,work,INDENT,WIDTH,BORDER,UPPERCASE,mx_wide); } else { printf(" : < "); Print_Number((int64) ovl->path.diffs,mn_wide,stdout); printf(" diffs ("); Print_Number(tps,tp_wide,stdout); printf(" trace pts)\n"); } } free(trace); if (ALIGN) { free(bbuffer-1); free(abuffer-1); Free_Work_Data(work); } } Close_DB(db1); if (ISTWO) Close_DB(db2); exit (0); } pbdagcon-0.3+20161121+ds/DALIGNER/LAmerge.c0000644000175000017500000002062613026414545015677 0ustar afifafif/******************************************************************************************* * * Given a list of sorted .las files, merge them into a single sorted .las file. 
* * Author: Gene Myers * Date : July 2013 * *******************************************************************************************/ #include #include #include #include #include #include #include #include "DB.h" #include "align.h" static char *Usage = "[-v] ..."; #define MEMORY 4000 // in Mb #undef DEBUG // Heap sort of records according to (aread,bread,COMP(flags),abpos) order #define COMPARE(lp,rp) \ if (lp->aread > rp->aread) \ bigger = 1; \ else if (lp->aread < rp->aread) \ bigger = 0; \ else if (lp->bread > rp->bread) \ bigger = 1; \ else if (lp->bread < rp->bread) \ bigger = 0; \ else if (COMP(lp->flags) > COMP(rp->flags)) \ bigger = 1; \ else if (COMP(lp->flags) < COMP(rp->flags)) \ bigger = 0; \ else if (lp->path.abpos > rp->path.abpos) \ bigger = 1; \ else \ bigger = 0; static void reheap(int s, Overlap **heap, int hsize) { int c, l, r; int bigger; Overlap *hs, *hr, *hl; c = s; hs = heap[s]; while ((l = 2*c) <= hsize) { r = l+1; hl = heap[l]; if (r > hsize) bigger = 1; else { hr = heap[r]; COMPARE(hr,hl) } if (bigger) { COMPARE(hs,hl) if (bigger) { heap[c] = hl; c = l; } else break; } else { COMPARE(hs,hr) if (bigger) { heap[c] = hr; c = r; } else break; } } if (c != s) heap[c] = hs; } // Heap sort of records according to (aread,abpos) order #define MAPARE(lp,rp) \ if (lp->aread > rp->aread) \ bigger = 1; \ else if (lp->aread < rp->aread) \ bigger = 0; \ else if (lp->path.abpos > rp->path.abpos) \ bigger = 1; \ else \ bigger = 0; static void maheap(int s, Overlap **heap, int hsize) { int c, l, r; int bigger; Overlap *hs, *hr, *hl; c = s; hs = heap[s]; while ((l = 2*c) <= hsize) { r = l+1; hl = heap[l]; if (r > hsize) bigger = 1; else { hr = heap[r]; MAPARE(hr,hl) } if (bigger) { MAPARE(hs,hl) if (bigger) { heap[c] = hl; c = l; } else break; } else { MAPARE(hs,hr) if (bigger) { heap[c] = hr; c = r; } else break; } } if (c != s) heap[c] = hs; } #ifdef DEBUG static void showheap(Overlap **heap, int hsize) { int i; printf("\n"); for (i = 1; i <= hsize; 
i++) printf(" %3d: %5d, %5d\n",i,heap[i]->aread,heap[i]->bread); } #endif // Input block data structure and block fetcher typedef struct { FILE *stream; char *block; char *ptr; char *top; int64 count; } IO_block; static void ovl_reload(IO_block *in, int64 bsize) { int64 remains; remains = in->top - in->ptr; if (remains > 0) memcpy(in->block, in->ptr, remains); in->ptr = in->block; in->top = in->block + remains; in->top += fread(in->top,1,bsize-remains,in->stream); } // The program int main(int argc, char *argv[]) { IO_block *in; int64 bsize, osize, psize; char *block, *oblock; int i, fway; Overlap **heap; int hsize; Overlap *ovls; int64 totl; int tspace, tbytes; FILE *output; char *optr, *otop; int VERBOSE; int MAP_SORT; // Process command line { int j, k; int flags[128]; ARG_INIT("LAmerge") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') { ARG_FLAGS("vc") } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; MAP_SORT = flags['c']; if (argc < 3) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } fway = argc-2; if (fway > 252) { fprintf(stderr,"Exceeded maximum # of inputs and outputs (252) of merge\n"); exit (1); } } // Open all the input files and initialize their buffers psize = sizeof(void *); osize = sizeof(Overlap) - psize; bsize = (MEMORY*1000000ll)/(fway + 1); block = (char *) Malloc(bsize*(fway+1)+psize,"Allocating LAmerge blocks"); in = (IO_block *) Malloc(sizeof(IO_block)*fway,"Allocating LAmerge IO-reacords"); if (block == NULL || in == NULL) exit (1); block += psize; totl = 0; tbytes = 0; tspace = 0; for (i = 0; i < fway; i++) { int64 novl; int mspace; FILE *input; char *pwd, *root; char *iblock; pwd = PathTo(argv[i+2]); root = Root(argv[i+2],".las"); input = Fopen(Catenate(pwd,"/",root,".las"),"r"); if (input == NULL) exit (1); if (fread(&novl,sizeof(int64),1,input) != 1) SYSTEM_ERROR totl += novl; if (VERBOSE) fprintf(stdout, "In file %s, there are %lld records\n", Catenate(pwd,"/",root,".las"), novl); free(pwd); 
free(root); if (fread(&mspace,sizeof(int),1,input) != 1) SYSTEM_ERROR if (i == 0) { tspace = mspace; if (tspace <= TRACE_XOVR) tbytes = sizeof(uint8); else tbytes = sizeof(uint16); } else if (tspace != mspace) { fprintf(stderr,"%s: PT-point spacing conflict (%d vs %d)\n",Prog_Name,tspace,mspace); exit (1); } in[i].stream = input; in[i].block = iblock = block+i*bsize; in[i].ptr = iblock; in[i].top = iblock + fread(in[i].block,1,bsize,input); in[i].count = 0; } // Open the output file buffer and write (novl,tspace) header { char *pwd, *root; pwd = PathTo(argv[1]); root = Root(argv[1],".las"); output = Fopen(Catenate(pwd,"/",root,".las"),"w"); if (output == NULL) exit (1); free(pwd); free(root); fwrite(&totl,sizeof(int64),1,output); fwrite(&tspace,sizeof(int),1,output); oblock = block+fway*bsize; optr = oblock; otop = oblock + bsize; } if (VERBOSE) { printf("Merging %d files totalling ",fway); Print_Number(totl,0,stdout); printf(" records\n"); } // Initialize the heap heap = (Overlap **) Malloc(sizeof(Overlap *)*(fway+1),"Allocating heap"); ovls = (Overlap *) Malloc(sizeof(Overlap)*fway,"Allocating heap"); if (heap == NULL || ovls == NULL) exit (1); hsize = 0; for (i = 0; i < fway; i++) { if (in[i].ptr < in[i].top) { ovls[i] = *((Overlap *) (in[i].ptr - psize)); in[i].ptr += osize; hsize += 1; heap[hsize] = ovls + i; } } if (hsize > 3) { if (MAP_SORT) for (i = hsize/2; i > 1; i--) maheap(i,heap,hsize); else for (i = hsize/2; i > 1; i--) reheap(i,heap,hsize); } // While the heap is not empty do while (hsize > 0) { Overlap *ov; IO_block *src; int64 tsize, span; if (MAP_SORT) maheap(1,heap,hsize); else reheap(1,heap,hsize); ov = heap[1]; src = in + (ov - ovls); src->count += 1; tsize = ov->path.tlen*tbytes; span = osize + tsize; if (src->ptr + span > src->top) ovl_reload(src,bsize); if (optr + span > otop) { fwrite(oblock,1,optr-oblock,output); optr = oblock; } memcpy(optr,((char *) ov) + psize,osize); optr += osize; memcpy(optr,src->ptr,tsize); optr += tsize; src->ptr 
+= tsize; if (src->ptr < src->top) { *ov = *((Overlap *) (src->ptr - psize)); src->ptr += osize; } else { heap[1] = heap[hsize]; hsize -= 1; } } // Flush output buffer and wind up if (optr > oblock) fwrite(oblock,1,optr-oblock,output); fclose(output); for (i = 0; i < fway; i++) fclose(in[i].stream); for (i = 0; i < fway; i++) totl -= in[i].count; if (totl != 0) { fprintf(stderr,"%s: Did not write all records (%lld)\n",argv[0],totl); exit (1); } free(ovls); free(heap); free(in); free(block-psize); exit (0); } pbdagcon-0.3+20161121+ds/DALIGNER/Makefile0000644000175000017500000000444513026414545015660 0ustar afifafifCFLAGS = -O3 -Wall -Wextra -fno-strict-aliasing -Wno-unused-result ALL = daligner HPCdaligner HPCmapper LAsort LAmerge LAsplit LAcat LAshow LAdump LAcheck LAindex daligner_p LA4Falcon LA4Ice DB2Falcon DB.so all: $(ALL) daligner: daligner.c filter.c filter.h align.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o daligner daligner.c filter.c align.c DB.c QV.c -lpthread -lm daligner_p: daligner.c filter.c filter.h align.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o daligner_p daligner.c filter.c align.c DB.c QV.c -lpthread -lm -DFALCON_DALIGNER_P HPCdaligner: HPCdaligner.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o HPCdaligner HPCdaligner.c DB.c QV.c -lm HPCmapper: HPCmapper.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o HPCmapper HPCmapper.c DB.c QV.c -lm LAsort: LAsort.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o LAsort LAsort.c DB.c QV.c -lm LAmerge: LAmerge.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o LAmerge LAmerge.c DB.c QV.c -lm LAshow: LAshow.c align.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o LAshow LAshow.c align.c DB.c QV.c -lm LAdump: LAdump.c align.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o LAdump LAdump.c align.c DB.c QV.c -lm LA4Falcon: LA4Falcon.c align.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o LA4Falcon LA4Falcon.c align.c DB.c QV.c -lm LA4Ice: LA4Ice.c align.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o LA4Ice LA4Ice.c align.c DB.c QV.c 
-lm LAcat: LAcat.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o LAcat LAcat.c DB.c QV.c -lm LAsplit: LAsplit.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o LAsplit LAsplit.c DB.c QV.c -lm LAcheck: LAcheck.c align.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o LAcheck LAcheck.c align.c DB.c QV.c -lm DB2Falcon: DB2Falcon.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DB2Falcon DB2Falcon.c DB.c QV.c -lm DB.so: DB.c DB.h QV.c QV.h gcc $(CFLAGS) -shared -fPIC -o DB.so DB.c QV.c -lm LAindex: LAindex.c align.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o LAindex LAindex.c align.c DB.c QV.c -lm clean: rm -f $(ALL) rm -f LAupgrade.Dec.31.2014 rm -f daligner.tar.gz LAupgrade.Dec.31.2014: LAupgrade.Dec.31.2014.c align.c align.h DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o LAupgrade.Dec.31.2014 LAupgrade.Dec.31.2014.c align.c DB.c QV.c -lm install: cp $(ALL) ~/bin package: make clean tar -zcf daligner.tar.gz README *.h *.c Makefile pbdagcon-0.3+20161121+ds/DALIGNER/align.c0000644000175000017500000037701413026414545015463 0ustar afifafif/******************************************************************************************* * * Fast alignment discovery and trace generation along with utilites for displaying alignments * Based on previously unpublished ideas from 2005, subsequently refined in 2013-14. Basic * idea is to keep a dynamically selected interval of the f.r. waves from my 1986 O(nd) paper. * A recent cool idea is to not record all the details of an alignment while discovering it * but simply record trace points through which the optimal alignment passes every 100bp, * allowing rapid recomputation of the alignment details between trace points. 
* * Author : Gene Myers * First : June 2013 * Current: June 1, 2014 * ********************************************************************************************/ #include #include #include #include #include #include #include #include "DB.h" #include "align.h" #undef DEBUG_PASSES // Show forward / backward extension termini for Local_Alignment #undef DEBUG_POINTS // Show trace points #undef DEBUG_WAVE // Show waves of Local_Alignment #undef SHOW_MATCH_WAVE // For waves of Local_Alignment also show # of matches #undef SHOW_TRAIL // Show trace at the end of forward and reverse passes #undef SHOW_TPS // Show trace points as they are encountered in a wave #undef DEBUG_EXTEND // Show waves of Extend_Until_Overlap #undef DEBUG_ALIGN // Show division points of Compute_Trace #undef DEBUG_SCRIPT // Show trace additions for Compute_Trace #undef DEBUG_AWAVE // Show F/R waves of Compute_Trace #undef SHOW_TRACE // Show full trace for Print_Alignment #undef WAVE_STATS /****************************************************************************************\ * * * Working Storage Abstraction * * * \****************************************************************************************/ typedef struct // Hidden from the user, working space for each thread { int vecmax; void *vector; int celmax; void *cells; int pntmax; void *points; int tramax; void *trace; } _Work_Data; Work_Data *New_Work_Data() { _Work_Data *work; work = (_Work_Data *) Malloc(sizeof(_Work_Data),"Allocating work data block"); if (work == NULL) EXIT(NULL); work->vecmax = 0; work->vector = NULL; work->pntmax = 0; work->points = NULL; work->tramax = 0; work->trace = NULL; work->celmax = 0; work->cells = NULL; return ((Work_Data *) work); } static int enlarge_vector(_Work_Data *work, int newmax) { void *vec; int max; max = ((int) (newmax*1.2)) + 10000; vec = Realloc(work->vector,max,"Enlarging DP vector"); if (vec == NULL) EXIT(1); work->vecmax = max; work->vector = vec; return (0); } static int 
//  Release a work-data record together with whichever of its four scratch buffers have
//    been allocated.  Buffers that were never allocated are NULL, and free(NULL) is a
//    no-op, so each one can be handed to free unconditionally.

void Free_Work_Data(Work_Data *ework)
{ _Work_Data *w;

  w = (_Work_Data *) ework;
  free(w->vector);
  free(w->cells);
  free(w->trace);
  free(w->points);
  free(w);
}
mismatch) has a non-negative score for every suffix of the alignment under the scoring scheme where match = MATCH and mismatch = -1. MATCH is set so that an alignment with TRIM_PCT matches has zero score ( (1-TRIM_PCT) / TRIM_PCT ). */ #define FRACTION 1000 // Implicit fractional part of scores, i.e. score = x/FRACTION typedef struct { int mscore; int dscore; int16 *table; int16 *score; } Table_Bits; static void set_table(int bit, int prefix, int score, int max, Table_Bits *parms) { if (bit >= TRIM_LEN) { parms->table[prefix] = (int16) (score-max); parms->score[prefix] = (int16) score; } else { if (score > max) max = score; set_table(bit+1,(prefix<<1),score - parms->dscore,max,parms); set_table(bit+1,(prefix<<1) | 1,score + parms->mscore,max,parms); } } /* Create an alignment specification record including path tip tables & values */ Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq) { _Align_Spec *spec; Table_Bits parms; double match; int bias; spec = (_Align_Spec *) Malloc(sizeof(_Align_Spec),"Allocating alignment specification"); if (spec == NULL) EXIT(NULL); spec->ave_corr = ave_corr; spec->trace_space = trace_space; spec->freq[0] = freq[0]; spec->freq[1] = freq[1]; spec->freq[2] = freq[2]; spec->freq[3] = freq[3]; match = freq[0] + freq[3]; if (match > .5) match = 1.-match; bias = (int) ((match+.025)*20.-1.); if (match < .2) { fprintf(stderr,"Warning: Base bias worse than 80/20%% ! (New_Align_Spec)\n"); fprintf(stderr," Capping bias at this ratio.\n"); bias = 3; } spec->ave_path = (int) (PATH_LEN * (1. - Bias_Factor[bias] * (1. - ave_corr))); parms.mscore = (int) (FRACTION * Bias_Factor[bias] * (1. 
- ave_corr)); parms.dscore = FRACTION - parms.mscore; parms.score = (int16 *) Malloc(sizeof(int16)*(TRIM_MASK+1)*2,"Allocating trim table"); if (parms.score == NULL) { free(spec); EXIT(NULL); } parms.table = parms.score + (TRIM_MASK+1); set_table(0,0,0,0,&parms); spec->table = parms.table; spec->score = parms.score; return ((Align_Spec *) spec); } void Free_Align_Spec(Align_Spec *espec) { _Align_Spec *spec = (_Align_Spec *) espec; free(spec->score); free(spec); } double Average_Correlation(Align_Spec *espec) { return (((_Align_Spec *) espec)->ave_corr); } int Trace_Spacing(Align_Spec *espec) { return (((_Align_Spec *) espec)->trace_space); } float *Base_Frequencies(Align_Spec *espec) { return (((_Align_Spec *) espec)->freq); } /****************************************************************************************\ * * * LOCAL ALIGNMENT FINDER: forward_/reverse_wave and Local_Alignment * * * \****************************************************************************************/ #ifdef WAVE_STATS static int64 MAX, TOT, NWV; static int64 RESTARTS; void Init_Stats() { MAX = TOT = NWV = 0; RESTARTS = 0; } void Print_Stats() { printf("\nMax = %lld Ave = %.1f # = %lld\n",MAX,(1.*TOT)/NWV,NWV); printf("\nRestarts = %lld\n",RESTARTS); } #endif #ifdef DEBUG_WAVE static void print_wave(int *V, int *M, int low, int hgh, int besta) { int k, bestk; (void) M; printf(" [%6d,%6d]: ",low,hgh); for (k = low; k <= hgh; k++) { if (besta == V[k]) bestk = k; // printf(" %3d",(V[k]+k)/2); printf(" %3d",besta-V[k]); } printf(" : %d (%d,%d)\n",besta,(besta+bestk)/2,(besta-bestk)/2); #ifdef SHOW_MATCH_WAVE printf(" "); for (k = low; k <= hgh; k++) printf(" %3d",M[k]); printf("\n"); #endif fflush(stdout); } #endif /* At each furthest reaching point, keep a-coordinate of point (V), bitvector recording the last TRIM_LEN columns of the implied alignment (T), and the # of matches (1-bits) in the bitvector (M). 
/*  Forward extension pass of the local alignment finder.

    Starting from the points on anti-diagonal 'mida' for the diagonals *mind..maxd, waves
    of furthest reaching points are computed, one more difference per wave, for as long
    as 'more' is set and the last point judged good (lasta) is within TRIM_MLAG of the
    best point (besta).  Diagonal k pairs a-index y+k with b-index y, and the value kept
    per diagonal in V is the anti-diagonal 2*y+k of its furthest point.

    work    : supplies the wave arrays (work->vector, work->vecmax) and the Pebble pool
              (work->cells, work->celmax); both are enlarged on demand.
    spec    : trace spacing, average path threshold and the SCORE/TABLE trim tables.
    align   : a/b sequences, lengths and flags; align->path receives the A-side result.
    bpath   : receives the B-side result (trace, end/begin positions, diffs, tlen).
    mind    : in: lowest starting diagonal; out: diagonal of the first cell of the
              chosen B-path chain.
    maxd    : highest starting diagonal.
    mida    : anti-diagonal from which the 0-wave starts.
    minp, maxp : the wave is only widened to a new low (high) diagonal while that
              diagonal is >= minp (<= maxp).

    Returns 0.  Allocation failures go through EXIT(1).
    A sequence value of 4 stops extension along a diagonal and clears 'more'
    (NOTE(review): presumably the sequence-boundary sentinel -- confirm against DB.h).  */

static int forward_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, Path *bpath,
                        int *mind, int maxd, int mida, int minp, int maxp)
{ char *aseq  = align->aseq;
  char *bseq  = align->bseq;
  Path *apath = align->path;

  int     hgh, low, dif;      //  Current diagonal interval [low,hgh] and wave number
  int     vlen, vmin, vmax;   //  Length of each wave array and the diagonals it can hold
  int    *V, *M;              //  Per diagonal: furthest anti-diagonal, # of matches in T
  int    *_V, *_M;            //  The underscore versions are the unshifted array bases
  BVEC   *T;                  //  Per diagonal: bit history of recent columns (1 = match)
  BVEC   *_T;
  int    *HA, *HB;            //  Per diagonal: index of latest A- and B-trace Pebble
  int    *_HA, *_HB;
  int    *NA, *NB;            //  Per diagonal: next A- and B-trace point to be crossed
  int    *_NA, *_NB;
  Pebble *cells;
  int     avail, cmax, boff;  //  Next free Pebble, pool capacity, B trace point offset

  int     TRACE_SPACE = spec->trace_space;
  int     PATH_AVE    = spec->ave_path;
  int16  *SCORE       = spec->score;
  int16  *TABLE       = spec->table;

  int     besta, besty;       //  Best (largest) anti-diagonal reached and its b-index
  int     trima, trimy, trimd;   //  Last point accepted as a trimmed tip, and its wave
  int     trimha, trimhb;
  int     morea, morey, mored;   //  Best point recorded on a diagonal clipped by a 4
  int     moreha, morehb;

  int     more, morem, lasta;
  int     aclip, bclip;

  hgh = maxd;
  low = *mind;
  dif = 0;

  //  Carve work->vector into seven parallel arrays of vlen elements each (six of int and
  //    one of BVEC), then bias each pointer by vmin so that it can be indexed directly
  //    by diagonal, with the interval [low,hgh] centered in the available space.

  { int span, wing;

    span = (hgh-low)+1;
    vlen = work->vecmax/VectorEl;
    wing = (vlen - span)/2;
    vmin = low - wing;
    vmax = hgh + wing;

    _V  = ((int *) work->vector);
    _M  = _V + vlen;
    _HA = _M + vlen;
    _HB = _HA + vlen;
    _NA = _HB + vlen;
    _NB = _NA + vlen;
    _T  = ((BVEC *) (_NB + vlen));

    V  = _V-vmin;
    M  = _M-vmin;
    HA = _HA-vmin;
    HB = _HB-vmin;
    NA = _NA-vmin;
    NB = _NB-vmin;
    T  = _T-vmin;

    cells = (Pebble *) (work->cells);
    cmax  = work->celmax;
    avail = 0;

    if (COMP(align->flags))
      boff = align->blen % TRACE_SPACE;
    else
      boff = 0;
  }

  /* Compute 0-wave starting from mid-line */

  more  = 1;
  aclip =  INT32_MAX;
  bclip = -INT32_MAX;

  besta  = trima  = morea = lasta = mida;
  besty  = trimy  = morey = (mida-hgh) >> 1;
  trimd  = mored  = 0;
  trimha = moreha = 0;
  trimhb = morehb = 1;
  morem  = -1;

  { int   k;
    char *a;

    a = aseq + hgh;          //  a[y] is the a-symbol on diagonal k opposite bseq[y]
    for (k = hgh; k >= low; k--)
      { int     y, c, d;
        int     ha, hb;
        int     na, nb;
        Pebble *pb;

        y = (mida-k) >> 1;

        //  Two Pebbles (one per trace) are taken unconditionally below

        if (avail >= cmax-1)
          { cmax  = ((int) (avail*1.2)) + 10000;
            cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells");
            if (cells == NULL)
              EXIT(1);
            work->celmax = cmax;
            work->cells  = (void *) cells;
          }

        //  Head Pebble of the A chain: marked with the multiple of TRACE_SPACE at or
        //    below the starting a-index

        na = ((y+k)/TRACE_SPACE)*TRACE_SPACE;
#ifdef SHOW_TPS
        printf(" A %d: %d,%d,0,%d\n",avail,-1,k,na); fflush(stdout);
#endif
        pb = cells+avail;
        pb->ptr  = -1;
        pb->diag = k;
        pb->diff = 0;
        pb->mark = na;
        ha  = avail++;
        na += TRACE_SPACE;

        //  Head Pebble of the B chain: same idea on the b-index, shifted by boff

        nb = ((y+(TRACE_SPACE-boff))/TRACE_SPACE-1)*TRACE_SPACE+boff;
#ifdef SHOW_TPS
        printf(" B %d: %d,%d,0,%d\n",avail,-1,k,nb); fflush(stdout);
#endif
        pb = cells+avail;
        pb->ptr  = -1;
        pb->diag = k;
        pb->diff = 0;
        pb->mark = nb;
        hb  = avail++;
        nb += TRACE_SPACE;

        //  Slide along the diagonal while symbols match; a 4 in either sequence ends
        //    the slide, clears 'more' and records the diagonal in bclip / aclip

        while (1)
          { c = bseq[y];
            if (c == 4)
              { more = 0;
                if (bclip < k)
                  bclip = k;
                break;
              }
            d = a[y];
            if (c != d)
              { if (d == 4)
                  { more  = 0;
                    aclip = k;
                  }
                break;
              }
            y += 1;
          }
        c = (y << 1) + k;     //  Anti-diagonal of the point reached

        //  Drop a Pebble for every A trace point the slide reached or passed

        while (y+k >= na)
          { if (avail >= cmax)
              { cmax  = ((int) (avail*1.2)) + 10000;
                cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells");
                if (cells == NULL)
                  EXIT(1);
                work->celmax = cmax;
                work->cells  = (void *) cells;
              }
#ifdef SHOW_TPS
            printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout);
#endif
            pb = cells+avail;
            pb->ptr  = ha;
            pb->diag = k;
            pb->diff = 0;
            pb->mark = na;
            ha  = avail++;
            na += TRACE_SPACE;
          }

        //  Likewise for every B trace point

        while (y >= nb)
          { if (avail >= cmax)
              { cmax  = ((int) (avail*1.2)) + 10000;
                cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells");
                if (cells == NULL)
                  EXIT(1);
                work->celmax = cmax;
                work->cells  = (void *) cells;
              }
#ifdef SHOW_TPS
            printf(" B %d: %d,%d,0,%d\n",avail,hb,k,nb); fflush(stdout);
#endif
            pb = cells+avail;
            pb->ptr  = hb;
            pb->diag = k;
            pb->diff = 0;
            pb->mark = nb;
            hb  = avail++;
            nb += TRACE_SPACE;
          }

        if (c > besta)
          { besta  = trima = lasta = c;
            besty  = trimy = y;
            trimha = ha;
            trimhb = hb;
          }

        V[k]  = c;
        T[k]  = PATH_INT;     //  History starts out as all matches
        M[k]  = PATH_LEN;
        HA[k] = ha;
        HB[k] = hb;
        NA[k] = na;
        NB[k] = nb;

        a -= 1;
      }
  }

  //  Some diagonal ran into a 4.  Keep going only if the best point is not itself at a
  //    4, pull the interval in past the clipped diagonals, and remember the clipped
  //    point with the most matches as the candidate "more" tip.

  if (more == 0)
    { if (bseq[besty] != 4 && aseq[besta - besty] != 4)
        more = 1;
      if (hgh >= aclip)
        { hgh = aclip-1;
          if (morem <= M[aclip])
            { morem  = M[aclip];
              morea  = V[aclip];
              morey  = (morea - aclip)/2;
              moreha = HA[aclip];
              morehb = HB[aclip];
            }
        }
      if (low <= bclip)
        { low = bclip+1;
          if (morem <= M[bclip])
            { morem  = M[bclip];
              morea  = V[bclip];
              morey  = (morea - bclip)/2;
              moreha = HA[bclip];
              morehb = HB[bclip];
            }
        }
      aclip =  INT32_MAX;
      bclip = -INT32_MAX;
    }

#ifdef DEBUG_WAVE
  printf("\nFORWARD WAVE:\n");
  print_wave(V,M,low,hgh,besta);
#endif

  /* Compute successive waves until no furthest reaching points remain */

  while (more && lasta >= besta - TRIM_MLAG)
    { int     k, n;
      int     ua, ub;
      BVEC    t;
      int     am, ac, ap;
      char   *a;

      low -= 1;
      hgh += 1;

      //  The widened interval no longer fits: enlarge the vector if the interval uses
      //    more than 80% of it, then re-center all seven arrays.  'move' is the
      //    displacement of the storage caused by the reallocation.

      if (low <= vmin || hgh >= vmax)
        { int   span, wing;
          int64 move;
          int64 vd, md, had, hbd, nad, nbd, td;

          span = (hgh-low)+1;
          if (.8*vlen < span)
            { if (enlarge_vector(work,vlen*VectorEl))
                EXIT(1);

              move = ((void *) _V) - work->vector;
              vlen = work->vecmax/VectorEl;

              _V  = (int *) work->vector;
              _M  = _V + vlen;
              _HA = _M + vlen;
              _HB = _HA + vlen;
              _NA = _HB + vlen;
              _NB = _NA + vlen;
              _T  = ((BVEC *) (_NB + vlen));
            }
          else
            move = 0;

          wing = (vlen - span)/2;

          //  Signed distance each array's live segment has to travel

          vd  = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move);
          md  = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move);
          had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move);
          hbd = ((void *) (_HB+wing)) - (((void *) (HB+low)) - move);
          nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move);
          nbd = ((void *) (_NB+wing)) - (((void *) (NB+low)) - move);
          td  = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move);

          //  Segments moving toward lower addresses are moved first, in layout order;
          //    segments moving toward higher addresses are moved afterwards in reverse
          //    layout order, so that no segment overwrites one that has yet to move.

          if (vd < 0)
            memmove( _V+wing,  ((void *) ( V+low)) - move, span*sizeof(int));
          if (md < 0)
            memmove( _M+wing,  ((void *) ( M+low)) - move, span*sizeof(int));
          if (had < 0)
            memmove(_HA+wing,  ((void *) (HA+low)) - move, span*sizeof(int));
          if (hbd < 0)
            memmove(_HB+wing,  ((void *) (HB+low)) - move, span*sizeof(int));
          if (nad < 0)
            memmove(_NA+wing,  ((void *) (NA+low)) - move, span*sizeof(int));
          if (nbd < 0)
            memmove(_NB+wing,  ((void *) (NB+low)) - move, span*sizeof(int));
          if (td < 0)
            memmove( _T+wing,  ((void *) ( T+low)) - move, span*sizeof(BVEC));

          if (td > 0)
            memmove( _T+wing,  ((void *) ( T+low)) - move, span*sizeof(BVEC));
          if (nbd > 0)
            memmove(_NB+wing,  ((void *) (NB+low)) - move, span*sizeof(int));
          if (nad > 0)
            memmove(_NA+wing,  ((void *) (NA+low)) - move, span*sizeof(int));
          if (hbd > 0)
            memmove(_HB+wing,  ((void *) (HB+low)) - move, span*sizeof(int));
          if (had > 0)
            memmove(_HA+wing,  ((void *) (HA+low)) - move, span*sizeof(int));
          if (md > 0)
            memmove( _M+wing,  ((void *) ( M+low)) - move, span*sizeof(int));
          if (vd > 0)
            memmove( _V+wing,  ((void *) ( V+low)) - move, span*sizeof(int));

          vmin = low-wing;
          vmax = hgh+wing;

          V  =  _V-vmin;
          M  =  _M-vmin;
          HA = _HA-vmin;
          HB = _HB-vmin;
          NA = _NA-vmin;
          NB = _NB-vmin;
          T  =  _T-vmin;
        }

      //  Admit the new boundary diagonals only while inside [minp,maxp]; a new diagonal
      //    inherits the next-trace-point values of its neighbor and has no point (-1)

      if (low >= minp)
        { NA[low] = NA[low+1];
          NB[low] = NB[low+1];
          V[low]  = -1;
        }
      else
        low += 1;

      if (hgh <= maxp)
        { NA[hgh] = NA[hgh-1];
          NB[hgh] = NB[hgh-1];
          V[hgh]  = am = -1;
        }
      else
        am = V[--hgh];

      dif += 1;

      ac = V[hgh+1] = V[low-1] = -1;
      a  = aseq + hgh;
      t  = PATH_INT;
      n  = PATH_LEN;
      ua = ub = -1;
      for (k = hgh; k >= low; k--)
        { int     y, m;
          int     ha, hb;
          int     c, d;
          BVEC    b;
          Pebble *pb;

          //  ap, ac, am = previous-wave values of diagonals k+1, k, k-1.  V[k+1] has
          //    already been overwritten, so its old T/M/HA/HB live in t, n, ua, ub.

          ap = ac;
          ac = am;
          am = V[d = k-1];

          //  Extend from the largest of the three: +1 from a neighboring diagonal,
          //    +2 along the same diagonal

          if (ac < am)
            if (am < ap)
              { c  = ap+1;
                m  = n;
                b  = t;
                ha = ua;
                hb = ub;
              }
            else
              { c  = am+1;
                m  = M[d];
                b  = T[d];
                ha = HA[d];
                hb = HB[d];
              }
          else
            if (ac < ap)
              { c  = ap+1;
                m  = n;
                b  = t;
                ha = ua;
                hb = ub;
              }
            else
              { c  = ac+2;
                m  = M[k];
                b  = T[k];
                ha = HA[k];
                hb = HB[k];
              }

          //  Record the difference as a 0 bit; the match count drops if the bit at
          //    PATH_TOP was set before the shift

          if ((b & PATH_TOP) != 0)
            m -= 1;
          b <<= 1;

          //  Slide along matches, shifting a 1 into the history for each

          y = (c-k) >> 1;
          while (1)
            { c = bseq[y];
              if (c == 4)
                { more = 0;
                  if (bclip < k)
                    bclip = k;
                  break;
                }
              d = a[y];
              if (c != d)
                { if (d == 4)
                    { more  = 0;
                      aclip = k;
                    }
                  break;
                }
              y += 1;
              if ((b & PATH_TOP) == 0)
                m += 1;
              b = (b << 1) | 1;
            }
          c = (y << 1) + k;

          //  For each A trace point reached or passed, add a Pebble unless the chain
          //    inherited from the predecessor already carries that mark

          while (y+k >= NA[k])
            { if (cells[ha].mark < NA[k])
                { if (avail >= cmax)
                    { cmax  = ((int) (avail*1.2)) + 10000;
                      cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),
                                                       "Reallocating trace cells");
                      if (cells == NULL)
                        EXIT(1);
                      work->celmax = cmax;
                      work->cells  = (void *) cells;
                    }
#ifdef SHOW_TPS
                  printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout);
#endif
                  pb = cells+avail;
                  pb->ptr  = ha;
                  pb->diag = k;
                  pb->diff = dif;
                  pb->mark = NA[k];
                  ha = avail++;
                }
              NA[k] += TRACE_SPACE;
            }

          //  Same for the B trace points

          while (y >= NB[k])
            { if (cells[hb].mark < NB[k])
                { if (avail >= cmax)
                    { cmax  = ((int) (avail*1.2)) + 10000;
                      cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),
                                                       "Reallocating trace cells");
                      if (cells == NULL)
                        EXIT(1);
                      work->celmax = cmax;
                      work->cells  = (void *) cells;
                    }
#ifdef SHOW_TPS
                  printf(" B %d: %d,%d,%d,%d\n",avail,hb,k,dif,NB[k]); fflush(stdout);
#endif
                  pb = cells+avail;
                  pb->ptr  = hb;
                  pb->diag = k;
                  pb->diff = dif;
                  pb->mark = NB[k];
                  hb = avail++;
                }
              NB[k] += TRACE_SPACE;
            }

          //  A new best point counts as "good" (lasta) when its match count is at least
          //    PATH_AVE, and becomes the trim tip when the table tests on the low
          //    2*TRIM_LEN history bits also pass

          if (c > besta)
            { besta = c;
              besty = y;
              if (m >= PATH_AVE)
                { lasta = c;
                  if (TABLE[b & TRIM_MASK] >= 0)
                    if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0)
                      { trima  = c;
                        trimy  = y;
                        trimd  = dif;
                        trimha = ha;
                        trimhb = hb;
                      }
                }
            }

          //  Save diagonal k's previous-wave state for use by diagonal k-1, then store

          t  = T[k];
          n  = M[k];
          ua = HA[k];
          ub = HB[k];
          V[k]  = c;
          T[k]  = b;
          M[k]  = m;
          HA[k] = ha;
          HB[k] = hb;

          a -= 1;
        }

      //  Same clipping treatment as after the 0-wave, now also recording the wave
      //    number of the candidate in mored

      if (more == 0)
        { if (bseq[besty] != 4 && aseq[besta-besty] != 4)
            more = 1;
          if (hgh >= aclip)
            { hgh = aclip-1;
              if (morem <= M[aclip])
                { morem  = M[aclip];
                  morea  = V[aclip];
                  morey  = (morea - aclip)/2;
                  mored  = dif;
                  moreha = HA[aclip];
                  morehb = HB[aclip];
                }
            }
          if (low <= bclip)
            { low = bclip+1;
              if (morem <= M[bclip])
                { morem  = M[bclip];
                  morea  = V[bclip];
                  morey  = (morea - bclip)/2;
                  mored  = dif;
                  moreha = HA[bclip];
                  morehb = HB[bclip];
                }
            }
          aclip =  INT32_MAX;
          bclip = -INT32_MAX;
        }

      //  Shrink the interval from both ends past diagonals lagging the best point by
      //    more than WAVE_LAG

      n = besta - WAVE_LAG;
      while (hgh >= low)
        if (V[hgh] < n)
          hgh -= 1;
        else
          { while (V[low] < n)
              low += 1;
            break;
          }

#ifdef WAVE_STATS
      k = (hgh-low)+1;
      if (k > MAX)
        MAX = k;
      TOT += k;
      NWV += 1;
#endif

#ifdef DEBUG_WAVE
      print_wave(V,M,low,hgh,besta);
#endif
    }

  //  Emit the two traces.  The tip is the clipped "more" point if one was recorded
  //    (morem >= 0), otherwise the last trim point.

  { uint16 *atrace = (uint16 *) apath->trace;
    uint16 *btrace = (uint16 *) bpath->trace;
    int     atlen, btlen;
    int     trimx;
    int     a, b, k, h;
    int     d, e;

    if (morem >= 0)
      { trimx  = morea-morey;
        trimy  = morey;
        trimd  = mored;
        trimha = moreha;
        trimhb = morehb;
      }
    else
      trimx = trima-trimy;

    atlen = btlen = 0;

    //  Reverse the A chain in place so that it runs from its head Pebble to the tip

    a = -1;
    for (h = trimha; h >= 0; h = b)
      { b = cells[h].ptr;
        cells[h].ptr = a;
        a = h;
      }
    h = a;

    k = cells[h].diag;
    b = (mida-k)/2;
    e = 0;
#ifdef SHOW_TRAIL
    printf(" A path = (%5d,%5d)\n",(mida+k)/2,b); fflush(stdout);
#endif

    //  One (difference count, b-advance) pair per A trace point crossed

    for (h = cells[h].ptr; h >= 0; h = cells[h].ptr)
      { k = cells[h].diag;
        a = cells[h].mark - k;
        d = cells[h].diff;
        atrace[atlen++] = (uint16) (d-e);
        atrace[atlen++] = (uint16) (a-b);
#ifdef SHOW_TRAIL
        printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,a-b); fflush(stdout);
#endif
        b = a;
        e = d;
      }

    //  Final partial segment up to the tip: a new pair if the tip lies beyond the last
    //    trace point, otherwise the remainder is folded into the last pair

    if (b+k != trimx)
      { atrace[atlen++] = (uint16) (trimd-e);
        atrace[atlen++] = (uint16) (trimy-b);
#ifdef SHOW_TRAIL
        printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimy-b); fflush(stdout);
#endif
      }
    else if (b != trimy)
      { atrace[atlen-1] = (uint16) (atrace[atlen-1] + (trimy-b));
        atrace[atlen-2] = (uint16) (atrace[atlen-2] + (trimd-e));
#ifdef SHOW_TRAIL
        printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimy-b); fflush(stdout);
#endif
      }

    //  Same procedure for the B chain; its pairs are (difference count, a-advance)

    a = -1;
    for (h = trimhb; h >= 0; h = b)
      { b = cells[h].ptr;
        cells[h].ptr = a;
        a = h;
      }
    h = a;

    k = cells[h].diag;
    b = (mida+k)/2;
    e = 0;
    low = k;               //  Diagonal of the chain head, reported through *mind
#ifdef SHOW_TRAIL
    printf(" B path = (%5d,%5d)\n",b,(mida-k)/2); fflush(stdout);
#endif
    for (h = cells[h].ptr; h >= 0; h = cells[h].ptr)
      { k = cells[h].diag;
        a = cells[h].mark + k;
        d = cells[h].diff;
        btrace[btlen++] = (uint16) (d-e);
        btrace[btlen++] = (uint16) (a-b);
#ifdef SHOW_TRAIL
        printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a,a-k,d-e,a-b); fflush(stdout);
#endif
        b = a;
        e = d;
      }
    if (b-k != trimy)
      { btrace[btlen++] = (uint16) (trimd-e);
        btrace[btlen++] = (uint16) (trimx-b);
#ifdef SHOW_TRAIL
        printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimx-b); fflush(stdout);
#endif
      }
    else if (b != trimx)
      { btrace[btlen-1] = (uint16) (btrace[btlen-1] + (trimx-b));
        btrace[btlen-2] = (uint16) (btrace[btlen-2] + (trimd-e));
#ifdef SHOW_TRAIL
        printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimx-b); fflush(stdout);
#endif
      }

    apath->aepos = trimx;
    apath->bepos = trimy;
    apath->diffs = trimd;
    apath->tlen  = atlen;

    //  For a complemented alignment the B path's begin positions are set from the
    //    lengths minus the A path's end; otherwise its end positions are the A path's
    //    end with the two coordinates exchanged

    if (COMP(align->flags))
      { bpath->abpos = align->blen - apath->bepos;
        bpath->bbpos = align->alen - apath->aepos;
      }
    else
      { bpath->aepos = apath->bepos;
        bpath->bepos = apath->aepos;
      }
    bpath->diffs = trimd;
    bpath->tlen  = btlen;
  }

  *mind = low;
  return (0);
}
SHOW_TPS printf(" A %d: -1,%d,0,%d\n",avail,k,na+TRACE_SPACE); fflush(stdout); #endif pb = cells+avail; pb->ptr = -1; pb->diag = k; pb->diff = 0; pb->mark = y+k; ha = avail++; nb = ((y+(TRACE_SPACE-boff)-1)/TRACE_SPACE-1)*TRACE_SPACE+boff; #ifdef SHOW_TPS printf(" B %d: -1,%d,0,%d\n",avail,k,nb+TRACE_SPACE); fflush(stdout); #endif pb = cells+avail; pb->ptr = -1; pb->diag = k; pb->diff = 0; pb->mark = y; hb = avail++; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip > k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y -= 1; } c = (y << 1) + k; while (y+k <= na) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = 0; pb->mark = na; ha = avail++; na -= TRACE_SPACE; } while (y <= nb) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" B %d: %d,%d,0,%d\n",avail,hb,k,nb); fflush(stdout); #endif pb = cells+avail; pb->ptr = hb; pb->diag = k; pb->diff = 0; pb->mark = nb; hb = avail++; nb -= TRACE_SPACE; } if (c < besta) { besta = trima = lasta = c; besty = trimy = y; trimha = ha; trimhb = hb; } V[k] = c; T[k] = PATH_INT; M[k] = PATH_LEN; HA[k] = ha; HB[k] = hb; NA[k] = na; NB[k] = nb; a += 1; } } if (more == 0) { if (bseq[besty] != 4 && aseq[besta - besty] != 4) more = 1; if (low <= aclip) { low = aclip+1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; moreha = HA[aclip]; morehb = HB[aclip]; } } if (hgh >= bclip) { hgh = bclip-1; if (morem <= M[bclip]) { morem = M[bclip]; morea = 
V[bclip]; morey = (morea - bclip)/2; moreha = HA[bclip]; morehb = HB[bclip]; } } aclip = -INT32_MAX; bclip = INT32_MAX; } #ifdef DEBUG_WAVE printf("\nREVERSE WAVE:\n"); print_wave(V,M,low,hgh,besta); #endif while (more && lasta <= besta + TRIM_MLAG) { int k, n; int ua, ub; BVEC t; int am, ac, ap; char *a; low -= 1; hgh += 1; if (low <= vmin || hgh >= vmax) { int span, wing; int64 move, vd, md, had, hbd, nad, nbd, td; span = (hgh-low)+1; if (.8*vlen < span) { if (enlarge_vector(work,vlen*VectorEl)) EXIT(1); move = ((void *) _V) - work->vector; vlen = work->vecmax/VectorEl; _V = (int *) work->vector; _M = _V + vlen; _HA = _M + vlen; _HB = _HA + vlen; _NA = _HB + vlen; _NB = _NA + vlen; _T = ((BVEC *) (_NB + vlen)); } else move = 0; wing = (vlen - span)/2; vd = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move); md = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move); had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move); hbd = ((void *) (_HB+wing)) - (((void *) (HB+low)) - move); nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move); nbd = ((void *) (_NB+wing)) - (((void *) (NB+low)) - move); td = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move); if (vd < 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); if (md < 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (had < 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (hbd < 0) memmove(_HB+wing, ((void *) (HB+low)) - move, span*sizeof(int)); if (nad < 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (nbd < 0) memmove(_NB+wing, ((void *) (NB+low)) - move, span*sizeof(int)); if (td < 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (td > 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (nbd > 0) memmove(_NB+wing, ((void *) (NB+low)) - move, span*sizeof(int)); if (nad > 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (hbd > 0) memmove(_HB+wing, ((void *) 
(HB+low)) - move, span*sizeof(int)); if (had > 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (md > 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (vd > 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); vmin = low-wing; vmax = hgh+wing; V = _V-vmin; M = _M-vmin; HA = _HA-vmin; HB = _HB-vmin; NA = _NA-vmin; NB = _NB-vmin; T = _T-vmin; } if (low >= minp) { NA[low] = NA[low+1]; NB[low] = NB[low+1]; V[low] = ap = INT32_MAX; } else ap = V[++low]; if (hgh <= maxp) { NA[hgh] = NA[hgh-1]; NB[hgh] = NB[hgh-1]; V[hgh] = INT32_MAX; } else hgh -= 1; dif += 1; ac = V[hgh+1] = V[low-1] = INT32_MAX; a = aseq + low; t = PATH_INT; n = PATH_LEN; ua = ub = -1; for (k = low; k <= hgh; k++) { int y, m; int ha, hb; int c, d; BVEC b; Pebble *pb; am = ac; ac = ap; ap = V[d = k+1]; if (ac > ap) if (ap > am) { c = am-1; m = n; b = t; ha = ua; hb = ub; } else { c = ap-1; m = M[d]; b = T[d]; ha = HA[d]; hb = HB[d]; } else if (ac > am) { c = am-1; m = n; b = t; ha = ua; hb = ub; } else { c = ac-2; m = M[k]; b = T[k]; ha = HA[k]; hb = HB[k]; } if ((b & PATH_TOP) != 0) m -= 1; b <<= 1; y = (c-k) >> 1; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip > k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y -= 1; if ((b & PATH_TOP) == 0) m += 1; b = (b << 1) | 1; } c = (y << 1) + k; while (y+k <= NA[k]) { if (cells[ha].mark > NA[k]) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = dif; pb->mark = NA[k]; ha = avail++; } NA[k] -= TRACE_SPACE; } while (y <= NB[k]) { if (cells[hb].mark > NB[k]) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) 
Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" B %d: %d,%d,%d,%d\n",avail,hb,k,dif,NB[k]); fflush(stdout); #endif pb = cells+avail; pb->ptr = hb; pb->diag = k; pb->diff = dif; pb->mark = NB[k]; hb = avail++; } NB[k] -= TRACE_SPACE; } if (c < besta) { besta = c; besty = y; if (m >= PATH_AVE) { lasta = c; if (TABLE[b & TRIM_MASK] >= 0) if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0) { trima = c; trimy = y; trimd = dif; trimha = ha; trimhb = hb; } } } t = T[k]; n = M[k]; ua = HA[k]; ub = HB[k]; V[k] = c; T[k] = b; M[k] = m; HA[k] = ha; HB[k] = hb; a += 1; } if (more == 0) { if (bseq[besty] != 4 && aseq[besta - besty] != 4) more = 1; if (low <= aclip) { low = aclip+1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; mored = dif; moreha = HA[aclip]; morehb = HB[aclip]; } } if (hgh >= bclip) { hgh = bclip-1; if (morem <= M[bclip]) { morem = M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; mored = dif; moreha = HA[bclip]; morehb = HB[bclip]; } } aclip = -INT32_MAX; bclip = INT32_MAX; } n = besta + WAVE_LAG; while (hgh >= low) if (V[hgh] > n) hgh -= 1; else { while (V[low] > n) low += 1; break; } #ifdef WAVE_STATS k = (hgh-low)+1; if (k > MAX) MAX = k; TOT += k; NWV += 1; #endif #ifdef DEBUG_WAVE print_wave(V,M,low,hgh,besta); #endif } { uint16 *atrace = (uint16 *) apath->trace; uint16 *btrace = (uint16 *) bpath->trace; int atlen, btlen; int trimx; int a, b, k, h; int d, e; if (morem >= 0) { trimx = morea-morey; trimy = morey; trimd = mored; trimha = moreha; trimhb = morehb; } else trimx = trima-trimy; atlen = btlen = 0; a = -1; for (h = trimha; h >= 0; h = b) { b = cells[h].ptr; cells[h].ptr = a; a = h; } h = a; k = cells[h].diag; b = cells[h].mark - k; e = 0; #ifdef SHOW_TRAIL printf(" A path = (%5d,%5d)\n",b+k,b); fflush(stdout); #endif if ((b+k)%TRACE_SPACE != 0) { h = cells[h].ptr; 
if (h < 0) { a = trimy; d = trimd; } else { k = cells[h].diag; a = cells[h].mark - k; d = cells[h].diff; } #ifdef SHOW_TRAIL printf(" +%4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout); #endif if (apath->tlen == 0) { atrace[--atlen] = (uint16) (b-a); atrace[--atlen] = (uint16) (d-e); } else { atrace[1] = (uint16) (atrace[1] + (b-a)); atrace[0] = (uint16) (atrace[0] + (d-e)); } b = a; e = d; } if (h >= 0) { for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) { k = cells[h].diag; a = cells[h].mark - k; atrace[--atlen] = (uint16) (b-a); d = cells[h].diff; atrace[--atlen] = (uint16) (d-e); #ifdef SHOW_TRAIL printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout); #endif b = a; e = d; } if (b+k != trimx) { atrace[--atlen] = (uint16) (b-trimy); atrace[--atlen] = (uint16) (trimd-e); #ifdef SHOW_TRAIL printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout); #endif } else if (b != trimy) { atrace[atlen+1] = (uint16) (atrace[atlen+1] + (b-trimy)); atrace[atlen] = (uint16) (atrace[atlen] + (trimd-e)); #ifdef SHOW_TRAIL printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout); #endif } } a = -1; for (h = trimhb; h >= 0; h = b) { b = cells[h].ptr; cells[h].ptr = a; a = h; } h = a; k = cells[h].diag; b = cells[h].mark + k; e = 0; #ifdef SHOW_TRAIL printf(" B path = (%5d,%5d)\n",b,b-k); fflush(stdout); #endif if ((b-k)%TRACE_SPACE != boff) { h = cells[h].ptr; if (h < 0) { a = trimx; d = trimd; } else { k = cells[h].diag; a = cells[h].mark + k; d = cells[h].diff; } #ifdef SHOW_TRAIL printf(" +%4d: (%5d,%5d): %3d / %3d\n",h,a,a-k,d-e,b-a); fflush(stdout); #endif if (bpath->tlen == 0) { btrace[--btlen] = (uint16) (b-a); btrace[--btlen] = (uint16) (b-a); } else { btrace[1] = (uint16) (btrace[1] + (b-a)); btrace[0] = (uint16) (btrace[0] + (d-e)); } b = a; e = d; } if (h >= 0) { for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) { k = cells[h].diag; a = cells[h].mark + k; btrace[--btlen] = (uint16) (b-a); d = 
cells[h].diff; btrace[--btlen] = (uint16) (d-e); #ifdef SHOW_TRAIL printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a,a-k,d-e,b-a); fflush(stdout); #endif b = a; e = d; } if (b-k != trimy) { btrace[--btlen] = (uint16) (b-trimx); btrace[--btlen] = (uint16) (trimd-e); #ifdef SHOW_TRAIL printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimx); fflush(stdout); #endif } else if (b != trimx) { btrace[btlen+1] = (uint16) (btrace[btlen+1] + (b-trimx)); btrace[btlen] = (uint16) (btrace[btlen] + (trimd-e)); #ifdef SHOW_TRAIL printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimx); fflush(stdout); #endif } } apath->abpos = trimx; apath->bbpos = trimy; apath->diffs = apath->diffs + trimd; apath->tlen = apath->tlen - atlen; apath->trace = atrace + atlen; if (COMP(align->flags)) { bpath->aepos = align->blen - apath->bbpos; bpath->bepos = align->alen - apath->abpos; } else { bpath->abpos = apath->bbpos; bpath->bbpos = apath->abpos; } bpath->diffs = bpath->diffs + trimd; bpath->tlen = bpath->tlen - btlen; bpath->trace = btrace + btlen; } return (0); } /* Find the longest local alignment between aseq and bseq through (xcnt,ycnt) See associated .h file for the precise definition of the interface. 
*/

/* Local_Alignment: run a forward wave and then a reverse wave to build a local
 * alignment and its trace points.
 *
 *   align        sequences, lengths, flags and the A-path (align->path) to fill in
 *   ework, espec opaque handles, cast to the internal _Work_Data / _Align_Spec types
 *   low, hgh     diagonal band of the starting wave
 *   anti         anti-diagonal of the starting wave
 *   lbord, hbord if >= 0, how far below low / above hgh the wave may spread;
 *                if < 0 the corresponding side is unbounded (see selfie case below)
 *
 * Returns the B-path, which lives at the start of work->points and is therefore only
 * valid until that buffer is reused.  EXIT(NULL) is invoked if storage cannot be
 * enlarged or a wave fails.
 */
Path *Local_Alignment(Alignment *align, Work_Data *ework, Align_Spec *espec,
                      int low, int hgh, int anti, int lbord, int hbord)
{ _Work_Data  *work = ( _Work_Data *) ework;
  _Align_Spec *spec = (_Align_Spec *) espec;

  Path *apath, *bpath;
  int   minp, maxp;
  int   selfie;

  /* Size the wave vector and the trace-point buffer.  The point buffer holds the
     B-path record followed by 4*maxtp uint16 slots.  Both trace pointers start in the
     interior of their area: A at offset maxtp, B a further 2*maxtp on.  The reverse
     wave writes in front of these pointers and the forward wave after them.          */

  { int alen, blen;
    int maxtp, wsize;

    alen = align->alen;
    blen = align->blen;

    if (hgh-low >= 7500)
      wsize = VectorEl*(hgh-low+1);
    else
      wsize = VectorEl*10000;
    if (wsize >= work->vecmax)
      if (enlarge_vector(work,wsize))
        EXIT(NULL);

    if (alen < blen)
      maxtp = 2*(blen/spec->trace_space+2);
    else
      maxtp = 2*(alen/spec->trace_space+2);
    wsize = 4*maxtp*sizeof(uint16) + sizeof(Path);
    if (wsize > work->pntmax)
      if (enlarge_points(work,wsize))
        EXIT(NULL);

    apath = align->path;
    bpath = (Path *) work->points;

    apath->trace = ((uint16 *) (bpath+1)) + maxtp;
    bpath->trace = ((uint16 *) apath->trace) + 2*maxtp;
  }

#ifdef DEBUG_PASSES
  printf("\n");
#endif

  /* When a sequence is aligned against itself (same pointer) and no border is given,
     the band is kept on one side of diagonal 0 -- presumably to avoid the trivial
     self-match on the main diagonal.                                               */

  selfie = (align->aseq == align->bseq);

  if (lbord < 0)
    { if (selfie && low >= 0)
        minp = 1;
      else
        minp = -INT32_MAX;
    }
  else
    minp = low-lbord;
  if (hbord < 0)
    { if (selfie && hgh <= 0)
        maxp = -1;
      else
        maxp = INT32_MAX;
    }
  else
    maxp = hgh+hbord;

  /* forward_wave receives &low and stores a diagonal back through it (*mind = low). */

  if (forward_wave(work,spec,align,bpath,&low,hgh,anti,minp,maxp))
    EXIT(NULL);

#ifdef DEBUG_PASSES
  printf("F1 (%d,%d) ~ %d => (%d,%d) %d\n",
         (2*anti+(low+hgh))/4,(anti-(low+hgh))/4,hgh-low,
         apath->aepos,apath->bepos,apath->diffs);
#endif

  /* The reverse wave starts from the single diagonal handed back in low. */

  if (reverse_wave(work,spec,align,bpath,low,low,anti,minp,maxp))
    EXIT(NULL);

#ifdef DEBUG_PASSES
  printf("R1 (%d,%d) => (%d,%d) %d\n",
         (anti+low)/2,(anti-low)/2,apath->abpos,apath->bbpos,apath->diffs);
#endif

  /* Complemented B read: reverse the order of the (diffs,length) pairs in the
     B trace, keeping each pair intact.                                        */

  if (COMP(align->flags))
    { uint16 *trace = (uint16 *) bpath->trace;
      uint16  p;
      int     i, j;

      i = bpath->tlen-2;
      j = 0;
      while (j < i)
        { p = trace[i];
          trace[i] = trace[j];
          trace[j] = p;
          p = trace[i+1];
          trace[i+1] = trace[j+1];
          trace[j+1] = p;
          i -= 2;
          j += 2;
        }
    }

#ifdef DEBUG_POINTS
  { uint16 *trace = (uint16 *) apath->trace;
    int     a, h;

    printf("\nA-path (%d,%d)->(%d,%d)",apath->abpos,apath->bbpos,apath->aepos,apath->bepos);
    printf(" %c\n",(COMP(align->flags) ? 'c' : 'n'));
    a = apath->bbpos;
    for (h = 1; h < apath->tlen; h += 2)
      { int dif = trace[h-1];
        int del = trace[h];
        a += del;
        printf(" %d / %d (%d)\n",dif,del,a);
      }
  }

  { uint16 *trace = (uint16 *) bpath->trace;
    int     a, h;

    printf("\nB-path (%d,%d)->(%d,%d)",bpath->abpos,bpath->bbpos,bpath->aepos,bpath->bepos);
    printf(" %c [%d,%d]\n",(COMP(align->flags) ? 'c' : 'n'),align->blen,align->alen);
    a = bpath->bbpos;
    for (h = 1; h < bpath->tlen; h += 2)
      { int dif = trace[h-1];
        int del = trace[h];
        a += del;
        printf(" %d / %d (%d)\n",dif,del,a);
      }
  }
#endif

  return (bpath);
}


/****************************************************************************************\
*                                                                                        *
*  EXTENSION VERSION OF LOCAL ALIGNMENT                                                  *
*                                                                                        *
\****************************************************************************************/

/* Bytes of work vector used per diagonal by the extension routines: four int arrays
   (V, M, HA, NA) plus one BVEC array (T), as laid out in forward_extend below.      */
static int VectorEn = 4*sizeof(int) + sizeof(BVEC);

static int forward_extend(_Work_Data *work, _Align_Spec *spec, Alignment *align, int midd, int mida, int minp, int maxp) { char *aseq = align->aseq; char *bseq = align->bseq; Path *apath = align->path; int hgh, low, dif; int vlen, vmin, vmax; int *V, *M; int *_V, *_M; BVEC *T; BVEC *_T; int *HA, *NA; int *_HA, *_NA; Pebble *cells; int avail, cmax; int TRACE_SPACE = spec->trace_space; int PATH_AVE = spec->ave_path; int16 *SCORE = spec->score; int16 *TABLE = spec->table; int besta, besty; int trima, trimy, trimd; int trimha; int morea, morey, mored; int moreha; int more, morem, lasta; int aclip, bclip; hgh = midd; low = midd; dif = 0; { int span, wing; span = (hgh-low)+1; vlen = work->vecmax/VectorEn; wing = (vlen - span)/2; vmin = low - wing; vmax = hgh + wing; _V = ((int *) work->vector); _M = _V + vlen; _HA = _M + vlen; _NA = _HA + vlen; _T = ((BVEC *) (_NA + vlen)); V = _V-vmin; M = _M-vmin; HA = _HA-vmin; NA = _NA-vmin; T = _T-vmin; cells = (Pebble *) (work->cells); cmax = work->celmax; avail = 0; } /* Compute 0-wave starting 
from mid-line */ more = 1; aclip = INT32_MAX; bclip = -INT32_MAX; besta = trima = morea = lasta = mida; besty = trimy = morey = (mida-hgh) >> 1; trimd = mored = 0; trimha = moreha = 0; morem = -1; { int k; char *a; a = aseq + hgh; for (k = hgh; k >= low; k--) { int y, c, d; int ha, na; Pebble *pb; y = (mida-k) >> 1; if (avail >= cmax-1) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } na = ((y+k)/TRACE_SPACE)*TRACE_SPACE; #ifdef SHOW_TPS printf(" A %d: %d,%d,0,%d\n",avail,-1,k,na); fflush(stdout); #endif pb = cells+avail; pb->ptr = -1; pb->diag = k; pb->diff = 0; pb->mark = na; ha = avail++; na += TRACE_SPACE; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip < k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y += 1; } c = (y << 1) + k; while (y+k >= na) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = 0; pb->mark = na; ha = avail++; na += TRACE_SPACE; } if (c > besta) { besta = trima = lasta = c; besty = trimy = y; trimha = ha; } V[k] = c; T[k] = PATH_INT; M[k] = PATH_LEN; HA[k] = ha; NA[k] = na; a -= 1; } } if (more == 0) { if (bseq[besty] != 4 && aseq[besta - besty] != 4) more = 1; if (hgh >= aclip) { hgh = aclip-1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; moreha = HA[aclip]; } } if (low <= bclip) { low = bclip+1; if (morem <= M[bclip]) { morem = M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; moreha = HA[bclip]; } } aclip = INT32_MAX; bclip = -INT32_MAX; } #ifdef DEBUG_WAVE printf("\nFORWARD 
WAVE:\n"); print_wave(V,M,low,hgh,besta); #endif /* Compute successive waves until no furthest reaching points remain */ while (more && lasta >= besta - TRIM_MLAG) { int k, n; int ua; BVEC t; int am, ac, ap; char *a; if (low <= vmin || hgh >= vmax) { int span, wing; int64 move; int64 vd, md, had, nad, td; span = (hgh-low)+1; if (.8*vlen < span) { if (enlarge_vector(work,vlen*VectorEn)) EXIT(1); move = ((void *) _V) - work->vector; vlen = work->vecmax/VectorEn; _V = (int *) work->vector; _M = _V + vlen; _HA = _M + vlen; _NA = _HA + vlen; _T = ((BVEC *) (_NA + vlen)); } else move = 0; wing = (vlen - span)/2; vd = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move); md = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move); had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move); nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move); td = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move); if (vd < 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); if (md < 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (had < 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (nad < 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (td < 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (td > 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (nad > 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (had > 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (md > 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (vd > 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); vmin = low-wing; vmax = hgh+wing; V = _V-vmin; M = _M-vmin; HA = _HA-vmin; NA = _NA-vmin; T = _T-vmin; } if (low > minp) { low -= 1; NA[low] = NA[low+1]; V[low] = -1; } if (hgh < maxp) { hgh += 1; NA[hgh] = NA[hgh-1]; V[hgh] = am = -1; } else am = V[hgh]; dif += 1; ac = V[hgh+1] = V[low-1] = -1; a = aseq + hgh; 
t = PATH_INT; n = PATH_LEN; ua = -1; for (k = hgh; k >= low; k--) { int y, m; int ha; int c, d; BVEC b; Pebble *pb; ap = ac; ac = am; am = V[d = k-1]; if (ac < am) if (am < ap) { c = ap+1; m = n; b = t; ha = ua; } else { c = am+1; m = M[d]; b = T[d]; ha = HA[d]; } else if (ac < ap) { c = ap+1; m = n; b = t; ha = ua; } else { c = ac+2; m = M[k]; b = T[k]; ha = HA[k]; } if ((b & PATH_TOP) != 0) m -= 1; b <<= 1; y = (c-k) >> 1; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip < k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y += 1; if ((b & PATH_TOP) == 0) m += 1; b = (b << 1) | 1; } c = (y << 1) + k; while (y+k >= NA[k]) { if (cells[ha].mark < NA[k]) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = dif; pb->mark = NA[k]; ha = avail++; } NA[k] += TRACE_SPACE; } if (c > besta) { besta = c; besty = y; if (m >= PATH_AVE) { lasta = c; if (TABLE[b & TRIM_MASK] >= 0) if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0) { trima = c; trimy = y; trimd = dif; trimha = ha; } } } t = T[k]; n = M[k]; ua = HA[k]; V[k] = c; T[k] = b; M[k] = m; HA[k] = ha; a -= 1; } if (more == 0) { if (bseq[besty] != 4 && aseq[besta-besty] != 4) more = 1; if (hgh >= aclip) { hgh = aclip-1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; mored = dif; moreha = HA[aclip]; } } if (low <= bclip) { low = bclip+1; if (morem <= M[bclip]) { morem = M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; mored = dif; moreha = HA[bclip]; } } aclip = INT32_MAX; bclip = -INT32_MAX; } n = besta - WAVE_LAG; while (hgh >= low) if (V[hgh] < n) hgh -= 1; else { while (V[low] < n) low += 1; 
break; } #ifdef WAVE_STATS k = (hgh-low)+1; if (k > MAX) MAX = k; TOT += k; NWV += 1; #endif #ifdef DEBUG_WAVE print_wave(V,M,low,hgh,besta); #endif } { uint16 *atrace = (uint16 *) apath->trace; int atlen; int trimx; int a, b, k, h; int d, e; if (morem >= 0) { trimx = morea-morey; trimy = morey; trimd = mored; trimha = moreha; } else trimx = trima-trimy; atlen = 0; a = -1; for (h = trimha; h >= 0; h = b) { b = cells[h].ptr; cells[h].ptr = a; a = h; } h = a; k = cells[h].diag; b = (mida-k)/2; e = 0; #ifdef SHOW_TRAIL printf(" A path = (%5d,%5d)\n",(mida+k)/2,b); fflush(stdout); #endif for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) { k = cells[h].diag; a = cells[h].mark - k; d = cells[h].diff; atrace[atlen++] = (uint16) (d-e); atrace[atlen++] = (uint16) (a-b); #ifdef SHOW_TRAIL printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,a-b); fflush(stdout); #endif b = a; e = d; } if (b+k != trimx) { atrace[atlen++] = (uint16) (trimd-e); atrace[atlen++] = (uint16) (trimy-b); #ifdef SHOW_TRAIL printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimy-b); fflush(stdout); #endif } else if (b != trimy) { atrace[atlen-1] = (uint16) (atrace[atlen-1] + (trimy-b)); atrace[atlen-2] = (uint16) (atrace[atlen-2] + (trimd-e)); #ifdef SHOW_TRAIL printf(" @ (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,trimy-b); fflush(stdout); #endif } apath->aepos = trimx; apath->bepos = trimy; apath->diffs = trimd; apath->tlen = atlen; } return (0); } static int reverse_extend(_Work_Data *work, _Align_Spec *spec, Alignment *align, int midd, int mida, int minp, int maxp) { char *aseq = align->aseq - 1; char *bseq = align->bseq - 1; Path *apath = align->path; int hgh, low, dif; int vlen, vmin, vmax; int *V, *M; int *_V, *_M; BVEC *T; BVEC *_T; int *HA, *NA; int *_HA, *_NA; Pebble *cells; int avail, cmax; int TRACE_SPACE = spec->trace_space; int PATH_AVE = spec->ave_path; int16 *SCORE = spec->score; int16 *TABLE = spec->table; int besta, besty; int trima, trimy, trimd; int trimha; int morea, morey, mored; 
int moreha; int more, morem, lasta; int aclip, bclip; hgh = midd; low = midd; dif = 0; { int span, wing; span = (hgh-low)+1; vlen = work->vecmax/VectorEn; wing = (vlen - span)/2; vmin = low - wing; vmax = hgh + wing; _V = ((int *) work->vector); _M = _V + vlen; _HA = _M + vlen; _NA = _HA + vlen; _T = ((BVEC *) (_NA + vlen)); V = _V-vmin; M = _M-vmin; HA = _HA-vmin; NA = _NA-vmin; T = _T-vmin; cells = (Pebble *) (work->cells); cmax = work->celmax; avail = 0; } more = 1; aclip = -INT32_MAX; bclip = INT32_MAX; besta = trima = morea = lasta = mida; besty = trimy = morey = (mida-hgh) >> 1; trimd = mored = 0; trimha = moreha = 0; morem = -1; { int k; char *a; a = aseq + low; for (k = low; k <= hgh; k++) { int y, c, d; int ha, na; Pebble *pb; y = (mida-k) >> 1; if (avail >= cmax-1) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } na = ((y+k+TRACE_SPACE-1)/TRACE_SPACE-1)*TRACE_SPACE; #ifdef SHOW_TPS printf(" A %d: -1,%d,0,%d\n",avail,k,na+TRACE_SPACE); fflush(stdout); #endif pb = cells+avail; pb->ptr = -1; pb->diag = k; pb->diff = 0; pb->mark = y+k; ha = avail++; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip > k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y -= 1; } c = (y << 1) + k; while (y+k <= na) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = 0; pb->mark = na; ha = avail++; na -= TRACE_SPACE; } if (c < besta) { besta = trima = lasta = c; besty = trimy = y; trimha = ha; } V[k] = c; T[k] = PATH_INT; M[k] = PATH_LEN; HA[k] = ha; NA[k] = na; a += 1; } } 
if (more == 0) { if (bseq[besty] != 4 && aseq[besta - besty] != 4) more = 1; if (low <= aclip) { low = aclip+1; if (morem <= M[aclip]) { morem = M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; moreha = HA[aclip]; } } if (hgh >= bclip) { hgh = bclip-1; if (morem <= M[bclip]) { morem = M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; moreha = HA[bclip]; } } aclip = -INT32_MAX; bclip = INT32_MAX; } #ifdef DEBUG_WAVE printf("\nREVERSE WAVE:\n"); print_wave(V,M,low,hgh,besta); #endif while (more && lasta <= besta + TRIM_MLAG) { int k, n; int ua; BVEC t; int am, ac, ap; char *a; if (low <= vmin || hgh >= vmax) { int span, wing; int64 move, vd, md, had, nad, td; span = (hgh-low)+1; if (.8*vlen < span) { if (enlarge_vector(work,vlen*VectorEn)) EXIT(1); move = ((void *) _V) - work->vector; vlen = work->vecmax/VectorEn; _V = (int *) work->vector; _M = _V + vlen; _HA = _M + vlen; _NA = _HA + vlen; _T = ((BVEC *) (_NA + vlen)); } else move = 0; wing = (vlen - span)/2; vd = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move); md = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move); had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move); nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move); td = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move); if (vd < 0) memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); if (md < 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (had < 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (nad < 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (td < 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (td > 0) memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); if (nad > 0) memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); if (had > 0) memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); if (md > 0) memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); if (vd > 0) 
memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); vmin = low-wing; vmax = hgh+wing; V = _V-vmin; M = _M-vmin; HA = _HA-vmin; NA = _NA-vmin; T = _T-vmin; } if (low > minp) { low -= 1; NA[low] = NA[low+1]; V[low] = ap = INT32_MAX; } else ap = V[low]; if (hgh < maxp) { hgh += 1; NA[hgh] = NA[hgh-1]; V[hgh] = INT32_MAX; } dif += 1; ac = V[hgh+1] = V[low-1] = INT32_MAX; a = aseq + low; t = PATH_INT; n = PATH_LEN; ua = -1; for (k = low; k <= hgh; k++) { int y, m; int ha; int c, d; BVEC b; Pebble *pb; am = ac; ac = ap; ap = V[d = k+1]; if (ac > ap) if (ap > am) { c = am-1; m = n; b = t; ha = ua; } else { c = ap-1; m = M[d]; b = T[d]; ha = HA[d]; } else if (ac > am) { c = am-1; m = n; b = t; ha = ua; } else { c = ac-2; m = M[k]; b = T[k]; ha = HA[k]; } if ((b & PATH_TOP) != 0) m -= 1; b <<= 1; y = (c-k) >> 1; while (1) { c = bseq[y]; if (c == 4) { more = 0; if (bclip > k) bclip = k; break; } d = a[y]; if (c != d) { if (d == 4) { more = 0; aclip = k; } break; } y -= 1; if ((b & PATH_TOP) == 0) m += 1; b = (b << 1) | 1; } c = (y << 1) + k; while (y+k <= NA[k]) { if (cells[ha].mark > NA[k]) { if (avail >= cmax) { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) EXIT(1); work->celmax = cmax; work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout); #endif pb = cells+avail; pb->ptr = ha; pb->diag = k; pb->diff = dif; pb->mark = NA[k]; ha = avail++; } NA[k] -= TRACE_SPACE; } if (c < besta) { besta = c; besty = y; if (m >= PATH_AVE) { lasta = c; if (TABLE[b & TRIM_MASK] >= 0) if (TABLE[(b >> TRIM_LEN) & TRIM_MASK] + SCORE[b & TRIM_MASK] >= 0) { trima = c; trimy = y; trimd = dif; trimha = ha; } } } t = T[k]; n = M[k]; ua = HA[k]; V[k] = c; T[k] = b; M[k] = m; HA[k] = ha; a += 1; } if (more == 0) { if (bseq[besty] != 4 && aseq[besta - besty] != 4) more = 1; if (low <= aclip) { low = aclip+1; if (morem <= M[aclip]) { morem 
= M[aclip]; morea = V[aclip]; morey = (morea - aclip)/2; mored = dif; moreha = HA[aclip]; } } if (hgh >= bclip) { hgh = bclip-1; if (morem <= M[bclip]) { morem = M[bclip]; morea = V[bclip]; morey = (morea - bclip)/2; mored = dif; moreha = HA[bclip]; } } aclip = -INT32_MAX; bclip = INT32_MAX; } n = besta + WAVE_LAG; while (hgh >= low) if (V[hgh] > n) hgh -= 1; else { while (V[low] > n) low += 1; break; } #ifdef WAVE_STATS k = (hgh-low)+1; if (k > MAX) MAX = k; TOT += k; NWV += 1; #endif #ifdef DEBUG_WAVE print_wave(V,M,low,hgh,besta); #endif } { uint16 *atrace = (uint16 *) apath->trace; int atlen; int trimx; int a, b, k, h; int d, e; if (morem >= 0) { trimx = morea-morey; trimy = morey; trimd = mored; trimha = moreha; } else trimx = trima-trimy; atlen = 0; a = -1; for (h = trimha; h >= 0; h = b) { b = cells[h].ptr; cells[h].ptr = a; a = h; } h = a; k = cells[h].diag; b = cells[h].mark - k; e = 0; #ifdef SHOW_TRAIL printf(" A path = (%5d,%5d)\n",b+k,b); fflush(stdout); #endif if ((b+k)%TRACE_SPACE != 0) { h = cells[h].ptr; if (h < 0) { a = trimy; d = trimd; } else { k = cells[h].diag; a = cells[h].mark - k; d = cells[h].diff; } #ifdef SHOW_TRAIL printf(" +%4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout); #endif atrace[--atlen] = (uint16) (b-a); atrace[--atlen] = (uint16) (d-e); b = a; e = d; } if (h >= 0) { for (h = cells[h].ptr; h >= 0; h = cells[h].ptr) { k = cells[h].diag; a = cells[h].mark - k; atrace[--atlen] = (uint16) (b-a); d = cells[h].diff; atrace[--atlen] = (uint16) (d-e); #ifdef SHOW_TRAIL printf(" %4d: (%5d,%5d): %3d / %3d\n",h,a+k,a,d-e,b-a); fflush(stdout); #endif b = a; e = d; } if (b+k != trimx) { atrace[--atlen] = (uint16) (b-trimy); atrace[--atlen] = (uint16) (trimd-e); #ifdef SHOW_TRAIL printf(" (%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout); #endif } else if (b != trimy) { atrace[atlen+1] = (uint16) (atrace[atlen+1] + (b-trimy)); atrace[atlen] = (uint16) (atrace[atlen] + (trimd-e)); #ifdef SHOW_TRAIL printf(" @ 
(%5d,%5d): %3d / %3d\n",trimx,trimy,trimd-e,b-trimy); fflush(stdout); #endif } } apath->abpos = trimx; apath->bbpos = trimy; apath->diffs = trimd; apath->tlen = - atlen; apath->trace = atrace + atlen; } return (0); } /* Find the longest local alignment between aseq and bseq through (xcnt,ycnt) See associated .h file for the precise definition of the interface. */ int Find_Extension(Alignment *align, Work_Data *ework, Align_Spec *espec, int diag, int anti, int lbord, int hbord, int prefix) { _Work_Data *work = ( _Work_Data *) ework; _Align_Spec *spec = (_Align_Spec *) espec; Path *apath; int minp, maxp; { int alen, blen; int maxtp, wsize; alen = align->alen; blen = align->blen; wsize = VectorEn*10000; if (wsize >= work->vecmax) if (enlarge_vector(work,wsize)) EXIT(1); if (alen < blen) maxtp = 2*(blen/spec->trace_space+2); else maxtp = 2*(alen/spec->trace_space+2); wsize = 2*maxtp*sizeof(uint16); if (wsize > work->pntmax) if (enlarge_points(work,wsize)) EXIT(1); apath = align->path; apath->trace = ((uint16 *) work->points) + maxtp; } #ifdef DEBUG_PASSES printf("\n"); #endif if (lbord < 0) minp = -INT32_MAX; else minp = diag-lbord; if (hbord < 0) maxp = INT32_MAX; else maxp = diag+hbord; if (prefix) { if (reverse_extend(work,spec,align,diag,anti,minp,maxp)) EXIT(1); apath->aepos = (anti-diag)/2; apath->bepos = (anti+diag)/2; #ifdef DEBUG_PASSES printf("E1 (%d,%d) => (%d,%d) %d\n", (anti+diag)/2,(anti-diag)/2,apath->abpos,apath->bbpos,apath->diffs); #endif } else { if (forward_extend(work,spec,align,diag,anti,minp,maxp)) EXIT(1); apath->abpos = (anti-diag)/2; apath->bbpos = (anti+diag)/2; #ifdef DEBUG_PASSES printf("F1 (%d,%d) => (%d,%d) %d\n", (anti+diag)/2,(anti-diag)/2,apath->aepos,apath->bepos,apath->diffs); #endif } #ifdef DEBUG_POINTS { uint16 *trace = (uint16 *) apath->trace; int a, h; printf("\nA-path (%d,%d)->(%d,%d)",apath->abpos,apath->bbpos,apath->aepos,apath->bepos); printf(" %c\n",(COMP(align->flags) ? 
'c' : 'n')); a = apath->bbpos; for (h = 1; h < apath->tlen; h += 2) { int dif = trace[h-1]; int del = trace[h]; a += del; printf(" %d / %d (%d)\n",dif,del,a); } } #endif return (0); } /****************************************************************************************\ * * * OVERLAP MANIPULATION * * * \****************************************************************************************/ static int64 PtrSize = sizeof(void *); static int64 OvlIOSize = sizeof(Overlap) - sizeof(void *); int Read_Overlap(FILE *input, Overlap *ovl) { if (fread( ((char *) ovl) + PtrSize, OvlIOSize, 1, input) != 1) return (1); return (0); } int Read_Trace(FILE *input, Overlap *ovl, int tbytes) { if (tbytes > 0 && ovl->path.tlen > 0) { if (fread(ovl->path.trace, tbytes*ovl->path.tlen, 1, input) != 1) return (1); } return (0); } void Write_Overlap(FILE *output, Overlap *ovl, int tbytes) { fwrite( ((char *) ovl) + PtrSize, OvlIOSize, 1, output); if (ovl->path.trace != NULL) fwrite(ovl->path.trace,tbytes,ovl->path.tlen,output); } void Compress_TraceTo8(Overlap *ovl) { uint16 *t16 = (uint16 *) ovl->path.trace; uint8 *t8 = (uint8 *) ovl->path.trace; int j; for (j = 0; j < ovl->path.tlen; j++) t8[j] = (uint8) (t16[j]); } void Decompress_TraceTo16(Overlap *ovl) { uint16 *t16 = (uint16 *) ovl->path.trace; uint8 *t8 = (uint8 *) ovl->path.trace; int j; for (j = ovl->path.tlen-1; j >= 0; j--) t16[j] = t8[j]; } void Print_Overlap(FILE *output, Overlap *ovl, int tbytes, int indent) { int i; fprintf(output,"%*s%d vs. ",indent,"",ovl->aread); if (COMP(ovl->flags)) fprintf(output,"c(%d)\n",ovl->bread); else fprintf(output,"%d\n",ovl->bread); fprintf(output,"%*s [%d,%d] vs [%d,%d] w. 
%d diffs\n",indent,"", ovl->path.abpos,ovl->path.aepos,ovl->path.bbpos,ovl->path.bepos,ovl->path.diffs); if (tbytes == 1) { uint8 *trace = (uint8 *) (ovl->path.trace); if (trace != NULL) { int p = ovl->path.bbpos + trace[1]; fprintf(output,"%*sTrace: %3d/%5d",indent,"",trace[0],p); for (i = 3; i < ovl->path.tlen; i += 2) { if (i%10 == 0) fprintf(output,"\n%*s",indent+6,""); p += trace[i]; fprintf(output," %3d/%5d",trace[i-1],p); } fprintf(output,"\n"); } } else { uint16 *trace = (uint16 *) (ovl->path.trace); if (trace != NULL) { int p = ovl->path.bbpos + trace[1]; fprintf(output,"%*sTrace: %3d/%5d",indent,"",trace[0],p); for (i = 3; i < ovl->path.tlen; i += 2) { if (i%10 == 0) fprintf(output,"\n%*s",indent+6,""); p += trace[i]; fprintf(output," %3d/%5d",trace[i-1],p); } fprintf(output,"\n"); } } } int Check_Trace_Points(Overlap *ovl, int tspace, int verbose, char *fname) { int i, p; if (((ovl->path.aepos-1)/tspace - ovl->path.abpos/tspace)*2 != ovl->path.tlen-2) { if (verbose) EPRINTF(EPLACE," %s: Wrong number of trace points\n",fname); return (1); } p = ovl->path.bbpos; if (tspace <= TRACE_XOVR) { uint8 *trace8 = (uint8 *) ovl->path.trace; for (i = 1; i < ovl->path.tlen; i += 2) p += trace8[i]; } else { uint16 *trace16 = (uint16 *) ovl->path.trace; for (i = 1; i < ovl->path.tlen; i += 2) p += trace16[i]; } if (p != ovl->path.bepos) { if (verbose) EPRINTF(EPLACE," %s: Trace point sum != aligned interval\n",fname); return (1); } return (0); } void Flip_Alignment(Alignment *align, int full) { char *aseq = align->aseq; char *bseq = align->bseq; int alen = align->alen; int blen = align->blen; Path *path = align->path; int comp = COMP(align->flags); int *trace = (int *) path->trace; int tlen = path->tlen; int i, j, p; if (comp) { p = path->abpos; path->abpos = blen - path->bepos; path->bepos = alen - p; p = path->aepos; path->aepos = blen - path->bbpos; path->bbpos = alen - p; if (full) { alen += 2; blen += 2; for (i = 0; i < tlen; i++) if ((p = trace[i]) < 0) trace[i] 
= alen + p; else trace[i] = p - blen; i = tlen-1; j = 0; while (j < i) { p = trace[i]; trace[i] = trace[j]; trace[j] = p; i -= 1; j += 1; } alen -= 2; blen -= 2; } } else { p = path->abpos; path->abpos = path->bbpos; path->bbpos = p; p = path->aepos; path->aepos = path->bepos; path->bepos = p; if (full) for (i = 0; i < tlen; i++) trace[i] = - (trace[i]); } align->aseq = bseq; align->bseq = aseq; align->alen = blen; align->blen = alen; } /****************************************************************************************\ * * * ALIGNMENT PRINTING * * * \****************************************************************************************/ /* Complement the sequence in fragment aseq. The operation does the complementation/reversal in place. Calling it a second time on a given fragment restores it to its original state. */ void Complement_Seq(char *aseq, int len) { char *s, *t; int c; s = aseq; t = aseq + (len-1); while (s < t) { c = 3 - *s; *s++ = (char) (3 - *t); *t-- = (char) c; } if (s == t) *s = (char) (3 - *s); } /* Print an alignment to file between a and b given in trace (unpacked). Prefix gives the length of the initial prefix of a that is unaligned. 
*/ static char ToL[8] = { 'a', 'c', 'g', 't', '.', '[', ']', '-' }; static char ToU[8] = { 'A', 'C', 'G', 'T', '.', '[', ']', '-' }; int Print_Alignment(FILE *file, Alignment *align, Work_Data *ework, int indent, int width, int border, int upper, int coord) { _Work_Data *work = (_Work_Data *) ework; int *trace = align->path->trace; int tlen = align->path->tlen; char *Abuf, *Bbuf, *Dbuf; int i, j, o; char *a, *b; char mtag, dtag; int prefa, prefb; int aend, bend; int sa, sb; int match, diff; char *N2A; if (trace == NULL) return (0); #ifdef SHOW_TRACE fprintf(file,"\nTrace:\n"); for (i = 0; i < tlen; i++) fprintf(file," %3d\n",trace[i]); #endif o = sizeof(char)*3*(width+1); if (o > work->vecmax) if (enlarge_vector(work,o)) EXIT(1); if (upper) N2A = ToU; else N2A = ToL; Abuf = (char *) work->vector; Bbuf = Abuf + (width+1); Dbuf = Bbuf + (width+1); aend = align->path->aepos; bend = align->path->bepos; Abuf[width] = Bbuf[width] = Dbuf[width] = '\0'; /* buffer/output next column */ #define COLUMN(x,y) \ { int u, v; \ if (o >= width) \ { fprintf(file,"\n"); \ fprintf(file,"%*s",indent,""); \ if (coord > 0) \ { if (sa <= aend) \ fprintf(file," %*d",coord,sa); \ else \ fprintf(file," %*s",coord,""); \ fprintf(file," %s\n",Abuf); \ fprintf(file,"%*s %*s %s\n",indent,"",coord,"",Dbuf); \ fprintf(file,"%*s",indent,""); \ if (sb <= bend) \ fprintf(file," %*d",coord,sb); \ else \ fprintf(file," %*s",coord,""); \ fprintf(file," %s",Bbuf); \ } \ else \ { fprintf(file," %s\n",Abuf); \ fprintf(file,"%*s %s\n",indent,"",Dbuf); \ fprintf(file,"%*s %s",indent,"",Bbuf); \ } \ fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match)); \ o = 0; \ sa = i; \ sb = j; \ match = diff = 0; \ } \ u = (x); \ v = (y); \ if (u == 4 || v == 4) \ Dbuf[o] = ' '; \ else if (u == v) \ Dbuf[o] = mtag; \ else \ Dbuf[o] = dtag; \ Abuf[o] = N2A[u]; \ Bbuf[o] = N2A[v]; \ o += 1; \ } a = align->aseq - 1; b = align->bseq - 1; o = 0; i = j = 1; prefa = align->path->abpos; prefb = align->path->bbpos; if (prefa > 
border) { i = prefa-(border-1); prefa = border; } if (prefb > border) { j = prefb-(border-1); prefb = border; } sa = i; sb = j; mtag = ':'; dtag = ':'; while (prefa > prefb) { COLUMN(a[i],4) i += 1; prefa -= 1; } while (prefb > prefa) { COLUMN(4,b[j]) j += 1; prefb -= 1; } while (prefa > 0) { COLUMN(a[i],b[j]) i += 1; j += 1; prefa -= 1; } mtag = '['; if (prefb > 0) COLUMN(5,5) mtag = '|'; dtag = '*'; match = diff = 0; { int p, c; /* Output columns of alignment til reach trace end */ for (c = 0; c < tlen; c++) if ((p = trace[c]) < 0) { p = -p; while (i != p) { COLUMN(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } COLUMN(7,b[j]) j += 1; diff += 1; } else { while (j != p) { COLUMN(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } COLUMN(a[i],7) i += 1; diff += 1; } p = align->path->aepos; while (i <= p) { COLUMN(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } } { int c; /* Output remaining column including unaligned suffix */ mtag = ']'; if (a[i] != 4 && b[j] != 4 && border > 0) COLUMN(6,6) mtag = ':'; dtag = ':'; c = 0; while (c < border && (a[i] != 4 || b[j] != 4)) { if (a[i] != 4) if (b[j] != 4) { COLUMN(a[i],b[j]) i += 1; j += 1; } else { COLUMN(a[i],4) i += 1; } else { COLUMN(4,b[j]) j += 1; } c += 1; } } /* Print remainder of buffered col.s */ fprintf(file,"\n"); fprintf(file,"%*s",indent,""); if (coord > 0) { if (sa <= aend) fprintf(file," %*d",coord,sa); else fprintf(file," %*s",coord,""); fprintf(file," %.*s\n",o,Abuf); fprintf(file,"%*s %*s %.*s\n",indent,"",coord,"",o,Dbuf); fprintf(file,"%*s",indent,""); if (sb <= bend) fprintf(file," %*d",coord,sb); else fprintf(file," %*s",coord,""); fprintf(file," %.*s",o,Bbuf); } else { fprintf(file," %.*s\n",o,Abuf); fprintf(file,"%*s %.*s\n",indent,"",o,Dbuf); fprintf(file,"%*s %.*s",indent,"",o,Bbuf); } if (diff+match > 0) fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match)); else fprintf(file,"\n"); fflush(file); return (0); } int 
Print_Reference(FILE *file, Alignment *align, Work_Data *ework, int indent, int block, int border, int upper, int coord) { _Work_Data *work = (_Work_Data *) ework; int *trace = align->path->trace; int tlen = align->path->tlen; char *Abuf, *Bbuf, *Dbuf; int i, j, o; char *a, *b; char mtag, dtag; int prefa, prefb; int aend, bend; int sa, sb, s0; int match, diff; char *N2A; int vmax; if (trace == NULL) return (0); #ifdef SHOW_TRACE fprintf(file,"\nTrace:\n"); for (i = 0; i < tlen; i++) fprintf(file," %3d\n",trace[i]); #endif vmax = work->vecmax/3; o = sizeof(char)*6*(block+1); if (o > vmax) { if (enlarge_vector(work,3*o)) EXIT(1); vmax = work->vecmax/3; } Abuf = (char *) work->vector; Bbuf = Abuf + vmax; Dbuf = Bbuf + vmax; if (upper) N2A = ToU; else N2A = ToL; aend = align->path->aepos; bend = align->path->bepos; #define BLOCK(x,y) \ { int u, v; \ if (i%block == 1 && i != s0 && x < 4 && o > 0) \ { fprintf(file,"\n"); \ fprintf(file,"%*s",indent,""); \ if (coord > 0) \ { if (sa <= aend) \ fprintf(file," %*d",coord,sa); \ else \ fprintf(file," %*s",coord,""); \ fprintf(file," %.*s\n",o,Abuf); \ fprintf(file,"%*s %*s %.*s\n",indent,"",coord,"",o,Dbuf); \ fprintf(file,"%*s",indent,""); \ if (sb <= bend) \ fprintf(file," %*d",coord,sb); \ else \ fprintf(file," %*s",coord,""); \ fprintf(file," %.*s",o,Bbuf); \ } \ else \ { fprintf(file," %.*s\n",o,Abuf); \ fprintf(file,"%*s %.*s\n",indent,"",o,Dbuf); \ fprintf(file,"%*s %.*s",indent,"",o,Bbuf); \ } \ fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match)); \ o = 0; \ sa = i; \ sb = j; \ match = diff = 0; \ } \ u = (x); \ v = (y); \ if (u == 4 || v == 4) \ Dbuf[o] = ' '; \ else if (u == v) \ Dbuf[o] = mtag; \ else \ Dbuf[o] = dtag; \ Abuf[o] = N2A[u]; \ Bbuf[o] = N2A[v]; \ o += 1; \ if (o >= vmax) \ { if (enlarge_vector(work,3*o)) \ EXIT(1); \ vmax = work->vecmax/3; \ memmove(work->vector+2*vmax,Dbuf,o); \ memmove(work->vector+vmax,Bbuf,o); \ memmove(work->vector,Abuf,o); \ Abuf = (char *) work->vector; \ Bbuf = Abuf + vmax; \ 
Dbuf = Bbuf + vmax; \ } \ } a = align->aseq - 1; b = align->bseq - 1; o = 0; i = j = 1; prefa = align->path->abpos; prefb = align->path->bbpos; if (prefa > border) { i = prefa-(border-1); prefa = border; } if (prefb > border) { j = prefb-(border-1); prefb = border; } s0 = i; sa = i; sb = j; mtag = ':'; dtag = ':'; while (prefa > prefb) { BLOCK(a[i],4) i += 1; prefa -= 1; } while (prefb > prefa) { BLOCK(4,b[j]) j += 1; prefb -= 1; } while (prefa > 0) { BLOCK(a[i],b[j]) i += 1; j += 1; prefa -= 1; } mtag = '['; if (prefb > 0) BLOCK(5,5) mtag = '|'; dtag = '*'; match = diff = 0; { int p, c; /* Output columns of alignment til reach trace end */ for (c = 0; c < tlen; c++) if ((p = trace[c]) < 0) { p = -p; while (i != p) { BLOCK(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } BLOCK(7,b[j]) j += 1; diff += 1; } else { while (j != p) { BLOCK(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } BLOCK(a[i],7) i += 1; diff += 1; } p = align->path->aepos; while (i <= p) { BLOCK(a[i],b[j]) if (a[i] == b[j]) match += 1; else diff += 1; i += 1; j += 1; } } { int c; /* Output remaining column including unaligned suffix */ mtag = ']'; if (a[i] != 4 && b[j] != 4 && border > 0) BLOCK(6,6) mtag = ':'; dtag = ':'; c = 0; while (c < border && (a[i] != 4 || b[j] != 4)) { if (a[i] != 4) if (b[j] != 4) { BLOCK(a[i],b[j]) i += 1; j += 1; } else { BLOCK(a[i],4) i += 1; } else { BLOCK(4,b[j]) j += 1; } c += 1; } } /* Print remainder of buffered col.s */ fprintf(file,"\n"); fprintf(file,"%*s",indent,""); if (coord > 0) { if (sa <= aend) fprintf(file," %*d",coord,sa); else fprintf(file," %*s",coord,""); fprintf(file," %.*s\n",o,Abuf); fprintf(file,"%*s %*s %.*s\n",indent,"",coord,"",o,Dbuf); fprintf(file,"%*s",indent,""); if (sb <= bend) fprintf(file," %*d",coord,sb); else fprintf(file," %*s",coord,""); fprintf(file," %.*s",o,Bbuf); } else { fprintf(file," %.*s\n",o,Abuf); fprintf(file,"%*s %.*s\n",indent,"",o,Dbuf); fprintf(file,"%*s 
%.*s",indent,"",o,Bbuf); } if (diff+match > 0) fprintf(file," %5.1f%%\n",(100.*diff)/(diff+match)); else fprintf(file,"\n"); fflush(file); return (0); } /* Print an ASCII representation of the overlap in align between fragments a and b to given file. */ static inline void repchar(FILE *file, int symbol, int rep) { while (rep-- > 0) fputc(symbol,file); } void Alignment_Cartoon(FILE *file, Alignment *align, int indent, int coord) { int alen = align->alen; int blen = align->blen; Path *path = align->path; int comp = COMP(align->flags); int w; fprintf(file,"%*s",indent,""); if (path->abpos > 0) fprintf(file," %*d ",coord,path->abpos); else fprintf(file,"%*s",coord+5,""); if (path->aepos < alen) fprintf(file,"%*s%d",coord+8,"",alen-path->aepos); fprintf(file,"\n"); fprintf(file,"%*s",indent,""); if (path->abpos > 0) { fprintf(file,"A "); w = Number_Digits((int64) path->abpos); repchar(file,' ',coord-w); repchar(file,'=',w+3); fputc('+',file); repchar(file,'-',coord+5); } else { fprintf(file,"A %*s",coord+4,""); repchar(file,'-',coord+5); } if (path->aepos < alen) { fputc('+',file); w = Number_Digits((int64) (alen-path->aepos)); repchar(file,'=',w+2); fputc('>',file); repchar(file,' ',w); } else { fputc('>',file); repchar(file,' ',coord+3); } { int asub, bsub; asub = path->aepos - path->abpos; bsub = path->bepos - path->bbpos; fprintf(file," dif/(len1+len2) = %d/(%d+%d) = %5.2f%%\n", path->diffs,asub,bsub,(200.*path->diffs)/(asub+bsub)); } { int sym1e, sym2e; int sym1p, sym2p; if (comp > 0) { sym1p = '<'; sym2p = '-'; sym1e = '<'; sym2e = '='; } else { sym1p = '-'; sym2p = '>'; sym1e = '='; sym2e = '>'; } fprintf(file,"%*s",indent,""); if (path->bbpos > 0) { fprintf(file,"B "); w = Number_Digits((int64) path->bbpos); repchar(file,' ',coord-w); fputc(sym1e,file); repchar(file,'=',w+2); fputc('+',file); repchar(file,'-',coord+5); } else { fprintf(file,"B "); repchar(file,' ',coord+3); fputc(sym1p,file); repchar(file,'-',coord+5); } if (path->bepos < blen) { 
fprintf(file,"+"); w = Number_Digits((int64) (blen-path->bepos)); repchar(file,'=',w+2); fprintf(file,"%c\n",sym2e); } else fprintf(file,"%c\n",sym2p); } fprintf(file,"%*s",indent,""); if (path->bbpos > 0) fprintf(file," %*d ",coord,path->bbpos); else fprintf(file,"%*s",coord+5,""); if (path->bepos < blen) fprintf(file,"%*s%d",coord+8,"",blen-path->bepos); fprintf(file,"\n"); fflush(file); } /****************************************************************************************\ * * * O(ND) trace algorithm * * * \****************************************************************************************/ #ifdef DEBUG_AWAVE static void print_awave(int *V, int low, int hgh) { int k; printf(" [%6d,%6d]: ",low,hgh); for (k = low; k <= hgh; k++) printf(" %3d",V[k]); printf("\n"); fflush(stdout); } #endif #ifdef DEBUG_ALIGN static int depth = 0; #endif typedef struct { int *Stop; // Ongoing stack of alignment indels char *Aabs, *Babs; // Absolute base of A and B sequences int **PVF, **PHF; // List of waves for iterative np algorithms int mida, midb; // mid point division for mid-point algorithms int *VF, *VB; // Forward/Reverse waves for nd algorithms // (defunct: were used for O(nd) algorithms) } Trace_Waves; static int dandc_nd(char *A, int M, char *B, int N, Trace_Waves *wave) { int x, y; int D; #ifdef DEBUG_ALIGN printf("%*s %ld,%ld: %d vs %d\n",depth,"",A-wave->Aabs,B-wave->Babs,M,N); #endif if (M <= 0) { x = (wave->Aabs-A)-1; for (y = 1; y <= N; y++) { *wave->Stop++ = x; #ifdef DEBUG_SCRIPT printf("%*s *I %ld(%ld)\n",depth,"",y+(B-wave->Babs),(A-wave->Aabs)+1); #endif } return (N); } if (N <= 0) { y = (B-wave->Babs)+1; for (x = 1; x <= M; x++) { *wave->Stop++ = y; #ifdef DEBUG_SCRIPT printf("%*s *D %ld(%ld)\n",depth,"",x+(A-wave->Aabs),(B-wave->Babs)+1); #endif } return (M); } { int *VF = wave->VF; int *VB = wave->VB; int flow; // fhgh == D ! 
int blow, bhgh; char *a; y = 0; if (N < M) while (y < N && B[y] == A[y]) y += 1; else { while (y < M && B[y] == A[y]) y += 1; if (y >= M && N == M) return (0); } flow = 0; VF[0] = y; VF[-1] = -2; x = N-M; a = A-x; y = N-1; if (N > M) while (y >= x && B[y] == a[y]) y -= 1; else while (y >= 0 && B[y] == a[y]) y -= 1; blow = bhgh = -x; VB += x; VB[blow] = y; VB[blow-1] = N+1; for (D = 1; 1; D += 1) { int k, r; int am, ac, ap; // Forward wave flow -= 1; am = ac = VF[flow-1] = -2; a = A + D; x = M - D; for (k = D; k >= flow; k--) { ap = ac; ac = am+1; am = VF[k-1]; if (ac < am) if (ap < am) y = am; else y = ap; else if (ap < ac) y = ac; else y = ap; if (blow <= k && k <= bhgh) { r = VB[k]; if (y > r) { D = (D<<1)-1; if (ap > r) y = ap; else if (ac > r) y = ac; else y = r+1; x = k+y; goto OVERLAP2; } } if (N < x) while (y < N && B[y] == a[y]) y += 1; else while (y < x && B[y] == a[y]) y += 1; VF[k] = y; a -= 1; x += 1; } #ifdef DEBUG_AWAVE print_awave(VF,flow,D); #endif // Reverse Wave bhgh += 1; blow -= 1; am = ac = VB[blow-1] = N+1; a = A + bhgh; x = -bhgh; for (k = bhgh; k >= blow; k--) { ap = ac+1; ac = am; am = VB[k-1]; if (ac > am) if (ap > am) y = am; else y = ap; else if (ap > ac) y = ac; else y = ap; if (flow <= k && k <= D) { r = VF[k]; if (y <= r) { D = (D << 1); if (ap <= r) y = ap; else if (ac <= r) y = ac; else y = r; x = k+y; goto OVERLAP2; } } y -= 1; if (x > 0) while (y >= x && B[y] == a[y]) y -= 1; else while (y >= 0 && B[y] == a[y]) y -= 1; VB[k] = y; a -= 1; x += 1; } #ifdef DEBUG_AWAVE print_awave(VB,blow,bhgh); #endif } } OVERLAP2: #ifdef DEBUG_ALIGN printf("%*s (%d,%d) @ %d\n",depth,"",x,y,D); fflush(stdout); #endif if (D > 1) { #ifdef DEBUG_ALIGN depth += 2; #endif dandc_nd(A,x,B,y,wave); dandc_nd(A+x,M-x,B+y,N-y,wave); #ifdef DEBUG_ALIGN depth -= 2; #endif } else if (D == 1) { if (M > N) { *wave->Stop++ = (B-wave->Babs)+y+1; #ifdef DEBUG_SCRIPT printf("%*s D %ld(%ld)\n",depth,"",(A-wave->Aabs)+x,(B-wave->Babs)+y+1); #endif } else if (M < N) { 
*wave->Stop++ = (wave->Aabs-A)-x-1; #ifdef DEBUG_SCRIPT printf("%*s I %ld(%ld)\n",depth,"",(B-wave->Babs)+y,(A-wave->Aabs)+x+1); #endif } #ifdef DEBUG_SCRIPT else printf("%*s %ld S %ld\n",depth,"",(wave->Aabs-A)+x,(B-wave->Babs)+y); #endif } return (D); } static int Compute_Trace_ND_ALL(Alignment *align, Work_Data *ework) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; int L, D; int asub, bsub; Path *path; int *trace; path = align->path; asub = path->aepos-path->abpos; bsub = path->bepos-path->bbpos; if (asub < bsub) L = bsub; else L = asub; L *= sizeof(int); if (L > work->tramax) if (enlarge_trace(work,L)) EXIT(1); trace = wave.Stop = ((int *) work->trace); D = 2*(path->diffs + 4)*sizeof(int); if (D > work->vecmax) if (enlarge_vector(work,D)) EXIT(1); D = (path->diffs+3)/2; wave.VF = ((int *) work->vector) + (D+1); wave.VB = wave.VF + (2*D+1); wave.Aabs = align->aseq; wave.Babs = align->bseq; path->diffs = dandc_nd(align->aseq+path->abpos,path->aepos-path->abpos, align->bseq+path->bbpos,path->bepos-path->bbpos,&wave); path->trace = trace; path->tlen = wave.Stop - trace; return (0); } /****************************************************************************************\ * * * O(NP) tracing algorithms * * * \****************************************************************************************/ /* Iterative O(np) algorithm for finding the alignment between two substrings (specified by a Path record). The variation includes handling substitutions and guarantees to find left-most alignments so that low complexity runs are always aligned in the same way. 
*/ #ifdef DEBUG_ALIGN static int ToA[4] = { 'a', 'c', 'g', 't' }; #endif static int iter_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode) { int **PVF = wave->PVF; int **PHF = wave->PHF; int D; int del = M-N; { int *F0, *F1, *F2; int *HF; int low, hgh; int posl, posh; #ifdef DEBUG_ALIGN printf("\n BASE %ld,%ld: %d vs %d\n",A-wave->Aabs,B-wave->Babs,M,N); printf(" A = "); for (D = 0; D < M; D++) printf("%c",ToA[(int) A[D]]); printf("\n"); printf(" B = "); for (D = 0; D < N; D++) printf("%c",ToA[(int) B[D]]); printf("\n"); #endif if (del >= 0) { low = 0; hgh = del; } else { low = del; hgh = 0; } posl = -INT32_MAX; posh = INT32_MAX; if (wave->Aabs == wave->Babs) { if (B == A) { EPRINTF(EPLACE,"Error: self comparison starts on diagonal 0 (Compute_Trace)\n"); EXIT(-1); } else if (B < A) posl = (B-A)+1; else posh = (B-A)-1; } F1 = PVF[-2]; F0 = PVF[-1]; for (D = low-1; D <= hgh+1; D++) F1[D] = F0[D] = -2; F0[0] = -1; low += 1; hgh -= 1; for (D = 0; 1; D += 1) { int k, i, j; int am, ac, ap; char *a; F2 = F1; F1 = F0; F0 = PVF[D]; HF = PHF[D]; if ((D & 0x1) == 0) { if (low > posl) low -= 1; if (hgh < posh) hgh += 1; } F0[hgh+1] = F0[low-1] = -2; #define FS_MOVE(mdir,pdir) \ ac = F1[k]+1; \ if (ac < am) \ if (ap < am) \ { HF[k] = mdir; \ j = am; \ } \ else \ { HF[k] = pdir; \ j = ap; \ } \ else \ if (ap < ac) \ { HF[k] = 0; \ j = ac; \ } \ else \ { HF[k] = pdir; \ j = ap; \ } \ \ if (N < i) \ while (j < N && B[j] == a[j]) \ j += 1; \ else \ while (j < i && B[j] == a[j]) \ j += 1; \ F0[k] = j; j = -2; a = A + hgh; i = M - hgh; for (k = hgh; k > del; k--) { ap = j+1; am = F2[k-1]; FS_MOVE(-1,4) a -= 1; i += 1; } j = -2; a = A + low; i = M - low; for (k = low; k < del; k++) { ap = F2[k+1]+1; am = j; FS_MOVE(2,1) a += 1; i -= 1; } ap = F0[del+1]+1; am = j; FS_MOVE(2,4) #ifdef DEBUG_AWAVE print_awave(F0,low,hgh); print_awave(HF,low,hgh); #endif if (F0[del] >= N) break; } } { int k, h, m, e, c; int ap = (wave->Aabs-A)-1; int bp = (B-wave->Babs)+1; PHF[0][0] = 3; c = 
N; k = del; e = PHF[D][k]; PHF[D][k] = 3; if (mode == UPPERMOST) while (e != 3) { h = k+e; if (e > 1) h -= 3; else if (e == 0) D -= 1; else D -= 2; if (h < k) // => e = -1 or 2, UPPERMOST { char *a; a = A + k; if (k < 0) m = -k; else m = 0; if (PVF[D][h] <= c) c = PVF[D][h]-1; while (c >= m && a[c] == B[c]) c -= 1; if (e == -1) // => edge is 2, others are 1, and 0 { if (c <= PVF[D+2][k+1]) { e = 4; h = k+1; D = D+2; } else if (c == PVF[D+1][k]) { e = 0; h = k; D = D+1; } else PVF[D][h] = c+1; } else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise) { if (k == del) m = D; else m = D-2; if (c <= PVF[m][k+1]) { if (k == del) e = 4; else e = 1; h = k+1; D = m; } else if (c == PVF[D-1][k]) { e = 0; h = k; D = D-1; } else PVF[D][h] = c+1; } } m = PHF[D][h]; PHF[D][h] = e; e = m; k = h; } else if (mode == LOWERMOST) while (e != 3) { h = k+e; if (e > 1) h -= 3; else if (e == 0) D -= 1; else D -= 2; if (h > k) // => e = 1 or 4, LOWERMOST { char *a; a = A + k; if (k < 0) m = -k; else m = 0; if (PVF[D][h] < c) c = PVF[D][h]; while (c >= m && a[c] == B[c]) c -= 1; if (e == 1) // => edge is 2, others are 1, and 0 { if (c < PVF[D+2][k-1]) { e = 2; h = k-1; D = D+2; } else if (c == PVF[D+1][k]) { e = 0; h = k; D = D+1; } else PVF[D][h] = c--; } else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise) { if (k == del) m = D; else m = D-2; if (c < PVF[m][k-1]) { if (k == del) e = 2; else e = -1; h = k-1; D = m; } else if (c == PVF[D-1][k]) { e = 0; h = k; D = D-1; } else PVF[D][h] = c--; } } m = PHF[D][h]; PHF[D][h] = e; e = m; k = h; } else // mode == GREEDIEST while (e != 3) { h = k+e; if (e > 1) h -= 3; else if (e == 0) D -= 1; else D -= 2; m = PHF[D][h]; PHF[D][h] = e; e = m; k = h; } k = D = 0; e = PHF[D][k]; while (e != 3) { h = k-e; c = PVF[D][k]; if (e > 1) h += 3; else if (e == 0) D += 1; else D += 2; #ifdef DEBUG_SCRIPT if (h > k) printf(" D %d(%d)\n",(c-k)-(ap-1),c+bp); else if (h < k) printf(" I %d(%d)\n",c+(bp-1),(c+k)-ap); else printf(" %d 
S %d\n",(c+k)-(ap+1),c+(bp-1)); #endif if (h > k) *wave->Stop++ = bp+c; else if (h < k) *wave->Stop++ = ap-(c+k); k = h; e = PHF[D][h]; } } return (D + abs(del)); } static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave, int mode) { int **PVF = wave->PVF; int **PHF = wave->PHF; int D; int del = M-N; { int *F0, *F1, *F2; int *HF; int low, hgh; int posl, posh; #ifdef DEBUG_ALIGN printf("\n%*s BASE %ld,%ld: %d vs %d\n",depth,"",A-wave->Aabs,B-wave->Babs,M,N); printf("%*s A = ",depth,""); for (D = 0; D < M; D++) printf("%c",ToA[(int) A[D]]); printf("\n"); printf("%*s B = ",depth,""); for (D = 0; D < N; D++) printf("%c",ToA[(int) B[D]]); printf("\n"); #endif if (del >= 0) { low = 0; hgh = del; } else { low = del; hgh = 0; } posl = -INT32_MAX; posh = INT32_MAX; if (wave->Aabs == wave->Babs) { if (B == A) { EPRINTF(EPLACE,"Error: self comparison starts on diagonal 0 (Compute_Trace)\n"); EXIT(1); } else if (B < A) posl = (B-A)+1; else posh = (B-A)-1; } F1 = PVF[-2]; F0 = PVF[-1]; for (D = low-1; D <= hgh+1; D++) F1[D] = F0[D] = -2; F0[0] = -1; low += 1; hgh -= 1; for (D = 0; 1; D += 1) { int k, i, j; int am, ac, ap; char *a; F2 = F1; F1 = F0; F0 = PVF[D]; HF = PHF[D]; if ((D & 0x1) == 0) { if (low > posl) low -= 1; if (hgh < posh) hgh += 1; } F0[hgh+1] = F0[low-1] = -2; j = -2; a = A + hgh; i = M - hgh; for (k = hgh; k > del; k--) { ap = j+1; am = F2[k-1]; FS_MOVE(-1,4) a -= 1; i += 1; } j = -2; a = A + low; i = M - low; for (k = low; k < del; k++) { ap = F2[k+1]+1; am = j; FS_MOVE(2,1) a += 1; i -= 1; } ap = F0[del+1]+1; am = j; FS_MOVE(2,4) #ifdef DEBUG_AWAVE print_awave(F0,low,hgh); print_awave(HF,low,hgh); #endif if (F0[del] >= N) break; } } { int k, h, m, e, c; int d, f; d = D + abs(del); c = N; k = del; if (mode == UPPERMOST) for (f = d/2; d > f; d--) { e = PHF[D][k]; h = k+e; if (e > 1) h -= 3; else if (e == 0) D -= 1; else D -= 2; if (h < k) // => e = -1 or 2, UPPERMOST { char *a; a = A + k; if (k < 0) m = -k; else m = 0; if (PVF[D][h] <= c) c = 
PVF[D][h]-1; while (c >= m && a[c] == B[c]) c -= 1; if (e == -1) // => edge is 2, others are 1, and 0 { if (c <= PVF[D+2][k+1]) { e = 4; h = k+1; D = D+2; } else if (c == PVF[D+1][k]) { e = 0; h = k; D = D+1; } else PVF[D][h] = c+1; } else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise) { if (k == del) m = D; else m = D-2; if (c <= PVF[m][k+1]) { if (k == del) e = 4; else e = 1; h = k+1; D = m; } else if (c == PVF[D-1][k]) { e = 0; h = k; D = D-1; } else PVF[D][h] = c+1; } } k = h; } else if (mode == LOWERMOST) for (f = d/2; d > f; d--) { e = PHF[D][k]; h = k+e; if (e > 1) h -= 3; else if (e == 0) D -= 1; else D -= 2; if (h > k) // => e = 1 or 4, LOWERMOST { char *a; a = A + k; if (k < 0) m = -k; else m = 0; if (PVF[D][h] < c) c = PVF[D][h]; while (c >= m && a[c] == B[c]) c -= 1; if (e == 1) // => edge is 2, others are 1, and 0 { if (c < PVF[D+2][k-1]) { e = 2; h = k-1; D = D+2; } else if (c == PVF[D+1][k]) { e = 0; h = k; D = D+1; } else PVF[D][h] = c--; } else // => edge is 0, others are 1, and 2 (if k != del), 0 (otherwise) { if (k == del) m = D; else m = D-2; if (c < PVF[m][k-1]) { if (k == del) e = 2; else e = -1; h = k-1; D = m; } else if (c == PVF[D-1][k]) { e = 0; h = k; D = D-1; } else PVF[D][h] = c--; } } k = h; } else // mode == GREEDIEST for (f = d/2; d > f; d--) { e = PHF[D][k]; h = k+e; if (e > 1) h -= 3; else if (e == 0) D -= 1; else D -= 2; k = h; } wave->midb = (B-wave->Babs) + PVF[D][k]; wave->mida = (A-wave->Aabs) + k + PVF[D][k]; } return (0); } /****************************************************************************************\ * * * COMPUTE_TRACE FLAVORS * * * \****************************************************************************************/ int Compute_Trace_ALL(Alignment *align, Work_Data *ework) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; Path *path; char *aseq, *bseq; int M, N, D; path = align->path; aseq = align->aseq; bseq = align->bseq; M = path->aepos-path->abpos; N = 
path->bepos-path->bbpos; { int64 s; int d; int dmax; int **PVF, **PHF; if (M < N) s = N; else s = M; s *= sizeof(int); if (s > work->tramax) if (enlarge_trace(work,s)) EXIT(1); dmax = path->diffs - abs(M-N); s = (dmax+3)*2*((M+N+3)*sizeof(int) + sizeof(int *)); if (s > 256000000) return (Compute_Trace_ND_ALL(align,ework)); if (s > work->vecmax) if (enlarge_vector(work,s)) EXIT(1); wave.PVF = PVF = ((int **) (work->vector)) + 2; wave.PHF = PHF = PVF + (dmax+3); s = M+N+3; PVF[-2] = ((int *) (PHF + (dmax+1))) + (N+1); for (d = -1; d <= dmax; d++) PVF[d] = PVF[d-1] + s; PHF[-2] = PVF[dmax] + s; for (d = -1; d <= dmax; d++) PHF[d] = PHF[d-1] + s; } wave.Stop = ((int *) work->trace); wave.Aabs = aseq; wave.Babs = bseq; D = iter_np(aseq+path->abpos,M,bseq+path->bbpos,N,&wave,GREEDIEST); if (D < 0) EXIT(1); path->diffs = D; path->trace = work->trace; path->tlen = wave.Stop - ((int *) path->trace); return (0); } int Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing, int mode) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; Path *path; char *aseq, *bseq; uint16 *points; int tlen; int ab, bb; int ae, be; int diffs; path = align->path; aseq = align->aseq; bseq = align->bseq; tlen = path->tlen; points = (uint16 *) path->trace; { int64 s; int d; int M, N; int dmax, nmax; int **PVF, **PHF; M = path->aepos-path->abpos; N = path->bepos-path->bbpos; if (M < N) s = N*sizeof(int); else s = M*sizeof(int); if (s > work->tramax) if (enlarge_trace(work,s)) EXIT(1); nmax = 0; dmax = 0; for (d = 1; d < tlen; d += 2) { if (points[d-1] > dmax) dmax = points[d-1]; if (points[d] > nmax) nmax = points[d]; } if (tlen <= 1) nmax = N; s = (dmax+3)*2*((trace_spacing+nmax+3)*sizeof(int) + sizeof(int *)); if (s > work->vecmax) if (enlarge_vector(work,s)) EXIT(1); wave.PVF = PVF = ((int **) (work->vector)) + 2; wave.PHF = PHF = PVF + (dmax+3); s = trace_spacing+nmax+3; PVF[-2] = ((int *) (PHF + (dmax+1))) + (nmax+1); for (d = -1; d <= dmax; d++) PVF[d] = PVF[d-1] + 
s; PHF[-2] = PVF[dmax] + s; for (d = -1; d <= dmax; d++) PHF[d] = PHF[d-1] + s; } wave.Stop = (int *) (work->trace); wave.Aabs = aseq; wave.Babs = bseq; { int i, d; diffs = 0; ab = path->abpos; ae = (ab/trace_spacing)*trace_spacing; bb = path->bbpos; tlen -= 2; for (i = 1; i < tlen; i += 2) { ae = ae + trace_spacing; be = bb + points[i]; d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode); if (d < 0) EXIT(1); diffs += d; ab = ae; bb = be; } ae = path->aepos; be = path->bepos; d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode); if (d < 0) EXIT(1); diffs += d; } path->trace = work->trace; path->tlen = wave.Stop - ((int *) path->trace); path->diffs = diffs; return (0); } int Compute_Trace_MID(Alignment *align, Work_Data *ework, int trace_spacing, int mode) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; Path *path; char *aseq, *bseq; uint16 *points; int tlen; int ab, bb; int ae, be; int diffs; path = align->path; aseq = align->aseq; bseq = align->bseq; tlen = path->tlen; points = (uint16 *) path->trace; { int64 s; int d; int M, N; int dmax, nmax; int **PVF, **PHF; M = path->aepos-path->abpos; N = path->bepos-path->bbpos; if (M < N) s = N*sizeof(int); else s = M*sizeof(int); if (s > work->tramax) if (enlarge_trace(work,s)) EXIT(1); nmax = 0; dmax = 0; for (d = 1; d < tlen; d += 2) { if (points[d-1] > dmax) dmax = points[d-1]; if (points[d] > nmax) nmax = points[d]; } if (tlen <= 1) nmax = N; s = (dmax+3)*4*((trace_spacing+nmax+3)*sizeof(int) + sizeof(int *)); if (s > work->vecmax) if (enlarge_vector(work,s)) EXIT(1); wave.PVF = PVF = ((int **) (work->vector)) + 2; wave.PHF = PHF = PVF + (dmax+3); s = trace_spacing+nmax+3; PVF[-2] = ((int *) (PHF + (dmax+1))) + (nmax+1); for (d = -1; d <= dmax; d++) PVF[d] = PVF[d-1] + s; PHF[-2] = PVF[dmax] + s; for (d = -1; d <= dmax; d++) PHF[d] = PHF[d-1] + s; } wave.Stop = ((int *) work->trace); wave.Aabs = aseq; wave.Babs = bseq; { int i, d; int as, bs; int af, bf; diffs = 0; ab = as = af = path->abpos; ae = 
(ab/trace_spacing)*trace_spacing; bb = bs = bf = path->bbpos; tlen -= 2; for (i = 1; i < tlen; i += 2) { ae = ae + trace_spacing; be = bb + points[i]; if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode)) EXIT(1); af = wave.mida; bf = wave.midb; d = iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave,mode); if (d < 0) EXIT(1); diffs += d; ab = ae; bb = be; as = af; bs = bf; } ae = path->aepos; be = path->bepos; if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode)) EXIT(1); af = wave.mida; bf = wave.midb; d = iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave,mode); if (d < 0) EXIT(1); diffs += d; as = af; bs = bf; d += iter_np(aseq+af,ae-as,bseq+bf,be-bs,&wave,mode); if (d < 0) EXIT(1); diffs += d; } path->trace = work->trace; path->tlen = wave.Stop - ((int *) path->trace); path->diffs = diffs; return (0); } int Compute_Trace_IRR(Alignment *align, Work_Data *ework, int mode) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; Path *path; char *aseq, *bseq; uint16 *points; int tlen; int ab, bb; int ae, be; int diffs; path = align->path; aseq = align->aseq; bseq = align->bseq; tlen = path->tlen; points = (uint16 *) path->trace; { int64 s; int d; int M, N; int mmax, nmax, dmax; int **PVF, **PHF; M = path->aepos-path->abpos; N = path->bepos-path->bbpos; if (M < N) s = N*sizeof(int); else s = M*sizeof(int); if (s > work->tramax) if (enlarge_trace(work,s)) EXIT(1); nmax = mmax = 0; for (d = 0; d < tlen; d += 2) { if (points[d] > mmax) mmax = points[d]; if (points[d+1] > nmax) nmax = points[d+1]; } if (tlen <= 1) { mmax = M; nmax = N; } if (mmax > nmax) dmax = nmax; else dmax = mmax; s = (dmax+3)*2*((mmax+nmax+3)*sizeof(int) + sizeof(int *)); if (s > work->vecmax) if (enlarge_vector(work,s)) EXIT(1); wave.PVF = PVF = ((int **) (work->vector)) + 2; wave.PHF = PHF = PVF + (dmax+3); s = mmax+nmax+3; PVF[-2] = ((int *) (PHF + (dmax+1))) + (nmax+1); for (d = -1; d <= dmax; d++) PVF[d] = PVF[d-1] + s; PHF[-2] = PVF[dmax] + s; for (d = -1; d <= dmax; d++) PHF[d] = PHF[d-1] + s; } 
wave.Stop = (int *) (work->trace); wave.Aabs = aseq; wave.Babs = bseq; { int i, d; diffs = 0; ab = path->abpos; bb = path->bbpos; for (i = 0; i < tlen; i += 2) { ae = ab + points[i]; be = bb + points[i+1]; d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave,mode); if (d < 0) EXIT(1); diffs += d; ab = ae; bb = be; } } path->trace = work->trace; path->tlen = wave.Stop - ((int *) path->trace); path->diffs = diffs; return (0); } pbdagcon-0.3+20161121+ds/DALIGNER/LAindex.c0000644000175000017500000001215313026414545015703 0ustar afifafif/******************************************************************************************* * * Create an index with extension .las.idx for a .las file. * Utility expects the .las file to be sorted. * Header contains total # of trace points, max # of trace points for * a given overlap, max # of trace points in all the overlaps for a given aread, and * max # of overlaps for a given aread. The remainder are the offsets into each pile. * * Author: Gene Myers * Date : Sept 2015 * *******************************************************************************************/ #include #include #include #include #include #include #include #include "DB.h" #include "align.h" static char *Usage = "[-v] ..."; #define MEMORY 1000 // How many megabytes for output buffer int main(int argc, char *argv[]) { char *iblock; FILE *input, *output; int64 novl, bsize, ovlsize, ptrsize; int tspace, tbytes; char *pwd, *root; int64 tmax, ttot; int64 omax, smax; int64 odeg, sdeg; int i; int VERBOSE; // Process options { int j, k; int flags[128]; ARG_INIT("LAindex") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') { ARG_FLAGS("v") } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; if (argc <= 1) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // For each file do ptrsize = sizeof(void *); ovlsize = sizeof(Overlap) - ptrsize; bsize = MEMORY * 1000000ll; iblock = (char *) Malloc(bsize + ptrsize,"Allocating input block"); if (iblock == NULL) 
exit (1); iblock += ptrsize; for (i = 1; i < argc; i++) { pwd = PathTo(argv[i]); root = Root(argv[i],".las"); input = Fopen(Catenate(pwd,"/",root,".las"),"r"); if (input == NULL) exit (1); if (fread(&novl,sizeof(int64),1,input) != 1) SYSTEM_ERROR if (fread(&tspace,sizeof(int),1,input) != 1) SYSTEM_ERROR if (tspace <= TRACE_XOVR) tbytes = sizeof(uint8); else tbytes = sizeof(uint16); output = Fopen(Catenate(pwd,"/.",root,".las.idx"),"w"); if (output == NULL) exit (1); free(pwd); free(root); if (VERBOSE) { printf(" Indexing %s: ",root); Print_Number(novl,0,stdout); printf(" records ... "); fflush(stdout); } fwrite(&novl,sizeof(int64),1,output); fwrite(&novl,sizeof(int64),1,output); fwrite(&novl,sizeof(int64),1,output); fwrite(&novl,sizeof(int64),1,output); { int j, alst; Overlap *w; int64 tsize; int64 optr; char *iptr, *itop; int64 tlen; optr = sizeof(int64) + sizeof(int32); iptr = iblock; itop = iblock + fread(iblock,1,bsize,input); alst = -1; odeg = sdeg = 0; omax = smax = 0; tmax = ttot = 0; for (j = 0; j < novl; j++) { if (iptr + ovlsize > itop) { int64 remains = itop-iptr; if (remains > 0) memcpy(iblock,iptr,remains); iptr = iblock; itop = iblock + remains; itop += fread(itop,1,bsize-remains,input); } w = (Overlap *) (iptr - ptrsize); tlen = w->path.tlen; if (alst < 0) { fwrite(&optr,sizeof(int64),1,output); alst = w->aread; } else while (alst < w->aread) { if (sdeg > smax) smax = sdeg; if (odeg > omax) omax = odeg; fwrite(&optr,sizeof(int64),1,output); odeg = sdeg = 0; alst += 1; } if (tlen > tmax) tmax = tlen; ttot += tlen; odeg += 1; sdeg += tlen; iptr += ovlsize; tsize = tlen*tbytes; if (iptr + tsize > itop) { int64 remains = itop-iptr; if (remains > 0) memcpy(iblock,iptr,remains); iptr = iblock; itop = iblock + remains; itop += fread(itop,1,bsize-remains,input); } optr += ovlsize + tsize; iptr += tsize; } fwrite(&optr,sizeof(int64),1,output); } if (sdeg > smax) smax = sdeg; if (odeg > omax) omax = odeg; rewind(output); fwrite(&omax,sizeof(int64),1,output); 
fwrite(&ttot,sizeof(int64),1,output); fwrite(&smax,sizeof(int64),1,output); fwrite(&tmax,sizeof(int64),1,output); if (VERBOSE) { Print_Number(ttot,0,stdout); printf(" trace points\n"); fflush(stdout); } fclose(input); fclose(output); } free(iblock-ptrsize); exit (0); } pbdagcon-0.3+20161121+ds/DALIGNER/LA4Ice.c0000644000175000017500000004374113026414545015367 0ustar afifafif/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. * * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Utility for displaying the overlaps in a .las file in a variety of ways including * a minimal listing of intervals, a cartoon, and a full out alignment. * * Author: Gene Myers * Creation: July 2013 * Last Mod: Jan 2015 * *******************************************************************************************/ #include #include #include #include #include #include #include #include #include "DB.h" #include "align.h" static char *Usage[] = { "[-carmEUF] [-i] [-w] [-b] ", " [ [ ] [ ... ]" }; #define LAST_READ_SYMBOL '$' static int ORDER(const void *l, const void *r) { int x = *((int32 *) l); int y = *((int32 *) r); return (x-y); } int main(int argc, char *argv[]) { HITS_DB _db1, *db1 = &_db1; HITS_DB _db2, *db2 = &_db2; Overlap _ovl, *ovl = &_ovl; Alignment _aln, *aln = &_aln; FILE *input; int64 novl; int tspace, tbytes, small; int reps, *pts; int ALIGN, CARTOON, REFERENCE, FLIP; int INDENT, WIDTH, BORDER, UPPERCASE; int ISTWO; int ICE_FL; int M4OVL; // Process options { int i, j, k; int flags[128]; char *eptr; ARG_INIT("LA4Ice") INDENT = 4; WIDTH = 100; BORDER = 10; M4OVL = 0; ICE_FL = 0; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("carmEUF") break; case 'i': ARG_NON_NEGATIVE(INDENT,"Indent") break; case 'w': ARG_POSITIVE(WIDTH,"Alignment width") break; case 'b': ARG_NON_NEGATIVE(BORDER,"Alignment border") break; } else argv[j++] = argv[i]; argc = j; UPPERCASE = flags['U']; ALIGN = flags['a']; REFERENCE = flags['r']; CARTOON = flags['c']; FLIP = flags['F']; M4OVL = flags['m']; ICE_FL = flags['E']; if (argc <= 2) { fprintf(stderr,"Usage: %s 
%s\n",Prog_Name,Usage[0]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]); exit (1); } } // Open trimmed DB or DB pair { int status; char *pwd, *root; FILE *input; ISTWO = 0; status = Open_DB(argv[1],db1); if (status < 0) exit (1); if (db1->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } if (argc > 3) { pwd = PathTo(argv[3]); root = Root(argv[3],".las"); if ((input = fopen(Catenate(pwd,"/",root,".las"),"r")) != NULL) { ISTWO = 1; fclose(input); status = Open_DB(argv[2],db2); if (status < 0) exit (1); if (db2->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[2]); exit (1); } Trim_DB(db2); } else db2 = db1; free(root); free(pwd); } else db2 = db1; Trim_DB(db1); } // Process read index arguments into a sorted list of read ranges pts = (int *) Malloc(sizeof(int)*2*argc,"Allocating read parameters"); if (pts == NULL) exit (1); reps = 0; if (argc > 3+ISTWO) { int c, b, e; char *eptr, *fptr; for (c = 3+ISTWO; c < argc; c++) { if (argv[c][0] == LAST_READ_SYMBOL) { b = db1->nreads; eptr = argv[c]+1; } else b = strtol(argv[c],&eptr,10); if (eptr > argv[c]) { if (b == 0) { fprintf(stderr,"%s: 0 is not a valid index\n",Prog_Name); exit (1); } if (*eptr == '\0') { pts[reps++] = b; pts[reps++] = b; continue; } else if (*eptr == '-') { if (eptr[1] == LAST_READ_SYMBOL) { e = INT32_MAX; fptr = eptr+2; } else e = strtol(eptr+1,&fptr,10); if (fptr > eptr+1 && *fptr == 0 && eptr[1] != '-') { pts[reps++] = b; pts[reps++] = e; if (b > e) { fprintf(stderr,"%s: Empty range '%s'\n",Prog_Name,argv[c]); exit (1); } continue; } } } fprintf(stderr,"%s: argument '%s' is not an integer range\n",Prog_Name,argv[c]); exit (1); } qsort(pts,reps/2,sizeof(int64),ORDER); b = 0; for (c = 0; c < reps; c += 2) if (b > 0 && pts[b-1] >= pts[c]-1) { if (pts[c+1] > pts[b-1]) pts[b-1] = pts[c+1]; } else { pts[b++] = pts[c]; pts[b++] = pts[c+1]; } pts[b++] = INT32_MAX; reps = b; } else { pts[reps++] = 1; 
pts[reps++] = INT32_MAX; } // Initiate file reading and read (novl, tspace) header { char *over, *pwd, *root; pwd = PathTo(argv[2+ISTWO]); root = Root(argv[2+ISTWO],".las"); over = Catenate(pwd,"/",root,".las"); input = Fopen(over,"r"); if (input == NULL) exit (1); if (fread(&novl,sizeof(int64),1,input) != 1) SYSTEM_ERROR if (fread(&tspace,sizeof(int),1,input) != 1) SYSTEM_ERROR if (tspace <= TRACE_XOVR) { small = 1; tbytes = sizeof(uint8); } else { small = 0; tbytes = sizeof(uint16); } if (!(M4OVL)) { printf("\n%s: ",root); Print_Number(novl,0,stdout); printf(" records\n"); } free(pwd); free(root); } // Read the file and display selected records if (tspace > 0) { int j; uint16 *trace; Work_Data *work; int tmax; int in, npt, idx, ar; int64 tps; char *abuffer, *bbuffer; int ar_wide, br_wide; int ai_wide, bi_wide; int mn_wide, mx_wide; int tp_wide; aln->path = &(ovl->path); if (ALIGN || REFERENCE) { work = New_Work_Data(); abuffer = New_Read_Buffer(db1); bbuffer = New_Read_Buffer(db2); } else { abuffer = NULL; bbuffer = NULL; work = NULL; } tmax = 1000; trace = (uint16 *) Malloc(sizeof(uint16)*tmax,"Allocating trace vector"); if (trace == NULL) exit (1); in = 0; npt = pts[0]; idx = 1; ar_wide = Number_Digits((int64) db1->nreads); br_wide = Number_Digits((int64) db2->nreads); ai_wide = Number_Digits((int64) db1->maxlen); bi_wide = Number_Digits((int64) db2->maxlen); if (db1->maxlen < db2->maxlen) { mn_wide = ai_wide; mx_wide = bi_wide; tp_wide = Number_Digits((int64) db1->maxlen/tspace+2); } else { mn_wide = bi_wide; mx_wide = ai_wide; tp_wide = Number_Digits((int64) db2->maxlen/tspace+2); } ar_wide += (ar_wide-1)/3; br_wide += (br_wide-1)/3; ai_wide += (ai_wide-1)/3; bi_wide += (bi_wide-1)/3; mn_wide += (mn_wide-1)/3; tp_wide += (tp_wide-1)/3; if (FLIP) { int x; x = ar_wide; ar_wide = br_wide; br_wide = x; x = ai_wide; ai_wide = bi_wide; bi_wide = x; } // For each record do for (j = 0; j < novl; j++) // Read it in { Read_Overlap(input,ovl); if (ovl->path.tlen > tmax) 
{ tmax = ((int) 1.2*ovl->path.tlen) + 100; trace = (uint16 *) Realloc(trace,sizeof(uint16)*tmax,"Allocating trace vector"); if (trace == NULL) exit (1); } ovl->path.trace = (void *) trace; Read_Trace(input,ovl,tbytes); // Determine if it should be displayed ar = ovl->aread+1; if (in) { while (ar > npt) { npt = pts[idx++]; if (ar < npt) { in = 0; break; } npt = pts[idx++]; } } else { while (ar >= npt) { npt = pts[idx++]; if (ar <= npt) { in = 1; break; } npt = pts[idx++]; } } if (!in) continue; // move calculation of sStart and sEnd (bbpos, bepos) up here since both ICE and M4OVL uses it int64 bbpos, bepos; if (COMP(ovl->flags)) { bbpos = (int64) db2->reads[ovl->bread].rlen - (int64) ovl->path.bepos; bepos = (int64) db2->reads[ovl->bread].rlen - (int64) ovl->path.bbpos; } else { bbpos = (int64) ovl->path.bbpos; bepos = (int64) ovl->path.bepos; } if (ICE_FL) { // only contiue if it is a full-length-to-full-length mapping, as in: // (1) qStart < 200 and sStart < 200 // (2) qEnd + 50 > qLen and sEnd + 50 > qLen if (ovl->path.abpos > 200 || bbpos > 200) continue; if (ovl->path.aepos + 50 < db1->reads[ovl->aread].rlen) continue; if (bepos + 50 < db2->reads[ovl->bread].rlen) continue; } // Display it aln->alen = db1->reads[ovl->aread].rlen; aln->blen = db2->reads[ovl->bread].rlen; aln->flags = ovl->flags; tps = ((ovl->path.aepos-1)/tspace - ovl->path.abpos/tspace); if (M4OVL) { double acc; acc = 100-(200. 
* ovl->path.diffs)/( ovl->path.aepos - ovl->path.abpos + ovl->path.bepos - ovl->path.bbpos); printf("%09lld %09lld %lld %5.2f ", (int64) ovl->aread, (int64) ovl->bread, (int64) bbpos - (int64) bepos, acc); printf("0 %lld %lld %lld ", (int64) ovl->path.abpos, (int64) ovl->path.aepos, (int64) aln->alen); printf("%d %lld %lld %lld ", COMP(ovl->flags), bbpos, bepos, (int64) aln->blen); if ( ((int64) aln->blen < (int64) aln->alen) && ((int64) ovl->path.bbpos < 1) && ((int64) aln->blen - (int64) ovl->path.bepos < 1) ) { printf("contains\n"); } else if ( ((int64) aln->alen < (int64) aln->blen) && ((int64) ovl->path.abpos < 1) && ((int64) aln->alen - (int64) ovl->path.aepos < 1) ) { printf("contained\n"); } else { printf("overlap\n"); } } if (!M4OVL) { if (FLIP) { Flip_Alignment(aln,0); Print_Number((int64) ovl->bread+1,ar_wide+1,stdout); printf(" "); Print_Number((int64) ovl->aread+1,br_wide+1,stdout); } else { Print_Number((int64) ovl->aread+1,ar_wide+1,stdout); printf(" "); Print_Number((int64) ovl->bread+1,br_wide+1,stdout); } if (COMP(ovl->flags)) printf(" c"); else printf(" n"); printf(" ["); Print_Number((int64) ovl->path.abpos,ai_wide,stdout); printf(".."); Print_Number((int64) ovl->path.aepos,ai_wide,stdout); printf("] x ["); Print_Number((int64) ovl->path.bbpos,bi_wide,stdout); printf(".."); Print_Number((int64) ovl->path.bepos,bi_wide,stdout); printf("]"); } if (ALIGN || CARTOON || REFERENCE) { if (ALIGN || REFERENCE) { char *aseq, *bseq; int amin, amax; int bmin, bmax; if (FLIP) Flip_Alignment(aln,0); if (small) Decompress_TraceTo16(ovl); amin = ovl->path.abpos - BORDER; if (amin < 0) amin = 0; amax = ovl->path.aepos + BORDER; if (amax > aln->alen) amax = aln->alen; if (COMP(aln->flags)) { bmin = (aln->blen-ovl->path.bepos) - BORDER; if (bmin < 0) bmin = 0; bmax = (aln->blen-ovl->path.bbpos) + BORDER; if (bmax > aln->blen) bmax = aln->blen; } else { bmin = ovl->path.bbpos - BORDER; if (bmin < 0) bmin = 0; bmax = ovl->path.bepos + BORDER; if (bmax > aln->blen) 
bmax = aln->blen; } aseq = Load_Subread(db1,ovl->aread,amin,amax,abuffer,0); bseq = Load_Subread(db2,ovl->bread,bmin,bmax,bbuffer,0); aln->aseq = aseq - amin; if (COMP(aln->flags)) { Complement_Seq(bseq,bmax-bmin); aln->bseq = bseq - (aln->blen - bmax); } else aln->bseq = bseq - bmin; Compute_Trace_PTS(aln,work,tspace,GREEDIEST); if (FLIP) { if (COMP(aln->flags)) { Complement_Seq(aseq,amax-amin); Complement_Seq(bseq,bmax-bmin); aln->aseq = aseq - (aln->alen - amax); aln->bseq = bseq - bmin; } Flip_Alignment(aln,1); } } if (CARTOON) { printf(" ("); Print_Number(tps,tp_wide,stdout); printf(" trace pts)\n\n"); Alignment_Cartoon(stdout,aln,INDENT,mx_wide); } else { if (!M4OVL) { printf(" : = "); Print_Number((int64) ovl->path.diffs,mn_wide,stdout); printf(" diffs ("); Print_Number(tps,tp_wide,stdout); printf(" trace pts)\n"); } } if (REFERENCE) Print_Reference(stdout,aln,work,INDENT,WIDTH,BORDER,UPPERCASE,mx_wide); if (ALIGN) Print_Alignment(stdout,aln,work,INDENT,WIDTH,BORDER,UPPERCASE,mx_wide); } else { printf(" : < "); Print_Number((int64) ovl->path.diffs,mn_wide,stdout); printf(" diffs ("); Print_Number(tps,tp_wide,stdout); printf(" trace pts)\n"); } } free(trace); if (ALIGN) { free(bbuffer-1); free(abuffer-1); Free_Work_Data(work); } } if (M4OVL) { printf("+ +\n"); printf("- -\n"); } Close_DB(db1); if (ISTWO) Close_DB(db2); exit (0); } pbdagcon-0.3+20161121+ds/DALIGNER/py_utils/0000755000175000017500000000000013026414545016061 5ustar afifafifpbdagcon-0.3+20161121+ds/DALIGNER/py_utils/FALCON_ASM_NOTE0000644000175000017500000000154713026414545020302 0ustar afifafif # example for using daligner and Falcon to generate an assembly export PATH=/path_to_DAZZLER/DALIGNER/:$PATH export PATH=/path_to_DAZZLER/DAZZ_DB/:$PATH . /path_to_HBAR_FALCON/bin/activate for f in `cat input.fofn `; do fasta2DB yeast $f; done DBsplit -x500 -s400 yeast HPCdaligner -v -dal4 -t16 -m.70 -l1000 -s1000 yeast > run_jobs.sh bash run_jobs.sh cp /path_to_DAZZLER/DALIGNER/DB.so . 
cp /path_to_DAZZLER/DALIGNER/py_utils/*.py . for i in 1 2 3 4; do python DBLA_to_falcon.py yeast.$i.las yeast.db | falcon_sense.py \ --min_cov 4 --output_multi --min_idt 0.70 --trim_size 10 \ --n_core 24 > preads.$i.fa; done mkdir falcon_asm cat preads.1.fa preads.2.fa preads.3.fa preads.4.fa > falcon_asm/preads.fa cd falcon_asm falcon_overlap.py --d_core 3 --n_core 24 --min_len 8000 preads.fa > preads.ovl falcon_asm.py preads.ovl preads.fa falcon_fixasm.py pbdagcon-0.3+20161121+ds/DALIGNER/py_utils/DAPipe.py0000644000175000017500000000751313026414545017543 0ustar afifafiffrom pypeflow.common import * from pypeflow.data import PypeLocalFile, makePypeLocalFile, fn from pypeflow.task import PypeTask, PypeThreadTaskBase, PypeTaskBase from pypeflow.controller import PypeWorkflow, PypeThreadWorkflow import os import uuid import sys def run_script(job_data, job_type = "SGE" ): if job_type == "SGE": job_name = job_data["job_name"] cwd = job_data["cwd"] sge_option = job_data["sge_option"] script_fn = job_data["script_fn"] sge_cmd="qsub -N {job_name} {sge_option} -o {cwd}/sge_log -j y\ -S /bin/bash {script}".format(job_name=job_name, cwd=os.getcwd(), sge_option=sge_option, script=script_fn) #print sge_cmd os.system( sge_cmd ) os.system( "sleep 1") elif job_type == "local": os.system( "bash %s" % job_data["script_fn"] ) def wait_for_file(filename, task = None, job_name = ""): while 1: time.sleep(60) if os.path.exists(filename): break if task != None: if task.shutdown_event != None and task.shutdown_event.is_set(): os.system("qdel %s" % job_name) break def run_daligner(self): daligner_cmd = self.parameters["daligner_cmd"] job_id = self.parameters["job_id"] cwd = self.parameters["cwd"] script_dir = os.path.join( cwd ) script_fn = os.path.join( script_dir , "rj_%05d.sh" % (job_id)) log_path = os.path.join( script_dir, "rj_%05d.log" % (job_id)) script = [] script.append( "export PATH=~/task2014/dazzler/DALIGNER/:$PATH" ) script.append( "cd %s" % cwd ) script.append( 
"/usr/bin/time "+ daligner_cmd + ( " >& %s " % log_path ) + ( " && touch %s" % fn( self.job_done ) ) ) with open(script_fn,"w") as script_file: script_file.write("\n".join(script)) job_name = self.URL.split("/")[-1] job_name += "-"+str(uuid.uuid1())[:8] job_data = {"job_name": job_name, "cwd": cwd, "sge_option": " -pe smp 6 -q huasm ", "script_fn": script_fn } run_script(job_data, job_type = "SGE") wait_for_file( fn( self.job_done ), task=self, job_name=job_name ) if __name__ == "__main__": prefix = sys.argv[1] concurrent_jobs = 64 PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs) wf = PypeThreadWorkflow() job_id = 0 db_file = makePypeLocalFile(os.path.abspath( "./%s.db" % prefix )) with open("run_jobs.sh") as f : for l in f : l = l.strip().split() if l[0] == "daligner": try: os.makedirs("./job_%05d" % job_id) except OSError: pass os.system("cd ./job_%05d;ln -s ../.%s.bps .; ln -s ../.%s.idx .; ln -s ../%s.db ." % (job_id, prefix, prefix, prefix) ) job_done = makePypeLocalFile(os.path.abspath( "./job_%05d/job_%05d_done" % (job_id,job_id) )) parameters = {"daligner_cmd": " ".join(l), "cwd": os.path.join(os.getcwd(), "job_%05d" % job_id), "job_id": job_id} make_daligner_task = PypeTask( inputs = {"db_file": db_file}, outputs = {"job_done": job_done}, parameters = parameters, TaskType = PypeThreadTaskBase, URL = "task://localhost/mtask_%05d" % job_id ) daligner_task = make_daligner_task ( run_daligner ) wf.addTask(daligner_task) job_id += 1 print job_id wf.refreshTargets(updateFreq = 45) #all pbdagcon-0.3+20161121+ds/DALIGNER/py_utils/LAPipe.py0000644000175000017500000001552513026414545017555 0ustar afifafiffrom pypeflow.common import * from pypeflow.data import PypeLocalFile, makePypeLocalFile, fn from pypeflow.task import PypeTask, PypeThreadTaskBase, PypeTaskBase from pypeflow.controller import PypeWorkflow, PypeThreadWorkflow import os import uuid import sys def run_script(job_data, job_type = "SGE" ): if job_type == "SGE": job_name = 
job_data["job_name"] cwd = job_data["cwd"] sge_option = job_data["sge_option"] script_fn = job_data["script_fn"] sge_cmd="qsub -N {job_name} {sge_option} -o {cwd}/sge_log -j y\ -S /bin/bash {script}".format(job_name=job_name, cwd=os.getcwd(), sge_option=sge_option, script=script_fn) #print sge_cmd os.system( sge_cmd ) os.system( "sleep 1") elif job_type == "local": os.system( "bash %s" % job_data["script_fn"] ) def wait_for_file(filename, task = None, job_name = ""): while 1: time.sleep(30) if os.path.exists(filename): break if task != None: if task.shutdown_event != None and task.shutdown_event.is_set(): os.system("qdel %s" % job_name) break def run_p_task(self): p_script_fn = self.parameters["p_file"] job_id = self.parameters["job_id"] cwd = self.parameters["cwd"] script_dir = os.path.join( cwd ) script_fn = os.path.join( script_dir , "rp_%05d.sh" % (job_id)) log_path = os.path.join( script_dir, "rp_%05d.log" % (job_id)) script = [] script.append( "export PATH=~/task2014/dazzler/DALIGNER/:$PATH" ) script.append( "cd %s" % cwd ) script.append( ("/usr/bin/time bash %s " % p_script_fn) + ( " >& %s " % log_path ) + ( " && touch %s" % fn( self.job_done ) ) ) with open(script_fn,"w") as script_file: script_file.write("\n".join(script)) job_name = self.URL.split("/")[-1] job_name += "-"+str(uuid.uuid1())[:8] job_data = {"job_name": job_name, "cwd": cwd, "sge_option": " -pe smp 2 -q huasm ", "script_fn": script_fn } run_script(job_data, job_type = "SGE") wait_for_file( fn( self.job_done ), task=self, job_name=job_name ) def run_consensus_task(self): job_id = self.parameters["job_id"] cwd = self.parameters["cwd"] script_dir = os.path.join( cwd ) script_fn = os.path.join( script_dir , "cp_%05d.sh" % (job_id)) log_path = os.path.join( script_dir, "cp_%05d.log" % (job_id)) with open( os.path.join(cwd, "c_%05d.sh" % job_id), "w") as p_script: print >> p_script, ". /mnt/secondary/Share/HBAR_03202013/bin/activate" print >> p_script, "cd .." 
print >> p_script, """./LA4Falcon -o -f:%s las_files/%s.%d.las | """ % (prefix, prefix, job_id), print >> p_script, """ falcon_sense.py --trim --output_multi --min_idt 0.70 --min_cov 4 --local_match_count_threshold 3 --max_n_read 800 --n_core 8 > %s""" % fn(self.out_file) script = [] script.append( "cd %s" % cwd ) script.append( ("/usr/bin/time bash c_%05d.sh " % job_id ) + ( " >& %s " % log_path ) + ( " && touch c_%05d_done" % job_id ) ) with open(script_fn,"w") as script_file: script_file.write("\n".join(script)) job_name = self.URL.split("/")[-1] job_name += "-"+str(uuid.uuid1())[:8] job_data = {"job_name": job_name, "cwd": cwd, "sge_option": " -pe smp 6 -q huasm ", "script_fn": script_fn } run_script(job_data, job_type = "SGE") wait_for_file( os.path.join(cwd,"c_%05d_done" % job_id) , task=self, job_name=job_name ) if __name__ == "__main__": prefix = sys.argv[1] concurrent_jobs = 16 PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs) wf = PypeThreadWorkflow() mjob_data = {} with open("run_jobs.sh") as f: for l in f: l = l.strip().split() if l[0] not in ( "LAsort", "LAmerge" ): continue if l[0] == "LAsort": p_id = int( l[2].split(".")[1] ) mjob_data.setdefault( p_id, [] ) mjob_data[p_id].append( " ".join(l) ) if l[0] == "LAmerge": l2 = l[2].split(".") if l2[1] == "L2": p_id = int( l[2].split(".")[2] ) mjob_data.setdefault( p_id, [] ) mjob_data[p_id].append( " ".join(l) ) else: p_id = int( l[2].split(".")[1] ) mjob_data.setdefault( p_id, [] ) mjob_data[p_id].append( " ".join(l) ) db_file = makePypeLocalFile(os.path.abspath( "./%s.db" % prefix )) for p_id in mjob_data: s_data = mjob_data[p_id] try: os.makedirs("./p_%05d" % p_id) os.makedirs("./p_%05d/sge_log" % p_id) except OSError: pass try: os.makedirs("./preads") except OSError: pass try: os.makedirs("./las_files") except OSError: pass with open("./p_%05d/p_%05d.sh" % (p_id, p_id), "w") as p_script: print >> p_script, """for f in `find .. 
-wholename "*job*/%s.%d.%s.*.*.las"`; do ln -sf $f .; done""" % (prefix, p_id, prefix) for l in s_data: print >> p_script, l print >> p_script, "mv %s.%d.las ../las_files" % (prefix, p_id) p_file = os.path.abspath( "./p_%05d/p_%05d.sh" % (p_id, p_id) ) job_done = makePypeLocalFile(os.path.abspath( "./p_%05d/p_%05d_done" % (p_id,p_id) )) parameters = {"p_file": p_file, "cwd": os.path.join(os.getcwd(), "p_%05d" % p_id), "job_id": p_id} make_p_task = PypeTask( inputs = {"db_file": db_file}, outputs = {"job_done": job_done}, parameters = parameters, TaskType = PypeThreadTaskBase, URL = "task://localhost/ptask_%05d" % p_id ) p_task = make_p_task ( run_p_task ) wf.addTask(p_task) out_file = makePypeLocalFile(os.path.abspath( "./preads/out.%04d.fa" % p_id )) parameters = {"cwd": os.path.join(os.getcwd(), "preads" ), "job_id": p_id} make_c_task = PypeTask( inputs = {"job_done": job_done}, outputs = {"out_file": out_file }, parameters = parameters, TaskType = PypeThreadTaskBase, URL = "task://localhost/ct_%05d" % p_id ) c_task = make_c_task( run_consensus_task ) wf.addTask(c_task) print p_id wf.refreshTargets(updateFreq = 15) #all pbdagcon-0.3+20161121+ds/DALIGNER/py_utils/DAPI.py0000644000175000017500000000363313026414545017155 0ustar afifafiffrom ctypes import * _READIDX = c_uint16 _TRACE_XOVR = 125 class HITS_READ(Structure): _fields_ = [ ("origin", c_int), ("begin", _READIDX), ("end", _READIDX), ("boff", c_int64), ("coff", c_int64), ("flags", c_int)] class HITS_TRACK(Structure): pass HITS_TRACK._fields_ = [ ("_track", POINTER(HITS_TRACK)), ("name", c_char_p), ("size", c_int), ("anno", c_void_p), ("data", c_void_p)] class HITS_DB(Structure): _fields_ = [ ( "oreads", c_int ), ( "breads", c_int ), ( "cutoff", c_int ), ( "all", c_int), ( "freq", c_float * 4), ( "maxlen", c_int), ( "totlen", c_int64), ( "nreads", c_int), ( "trimmed", c_int), ( "part", c_int), ( "ofirst", c_int), ( "bfirst", c_int), ( "path", c_char_p), ( "loaded", c_int), ( "bases", c_void_p), ( "reads", 
POINTER(HITS_READ)), ( "tracks", POINTER(HITS_TRACK)) ] DB = CDLL("./DB.so") libc = CDLL("libc.so.6") fopen = libc.fopen fclose = libc.fclose fread = libc.fread fread.argtypes = [c_void_p, c_size_t, c_size_t, c_void_p] open_DB = DB.Open_DB open_DB.argtypes = [c_char_p, POINTER(HITS_DB)] open_DB.restype = c_int load_read = DB.Load_Read load_read.argtypes = [POINTER(HITS_DB), c_int, c_char_p, c_int] load_read.restype = c_int close_DB = DB.Close_DB close_DB.argtypes = [POINTER(HITS_DB)] close_DB.restype = c_int trim_DB = DB.Trim_DB trim_DB.argtypes = [POINTER(HITS_DB)] trim_DB.restype = c_int new_read_buffer = DB.New_Read_Buffer new_read_buffer.argtypes = [ POINTER(HITS_DB) ] new_read_buffer.restype = POINTER(c_char) pbdagcon-0.3+20161121+ds/DALIGNER/py_utils/DBLA_to_falcon.py0000644000175000017500000000172513026414545021166 0ustar afifafiffrom DAPI import * from ctypes import * import LAPI import sys rcmap = dict(zip("ACGTacgtNn-","TGCATGCANN-")) def rc(seq): return "".join([rcmap[c] for c in seq[::-1]]) ovl_data = LAPI.get_ovl_data(sys.argv[1]) db = HITS_DB() open_DB(sys.argv[2], db) trim_DB(db) aln = LAPI.Alignment() aln.aseq = LAPI.new_read_buffer(db) aln.bseq = LAPI.new_read_buffer(db) count = 0 for aread in ovl_data: LAPI.load_read(db, aread, aln.aseq, 2) aseq = cast( aln.aseq, c_char_p) aseq = aseq.value print "%08d" % aread, aseq for aln_data in ovl_data[aread]: aread, bread, acc, abpos, aepos, alen, comp, bbpos, bepos, blen = aln_data LAPI.load_read(db, bread, aln.bseq, 2) bseq = cast(aln.bseq, c_char_p) bseq = bseq.value bseq = bseq[bbpos:bepos] #load_read(db, ovl.bread, aln.bseq, 2) if comp == 1: bseq = rc(bseq) print bread, bseq print "+ +" count += 1 print "- -" close_DB(db) pbdagcon-0.3+20161121+ds/DALIGNER/py_utils/LAPI.py0000644000175000017500000000604113026414545017161 0ustar afifafiffrom ctypes import * from DAPI import * _READIDX = c_uint16 _TRACE_XOVR = 125 class Path(Structure): _fields_ = [("trace", POINTER(c_uint16)), ("tlen", _READIDX), 
def get_ovl_data(fn):
    """Read the overlap records of a .las file and group the kept ones by A-read.

    The file is read as a 64-bit record count, a C int trace spacing, and then
    one Overlap record followed by its trace for every overlap.

    An overlap is dropped when
      * the A-read is shorter than 8000, or
      * both reads have more than 50 bases in front of the aligned interval, or
      * both reads have more than 50 bases behind the aligned interval.

    Returns a dict mapping the A-read index to a list of tuples
    (aread, bread, acc, abpos, aepos, alen, comp, bbpos, bepos, blen), where
    comp is bit 0 of the overlap flags, bbpos/bepos are recomputed as
    (blen - bepos, blen - bbpos) when comp is set, and acc is
    100 - 200 * diffs / (aligned A length + aligned B length).

    Raises IOError when the file cannot be opened or its header cannot be
    read, and MemoryError when the trace buffer cannot be (re)allocated.
    """
    # NOTE(review): fopen/fclose are bound at module level without
    # restype/argtypes, so ctypes treats the FILE* as a C int -- confirm this
    # is safe on the target platform (looks like it truncates on 64-bit).
    in_f = fopen(fn, "r")
    if not in_f:
        raise IOError("cannot open overlap file %r" % (fn,))

    trace = None
    ovl_data = {}
    try:
        novl = c_int64()
        tspace = c_int()
        if fread(addressof(novl), sizeof(c_int64), 1, in_f) != 1:
            raise IOError("cannot read overlap count from %r" % (fn,))
        if fread(addressof(tspace), sizeof(c_int), 1, in_f) != 1:
            raise IOError("cannot read trace spacing from %r" % (fn,))

        # Trace entries are one byte wide up to and including the crossover
        # spacing (the C reader tests `tspace <= TRACE_XOVR`), two bytes above.
        if tspace.value <= _TRACE_XOVR:
            tbytes = sizeof(c_uint8)
        else:
            tbytes = sizeof(c_uint16)

        # The buffer is always sized in 16-bit entries, so it is large enough
        # for either entry width.
        tmax = 1000
        mem = malloc(sizeof(c_uint16) * tmax)
        if not mem:
            raise MemoryError("cannot allocate trace buffer")
        trace = cast(mem, POINTER(c_uint16))

        ovl = Overlap()
        for j in xrange(novl.value):
            _read_overlap(in_f, ovl)

            if ovl.path.tlen > tmax:
                # Keep tmax an integer: realloc is declared with a c_size_t
                # size argument and rejects a float.
                tmax = int(1.2 * ovl.path.tlen) + 100
                mem = realloc(trace, sizeof(c_uint16) * tmax)
                if not mem:
                    raise MemoryError("cannot grow trace buffer")
                trace = cast(mem, POINTER(c_uint16))
            ovl.path.trace = trace
            # The trace is read even for records that are dropped below so the
            # file position stays aligned on the next record.
            _read_trace(in_f, ovl, tbytes)

            if ovl.alen < 8000:
                continue
            if ovl.path.abpos > 50 and ovl.path.bbpos > 50:
                continue
            if ovl.alen - ovl.path.aepos > 50 and ovl.blen - ovl.path.bepos > 50:
                continue

            comp = ovl.flags & 0x1
            bbpos, bepos = ovl.path.bbpos, ovl.path.bepos
            if comp == 1:
                bbpos, bepos = ovl.blen - bepos, ovl.blen - bbpos

            # Differences are normalised by the aligned length on BOTH reads.
            aligned = ((ovl.path.aepos - ovl.path.abpos) +
                       (ovl.path.bepos - ovl.path.bbpos))
            acc = 100 - (200.0 * ovl.path.diffs / aligned)

            ovl_data.setdefault(ovl.aread, []).append(
                (ovl.aread, ovl.bread, acc,
                 ovl.path.abpos, ovl.path.aepos, ovl.alen,
                 comp, bbpos, bepos, ovl.blen))
    finally:
        # Release the C resources on every exit path.
        if trace is not None:
            libc.free(trace)
        fclose(in_f)

    return ovl_data
10; MASK = (char **) Malloc(MMAX*sizeof(char *),"Allocating mask track array"); if (MASK == NULL) exit (1); j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("vbc"); break; case 'k': ARG_POSITIVE(KINT,"K-mer length") break; case 'w': ARG_POSITIVE(WINT,"Log of bin width") break; case 'h': ARG_POSITIVE(HINT,"Hit threshold (in bp.s)") break; case 't': ARG_POSITIVE(TINT,"Tuple suppression frequency") break; case 'H': ARG_POSITIVE(HGAP,"HGAP threshold (in bp.s)") break; case 'e': ARG_REAL(EREL) if (EREL < .7 || EREL >= 1.) { fprintf(stderr,"%s: Average correlation must be in [.7,1.) (%g)\n",Prog_Name,EREL); exit (1); } break; case 'l': ARG_POSITIVE(LINT,"Minimum ovlerap length") break; case 's': ARG_POSITIVE(SINT,"Trace spacing") break; case 'M': ARG_NON_NEGATIVE(MINT,"Memory allocation (in Gb)") break; case 'm': if (MTOP >= MMAX) { MMAX = 1.2*MTOP + 10; MASK = (char **) Realloc(MASK,MMAX*sizeof(char *),"Reallocating mask track array"); if (MASK == NULL) exit (1); } MASK[MTOP++] = argv[i]+2; break; case 'd': if (argv[i][2] == 'e' && argv[i][3] == 'g') { MUNIT = strtol(argv[i]+4,&eptr,10); if (*eptr != '\0' || argv[i][4] == '\0') { fprintf(stderr,"%s: -mrg argument is not an integer\n",Prog_Name); exit (1); } if (MUNIT <= 0) { fprintf(stderr,"%s: Files per merge must be positive (%d)\n", Prog_Name,MUNIT); exit (1); } if (MUNIT < 3) { fprintf(stderr,"%s: Files per merge must be at least 3 (%d)\n", Prog_Name,MUNIT); exit (1); } } else if (argv[i][2] == 'a' && argv[i][3] == 'l') { DUNIT = strtol(argv[i]+4,&eptr,10); if (*eptr != '\0' || argv[i][4] == '\0') { fprintf(stderr,"%s: -dal argument is not an integer\n",Prog_Name); exit (1); } if (DUNIT <= 0) { fprintf(stderr,"%s: Blocks per daligner call must be positive (%d)\n", Prog_Name,DUNIT); exit (1); } } else { fprintf(stderr,"%s: -%.3s is an illegal option\n",Prog_Name,argv[i]+1); exit (1); } break; } else argv[j++] = argv[i]; argc = j; VON = flags['v']; BON = flags['b']; CON 
= flags['c']; if (argc < 3 || argc > 4) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[2]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[3]); exit (1); } } // Make sure DAM and DB exist and the DB is partitioned, get number of blocks in partition pwd1 = PathTo(argv[1]); if (strcmp(argv[1]+(strlen(argv[1])-4),".dam") == 0) root1 = Root(argv[1],".dam"); else root1 = Root(argv[1],".db"); { int i, nfiles; FILE *dbvis; dbvis = fopen(Catenate(pwd1,"/",root1,".dam"),"r"); if (dbvis == NULL) { dbvis = Fopen(Catenate(pwd1,"/",root1,".db"),"r"); if (dbvis == NULL) exit (1); } if (fscanf(dbvis,"files = %d\n",&nfiles) != 1) SYSTEM_ERROR for (i = 0; i < nfiles; i++) { char buffer[30001]; if (fgets(buffer,30000,dbvis) == NULL) SYSTEM_ERROR } useblock1 = 1; if (fscanf(dbvis,"blocks = %d\n",&nblocks1) != 1) { useblock1 = 0; nblocks1 = 1; } fclose(dbvis); } pwd2 = PathTo(argv[2]); if (strcmp(argv[2]+(strlen(argv[2])-4),".dam") == 0) root2 = Root(argv[2],".dam"); else root2 = Root(argv[2],".db"); if (strcmp(root2,root1) == 0 && strcmp(pwd1,pwd2) == 0) { fprintf(stderr,"%s: Comparing the same data base %s/%s against itself, use HPCdaligner\n", Prog_Name,pwd1,root1); exit (1); } { int i, nfiles; FILE *dbvis; dbvis = fopen(Catenate(pwd2,"/",root2,".dam"),"r"); if (dbvis == NULL) { dbvis = Fopen(Catenate(pwd2,"/",root2,".db"),"r"); if (dbvis == NULL) exit (1); } if (fscanf(dbvis,"files = %d\n",&nfiles) != 1) SYSTEM_ERROR for (i = 0; i < nfiles; i++) { char buffer[30001]; if (fgets(buffer,30000,dbvis) == NULL) SYSTEM_ERROR } useblock2 = 1; if (fscanf(dbvis,"blocks = %d\n",&nblocks2) != 1) { useblock2 = 0; nblocks2 = 1; } fclose(dbvis); } // Set range fblock-lblock checking that DB..las exists & DB..las does not { char *eptr, *fptr; FILE *file; if (argc == 4) { fblock = strtol(argv[3],&eptr,10); if (*eptr != '\0' && *eptr != '-') { 
fprintf(stderr,"%s: final argument '%s' does not start with an integer\n", Prog_Name,argv[3]); exit (1); } if (*eptr == '-') { lblock = strtol(eptr+1,&fptr,10); if (*fptr != '\0') { fprintf(stderr,"%s: second part of range '%s' is not an integer\n", Prog_Name,eptr+1); exit (1); } } else lblock = fblock; if (fblock < 1 || lblock > nblocks2 || fblock > lblock) { fprintf(stderr,"%s: range %d-%d is empty or out of bounds\n",Prog_Name,fblock,lblock); exit (1); } } else { fblock = 1; lblock = nblocks2; } if (fblock > 1) { file = fopen(Catenate(root1,".",root2,Numbered_Suffix(".",fblock-1,".las")),"r"); if (file == NULL) { fprintf(stderr,"%s: File %s.%s.%d.las should already be present!\n", Prog_Name,root1,root2,fblock-1); exit (1); } else fclose(file); } if (useblock2) { file = fopen(Catenate(root1,".",root2,Numbered_Suffix(".",fblock,".las")),"r"); if (file != NULL) { fprintf(stderr,"%s: File %s.%s.%d.las should not yet exist!\n", Prog_Name,root1,root2,fblock); exit (1); } } else { file = fopen(Catenate(root1,".",root2,".las"),"r"); if (file != NULL) { fprintf(stderr,"%s: File %s.%s.las should not yet exist!\n", Prog_Name,root1,root2); exit (1); } } } { int level, njobs; int i, j, k; int usepath1, usepath2; // Produce all necessary daligner jobs ... 
usepath1 = (strcmp(pwd1,".") != 0); usepath2 = (strcmp(pwd2,".") != 0); njobs = nblocks1 * ( (lblock-fblock)/DUNIT + 1); printf("# Daligner jobs (%d)\n",njobs); #ifdef LSF jobid = 1; #endif for (i = 1; i <= nblocks1; i++) { int bits; int low, hgh; bits = (lblock-fblock)/DUNIT+1; low = fblock; for (j = 1; j <= bits; j++) { #ifdef LSF printf(LSF_ALIGN,jobid++); printf(" \""); #endif printf("daligner -A"); if (VON) printf(" -v"); if (BON) printf(" -b"); printf(" -k%d",KINT); if (WINT != 6) printf(" -w%d",WINT); printf(" -h%d",HINT); if (TINT > 0) printf(" -t%d",TINT); if (HGAP > 0) printf(" -H%d",HGAP); if (EREL > .1) printf(" -e%g",EREL); else printf(" -e.85"); if (LINT != 1000) printf(" -l%d",LINT); if (SINT != 100) printf(" -s%d",SINT); if (MINT >= 0) printf(" -M%d",MINT); for (k = 0; k < MTOP; k++) printf(" -m%s",MASK[k]); if (useblock1) if (usepath1) printf(" %s/%s.%d",pwd1,root1,i); else printf(" %s.%d",root1,i); else if (usepath1) printf(" %s/%s",pwd1,root1); else printf(" %s",root1); hgh = fblock + (((lblock-fblock)+1)*j)/bits; for (k = low; k < hgh; k++) if (useblock2) if (usepath2) printf(" %s/%s.%d",pwd2,root2,k); else printf(" %s.%d",root2,k); else if (usepath2) printf(" %s/%s",pwd2,root2); else printf(" %s",root2); #ifdef LSF printf("\""); #endif printf("\n"); low = hgh; } } // ... 
and then all the initial sort & merge jobs for each block pair printf("# Initial sort jobs (%d)\n", nblocks1*((lblock-fblock)+1)); #ifdef LSF jobid = 1; #endif for (i = 1; i <= nblocks1; i++) for (j = fblock; j <= lblock; j++) { #ifdef LSF printf(LSF_MERGE,jobid++); printf(" \""); #endif printf("LAsort"); if (VON) printf(" -v"); if (CON) printf(" -c"); for (k = 0; k < NTHREADS; k++) { if (useblock1) printf(" %s.%d",root1,i); else printf(" %s",root1); if (useblock2) printf(".%s.%d.C%d",root2,j,k); else printf(".%s.C%d",root2,k); if (useblock1) printf(" %s.%d",root1,i); else printf(" %s",root1); if (useblock2) printf(".%s.%d.N%d",root2,j,k); else printf(".%s.N%d",root2,k); } printf(" && LAmerge"); if (VON) printf(" -v"); if (CON) printf(" -c"); if (nblocks1 == 1) if (useblock2) printf(" %s.%s.%d",root1,root2,j); else printf(" %s.%s",root1,root2); else printf(" L1.%d.%d",i,j); for (k = 0; k < NTHREADS; k++) { if (useblock1) printf(" %s.%d",root1,i); else printf(" %s",root1); if (useblock2) printf(".%s.%d.C%d.S",root2,j,k); else printf(".%s.C%d.S",root2,k); if (useblock1) printf(" %s.%d",root1,i); else printf(" %s",root1); if (useblock2) printf(".%s.%d.N%d.S",root2,j,k); else printf(".%s.N%d.S",root2,k); } printf(" && rm"); for (k = 0; k < NTHREADS; k++) { if (useblock1) printf(" %s.%d",root1,i); else printf(" %s",root1); if (useblock2) printf(".%s.%d.C%d.S.las",root2,j,k); else printf(".%s.C%d.S.las",root2,k); if (useblock1) printf(" %s.%d",root1,i); else printf(" %s",root1); if (useblock2) printf(".%s.%d.N%d.S.las",root2,j,k); else printf(".%s.N%d.S.las",root2,k); if (useblock1) printf(" %s.%d",root1,i); else printf(" %s",root1); if (useblock2) printf(".%s.%d.C%d.las",root2,j,k); else printf(".%s.C%d.las",root2,k); if (useblock1) printf(" %s.%d",root1,i); else printf(" %s",root1); if (useblock2) printf(".%s.%d.N%d.las",root2,j,k); else printf(".%s.N%d.las",root2,k); } #ifdef LSF printf("\""); #endif printf("\n"); } // Higher level merges (if lblock > 1) if (nblocks1 
> 1) { int pow, mway; // Determine most balance mway for merging in ceil(log_mrg nblock1) levels pow = 1; for (level = 0; pow < nblocks1; level++) pow *= MUNIT; for (mway = MUNIT; mway >= 3; mway--) if (power(mway,level) < nblocks1) break; mway += 1; // Issue the commands for each merge level { int p, cnt; cnt = nblocks1; for (i = 1; i <= level; i++) { int bits; int low, hgh; bits = (cnt-1)/mway+1; printf("# Level %d jobs (%d)\n",i,bits*((lblock-fblock)+1)); // Block merges #ifdef LSF jobid = 1; #endif for (j = fblock; j <= lblock; j++) { low = 1; for (p = 1; p <= bits; p++) { hgh = (cnt*p)/bits; #ifdef LSF printf(LSF_MERGE,jobid++); printf(" \""); #endif printf("LAmerge"); if (VON) printf(" -v"); if (CON) printf(" -c"); if (i == level) if (useblock2) printf(" %s.%s.%d",root1,root2,j); else printf(" %s.%s",root1,root2); else printf(" L%d.%d.%d",i+1,j,p); for (k = low; k <= hgh; k++) printf(" L%d.%d.%d",i,k,j); printf(" && rm"); for (k = low; k <= hgh; k++) printf(" L%d.%d.%d.las",i,k,j); #ifdef LSF printf("\""); #endif printf("\n"); low = hgh+1; } } cnt = bits; } } } } free(root2); free(pwd2); free(root1); free(pwd1); exit (0); } pbdagcon-0.3+20161121+ds/DALIGNER/LA4Falcon.c0000644000175000017500000006365513026414545016077 0ustar afifafif/************************************************************************************\ * * * Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. * * * * Redistribution and use in source and binary forms, with or without modification, * * are permitted provided that the following conditions are met: * * * * · Redistributions of source code must retain the above copyright notice, this * * list of conditions and the following disclaimer. * * * * · Redistributions in binary form must reproduce the above copyright notice, this * * list of conditions and the following disclaimer in the documentation and/or * * other materials provided with the distribution. 
* * * * · The name of EWM may not be used to endorse or promote products derived from * * this software without specific prior written permission. * * * * THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, * * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE * * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * * * For any issues regarding this software and its use, contact EWM at: * * * * Eugene W. Myers Jr. * * Bautzner Str. 122e * * 01099 Dresden * * GERMANY * * Email: gene.myers@gmail.com * * * \************************************************************************************/ /******************************************************************************************* * * Utility for displaying the overlaps in a .las file in a variety of ways including * a minimal listing of intervals, a cartoon, and a full out alignment. 
* * Author: Gene Myers * Creation: July 2013 * Last Mod: Jan 2015 * *******************************************************************************************/ /******************************************************************************************* * * Based on the original LAshow.c, this code is modified by Jason Chin to support generating * consensus sequences from daligner output * * Last Mod: July 2015 * *******************************************************************************************/ #include #include #include #include #include #include #include #include #include "DB.h" #include "align.h" typedef struct { int r_id; int score; int t_o; int t_s; int t_e; int t_l; } hit_record; hit_record * hits; #define MIN(X,Y) ((X) < (Y)) ? (X) : (Y) static int compare_hits(const void * h1, const void *h2) { return ((hit_record *) h2)->score - ((hit_record *) h1)->score; } static char *Usage[] = { "[-mfsocarUFM] [-i] [-w] [-b] ", " [ ] [ | ... ]" }; #define LAST_READ_SYMBOL '$' static int ORDER(const void *l, const void *r) { int x = *((int32 *) l); int y = *((int32 *) r); return (x-y); } int main(int argc, char *argv[]) { HITS_DB _db1, *db1 = &_db1; HITS_DB _db2, *db2 = &_db2; Overlap _ovl, *ovl = &_ovl; Alignment _aln, *aln = &_aln; FILE *input; int64 novl; int tspace, tbytes, small; int reps, *pts; int input_pts; int ALIGN, CARTOON, REFERENCE, FLIP; int INDENT, WIDTH, BORDER, UPPERCASE; int ISTWO; int MAP; int FALCON, OVERLAP, M4OVL; int SEED_MIN, MAX_HIT_COUNT, SKIP; // Process options { int i, j, k; int flags[128]; char *eptr; ARG_INIT("LA4Falcon") INDENT = 4; WIDTH = 100; BORDER = 10; FALCON = 0; M4OVL = 0; SEED_MIN = 8000; SKIP = 0; ALIGN = 0; REFERENCE = 0; CARTOON = 0; FLIP = 0; MAX_HIT_COUNT = 400; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("smfocarUFM") break; case 'i': ARG_NON_NEGATIVE(INDENT,"Indent") break; case 'w': ARG_POSITIVE(WIDTH,"Alignment width") break; case 'b': 
ARG_NON_NEGATIVE(BORDER,"Alignment border") break; case 'H': ARG_POSITIVE(SEED_MIN,"seed threshold (in bp)") break; case 'n': ARG_POSITIVE(MAX_HIT_COUNT, "max numer of supporting read ouput (used for FALCON consensus. default 400, max: 2000)") if (MAX_HIT_COUNT > 2000) MAX_HIT_COUNT = 2000; break; } else argv[j++] = argv[i]; argc = j; UPPERCASE = flags['U']; ALIGN = flags['a']; REFERENCE = flags['r']; CARTOON = flags['c']; FLIP = flags['F']; MAP = flags['M']; OVERLAP = flags['o']; M4OVL = flags['m']; FALCON = flags['f']; SKIP = flags['s']; if (argc <= 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]); exit (1); } } // Open trimmed DB or DB pair { int status; char *pwd, *root; FILE *input; ISTWO = 0; status = Open_DB(argv[1],db1); if (status < 0) exit (1); if (db1->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } if (argc > 3) { pwd = PathTo(argv[3]); root = Root(argv[3],".las"); if ((input = fopen(Catenate(pwd,"/",root,".las"),"r")) != NULL) { ISTWO = 1; fclose(input); status = Open_DB(argv[2],db2); if (status < 0) exit (1); if (db2->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[2]); exit (1); } Trim_DB(db2); } else db2 = db1; free(root); free(pwd); } else db2 = db1; Trim_DB(db1); } // Process read index arguments into a sorted list of read ranges input_pts = 0; if (argc == ISTWO+4) { if (argv[ISTWO+3][0] != LAST_READ_SYMBOL || argv[ISTWO+3][1] != '\0') { char *eptr, *fptr; int b, e; b = strtol(argv[ISTWO+3],&eptr,10); if (eptr > argv[ISTWO+3] && b > 0) { if (*eptr == '-') { if (eptr[1] != LAST_READ_SYMBOL || eptr[2] != '\0') { e = strtol(eptr+1,&fptr,10); input_pts = (fptr <= eptr+1 || *fptr != '\0' || e <= 0); } } else input_pts = (*eptr != '\0'); } else input_pts = 1; } } if (input_pts) { int v, x; FILE *input; input = Fopen(argv[ISTWO+3],"r"); if (input == NULL) exit (1); reps = 0; while ((v = 
fscanf(input," %d",&x)) != EOF) if (v == 0) { fprintf(stderr,"%s: %d'th item of input file %s is not an integer\n", Prog_Name,reps+1,argv[2]); exit (1); } else reps += 1; reps *= 2; pts = (int *) Malloc(sizeof(int)*reps,"Allocating read parameters"); if (pts == NULL) exit (1); rewind(input); for (v = 0; v < reps; v += 2) { fscanf(input," %d",&x); pts[v] = pts[v+1] = x; } fclose(input); } else { pts = (int *) Malloc(sizeof(int)*2*argc,"Allocating read parameters"); if (pts == NULL) exit (1); reps = 0; if (argc > 3+ISTWO) { int c, b, e; char *eptr, *fptr; for (c = 3+ISTWO; c < argc; c++) { if (argv[c][0] == LAST_READ_SYMBOL) { b = db1->nreads; eptr = argv[c]+1; } else b = strtol(argv[c],&eptr,10); if (eptr > argv[c]) { if (b <= 0) { fprintf(stderr,"%s: %d is not a valid index\n",Prog_Name,b); exit (1); } if (*eptr == '\0') { pts[reps++] = b; pts[reps++] = b; continue; } else if (*eptr == '-') { if (eptr[1] == LAST_READ_SYMBOL) { e = INT32_MAX; fptr = eptr+2; } else e = strtol(eptr+1,&fptr,10); if (fptr > eptr+1 && *fptr == 0 && e > 0) { pts[reps++] = b; pts[reps++] = e; if (b > e) { fprintf(stderr,"%s: Empty range '%s'\n",Prog_Name,argv[c]); exit (1); } continue; } } } fprintf(stderr,"%s: argument '%s' is not an integer range\n",Prog_Name,argv[c]); exit (1); } qsort(pts,reps/2,sizeof(int64),ORDER); b = 0; for (c = 0; c < reps; c += 2) if (b > 0 && pts[b-1] >= pts[c]-1) { if (pts[c+1] > pts[b-1]) pts[b-1] = pts[c+1]; } else { pts[b++] = pts[c]; pts[b++] = pts[c+1]; } pts[b++] = INT32_MAX; reps = b; } else { pts[reps++] = 1; pts[reps++] = INT32_MAX; } } // Initiate file reading and read (novl, tspace) header { char *over, *pwd, *root; pwd = PathTo(argv[2+ISTWO]); root = Root(argv[2+ISTWO],".las"); over = Catenate(pwd,"/",root,".las"); input = Fopen(over,"r"); if (input == NULL) exit (1); if (fread(&novl,sizeof(int64),1,input) != 1) SYSTEM_ERROR if (fread(&tspace,sizeof(int),1,input) != 1) SYSTEM_ERROR if (tspace == 0) { printf("\nCRITICAL ERROR: tspace=0 in '%s'", 
root); exit(1); } if (tspace <= TRACE_XOVR) { small = 1; tbytes = sizeof(uint8); } else { small = 0; tbytes = sizeof(uint16); } if (!(FALCON || M4OVL)) { printf("\n%s: ",root); Print_Number(novl,0,stdout); printf(" records\n"); } free(pwd); free(root); } // Read the file and display selected records { int j; uint16 *trace; Work_Data *work; int tmax; int in, npt, idx, ar; int64 tps; int64 p_aread = -1; char buffer[131072]; int skip_rest = 0; char *abuffer, *bbuffer; int ar_wide, br_wide; int ai_wide, bi_wide; int mn_wide, mx_wide; int tp_wide; int blast, match, seen, lhalf, rhalf; int hit_count; aln->path = &(ovl->path); if (ALIGN || REFERENCE || FALCON) { work = New_Work_Data(); abuffer = New_Read_Buffer(db1); bbuffer = New_Read_Buffer(db2); if (FALCON) { hits = calloc(sizeof(hit_record), 50001); hit_count = 0; } } else { abuffer = NULL; bbuffer = NULL; work = NULL; } tmax = 1000; trace = (uint16 *) Malloc(sizeof(uint16)*tmax,"Allocating trace vector"); if (trace == NULL) exit (1); in = 0; npt = pts[0]; idx = 1; ar_wide = Number_Digits((int64) db1->nreads); br_wide = Number_Digits((int64) db2->nreads); ai_wide = Number_Digits((int64) db1->maxlen); bi_wide = Number_Digits((int64) db2->maxlen); if (db1->maxlen < db2->maxlen) { mn_wide = ai_wide; mx_wide = bi_wide; tp_wide = Number_Digits((int64) db1->maxlen/tspace+2); } else { mn_wide = bi_wide; mx_wide = ai_wide; tp_wide = Number_Digits((int64) db2->maxlen/tspace+2); } ar_wide += (ar_wide-1)/3; br_wide += (br_wide-1)/3; ai_wide += (ai_wide-1)/3; bi_wide += (bi_wide-1)/3; mn_wide += (mn_wide-1)/3; tp_wide += (tp_wide-1)/3; if (FLIP) { int x; x = ar_wide; ar_wide = br_wide; br_wide = x; x = ai_wide; ai_wide = bi_wide; bi_wide = x; } // For each record do blast = -1; match = 0; seen = 0; lhalf = rhalf = 0; for (j = 0; j < novl; j++) // Read it in { Read_Overlap(input,ovl); if (ovl->path.tlen > tmax) { tmax = ((int) 1.2*ovl->path.tlen) + 100; trace = (uint16 *) Realloc(trace,sizeof(uint16)*tmax,"Allocating trace 
vector"); if (trace == NULL) exit (1); } ovl->path.trace = (void *) trace; Read_Trace(input,ovl,tbytes); // Determine if it should be displayed ar = ovl->aread+1; if (in) { while (ar > npt) { npt = pts[idx++]; if (ar < npt) { in = 0; break; } npt = pts[idx++]; } } else { while (ar >= npt) { npt = pts[idx++]; if (ar <= npt) { in = 1; break; } npt = pts[idx++]; } } if (!in) continue; // Display it aln->alen = db1->reads[ovl->aread].rlen; aln->blen = db2->reads[ovl->bread].rlen; aln->flags = ovl->flags; tps = ((ovl->path.aepos-1)/tspace - ovl->path.abpos/tspace); if (OVERLAP && !FALCON) { if (ovl->path.abpos != 0 && ovl->path.bbpos != 0) continue; if (ovl->path.aepos != aln->alen && ovl->path.bepos != aln->blen) continue; } if (MAP) { while (ovl->bread != blast) { if (!match && seen && !(lhalf && rhalf)) { printf("Missing "); Print_Number((int64) blast+1,br_wide+1,stdout); printf(" %d ->%lld\n",db2->reads[blast].rlen,db2->reads[blast].coff); } match = 0; seen = 0; lhalf = rhalf = 0; blast += 1; } seen = 1; if (ovl->path.abpos == 0) rhalf = 1; if (ovl->path.aepos == aln->alen) lhalf = 1; if (ovl->path.bbpos != 0 || ovl->path.bepos != aln->blen) continue; match = 1; } // printf(" %7d %7d\n",ovl->path.abpos,ovl->path.aepos); // continue; if (!(FALCON || M4OVL) ) { if (ALIGN || CARTOON || REFERENCE) printf("\n"); if (FLIP) { Flip_Alignment(aln,0); Print_Number((int64) ovl->bread+1,ar_wide+1,stdout); printf(" "); Print_Number((int64) ovl->aread+1,br_wide+1,stdout); } else { Print_Number((int64) ovl->aread+1,ar_wide+1,stdout); printf(" "); Print_Number((int64) ovl->bread+1,br_wide+1,stdout); } if (COMP(ovl->flags)) printf(" c"); else printf(" n"); printf(" ["); Print_Number((int64) ovl->path.abpos,ai_wide,stdout); printf(".."); Print_Number((int64) ovl->path.aepos,ai_wide,stdout); printf("] x ["); Print_Number((int64) ovl->path.bbpos,bi_wide,stdout); printf(".."); Print_Number((int64) ovl->path.bepos,bi_wide,stdout); printf("]"); } // Display it if (M4OVL) { int64 bbpos, 
bepos; double acc; if (COMP(ovl->flags)) { bbpos = (int64) aln->blen - (int64) ovl->path.bepos; bepos = (int64) aln->blen - (int64) ovl->path.bbpos; } else { bbpos = (int64) ovl->path.bbpos; bepos = (int64) ovl->path.bepos; } acc = 100-(200. * ovl->path.diffs)/( ovl->path.aepos - ovl->path.abpos + ovl->path.bepos - ovl->path.bbpos ); printf("%09lld %09lld %lld %5.2f ", (int64) ovl->aread, (int64) ovl->bread, (int64) bbpos - (int64) bepos , acc); printf("0 %lld %lld %lld ", (int64) ovl->path.abpos, (int64) ovl->path.aepos, (int64) aln->alen); printf("%d %lld %lld %lld ", COMP(ovl->flags), bbpos, bepos, (int64) aln->blen); if ( ((int64) aln->blen < (int64) aln->alen) && ((int64) ovl->path.bbpos < 1) && ((int64) aln->blen - (int64) ovl->path.bepos < 1) ) { printf("contains\n"); } else if ( ((int64) aln->alen < (int64) aln->blen) && ((int64) ovl->path.abpos < 1) && ((int64) aln->alen - (int64) ovl->path.aepos < 1) ) { printf("contained\n"); } else { printf("overlap\n"); } } if (FALCON) { if (p_aread == -1) { Load_Read(db1, ovl->aread, abuffer, 2); printf("%08d %s\n", ovl->aread, abuffer); p_aread = ovl->aread; skip_rest = 0; } if (p_aread != ovl -> aread ) { int tmp_idx; qsort( hits, hit_count, sizeof(hit_record), compare_hits ); for (tmp_idx = 0; tmp_idx < hit_count && tmp_idx < MAX_HIT_COUNT; tmp_idx++) { Load_Read(db2, hits[tmp_idx].r_id, bbuffer, 0); if (hits[tmp_idx].t_o) Complement_Seq(bbuffer, hits[tmp_idx].t_l ); Upper_Read(bbuffer); int64 const rlen = (int64)(hits[tmp_idx].t_e) - (int64)(hits[tmp_idx].t_s); if (rlen < (int64)sizeof(buffer)) { strncpy( buffer, bbuffer + hits[tmp_idx].t_s, rlen ); buffer[rlen - 1] = '\0'; printf("%08d %s\n", hits[tmp_idx].r_id, buffer); } else { fprintf(stderr, "[WARNING]Skipping super-long read %08d, len=%lld\n", hits[tmp_idx].r_id, rlen); } } hit_count = 0; printf("+ +\n"); Load_Read(db1, ovl->aread, abuffer, 2); printf("%08d %s\n", ovl->aread, abuffer); p_aread = ovl->aread; skip_rest = 0; } if (skip_rest == 0) { int ovl_len, 
overhang_len, score; ovl_len = ovl->path.bepos - ovl->path.bbpos; overhang_len = MIN( ovl->path.abpos, ovl->path.bbpos ); overhang_len += MIN( aln->alen - ovl->path.aepos, aln->blen - ovl->path.bepos); score = ovl_len - overhang_len; hits[hit_count].r_id = ovl->bread; hits[hit_count].t_o = COMP(aln->flags); hits[hit_count].t_s = ovl->path.bbpos; hits[hit_count].t_e = ovl->path.bepos; hits[hit_count].t_l = aln->blen; hits[hit_count].score = score; hit_count ++; if (hit_count > 50000) skip_rest = 1; #undef TEST_ALN_OUT #ifdef TEST_ALN_OUT { tps = ((ovl->path.aepos-1)/tspace - ovl->path.abpos/tspace); if (small) Decompress_TraceTo16(ovl); Load_Read(db1, ovl->aread, abuffer, 0); Load_Read(db2, ovl->bread, bbuffer, 0); if (COMP(aln->flags)) Complement_Seq(bbuffer, aln->blen); Compute_Trace_PTS(aln,work,tspace); int tlen = aln->path->tlen; int *trace = aln->path->trace; int u; printf(" "); for (u = 0; u < tlen; u++) printf("%d,", (int16) trace[u]); } #endif //printf("\n"); if (SKIP == 1) { //if SKIP = 0, then skip_rest is always 0 if ( ((int64) aln->alen < (int64) aln->blen) && ((int64) ovl->path.abpos < 1) && ((int64) aln->alen - (int64) ovl->path.aepos < 1) ) { printf("* *\n"); skip_rest = 1; } } } } if (ALIGN || CARTOON || REFERENCE) { if (ALIGN || REFERENCE) { char *aseq, *bseq; int amin, amax; int bmin, bmax; if (FLIP) Flip_Alignment(aln,0); if (small) Decompress_TraceTo16(ovl); amin = ovl->path.abpos - BORDER; if (amin < 0) amin = 0; amax = ovl->path.aepos + BORDER; if (amax > aln->alen) amax = aln->alen; if (COMP(aln->flags)) { bmin = (aln->blen-ovl->path.bepos) - BORDER; if (bmin < 0) bmin = 0; bmax = (aln->blen-ovl->path.bbpos) + BORDER; if (bmax > aln->blen) bmax = aln->blen; } else { bmin = ovl->path.bbpos - BORDER; if (bmin < 0) bmin = 0; bmax = ovl->path.bepos + BORDER; if (bmax > aln->blen) bmax = aln->blen; } aseq = Load_Subread(db1,ovl->aread,amin,amax,abuffer,0); bseq = Load_Subread(db2,ovl->bread,bmin,bmax,bbuffer,0); aln->aseq = aseq - amin; if 
(COMP(aln->flags)) { Complement_Seq(bseq,bmax-bmin); aln->bseq = bseq - (aln->blen - bmax); } else aln->bseq = bseq - bmin; Compute_Trace_PTS(aln,work,tspace,GREEDIEST); if (FLIP) { if (COMP(aln->flags)) { Complement_Seq(aseq,amax-amin); Complement_Seq(bseq,bmax-bmin); aln->aseq = aseq - (aln->alen - amax); aln->bseq = bseq - bmin; } Flip_Alignment(aln,1); } } if (CARTOON) { printf(" ("); Print_Number(tps,tp_wide,stdout); printf(" trace pts)\n\n"); Alignment_Cartoon(stdout,aln,INDENT,mx_wide); } else { printf(" : = "); Print_Number((int64) ovl->path.diffs,mn_wide,stdout); printf(" diffs ("); Print_Number(tps,tp_wide,stdout); printf(" trace pts)\n"); } if (REFERENCE) Print_Reference(stdout,aln,work,INDENT,WIDTH,BORDER,UPPERCASE,mx_wide); if (ALIGN) Print_Alignment(stdout,aln,work,INDENT,WIDTH,BORDER,UPPERCASE,mx_wide); } else if (!(FALCON || M4OVL) ) { printf(" : < "); Print_Number((int64) ovl->path.diffs,mn_wide,stdout); printf(" diffs ("); Print_Number(tps,tp_wide,stdout); printf(" trace pts)\n"); } } if (FALCON) { qsort( hits, hit_count, sizeof(hit_record), compare_hits ); int tmp_idx; for (tmp_idx = 0; tmp_idx < hit_count && tmp_idx < MAX_HIT_COUNT; tmp_idx++) { Load_Read(db2, hits[tmp_idx].r_id, bbuffer, 0); if (hits[tmp_idx].t_o) Complement_Seq(bbuffer, hits[tmp_idx].t_l ); Upper_Read(bbuffer); strncpy( buffer, bbuffer + hits[tmp_idx].t_s, (int64) hits[tmp_idx].t_e - (int64) hits[tmp_idx].t_s ); buffer[ (int64) hits[tmp_idx].t_e - (int64) hits[tmp_idx].t_s - 1] = '\0'; printf("%08d %s\n", hits[tmp_idx].r_id, buffer); } printf("+ +\n"); printf("- -\n"); free(hits); } free(trace); if (ALIGN || FALCON) { free(bbuffer-1); free(abuffer-1); Free_Work_Data(work); } } Close_DB(db1); if (ISTWO) Close_DB(db2); exit (0); } pbdagcon-0.3+20161121+ds/DALIGNER/HPCdaligner.c0000644000175000017500000003673313026414545016511 0ustar afifafif/*********************************************************************************************\ * * Produce a script to compute overlaps for 
/* Integer power by repeated multiplication: returns base multiplied by itself
   exp times.  An exp of zero or less yields 1.                              */
static int power(int base, int exp)
{ int result = 1;

  for ( ; exp > 0; exp--)
    result *= base;
  return (result);
}
(%g)\n",Prog_Name,EREL); exit (1); } break; case 'l': ARG_POSITIVE(LINT,"Minimum ovlerap length") break; case 's': ARG_POSITIVE(SINT,"Trace spacing") break; case 'M': ARG_NON_NEGATIVE(MINT,"Memory allocation (in Gb)") break; case 'm': if (MTOP >= MMAX) { MMAX = 1.2*MTOP + 10; MASK = (char **) Realloc(MASK,MMAX*sizeof(char *),"Reallocating mask track array"); if (MASK == NULL) exit (1); } MASK[MTOP++] = argv[i]+2; break; case 'd': if (argv[i][2] == 'e' && argv[i][3] == 'g') { MUNIT = strtol(argv[i]+4,&eptr,10); if (*eptr != '\0' || argv[i][4] == '\0') { fprintf(stderr,"%s: -mrg argument is not an integer\n",Prog_Name); exit (1); } if (MUNIT <= 0) { fprintf(stderr,"%s: Files per merge must be positive (%d)\n", Prog_Name,MUNIT); exit (1); } if (MUNIT < 3) { fprintf(stderr,"%s: Files per merge must be at least 3 (%d)\n", Prog_Name,MUNIT); exit (1); } } else if (argv[i][2] == 'a' && argv[i][3] == 'l') { DUNIT = strtol(argv[i]+4,&eptr,10); if (*eptr != '\0' || argv[i][4] == '\0') { fprintf(stderr,"%s: -dal argument is not an integer\n",Prog_Name); exit (1); } if (DUNIT <= 0) { fprintf(stderr,"%s: Blocks per daligner call must be positive (%d)\n", Prog_Name,DUNIT); exit (1); } } else { fprintf(stderr,"%s: -%.3s is an illegal option\n",Prog_Name,argv[i]+1); exit (1); } break; } else argv[j++] = argv[i]; argc = j; VON = flags['v']; BON = flags['b']; AON = flags['A']; ION = flags['I']; if (argc < 2 || argc > 3) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[2]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[3]); exit (1); } } // Make sure DB exists and is partitioned, get number of blocks in partition pwd = PathTo(argv[1]); if (strcmp(argv[1]+(strlen(argv[1])-4),".dam") == 0) root = Root(argv[1],".dam"); else root = Root(argv[1],".db"); { int i, nfiles; FILE *dbvis; dbvis = fopen(Catenate(pwd,"/",root,".dam"),"r"); if (dbvis == 
NULL) { dbvis = Fopen(Catenate(pwd,"/",root,".db"),"r"); if (dbvis == NULL) exit (1); } if (fscanf(dbvis,"files = %d\n",&nfiles) != 1) SYSTEM_ERROR for (i = 0; i < nfiles; i++) { char buffer[30001]; if (fgets(buffer,30000,dbvis) == NULL) SYSTEM_ERROR } useblock = 1; if (fscanf(dbvis,"blocks = %d\n",&nblocks) != 1) { useblock = 0; nblocks = 1; } } // Set range fblock-lblock checking that DB..las exists & DB..las does not { char *eptr, *fptr; FILE *file; if (argc == 3) { fblock = strtol(argv[2],&eptr,10); if (*eptr != '\0' && *eptr != '-') { fprintf(stderr,"%s: final argument '%s' does not start with an integer\n", Prog_Name,argv[2]); exit (1); } if (*eptr == '-') { lblock = strtol(eptr+1,&fptr,10); if (*fptr != '\0') { fprintf(stderr,"%s: second part of range '%s' is not an integer\n", Prog_Name,eptr+1); exit (1); } } else lblock = fblock; if (fblock < 1 || lblock > nblocks || fblock > lblock) { fprintf(stderr,"%s: range %d-%d is empty or out of bounds\n",Prog_Name,fblock,lblock); exit (1); } } else { fblock = 1; lblock = nblocks; } if (fblock > 1) { file = fopen(Catenate(root,Numbered_Suffix(".",fblock-1,".las"),"",""),"r"); if (file == NULL) { fprintf(stderr,"%s: File %s.%d.las should already be present!\n", Prog_Name,root,fblock-1); exit (1); } else fclose(file); } file = fopen(Catenate(root,Numbered_Suffix(".",fblock,".las"),"",""),"r"); if (file != NULL) { fprintf(stderr,"%s: File %s.%d.las should not yet exist!\n", Prog_Name,root,fblock); exit (1); } } { int level, njobs; int i, j, k; int usepath; // Produce all necessary daligner jobs ... 
usepath = (strcmp(pwd,".") != 0); njobs = 0; for (i = fblock; i <= lblock; i++) njobs += (i-1)/DUNIT+1; printf("# Daligner jobs (%d)\n",njobs); #ifdef LSF jobid = 1; #endif for (i = fblock; i <= lblock; i++) { int bits; int low, hgh; bits = (i-1)/DUNIT+1; low = 1; for (j = 1; j <= bits; j++) { #ifdef LSF printf(LSF_ALIGN,jobid++); printf(" \""); #endif printf("daligner"); if (VON) printf(" -v"); if (BON) printf(" -b"); if (AON) printf(" -A"); if (ION) printf(" -I"); if (KINT != 14) printf(" -k%d",KINT); if (WINT != 6) printf(" -w%d",WINT); if (HINT != 35) printf(" -h%d",HINT); if (TINT > 0) printf(" -t%d",TINT); if (HGAP > 0) printf(" -H%d",HGAP); if (EREL > .1) printf(" -e%g",EREL); if (LINT != 1000) printf(" -l%d",LINT); if (SINT != 100) printf(" -s%d",SINT); if (MINT >= 0) printf(" -M%d",MINT); for (k = 0; k < MTOP; k++) printf(" -m%s",MASK[k]); if (useblock) if (usepath) printf(" %s/%s.%d",pwd,root,i); else printf(" %s.%d",root,i); else if (usepath) printf(" %s/%s",pwd,root); else printf(" %s",root); hgh = (i*j)/bits + 1; for (k = low; k < hgh; k++) if (useblock) if (usepath) printf(" %s/%s.%d",pwd,root,k); else printf(" %s.%d",root,k); else if (usepath) printf(" %s/%s",pwd,root); else printf(" %s",root); #ifdef LSF printf("\""); #endif printf("\n"); low = hgh; } } // ... and then all the initial sort & merge jobs for each block pair printf("# Initial sort jobs (%d)\n", lblock*lblock - (fblock-1)*(fblock-1) ); #ifdef LSF jobid = 1; #endif for (i = 1; i <= lblock; i++) for (j = (i < fblock ? 
fblock : 1); j <= lblock; j++) { #ifdef LSF printf(LSF_MERGE,jobid++); printf(" \""); #endif printf("LAsort"); if (VON) printf(" -v"); for (k = 0; k < NTHREADS; k++) if (useblock) { printf(" %s.%d.%s.%d.C%d",root,i,root,j,k); printf(" %s.%d.%s.%d.N%d",root,i,root,j,k); } else { printf(" %s.%s.C%d",root,root,k); printf(" %s.%s.N%d",root,root,k); } printf(" && LAmerge"); if (VON) printf(" -v"); if (lblock == 1) printf(" %s.%d",root,i); else if (i < fblock) printf(" L1.%d.%d",i,(j-fblock)+1); else printf(" L1.%d.%d",i,j); for (k = 0; k < NTHREADS; k++) if (useblock) { printf(" %s.%d.%s.%d.C%d.S",root,i,root,j,k); printf(" %s.%d.%s.%d.N%d.S",root,i,root,j,k); } else { printf(" %s.%s.C%d.S",root,root,k); printf(" %s.%s.N%d.S",root,root,k); } printf(" && rm"); for (k = 0; k < NTHREADS; k++) if (useblock) { printf(" %s.%d.%s.%d.C%d.S.las",root,i,root,j,k); printf(" %s.%d.%s.%d.N%d.S.las",root,i,root,j,k); } else { printf(" %s.%s.C%d.S.las",root,root,k); printf(" %s.%s.N%d.S.las",root,root,k); } #ifdef LSF printf("\""); #endif printf("\n"); } // Higher level merges (if lblock > 1) if (lblock > 1) { int pow, mway; // Determine most balance mway for merging in ceil(log_mrg lblock) levels pow = 1; for (level = 0; pow < lblock; level++) pow *= MUNIT; for (mway = MUNIT; mway >= 3; mway--) if (power(mway,level) < lblock) break; mway += 1; // Issue the commands for each merge level { int p, cnt, dnt; cnt = lblock; dnt = (lblock-fblock)+1; for (i = 1; i <= level; i++) { int bits, dits; int low, hgh; bits = (cnt-1)/mway+1; dits = (dnt-1)/mway+1; // Incremental update merges #ifdef LSF jobid = 1; #endif if (dnt >= 1) { int last; last = (dnt == 1 || i == level); printf("# Level %d jobs (%d)\n",i,bits*((lblock-fblock)+1) + dits*(fblock-1)); for (j = 1; j < fblock; j++) { #ifdef LSF printf(LSF_MERGE,jobid++); printf(" \""); #endif if (last) printf("mv %s.%d.las L%d.%d.0.las && ",root,j,i,j); low = 1; for (p = 1; p <= dits; p++) { hgh = (dnt*p)/dits; #ifdef LSF if (p > 1) { 
printf(LSF_MERGE,jobid++); printf(" \""); } #endif printf("LAmerge"); if (VON) printf(" -v"); if (last) printf(" %s.%d L%d.%d.0",root,j,i,j); else printf(" L%d.%d.%d",i+1,j,p); for (k = low; k <= hgh; k++) printf(" L%d.%d.%d",i,j,k); printf(" && rm"); if (last) printf(" L%d.%d.0.las",i,j); for (k = low; k <= hgh; k++) printf(" L%d.%d.%d.las",i,j,k); #ifdef LSF printf("\""); #endif printf("\n"); low = hgh+1; } } if (dnt > 1) dnt = dits; else dnt = 0; } else printf("# Level %d jobs (%d)\n",i,bits*((lblock-fblock)+1)); // New block merges for (j = fblock; j <= lblock; j++) { low = 1; for (p = 1; p <= bits; p++) { hgh = (cnt*p)/bits; #ifdef LSF printf(LSF_MERGE,jobid++); printf(" \""); #endif printf("LAmerge"); if (VON) printf(" -v"); if (i == level) printf(" %s.%d",root,j); else printf(" L%d.%d.%d",i+1,j,p); for (k = low; k <= hgh; k++) printf(" L%d.%d.%d",i,j,k); printf(" && rm"); for (k = low; k <= hgh; k++) printf(" L%d.%d.%d.las",i,j,k); #ifdef LSF printf("\""); #endif printf("\n"); low = hgh+1; } } cnt = bits; } } } } free(root); free(pwd); exit (0); } pbdagcon-0.3+20161121+ds/DALIGNER/README0000644000175000017500000005500613026414545015077 0ustar afifafif *** PLEASE GO TO THE DAZZLER BLOG (https://dazzlerblog.wordpress.com) FOR TYPESET *** DOCUMENTATION, EXAMPLES OF USE, AND DESIGN PHILOSOPHY. /************************************************************************************\ UPGRADE & DEVELOPER NOTES ! ! ! If you have already performed a big comparison and don't want to recompute all your local alignments in .las files, but do want to use a more recent version of the software that entails a change to the data structures (currently the update on December 31, 2014), please note the routine LAupgrade.Dec.31.2014. This take a .las file, say X.las, as an argument, and writes to standard output the .las file in the new format. The program can be made with "make" but is not by default created when make is called without an argument. 
For those interested in the details, on December 30, the "alen" and "blen" fields were dropped to save space as they can always be gotten from the underlying DB. \************************************************************************************/ The Daligner Overlap Library Author: Gene Myers First: July 17, 2013 Current: December 31, 2014 The commands below permit one to find all significant local alignments between reads encoded in Dazzler database. The assumption is that the reads are from a PACBIO RS II long read sequencer. That is the reads are long and noisy, up to 15% on average. Recall that a database has a current partition that divides it into blocks of a size that can conveniently be handled by calling the "dalign" overlapper on all the pairs of blocks producing a collection of .las local alignment files that can then be sorted and merged into an ordered sequence of sorted files containing all alignments between reads in the data set. The alignment records are parsimonious in that they do not record an alignment but simply a set of trace points, typically every 100bp or so, that allow the efficient reconstruction of alignments on demand. 1. daligner [-vbAI] [-k] [-w] [-h] [-t] [-M] [-e] [-H] [-m]+ ... Compare sequences in the trimmed block against those in the list of blocks searching for local alignments involving at least -l base pairs (default 1000) or more, that have an average correlation rate of -e (default 70%). The local alignments found will be output in a sparse encoding where a trace point on the alignment is recorded every -s base pairs of the a-read (default 100bp). Reads are compared in both orientations and local alignments meeting the criteria are output to one of several created files described below. The -v option turns on a verbose reporting mode that gives statistics on each major step of the computation. The options -k, -h, and -w control the initial filtration search for possible matches between reads. 
Specifically, our search code looks for a pair of diagonal bands of width 2^w (default 2^6 = 64) that contain a collection of exact matching k-mers (default 14) between the two reads, such that the total number of bases covered by the k-mer hits is h (default 35). k cannot be larger than 32 in the current implementation. If the -b option is set, then the daligner assumes the data has a strong compositional bias (e.g. >65% AT rich), and at the cost of a bit more time, dynamically adjusts k-mer sizes depending on compositional bias, so that the mers used have an effective specificity of 4^k. If there are one or more interval tracks specified with the -m option, then the reads of the DB or DB's to which the mask applies are soft masked with the union of the intervals of all the interval tracks that apply, that is any k-mers that contain any bases in any of the masked intervals are ignored for the purposes of seeding a match. An interval track is a track, such as the "dust" track created by DBdust, that encodes a set of intervals over either the untrimmed or trimmed DB. Invariably, some k-mers are significantly over-represented (e.g. homopolymer runs). These k-mers create an excessive number of matching k-mer pairs and left unaddressed would cause daligner to overflow the available physical memory. One way to deal with this is to explicitly set the -t parameter which suppresses the use of any k-mer that occurs more than t times in either the subject or target block. However, a better way to handle the situation is to let the program automatically select a value of t that meets a given memory usage limit specified (in Gb) by the -M parameter. By default daligner will use the amount of physical memory as the choice for -M. If you want to use less, say only 8Gb on a 24Gb HPC cluster node because you want to run 3 daligner jobs on the node, then specify -M8. 
Specifying -M0 basically indicates that you do not want daligner to self adjust k-mer suppression to fit within a given amount of memory. For each subject, target pair of blocks, say X and Y, the program reports alignments where the a-read is in X and the b-read is in Y, and vice versa. However, if the -A option is set ("A" for "asymmetric") then just overlaps where the a-read is in X and the b-read is in Y are reported, and if X = Y, then it further reports only those overlaps where the a-read index is less than the b-read index. In either case, if the -I option is set ("I" for "identity") then when X = Y, overlaps between different portions of the same read will also be found and reported. Each found alignment is recorded as -- a[ab,ae] x bo[bb,be] -- where a and b are the indices (in the trimmed DB) of the reads that overlap, o indicates whether the b-read is from the same or opposite strand, and [ab,ae] and [bb,be] are the intervals of a and bo, respectively, that align. The program places these alignment records in files whose name is of the form X.Y.[C|N]#.las where C indicates that the b-reads are complemented and N indicates they are not (both comparisons are performed) and # is the thread that detected and wrote out the collection of alignments contained in the file. That is the file X.Y.O#.las contains the alignments produced by thread # for which the a-read is from X and the b-read is from Y and in orientation O. The command "daligner -A X Y" produces 2*NTHREAD thread files X.Y.?.las and "daligner X Y" produces 4*NTHREAD files X.Y.?.las and Y.X.?.las (unless X=Y in which case only NTHREAD files, X.X.?.las, are produced). By default daligner compares all overlaps between reads in the database that are greater than the minimum cutoff set when the DB or DBs were split, typically 1 or 2 Kbp. However, the HGAP assembly pipeline only wants to correct large reads, say 8Kbp or over, and so needs only the overlaps where the a-read is one of the large reads. 
By setting the -H parameter to say N, one alters daligner so that it only reports overlaps where the a-read is over N base-pairs long. While the default parameter settings are good for raw Pacbio data, daligner can be used for efficiently finding alignments in corrected reads or other less noisy reads. For example, for mapping applications against .dams we run "daligner -k20 -h60 -e.85" and on corrected reads, we typically run "daligner -k25 -w5 -h60 -e.95 -s500" and at these settings it is very fast. 2. LAsort [-v] ... Sort each .las alignment file specified on the command line. For each file it reads in all the overlaps in the file and sorts them in lexicographical order of (a,b,o,ab) assuming each alignment is recorded as a[ab,ae] x b^o[bb,be]. It then writes them all to a file named .S.las (assuming that the input file was .las). With the -v option set, the program reports the number of records read and written. 3. LAmerge [-v] ... Merge the .las files into a single sorted file, where it is assumed that the input files are sorted. Due to operating system limits, the number of files must be <= 252. With the -v option set, the program reports the # of records read and written. Used correctly, LAmerge and LAsort together allow one to perform an "external" sort that produces a collection of sorted files containing in aggregate all the local alignments found by the daligner, such that their concatenation is sorted in order of (a,b,o,ab). In particular, this means that all the alignments for a given a-read will be found consecutively in one of the files. So computations that need to look at all the alignments for a given read can operate in simple sequential scans of these sorted files. 4. LAshow [-caroUF] [-i] [-w] [-b] [ ] [ | ... ] LAshow produces a printed listing of the local alignments contained in the specified .las file, where the a- and b-reads come from src1 or from src1 and src2, respectively.
If a file or list of read ranges is given then only the overlaps for which the a-read is in the set specified by the file or list are displayed. See DBshow for an explanation of how the file and list of read ranges are interpreted. If the -F option is set then the roles of the a- and b- reads are reversed in the display. If the -c option is given then a cartoon rendering is displayed, and if the -a or -r option is set then an alignment of the local alignment is displayed. The -a option puts exactly -w columns per segment of the display, whereas the -r option puts exactly -w a-read symbols in each segment of the display. The -r display mode is useful when one wants to visually compare two alignments involving the same a-read. If a combination of the -c, -a, and -r flags is set, then the cartoon comes first, then the -a alignment, and lastly the -r alignment. The -i option sets the indent for the cartoon and/or alignment displays, if they are requested. The -b option sets the number of symbols on either side of the aligned segments in an alignment display, and -U specifies that uppercase should be used for DNA sequence instead of the default lowercase. If the -o option is set then only alignments that are proper overlaps (a sequence end occurs at each end of the alignment) are displayed. 5. LAdump [-cdt] [-o] [ ] [ | ... ] Like LAshow, LAdump allows one to display the local alignments (LAs) of a subset of the piles in a .las file and select which information to show about them. The difference is that the information is written in a very simple "1-code" ASCII format that makes it easy for one to read and parse the information for further use. For each LA the pair of reads is output on a line. The -c option requests that the coordinates of the LA segments also be output. The -d option requests that the number of differences in the LA be output, and -t requests that the tracepoint information be output.
Finally, -o requests that only LAs that are proper overlaps be output. The format is very simple. Each requested piece of information occurs on a line. The first character of every line is a "1-code" character that tells you what information to expect on the line. The rest of the line contains information where each item is separated by a single blank space. The trace point line gives the number of trace point intervals in the LA and is immediately followed by that many lines containing a pair of integers giving the # of differences and b-displacement in each successive trace point interval. P #a #b - (#a,#b) have an LA between them C #ab #ae #bb #be - [#ab,#ae] aligns with [#bb,#be] D # - there are # differences in the LA T #n - there are #n trace point intervals for the LA (#d #y )^#n - there are #d differences aligning the #y bp's of B with the next fixed-size interval of A + X # - Total amount of X (X = P or T) % X # - Maximum amount of X in any pile (X = P or T) @ T # - Maximum number of trace points in any trace 1-code lines that begin with +, %, or @ are always the first lines in the output. They give size information about what is contained in the output. Specifically, '+ X #' gives the total number of LAs (X=P), or the total number of trace point intervals (X=T) in the file. '% X #' gives the maximum number of LAs (X=P) or the maximum number of trace point intervals (X=T) in a given *pile* (a collection of LAs all with the same a-read; this applies only to sorted .las files). Finally, '@ T #' gives the maximum # of trace point intervals in any trace within the file. 6. LAindex -v ... LAindex takes a series of one or more sorted .las files and produces a "pile index" for each one. If the input file has name "X.las", then the name of its index file is ".X.las.idx". For each A-read pile encoded in the .las file, the index contains the offset to the first local alignment with A in the file.
The index starts with four 64-bit integers that encode the numbers % P, + T, % T, and @ T described for LAdump above, and then an offset for each pile beginning with the first A-read in the file (which may not be read 0). The index is meant to allow programs that process piles to more efficiently read just the piles they need at any moment in time, as opposed to having to sequentially scan through the .las file. 7. LAcat <source:las> > <target>.las Given argument <source>, find all files <source>.1.las, <source>.2.las, ... <source>.n.las where <source>.i.las exists for every i in [1,n]. Then concatenate these files in order into a single .las file and pipe the result to the standard output. 8. LAsplit ( | ) < .las If the second argument is an integer n, then divide the alignment file , piped in through the standard input, as evenly as possible into n alignment files with the name .i.las for i in [1,n], subject to the restriction that all alignment records for a given a-read are in the same file. If the second argument refers to a database .db that has been partitioned, then divide the input alignment file into block .las files where all records whose a-read is in .i.db are in .i.las. 9. LAcheck [-vS] [ ] ... LAcheck checks each .las file for structural integrity, where the a- and b-sequences come from src1 or from src1 and src2, respectively. That is, it makes sure each file makes sense as a plausible .las file, e.g. values are not out of bounds, the number of records is correct, the number of trace points for a record is correct, and so on. If the -S option is set then it further checks that the alignments are in sorted order. If the -v option is set then a line is output for each .las file saying either the file is OK or reporting the first error. If the -v option is not set then the program runs silently. The exit status is 0 if every file is deemed good, and 1 if at least one of the files looks corrupted. 10.
HPCdaligner [-vbAI] [-k] [-w] [-h] [-t] [-M] [-e] [-H] [-m]+ [-dal] [-deg] [[-]] HPCdaligner writes a UNIX shell script to the standard output that consists of a sequence of commands that effectively run daligner on all pairs of blocks of a split database and then externally sorts and merges them using LAsort and LAmerge into a collection of alignment files with names .#.las where # ranges from 1 to the number of blocks the data base is split into. These sorted files if concatenated by say LAcat would contain all the alignments in sorted order (of a-read, then b-read, ...). Moreover, all overlaps for a given a-read are guaranteed to not be split across files, so one can run artifact analyzers or error correction on each sorted file in parallel. The data base must have been previously split by DBsplit and all the parameters, except -v, -dal, and -deg, are passed through to the calls to daligner. The defaults for these parameters are as for daligner. The -v flag, for verbose-mode, is also passed to all calls to LAsort and LAmerge. -dal and -deg options are described later. For a database divided into N sub-blocks, the calls to daligner will produce in total 2TN^2 .las files assuming daligner runs with T threads. These will then be sorted and merged into N^2 sorted .las files, one for each block pair. These are then merged in ceil(log_deg N) phases where the number of files decreases geometrically in -deg until there is 1 file per row of the N x N block matrix. So at the end one has N sorted .las files that when concatenated would give a single large sorted overlap file. The -dal option (default 4) gives the desired number of block comparisons per call to daligner. Some must contain dal-1 comparisons, and the first dal-2 block comparisons even less, but the HPCdaligner "planner" does the best it can to give an average load of dal block comparisons per command. 
The -deg option (default 25) gives the maximum number of files that will be merged in a single LAmerge command. The planner makes the most even k-ary tree of merges, where the number of levels is ceil(log_deg N). If the integers <first> and <last> are missing then the script produced is for every block in the database. If <first> is present then HPCdaligner produces an incremental script that compares blocks <first> through <last> (<last> = <first> if not present) against each other and all previous blocks 1 through <first>-1, and then incrementally updates the .las files for blocks 1 through <first>-1, and creates the .las files for blocks <first> through <last>. Each UNIX command line output by the HPCdaligner can be a batch job (we use the && operator to combine several commands into one line to make this so). Dependencies between jobs can be maintained simply by first running all the daligner jobs, then all the initial sort jobs, and then all the jobs in each phase of the external merge sort. Each of these phases is separated by an informative comment line for your scripting convenience. 11. HPCmapper [-vb] [-k] [-w] [-h] [-t] [-M] [-e] [-H] [-m]+ [-dal] [-deg] [[-]] HPCmapper writes a UNIX shell script to the standard output that consists of a sequence of commands that effectively "maps" every read in the DB <reads> against a reference set of sequences in the DB <ref>, recording all the found local alignments in the sequence of files <ref>.<reads>.1.las, <ref>.<reads>.2.las, ... where <ref>.<reads>.k.las contains the alignments between all of <ref> and the k'th block of <reads>. The parameters are exactly the same as for HPCdaligner save that the -k, -h, and -e defaults are set appropriately for mapping, and the -A and -I options make no sense as <ref> and <reads> are expected to be distinct data sets. If the integers <first> and <last> are missing then the script produced is for every block in the database <reads>. If <first> is present then HPCmapper produces a script that compares blocks <first> through <last> (<last> = <first> if not present) against DAM <ref>.
Example: // Recall G.db from the example in DAZZ_DB/README > cat G.db files = 1 1862 G Sim blocks = 2 size = 11 cutoff = 0 all = 0 0 0 1024 1024 1862 1862 > HPCdaligner -mdust -t5 G | csh -v // Run the HPCdaligner script # Dazzler jobs (2) dazzler -d -t5 -mdust G.1 G.1 dazzler -d -t5 -mdust G.2 G.1 G.2 # Initial sort jobs (4) LAsort G.1.G.1.*.las && LAmerge G.L1.1.1 G.1.G.1.*.S.las && rm G.1.G.1.*.S.las LAsort G.1.G.2.*.las && LAmerge G.L1.1.2 G.1.G.2.*.S.las && rm G.1.G.2.*.S.las LAsort G.2.G.1.*.las && LAmerge G.L1.2.1 G.2.G.1.*.S.las && rm G.2.G.1.*.S.las LAsort G.2.G.2.*.las && LAmerge G.L1.2.2 G.2.G.2.*.S.las && rm G.2.G.2.*.S.las # Level 1 jobs (2) LAmerge G.1 G.L1.1.1 G.L1.1.2 && rm G.L1.1.1.las G.L1.1.2.las LAmerge G.2 G.L1.2.1 G.L1.2.2 && rm G.L1.2.1.las G.L1.2.2.las > LAshow -c -a:G -w50 G.1 | more // Take a look at the result ! G.1: 34,510 records 1 9 c [ 0.. 1,876] x [ 9,017..10,825] ( 18 trace pts) 12645 A ---------+====> dif/(len1+len2) = 398/(1876+1808) = 21.61% B <====+--------- 9017 1 ..........gtg-cggt--caggggtgcctgc-t-t-atcgcaatgtta |||*||||**||||||||*||||*|*|*||**|*|*|||| 9008 gagaggccaagtggcggtggcaggggtg-ctgcgtcttatatccaggtta 27.5% 35 ta-ctgggtggttaaacttagccaggaaacctgttgaaataa-acggtgg ||*|||||||||||||*|**|*||*|*||||||*|**|||||*|*||||| 9057 tagctgggtggttaaa-tctg-ca-g-aacctg-t--aataacatggtgg 24.0% 83 -ctagtggcttgccgtttacccaacagaagcataatgaaa-tttgaaagt *||||||||*||||||||*||**||||*|||**|||||||*||||*|||| 9100 gctagtggc-tgccgttt-ccgcacag-agc--aatgaaaatttg-aagt 20.0% 131 ggtaggttcctgctgtct-acatacagaacgacggagcgaaaaggtaccg ||*|||||||||||||*|*||||*|*|*||||||||||*||||||||||* 9144 gg-aggttcctgctgt-tcacat-c-ggacgacggagc-aaaaggtacc- 16.0% ... > LAcat G >G.las // Combine G.1.las & G.2.las into a single .las file > LAshow G G | more // Take another look, now at G.las G: 62,654 records 1 9 c [ 0.. 1,876] x [ 9,017..10,825] : < 398 diffs ( 18 trace pts) 1 38 c [ 0.. 7,107] x [ 5,381..12,330] : < 1,614 diffs ( 71 trace pts) 1 49 n [ 5,493..14,521] x [ 0.. 
9,065] : < 2,028 diffs ( 91 trace pts) 1 68 n [12,809..14,521] x [ 0.. 1,758] : < 373 diffs ( 17 trace pts) 1 147 c [ 0..13,352] x [ 854..14,069] : < 2,993 diffs (133 trace pts) 1 231 n [10,892..14,521] x [ 0.. 3,735] : < 816 diffs ( 37 trace pts) 1 292 c [ 3,835..14,521] x [ 0..10,702] : < 2,353 diffs (107 trace pts) 1 335 n [ 7,569..14,521] x [ 0.. 7,033] : < 1,544 diffs ( 70 trace pts) 1 377 c [ 9,602..14,521] x [ 0.. 5,009] : < 1,104 diffs ( 49 trace pts) 1 414 c [ 6,804..14,521] x [ 0.. 7,812] : < 1,745 diffs ( 77 trace pts) 1 415 c [ 0.. 3,613] x [ 7,685..11,224] : < 840 diffs ( 36 trace pts) 1 445 c [ 9,828..14,521] x [ 0.. 4,789] : < 1,036 diffs ( 47 trace pts) 1 464 n [ 0.. 1,942] x [12,416..14,281] : < 411 diffs ( 19 trace pts) ... pbdagcon-0.3+20161121+ds/DALIGNER/daligner_p.c0000644000175000017500000000002613026414545016457 0ustar afifafif#include "daligner.c" pbdagcon-0.3+20161121+ds/DALIGNER/LAcat.c0000644000175000017500000001006213026414545015340 0ustar afifafif/******************************************************************************************* * * Merge together in index order, overlap files .1.las, .2.las, ... 
into a * single overlap file and output to the standard output * * Author: Gene Myers * Date : July 2013 * *******************************************************************************************/ #include #include #include #include #include #include #include #include "DB.h" #include "align.h" static char *Usage = " > .las"; #define MEMORY 1000 // How many megabytes for output buffer int main(int argc, char *argv[]) { char *iblock, *oblock; FILE *input; int64 novl, bsize, ovlsize, ptrsize; int tspace, tbytes; char *pwd, *root; Prog_Name = Strdup("LAcat",""); if (argc <= 1) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } ptrsize = sizeof(void *); ovlsize = sizeof(Overlap) - ptrsize; bsize = MEMORY * 1000000ll; oblock = (char *) Malloc(bsize,"Allocating output block"); iblock = (char *) Malloc(bsize + ptrsize,"Allocating input block"); if (oblock == NULL || iblock == NULL) exit (1); iblock += ptrsize; pwd = PathTo(argv[1]); root = Root(argv[1],".las"); { int64 povl; int i, mspace; novl = 0; tspace = 0; mspace = 0; tbytes = 0; for (i = 0; 1; i++) { char *name = Catenate(pwd,"/",root,Numbered_Suffix(".",i+1,".las")); if ((input = fopen(name,"r")) == NULL) break; if (fread(&povl,sizeof(int64),1,input) != 1) SYSTEM_ERROR novl += povl; if (fread(&mspace,sizeof(int),1,input) != 1) SYSTEM_ERROR if (i == 0) { tspace = mspace; if (tspace <= TRACE_XOVR) tbytes = sizeof(uint8); else tbytes = sizeof(uint16); } else if (tspace != mspace) { fprintf(stderr,"%s: PT-point spacing conflict (%d vs %d)\n",Prog_Name,tspace,mspace); exit (1); } fclose(input); } fwrite(&novl,sizeof(int64),1,stdout); fwrite(&tspace,sizeof(int),1,stdout); } { int i, j; Overlap *w; int64 tsize, povl; int mspace; char *iptr, *itop; char *optr, *otop; optr = oblock; otop = oblock + bsize; for (i = 0; 1; i++) { char *name = Catenate(pwd,"/",root,Numbered_Suffix(".",i+1,".las")); if ((input = fopen(name,"r")) == NULL) break; if (fread(&povl,sizeof(int64),1,input) != 1) SYSTEM_ERROR if 
(fread(&mspace,sizeof(int),1,input) != 1) SYSTEM_ERROR iptr = iblock; itop = iblock + fread(iblock,1,bsize,input); for (j = 0; j < povl; j++) { if (iptr + ovlsize > itop) { int64 remains = itop-iptr; if (remains > 0) memcpy(iblock,iptr,remains); iptr = iblock; itop = iblock + remains; itop += fread(itop,1,bsize-remains,input); } w = (Overlap *) (iptr - ptrsize); tsize = w->path.tlen*tbytes; if (optr + ovlsize + tsize > otop) { fwrite(oblock,1,optr-oblock,stdout); optr = oblock; } memcpy(optr,iptr,ovlsize); optr += ovlsize; iptr += ovlsize; if (iptr + tsize > itop) { int64 remains = itop-iptr; if (remains > 0) memcpy(iblock,iptr,remains); iptr = iblock; itop = iblock + remains; itop += fread(itop,1,bsize-remains,input); } memcpy(optr,iptr,tsize); optr += tsize; iptr += tsize; } fclose(input); } if (optr > oblock) fwrite(oblock,1,optr-oblock,stdout); } free(pwd); free(root); free(oblock); free(iblock-ptrsize); exit (0); } pbdagcon-0.3+20161121+ds/DALIGNER/LAcheck.c0000644000175000017500000002142313026414545015651 0ustar afifafif/******************************************************************************************* * * Check the structural integrity of .las files * * Author: Gene Myers * Date : July 2014 * *******************************************************************************************/ #include #include #include #include #include #include #include #include "DB.h" #include "align.h" static char *Usage = "[-vS] [ ] ..."; #define MEMORY 1000 // How many megabytes for output buffer int main(int argc, char *argv[]) { HITS_DB _db1, *db1 = &_db1; HITS_DB _db2, *db2 = &_db2; int VERBOSE; int SORTED; int ISTWO; // Process options { int i, j, k; int flags[128]; ARG_INIT("LAcheck") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("vS") break; } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; SORTED = flags['S']; if (argc <= 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // Open 
trimmed DB { int status; char *pwd, *root; FILE *input; ISTWO = 0; status = Open_DB(argv[1],db1); if (status < 0) exit (1); if (db1->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } pwd = PathTo(argv[2]); root = Root(argv[2],".las"); if ((input = fopen(Catenate(pwd,"/",root,".las"),"r")) == NULL) { ISTWO = 1; if (argc <= 3) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } status = Open_DB(argv[2],db2); if (status < 0) exit (1); if (db2->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[2]); exit (1); } Trim_DB(db2); } else { fclose(input); db2 = db1; } Trim_DB(db1); free(root); free(pwd); } { char *iblock; int64 bsize, ovlsize, ptrsize; int i, j; HITS_READ *reads1 = db1->reads; int nreads1 = db1->nreads; HITS_READ *reads2 = db2->reads; int nreads2 = db2->nreads; // Setup IO buffers ptrsize = sizeof(void *); ovlsize = sizeof(Overlap) - ptrsize; bsize = MEMORY * 1000000ll; iblock = (char *) Malloc(bsize+ptrsize,"Allocating input block"); if (iblock == NULL) exit (1); iblock += ptrsize; // For each file do for (i = 2+ISTWO; i < argc; i++) { char *pwd, *root; FILE *input; char *iptr, *itop; Overlap last; int64 novl; int tspace, tbytes; // Establish IO and (novl,tspace) header pwd = PathTo(argv[i]); root = Root(argv[i],".las"); if ((input = Fopen(Catenate(pwd,"/",root,".las"),"r")) == NULL) goto error; if (fread(&novl,sizeof(int64),1,input) != 1) SYSTEM_ERROR if (fread(&tspace,sizeof(int),1,input) != 1) SYSTEM_ERROR if (novl < 0) { if (VERBOSE) fprintf(stderr," %s: Number of alignments < 0\n",root); goto error; } if (tspace < 0) { if (VERBOSE) fprintf(stderr," %s: Trace spacing < 0\n",root); goto error; } if (tspace <= TRACE_XOVR) tbytes = sizeof(uint8); else tbytes = sizeof(uint16); iptr = iblock; itop = iblock + fread(iblock,1,bsize,input); // For each record in file do last.aread = -1; last.bread = -1; last.flags = 0; last.path.bbpos = last.path.abpos = 0; last.path.bepos 
= last.path.aepos = 0; for (j = 0; j < novl; j++) { Overlap ovl; int tsize; int equal; // Fetch next record if (iptr + ovlsize > itop) { int64 remains = itop-iptr; if (remains > 0) memcpy(iblock,iptr,remains); iptr = iblock; itop = iblock + remains; itop += fread(itop,1,bsize-remains,input); if (iptr + ovlsize > itop) { if (VERBOSE) fprintf(stderr," %s: Too few alignment records\n",root); goto error; } } ovl = *((Overlap *) (iptr - ptrsize)); iptr += ovlsize; tsize = ovl.path.tlen*tbytes; if (iptr + tsize > itop) { int64 remains = itop-iptr; if (remains > 0) memcpy(iblock,iptr,remains); iptr = iblock; itop = iblock + remains; itop += fread(itop,1,bsize-remains,input); if (iptr + tsize > itop) { if (VERBOSE) fprintf(stderr," %s: Too few alignment records\n",root); goto error; } } ovl.path.trace = iptr; iptr += tsize; // Basic checks if (ovl.aread < 0 || ovl.bread < 0) { if (VERBOSE) fprintf(stderr," %s: Read indices < 0\n",root); goto error; } if (ovl.aread >= nreads1 || ovl.bread >= nreads2) { if (VERBOSE) fprintf(stderr," %s: Read indices out of range\n",root); goto error; } if (ovl.path.abpos >= ovl.path.aepos || ovl.path.aepos > reads1[ovl.aread].rlen || ovl.path.bbpos >= ovl.path.bepos || ovl.path.bepos > reads2[ovl.bread].rlen || ovl.path.abpos < 0 || ovl.path.bbpos < 0 ) { if (VERBOSE) fprintf(stderr," %s: Non-sense alignment intervals\n",root); goto error; } if (ovl.path.diffs < 0 || ovl.path.diffs > reads1[ovl.aread].rlen || ovl.path.diffs > reads2[ovl.bread].rlen) { if (VERBOSE) fprintf(stderr," %s: Non-sense number of differences\n",root); goto error; } if (Check_Trace_Points(&ovl,tspace,VERBOSE,root)) goto error; // Duplicate check and sort check if -S set equal = 0; if (SORTED) { if (ovl.aread > last.aread) goto inorder; if (ovl.aread == last.aread) { if (ovl.bread > last.bread) goto inorder; if (ovl.bread == last.bread) { if (COMP(ovl.flags) > COMP(last.flags)) goto inorder; if (COMP(ovl.flags) == COMP(last.flags)) { if (ovl.path.abpos > 
last.path.abpos) goto inorder; if (ovl.path.abpos == last.path.abpos) { equal = 1; goto inorder; } } } } if (VERBOSE) fprintf(stderr," %s: Reads are not sorted (%d vs %d)\n", root,ovl.aread+1,ovl.bread+1); goto error; } else { if (ovl.aread == last.aread && ovl.bread == last.bread && COMP(ovl.flags) == COMP(last.flags) && ovl.path.abpos == last.path.abpos) equal = 1; } inorder: if (equal) { if (ovl.path.aepos == last.path.aepos && ovl.path.bbpos == last.path.bbpos && ovl.path.bepos == last.path.bepos) { if (VERBOSE) fprintf(stderr," %s: Duplicate overlap (%d vs %d)\n", root,ovl.aread+1,ovl.bread+1); goto error; } } last = ovl; } // File processing epilog: Check all data read and print OK if -v if (iptr < itop) { if (VERBOSE) fprintf(stderr," %s: Too many alignment records\n",root); goto error; } if (VERBOSE) { fprintf(stderr," %s: ",root); Print_Number(novl,0,stderr); fprintf(stderr," all OK\n"); } error: fclose(input); free(pwd); free(root); } free(iblock-ptrsize); } Close_DB(db1); if (ISTWO) Close_DB(db2); exit (0); } pbdagcon-0.3+20161121+ds/DALIGNER/filter_p.c0000644000175000017500000000005613026414545016162 0ustar afifafif#define FALCON_DALIGNER_P #include "filter.c" pbdagcon-0.3+20161121+ds/DALIGNER/LAdump.c0000644000175000017500000002664313026414545015552 0ustar afifafif/******************************************************************************************* * * Utility for displaying the information in the overlaps of a .las file in a very * simple to parse format. * * Author: Gene Myers * Creation: July 2013 * Last Mod: Jan 2015 * *******************************************************************************************/ #include #include #include #include #include #include #include #include #include "DB.h" #include "align.h" static char *Usage = "[-cdt] [-o] [ ] [ | ... 
]"; #define LAST_READ_SYMBOL '$' static int ORDER(const void *l, const void *r) { int x = *((int *) l); int y = *((int *) r); return (x-y); } int main(int argc, char *argv[]) { HITS_DB _db1, *db1 = &_db1; HITS_DB _db2, *db2 = &_db2; Overlap _ovl, *ovl = &_ovl; FILE *input; int64 novl; int tspace, tbytes, small; int reps, *pts; int input_pts; int OVERLAP; int DOCOORDS, DODIFFS, DOTRACE; int ISTWO; // Process options { int i, j, k; int flags[128]; ARG_INIT("LAdump") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("ocdtUF") break; } else argv[j++] = argv[i]; argc = j; OVERLAP = flags['o']; DOCOORDS = flags['c']; DODIFFS = flags['d']; DOTRACE = flags['t']; if (DOTRACE) DOCOORDS = 1; if (argc <= 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // Open trimmed DB or DB pair { int status; char *pwd, *root; FILE *input; ISTWO = 0; status = Open_DB(argv[1],db1); if (status < 0) exit (1); if (db1->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } if (argc > 3) { pwd = PathTo(argv[3]); root = Root(argv[3],".las"); if ((input = fopen(Catenate(pwd,"/",root,".las"),"r")) != NULL) { ISTWO = 1; fclose(input); status = Open_DB(argv[2],db2); if (status < 0) exit (1); if (db2->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[2]); exit (1); } Trim_DB(db2); } else db2 = db1; free(root); free(pwd); } else db2 = db1; Trim_DB(db1); } // Process read index arguments into a sorted list of read ranges input_pts = 0; if (argc == ISTWO+4) { if (argv[ISTWO+3][0] != LAST_READ_SYMBOL || argv[ISTWO+3][1] != '\0') { char *eptr, *fptr; int b, e; b = strtol(argv[ISTWO+3],&eptr,10); if (eptr > argv[ISTWO+3] && b > 0) { if (*eptr == '-') { if (eptr[1] != LAST_READ_SYMBOL || eptr[2] != '\0') { e = strtol(eptr+1,&fptr,10); input_pts = (fptr <= eptr+1 || *fptr != '\0' || e <= 0); } } else input_pts = (*eptr != '\0'); } else input_pts = 1; } } if 
(input_pts) { int v, x; FILE *input; input = Fopen(argv[ISTWO+3],"r"); if (input == NULL) exit (1); reps = 0; while ((v = fscanf(input," %d",&x)) != EOF) if (v == 0) { fprintf(stderr,"%s: %d'th item of input file %s is not an integer\n", Prog_Name,reps+1,argv[2]); exit (1); } else reps += 1; reps *= 2; pts = (int *) Malloc(sizeof(int)*reps,"Allocating read parameters"); if (pts == NULL) exit (1); rewind(input); for (v = 0; v < reps; v += 2) { fscanf(input," %d",&x); pts[v] = pts[v+1] = x; } fclose(input); } else { pts = (int *) Malloc(sizeof(int)*2*argc,"Allocating read parameters"); if (pts == NULL) exit (1); reps = 0; if (argc > 3+ISTWO) { int c, b, e; char *eptr, *fptr; for (c = 3+ISTWO; c < argc; c++) { if (argv[c][0] == LAST_READ_SYMBOL) { b = db1->nreads; eptr = argv[c]+1; } else b = strtol(argv[c],&eptr,10); if (eptr > argv[c]) { if (b <= 0) { fprintf(stderr,"%s: %d is not a valid index\n",Prog_Name,b); exit (1); } if (*eptr == '\0') { pts[reps++] = b; pts[reps++] = b; continue; } else if (*eptr == '-') { if (eptr[1] == LAST_READ_SYMBOL) { e = INT32_MAX; fptr = eptr+2; } else e = strtol(eptr+1,&fptr,10); if (fptr > eptr+1 && *fptr == 0 && e > 0) { pts[reps++] = b; pts[reps++] = e; if (b > e) { fprintf(stderr,"%s: Empty range '%s'\n",Prog_Name,argv[c]); exit (1); } continue; } } } fprintf(stderr,"%s: argument '%s' is not an integer range\n",Prog_Name,argv[c]); exit (1); } qsort(pts,reps/2,sizeof(int64),ORDER); b = 0; for (c = 0; c < reps; c += 2) if (b > 0 && pts[b-1] >= pts[c]-1) { if (pts[c+1] > pts[b-1]) pts[b-1] = pts[c+1]; } else { pts[b++] = pts[c]; pts[b++] = pts[c+1]; } pts[b++] = INT32_MAX; reps = b; } else { pts[reps++] = 1; pts[reps++] = INT32_MAX; } } // Initiate file reading and read header { char *over, *pwd, *root; pwd = PathTo(argv[2+ISTWO]); root = Root(argv[2+ISTWO],".las"); over = Catenate(pwd,"/",root,".las"); input = Fopen(over,"r"); if (input == NULL) exit (1); if (fread(&novl,sizeof(int64),1,input) != 1) SYSTEM_ERROR if 
(fread(&tspace,sizeof(int),1,input) != 1) SYSTEM_ERROR if (tspace <= TRACE_XOVR) { small = 1; tbytes = sizeof(uint8); } else { small = 0; tbytes = sizeof(uint16); } free(pwd); free(root); } // Scan to count sizes of things { int j, al, tlen; int in, npt, idx, ar; int64 novls, odeg, omax, sdeg, smax, ttot, tmax; in = 0; npt = pts[0]; idx = 1; // For each record do novls = omax = smax = ttot = tmax = 0; sdeg = odeg = 0; al = 0; for (j = 0; j < novl; j++) // Read it in { Read_Overlap(input,ovl); tlen = ovl->path.tlen; fseeko(input,tlen*tbytes,SEEK_CUR); // Determine if it should be displayed ar = ovl->aread+1; if (in) { while (ar > npt) { npt = pts[idx++]; if (ar < npt) { in = 0; break; } npt = pts[idx++]; } } else { while (ar >= npt) { npt = pts[idx++]; if (ar <= npt) { in = 1; break; } npt = pts[idx++]; } } if (!in) continue; // If -o check display only overlaps if (OVERLAP) { if (ovl->path.abpos != 0 && ovl->path.bbpos != 0) continue; if (ovl->path.aepos != db1->reads[ovl->aread].rlen && ovl->path.bepos != db2->reads[ovl->bread].rlen) continue; } if (ar != al) { if (sdeg > smax) smax = sdeg; if (odeg > omax) omax = odeg; sdeg = odeg = 0; al = ar; } novls += 1; odeg += 1; sdeg += tlen; ttot += tlen; if (tlen > tmax) tmax = tlen; } if (sdeg > smax) smax = sdeg; if (odeg > omax) omax = odeg; printf("+ P %lld\n",novls); printf("%% P %lld\n",omax); printf("+ T %lld\n",ttot); printf("%% T %lld\n",smax); printf("@ T %lld\n",tmax); } // Read the file and display selected records { int j; uint16 *trace; int tmax; int in, npt, idx, ar; int64 verse; rewind(input); fread(&verse,sizeof(int64),1,input); fread(&tspace,sizeof(int),1,input); if (verse < 0) { for (j = 0; j < 5; j++) fread(&verse,sizeof(int64),1,input); } tmax = 1000; trace = (uint16 *) Malloc(sizeof(uint16)*tmax,"Allocating trace vector"); if (trace == NULL) exit (1); in = 0; npt = pts[0]; idx = 1; // For each record do for (j = 0; j < novl; j++) // Read it in { Read_Overlap(input,ovl); if (ovl->path.tlen > tmax) { 
tmax = ((int) 1.2*ovl->path.tlen) + 100; trace = (uint16 *) Realloc(trace,sizeof(uint16)*tmax,"Allocating trace vector"); if (trace == NULL) exit (1); } ovl->path.trace = (void *) trace; Read_Trace(input,ovl,tbytes); // Determine if it should be displayed ar = ovl->aread+1; if (in) { while (ar > npt) { npt = pts[idx++]; if (ar < npt) { in = 0; break; } npt = pts[idx++]; } } else { while (ar >= npt) { npt = pts[idx++]; if (ar <= npt) { in = 1; break; } npt = pts[idx++]; } } if (!in) continue; // If -o check display only overlaps if (OVERLAP) { if (ovl->path.abpos != 0 && ovl->path.bbpos != 0) continue; if (ovl->path.aepos != db1->reads[ovl->aread].rlen && ovl->path.bepos != db2->reads[ovl->bread].rlen) continue; } // Display it printf("P %d %d",ovl->aread+1,ovl->bread+1); if (COMP(ovl->flags)) printf(" c\n"); else printf(" n\n"); if (DOCOORDS) printf("C %d %d %d %d\n",ovl->path.abpos,ovl->path.aepos,ovl->path.bbpos,ovl->path.bepos); if (DODIFFS) printf("D %d\n",ovl->path.diffs); if (DOTRACE) { uint16 *trace = (uint16 *) ovl->path.trace; int tlen = ovl->path.tlen; if (small) Decompress_TraceTo16(ovl); printf("T %d\n",tlen>>1); for (j = 0; j < tlen; j += 2) printf(" %3d %3d\n",trace[j],trace[j+1]); } } free(trace); } Close_DB(db1); if (ISTWO) Close_DB(db2); exit (0); } pbdagcon-0.3+20161121+ds/DALIGNER/align.h0000644000175000017500000004256413026414545015467 0ustar afifafif/******************************************************************************************* * * Local alignment module. Routines for finding local alignments given a seed position, * representing such an l.a. with its interval and a set of pass-thru points, so that * a detailed alignment can be efficiently computed on demand. * * All routines work on a numeric representation of DNA sequences, i.e. 0 for A, 1 for C, * 2 for G, and 3 for T. 
* * Author: Gene Myers * Date : July 2013 * ********************************************************************************************/ #ifndef _A_MODULE #define _A_MODULE #include "DB.h" #define TRACE_XOVR 125 // If the trace spacing is not more than this value, then can // and do compress traces pts to 8-bit unsigned ints /*** INTERACTIVE vs BATCH version The defined constant INTERACTIVE (set in DB.h) determines whether an interactive or batch version of the routines in this library is compiled. In batch mode, routines print an error message and exit. In interactive mode, the routines place the error message in EPLACE (also defined in DB.h) and return an error value, typically NULL if the routine returns a pointer, and an unusual integer value if the routine returns an integer. Below when an error return is described, one should understand that this value is returned only if the routine was compiled in INTERACTIVE mode. ***/ /*** PATH ABSTRACTION: Coordinates are *between* characters where 0 is the tick just before the first char, 1 is the tick between the first and second character, and so on. Our data structure is called a Path referring to its conceptualization in an edit graph. A local alignment is specified by the point '(abpos,bbpos)' at which its path in the underlying edit graph starts, and the point '(aepos,bepos)' at which it ends. In other words A[abpos+1..aepos] is aligned to B[bbpos+1..bepos] (assuming X[1] is the *first* character of X). There are 'diffs' differences in an optimal local alignment between the beginning and end points of the alignment (if computed by Compute_Trace), or nearly so (if computed by Local_Alignment). Optionally, a Path can have additional information about the exact nature of the aligned substrings if the field 'trace' is not NULL. Trace points to either an array of integers (if computed by a Compute_Trace routine), or an array of unsigned short integers (if computed by Local_Alignment).
If computed by Local_Alignment 'trace' points at a list of 'tlen' (always even) short values: d_0, b_0, d_1, b_1, ... d_n-1, b_n-1, d_n, b_n to be interpreted as follows. The alignment from (abpos,bbpos) to (aepos,bepos) passes through the n trace points for i in [1,n]: (a_i,b_i) where a_i = floor(abpos/TS)*TS + i*TS and b_i = bbpos + (b_0 + b_1 + b_i-1) where also let a_0,b_0 = abpos,bbpos and a_(n+1),b_(n+1) = aepos,bepos. That is, the interior (i.e. i != 0 and i != n+1) trace points pass through every TS'th position of the aread where TS is the "trace spacing" employed when finding the alignment (see New_Align_Spec). Typically TS is 100. Then d_i is the number of differences in the portion of the alignment between (a_i,b_i) and (a_i+1,b_i+1). These trace points allow the Compute_Trace routines to efficiently compute the exact alignment between the two reads by efficiently computing exact alignments between consecutive pairs of trace points. Moreover, the diff values give one an idea of the quality of the alignment along every segment of TS symbols of the aread. If computed by a Compute_Trace routine, 'trace' points at a list of 'tlen' integers < i1, i2, ... in > that encodes an exact alignment as follows. A negative number j indicates that a dash should be placed before A[-j] and a positive number k indicates that a dash should be placed before B[k], where A and B are the two sequences of the overlap. The indels occur in the trace in the order in which they occur along the alignment. For a good example of how to "decode" a trace into an alignment, see the code for the routine Print_Alignment. ***/ typedef struct { void *trace; int tlen; int diffs; int abpos, bbpos; int aepos, bepos; } Path; /*** ALIGNMENT ABSTRACTION: An alignment is modeled by an Alignment record, which in addition to a *pointer* to a 'path', gives pointers to the A and B sequences, their lengths, and indicates whether the B-sequence needs to be complemented ('comp' non-zero if so). 
The 'trace' pointer of the 'path' subrecord can be either NULL, a list of pass-through points, or an exact trace depending on what routines have been called on the record. One can (1) compute a trace, with Compute_Trace, either from scratch if 'path.trace' = NULL, or using the sequence of pass-through points in trace, (2) print an ASCII representation of an alignment, or (3) reverse the roles of A and B, and (4) complement a sequence (which is a reversible process). If the alignment record shows the B sequence as complemented, *** THEN IT IS THE RESPONSIBILITY OF THE CALLER *** to make sure that bseq points at a complement of the sequence before calling Compute_Trace or Print_Alignment. Complement_Seq complements the sequence a of length n. The operation does the complementation/reversal in place. Calling it a second time on a given fragment restores it to its original state. ***/ #define COMP(x) ((x) & 0x1) #define COMP_FLAG 0x1 typedef struct { Path *path; uint32 flags; /* Pipeline status and complementation flags */ char *aseq; /* Pointer to A sequence */ char *bseq; /* Pointer to B sequence */ int alen; /* Length of A sequence */ int blen; /* Length of B sequence */ } Alignment; void Complement_Seq(char *a, int n); /* Many routines like Local_Alignment, Compute_Trace, and Print_Alignment need working storage that is more efficiently reused with each call, rather than being allocated anew with each call. Each *thread* can create a Work_Data object with New_Work_Data and this object holds and retains the working storage for routines of this module between calls to the routines. If enough memory for a Work_Data is not available then NULL is returned. Free_Work_Data frees a Work_Data object and all working storage held by it. */ typedef void Work_Data; Work_Data *New_Work_Data(); void Free_Work_Data(Work_Data *work); /* Local_Alignment seeks local alignments of a quality determined by a number of parameters. 
These are coded in an Align_Spec object that can be created with New_Align_Spec and freed with Free_Align_Spec when no longer needed. There are 4 essential parameters: ave_corr: the average correlation (1 - 2*error_rate) for the sought alignments. For Pacbio data we set this to .70 assuming an average of 15% error in each read. trace_space: the spacing interval for keeping trace points and segment differences (see description of 'trace' for Paths above) freq[4]: a 4-element vector where afreq[0] = frequency of A, f(A), freq[1] = f(C), freq[2] = f(G), and freq[3] = f(T). This vector is part of the header of every HITS database (see db.h). If an alignment cannot reach the boundary of the d.p. matrix with this condition (i.e. overlap), then the last/first 30 columns of the alignment are guaranteed to be suffix/prefix positive at correlation ave_corr * g(freq) where g is an empirically measured function that increases from 1 as the entropy of freq decreases. If memory is unavailable or the freq distribution is too skewed then NULL is returned. You can get back the original parameters used to create an Align_Spec with the simple utility functions below. */ typedef void Align_Spec; Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq); void Free_Align_Spec(Align_Spec *spec); int Trace_Spacing (Align_Spec *spec); double Average_Correlation(Align_Spec *spec); float *Base_Frequencies (Align_Spec *spec); /* Local_Alignment finds the longest significant local alignment between the sequences in 'align' subject to: (a) the alignment criterion given by the Align_Spec 'spec', (b) it passes through one of the points (anti+k)/2,(anti-k)/2 for k in [low,hgh] within the underlying dynamic programming matrix (i.e. 
the points on diagonals low to hgh on anti-diagonal anti or anti-1 (depending on whether the diagonal is odd or even)), (c) if lbord >= 0, then the alignment is always above diagonal low-lbord, and (d) if hbord >= 0, then the alignment is always below diagonal hgh+hbord. The path record of 'align' has its 'trace' filled from the point of view of an overlap between the aread and the bread. In addition a Path record from the point of view of the bread versus the aread is returned by the function, with this Path's 'trace' filled in appropriately. The space for the returned path and the two 'trace's are in the working storage supplied by the Work_Data packet and this space is reused with each call, so if one wants to retain the bread-path and the two trace point sequences, then they must be copied to user-allocated storage before calling the routine again. NULL is returned in the event of an error. Find_Extension is a variant of Local_Alignment that simply finds a local alignment that either ends (if prefix is non-zero) or begins (if prefix is zero) at the point (anti+diag)/2,(anti-diag)/2). All other parameters are as before. It returns a non-zero value only when INTERACTIVE is on and it cannot allocate the memory it needs. Only the path and trace with respect to the aread is returned. This routine is experimental and may not persist in later versions of the code. */ Path *Local_Alignment(Alignment *align, Work_Data *work, Align_Spec *spec, int low, int hgh, int anti, int lbord, int hbord); int Find_Extension(Alignment *align, Work_Data *work, Align_Spec *spec, // experimental !! int diag, int anti, int lbord, int hbord, int prefix); /* Given a legitimate Alignment object, Compute_Trace_X computes an exact trace for the alignment. If 'path.trace' is non-NULL, then it is assumed to be a sequence of pass-through points and diff levels computed by Local_Alignment. 
In either case 'path.trace' is set to point at an integer array within the storage of the Work_Data packet encoding an exact optimal trace from the start to end points. If the trace is needed beyond the next call to a routine that sets it, then it should be copied to an array allocated and managed by the caller. Compute_Trace_ALL does not require a sequence of pass-through points, as it computes the best alignment between (path->abpos,path->bbpos) and (path->aepos,path->bepos) in the edit graph between the sequences. Compute_Trace_PTS computes a trace by computing the trace between successive pass-through points. It is much, much faster than Compute_Trace_ALL but at the tradeoff of not necessarily being optimal as pass-through points are not all perfect. Compute_Trace_MID computes a trace by computing the trace between the mid-points of alignments between two adjacent pairs of pass-through points. It is generally twice as slow as Compute_Trace_PTS, but it produces nearer optimal alignments. All these routines return 1 if an error occurred and 0 otherwise. */ #define LOWERMOST -1 // Possible modes for "mode" parameter below #define GREEDIEST 0 #define UPPERMOST 1 int Compute_Trace_ALL(Alignment *align, Work_Data *work); int Compute_Trace_PTS(Alignment *align, Work_Data *work, int trace_spacing, int mode); int Compute_Trace_MID(Alignment *align, Work_Data *work, int trace_spacing, int mode); /* Compute_Trace_IRR (IRR for IRRegular) computes a trace for the given alignment where it assumes the spacing between trace points between both the A and B read varies, and further assumes that the A-spacing is given in the short integers normally occupied by the differences in the alignment between the trace points. This routine is experimental and may not persist in later versions of the code. */ int Compute_Trace_IRR(Alignment *align, Work_Data *work, int mode); // experimental !!
/* Alignment_Cartoon prints an ASCII representation of the overlap relationship between the two reads of 'align' to the given 'file' indented by 'indent' spaces. Coord controls the display width of numbers; it must be not less than the width of any number to be displayed. If the alignment trace is an exact trace, then one can ask Print_Alignment to print an ASCII representation of the alignment 'align' to the file 'file'. Indent the display by "indent" spaces and put "width" columns per line in the display. Show "border" characters of sequence on each side of the aligned region. If upper is non-zero then display bases in upper case. If coord is greater than 0, then the positions of the first character in A and B in the given row are displayed with a field width given by coord's value. Print_Reference is like Print_Alignment but rather than printing exactly "width" columns per segment, it prints "block" characters of the A sequence in each segment. This results in segments of different lengths, but is convenient when looking at two alignments involving A as segments are guaranteed to cover the same interval of A in a segment. Both Print routines return 1 if an error occurred (not enough memory), and 0 otherwise. Flip_Alignment modifies align so the roles of A and B are reversed. If full is off then the trace is ignored, otherwise the trace must be a full alignment trace and this trace is also appropriately inverted.
*/ void Alignment_Cartoon(FILE *file, Alignment *align, int indent, int coord); int Print_Alignment(FILE *file, Alignment *align, Work_Data *work, int indent, int width, int border, int upper, int coord); int Print_Reference(FILE *file, Alignment *align, Work_Data *work, int indent, int block, int border, int upper, int coord); void Flip_Alignment(Alignment *align, int full); /*** OVERLAP ABSTRACTION: Externally, between modules an Alignment is modeled by an "Overlap" record, which (a) replaces the pointers to the two sequences with their ID's in the HITS data bases, (b) does not contain the length of the 2 sequences (must fetch from DB), and (c) contains its path as a subrecord rather than as a pointer (indeed, typically the corresponding Alignment record points at the Overlap's path sub-record). The trace pointer is always to a sequence of trace points and can be either compressed (uint8) or uncompressed (uint16). One can read and write binary records of an "Overlap". ***/ typedef struct { Path path; /* Path: begin- and end-point of alignment + diffs */ uint32 flags; /* Pipeline status and complementation flags */ int aread; /* Id # of A sequence */ int bread; /* Id # of B sequence */ } Overlap; /* Read_Overlap reads the next Overlap record from stream 'input', not including the trace (if any), and without modifying 'ovl's trace pointer. Read_Trace reads the ensuing trace into the memory pointed at by the trace field of 'ovl'. It is assumed to be big enough to accommodate the trace where each value take 'tbytes' bytes (1 if uint8 or 2 if uint16). Write_Overlap write 'ovl' to stream 'output' followed by its trace vector (if any) that occupies 'tbytes' bytes per value. Print_Overlap prints an ASCII version of the contents of 'ovl' to stream 'output' where the trace occupes 'tbytes' per value and the print out is indented from the left margin by 'indent' spaces. 
Compress_TraceTo8 converts a trace of 16-bit values to 8-bit values in place, and Decompress_TraceTo16 does the reverse conversion. Check_Trace_Points checks that the number of trace points is correct and that the sum of the b-read displacements equals the b-read alignment interval, assuming the trace spacing is 'tspace'. It reports an error message if there is a problem and 'verbose' is non-zero. The 'ovl' came from the file named 'fname'. */ int Read_Overlap(FILE *input, Overlap *ovl); int Read_Trace(FILE *innput, Overlap *ovl, int tbytes); void Write_Overlap(FILE *output, Overlap *ovl, int tbytes); void Print_Overlap(FILE *output, Overlap *ovl, int tbytes, int indent); void Compress_TraceTo8(Overlap *ovl); void Decompress_TraceTo16(Overlap *ovl); int Check_Trace_Points(Overlap *ovl, int tspace, int verbose, char *fname); #endif // _A_MODULE pbdagcon-0.3+20161121+ds/DALIGNER/README_PB.md0000644000175000017500000000045013026414545016050 0ustar afifafifWe have removed DB and QV files, since they are identical to those in DAZZ_DB. Now, this package will not build unless the DAZZ_DB directory is supplied. CPPFLAGS+= -Idazzdb-build-dir LDFLAGS+= -Ldazzdb-build-dir For now, we use a relative path, `../DAZZ_DB`, assuming we are both submodules. pbdagcon-0.3+20161121+ds/DALIGNER/LAsplit.c0000644000175000017500000001303413026414545015726 0ustar afifafif/******************************************************************************************* * * Split an OVL file arriving from the standard input into 'parts' equal sized .las-files * .1.las, .2.las ...
or according to a current partitioning of * * Author: Gene Myers * Date : June 2014 * *******************************************************************************************/ #include #include #include #include #include #include #include #include "DB.h" #include "align.h" static char *Usage = " ( | ) < .las"; #define MEMORY 1000 // How many megabytes for output buffer int main(int argc, char *argv[]) { char *iblock, *oblock; FILE *output, *dbvis; int64 novl, bsize, ovlsize, ptrsize; int parts, tspace, tbytes; int olast, blast; char *root, *pwd; Prog_Name = Strdup("LAsplit",""); if (argc != 3) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } { char *eptr; int nfiles, cutoff, all; int64 size; char buffer[2*MAX_NAME+100]; parts = strtol(argv[2],&eptr,10); if (*eptr != '\0') { pwd = PathTo(argv[2]); if (strcmp(argv[2]+(strlen(argv[2])-4),".dam") == 0) root = Root(argv[2],".dam"); else root = Root(argv[2],".db"); dbvis = fopen(Catenate(pwd,"/",root,".dam"),"r"); if (dbvis == NULL) { dbvis = fopen(Catenate(pwd,"/",root,".db"),"r"); if (dbvis == NULL) { fprintf(stderr,"%s: Second argument '%s' is not an integer or a DB\n", Prog_Name,argv[2]); exit (1); } } free(pwd); free(root); if (fscanf(dbvis,DB_NFILE,&nfiles) != 1) SYSTEM_ERROR while (nfiles-- > 0) if (fgets(buffer,2*MAX_NAME+100,dbvis) == NULL) SYSTEM_ERROR parts = 0; if (fscanf(dbvis,DB_NBLOCK,&parts) != 1) { fprintf(stderr,"%s: DB %s has not been partitioned\n",Prog_Name,argv[2]); exit (1); } if (fscanf(dbvis,DB_PARAMS,&size,&cutoff,&all) != 3) SYSTEM_ERROR if (fscanf(dbvis,DB_BDATA,&olast,&blast) != 2) SYSTEM_ERROR } else { dbvis = NULL; if (parts <= 0) { fprintf(stderr,"%s: Number of parts is not positive\n",Prog_Name); exit (1); } } } ptrsize = sizeof(void *); ovlsize = sizeof(Overlap) - ptrsize; bsize = MEMORY * 1000000ll; oblock = (char *) Malloc(bsize,"Allocating output block"); iblock = (char *) Malloc(bsize + ptrsize,"Allocating input block"); if (oblock == NULL || iblock == NULL) exit 
(1); iblock += ptrsize; pwd = PathTo(argv[1]); root = Root(argv[1],".las"); if (fread(&novl,sizeof(int64),1,stdin) != 1) SYSTEM_ERROR if (fread(&tspace,sizeof(int),1,stdin) != 1) SYSTEM_ERROR if (tspace <= TRACE_XOVR) tbytes = sizeof(uint8); else tbytes = sizeof(uint16); { int i, j; Overlap *w; int low, hgh, last; int64 tsize, povl; char *iptr, *itop; char *optr, *otop; iptr = iblock; itop = iblock + fread(iblock,1,bsize,stdin); hgh = 0; for (i = 0; i < parts; i++) { output = Fopen(Catenate(pwd,"/",root,Numbered_Suffix(".",i+1,".las")),"w"); if (output == NULL) exit (1); low = hgh; if (dbvis != NULL) { if (fscanf(dbvis,DB_BDATA,&olast,&blast) != 2) SYSTEM_ERROR last = blast-1; hgh = 0; } else { last = 0; hgh = (novl*(i+1))/parts; } povl = 0; fwrite(&povl,sizeof(int64),1,output); fwrite(&tspace,sizeof(int),1,output); optr = oblock; otop = oblock + bsize; for (j = low; j < novl; j++) { if (iptr + ovlsize > itop) { int64 remains = itop-iptr; if (remains > 0) memcpy(iblock,iptr,remains); iptr = iblock; itop = iblock + remains; itop += fread(itop,1,bsize-remains,stdin); } w = (Overlap *) (iptr-ptrsize); if (dbvis == NULL) { if (j >= hgh && w->aread > last) break; last = w->aread; } else { if (w->aread > last) break; } tsize = w->path.tlen*tbytes; if (optr + ovlsize + tsize > otop) { fwrite(oblock,1,optr-oblock,output); optr = oblock; } memcpy(optr,iptr,ovlsize); optr += ovlsize; iptr += ovlsize; if (iptr + tsize > itop) { int64 remains = itop-iptr; if (remains > 0) memcpy(iblock,iptr,remains); iptr = iblock; itop = iblock + remains; itop += fread(itop,1,bsize-remains,stdin); } memcpy(optr,iptr,tsize); optr += tsize; iptr += tsize; } hgh = j; if (optr > oblock) fwrite(oblock,1,optr-oblock,output); rewind(output); povl = hgh-low; fwrite(&povl,sizeof(int64),1,output); fclose(output); } } free(pwd); free(root); free(iblock-ptrsize); free(oblock); exit (0); } pbdagcon-0.3+20161121+ds/DALIGNER/daligner.c0000644000175000017500000004646213026414545016156 0ustar 
afifafif/*********************************************************************************************\ * * Find all local alignments between long, noisy DNA reads: * Compare sequences in 'subject' database against those in the list of 'target' databases * searching for local alignments of 1000bp or more (defined constant MIN_OVERLAP in * filter.c). Subject is compared in both orientations against each target. An output * stream of 'Overlap' records (see align.h) is written in binary to the standard output, * each encoding a given found local alignment between two of the sequences. The -v * option turns on a verbose reporting mode that gives statistics on each major stage. * * The filter operates by looking for a pair of diagonal bands of width 2^'s' that contain * a collection of exact matching 'k'-mers between the two sequences, such that the total * number of bases covered by 'k'-mer hits is 'h'. k cannot be larger than 32 in the * current implementation. * * Some k-mers are significantly over-represented (e.g. homopolymer runs). These are * suppressed as seed hits, with the parameter 't' -- any k-mer that occurs more than * 't' times in either the subject or target is not counted as a seed hit. If the -t * option is absent then no k-mer is suppressed. Alternatively, the option -M specifies * that 't' is dynamically set to the largest value such that less than -M memory is * used. * * For each subject, target pair, say XXX and YYY, the program outputs a file containing * overlaps of the form XXX.YYY.[C|N]#.las where C implies that the reads in XXX were * complemented and N implies they were not (both comparisons are performed), and # is * the thread that detected and wrote out the collection of overlaps. For example, if * NTHREAD in the program is 4, then 8 files are output for each subject, target pair.
* * Author: Gene Myers * Date : June 1, 2014 * *********************************************************************************************/ #include #include #include #include #include #include #include #include #include #if defined(BSD) #include #endif #include "DB.h" #include "filter.h" static char *Usage[] = { "[-vbAI] [-k] [-w] [-h] [-t] [-M]", " [-e] [-s] [-H]", " [-m]+ ...", }; int VERBOSE; // Globally visible to filter.c int BIASED; int MINOVER; int HGAP_MIN; int SYMMETRIC; int IDENTITY; uint64 MEM_LIMIT; uint64 MEM_PHYSICAL; /* Adapted from code by David Robert Nadeau (http://NadeauSoftware.com) licensed under * "Creative Commons Attribution 3.0 Unported License" * (http://creativecommons.org/licenses/by/3.0/deed.en_US) * * I removed Windows options, reformated, and return int64 instead of size_t */ static int64 getMemorySize( ) { #if defined(CTL_HW) && (defined(HW_MEMSIZE) || defined(HW_PHYSMEM64)) // OSX, NetBSD, OpenBSD int mib[2]; size_t size = 0; size_t len = sizeof( size ); mib[0] = CTL_HW; #if defined(HW_MEMSIZE) mib[1] = HW_MEMSIZE; // OSX #elif defined(HW_PHYSMEM64) mib[1] = HW_PHYSMEM64; // NetBSD, OpenBSD #endif if (sysctl(mib,2,&size,&len,NULL,0) == 0) return ((size_t) size); return (0); #elif defined(_SC_AIX_REALMEM) // AIX return ((size_t) sysconf( _SC_AIX_REALMEM ) * ((size_t) 1024L)); #elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) // FreeBSD, Linux, OpenBSD, & Solaris size_t size = 0; size = (size_t) sysconf(_SC_PHYS_PAGES); return (size * ((size_t) sysconf(_SC_PAGESIZE))); #elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGE_SIZE) // ? Legacy ? 
size_t size = 0; size = (size_t) sysconf(_SC_PHYS_PAGES); return (size * ((size_t) sysconf(_SC_PAGE_SIZE))); #elif defined(CTL_HW) && (defined(HW_PHYSMEM) || defined(HW_REALMEM)) // DragonFly BSD, FreeBSD, NetBSD, OpenBSD, and OSX int mib[2]; unsigned int size = 0; size_t len = sizeof( size ); mib[0] = CTL_HW; #if defined(HW_REALMEM) mib[1] = HW_REALMEM; // FreeBSD #elif defined(HW_PYSMEM) mib[1] = HW_PHYSMEM; // Others #endif if (sysctl(mib,2,&size,&len,NULL,0) == 0) return (size_t)size; return (0); #else return (0); #endif } typedef struct { int *ano; int *end; int idx; int out; } Event; static void reheap(int s, Event **heap, int hsize) { int c, l, r; Event *hs, *hr, *hl; c = s; hs = heap[s]; while ((l = 2*c) <= hsize) { r = l+1; hl = heap[l]; hr = heap[r]; if (hr->idx > hl->idx) { if (hs->idx > hl->idx) { heap[c] = hl; c = l; } else break; } else { if (hs->idx > hr->idx) { heap[c] = hr; c = r; } else break; } } if (c != s) heap[c] = hs; } static int64 merge_size(HITS_DB *block, int mtop) { Event ev[mtop+1]; Event *heap[mtop+2]; int r, mhalf; int64 nsize; { HITS_TRACK *track; int i; track = block->tracks; for (i = 0; i < mtop; i++) { ev[i].ano = ((int *) (track->data)) + ((int64 *) (track->anno))[0]; ev[i].out = 1; heap[i+1] = ev+i; track = track->next; } ev[mtop].idx = INT32_MAX; heap[mtop+1] = ev+mtop; } mhalf = mtop/2; nsize = 0; for (r = 0; r < block->nreads; r++) { int i, level, hsize; HITS_TRACK *track; track = block->tracks; for (i = 0; i < mtop; i++) { ev[i].end = ((int *) (track->data)) + ((int64 *) (track->anno))[r+1]; if (ev[i].ano < ev[i].end) ev[i].idx = *(ev[i].ano); else ev[i].idx = INT32_MAX; track = track->next; } hsize = mtop; for (i = mhalf; i > 1; i--) reheap(i,heap,hsize); level = 0; while (1) { Event *p; reheap(1,heap,hsize); p = heap[1]; if (p->idx == INT32_MAX) break; p->out = 1-p->out; if (p->out) { level -= 1; if (level == 0) nsize += 1; } else { if (level == 0) nsize += 1; level += 1; } p->ano += 1; if (p->ano >= p->end) p->idx = 
//  Dry run of merge_tracks: return how many values the merge of the first 'mtop' tracks
//    of 'block' will emit, so that the caller can size the merged data array.
//    For every read the tracks' values are consumed in increasing order through a min-heap
//    of Event cursors; a value is counted only when the nesting 'level' leaves or returns
//    to 0.  NOTE(review): this looks like an interval union over alternating begin/end
//    coordinates -- confirm against the mask track format.

static int64 merge_size(HITS_DB *block, int mtop)
{ Event       ev[mtop+1];
  Event      *heap[mtop+2];
  int         r, mhalf;
  int64       nsize;

  { HITS_TRACK *track;
    int         i;

    //  One cursor per track, starting at the track's first value

    track = block->tracks;
    for (i = 0; i < mtop; i++)
      { ev[i].ano = ((int *) (track->data)) + ((int64 *) (track->anno))[0];
        ev[i].out = 1;
        heap[i+1] = ev+i;
        track = track->next;
      }
    ev[mtop].idx = INT32_MAX;      //  sentinel read by reheap at heap[hsize+1]
    heap[mtop+1] = ev+mtop;
  }

  mhalf = mtop/2;

  nsize = 0;
  for (r = 0; r < block->nreads; r++)
    { int         i, level, hsize;
      HITS_TRACK *track;

      //  Set each cursor's limit to the end of read r's values; ano carries over from
      //    the previous read

      track = block->tracks;
      for (i = 0; i < mtop; i++)
        { ev[i].end = ((int *) (track->data)) + ((int64 *) (track->anno))[r+1];
          if (ev[i].ano < ev[i].end)
            ev[i].idx = *(ev[i].ano);
          else
            ev[i].idx = INT32_MAX;
          track = track->next;
        }
      hsize = mtop;

      //  Heapify all but the root; the root is fixed by the reheap(1,...) below

      for (i = mhalf; i > 1; i--)
        reheap(i,heap,hsize);

      level = 0;
      while (1)
        { Event *p;

          reheap(1,heap,hsize);

          p = heap[1];
          if (p->idx == INT32_MAX) break;     //  every cursor is exhausted for this read

          p->out = 1-p->out;
          if (p->out)
            { level -= 1;
              if (level == 0)
                nsize += 1;
            }
          else
            { if (level == 0)
                nsize += 1;
              level += 1;
            }
          p->ano += 1;
          if (p->ano >= p->end)
            p->idx = INT32_MAX;
          else
            p->idx = *(p->ano);
        }
    }

  return (nsize);
}

//  Build and return a new track named "merge" holding the merge of the first 'mtop'
//    tracks of 'block'.  'nsize' must be the value merge_size returned for the same
//    arguments; it sizes the data array.  The traversal is the same as in merge_size,
//    but values are stored instead of counted, and anno[r] records the index in data
//    at which read r's values start.  Exits the program on allocation failure.

static HITS_TRACK *merge_tracks(HITS_DB *block, int mtop, int64 nsize)
{ HITS_TRACK *ntrack;
  Event       ev[mtop+1];
  Event      *heap[mtop+2];
  int         r, mhalf;
  int64      *anno;
  int        *data;

  ntrack = (HITS_TRACK *) Malloc(sizeof(HITS_TRACK),"Allocating merged track");
  if (ntrack == NULL)
    exit (1);
  ntrack->name = Strdup("merge","Allocating merged track");
  ntrack->anno = anno = (int64 *) Malloc(sizeof(int64)*(block->nreads+1),"Allocating merged track");
  ntrack->data = data = (int *) Malloc(sizeof(int)*nsize,"Allocating merged track");
  ntrack->size = sizeof(int);
  ntrack->next = NULL;
  if (anno == NULL || data == NULL || ntrack->name == NULL)
    exit (1);

  { HITS_TRACK *track;
    int         i;

    track = block->tracks;
    for (i = 0; i < mtop; i++)
      { ev[i].ano = ((int *) (track->data)) + ((int64 *) (track->anno))[0];
        ev[i].out = 1;
        heap[i+1] = ev+i;
        track = track->next;
      }
    ev[mtop].idx = INT32_MAX;      //  sentinel read by reheap at heap[hsize+1]
    heap[mtop+1] = ev+mtop;
  }

  mhalf = mtop/2;

  nsize = 0;                       //  reused as the running output index
  for (r = 0; r < block->nreads; r++)
    { int         i, level, hsize;
      HITS_TRACK *track;

      anno[r] = nsize;

      track = block->tracks;
      for (i = 0; i < mtop; i++)
        { ev[i].end = ((int *) (track->data)) + ((int64 *) (track->anno))[r+1];
          if (ev[i].ano < ev[i].end)
            ev[i].idx = *(ev[i].ano);
          else
            ev[i].idx = INT32_MAX;
          track = track->next;
        }
      hsize = mtop;

      for (i = mhalf; i > 1; i--)
        reheap(i,heap,hsize);

      level = 0;
      while (1)
        { Event *p;

          reheap(1,heap,hsize);

          p = heap[1];
          if (p->idx == INT32_MAX) break;

          p->out = 1-p->out;
          if (p->out)
            { level -= 1;
              if (level == 0)
                data[nsize++] = p->idx;
            }
          else
            { if (level == 0)
                data[nsize++] = p->idx;
              level += 1;
            }
          p->ano += 1;
          if (p->ano >= p->end)
            p->idx = INT32_MAX;
          else
            p->idx = *(p->ano);
        }
    }
  anno[r] = nsize;                 //  r == block->nreads here: closes the last read

  return (ntrack);
}

//  Open database 'name' into 'block' and prepare it for k-mer indexing; returns the value
//    of Open_DB (main treats non-zero as a ".dam" database).  Exits on any failure.
//
//    mask[0..mtop-1] are the track names given with -m and mstat[] their running status
//    over all blocks read so far: 0 once the track was found as a mask track, -3 if a
//    track of that name was found but is not a mask track, the negative Check_Track
//    status if it is still at its initial -2, and -2 if nothing has been recorded.
//
//    Found mask tracks are loaded, the DB is trimmed, their anno offsets are divided by
//    sizeof(int) so that they index the int data array directly, and if there is more
//    than one they are replaced by a single merged track.

static int read_DB(HITS_DB *block, char *name, char **mask, int *mstat, int mtop, int kmer)
{ int i, isdam, status, kind, stop;

  isdam = Open_DB(name,block);
  if (isdam < 0)
    exit (1);

  for (i = 0; i < mtop; i++)
    { status = Check_Track(block,mask[i],&kind);
      if (status >= 0)
        if (kind == MASK_TRACK)
          mstat[i] = 0;
        else
          { if (mstat[i] != 0)
              mstat[i] = -3;
          }
      else
        { if (mstat[i] == -2)
            mstat[i] = status;
        }
      if (status == 0 && kind == MASK_TRACK)
        Load_Track(block,mask[i]);
    }

  Trim_DB(block);

  stop = 0;                        //  number of usable mask tracks
  for (i = 0; i < mtop; i++)
    { HITS_TRACK *track;
      int64      *anno;
      int         j;

      status = Check_Track(block,mask[i],&kind);
      if (status < 0 || kind != MASK_TRACK)
        continue;
      stop += 1;
      track = Load_Track(block,mask[i]);

      anno = (int64 *) (track->anno);
      for (j = 0; j <= block->nreads; j++)
        anno[j] /= sizeof(int);
    }

  if (stop > 1)
    { int64       nsize;
      HITS_TRACK *track;

      nsize = merge_size(block,stop);
      track = merge_tracks(block,stop,nsize);

      while (block->tracks != NULL)
        Close_Track(block,block->tracks->name);

      block->tracks = track;
    }

  //  Every read must be at least kmer long for the tuple threads to work

  if (block->cutoff < kmer)
    { for (i = 0; i < block->nreads; i++)
        if (block->reads[i].rlen < kmer)
          { fprintf(stderr,"%s: Block %s contains reads < %dbp long !  Run DBsplit.\n",
                           Prog_Name,name,kmer);
            exit (1);
          }
    }

  Read_All_Sequences(block,0);

  return (isdam);
}
//  Reverse-complement in place the 'len' bases at s, where each base is a code c in
//    0..3 whose complement is 3-c.  Does nothing when len <= 0.

static void complement(char *s, int len)
{ int lo, hi;

  //  Swap the two ends inwards, complementing both bases as they cross

  for (lo = 0, hi = len-1; lo < hi; lo++, hi--)
    { char head = s[lo];

      s[lo] = (char) (3-s[hi]);
      s[hi] = (char) (3-head);
    }

  //  Odd length: the middle base stays in place and is only complemented

  if (lo == hi)
    s[lo] = (char) (3-s[lo]);
}
//  Return a block whose reads, base frequencies and interval tracks are those of 'block'
//    reverse-complemented.
//
//    inplace != 0: 'block' itself is modified and returned.
//    inplace == 0: the bases and tracks are copied first; the result is a shallow copy of
//      *block (the "reads" vector is shared with 'block') with fresh bases and tracks.
//
//    NOTE(review): 'cblock' is static and is re-pointed at 'block' by an in-place call, so
//      a later copying call writes its result into whichever block was last complemented
//      in place rather than into _cblock -- confirm callers are content with that.
//
//    Exits the program on allocation failure.

static HITS_DB *complement_DB(HITS_DB *block, int inplace)
{ static HITS_DB _cblock, *cblock = &_cblock;
  int            nreads;
  HITS_READ     *reads;
  char          *seq;

  nreads = block->nreads;
  reads  = block->reads;
  if (inplace)
    { seq = (char *) block->bases;
      cblock = block;
    }
  else
    { seq = (char *) Malloc(block->reads[nreads].boff+1,"Allocating dazzler sequence block");
      if (seq == NULL)
        exit (1);
      *seq++ = 4;                  //  the byte before the first base holds the value 4
      memcpy(seq,block->bases,block->reads[nreads].boff);
      *cblock = *block;
      cblock->bases  = (void *) seq;
      cblock->tracks = NULL;
    }

  { int   i;
    float x;

    //  Swap the frequencies of complementary bases (0 <-> 3, 1 <-> 2)

    x = cblock->freq[0];
    cblock->freq[0] = cblock->freq[3];
    cblock->freq[3] = x;

    x = cblock->freq[1];
    cblock->freq[1] = cblock->freq[2];
    cblock->freq[2] = x;

    for (i = 0; i < nreads; i++)
      complement(seq+reads[i].boff,reads[i].rlen);
  }

  { HITS_TRACK *src, *trg;
    int        *data, *tata;
    int         i, x, rlen;
    int64      *tano, *anno;
    int64       j, k;

    for (src = block->tracks; src != NULL; src = src->next)
      { tano = (int64 *) src->anno;
        tata = (int   *) src->data;

        if (inplace)
          { data = tata;
            anno = tano;
            trg  = src;
          }
        else
          { data = (int *) Malloc(sizeof(int)*tano[nreads],
                                  "Allocating dazzler interval track data");
            anno = (int64 *) Malloc(sizeof(int64)*(nreads+1),
                                    "Allocating dazzler interval track index");
            trg  = (HITS_TRACK *) Malloc(sizeof(HITS_TRACK),
                                         "Allocating dazzler interval track header");
            if (data == NULL || trg == NULL || anno == NULL)
              exit (1);

            trg->name = Strdup(src->name,"Copying track name");
            if (trg->name == NULL)
              exit (1);

            trg->size = 4;
            trg->anno = (void *) anno;
            trg->data = (void *) data;
            trg->next = cblock->tracks;     //  pushed at the front: track order is reversed
            cblock->tracks = trg;
          }

        //  For each read, reverse its list of coordinates and map each p to rlen-p.
        //    Reading tata[j] into x before writing keeps this correct when data == tata.

        for (i = 0; i < nreads; i++)
          { rlen = reads[i].rlen;
            anno[i] = tano[i];
            j = tano[i+1]-1;
            k = tano[i];
            while (k < j)
              { x = tata[j];
                data[j--] = rlen - tata[k];
                data[k++] = rlen - x;
              }
            if (k == j)
              data[k] = rlen - tata[k];
          }
        anno[nreads] = tano[nreads];
      }
  }

  return (cblock);
}
argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("vbAI") break; case 'k': ARG_POSITIVE(KMER_LEN,"K-mer length") if (KMER_LEN > 32) { fprintf(stderr,"%s: K-mer length must be 32 or less\n",Prog_Name); exit (1); } break; case 'w': ARG_POSITIVE(BIN_SHIFT,"Log of bin width") break; case 'h': ARG_POSITIVE(HIT_MIN,"Hit threshold (in bp.s)") break; case 't': ARG_POSITIVE(MAX_REPS,"Tuple supression frequency") break; case 'H': ARG_POSITIVE(HGAP_MIN,"HGAP threshold (in bp.s)") break; case 'e': ARG_REAL(AVE_ERROR) if (AVE_ERROR < .7 || AVE_ERROR >= 1.) { fprintf(stderr,"%s: Average correlation must be in [.7,1.) (%g)\n", Prog_Name,AVE_ERROR); exit (1); } break; case 'l': ARG_POSITIVE(MINOVER,"Minimum alignment length") break; case 's': ARG_POSITIVE(SPACING,"Trace spacing") break; case 'M': { int limit; ARG_NON_NEGATIVE(limit,"Memory allocation (in Gb)") MEM_LIMIT = limit * 0x40000000ll; break; } case 'm': if (MTOP >= MMAX) { MMAX = 1.2*MTOP + 10; MASK = (char **) Realloc(MASK,MMAX*sizeof(char *),"Reallocating mask track array"); MSTAT = (int *) Realloc(MSTAT,MMAX*sizeof(int),"Reallocating mask status array"); if (MASK == NULL || MSTAT == NULL) exit (1); } MASK[MTOP++] = argv[i]+2; break; } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; // Globally declared in filter.h BIASED = flags['b']; // Globally declared in filter.h SYMMETRIC = 1-flags['A']; IDENTITY = flags['I']; if (argc <= 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[2]); exit (1); } for (j = 0; j < MTOP; j++) MSTAT[j] = -2; } MINOVER *= 2; if (Set_Filter_Params(KMER_LEN,BIN_SHIFT,MAX_REPS,HIT_MIN)) { fprintf(stderr,"Illegal combination of filter parameters\n"); exit (1); } /* Read in the reads in A */ afile = argv[1]; isdam = read_DB(ablock,afile,MASK,MSTAT,MTOP,KMER_LEN); if (isdam) aroot = Root(afile,".dam"); else aroot = Root(afile,".db"); 
asettings = New_Align_Spec( AVE_ERROR, SPACING, ablock->freq); /* Compare against reads in B in both orientations */ { int i, j; aindex = NULL; broot = NULL; for (i = 2; i < argc; i++) { bfile = argv[i]; if (strcmp(afile,bfile) != 0) { isdam = read_DB(bblock,bfile,MASK,MSTAT,MTOP,KMER_LEN); if (isdam) broot = Root(bfile,".dam"); else broot = Root(bfile,".db"); } if (i == 2) { for (j = 0; j < MTOP; j++) { if (MSTAT[j] == -2) printf("%s: Warning: -m%s option given but no track found.\n",Prog_Name,MASK[i]); else if (MSTAT[j] == -1) printf("%s: Warning: %s track not sync'd with relevant db.\n",Prog_Name,MASK[i]); else if (MSTAT[j] == -3) printf("%s: Warning: %s track is not a mask track.\n",Prog_Name,MASK[i]); } if (VERBOSE) printf("\nBuilding index for %s\n",aroot); aindex = Sort_Kmers(ablock,&alen); } if (strcmp(afile,bfile) != 0) { if (VERBOSE) printf("\nBuilding index for %s\n",broot); bindex = Sort_Kmers(bblock,&blen); Match_Filter(aroot,ablock,broot,bblock,aindex,alen,bindex,blen,0,asettings); bblock = complement_DB(bblock,1); if (VERBOSE) printf("\nBuilding index for c(%s)\n",broot); bindex = Sort_Kmers(bblock,&blen); Match_Filter(aroot,ablock,broot,bblock,aindex,alen,bindex,blen,1,asettings); free(broot); } else { Match_Filter(aroot,ablock,aroot,ablock,aindex,alen,aindex,alen,0,asettings); bblock = complement_DB(ablock,0); if (VERBOSE) printf("\nBuilding index for c(%s)\n",aroot); bindex = Sort_Kmers(bblock,&blen); Match_Filter(aroot,ablock,aroot,bblock,aindex,alen,bindex,blen,1,asettings); bblock->reads = NULL; // ablock & bblock share "reads" vector, don't let Close_DB // free it ! 
} Close_DB(bblock); } } exit (0); } pbdagcon-0.3+20161121+ds/DALIGNER/filter.c0000644000175000017500000017611113026414545015651 0ustar afifafif/* vim: set et ts=2 sts=2 sw=2 : */ /******************************************************************************************* * * Fast local alignment filter for long, noisy reads based on "dumbing down" of my RECOMB 2005 * filter with Jens Stoye, and a "smarting up" of the k-mer matching by turning it into * a threaded sort and merge paradigm using a super cache coherent radix sort. Local * alignment is accomplised with dynamically-banded O(nd) algorithm that terminates when * it fails to find a e-matching patch for a significant distance, and polishes the match * to the last e-prefix-positive 32-mer. * * Author : Gene Myers * First : June 2013 * Current: June 1, 2014 * ********************************************************************************************/ // A complete threaded code for the filter #include #include #include #include #include #include #include #include "DB.h" #include "filter.h" #include "align.h" #define THREAD pthread_t #define MAX_BIAS 2 // In -b mode, don't consider tuples with specificity // <= 4 ^ -(kmer-MAX_BIAS) #define MAXGRAM 10000 // Cap on k-mer count histogram (in count_thread, merge_thread) #define PANEL_SIZE 50000 // Size to break up very long A-reads #define PANEL_OVERLAP 10000 // Overlap of A-panels #define MATCH_CHUNK 100 // Max expected number of hits between two reads #define TRACE_CHUNK 20000 // Max expected trace points in hits between two reads #undef TEST_LSORT #undef TEST_KSORT #undef TEST_PAIRS #undef TEST_CSORT #define HOW_MANY 3000 // Print first HOW_MANY items for each of the TEST options above #undef TEST_GATHER #undef TEST_CONTAIN #undef SHOW_OVERLAP // Show the cartoon #undef SHOW_ALIGNMENT // Show the alignment #define ALIGN_WIDTH 80 // Parameters for alignment #define ALIGN_INDENT 20 #define ALIGN_BORDER 10 #ifdef SHOW_OVERLAP #define NOTHREAD #endif #ifdef 
TEST_GATHER #define NOTHREAD #endif #ifdef TEST_CONTAIN #define NOTHREAD #endif typedef struct { uint64 p1; // The lower half uint64 p2; } Double; #if __ORDER_LITTLE_ENDIAN__ == __BYTE_ORDER__ typedef struct { uint64 code; int rpos; int read; } KmerPos; typedef struct { int diag; int apos; int aread; int bread; } SeedPair; #else typedef struct { uint64 code; int read; int rpos; } KmerPos; typedef struct { int apos; int diag; int bread; int aread; } SeedPair; #endif /******************************************************************************************* * * PARAMETER SETUP * ********************************************************************************************/ static int Kmer; static int Hitmin; static int Binshift; static int Suppress; static int Kshift; // 2*Kmer static uint64 Kmask; // 4^Kmer-1 static int TooFrequent; // (Suppress != 0) ? Suppress : INT32_MAX int Set_Filter_Params(int kmer, int binshift, int suppress, int hitmin) { if (kmer <= 1) return (1); Kmer = kmer; Binshift = binshift; Suppress = suppress; Hitmin = hitmin; Kshift = 2*Kmer; if (Kmer == 32) Kmask = 0xffffffffffffffffllu; else Kmask = (0x1llu << Kshift) - 1; if (Suppress == 0) TooFrequent = INT32_MAX; else TooFrequent = Suppress; return (0); } /******************************************************************************************* * * LEXICOGRAPHIC SORT * ********************************************************************************************/ #define BMER 4 #define BSHIFT 8 // = 2*BMER #define BPOWR 256 // = 2^BSHIFT #define BMASK 0xffllu // = BPOWR-1 static uint64 QMASK; // = BMASK << NSHIFT static int LEX_shift; static int64 LEX_zsize; static int LEX_last; static int LEX_next; static Double *LEX_src; static Double *LEX_trg; typedef struct { int64 beg; int64 end; int64 tptr[BPOWR]; int64 sptr[NTHREADS*BPOWR]; } Lex_Arg; #define VERY_VERBOSE 0 static void *lex_thread(void *arg) { Lex_Arg *data = (Lex_Arg *) arg; int64 *sptr = data->sptr; int64 *tptr = data->tptr; int shift 
//  Worker for one pass of lex_sort's byte-wise radix sort.
//    Moves the records LEX_src[data->beg .. data->end-1] into LEX_trg: each record goes to
//    the slot given by the cursor tptr[d], where d is the byte of the 128-bit key (p1 is
//    the low 64 bits, p2 the high 64) that starts at bit LEX_shift, and the cursor is
//    then advanced.
//    Unless this is the last pass (LEX_last), it also counts in sptr, for the byte at bit
//    LEX_next, how many records landed in each zsize-sized segment of LEX_trg: the
//    counter index is (next byte << NSHIFT) + (destination slot / zsize).  lex_sort turns
//    these counts into the cursors of the next pass.
//    The branches only differ in which word holds the current and the next byte.

static void *lex_thread(void *arg)
{ Lex_Arg    *data  = (Lex_Arg *) arg;
  int64      *sptr  = data->sptr;
  int64      *tptr  = data->tptr;
  int         shift = LEX_shift;   //  Must be a multiple of 8 in [0,120]
  int        qshift = (LEX_next - LEX_shift) - NSHIFT;
  int64       zsize = LEX_zsize;
  Double     *src   = LEX_src;
  Double     *trg   = LEX_trg;
  int64       i, n, x;
  uint64      c, b;

  n = data->end;
  if (VERY_VERBOSE)
    { printf("\n ----");
      printf("\n shift=%d, LEX_last=%d, n=%lld", shift, LEX_last, n);
      fflush(stdout);
    }
  if (shift >= 64)
    { //  Current and next byte are both in the high word p2

      shift -= 64;
      if (LEX_last)
        for (i = data->beg; i < n; i++)
          { c = src[i].p2;
            b = (c >> shift);
            x = tptr[b&BMASK]++;
            trg[x] = src[i];
          }
      else
        for (i = data->beg; i < n; i++)
          { c = src[i].p2;
            b = (c >> shift);
            x = tptr[b&BMASK]++;
            trg[x] = src[i];
            sptr[((b >> qshift) & QMASK) + x/zsize] += 1;
          }
    }

  else if ( ! LEX_last && LEX_next >= 64)   //  && LEX_shift < 64

    { //  Current byte is in p1, next byte is in p2

      qshift = (LEX_next - 64) - NSHIFT;
      if (qshift < 0)
        for (i = data->beg; i < n; i++)
          { c = src[i].p1;
            b = (c >> shift);
            x = tptr[b&BMASK]++;
            trg[x] = src[i];
            sptr[((src[i].p2 << NSHIFT) & QMASK) + x/zsize] += 1;
          }
      else
        for (i = data->beg; i < n; i++)
          { c = src[i].p1;
            b = (c >> shift);
            x = tptr[b&BMASK]++;
            trg[x] = src[i];
            sptr[((src[i].p2 >> qshift) & QMASK) + x/zsize] += 1;
          }
    }

  else // LEX_last || LEX_next < 64
    if (LEX_last)
      if (shift == 0)
        for (i = data->beg; i < n; i++)
          { c = src[i].p1;
            x = tptr[c&BMASK]++;
            trg[x] = src[i];
          }
      else
        for (i = data->beg; i < n; i++)
          { c = src[i].p1;
            b = (c >> shift);
            x = tptr[b&BMASK]++;
            trg[x] = src[i];
          }
    else
      if (shift == 0)
        for (i = data->beg; i < n; i++)
          { c = src[i].p1;
            x = tptr[c&BMASK]++;
            if (VERY_VERBOSE)
              { printf("\n @=%p+%lld i=%6lld,c&=%3lld,x=%3lld,c=%lld ", (void*)trg, (sizeof(Double)*x), i, (c&BMASK), x, c);
                fflush(stdout);
              }
            trg[x] = src[i];
            sptr[((c >> qshift) & QMASK) + x/zsize] += 1;
          }
      else
        for (i = data->beg; i < n; i++)
          { c = src[i].p1;
            b = (c >> shift);
            x = tptr[b&BMASK]++;
            trg[x] = src[i];
            sptr[((b >> qshift) & QMASK) + x/zsize] += 1;
          }

  if (VERY_VERBOSE)
    { printf("\n Finished @%p n=%lld", (void*)trg, n);
      fflush(stdout);
    }
  return (NULL);
}
//  Threaded least-significant-byte-first radix sort of an array of 16-byte records.
//    bytes[b] != 0 selects byte b of the record (0 = lowest byte of p1) as a sort digit;
//    one pass is made per selected byte, ping-ponging between 'src' and 'trg'.
//    On entry parmx[i].beg/end must give thread i's slice of src and parmx[i].tptr[] the
//    number of records in that slice for each value of the first selected byte; the total
//    length is taken from parmx[NTHREADS-1].end.
//    Returns whichever of the two arrays holds the sorted result.
//    (Assumes NTHREADS == 1 << NSHIFT, as the sptr indexing requires -- TODO confirm in
//    filter.h.)

static Double *lex_sort(int bytes[16], Double *src, Double *trg, Lex_Arg *parmx)
{ THREAD  threads[NTHREADS];

  int64   len, x, y;
  Double *xch;
  int     i, j, k, z;
  int     b, c, fb;

  len       = parmx[NTHREADS-1].end;
  LEX_zsize = (len-1)/NTHREADS + 1;      //  slice size used for all passes after the first
  LEX_src   = src;
  LEX_trg   = trg;
  QMASK     = (BMASK << NSHIFT);

  for (c = 0; c < 16; c++)
    if (bytes[c])
      break;
  fb = c;                                //  first selected byte
  for (b = c; b < 16; b = c)
    { for (c = b+1; c < 16; c++)         //  c = next selected byte, or 16 if none
        if (bytes[c])
          break;
      LEX_last  = (c >= 16);
      LEX_shift = (b << 3);
      LEX_next  = (c << 3);

      if (b == fb)
        { //  First pass: tptr counts were supplied by the caller, just clear sptr

          for (i = 0; i < NTHREADS; i++)
            for (z = 0; z < NTHREADS*BPOWR; z++)
              parmx[i].sptr[z] = 0;
        }
      else
        { //  Later passes: equal-sized slices, and bucket counts for slice i gathered
          //    from the sptr counters every thread z filled in during the previous pass

          x = 0;
          for (i = 0; i < NTHREADS; i++)
            { parmx[i].beg = x;
              x = LEX_zsize*(i+1);
              if (x > len)
                x = len;
              parmx[i].end = x;
              for (j = 0; j < BPOWR; j++)
                parmx[i].tptr[j] = 0;
            }
          parmx[NTHREADS-1].end = len;

          for (j = 0; j < BPOWR; j++)
            { k = (j << NSHIFT);
              for (z = 0; z < NTHREADS; z++)
                for (i = 0; i < NTHREADS; i++)
                  { parmx[i].tptr[j] += parmx[z].sptr[k+i];
                    parmx[z].sptr[k+i] = 0;
                  }
            }
        }

      //  Turn the counts into starting offsets: bucket-major, thread-minor

      x = 0;
      for (j = 0; j < BPOWR; j++)
        for (i = 0; i < NTHREADS; i++)
          { y = parmx[i].tptr[j];
            parmx[i].tptr[j] = x;
            x += y;
          }

      for (i = 0; i < NTHREADS; i++)
        pthread_create(threads+i,NULL,lex_thread,parmx+i);

      for (i = 0; i < NTHREADS; i++)
        pthread_join(threads[i],NULL);

      xch     = LEX_src;
      LEX_src = LEX_trg;
      LEX_trg = xch;

#ifdef TEST_LSORT
      //  NOTE(review): in the check below LEX_src[i].p1 is compared with itself, so only
      //    the p2 term can ever flag an out-of-order pair; [i-1] was probably intended.
      //    Also (1 << n) is an int shift, which overflows for the larger shifts.
      printf("\nLSORT %d\n",LEX_shift);
      if (LEX_shift >= 64)
        { x = (1 << ((LEX_shift-64)+BSHIFT))-1;
          for (i = 0; i < len; i++)
            { printf("%6d: %8llx %8llx %8llx %8llx : %4llx",
                     i,LEX_src[i].p2>>32,(LEX_src[i].p2)&0xffffffffll,LEX_src[i].p1>>32,
                     LEX_src[i].p1&0xffffffffll,LEX_src[i].p2&x);
              if (i > 0 && (LEX_src[i].p1 < LEX_src[i].p1 ||
                             (LEX_src[i].p1 == LEX_src[i].p1 &&
                             (LEX_src[i].p2 & x) < (LEX_src[i-1].p2 & x))))
                printf(" OO");
              printf("\n");
            }
        }
      else
        { x = (1 << (LEX_shift+BSHIFT))-1;
          for (i = 0; i < len; i++)
            { printf("%6d: %8llx %8llx %8llx %8llx : %4llx",
                     i,LEX_src[i].p2>>32,(LEX_src[i].p2)&0xffffffffll,LEX_src[i].p1>>32,
                     LEX_src[i].p1&0xffffffffll,LEX_src[i].p1&x);
              if (i > 0 && (LEX_src[i].p1 & x) < (LEX_src[i-1].p1 & x))
                printf(" OO");
              printf("\n");
            }
        }
#endif
    }

  return (LEX_src);
}
//  Worker that lists every k-mer of its share of the reads of TA_block into TA_list.
//    Thread tnum handles reads [nreads*tnum >> NSHIFT, nreads*(tnum+1) >> NSHIFT) and
//    writes from list index boff - Kmer*i of its first read i.  Each entry holds the
//    k-mer code (2 bits per base, masked by Kmask), the read, and the position of the
//    k-mer's last base.  kptr[] counts entries by the low byte of their code, which is
//    the histogram lex_sort needs for its first pass.
//
//    With a mask track (TA_track != NULL) k-mers are only taken from the stretches of a
//    read lying between the track's intervals, and the unused tail of the thread's part
//    of the list is padded with all-ones entries; the pad count is returned in
//    data->fill and added to kptr[BMASK].  Without a track data->fill is not set.

static void *tuple_thread(void *arg)
{ Tuple_Arg  *data  = (Tuple_Arg *) arg;
  int         tnum  = data->tnum;
  int64      *kptr  = data->kptr;
  KmerPos    *list  = TA_list;
  int         i, m, n, x, p;
  uint64      c;
  char       *s;

  c  = TA_block->nreads;
  i  = (c * tnum) >> NSHIFT;
  n  = TA_block->reads[i].boff;
  s  = ((char *) (TA_block->bases)) + n;
  n -= Kmer*i;

  if (TA_track != NULL)
    { HITS_READ *reads = TA_block->reads;
      int64     *anno1 = ((int64 *) (TA_track->anno)) + 1;
      int       *point = (int *) (TA_track->data);
      int64      a, b, f;
      int        q = 0;

      f = anno1[i-1];              //  anno1 is anno+1, so this is anno[i], valid for i == 0
      for (m = (c * (tnum+1)) >> NSHIFT; i < m; i++)
        { b = f;
          f = anno1[i];
          for (a = b; a <= f; a += 2)
            { //  [p,q) is the stretch from the previous interval's end (or the read
              //    start) to the next interval's start (or the read end)

              if (a == b)
                p = 0;
              else
                p = point[a-1];
              if (a == f)
                q = reads[i].rlen;
              else
                q = point[a];
              if (p+Kmer <= q)
                { c = 0;
                  for (x = 1; x < Kmer; x++)
                    c = (c << 2) | s[p++];
                  while (p < q)
                    { x = s[p];
                      c = ((c << 2) | x) & Kmask;
                      list[n].read = i;
                      list[n].rpos = p++;
                      list[n].code = c;
                      n += 1;
                      kptr[c & BMASK] += 1;
                    }
                }
            }
          s += (q+1);              //  q == rlen here; +1 skips the read's terminator
        }

      m = TA_block->reads[m].boff - Kmer*m;
      kptr[BMASK] += (data->fill = m-n);
      while (n < m)
        { list[n].code = 0xffffffffffffffffllu;
          list[n].read = 0xffffffff;
          list[n].rpos = 0xffffffff;
          n += 1;
        }
    }

  else
    for (m = (c * (tnum+1)) >> NSHIFT; i < m; i++)
      { c = p = 0;
        for (x = 1; x < Kmer; x++)
          c = (c << 2) | s[p++];
        while ((x = s[p]) != 4)    //  a base value of 4 terminates the read
          { c = ((c << 2) | x) & Kmask;
            list[n].read = i;
            list[n].rpos = p++;
            list[n].code = c;
            n += 1;
            kptr[c & BMASK] += 1;
          }
        s += (p+1);
      }

  return (NULL);
}
//  Variant of tuple_thread used in -b (BIASED) mode.  Reads are partitioned among the
//    threads and entries are written to TA_list exactly as in tuple_thread, but the
//    tuples have variable length k <= Kmer: a window ending at p is grown while its
//    weight 'a' (the sum of LogBase[] over its bases) is below LogNorm and k < Kmer, then
//    shrunk from the left while the weight stays at or above LogNorm.  A tuple is emitted
//    only if its weight exceeds LogThresh; its code is left-shifted by NormShift[k]
//    (= 2*(Kmer-k)) and masked with Kmask.
//    Fewer entries are produced than the list has room for, so in both branches the tail
//    of the thread's part of the list is padded with all-ones entries, counted in
//    data->fill and kptr[BMASK].

static void *biased_tuple_thread(void *arg)
{ Tuple_Arg  *data  = (Tuple_Arg *) arg;
  int         tnum  = data->tnum;
  int64      *kptr  = data->kptr;
  KmerPos    *list  = TA_list;
  int         n, i, m;
  int         x, a, k, p;
  uint64      d, c;
  char       *s, *t;

  c  = TA_block->nreads;
  i  = (c * tnum) >> NSHIFT;
  n  = TA_block->reads[i].boff;
  s  = ((char *) (TA_block->bases)) + n;
  n -= Kmer*i;

  if (TA_track != NULL)
    { HITS_READ *reads = TA_block->reads;
      int64     *anno1 = ((int64 *) (TA_track->anno)) + 1;
      int       *point = (int *) (TA_track->data);
      int64      j, b, f;
      int        q = 0;

      f = anno1[i-1];              //  anno1 is anno+1, so this is anno[i], valid for i == 0
      for (m = (c * (tnum+1)) >> NSHIFT; i < m; i++)
        { b = f;
          f = anno1[i];
          t = s+1;                 //  t[p-k] is the first base of the k-window ending at p
          for (j = b; j <= f; j += 2)
            { if (j == b)
                p = 0;
              else
                p = point[j-1];
              if (j == f)
                q = reads[i].rlen;
              else
                q = point[j];
              if (p+Kmer <= q)
                { c = 0;
                  a = 0;
                  k = 1;
                  while (p < q)
                    { x = s[p];
                      a += LogBase[x];
                      c  = ((c << 2) | x);
                      while (a < LogNorm && k < Kmer)
                        { if (++p >= q) break;
                          k += 1;
                          x  = s[p];
                          a += LogBase[x];
                          c  = ((c << 2) | x);
                        }
                      while (1)
                        { int u = a-LogBase[(int) t[p-k]];
                          if (u < LogNorm) break;
                          a  = u;
                          k -= 1;
                        }
                      if (a > LogThresh)
                        { d = ((c << NormShift[k]) & Kmask);
                          list[n].read = i;
                          list[n].rpos = p;
                          list[n].code = d;
                          n += 1;
                          kptr[d & BMASK] += 1;
                        }
                      p += 1;
                      a -= LogBase[(int) s[p-k]];
                    }
                }
            }
          s += (q+1);
        }
    }

  else
    for (m = (c * (tnum+1)) >> NSHIFT; i < m; i++)
      { t = s+1;
        c = 0;
        p = a = 0;
        k = 1;
        while ((x = s[p]) != 4)    //  a base value of 4 terminates the read
          { a += LogBase[x];
            c  = ((c << 2) | x);
            while (a < LogNorm && k < Kmer)
              { if ((x = s[++p]) == 4)
                  goto eoread2;
                k += 1;
                a += LogBase[x];
                c  = ((c << 2) | x);
              }
            while (1)
              { int u = a-LogBase[(int) t[p-k]];
                if (u < LogNorm) break;
                a  = u;
                k -= 1;
              }
            if (a > LogThresh)
              { d = ((c << NormShift[k]) & Kmask);
                list[n].read = i;
                list[n].rpos = p;
                list[n].code = d;
                n += 1;
                kptr[d & BMASK] += 1;
              }
            p += 1;
            a -= LogBase[(int) s[p-k]];
          }
      eoread2:
        s += (p+1);
      }

  m = TA_block->reads[m].boff - Kmer*m;
  kptr[BMASK] += (data->fill = m-n);
  while (n < m)
    { list[n].code = 0xffffffffffffffffllu;
      list[n].read = 0xffffffff;
      list[n].rpos = 0xffffffff;
      n += 1;
    }

  return (NULL);
}
*compsize_thread(void *arg) { Comp_Arg *data = (Comp_Arg *) arg; int end = data->end; KmerPos *src = FR_src; int n, i, c, p; uint64 h, g; i = data->beg; h = src[i].code; n = 0; while (i < end) { p = i++; while ((g = src[i].code) == h) i += 1; if ((c = (i-p)) < TooFrequent) n += c; h = g; } data->kept = n; return (NULL); } static void *compress_thread(void *arg) { Comp_Arg *data = (Comp_Arg *) arg; int end = data->end; KmerPos *src = FR_src; KmerPos *trg = FR_trg; int n, i, p; uint64 h, g; i = data->beg; h = src[i].code; n = data->kept; while (i < end) { p = i++; while ((g = src[i].code) == h) i += 1; if (i-p < TooFrequent) { while (p < i) trg[n++] = src[p++]; } h = g; } return (NULL); } void *Sort_Kmers(HITS_DB *block, int *len) { THREAD threads[NTHREADS]; Tuple_Arg parmt[NTHREADS]; Comp_Arg parmf[NTHREADS]; Lex_Arg parmx[NTHREADS]; int mersort[16]; KmerPos *src, *trg, *rez; int kmers, nreads; int i, j, x, z; uint64 h; for (i = 0; i < 16; i++) mersort[i] = 0; for (i = 0; i < Kshift; i += 8) mersort[i>>3] = 1; if (NormShift == NULL && BIASED) { double scale; NormShift = (int *) Malloc(sizeof(int)*(Kmer+1),"Allocating Sort_Kmers bias shift"); if (NormShift == NULL) exit (1); for (i = 0; i <= Kmer; i++) NormShift[i] = Kshift - 2*i; LogNorm = 10000 * Kmer; LogThresh = 10000 * (Kmer-MAX_BIAS); scale = -10000. 
/ log(4.); for (i = 0; i < 4; i++) LogBase[i] = (int) ceil( scale * log(block->freq[i]) ); } nreads = block->nreads; kmers = block->reads[nreads].boff - Kmer * nreads; if (kmers <= 0) goto no_mers; if (VERBOSE) { printf("\n Kshift=%d", Kshift); printf("\n BSHIFT=%d", BSHIFT); printf("\n TooFrequent=%d", TooFrequent); printf("\n (Kshift-1)/BSHIFT + (TooFrequent < INT32_MAX)=%d", ((Kshift-1)/BSHIFT + (TooFrequent < INT32_MAX))); printf("\n sizeof(KmerPos)=%ld", sizeof(KmerPos)); printf("\n nreads=%d", nreads); printf("\n Kmer=%d", Kmer); printf("\n block->reads[nreads].boff=%lld", (block->reads[nreads].boff)); printf("\n kmers=%d", kmers); printf("\n sizeof(KmerPos)*(kmers+1)=%ld", (sizeof(KmerPos)*(kmers+1))); fflush(stdout); } if (( (Kshift-1)/BSHIFT + (TooFrequent < INT32_MAX) ) & 0x1) { trg = (KmerPos *) Malloc(sizeof(KmerPos)*(kmers+2),"Allocating Sort_Kmers vectors"); src = (KmerPos *) Malloc(sizeof(KmerPos)*(kmers+2),"Allocating Sort_Kmers vectors"); } else { src = (KmerPos *) Malloc(sizeof(KmerPos)*(kmers+2),"Allocating Sort_Kmers vectors"); trg = (KmerPos *) Malloc(sizeof(KmerPos)*(kmers+2),"Allocating Sort_Kmers vectors"); } if (VERBOSE) printf("\n Allocated %d of %ld (%lu bytes) at %p", (kmers+1), sizeof(KmerPos), (sizeof(KmerPos)*(kmers+1)), (void*)trg); if (src == NULL || trg == NULL) exit (1); if (VERBOSE) { printf("\n Kmer count = "); Print_Number((int64) kmers,0,stdout); printf("\n Using %.2fGb of space\n",(1. 
* kmers) / 33554432); fflush(stdout); } TA_block = block; TA_list = src; TA_track = block->tracks; for (i = 0; i < NTHREADS; i++) { parmt[i].tnum = i; parmt[i].kptr = parmx[i].tptr; for (j = 0; j < BPOWR; j++) parmt[i].kptr[j] = 0; } if (BIASED) for (i = 0; i < NTHREADS; i++) pthread_create(threads+i,NULL,biased_tuple_thread,parmt+i); else for (i = 0; i < NTHREADS; i++) pthread_create(threads+i,NULL,tuple_thread,parmt+i); for (i = 0; i < NTHREADS; i++) pthread_join(threads[i],NULL); x = 0; for (i = 0; i < NTHREADS; i++) { parmx[i].beg = x; j = (int) ((((int64) nreads) * (i+1)) >> NSHIFT); parmx[i].end = x = block->reads[j].boff - j*Kmer; } rez = (KmerPos *) lex_sort(mersort,(Double *) src,(Double *) trg,parmx); if (BIASED || TA_track != NULL) for (i = 0; i < NTHREADS; i++) kmers -= parmt[i].fill; if (TooFrequent < INT32_MAX && kmers > 0) { parmf[0].beg = 0; for (i = 1; i < NTHREADS; i++) { x = (((int64) i)*kmers) >> NSHIFT; h = rez[x-1].code; while (rez[x].code == h) x += 1; parmf[i-1].end = parmf[i].beg = x; } parmf[NTHREADS-1].end = kmers; if (rez[kmers-1].code == 0xffffffffffffffffllu) rez[kmers].code = 0; else rez[kmers].code = 0xffffffffffffffffllu; if (src == rez) { FR_src = src; FR_trg = rez = trg; } else { FR_src = trg; FR_trg = rez = src; } for (i = 0; i < NTHREADS; i++) pthread_create(threads+i,NULL,compsize_thread,parmf+i); for (i = 0; i < NTHREADS; i++) pthread_join(threads[i],NULL); x = 0; for (i = 0; i < NTHREADS; i++) { z = parmf[i].kept; parmf[i].kept = x; x += z; } kmers = x; for (i = 0; i < NTHREADS; i++) pthread_create(threads+i,NULL,compress_thread,parmf+i); for (i = 0; i < NTHREADS; i++) pthread_join(threads[i],NULL); } rez[kmers].code = 0xffffffffffffffffllu; rez[kmers+1].code = 0; if (src != rez) free(src); else free(trg); #ifdef TEST_KSORT { int i; printf("\nKMER SORT:\n"); for (i = 0; i < HOW_MANY && i < kmers; i++) { KmerPos *c = rez+i; printf(" %5d / %5d / %10lld\n",c->read,c->rpos,c->code); } fflush(stdout); } #endif if (VERBOSE) { if 
(TooFrequent < INT32_MAX || BIASED || TA_track != NULL) { printf(" Revised kmer count = "); Print_Number((int64) kmers,0,stdout); printf("\n"); } printf(" Index occupies %.2fGb\n",(1. * kmers) / 67108864); fflush(stdout); } if (kmers <= 0) { free(rez); goto no_mers; } if (kmers > (int64) (MEM_LIMIT/(4*sizeof(KmerPos)))) { fprintf(stderr,"Warning: Block size too big, index occupies more than 1/4 of"); if (MEM_LIMIT == MEM_PHYSICAL) fprintf(stderr," physical memory (%.1fGb)\n",(1.*MEM_LIMIT)/0x40000000ll); else fprintf(stderr," desired memory allocation (%.1fGb)\n",(1.*MEM_LIMIT)/0x40000000ll); fflush(stderr); } *len = kmers; return (rez); no_mers: *len = 0; return (NULL); } /******************************************************************************************* * * FILTER MATCH * ********************************************************************************************/ static int find_tuple(uint64 x, KmerPos *a, int n) { int l, r, m; // smallest k s.t. a[k].code >= x (or n if does not exist) l = 0; r = n; while (l < r) { m = ((l+r) >> 1); if (a[m].code < x) l = m+1; else r = m; } return (l); } // Determine what *will* be the size of the merged list and histogram of sizes for given cutoffs static KmerPos *MG_alist; static KmerPos *MG_blist; static SeedPair *MG_hits; static int MG_comp; static int MG_self; typedef struct { int abeg, aend; int bbeg, bend; int64 *kptr; int64 nhits; int limit; int64 hitgram[MAXGRAM]; } Merge_Arg; static void *count_thread(void *arg) { Merge_Arg *data = (Merge_Arg *) arg; KmerPos *asort = MG_alist; KmerPos *bsort = MG_blist; int64 *gram = data->hitgram; int64 nhits = 0; int aend = data->aend; int64 ct; int ia, ib; int jb, ja; uint64 ca, cb; uint64 da, db; int ar, ap; int a, b; ia = data->abeg; ca = asort[ia].code; ib = data->bbeg; cb = bsort[ib].code; if (MG_self) { while (1) { while (cb < ca) cb = bsort[++ib].code; while (cb > ca) ca = asort[++ia].code; if (cb == ca) { ja = ia++; while ((da = asort[ia].code) == ca) ia += 1; jb = 
//  Worker that walks the sorted lists MG_alist[abeg..aend) and MG_blist[bbeg..bend) in
//    step and, for every code present in both, computes ct, the number of seed hits the
//    pair of equal-code runs will produce.  It stores the sum of the ct's in data->nhits
//    and counts in data->hitgram[ct] the codes with each ct < MAXGRAM.
//
//    Not MG_self: ct is the product of the two run lengths.
//    MG_self: only pairs ordered by read are counted -- b-read < a-read, or with IDENTITY
//      set, b-read <= a-read when MG_comp and otherwise b-read < a-read or the same read
//      at a smaller position.
//
//    Both lists must end in terminator entries (see Sort_Kmers); the walk stops once a
//    matching run starts at or beyond aend.

static void *count_thread(void *arg)
{ Merge_Arg  *data  = (Merge_Arg *) arg;
  KmerPos    *asort = MG_alist;
  KmerPos    *bsort = MG_blist;
  int64      *gram  = data->hitgram;
  int64       nhits = 0;
  int         aend  = data->aend;

  int64  ct;
  int    ia, ib;
  int    jb, ja;
  uint64 ca, cb;
  uint64 da, db;
  int    ar, ap;
  int    a, b;

  ia = data->abeg;
  ca = asort[ia].code;
  ib = data->bbeg;
  cb = bsort[ib].code;
  if (MG_self)
    { while (1)
        { while (cb < ca)
            cb = bsort[++ib].code;
          while (cb > ca)
            ca = asort[++ia].code;
          if (cb == ca)
            { //  [ja,ia) and [jb,ib) are the runs of this code; da, db the codes after

              ja = ia++;
              while ((da = asort[ia].code) == ca)
                ia += 1;
              jb = ib++;
              while ((db = bsort[ib].code) == cb)
                ib += 1;

              if (ia > aend)
                { if (ja >= aend) break;
                  da = asort[ia = aend].code;
                  db = bsort[ib = data->bend].code;
                }

              //  b only moves forward, so each a adds the count of b's before its limit

              ct = 0;
              b  = jb;
              if (IDENTITY)
                for (a = ja; a < ia; a++)
                  { ar = asort[a].read;
                    if (MG_comp)
                      { while (b < ib && bsort[b].read <= ar)
                          b += 1;
                      }
                    else
                      { ap = asort[a].rpos;
                        while (b < ib && bsort[b].read < ar)
                          b += 1;
                        while (b < ib && bsort[b].read == ar && bsort[b].rpos < ap)
                          b += 1;
                      }
                    ct += (b-jb);
                  }
              else
                for (a = ja; a < ia; a++)
                  { ar = asort[a].read;
                    while (b < ib && bsort[b].read < ar)
                      b += 1;
                    ct += (b-jb);
                  }

              nhits += ct;
              ca = da;
              cb = db;

              if (ct < MAXGRAM)
                gram[ct] += 1;
            }
        }
    }
  else
    { while (1)
        { while (cb < ca)
            cb = bsort[++ib].code;
          while (cb > ca)
            ca = asort[++ia].code;
          if (cb == ca)
            { ja = ia++;
              while ((da = asort[ia].code) == ca)
                ia += 1;
              jb = ib++;
              while ((db = bsort[ib].code) == cb)
                ib += 1;

              if (ia > aend)
                { if (ja >= aend) break;
                  da = asort[ia = aend].code;
                  db = bsort[ib = data->bend].code;
                }

              ct  = (ia-ja);
              ct *= (ib-jb);

              nhits += ct;
              ca = da;
              cb = db;

              if (ct < MAXGRAM)
                gram[ct] += 1;
            }
        }
    }

  data->nhits = nhits;

  return (NULL);
}
static void *merge_thread(void *arg) { Merge_Arg *data = (Merge_Arg *) arg; int64 *kptr = data->kptr; KmerPos *asort = MG_alist; KmerPos *bsort = MG_blist; SeedPair *hits = MG_hits; int64 nhits = data->nhits; int aend = data->aend; int limit = data->limit; int64 ct; int ia, ib; int jb, ja; uint64 ca, cb; uint64 da, db; int ar, ap; int a, b, c; ia = data->abeg; ca = asort[ia].code; ib = data->bbeg; cb = bsort[ib].code; if (MG_self) { while (1) { while (cb < ca) cb = bsort[++ib].code; while (cb > ca) ca = asort[++ia].code; if (cb == ca) { ja = ia++; while ((da = asort[ia].code) == ca) ia += 1; jb = ib++; while ((db = bsort[ib].code) == cb) ib += 1; if (ia > aend) { if (ja >= aend) break; da = asort[ia = aend].code; db = bsort[ib = data->bend].code; } ct = 0; b = jb; if (IDENTITY) for (a = ja; a < ia; a++) { ar = asort[a].read; if (MG_comp) { while (b < ib && bsort[b].read <= ar) b += 1; } else { ap = asort[a].rpos; while (b < ib && bsort[b].read < ar) b += 1; while (b < ib && bsort[b].read == ar && bsort[b].rpos < ap) b += 1; } ct += (b-jb); } else for (a = ja; a < ia; a++) { ar = asort[a].read; while (b < ib && bsort[b].read < ar) b += 1; ct += (b-jb); } if (ct < limit) { b = jb; if (IDENTITY) for (a = ja; a < ia; a++) { ap = asort[a].rpos; ar = asort[a].read; if (MG_comp) { while (b < ib && bsort[b].read <= ar) b += 1; } else { while (b < ib && bsort[b].read < ar) b += 1; while (b < ib && bsort[b].read == ar && bsort[b].rpos < ap) b += 1; } if ((ct = b-jb) > 0) { kptr[ap & BMASK] += ct; for (c = jb; c < b; c++) { hits[nhits].bread = bsort[c].read; hits[nhits].aread = ar; hits[nhits].apos = ap; hits[nhits].diag = ap - bsort[c].rpos; nhits += 1; } } } else for (a = ja; a < ia; a++) { ap = asort[a].rpos; ar = asort[a].read; while (b < ib && bsort[b].read < ar) b += 1; if ((ct = b-jb) > 0) { kptr[ap & BMASK] += ct; for (c = jb; c < b; c++) { hits[nhits].bread = bsort[c].read; hits[nhits].aread = ar; hits[nhits].apos = ap; hits[nhits].diag = ap - bsort[c].rpos; nhits += 
1; } } } } ca = da; cb = db; } } } else { while (1) { while (cb < ca) cb = bsort[++ib].code; while (cb > ca) ca = asort[++ia].code; if (cb == ca) { if (ia >= aend) break; ja = ia++; while ((da = asort[ia].code) == ca) ia += 1; jb = ib++; while ((db = bsort[ib].code) == cb) ib += 1; if (ia > aend) { if (ja >= aend) break; da = asort[ia = aend].code; db = bsort[ib = data->bend].code; } ct = ib-jb; if ((ia-ja)*ct < limit) { for (a = ja; a < ia; a++) { ap = asort[a].rpos; kptr[ap & BMASK] += ct; for (b = jb; b < ib; b++) { hits[nhits].bread = bsort[b].read; hits[nhits].aread = asort[a].read; hits[nhits].apos = ap; hits[nhits].diag = ap - bsort[b].rpos; nhits += 1; } } } ca = da; cb = db; } } } return (NULL); } // Report threads: given a segment of merged list, find all seeds and from them all alignments. static HITS_DB *MR_ablock; static HITS_DB *MR_bblock; static SeedPair *MR_hits; static int MR_two; static Align_Spec *MR_spec; static int MR_tspace; typedef struct { uint64 max; uint64 top; uint16 *trace; } Trace_Buffer; static int Entwine(Path *jpath, Path *kpath, Trace_Buffer *tbuf, int *where) { int ac, b2, y2, ae; int i, j, k; int num, den, min; #ifdef SEE_ENTWINE int strt = 1; int iflare, oflare; #endif uint16 *ktrace = tbuf->trace + (uint64) (kpath->trace); uint16 *jtrace = tbuf->trace + (uint64) (jpath->trace); min = 10000; num = 0; den = 0; #ifdef SEE_ENTWINE printf("\n"); #endif y2 = jpath->bbpos; j = jpath->abpos/MR_tspace; b2 = kpath->bbpos; k = kpath->abpos/MR_tspace; if (j < k) { ac = k*MR_tspace; j = 1 + 2*(k-j); k = 1; for (i = 1; i < j; i += 2) y2 += jtrace[i]; } else { ac = j*MR_tspace; k = 1 + 2*(j-k); j = 1; for (i = 1; i < k; i += 2) b2 += ktrace[i]; } ae = jpath->aepos; if (ae > kpath->aepos) ae = kpath->aepos; while (1) { ac += MR_tspace; if (ac >= ae) break; y2 += jtrace[j]; b2 += ktrace[k]; j += 2; k += 2; #ifdef SEE_ENTWINE printf(" @ %5d : %5d %5d = %4d\n",ac,y2,b2,abs(b2-y2)); #endif i = abs(y2-b2); if (i <= min) { min = i; if (i == 0) *where 
= ac; } num += i; den += 1; #ifdef SEE_ENTWINE if (strt) { strt = 0; iflare = i; } oflare = i; #endif } #ifdef SEE_ENTWINE if (den == 0) printf("Nothing\n"); else printf("MINIM = %d AVERAGE = %d IFLARE = %d OFLARE = %d\n",min,num/den,iflare,oflare); #endif if (den == 0) return (-1); else return (min); } // Produce the concatentation of path1 and path2 where they are known to meet at // the trace point with coordinate ap. Place this result in a big growing buffer, // that gets reset when fusion is called with path1 = NULL static void Fusion(Path *path1, int ap, Path *path2, Trace_Buffer *tbuf) { int k, k1, k2; int len, diff; uint16 *trace; k1 = 2 * ((ap/MR_tspace) - (path1->abpos/MR_tspace)); k2 = 2 * ((ap/MR_tspace) - (path2->abpos/MR_tspace)); len = k1+(path2->tlen-k2); if (tbuf->top + len >= tbuf->max) { tbuf->max = 1.2*(tbuf->top+len) + 1000; tbuf->trace = (uint16 *) Realloc(tbuf->trace,sizeof(uint16)*tbuf->max,"Allocating paths"); if (tbuf->trace == NULL) exit (1); } trace = tbuf->trace + tbuf->top; tbuf->top += len; diff = 0; len = 0; if (k1 > 0) { uint16 *t = tbuf->trace + (uint64) (path1->trace); for (k = 0; k < k1; k += 2) { trace[len++] = t[k]; trace[len++] = t[k+1]; diff += t[k]; } } if (k2 < path2->tlen) { uint16 *t = tbuf->trace + (uint64) (path2->trace); for (k = k2; k < path2->tlen; k += 2) { trace[len++] = t[k]; trace[len++] = t[k+1]; diff += t[k]; } } path1->aepos = path2->aepos; path1->bepos = path2->bepos; path1->diffs = diff; path1->trace = (void *) (trace - tbuf->trace); path1->tlen = len; } static int Handle_Redundancies(Path *amatch, int novls, Path *bmatch, Trace_Buffer *tbuf) { Path *jpath, *kpath; int j, k, no; int dist, awhen, bwhen; int hasB; #ifdef TEST_CONTAIN for (j = 0; j < novls; j++) printf(" %3d: [%5d,%5d] x [%5d,%5d]\n",j,amatch[j].abpos,amatch[j].aepos, amatch[j].bbpos,amatch[j].bepos); #endif hasB = (bmatch != NULL); no = 0; for (j = 1; j < novls; j++) { jpath = amatch+j; for (k = no; k >= 0; k--) { kpath = amatch+k; if 
(jpath->abpos < kpath->abpos) { if (kpath->abpos <= jpath->aepos && kpath->bbpos <= jpath->bepos) { dist = Entwine(jpath,kpath,tbuf,&awhen); if (dist == 0) { if (kpath->aepos > jpath->aepos) { if (hasB) { if (MG_comp) { dist = Entwine(bmatch+k,bmatch+j,tbuf,&bwhen); if (dist != 0) continue; Fusion(jpath,awhen,kpath,tbuf); amatch[k] = *jpath; Fusion(bmatch+k,bwhen,bmatch+j,tbuf); #ifdef TEST_CONTAIN printf(" Really 1"); #endif } else { dist = Entwine(bmatch+j,bmatch+k,tbuf,&bwhen); if (dist != 0) continue; Fusion(jpath,awhen,kpath,tbuf); amatch[k] = *jpath; Fusion(bmatch+j,bwhen,bmatch+k,tbuf); bmatch[k] = bmatch[j]; #ifdef TEST_CONTAIN printf(" Really 2"); #endif } } else { Fusion(jpath,awhen,kpath,tbuf); amatch[k] = *jpath; #ifdef TEST_CONTAIN printf(" Really 3"); #endif } } else { amatch[k] = *jpath; if (hasB) bmatch[k] = bmatch[j]; } #ifdef TEST_CONTAIN printf(" Fuse! A %d %d\n",j,k); #endif break; } } } else // kpath->abpos <= jpath->abpos { if (jpath->abpos <= kpath->aepos && jpath->bbpos <= kpath->bepos) { dist = Entwine(kpath,jpath,tbuf,&awhen); if (dist == 0) { if (kpath->abpos == jpath->abpos) { if (kpath->aepos < jpath->aepos) { amatch[k] = *jpath; if (hasB) bmatch[k] = bmatch[j]; } } else if (jpath->aepos > kpath->aepos) { if (hasB) { if (MG_comp) { dist = Entwine(bmatch+j,bmatch+k,tbuf,&bwhen); if (dist != 0) continue; Fusion(kpath,awhen,jpath,tbuf); Fusion(bmatch+j,bwhen,bmatch+k,tbuf); bmatch[k] = bmatch[j]; #ifdef TEST_CONTAIN printf(" Really 4"); #endif } else { dist = Entwine(bmatch+k,bmatch+j,tbuf,&bwhen); if (dist != 0) continue; Fusion(kpath,awhen,jpath,tbuf); Fusion(bmatch+k,bwhen,bmatch+j,tbuf); #ifdef TEST_CONTAIN printf(" Really 5"); #endif } } else { Fusion(kpath,awhen,jpath,tbuf); #ifdef TEST_CONTAIN printf(" Really 6"); #endif } } #ifdef TEST_CONTAIN printf(" Fuse! 
B %d %d\n",j,k); #endif break; } } } } if (k < 0) { no += 1; amatch[no] = *jpath; if (hasB) bmatch[no] = bmatch[j]; } } novls = no+1; #ifdef TEST_CONTAIN for (j = 0; j < novls; j++) printf(" %3d: [%5d,%5d] x [%5d,%5d]\n",j,amatch[j].abpos,amatch[j].aepos, amatch[j].bbpos,amatch[j].bepos); #endif return (novls); } void Diagonal_Span(Path *path, int *mind, int *maxd) { uint16 *points; int i, tlen; int dd, low, hgh; points = path->trace; tlen = path->tlen; dd = path->abpos - path->bbpos; low = hgh = dd; dd = path->aepos - path->bepos; if (dd < low) low = dd; else if (dd > hgh) hgh = dd; dd = (path->abpos/MR_tspace)*MR_tspace - path->bbpos; tlen -= 2; for (i = 1; i < tlen; i += 2) { dd += MR_tspace - points[i]; if (dd < low) low = dd; else if (dd > hgh) hgh = dd; } *mind = (low >> Binshift)-1; *maxd = (hgh >> Binshift)+1; } typedef struct { int64 beg, end; int *score; int *lastp; int *lasta; Work_Data *work; FILE *ofile1; FILE *ofile2; int64 nfilt; int64 ncheck; } Report_Arg; static void *report_thread(void *arg) { Report_Arg *data = (Report_Arg *) arg; SeedPair *hits = MR_hits; Double *hitd = (Double *) MR_hits; char *aseq = (char *) (MR_ablock->bases); char *bseq = (char *) (MR_bblock->bases); HITS_READ *aread = MR_ablock->reads; HITS_READ *bread = MR_bblock->reads; int *score = data->score; int *scorp = data->score + 1; int *scorm = data->score - 1; int *lastp = data->lastp; int *lasta = data->lasta; Work_Data *work = data->work; FILE *ofile1 = data->ofile1; FILE *ofile2 = data->ofile2; int afirst = MR_ablock->tfirst; int bfirst = MR_bblock->tfirst; int maxdiag = ( MR_ablock->maxlen >> Binshift); int mindiag = (-MR_bblock->maxlen >> Binshift); Overlap _ovla, *ovla = &_ovla; Overlap _ovlb, *ovlb = &_ovlb; Alignment _align, *align = &_align; Path *apath = &(ovla->path); Path *bpath; int64 nfilt = 0; int64 ahits = 0; int64 bhits = 0; int small, tbytes; int AOmax, BOmax; int novla, novlb; Path *amatch, *bmatch; Trace_Buffer _tbuf, *tbuf = &_tbuf; Double *hitc; int 
minhit; uint64 cpair, npair; int64 nidx, eidx; // In ovl and align roles of A and B are reversed, as the B sequence must be the // complemented sequence !! align->flags = ovla->flags = ovlb->flags = MG_comp; align->path = apath; if (MR_tspace <= TRACE_XOVR) { small = 1; tbytes = sizeof(uint8); } else { small = 0; tbytes = sizeof(uint16); } AOmax = BOmax = MATCH_CHUNK; amatch = Malloc(sizeof(Path)*AOmax,"Allocating match vector"); bmatch = Malloc(sizeof(Path)*BOmax,"Allocating match vector"); tbuf->max = 2*TRACE_CHUNK; tbuf->trace = Malloc(sizeof(short)*tbuf->max,"Allocating trace vector"); if (amatch == NULL || bmatch == NULL || tbuf->trace == NULL) exit (1); fwrite(&ahits,sizeof(int64),1,ofile1); fwrite(&MR_tspace,sizeof(int),1,ofile1); if (MR_two) { fwrite(&bhits,sizeof(int64),1,ofile2); fwrite(&MR_tspace,sizeof(int),1,ofile2); } minhit = (Hitmin-1)/Kmer + 1; hitc = hitd + (minhit-1); eidx = data->end - minhit; nidx = data->beg; for (cpair = hitd[nidx].p2; nidx < eidx; cpair = npair) if (hitc[nidx].p2 != cpair) { nidx += 1; while ((npair = hitd[nidx].p2) == cpair) nidx += 1; } else { int ar, br; int alen, blen; int doA, doB; int setaln, amark, amark2; int apos, bpos, diag; int64 lidx, sidx; int64 f, h2; ar = hits[nidx].aread; br = hits[nidx].bread; alen = aread[ar].rlen; blen = bread[br].rlen; if (alen < HGAP_MIN && blen < HGAP_MIN) { nidx += 1; while ((npair = hitd[nidx].p2) == cpair) nidx += 1; continue; } #ifdef TEST_GATHER printf("%5d vs %5d : %5d x %5d\n",br+bfirst,ar+afirst,blen,alen); #endif setaln = 1; doA = doB = 0; amark2 = 0; novla = novlb = 0; tbuf->top = 0; for (sidx = nidx; hitd[nidx].p2 == cpair; nidx = h2) { amark = amark2 + PANEL_SIZE; amark2 = amark - PANEL_OVERLAP; h2 = lidx = nidx; do { apos = hits[nidx].apos; npair = hitd[++nidx].p2; if (apos <= amark2) h2 = nidx; } while (npair == cpair && apos <= amark); if (nidx-lidx < minhit) continue; for (f = lidx; f < nidx; f++) { apos = hits[f].apos; diag = hits[f].diag >> Binshift; if (apos - 
lastp[diag] >= Kmer) score[diag] += Kmer; else score[diag] += apos - lastp[diag]; lastp[diag] = apos; } #ifdef TEST_GATHER printf(" %6lld upto %6d",nidx-lidx,amark); #endif for (f = lidx; f < nidx; f++) { apos = hits[f].apos; diag = hits[f].diag; bpos = apos - diag; diag = diag >> Binshift; if (apos > lasta[diag] && (score[diag] + scorp[diag] >= Hitmin || score[diag] + scorm[diag] >= Hitmin)) { if (setaln) { setaln = 0; align->aseq = aseq + aread[ar].boff; align->bseq = bseq + bread[br].boff; align->alen = alen; align->blen = blen; ovlb->bread = ovla->aread = ar + afirst; ovlb->aread = ovla->bread = br + bfirst; doA = (alen >= HGAP_MIN); doB = (SYMMETRIC && blen >= HGAP_MIN && (ar != br || !MG_self || !MG_comp)); } #ifdef TEST_GATHER else printf("\n "); if (scorm[diag] > scorp[diag]) printf(" %5d.. x %5d.. %5d (%3d)", bpos,apos,apos-bpos,score[diag]+scorm[diag]); else printf(" %5d.. x %5d.. %5d (%3d)", bpos,apos,apos-bpos,score[diag]+scorp[diag]); #endif nfilt += 1; bpath = Local_Alignment(align,work,MR_spec,apos-bpos,apos-bpos,apos+bpos,-1,-1); { int low, hgh, ae; Diagonal_Span(apath,&low,&hgh); if (diag < low) low = diag; else if (diag > hgh) hgh = diag; ae = apath->aepos; for (diag = low; diag <= hgh; diag++) if (ae > lasta[diag]) lasta[diag] = ae; #ifdef TEST_GATHER printf(" %d - %d @ %d",low,hgh,apath->aepos); #endif } #ifdef FALCON_DALIGNER_P if (apath->abpos > 24 && apath->bbpos > 24) continue; if (alen - apath->aepos > 24 && blen - apath->bepos > 24) continue; if (alen < 500 || blen < 500) continue; #endif // FALCON_DALIGNER_P if ((apath->aepos-apath->abpos) + (apath->bepos-apath->bbpos) >= MINOVER) { if (doA) { if (novla >= AOmax) { AOmax = 1.2*novla + MATCH_CHUNK; amatch = Realloc(amatch,sizeof(Path)*AOmax, "Reallocating match vector"); if (amatch == NULL) exit (1); } if (tbuf->top + apath->tlen > tbuf->max) { tbuf->max = 1.2*(tbuf->top+apath->tlen) + TRACE_CHUNK; tbuf->trace = Realloc(tbuf->trace,sizeof(short)*tbuf->max, "Reallocating trace vector"); if 
(tbuf->trace == NULL) exit (1); } amatch[novla] = *apath; amatch[novla].trace = (void *) (tbuf->top); memcpy(tbuf->trace+tbuf->top,apath->trace,sizeof(short)*apath->tlen); novla += 1; tbuf->top += apath->tlen; } if (doB) { if (novlb >= BOmax) { BOmax = 1.2*novlb + MATCH_CHUNK; bmatch = Realloc(bmatch,sizeof(Path)*BOmax, "Reallocating match vector"); if (bmatch == NULL) exit (1); } if (tbuf->top + bpath->tlen > tbuf->max) { tbuf->max = 1.2*(tbuf->top+bpath->tlen) + TRACE_CHUNK; tbuf->trace = Realloc(tbuf->trace,sizeof(short)*tbuf->max, "Reallocating trace vector"); if (tbuf->trace == NULL) exit (1); } bmatch[novlb] = *bpath; bmatch[novlb].trace = (void *) (tbuf->top); memcpy(tbuf->trace+tbuf->top,bpath->trace,sizeof(short)*bpath->tlen); novlb += 1; tbuf->top += bpath->tlen; } #ifdef TEST_GATHER printf(" [%5d,%5d] x [%5d,%5d] = %4d", apath->abpos,apath->aepos,apath->bbpos,apath->bepos,apath->diffs); #endif #ifdef SHOW_OVERLAP printf("\n\n %d(%d) vs %d(%d)\n\n", ovla->aread,ovla->alen,ovla->bread,ovla->blen); Print_ACartoon(stdout,align,ALIGN_INDENT); #ifdef SHOW_ALIGNMENT Compute_Trace_ALL(align,work); printf("\n Diff = %d\n",align->path->diffs); Print_Alignment(stdout,align,work, ALIGN_INDENT,ALIGN_WIDTH,ALIGN_BORDER,0,5); #endif #endif // SHOW_OVERLAP } #ifdef TEST_GATHER else printf(" No alignment %d", ((apath->aepos-apath->abpos) + (apath->bepos-apath->bbpos))/2); #endif } } for (f = lidx; f < nidx; f++) { diag = hits[f].diag >> Binshift; score[diag] = lastp[diag] = 0; } #ifdef TEST_GATHER printf("\n"); #endif } for (f = sidx; f < nidx; f++) { int d; diag = hits[f].diag >> Binshift; for (d = diag; d <= maxdiag; d++) if (lasta[d] == 0) break; else lasta[d] = 0; for (d = diag-1; d >= mindiag; d--) if (lasta[d] == 0) break; else lasta[d] = 0; } { int i; #ifdef TEST_CONTAIN if (novla > 1 || novlb > 1) printf("\n%5d vs %5d:\n",ar,br); #endif if (novla > 1) { if (novlb > 1) novla = novlb = Handle_Redundancies(amatch,novla,bmatch,tbuf); else novla = 
Handle_Redundancies(amatch,novla,NULL,tbuf); } else if (novlb > 1) novlb = Handle_Redundancies(bmatch,novlb,NULL,tbuf); for (i = 0; i < novla; i++) { ovla->path = amatch[i]; ovla->path.trace = tbuf->trace + (uint64) (ovla->path.trace); if (small) Compress_TraceTo8(ovla); Write_Overlap(ofile1,ovla,tbytes); } for (i = 0; i < novlb; i++) { ovlb->path = bmatch[i]; ovlb->path.trace = tbuf->trace + (uint64) (ovlb->path.trace); if (small) Compress_TraceTo8(ovlb); Write_Overlap(ofile2,ovlb,tbytes); } ahits += novla; bhits += novlb; } } free(tbuf->trace); free(bmatch); free(amatch); data->nfilt = nfilt; data->ncheck = ahits + bhits; if (MR_two) { rewind(ofile2); fwrite(&bhits,sizeof(int64),1,ofile2); fclose(ofile2); } else ahits += bhits; rewind(ofile1); fwrite(&ahits,sizeof(int64),1,ofile1); fclose(ofile1); return (NULL); } /******************************************************************************************* * * THE ALGORITHM * ********************************************************************************************/ void Match_Filter(char *aname, HITS_DB *ablock, char *bname, HITS_DB *bblock, void *vasort, int alen, void *vbsort, int blen, int comp, Align_Spec *aspec) { THREAD threads[NTHREADS]; Merge_Arg parmm[NTHREADS]; Lex_Arg parmx[NTHREADS]; Report_Arg parmr[NTHREADS]; int pairsort[16]; SeedPair *khit, *hhit; SeedPair *work1, *work2; int64 nhits; int64 nfilt, ncheck; KmerPos *asort, *bsort; int64 atot, btot; asort = (KmerPos *) vasort; bsort = (KmerPos *) vbsort; atot = ablock->totlen; btot = bblock->totlen; MR_tspace = Trace_Spacing(aspec); { int64 powr; int i, nbyte; for (i = 0; i < 16; i++) pairsort[i] = 0; powr = 1; for (nbyte = 0; powr < ablock->maxlen; nbyte += 1) powr <<= 8; for (i = 4; i < 4+nbyte; i++) pairsort[i] = 1; powr = 1; for (nbyte = 0; powr < ablock->nreads; nbyte += 1) powr <<= 8; for (i = 8; i < 8+nbyte; i++) pairsort[i] = 1; powr = 1; for (nbyte = 0; powr < bblock->nreads; nbyte += 1) powr <<= 8; for (i = 12; i < 12+nbyte; i++) 
pairsort[i] = 1; } nfilt = ncheck = nhits = 0; if (VERBOSE) { if (comp) printf("\nComparing %s to c(%s)\n",aname,bname); else printf("\nComparing %s to %s\n",aname,bname); } if (alen == 0 || blen == 0) goto zerowork; { int i, j, p; uint64 c; int limit; MG_alist = asort; MG_blist = bsort; MG_self = (aname == bname); MG_comp = comp; parmm[0].abeg = parmm[0].bbeg = 0; for (i = 1; i < NTHREADS; i++) { p = (int) ((((int64) alen) * i) >> NSHIFT); if (p > 0) { c = asort[p-1].code; while (asort[p].code == c) p += 1; } parmm[i].abeg = parmm[i-1].aend = p; parmm[i].bbeg = parmm[i-1].bend = find_tuple(asort[p].code,bsort,blen); } parmm[NTHREADS-1].aend = alen; parmm[NTHREADS-1].bend = blen; for (i = 0; i < NTHREADS; i++) for (j = 0; j < MAXGRAM; j++) parmm[i].hitgram[j] = 0; for (i = 0; i < NTHREADS; i++) pthread_create(threads+i,NULL,count_thread,parmm+i); for (i = 0; i < NTHREADS; i++) pthread_join(threads[i],NULL); if (VERBOSE) printf("\n"); if (MEM_LIMIT > 0) { int64 histo[MAXGRAM]; int64 tom, avail; for (j = 0; j < MAXGRAM; j++) histo[j] = parmm[0].hitgram[j]; for (i = 1; i < NTHREADS; i++) for (j = 0; j < MAXGRAM; j++) histo[j] += parmm[i].hitgram[j]; if (asort == bsort || (int64) (MEM_LIMIT/sizeof(Double)) > alen + 2*blen) avail = (MEM_LIMIT/sizeof(Double) - alen) / 2; else avail = MEM_LIMIT/sizeof(Double) - (alen + blen); avail *= .98; tom = 0; for (j = 0; j < MAXGRAM; j++) { tom += j*histo[j]; if (tom > avail) break; } limit = j; if (limit <= 1) { fprintf(stderr,"\nError: Insufficient "); if (MEM_LIMIT == MEM_PHYSICAL) fprintf(stderr," physical memory (%.1fGb), reduce block size\n", (1.*MEM_LIMIT)/0x40000000ll); else { fprintf(stderr," memory allocation (%.1fGb),",(1.*MEM_LIMIT)/0x40000000ll); fprintf(stderr," reduce block size or increase allocation\n"); } fflush(stderr); exit (1); } if (limit < 10) { fprintf(stderr,"\nWarning: Sensitivity hampered by low "); if (MEM_LIMIT == MEM_PHYSICAL) fprintf(stderr," physical memory (%.1fGb), reduce block size\n", 
(1.*MEM_LIMIT)/0x40000000ll); else { fprintf(stderr," memory allocation (%.1fGb),",(1.*MEM_LIMIT)/0x40000000ll); fprintf(stderr," reduce block size or increase allocation\n"); } fflush(stderr); } if (VERBOSE) { printf(" Capping mutual k-mer matches over %d (effectively -t%d)\n", limit,(int) sqrt(1.*limit)); fflush(stdout); } for (i = 0; i < NTHREADS; i++) { parmm[i].nhits = 0; for (j = 1; j < limit; j++) parmm[i].nhits += j * parmm[i].hitgram[j]; parmm[i].limit = limit; } } else for (i = 0; i < NTHREADS; i++) parmm[i].limit = INT32_MAX; nhits = parmm[0].nhits; for (i = 1; i < NTHREADS; i++) parmm[i].nhits = nhits += parmm[i].nhits; if (VERBOSE) { printf(" Hit count = "); Print_Number(nhits,0,stdout); if (asort == bsort || nhits >= blen) printf("\n Highwater of %.2fGb space\n", (1. * (alen + 2*nhits)) / 67108864); else printf("\n Highwater of %.2fGb space\n", (1. * (alen + blen + nhits)) / 67108864); fflush(stdout); } if (nhits == 0) goto zerowork; if (asort == bsort) hhit = work1 = (SeedPair *) Malloc(sizeof(SeedPair)*(nhits+1), "Allocating dazzler hit vectors"); else { if (nhits >= blen) bsort = (KmerPos *) Realloc(bsort,sizeof(SeedPair)*(nhits+1), "Reallocating dazzler sort vectors"); hhit = work1 = (SeedPair *) bsort; } khit = work2 = (SeedPair *) Malloc(sizeof(SeedPair)*(nhits+1),"Allocating dazzler hit vectors"); if (hhit == NULL || khit == NULL || bsort == NULL) exit (1); MG_blist = bsort; MG_hits = khit; for (i = NTHREADS-1; i > 0; i--) parmm[i].nhits = parmm[i-1].nhits; parmm[0].nhits = 0; for (i = 0; i < NTHREADS; i++) { parmm[i].kptr = parmx[i].tptr; for (p = 0; p < BPOWR; p++) parmm[i].kptr[p] = 0; } for (i = 0; i < NTHREADS; i++) pthread_create(threads+i,NULL,merge_thread,parmm+i); for (i = 0; i < NTHREADS; i++) pthread_join(threads[i],NULL); #ifdef TEST_PAIRS printf("\nSETUP SORT:\n"); for (i = 0; i < HOW_MANY && i < nhits; i++) { SeedPair *c = khit+i; printf(" %5d / %5d / %5d /%5d\n",c->aread,c->bread,c->apos,c->apos-c->diag); } #endif } { int i; 
int64 x; x = 0; for (i = 0; i < NTHREADS-1; i++) { parmx[i].beg = x; parmx[i].end = x = parmm[i+1].nhits; } parmx[NTHREADS-1].beg = x; parmx[NTHREADS-1].end = nhits; khit = (SeedPair *) lex_sort(pairsort,(Double *) khit,(Double *) hhit,parmx); khit[nhits].aread = 0x7fffffff; khit[nhits].bread = 0x7fffffff; khit[nhits].diag = 0x7fffffff; khit[nhits].apos = 0; #ifdef TEST_CSORT printf("\nCROSS SORT %lld:\n",nhits); for (i = 0; i < HOW_MANY && i <= nhits; i++) { SeedPair *c = khit+i; printf(" %5d / %5d / %5d /%5d\n",c->aread,c->bread,c->apos,c->apos-c->diag); } #endif } { int i, w; int64 p; int d; int *counters; MR_ablock = ablock; MR_bblock = bblock; MR_hits = khit; MR_two = ! MG_self && SYMMETRIC; MR_spec = aspec; parmr[0].beg = 0; for (i = 1; i < NTHREADS; i++) { p = (nhits * i) >> NSHIFT; if (p > 0) { d = khit[p-1].bread; while ((khit[p].bread) == d) p += 1; } parmr[i].beg = parmr[i-1].end = p; } parmr[NTHREADS-1].end = nhits; w = ((ablock->maxlen >> Binshift) - ((-bblock->maxlen) >> Binshift)) + 1; counters = (int *) Malloc(NTHREADS*3*w*sizeof(int),"Allocating diagonal buckets"); if (counters == NULL) exit (1); for (i = 0; i < 3*w*NTHREADS; i++) counters[i] = 0; for (i = 0; i < NTHREADS; i++) { if (i == 0) parmr[i].score = counters - ((-bblock->maxlen) >> Binshift); else parmr[i].score = parmr[i-1].lasta + w; parmr[i].lastp = parmr[i].score + w; parmr[i].lasta = parmr[i].lastp + w; parmr[i].work = New_Work_Data(); parmr[i].ofile1 = Fopen(Catenate(aname,".",bname,Numbered_Suffix((comp?".C":".N"),i,".las")),"w"); if (parmr[i].ofile1 == NULL) exit (1); if (MG_self) parmr[i].ofile2 = parmr[i].ofile1; else if (SYMMETRIC) { parmr[i].ofile2 = Fopen(Catenate(bname,".",aname,Numbered_Suffix((comp?".C":".N"),i,".las")),"w"); if (parmr[i].ofile2 == NULL) exit (1); } } #ifdef NOTHREAD for (i = 0; i < NTHREADS; i++) report_thread(parmr+i); #else for (i = 0; i < NTHREADS; i++) pthread_create(threads+i,NULL,report_thread,parmr+i); for (i = 0; i < NTHREADS; i++) 
pthread_join(threads[i],NULL); #endif if (VERBOSE) for (i = 0; i < NTHREADS; i++) { nfilt += parmr[i].nfilt; ncheck += parmr[i].ncheck; } for (i = 0; i < NTHREADS; i++) Free_Work_Data(parmr[i].work); free(counters); } free(work2); free(work1); goto epilogue; zerowork: { FILE *ofile; int i; nhits = 0; for (i = 0; i < NTHREADS; i++) { ofile = Fopen(Catenate(aname,".",bname,Numbered_Suffix((comp?".C":".N"),i,".las")),"w"); fwrite(&nhits,sizeof(int64),1,ofile); fwrite(&MR_tspace,sizeof(int),1,ofile); fclose(ofile); if (! MG_self && SYMMETRIC) { ofile = Fopen(Catenate(bname,".",aname,Numbered_Suffix((comp?".C":".N"),i,".las")),"w"); fwrite(&nhits,sizeof(int64),1,ofile); fwrite(&MR_tspace,sizeof(int),1,ofile); fclose(ofile); } } } epilogue: if (VERBOSE) { int width; if (nhits <= 0) width = 1; else width = ((int) log10((double) nhits)) + 1; width += (width-1)/3; printf("\n "); Print_Number(nhits,width,stdout); printf(" %d-mers (%e of matrix)\n ",Kmer,(1.*nhits/atot)/btot); Print_Number(nfilt,width,stdout); printf(" seed hits (%e of matrix)\n ",(1.*nfilt/atot)/btot); Print_Number(ncheck,width,stdout); printf(" confirmed hits (%e of matrix)\n",(1.*ncheck/atot)/btot); fflush(stdout); } } pbdagcon-0.3+20161121+ds/DALIGNER/LAsort.c0000644000175000017500000001320113026414545015556 0ustar afifafif/******************************************************************************************* * * Load a file U.las of overlaps into memory, sort them all by A,B index, * and then output the result to U.S.las * * Author: Gene Myers * Date : July 2013 * *******************************************************************************************/ #include #include #include #include #include #include #include #include "DB.h" #include "align.h" static char *Usage = "[-v] ..."; #define MEMORY 1000 // How many megabytes for output buffer static char *IBLOCK; static int SORT_OVL(const void *x, const void *y) { int64 l = *((int64 *) x); int64 r = *((int64 *) y); Overlap *ol, *or; int al, ar; int 
bl, br; int cl, cr; int pl, pr; ol = (Overlap *) (IBLOCK+l); or = (Overlap *) (IBLOCK+r); al = ol->aread; ar = or->aread; if (al != ar) return (al-ar); bl = ol->bread; br = or->bread; if (bl != br) return (bl-br); cl = COMP(ol->flags); cr = COMP(ol->flags); if (cl != cr) return (cl-cr); pl = ol->path.abpos; pr = or->path.abpos; return (pl-pr); } static int SORT_MAP(const void *x, const void *y) { int64 l = *((int64 *) x); int64 r = *((int64 *) y); Overlap *ol, *or; int al, ar; int pl, pr; ol = (Overlap *) (IBLOCK+l); or = (Overlap *) (IBLOCK+r); al = ol->aread; ar = or->aread; if (al != ar) return (al-ar); pl = ol->path.abpos; pr = or->path.abpos; return (pl-pr); } int main(int argc, char *argv[]) { char *iblock, *fblock; int64 isize, osize; int64 ovlsize, ptrsize; int tspace, tbytes; int i; int VERBOSE; int MAP_ORDER; // Process options { int j, k; int flags[128]; ARG_INIT("LAsort") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') { ARG_FLAGS("vc") } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; MAP_ORDER = flags['c']; if (argc <= 1) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // For each file do ptrsize = sizeof(void *); ovlsize = sizeof(Overlap) - ptrsize; isize = 0; iblock = NULL; osize = MEMORY * 1000000ll; fblock = Malloc(osize,"Allocating LAsort output block"); for (i = 1; i < argc; i++) { int64 *perm; FILE *input, *foutput; int64 novl; // Read in the entire file and output header { int64 size; struct stat info; char *pwd, *root, *name; pwd = PathTo(argv[i]); root = Root(argv[i],".las"); name = Catenate(pwd,"/",root,".las"); input = Fopen(name,"r"); if (input == NULL) exit (1); stat(name,&info); size = info.st_size; if (fread(&novl,sizeof(int64),1,input) != 1) SYSTEM_ERROR if (fread(&tspace,sizeof(int),1,input) != 1) SYSTEM_ERROR if (tspace <= TRACE_XOVR) tbytes = sizeof(uint8); else tbytes = sizeof(uint16); if (VERBOSE) { printf(" %s: ",root); Print_Number(novl,0,stdout); printf(" records "); 
Print_Number(size-novl*ovlsize,0,stdout); printf(" trace bytes\n"); fflush(stdout); } foutput = Fopen(Catenate(pwd,"/",root,".S.las"),"w"); if (foutput == NULL) exit (1); fwrite(&novl,sizeof(int64),1,foutput); fwrite(&tspace,sizeof(int),1,foutput); free(pwd); free(root); if (size > isize) { if (iblock == NULL) iblock = Malloc(size+ptrsize,"Allocating LAsort input block"); else iblock = Realloc(iblock-ptrsize,size+ptrsize,"Allocating LAsort input block"); if (iblock == NULL) exit (1); iblock += ptrsize; isize = size; } size -= (sizeof(int64) + sizeof(int)); if (size > 0) { if (fread(iblock,size,1,input) != 1) SYSTEM_ERROR } fclose(input); } // Set up unsorted permutation array perm = (int64 *) Malloc(sizeof(int64)*novl,"Allocating LAsort permutation vector"); if (perm == NULL) exit (1); { int64 off; int j; off = -ptrsize; for (j = 0; j < novl; j++) { perm[j] = off; off += ovlsize + ((Overlap *) (iblock+off))->path.tlen*tbytes; } } // Sort permutation array of ptrs to records IBLOCK = iblock; if (MAP_ORDER) qsort(perm,novl,sizeof(int64),SORT_MAP); else qsort(perm,novl,sizeof(int64),SORT_OVL); // Output the records in sorted order { int j; Overlap *w; int64 tsize, span; char *fptr, *ftop; fptr = fblock; ftop = fblock + osize; for (j = 0; j < novl; j++) { w = (Overlap *) (iblock+perm[j]); tsize = w->path.tlen*tbytes; span = ovlsize + tsize; if (fptr + span > ftop) { fwrite(fblock,1,fptr-fblock,foutput); fptr = fblock; } memcpy(fptr,((char *) w)+ptrsize,ovlsize); fptr += ovlsize; memcpy(fptr,(char *) (w+1),tsize); fptr += tsize; } if (fptr > fblock) fwrite(fblock,1,fptr-fblock,foutput); } free(perm); fclose(foutput); } if (iblock != NULL) free(iblock - ptrsize); free(fblock); exit (0); } pbdagcon-0.3+20161121+ds/DALIGNER/filter.h0000644000175000017500000000170413026414545015651 0ustar afifafif/******************************************************************************************* * * Filter interface for the dazzler. 
* * Author: Gene Myers * Date : July 2013 * ********************************************************************************************/ #ifndef _FILTER #define _FILTER #include "DB.h" #include "align.h" extern int BIASED; extern int VERBOSE; extern int MINOVER; extern int HGAP_MIN; extern int SYMMETRIC; extern int IDENTITY; extern uint64 MEM_LIMIT; extern uint64 MEM_PHYSICAL; #define NTHREADS 4 // Must be a power of 2 #define NSHIFT 2 // log_2 NTHREADS int Set_Filter_Params(int kmer, int binshift, int suppress, int hitmin); void *Sort_Kmers(HITS_DB *block, int *len); void Match_Filter(char *aname, HITS_DB *ablock, char *bname, HITS_DB *bblock, void *atable, int alen, void *btable, int blen, int comp, Align_Spec *asettings); #endif pbdagcon-0.3+20161121+ds/DALIGNER/LICENSE0000644000175000017500000000531113026414545015216 0ustar afifafif Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: · Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. · Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. · The name of EWM may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL EWM BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. For any issues regarding this software and its use, contact EWM at: Eugene W. Myers Jr. Bautzner Str. 122e 01099 Dresden GERMANY Email: gene.myers@gmail.com pbdagcon-0.3+20161121+ds/DAZZ_DB/0000755000175000017500000000000013026414602014133 5ustar afifafifpbdagcon-0.3+20161121+ds/DAZZ_DB/GNUmakefile0000644000175000017500000000170713026414552016216 0ustar afifafifTHISDIR:=$(abspath $(dir $(realpath $(lastword ${MAKEFILE_LIST})))) CFLAGS+= -O3 -Wall -Wextra -fno-strict-aliasing -Wno-unused-result CPPFLAGS+= -MMD -MP LDLIBS+= -lm LDFLAGS+= ALL = fasta2DB DB2fasta quiva2DB DB2quiva DBsplit DBdust Catrack DBshow DBstats DBrm simulator \ fasta2DAM DAM2fasta DBdump vpath %.c ${THISDIR} all: ${ALL} ${ALL}: libdazzdb.a libdazzdb.a: DB.o QV.o ${AR} -rcv $@ $^ # Shared libs are not used yet, but maybe someday. 
%.os: %.c ${CC} -o $@ -c $< -fPIC ${CFLAGS} ${CPPFLAGS} libdazzdb.so: DB.os QV.os ${CC} -o $@ $^ -shared ${LDFLAGS} install: cp -f fasta2DB DBsplit DBshow DBstats DBdust DBdump DB2fasta DBrm simulator ${PREFIX}/bin cp -f libdazzdb.* ${PREFIX}/lib clean: rm -f ${ALL} rm -f ${DEPS} rm -fr *.dSYM *.o *.a *.so *.os rm -f DBupgrade.Sep.25.2014 DBupgrade.Dec.31.2014 DUSTupgrade.Jan.1.2015 rm -f dazz.db.tar.gz SRCS:=$(notdir $(wildcard ${THISDIR}/*.c)) DEPS:=$(patsubst %.c,%.d,${SRCS}) -include ${DEPS} pbdagcon-0.3+20161121+ds/DAZZ_DB/quiva2DB.c0000644000175000017500000002225613026414552015727 0ustar afifafif/******************************************************************************************* * * Adds the given .quiva files to an existing DB "path". The input files must be added in * the same order as the .fasta files were and have the same root names, e.g. FOO.fasta * and FOO.quiva. The files can be added incrementally but must be added in the same order * as the .fasta files. This is enforced by the program. With the -l option set the * compression scheme is a bit lossy to get more compression (see the description of dexqv * in the DEXTRACTOR module). * * Author: Gene Myers * Date : July 2014 * ********************************************************************************************/ #include #include #include #include #include #include #include "DB.h" #include "QV.h" #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif static char *Usage = "[-vl] ( -f | ... 
)"; typedef struct { int argc; char **argv; FILE *input; int count; char *name; } File_Iterator; File_Iterator *init_file_iterator(int argc, char **argv, FILE *input, int first) { File_Iterator *it; it = Malloc(sizeof(File_Iterator),"Allocating file iterator"); it->argc = argc; it->argv = argv; it->input = input; if (input == NULL) it->count = first; else { it->count = 1; rewind(input); } return (it); } int next_file(File_Iterator *it) { static char nbuffer[MAX_NAME+8]; if (it->input == NULL) { if (it->count >= it->argc) return (0); it->name = it->argv[it->count++]; } else { char *eol; if (fgets(nbuffer,MAX_NAME+8,it->input) == NULL) { if (feof(it->input)) return (0); SYSTEM_ERROR; } if ((eol = index(nbuffer,'\n')) == NULL) { fprintf(stderr,"%s: Line %d in file list is longer than %d chars!\n", Prog_Name,it->count,MAX_NAME+7); it->name = NULL; } *eol = '\0'; it->count += 1; it->name = nbuffer; } return (1); } int main(int argc, char *argv[]) { FILE *istub, *quiva, *indx; int64 coff; int ofile; HITS_DB db; HITS_READ *reads; int VERBOSE; int LOSSY; FILE *IFILE; // Process command line { int i, j, k; int flags[128]; ARG_INIT("quiva2DB") IFILE = NULL; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("vl") break; case 'f': IFILE = fopen(argv[i]+2,"r"); if (IFILE == NULL) { fprintf(stderr,"%s: Cannot open file of inputs '%s'\n",Prog_Name,argv[i]+2); exit (1); } break; } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; LOSSY = flags['l']; if ((IFILE == NULL && argc <= 2) || (IFILE != NULL && argc != 2)) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // Open DB stub file and index, load db and read records. Confirm that the .fasta files // corresponding to the command line .quiva files are in the DB and in order where the // index of the first file is ofile and the index of the first read to be added is ofirst. 
// Record in coff the current size of the .qvs file in case an error occurs and it needs // to be truncated back to its size at the start. { int i; char *pwd, *root; int nfiles; File_Iterator *ng; root = Root(argv[1],".db"); pwd = PathTo(argv[1]); istub = Fopen(Catenate(pwd,"/",root,".db"),"r"); if (istub == NULL) exit (1); indx = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r+"); if (indx == NULL) exit (1); if (fread(&db,sizeof(HITS_DB),1,indx) != 1) SYSTEM_ERROR reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*db.ureads,"Allocating DB index"); if (reads == NULL) exit (1); if (fread(reads,sizeof(HITS_READ),db.ureads,indx) != (size_t) (db.ureads)) SYSTEM_ERROR { int first, last; char prolog[MAX_NAME], fname[MAX_NAME]; char *core; ng = init_file_iterator(argc,argv,IFILE,2); if ( ! next_file(ng)) { fprintf(stderr,"%s: file list is empty!\n",Prog_Name); exit (1); } if (ng->name == NULL) exit (1); core = Root(ng->name,".quiva"); if (fscanf(istub,DB_NFILE,&nfiles) != 1) SYSTEM_ERROR first = 0; for (i = 0; i < nfiles; i++) { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) SYSTEM_ERROR if (strcmp(core,fname) == 0) break; first = last; } if (i >= nfiles) { fprintf(stderr,"%s: %s.fasta has never been added to DB\n",Prog_Name,core); exit (1); } ofile = i; if (first > 0 && reads[first-1].coff < 0) { fprintf(stderr,"%s: Predecessor of %s.quiva has not been added yet\n",Prog_Name,core); exit (1); } if (reads[first].coff >= 0) { fprintf(stderr,"%s: %s.quiva has already been added\n",Prog_Name,core); exit (1); } while (next_file(ng)) { if (ng->name == NULL) exit (1); core = Root(ng->name,".quiva"); if (++i >= nfiles) { fprintf(stderr,"%s: %s.fasta has never been added to DB\n",Prog_Name,core); exit (1); } if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) SYSTEM_ERROR if (strcmp(core,fname) != 0) { fprintf(stderr,"%s: Files not being added in order (expect %s, given %s)", Prog_Name,fname,core); exit (1); } } if (ofile == 0) quiva = Fopen(Catenate(pwd,PATHSEP,root,".qvs"),"w"); 
else quiva = Fopen(Catenate(pwd,PATHSEP,root,".qvs"),"r+"); if (quiva == NULL) exit (1); fseeko(quiva,0,SEEK_END); coff = ftello(quiva); free(core); free(ng); } free(root); free(pwd); } // For each .quiva file, determine its compression scheme in a fast scan and append it to // the .qvs file Then compress every .quiva entry in the file, appending its compressed // form to the .qvs file as you go and recording the offset in the .qvs in the .coff field // of each read record (*except* the first, that points at the compression scheme immediately // preceding it). Ensure that the # of .quiva entries matches the # of .fasta entries // in each added file. { int i; int last, cur; File_Iterator *ng; // For each .quiva file do: rewind(istub); if (fscanf(istub,"files = %*d\n") != 0) SYSTEM_ERROR last = 0; for (i = 0; i < ofile; i++) if (fscanf(istub," %9d %*s %*s\n",&last) != 1) SYSTEM_ERROR ng = init_file_iterator(argc,argv,IFILE,2); cur = last; while (next_file(ng)) { FILE *input; int64 qpos; char *pwd, *root; QVcoding *coding; // Open next .quiva file and create its compression scheme pwd = PathTo(ng->name); root = Root(ng->name,".quiva"); if ((input = Fopen(Catenate(pwd,"/",root,".quiva"),"r")) == NULL) goto error; if (VERBOSE) { fprintf(stderr,"Analyzing '%s' ...\n",root); fflush(stderr); } QVcoding_Scan(input); coding = Create_QVcoding(LOSSY); coding->prefix = Strdup(".qvs","Allocating header prefix"); qpos = ftello(quiva); Write_QVcoding(quiva,coding); // Then compress and append to the .qvs each compressed QV entry if (VERBOSE) { fprintf(stderr,"Compressing '%s' ...\n",root); fflush(stderr); } rewind(input); while (Read_Lines(input,1) > 0) { reads[cur++].coff = qpos; Compress_Next_QVentry(input,quiva,coding,LOSSY); qpos = ftello(quiva); } if (fscanf(istub," %9d %*s %*s\n",&last) != 1) SYSTEM_ERROR if (last != cur) { fprintf(stderr,"%s: Number of reads in %s.quiva doesn't match number in %s.fasta\n", Prog_Name,root,root); goto error; } Free_QVcoding(coding); 
free(root); free(pwd); } free(ng); } // Write the db record and read index into .idx and clean up rewind(indx); fwrite(&db,sizeof(HITS_DB),1,indx); fwrite(reads,sizeof(HITS_READ),db.ureads,indx); fclose(istub); fclose(indx); fclose(quiva); exit (0); // Error exit: Either truncate or remove the .qvs file as appropriate. error: if (coff != 0) { fseeko(quiva,0,SEEK_SET); if (ftruncate(fileno(quiva),coff) < 0) SYSTEM_ERROR } fclose(istub); fclose(indx); fclose(quiva); if (coff == 0) { char *root = Root(argv[1],".db"); char *pwd = PathTo(argv[1]); unlink(Catenate(pwd,PATHSEP,root,".qvs")); free(pwd); free(root); } exit (1); } pbdagcon-0.3+20161121+ds/DAZZ_DB/DB2fasta.c0000644000175000017500000000664113026414552015700 0ustar afifafif/******************************************************************************************** * * Recreate all the .fasta files that have been loaded into a specified database. * * Author: Gene Myers * Date : May 2014 * ********************************************************************************************/ #include #include #include #include "DB.h" static char *Usage = "[-vU] [-w] "; int main(int argc, char *argv[]) { HITS_DB _db, *db = &_db; FILE *dbfile; int nfiles; int VERBOSE, UPPER, WIDTH; // Process arguments { int i, j, k; int flags[128]; char *eptr; ARG_INIT("DB2fasta") WIDTH = 80; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("vU") break; case 'w': ARG_NON_NEGATIVE(WIDTH,"Line width") break; } else argv[j++] = argv[i]; argc = j; UPPER = 1 + flags['U']; VERBOSE = flags['v']; if (argc != 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // Open db { int status; status = Open_DB(argv[1],db); if (status < 0) exit (1); if (status == 1) { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]); exit (1); } if (db->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } } { char *pwd, *root; pwd = 
PathTo(argv[1]); root = Root(argv[1],".db"); dbfile = Fopen(Catenate(pwd,"/",root,".db"),"r"); free(pwd); free(root); if (dbfile == NULL) exit (1); } // nfiles = # of files in data base if (fscanf(dbfile,DB_NFILE,&nfiles) != 1) SYSTEM_ERROR // For each file do: { HITS_READ *reads; char *read; int f, first; reads = db->reads; read = New_Read_Buffer(db); first = 0; for (f = 0; f < nfiles; f++) { int i, last; FILE *ofile; char prolog[MAX_NAME], fname[MAX_NAME]; // Scan db image file line, create .fasta file for writing if (fscanf(dbfile,DB_FDATA,&last,fname,prolog) != 3) SYSTEM_ERROR if ((ofile = Fopen(Catenate(".","/",fname,".fasta"),"w")) == NULL) exit (1); if (VERBOSE) { fprintf(stderr,"Creating %s.fasta ...\n",fname); fflush(stdout); } // For the relevant range of reads, write each to the file // recreating the original headers with the index meta-data about each read for (i = first; i < last; i++) { int j, len; int flags, qv; HITS_READ *r; r = reads + i; len = r->rlen; flags = r->flags; qv = (flags & DB_QV); fprintf(ofile,">%s/%d/%d_%d",prolog,r->origin,r->fpulse,r->fpulse+len); if (qv > 0) fprintf(ofile," RQ=0.%3d",qv); fprintf(ofile,"\n"); Load_Read(db,i,read,UPPER); for (j = 0; j+WIDTH < len; j += WIDTH) fprintf(ofile,"%.*s\n",WIDTH,read+j); if (j < len) fprintf(ofile,"%s\n",read+j); } first = last; } } fclose(dbfile); Close_DB(db); exit (0); } pbdagcon-0.3+20161121+ds/DAZZ_DB/fasta2DB.c0000644000175000017500000004633513026414552015704 0ustar afifafif/******************************************************************************************* * * Add .fasta files to a DB: * Adds the given fasta files in the given order to .db. If the db does not exist * then it is created. All .fasta files added to a given data base must have the same * header format and follow Pacbio's convention. A file cannot be added twice and this * is enforced. 
The command either builds or appends to the ..idx and ..bps * files, where the index file (.idx) contains information about each read and their offsets * in the base-pair file (.bps) that holds the sequences where each base is compessed * into 2-bits. The two files are hidden by virtue of their names beginning with a '.'. * .db is effectively a stub file with given name that contains an ASCII listing * of the files added to the DB and possibly the block partitioning for the DB if DBsplit * has been called upon it. * * Author: Gene Myers * Date : May 2013 * Modify: DB upgrade: now *add to* or create a DB depending on whether it exists, read * multiple .fasta files (no longer a stdin pipe). * Date : April 2014 * ********************************************************************************************/ #include #include #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif static char *Usage = "[-v] [-p] ( -f | ... )"; static char number[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; typedef struct { int argc; char **argv; FILE *input; int count; char *name; } File_Iterator; File_Iterator *init_file_iterator(int argc, char **argv, FILE *input, int first) { File_Iterator *it; it = Malloc(sizeof(File_Iterator),"Allocating file iterator"); it->argc = argc; it->argv = argv; it->input = input; if (input == NULL) it->count = first; else { it->count = 1; rewind(input); } return (it); } int next_file(File_Iterator *it) { static char nbuffer[MAX_NAME+8]; if (it->input == NULL) { if (it->count >= it->argc) return (0); it->name = it->argv[it->count++]; } else { 
char *eol; if (fgets(nbuffer,MAX_NAME+8,it->input) == NULL) { if (feof(it->input)) return (0); SYSTEM_ERROR; } if ((eol = index(nbuffer,'\n')) == NULL) { fprintf(stderr,"%s: Line %d in file list is longer than %d chars!\n", Prog_Name,it->count,MAX_NAME+7); it->name = NULL; } *eol = '\0'; it->count += 1; it->name = nbuffer; } return (1); } int main(int argc, char *argv[]) { FILE *istub, *ostub; char *dbname; char *root, *pwd; FILE *bases, *indx; int64 boff, ioff; int ifiles, ofiles; char **flist; HITS_DB db; int ureads; int64 offset; FILE *IFILE; char *PNAME; int VERBOSE; // Usage: [-v] ( -f | ... ) { int i, j, k; int flags[128]; ARG_INIT("fasta2DB") IFILE = NULL; PNAME = NULL; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("v") break; case 'f': IFILE = fopen(argv[i]+2,"r"); if (IFILE == NULL) { fprintf(stderr,"%s: Cannot open file of inputs '%s'\n",Prog_Name,argv[i]+2); exit (1); } break; case 'p': PNAME = argv[i]+2; break; } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; if ((IFILE == NULL && argc <= 2) || (IFILE != NULL && argc != 2)) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // Try to open DB file, if present then adding to DB, otherwise creating new DB. 
Set up // variables as follows: // dbname = full name of db = /.db // istub = open db file (if adding) or NULL (if creating) // ostub = new image of db file (will overwrite old image at end) // bases = .bps file positioned for appending // indx = .idx file positioned for appending // ureads = # of reads currently in db // offset = offset in .bps at which to place next sequence // ioff = offset in .idx file to truncate to if command fails // boff = offset in .bps file to truncate to if command fails // ifiles = # of .fasta files to add // ofiles = # of .fasta files already in db // flist = [0..ifiles+ofiles] list of file names (root only) added to db so far { int i; root = Root(argv[1],".db"); pwd = PathTo(argv[1]); dbname = Strdup(Catenate(pwd,"/",root,".db"),"Allocating db name"); if (dbname == NULL) exit (1); if (IFILE == NULL) ifiles = argc-2; else { File_Iterator *ng; ifiles = 0; ng = init_file_iterator(argc,argv,IFILE,2); while (next_file(ng)) ifiles += 1; free(ng); } istub = fopen(dbname,"r"); if (istub == NULL) { ofiles = 0; bases = Fopen(Catenate(pwd,PATHSEP,root,".bps"),"w+"); indx = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"w+"); if (bases == NULL || indx == NULL) exit (1); fwrite(&db,sizeof(HITS_DB),1,indx); ureads = 0; offset = 0; boff = 0; ioff = 0; } else { if (fscanf(istub,DB_NFILE,&ofiles) != 1) SYSTEM_ERROR bases = Fopen(Catenate(pwd,PATHSEP,root,".bps"),"r+"); indx = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r+"); if (bases == NULL || indx == NULL) exit (1); if (fread(&db,sizeof(HITS_DB),1,indx) != 1) SYSTEM_ERROR fseeko(bases,0,SEEK_END); fseeko(indx, 0,SEEK_END); ureads = db.ureads; offset = ftello(bases); boff = offset; ioff = ftello(indx); } flist = (char **) Malloc(sizeof(char *)*(ofiles+ifiles),"Allocating file list"); ostub = Fopen(Catenate(pwd,"/",root,".dbx"),"w+"); if (ostub == NULL || flist == NULL) exit (1); fprintf(ostub,DB_NFILE,ofiles+ifiles); for (i = 0; i < ofiles; i++) { int last; char prolog[MAX_NAME], fname[MAX_NAME]; if 
(fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) SYSTEM_ERROR if ((flist[i] = Strdup(fname,"Adding to file list")) == NULL) goto error; fprintf(ostub,DB_FDATA,last,fname,prolog); } } { int maxlen; int64 totlen, count[4]; int pmax, rmax; HITS_READ *prec; char *read; int c; File_Iterator *ng; // Buffer for reads all in the same well pmax = 100; prec = (HITS_READ *) Malloc(sizeof(HITS_READ)*pmax,"Allocating record buffer"); if (prec == NULL) goto error; // Buffer for accumulating .fasta sequence over multiple lines rmax = MAX_NAME + 60000; read = (char *) Malloc(rmax+1,"Allocating line buffer"); if (read == NULL) goto error; totlen = 0; // total # of bases in new .fasta files maxlen = 0; // longest read in new .fasta files for (c = 0; c < 4; c++) // count of acgt in new .fasta files count[c] = 0; // For each new .fasta file do: ng = init_file_iterator(argc,argv,IFILE,2); while (next_file(ng)) { FILE *input; char *path, *core, *prolog; int nline, eof, rlen, pcnt; int pwell; if (ng->name == NULL) goto error; // Open it: /.fasta, check that core is not too long, // and checking that it is not already in flist. path = PathTo(ng->name); core = Root(ng->name,".fasta"); if ((input = Fopen(Catenate(path,"/",core,".fasta"),"r")) == NULL) goto error; free(path); if (strlen(core) >= MAX_NAME) { fprintf(stderr,"%s: File name over %d chars: '%.200s'\n", Prog_Name,MAX_NAME,core); goto error; } { int j; for (j = 0; j < ofiles; j++) if (strcmp(core,flist[j]) == 0) { fprintf(stderr,"%s: File %s.fasta is already in database %s.db\n", Prog_Name,core,Root(argv[1],".db")); goto error; } } // Get the header of the first line. If the file is empty skip. 
pcnt = 0; rlen = 0; nline = 1; eof = (fgets(read,MAX_NAME,input) == NULL); if (eof || strlen(read) < 1) { fprintf(stderr,"Skipping '%s', file is empty!\n",core); fclose(input); free(core); continue; } // Add the file name to flist if (VERBOSE) { fprintf(stderr,"Adding '%s' ...\n",core); fflush(stderr); } flist[ofiles++] = core; // Check that the first line has PACBIO format, and record prolog in 'prolog'. if (read[strlen(read)-1] != '\n') { fprintf(stderr,"File %s.fasta, Line 1: Fasta line is too long (> %d chars)\n", core,MAX_NAME-2); goto error; } if (!eof && read[0] != '>') { fprintf(stderr,"File %s.fasta, Line 1: First header in fasta file is missing\n",core); goto error; } { char *find; int well, beg, end, qv; find = index(read+1,'/'); if (find != NULL && sscanf(find+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv) >= 3) { *find = '\0'; if (PNAME != NULL) prolog = Strdup(PNAME,"Extracting prolog"); else prolog = Strdup(read+1,"Extracting prolog"); *find = '/'; if (prolog == NULL) goto error; } else { fprintf(stderr,"File %s.fasta, Line %d: Pacbio header line format error\n", core,nline); goto error; } } // Read in all the sequences until end-of-file { int i, x; pwell = -1; while (!eof) { int beg, end, clen; int well, qv; char *find; find = index(read+(rlen+1),'/'); if (find == NULL) { fprintf(stderr,"File %s.fasta, Line %d: Pacbio header line format error\n", core,nline); goto error; } if (PNAME == NULL) { *find = '\0'; if (strcmp(read+(rlen+1),prolog) != 0) { fprintf(stderr, "File %s.fasta, Line %d: Pacbio header line name inconsistent\n", core,nline); goto error; } *find = '/'; } x = sscanf(find+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv); if (x < 3) { fprintf(stderr,"File %s.fasta, Line %d: Pacbio header line format error\n", core,nline); goto error; } else if (x == 3) qv = 0; rlen = 0; while (1) { eof = (fgets(read+rlen,MAX_NAME,input) == NULL); nline += 1; x = strlen(read+rlen)-1; if (read[rlen+x] != '\n') { if (read[rlen] == '>') { fprintf(stderr,"File 
%s.fasta, Line %d:",core,nline); fprintf(stderr," Fasta header line is too long (> %d chars)\n", MAX_NAME-2); goto error; } else x += 1; } if (eof || read[rlen] == '>') break; rlen += x; if (rlen + MAX_NAME > rmax) { rmax = ((int) (1.2 * rmax)) + 1000 + MAX_NAME; read = (char *) realloc(read,rmax+1); if (read == NULL) { fprintf(stderr,"File %s.fasta, Line %d:",core,nline); fprintf(stderr," Out of memory (Allocating line buffer)\n"); goto error; } } } read[rlen] = '\0'; for (i = 0; i < rlen; i++) { x = number[(int) read[i]]; count[x] += 1; read[i] = (char) x; } ureads += 1; totlen += rlen; if (rlen > maxlen) maxlen = rlen; prec[pcnt].origin = well; prec[pcnt].fpulse = beg; prec[pcnt].rlen = rlen; prec[pcnt].boff = offset; prec[pcnt].coff = -1; prec[pcnt].flags = qv; Compress_Read(rlen,read); clen = COMPRESSED_LEN(rlen); fwrite(read,1,clen,bases); offset += clen; if (pwell == well) { prec[pcnt].flags |= DB_CSS; pcnt += 1; if (pcnt >= pmax) { pmax = ((int) (pcnt*1.2)) + 100; prec = (HITS_READ *) realloc(prec,sizeof(HITS_READ)*pmax); if (prec == NULL) { fprintf(stderr,"File %s.fasta, Line %d: Out of memory",core,nline); fprintf(stderr," (Allocating read records)\n"); goto error; } } } else if (pcnt == 0) pcnt += 1; else { x = 0; for (i = 1; i < pcnt; i++) if (prec[i].rlen > prec[x].rlen) x = i; prec[x].flags |= DB_BEST; fwrite(prec,sizeof(HITS_READ),pcnt,indx); prec[0] = prec[pcnt]; pcnt = 1; } pwell = well; } // Complete processing of .fasta file: flush last well group, write file line // in db image, free prolog, and close file x = 0; for (i = 1; i < pcnt; i++) if (prec[i].rlen > prec[x].rlen) x = i; prec[x].flags |= DB_BEST; fwrite(prec,sizeof(HITS_READ),pcnt,indx); fprintf(ostub,DB_FDATA,ureads,core,prolog); } free(prolog); fclose(input); } // Finished loading all sequences: update relevant fields in db record db.ureads = ureads; if (istub == NULL) { for (c = 0; c < 4; c++) db.freq[c] = (float) ((1.*count[c])/totlen); db.totlen = totlen; db.maxlen = maxlen; 
db.cutoff = -1; } else { for (c = 0; c < 4; c++) db.freq[c] = (float) ((db.freq[c]*db.totlen + (1.*count[c]))/(db.totlen + totlen)); db.totlen += totlen; if (maxlen > db.maxlen) db.maxlen = maxlen; } } // If db has been previously partitioned then calculate additional partition points and // write to new db file image if (db.cutoff >= 0) { int64 totlen, dbpos, size; int nblock, ireads, tfirst, rlen; int ufirst, cutoff, allflag; HITS_READ record; int i; if (VERBOSE) { fprintf(stderr,"Updating block partition ...\n"); fflush(stderr); } // Read the block portion of the existing db image getting the indices of the first // read in the last block of the exisiting db as well as the partition parameters. // Copy the old image block information to the new block information (except for // the indices of the last partial block) if (fscanf(istub,DB_NBLOCK,&nblock) != 1) SYSTEM_ERROR dbpos = ftello(ostub); fprintf(ostub,DB_NBLOCK,0); if (fscanf(istub,DB_PARAMS,&size,&cutoff,&allflag) != 3) SYSTEM_ERROR fprintf(ostub,DB_PARAMS,size,cutoff,allflag); if (allflag) allflag = 0; else allflag = DB_BEST; nblock -= 1; for (i = 0; i <= nblock; i++) { if (fscanf(istub,DB_BDATA,&ufirst,&tfirst) != 2) SYSTEM_ERROR fprintf(ostub,DB_BDATA,ufirst,tfirst); } // Seek the first record of the last block of the existing db in .idx, and then // compute and record partition indices for the rest of the db from this point // forward. 
fseeko(indx,sizeof(HITS_DB)+sizeof(HITS_READ)*ufirst,SEEK_SET); totlen = 0; ireads = 0; for (i = ufirst; i < ureads; i++) { if (fread(&record,sizeof(HITS_READ),1,indx) != 1) SYSTEM_ERROR rlen = record.rlen; if (rlen >= cutoff && (record.flags & DB_BEST) >= allflag) { ireads += 1; tfirst += 1; totlen += rlen; if (totlen >= size) { fprintf(ostub," %9d %9d\n",i+1,tfirst); totlen = 0; ireads = 0; nblock += 1; } } } if (ireads > 0) { fprintf(ostub,DB_BDATA,ureads,tfirst); nblock += 1; } db.treads = tfirst; fseeko(ostub,dbpos,SEEK_SET); fprintf(ostub,DB_NBLOCK,nblock); // Rewind and record the new number of blocks } else db.treads = ureads; rewind(indx); fwrite(&db,sizeof(HITS_DB),1,indx); // Write the finalized db record into .idx rewind(ostub); // Rewrite the number of files actually added fprintf(ostub,DB_NFILE,ofiles); if (istub != NULL) fclose(istub); fclose(ostub); fclose(indx); fclose(bases); rename(Catenate(pwd,"/",root,".dbx"),dbname); // New image replaces old image exit (0); // Error exit: Either truncate or remove the .idx and .bps files as appropriate. // Remove the new image file /.dbx error: if (ioff != 0) { fseeko(indx,0,SEEK_SET); if (ftruncate(fileno(indx),ioff) < 0) SYSTEM_ERROR } if (boff != 0) { fseeko(bases,0,SEEK_SET); if (ftruncate(fileno(bases),boff) < 0) SYSTEM_ERROR } fclose(indx); fclose(bases); if (ioff == 0) unlink(Catenate(pwd,PATHSEP,root,".idx")); if (boff == 0) unlink(Catenate(pwd,PATHSEP,root,".bps")); if (istub != NULL) fclose(istub); fclose(ostub); unlink(Catenate(pwd,"/",root,".dbx")); exit (1); } pbdagcon-0.3+20161121+ds/DAZZ_DB/QV.h0000644000175000017500000001042413026414552014637 0ustar afifafif/******************************************************************************************* * * Compressor/decompressor for .quiv files: customized Huffman codes for each stream based on * the histogram of values occuring in a given file. 
The two low complexity streams * (deletionQV and substitutionQV) use a Huffman coding of the run length of the prevalent * character. * * Author: Gene Myers * Date: Jan 18, 2014 * Modified: July 25, 2014 * ********************************************************************************************/ #ifndef _QV_COMPRESSOR #include #define _QV_COMPRESSOR // The defined constant INTERACTIVE (set in DB.h) determines whether an interactive or // batch version of the routines in this library are compiled. In batch mode, routines // print an error message and exit. In interactive mode, the routines place the error // message in EPLACE (also defined in DB.h) and return an error value, typically NULL // if the routine returns a pointer, and an unusual integer value if the routine returns // an integer. // Below when an error return is described, one should understand that this value is returned // only if the routine was compiled in INTERACTIVE mode. // A PacBio compression scheme typedef struct { void *delScheme; // Huffman scheme for deletion QVs void *insScheme; // Huffman scheme for insertion QVs void *mrgScheme; // Huffman scheme for merge QVs void *subScheme; // Huffman scheme for substitution QVs void *dRunScheme; // Huffman scheme for deletion run lengths (if delChar > 0) void *sRunScheme; // Huffman scheme for substitution run lengths (if subChar > 0) int delChar; // If > 0, run-encoded deletion value int subChar; // If > 0, run-encoded substitution value int flip; // Need to flip multi-byte integers char *prefix; // Header line prefix } QVcoding; // Read the next nlines of input, and QVentry returns a pointer to the first line if needed. // If end-of-input is encountered before any further input, -1 is returned. If there is // an error then -2 is returned. Otherwise the length of the line(s) read is returned. int Read_Lines(FILE *input, int nlines); char *QVentry(); // Read the .quiva file on input and record frequency statistics.
If there is an error // then 1 is returned, otherwise 0. int QVcoding_Scan(FILE *input); // Given QVcoding_Scan has been called at least once, create an encoding scheme based on // the accumulated statistics and return a pointer to it. The returned encoding object // is *statically* allocated within the routine. If lossy is set then use a lossy scaling // for the insertion and merge streams. If there is an error, then NULL is returned. QVcoding *Create_QVcoding(int lossy); // Read/write a coding scheme to input/output. The encoding object returned by the reader // is *statically* allocated within the routine. If an error occurs while reading then // NULL is returned. QVcoding *Read_QVcoding(FILE *input); void Write_QVcoding(FILE *output, QVcoding *coding); // Free all the auxiliary storage associated with coding (but not the object itself!) void Free_QVcoding(QVcoding *coding); // Assuming the file pointer is positioned just beyond an entry header line, read the // next set of 5 QV lines, compress them according to 'coding', and output. If lossy // is set then the scheme is a lossy one. A non-zero value is returned only if an // error occurred. int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy); // Assuming the input is positioned just beyond the compressed encoding of an entry header, // read the set of compressed encodings for the ensuing 5 QV vectors, decompress them, // and place their decompressed values into entry which is a 5 element array of character // pointers. The parameter rlen computed from the preceding header line, critically // provides the length of each of the 5 vectors. A non-zero value is returned only if an // error occurred.
int Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen); #endif // _QV_COMPRESSOR pbdagcon-0.3+20161121+ds/DAZZ_DB/Catrack.c0000644000175000017500000001561313026414552015661 0ustar afifafif/******************************************************************************************** * * Concate in block order all "block tracks" ..# into a single track * . * * Author: Gene Myers * Date : June 2014 * ********************************************************************************************/ #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif static char *Usage = "[-v] "; int main(int argc, char *argv[]) { char *prefix; FILE *aout, *dout; int VERBOSE; // Process arguments { int i, j, k; int flags[128]; ARG_INIT("Catrack") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') { ARG_FLAGS("v") } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; if (argc != 3) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } { char *pwd, *root; int plen; plen = strlen(argv[1]); if (strcmp(argv[1]+(plen-3),".dam") == 0) root = Root(argv[1],".dam"); else root = Root(argv[1],".db"); pwd = PathTo(argv[1]); prefix = Strdup(Catenate(pwd,PATHSEP,root,"."),"Allocating track name"); free(pwd); free(root); aout = fopen(Catenate(prefix,argv[2],".","anno"),"r"); if (aout != NULL) { fprintf(stderr,"%s: Track file %s%s.anno already exists!\n",Prog_Name,prefix,argv[2]); fclose(aout); exit (1); } dout = fopen(Catenate(prefix,argv[2],".","data"),"r"); if (dout != NULL) { fprintf(stderr,"%s: Track file %s%s.data already exists!\n",Prog_Name,prefix,argv[2]); fclose(dout); exit (1); } aout = Fopen(Catenate(prefix,argv[2],".","anno"),"w"); if (aout == NULL) exit (1); dout = NULL; } { int tracktot, tracksiz; int64 trackoff; int nfiles; char data[1024]; void *anno; anno = NULL; trackoff = 0; tracktot = tracksiz = 0; fwrite(&tracktot,sizeof(int),1,aout); 
fwrite(&tracksiz,sizeof(int),1,aout); nfiles = 0; while (1) { FILE *afile, *dfile; int i, size, tracklen; afile = fopen(Numbered_Suffix(prefix,nfiles+1,Catenate(".",argv[2],".","anno")),"r"); if (afile == NULL) break; dfile = fopen(Numbered_Suffix(prefix,nfiles+1,Catenate(".",argv[2],".","data")),"r"); if (VERBOSE) { fprintf(stderr,"Concatenating %s%d.%s ...\n",prefix,nfiles+1,argv[2]); fflush(stderr); } if (fread(&tracklen,sizeof(int),1,afile) != 1) SYSTEM_ERROR if (fread(&size,sizeof(int),1,afile) != 1) SYSTEM_ERROR if (nfiles == 0) { tracksiz = size; if (dfile != NULL) { dout = Fopen(Catenate(prefix,argv[2],".","data"),"w"); if (dout == NULL) { fclose(afile); fclose(dfile); goto error; } } else { anno = Malloc(size,"Allocating annotation record"); if (anno == NULL) { fclose(afile); goto error; } } } else { int escape = 1; if (tracksiz != size) { fprintf(stderr,"%s: Track block %d does not have the same annotation size (%d)", Prog_Name,nfiles+1,size); fprintf(stderr," as previous blocks (%d)\n",tracksiz); } else if (dfile == NULL && dout != NULL) fprintf(stderr,"%s: Track block %d does not have data but previous blocks do\n", Prog_Name,nfiles+1); else if (dfile != NULL && dout == NULL) fprintf(stderr,"%s: Track block %d has data but previous blocks do not\n", Prog_Name,nfiles+1); else escape = 0; if (escape) { fclose(afile); if (dfile != NULL) fclose(dfile); if (anno != NULL) free(anno); goto error; } } if (dfile != NULL) { int64 dlen; if (size == 4) { int anno4; for (i = 0; i < tracklen; i++) { if (fread(&anno4,sizeof(int),1,afile) != 1) SYSTEM_ERROR anno4 += trackoff; fwrite(&anno4,sizeof(int),1,aout); } if (fread(&anno4,sizeof(int),1,afile) != 1) SYSTEM_ERROR dlen = anno4; } else { int64 anno8; for (i = 0; i < tracklen; i++) { if (fread(&anno8,sizeof(int64),1,afile) != 1) SYSTEM_ERROR anno8 += trackoff; fwrite(&anno8,sizeof(int64),1,aout); } if (fread(&anno8,sizeof(int64),1,afile) != 1) SYSTEM_ERROR dlen = anno8; } trackoff += dlen; for (i = 1024; i < dlen; i 
+= 1024) { if (fread(data,1024,1,dfile) != 1) SYSTEM_ERROR fwrite(data,1024,1,dout); } i -= 1024; if (i < dlen) { if (fread(data,dlen-i,1,dfile) != 1) SYSTEM_ERROR fwrite(data,dlen-i,1,dout); } } else { for (i = 0; i < tracklen; i++) { if (fread(anno,size,1,afile) != 1) SYSTEM_ERROR fwrite(anno,size,1,aout); } } tracktot += tracklen; nfiles += 1; if (dfile != NULL) fclose(dfile); fclose(afile); } if (nfiles == 0) { fprintf(stderr,"%s: Couldn't find first track block %s1.%s.anno\n", Prog_Name,prefix,argv[2]); goto error; } else { if (dout != NULL) { if (tracksiz == 4) { int anno4 = trackoff; fwrite(&anno4,sizeof(int),1,aout); } else { int64 anno8 = trackoff; fwrite(&anno8,sizeof(int64),1,aout); } } else { fwrite(anno,tracksiz,1,aout); free(anno); } rewind(aout); fwrite(&tracktot,sizeof(int),1,aout); fwrite(&tracksiz,sizeof(int),1,aout); } } fclose(aout); if (dout != NULL) fclose(dout); free(prefix); exit (0); error: fclose(aout); unlink(Catenate(prefix,argv[2],".","anno")); if (dout != NULL) { fclose(dout); unlink(Catenate(prefix,argv[2],".","data")); } free(prefix); exit (1); } pbdagcon-0.3+20161121+ds/DAZZ_DB/DBstats.c0000644000175000017500000002177313026414552015661 0ustar afifafif/******************************************************************************************* * * Display statistics about the contents of a .db and a histogram of its read lengths. 
*
*  Author:  Gene Myers
*  Date  :  July 2013
*  Mod   :  April 2014
*
********************************************************************************************/

// NOTE(review): the four system header names below are missing in this copy (apparently
//   stripped during extraction) -- restore them from the upstream source.
#include
#include
#include
#include

#include "DB.h"

// NOTE(review): placeholders in the usage text appear to have been stripped too.
static char *Usage = " [-nu] [-b] [-m]+ ";

//  DBstats: print read (or contig) counts, base totals, average and standard deviation of
//    the lengths, base composition, and (unless -n) a length histogram with bins of -b
//    bases.  Then print the same kind of interval statistics for every track loaded on
//    the data base (those named with -m).  -u keeps the data base untrimmed.

int main(int argc, char *argv[])
{ HITS_DB _db, *db = &_db;
  int     dam;                  //  Status returned by Open_DB (tested as "is a .dam")

  int64   ototal;               //  Read count and base total before trimming
  int     oreads;
  int     nbin, *hist;          //  Histogram: # of items and sum of lengths per bin
  int64  *bsum;

  int     NONE;
  int     TRIM;
  int     BIN;

  int     MMAX, MTOP;           //  Capacity and fill of the -m track name array
  char  **MASK;

  //  Process arguments

  { int   i, j, k;
    int   flags[128];
    char *eptr;

    ARG_INIT("DBstats")

    BIN  = 1000;
    MTOP = 0;
    MMAX = 10;
    MASK = (char **) Malloc(MMAX*sizeof(char *),"Allocating mask track array");
    if (MASK == NULL)
      exit (1);

    j = 1;
    for (i = 1; i < argc; i++)
      if (argv[i][0] == '-')
        switch (argv[i][1])
        { default:
            ARG_FLAGS("nu")
            break;
          case 'b':
            ARG_POSITIVE(BIN,"Bin size")
            break;
          case 'm':
            if (MTOP >= MMAX)
              { MMAX = 1.2*MTOP + 10;
                MASK = (char **) Realloc(MASK,MMAX*sizeof(char *),"Reallocating mask track array");
                if (MASK == NULL)
                  exit (1);
              }
            MASK[MTOP++] = argv[i]+2;      //  Track name follows "-m" in the same argument
            break;
        }
      else
        argv[j++] = argv[i];
    argc = j;

    NONE = flags['n'];
    TRIM = 1-flags['u'];

    if (argc != 2)
      { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
        exit (1);
      }
  }

  { int i, status, kind;

    //  Open .db or .dam

    status = Open_DB(argv[1],db);
    if (status < 0)
      exit (1);
    dam = status;

    //  Check tracks and load tracks for untrimmed DB

    for (i = 0; i < MTOP; i++)
      { status = Check_Track(db,MASK[i],&kind);
        if (status == -2)
          fprintf(stderr,"%s: Warning: -m%s option given but no track found.\n",Prog_Name,MASK[i]);
        else if (status == -1)
          fprintf(stderr,"%s: Warning: %s track not sync'd with db.\n",Prog_Name,MASK[i]);
        else if (kind != MASK_TRACK)
          fprintf(stderr,"%s: Warning: %s track is not a mask track.\n",Prog_Name,MASK[i]);
        else if (status == 0)
          Load_Track(db,MASK[i]);
        else if (status == 1 && !TRIM)
          fprintf(stderr,"%s: Warning: %s track is for a trimmed db but -u is set.\n",
                         Prog_Name,MASK[i]);
      }

    oreads = db->nreads;       //  Remember the untrimmed totals for the "out of" figures
    ototal = db->totlen;

    if (TRIM)
      { Trim_DB(db);

        //  Load tracks for trimmed DB

        for (i = 0; i < MTOP; i++)
          { status = Check_Track(db,MASK[i],&kind);
            if (status < 0)
              continue;
            else if (status == 1)
              Load_Track(db,MASK[i]);
          }
      }
  }

  //  Read (or contig) statistics

  { int        i;
    int64      totlen;
    int        nreads, maxlen;
    HITS_READ *reads;

    nreads = db->nreads;
    totlen = db->totlen;
    maxlen = db->maxlen;
    reads  = db->reads;

    nbin = maxlen/BIN + 1;
    hist = (int *) Malloc(sizeof(int)*nbin,"Allocating histograms");
    bsum = (int64 *) Malloc(sizeof(int64)*nbin,"Allocating histograms");
    if (hist == NULL || bsum == NULL)
      exit (1);

    for (i = 0; i < nbin; i++)
      { hist[i] = 0;
        bsum[i] = 0;
      }

    for (i = 0; i < nreads; i++)
      { int rlen = reads[i].rlen;
        hist[rlen/BIN] += 1;
        bsum[rlen/BIN] += rlen;
      }

    if (dam)
      printf("\nStatistics for all contigs");
    else if (db->all || !TRIM)
      printf("\nStatistics for all wells");
    else
      printf("\nStatistics for all reads");
    if (TRIM && db->cutoff > 0)
      { printf(" of length ");
        Print_Number(db->cutoff,0,stdout);
        printf(" bases or more\n\n");
      }
    else if (dam)
      printf(" in the map index\n\n");
    else
      printf(" in the data set\n\n");

    Print_Number((int64) nreads,15,stdout);
    if (dam)
      printf(" contigs");
    else
      printf(" reads ");
    if (TRIM)
      { printf(" out of ");
        Print_Number((int64 ) oreads,15,stdout);
        if (oreads <= 0)                  //  Avoid dividing by zero
          printf(" (100.0%%)");
        else
          printf(" (%5.1f%%)",(100.*nreads)/oreads);
      }
    printf("\n");

    Print_Number(totlen,15,stdout);
    printf(" base pairs");
    if (TRIM)
      { printf(" out of ");
        Print_Number(ototal,15,stdout);
        if (ototal <= 0)
          printf(" (100.0%%)");
        else
          printf(" (%5.1f%%)",(100.*totlen)/ototal);
      }
    printf("\n\n");

    if (nreads > 0)
      { int64 ave, dev;

        ave = totlen/nreads;
        Print_Number(ave,15,stdout);
        if (dam)
          printf(" average contig length\n");
        else
          { printf(" average read length\n");

            dev = 0;
            for (i = 0; i < nreads; i++)
              { int rlen = reads[i].rlen;
                dev += (rlen-ave)*(rlen-ave);
              }
            dev = (int64) sqrt((1.*dev)/nreads);

            Print_Number(dev,15,stdout);
            printf(" standard deviation\n");
          }
      }

    if (totlen <= 0)           //  Nothing further to report on an empty data set
      { free(hist);
        free(bsum);
        Close_DB(db);
        exit (0);
      }

    printf("\n Base composition: %.3f(A) %.3f(C) %.3f(G) %.3f(T)\n",
           db->freq[0],db->freq[1],db->freq[2],db->freq[3]);

    if (!NONE)
      { int64 btot;
        int   cum, skip, avg;

        printf("\n Distribution of Read Lengths (Bin size = ");
        Print_Number((int64) BIN,0,stdout);
        printf(")\n\n Bin: Count %% Reads %% Bases Average\n");
        if (dam)
          skip = 0;            //  For a .dam, bins with a zero count are not printed
        else
          skip = -1;           //  Otherwise no count equals skip, so every bin is printed
        cum  = 0;
        btot = 0;
        for (i = nbin-1; i >= 0; i--)     //  Longest bin first, cumulative totals
          { cum  += hist[i];
            btot += bsum[i];
            if (hist[i] != skip)
              { Print_Number((int64) (i*BIN),11,stdout);
                printf(":");
                Print_Number((int64) hist[i],11,stdout);
                if (cum > 0)
                  avg = btot/cum;
                else
                  avg = 0;
                printf(" %5.1f %5.1f %9d\n",(100.*cum)/nreads,(100.*btot)/totlen,avg);
              }
            if (cum == nreads) break;
          }
      }
  }

  //  Interval statistics for each loaded track.  The track data is read as pairs of ints
  //    delimited per read by the int64 offsets in anno; an interval's length is the
  //    difference of its pair.  hist and bsum are reused from above.

  { int64       totlen;
    int         numint, maxlen;
    HITS_TRACK *track;

    for (track = db->tracks; track != NULL; track = track->next)
      { char  *data = track->data;
        int64 *anno = (int64 *) track->anno;
        int   *idata, *edata;
        int64  ave, dev, btot;
        int    k, rlen, cum;

        totlen = 0;
        numint = 0;
        maxlen = 0;
        for (k = 0; k < db->nreads; k++)
          { edata = (int *) (data + anno[k+1]);
            for (idata = (int *) (data + anno[k]); idata < edata; idata += 2)
              { rlen = idata[1] - *idata;
                numint += 1;
                totlen += rlen;
                if (rlen > maxlen)
                  maxlen = rlen;
              }
          }

        printf("\n\nStatistics for %s-track\n",track->name);

        printf("\n There are ");
        Print_Number(numint,0,stdout);
        printf(" intervals totaling ");
        Print_Number(totlen,0,stdout);
        printf(" bases (%.1f%% of all data)\n",(100.*totlen)/db->totlen);

        if (numint <= 0) continue;

        nbin = maxlen/BIN + 1;
        for (k = 0; k < nbin; k++)
          { hist[k] = 0;
            bsum[k] = 0;
          }

        ave = totlen/numint;
        dev = 0;
        for (k = 0; k < db->nreads; k++)
          { edata = (int *) (data + anno[k+1]);
            for (idata = (int *) (data + anno[k]); idata < edata; idata += 2)
              { rlen = idata[1] - *idata;
                dev += (rlen-ave)*(rlen-ave);
                hist[rlen/BIN] += 1;
                bsum[rlen/BIN] += rlen;
              }
          }
        dev = (int64) sqrt((1.*dev)/numint);

        printf("\n");
        Print_Number(ave,15,stdout);
        printf(" average interval length\n");
        Print_Number(dev,15,stdout);
        printf(" standard deviation\n");

        printf("\n Distribution of %s intervals (Bin size = ",track->name);
        Print_Number((int64) BIN,0,stdout);
        printf(")\n\n Bin: Count %% Intervals %% Bases Average\n");
        cum  = 0;
        btot = 0;
        for (k = nbin-1; k >= 0; k--)
          { cum  += hist[k];
            btot += bsum[k];
            if (hist[k] > 0)
              { Print_Number((int64) (k*BIN),11,stdout);
                printf(":");
                Print_Number((int64) hist[k],11,stdout);
                printf(" %5.1f %5.1f %9lld\n",(100.*cum)/numint,
                                                (100.*btot)/totlen,btot/cum);
                if (cum == numint) break;
              }
          }
        printf("\n");
      }
  }

  free(hist);
  free(bsum);
  Close_DB(db);

  exit (0);
}
pbdagcon-0.3+20161121+ds/DAZZ_DB/DBsplit.c0000644000175000017500000001342113026414552015645 0ustar afifafif
/*******************************************************************************************
 *
 *  Split a .db into a set of sub-database blocks for use by the Dazzler:
 *     Divide the database .db conceptually into a series of blocks referable to on the
 *     command line as .1.db, .2.db, ...  If the -x option is set then all reads
 *     less than the given length are ignored, and if the -a option is not set then secondary
 *     reads from a given well are also ignored.  The remaining reads are split amongst the
 *     blocks so that each block is of size -s * 1Mbp except for the last which necessarily
 *     contains a smaller residual.  The default value for -s is 200Mbp (this is the value
 *     the code initializes it to).  The blocks
 *     are very space efficient in that their sub-index of the master .idx is computed on the
 *     fly when loaded, and the .bps file of base pairs is shared with the master DB.  Any
 *     tracks associated with the DB are also computed on the fly when loading a database block.
 *
 *  Author:  Gene Myers
 *  Date  :  September 2013
 *  Mod   :  New splitting definition to support incrementality, and new stub file format
 *  Date  :  April 2014
 *
 ********************************************************************************************/

// NOTE(review): the four system header names below are missing in this copy (apparently
//   stripped during extraction) -- restore them from the upstream source.
#include
#include
#include
#include

#include "DB.h"

#ifdef HIDE_FILES
#define PATHSEP "/."
#else
#define PATHSEP "/"
#endif

// NOTE(review): option-argument and operand placeholders appear to have been stripped
//   from this usage text during extraction.
static char *Usage = "[-a] [-x] [-s] ";

//  DBsplit: rewrite the block partition recorded at the end of the data base stub file
//    (.db or .dam) and the cutoff/all/treads fields of the record at the start of the .idx
//    file.  -x gives the minimum read length, -a keeps all reads rather than only those
//    flagged DB_BEST, and -s gives the block size in units of 1,000,000 bases.

int main(int argc, char *argv[])
{ HITS_DB db, dbs;             //  db = opened data base, dbs = header record of the .idx file
  int64   dbpos;               //  Stub file offset where the partition section starts
  FILE   *dbfile, *ixfile;
  int     status;

  int     ALL;
  int     CUTOFF;
  int64   SIZE;

  //  Process arguments

  { int   i, j, k;
    int   flags[128];
    char *eptr;
    float size;

    ARG_INIT("DBsplit")

    CUTOFF = 0;
    size   = 200;

    j = 1;
    for (i = 1; i < argc; i++)
      if (argv[i][0] == '-')
        switch (argv[i][1])
        { default:
            ARG_FLAGS("a")
            break;
          case 'x':
            ARG_NON_NEGATIVE(CUTOFF,"Min read length cutoff")
            break;
          case 's':
            ARG_REAL(size)
            if (size <= 0.)
              { fprintf(stderr,"%s: Block size must be a positive number\n",Prog_Name);
                exit (1);
              }
            break;
        }
      else
        argv[j++] = argv[i];
    argc = j;

    SIZE = size*1000000ll;
    ALL  = flags['a'];

    if (argc != 2)
      { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
        exit (1);
      }
  }

  //  Open db

  status = Open_DB(argv[1],&db);
  if (status < 0)
    exit (1);
  if (db.part > 0)
    { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]);
      exit (1);
    }

  //  Open the stub and index files for update, skip over the stub's file list, and
  //    (after confirmation if a partition already exists) write the new parameters

  { char   *pwd, *root;
    char    buffer[2*MAX_NAME+100];
    int     nfiles;
    int     i;

    pwd = PathTo(argv[1]);
    if (status)                //  Non-zero status from Open_DB: the stub is a .dam
      { root   = Root(argv[1],".dam");
        dbfile = Fopen(Catenate(pwd,"/",root,".dam"),"r+");
      }
    else
      { root   = Root(argv[1],".db");
        dbfile = Fopen(Catenate(pwd,"/",root,".db"),"r+");
      }
    ixfile = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r+");
    if (dbfile == NULL || ixfile == NULL)
      exit (1);
    free(pwd);
    free(root);

    if (fscanf(dbfile,DB_NFILE,&nfiles) != 1)
      SYSTEM_ERROR
    for (i = 0; i < nfiles; i++)
      if (fgets(buffer,2*MAX_NAME+100,dbfile) == NULL)
        SYSTEM_ERROR

    if (fread(&dbs,sizeof(HITS_DB),1,ixfile) != 1)
      SYSTEM_ERROR

    if (dbs.cutoff >= 0)
      { printf("You are about to overwrite the current partition settings. This\n");
        printf("will invalidate any tracks, overlaps, and other derivative files.\n");
        printf("Are you sure you want to proceed? [Y/N] ");
        fflush(stdout);
        if (fgets(buffer,100,stdin) == NULL)
          SYSTEM_ERROR
        //  Any reply containing 'n' or 'N' aborts; every other reply proceeds
        if (index(buffer,'n') != NULL || index(buffer,'N') != NULL)
          { printf("Aborted\n");
            fflush(stdout);
            fclose(dbfile);
            exit (1);
          }
      }

    //  A positioning call is made here between reading and writing the "r+" stream

    dbpos = ftello(dbfile);
    fseeko(dbfile,dbpos,SEEK_SET);
    fprintf(dbfile,DB_NBLOCK,0);          //  Block count placeholder, rewritten below
    fprintf(dbfile,DB_PARAMS,SIZE,CUTOFF,ALL);
  }

  //  Walk the reads, emitting a block boundary each time the accumulated length of the
  //    retained reads reaches SIZE

  { HITS_READ *reads  = db.reads;
    int        nreads = db.ureads;
    int64      totlen;
    int        nblock, ireads, treads, rlen, fno;
    int        i;

    nblock = 0;
    totlen = 0;
    ireads = 0;                //  Retained reads in the current block
    treads = 0;                //  Retained reads overall
    fprintf(dbfile,DB_BDATA,0,0);
    if (ALL)
      for (i = 0; i < nreads; i++)
        { rlen = reads[i].rlen;
          if (rlen >= CUTOFF)
            { ireads += 1;
              treads += 1;
              totlen += rlen;
              if (totlen >= SIZE)
                { fprintf(dbfile,DB_BDATA,i+1,treads);
                  totlen = 0;
                  ireads = 0;
                  nblock += 1;
                }
            }
        }
    else
      for (i = 0; i < nreads; i++)
        { rlen = reads[i].rlen;
          if (rlen >= CUTOFF && (reads[i].flags & DB_BEST) != 0)
            { ireads += 1;
              treads += 1;
              totlen += rlen;
              if (totlen >= SIZE)
                { fprintf(dbfile,DB_BDATA,i+1,treads);
                  totlen = 0;
                  ireads = 0;
                  nblock += 1;
                }
            }
        }

    if (ireads > 0)            //  Residual last block
      { fprintf(dbfile,DB_BDATA,nreads,treads);
        nblock += 1;
      }

    //  Cut off anything left over from a longer, earlier partition, then go back and
    //    fill in the real block count

    fno = fileno(dbfile);
    if (ftruncate(fno,ftello(dbfile)) < 0)
      SYSTEM_ERROR

    fseeko(dbfile,dbpos,SEEK_SET);
    fprintf(dbfile,DB_NBLOCK,nblock);

    dbs.cutoff = CUTOFF;
    dbs.all    = ALL;
    dbs.treads = treads;
    rewind(ixfile);
    fwrite(&dbs,sizeof(HITS_DB),1,ixfile);
  }

  fclose(ixfile);
  fclose(dbfile);
  Close_DB(&db);

  exit (0);
}
pbdagcon-0.3+20161121+ds/DAZZ_DB/Makefile0000644000175000017500000000416013026414552015600 0ustar afifafifCFLAGS = -O3 -Wall -Wextra -Wno-unused-result -fno-strict-aliasing ALL = fasta2DB DB2fasta quiva2DB DB2quiva DBsplit DBdust Catrack DBshow DBstats DBrm simulator \ fasta2DAM DAM2fasta DBdump all: $(ALL) fasta2DB: fasta2DB.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o fasta2DB fasta2DB.c DB.c QV.c -lm DB2fasta: DB2fasta.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DB2fasta DB2fasta.c DB.c QV.c -lm quiva2DB: quiva2DB.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o quiva2DB quiva2DB.c DB.c QV.c -lm DB2quiva:
DB2quiva.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DB2quiva DB2quiva.c DB.c QV.c -lm DBsplit: DBsplit.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DBsplit DBsplit.c DB.c QV.c -lm DBdust: DBdust.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DBdust DBdust.c DB.c QV.c -lm Catrack: Catrack.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o Catrack Catrack.c DB.c QV.c -lm DBshow: DBshow.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DBshow DBshow.c DB.c QV.c -lm DBdump: DBdump.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DBdump DBdump.c DB.c QV.c -lm DBstats: DBstats.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DBstats DBstats.c DB.c QV.c -lm DBrm: DBrm.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DBrm DBrm.c DB.c QV.c -lm simulator: simulator.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o simulator simulator.c DB.c QV.c -lm fasta2DAM: fasta2DAM.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o fasta2DAM fasta2DAM.c DB.c QV.c -lm DAM2fasta: DAM2fasta.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DAM2fasta DAM2fasta.c DB.c QV.c -lm DBupgrade.Sep.25.2014: DBupgrade.Sep.25.2014.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DBupgrade.Sep.25.2014 DBupgrade.Sep.25.2014.c DB.c QV.c -lm DBupgrade.Dec.31.2014: DBupgrade.Dec.31.2014.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DBupgrade.Dec.31.2014 DBupgrade.Dec.31.2014.c DB.c QV.c -lm DUSTupgrade.Jan.1.2015: DUSTupgrade.Jan.1.2015.c DB.c DB.h QV.c QV.h gcc $(CFLAGS) -o DUSTupgrade.Jan.1.2015 DUSTupgrade.Jan.1.2015.c DB.c QV.c -lm clean: rm -f $(ALL) rm -fr *.dSYM rm -f DBupgrade.Sep.25.2014 DBupgrade.Dec.31.2014 DUSTupgrade.Jan.1.2015 rm -f dazz.db.tar.gz install: cp $(ALL) ~/bin package: make clean tar -zcf dazz.db.tar.gz README Makefile *.h *.c pbdagcon-0.3+20161121+ds/DAZZ_DB/DUSTupgrade.Jan.1.2015.c0000644000175000017500000000367413026414552017700 0ustar afifafif/******************************************************************************************* * * Interim code: upgrade dust track indices from int's to int64's * * Author: Gene Myers * Date : December 2014 * 
********************************************************************************************/

// NOTE(review): the four system header names below are missing in this copy (apparently
//   stripped during extraction) -- restore them from the upstream source.
#include
#include
#include
#include

#include "DB.h"

#ifdef HIDE_FILES
#define PATHSEP "/."
#else
#define PATHSEP "/"
#endif

//  Convert the "dust" track of the data base named by argv[1] into new files
//    <root>.next.anno and <root>.next.data: the annotation record size in the header
//    becomes 8 and every int offset is widened to an int64, while in the data every
//    second int (the second member of each pair) is incremented by 1.  The original
//    .dust files are only read.

int main(int argc, char *argv[])
{ FILE  *afile, *dfile;        //  Existing .dust.anno / .dust.data
  FILE  *nafile, *ndfile;      //  New .next.anno / .next.data
  char  *pwd, *root;
  int    size, tracklen;
  int    i, vint, dint;
  int64  vlong;

  // NOTE(review): the operand placeholder of the usage text appears to have been stripped.
  if (argc != 2)
    { fprintf(stderr,"Usage: %s \n",argv[0]);
      exit (1);
    }

  pwd    = PathTo(argv[1]);
  root   = Root(argv[1],".db");
  afile  = Fopen(Catenate(pwd,PATHSEP,root,".dust.anno"),"r");
  dfile  = Fopen(Catenate(pwd,PATHSEP,root,".dust.data"),"r");
  nafile = Fopen(Catenate(pwd,PATHSEP,root,".next.anno"),"w");
  ndfile = Fopen(Catenate(pwd,PATHSEP,root,".next.data"),"w");
  if (afile == NULL || dfile == NULL || nafile == NULL || ndfile == NULL)
    exit (1);
  free(pwd);
  free(root);

  //  Header: the track length is copied, the record size read is replaced by 8

  if (fread(&tracklen,sizeof(int),1,afile) != 1)
    SYSTEM_ERROR
  fwrite(&tracklen,sizeof(int),1,nafile);
  if (fread(&size,sizeof(int),1,afile) != 1)
    SYSTEM_ERROR
  size = 8;
  fwrite(&size,sizeof(int),1,nafile);

  //  tracklen+1 offsets are widened; after the loop vint holds the last one read

  for (i = 0; i <= tracklen; i++)
    { if (fread(&vint,sizeof(int),1,afile) != 1)
        SYSTEM_ERROR
      vlong = vint;
      fwrite(&vlong,sizeof(int64),1,nafile);
    }

  //  The last offset divided by 4 is used as the number of ints in the data file

  vint >>= 2;
  for (i = 0; i < vint; i += 2)
    { if (fread(&dint,sizeof(int),1,dfile) != 1)
        SYSTEM_ERROR
      fwrite(&dint,sizeof(int),1,ndfile);
      if (fread(&dint,sizeof(int),1,dfile) != 1)
        SYSTEM_ERROR
      dint += 1;
      fwrite(&dint,sizeof(int),1,ndfile);
    }

  fclose(nafile);
  fclose(ndfile);
  fclose(afile);
  fclose(dfile);

  exit (0);
}
pbdagcon-0.3+20161121+ds/DAZZ_DB/DBdump.c0000644000175000017500000004415713026414552015471 0ustar afifafif
/*******************************************************************************************
 *
 *  Display a portion of the data-base and selected information in 1-code format.
* * Author: Gene Myers * Date : November 2015 * ********************************************************************************************/ #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif static char *Usage[] = { "[-rhsiq] [-uU] [-m]+", " [ | ... ]" }; #define LAST_READ_SYMBOL '$' #define MAX_BUFFER 10001 typedef struct { FILE *input; int lineno; int read; int beg; int end; } File_Iterator; File_Iterator *init_file_iterator(FILE *input) { File_Iterator *it; it = Malloc(sizeof(File_Iterator),"Allocating file iterator"); it->input = input; it->lineno = 1; rewind(input); return (it); } int next_read(File_Iterator *it) { static char nbuffer[MAX_BUFFER]; char *eol; int x; if (fgets(nbuffer,MAX_BUFFER,it->input) == NULL) { if (feof(it->input)) return (1); SYSTEM_ERROR; } if ((eol = index(nbuffer,'\n')) == NULL) { fprintf(stderr,"%s: Line %d in read list is longer than %d chars!\n", Prog_Name,it->lineno,MAX_BUFFER-1); return (1); } *eol = '\0'; x = sscanf(nbuffer," %d %d %d",&(it->read),&(it->beg),&(it->end)); if (x == 1) it->beg = -1; else if (x != 3) { fprintf(stderr,"%s: Line %d of read list is improperly formatted\n",Prog_Name,it->lineno); return (1); } it->lineno += 1; return (0); } static int qv_map[51] = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y' }; int main(int argc, char *argv[]) { HITS_DB _db, *db = &_db; FILE *hdrs = NULL; int64 *qv_idx = NULL; uint8 *qv_val = NULL; int nfiles; char **flist = NULL; int *findx = NULL; int input_pts; int reps = 0; int *pts = NULL; File_Iterator *iter = NULL; FILE *input = NULL; int TRIM, UPPER; int DORED, DOSEQ, DOQVS, DOHDR, DOIQV, DAM; int MMAX, MTOP; char **MASK; HITS_TRACK **MTRACK; // Process arguments { int i, j, k; int flags[128]; 
ARG_INIT("DBdump") MTOP = 0; MMAX = 10; MASK = (char **) Malloc(MMAX*sizeof(char *),"Allocating mask track array"); if (MASK == NULL) exit (1); j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("hqrsiuU") break; case 'm': if (MTOP >= MMAX) { MMAX = 1.2*MTOP + 10; MASK = (char **) Realloc(MASK,MMAX*sizeof(char *),"Reallocating mask track array"); if (MASK == NULL) exit (1); } MASK[MTOP++] = argv[i]+2; break; } else argv[j++] = argv[i]; argc = j; DAM = 0; TRIM = 1-flags['u']; UPPER = 1+flags['U']; DOQVS = flags['q']; DORED = flags['r']; DOSEQ = flags['s']; DOHDR = flags['h']; DOIQV = flags['i']; if (argc <= 1) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]); exit (1); } if ( ! TRIM && DOIQV) { fprintf(stderr,"%s: -i and -u are incompatible\n",Prog_Name); exit (1); } } // Open DB or DAM, and if a DAM open also .hdr file { char *pwd, *root; int status; status = Open_DB(argv[1],db); if (status < 0) exit (1); if (status == 1) { root = Root(argv[1],".dam"); pwd = PathTo(argv[1]); hdrs = Fopen(Catenate(pwd,PATHSEP,root,".hdr"),"r"); if (hdrs == NULL) exit (1); DAM = 1; if (DOQVS) { fprintf(stderr,"%s: -Q and -q options not compatible with a .dam DB\n",Prog_Name); exit (1); } free(root); free(pwd); } } // Load QVs if requested if (DOQVS) { if (Load_QVs(db) < 0) { fprintf(stderr,"%s: QVs requested, but no .qvs for data base\n",Prog_Name); exit (1); } } // Check tracks and load tracks for untrimmed DB { int i, status, kind; MTRACK = Malloc(sizeof(HITS_TRACK *)*MTOP,"Allocation of track pointer vector"); if (MTRACK == NULL) exit (1); for (i = 0; i < MTOP; i++) { status = Check_Track(db,MASK[i],&kind); if (status == -2) { fprintf(stderr,"%s: Warning: -m%s option given but no track found.\n", Prog_Name,MASK[i]); exit (1); } else if (status == -1) { fprintf(stderr,"%s: Warning: %s track not sync'd with db.\n",Prog_Name,MASK[i]); exit (1); } else if (kind != 
MASK_TRACK) { fprintf(stderr,"%s: Warning: %s track is not a mask track.\n",Prog_Name,MASK[i]); exit (1); } else if (status == 0) MTRACK[i] = Load_Track(db,MASK[i]); else if (status == 1 && !TRIM) { fprintf(stderr,"%s: Warning: %s track is for a trimmed db but -u is set.\n", Prog_Name,MASK[i]); exit (1); } } } // If not a DAM then get prolog names and index ranges from the .db file if (!DAM) { char *pwd, *root; FILE *dstub; int i; root = Root(argv[1],".db"); pwd = PathTo(argv[1]); if (db->part > 0) *rindex(root,'.') = '\0'; dstub = Fopen(Catenate(pwd,"/",root,".db"),"r"); if (dstub == NULL) exit (1); free(pwd); free(root); if (fscanf(dstub,DB_NFILE,&nfiles) != 1) SYSTEM_ERROR flist = (char **) Malloc(sizeof(char *)*nfiles,"Allocating file list"); findx = (int *) Malloc(sizeof(int *)*(nfiles+1),"Allocating file index"); if (flist == NULL || findx == NULL) exit (1); findx += 1; findx[-1] = 0; for (i = 0; i < nfiles; i++) { char prolog[MAX_NAME], fname[MAX_NAME]; if (fscanf(dstub,DB_FDATA,findx+i,fname,prolog) != 3) SYSTEM_ERROR if ((flist[i] = Strdup(prolog,"Adding to file list")) == NULL) exit (1); } fclose(dstub); // If TRIM (the default) then "trim" prolog ranges and the DB if (TRIM) { int nid, oid, lid; int cutoff, allflag; HITS_READ *reads; reads = db->reads - db->ufirst; cutoff = db->cutoff; if (db->all) allflag = 0; else allflag = DB_BEST; nid = 0; oid = db->ufirst; lid = oid + db->nreads; for (i = 0; i < nfiles; i++) { while (oid < findx[i] && oid < lid) { if ((reads[oid].flags & DB_BEST) >= allflag && reads[oid].rlen >= cutoff) nid++; oid += 1; } findx[i] = nid; } } else if (db->part > 0) { for (i = 0; i < nfiles; i++) findx[i] -= db->ufirst; } } if (TRIM) { int i, status, kind; Trim_DB(db); // Load tracks for trimmed DB for (i = 0; i < MTOP; i++) { status = Check_Track(db,MASK[i],&kind); if (status < 0) continue; else if (status == 1) MTRACK[i] = Load_Track(db,MASK[i]); } } if (DOIQV) { int status, kind; HITS_TRACK *track; status = 
Check_Track(db,"qual",&kind); if (status == -2) { fprintf(stderr,"%s: .qual-track does not exist for this db.\n",Prog_Name); exit (1); } if (status == -1) { fprintf(stderr,"%s: .qual-track not sync'd with db.\n",Prog_Name); exit (1); } track = Load_Track(db,"qual"); qv_idx = (int64 *) track->anno; qv_val = (uint8 *) track->data; } // Process read index arguments into a list of read ranges input_pts = 0; if (argc == 3) { if (argv[2][0] != LAST_READ_SYMBOL || argv[2][1] != '\0') { char *eptr, *fptr; int b, e; b = strtol(argv[2],&eptr,10); if (eptr > argv[2] && b > 0) { if (*eptr == '-') { if (eptr[1] != LAST_READ_SYMBOL || eptr[2] != '\0') { e = strtol(eptr+1,&fptr,10); input_pts = (fptr <= eptr+1 || *fptr != '\0' || e <= 0); } } else input_pts = (*eptr != '\0'); } else input_pts = 1; } } if (input_pts) { input = Fopen(argv[2],"r"); if (input == NULL) exit (1); iter = init_file_iterator(input); } else { pts = (int *) Malloc(sizeof(int)*2*(argc-1),"Allocating read parameters"); if (pts == NULL) exit (1); reps = 0; if (argc > 2) { int c, b, e; char *eptr, *fptr; for (c = 2; c < argc; c++) { if (argv[c][0] == LAST_READ_SYMBOL) { b = db->nreads; eptr = argv[c]+1; } else b = strtol(argv[c],&eptr,10); if (eptr > argv[c]) { if (b <= 0) { fprintf(stderr,"%s: %d is not a valid index\n",Prog_Name,b); exit (1); } if (*eptr == 0) { pts[reps++] = b; pts[reps++] = b; continue; } else if (*eptr == '-') { if (eptr[1] == LAST_READ_SYMBOL) { e = db->nreads; fptr = eptr+2; } else e = strtol(eptr+1,&fptr,10); if (fptr > eptr+1 && *fptr == 0 && e > 0) { pts[reps++] = b; pts[reps++] = e; if (b > e) { fprintf(stderr,"%s: Empty range '%s'\n",Prog_Name,argv[c]); exit (1); } continue; } } } fprintf(stderr,"%s: argument '%s' is not an integer range\n",Prog_Name,argv[c]); exit (1); } } else { pts[reps++] = 1; pts[reps++] = db->nreads; } } // Scan to count the size of things { HITS_READ *reads; int c, b, e, i, m; int map, substr; int64 noreads; int64 seqmax, seqtot; int64 iqvmax, iqvtot; int64 
hdrmax, hdrtot; int64 trkmax[MTOP], trktot[MTOP]; map = 0; reads = db->reads; substr = 0; noreads = 0; seqmax = 0; seqtot = 0; iqvmax = 0; iqvtot = 0; hdrmax = 0; hdrtot = 0; for (m = 0; m < MTOP; m++) { trkmax[m] = 0; trktot[m] = 0; } c = 0; while (1) { if (input_pts) { if (next_read(iter)) break; e = iter->read; b = e-1; substr = (iter->beg >= 0); } else { if (c >= reps) break; b = pts[c]-1; e = pts[c+1]; if (e > db->nreads) e = db->nreads; c += 2; } for (i = b; i < e; i++) { int len, ten; int fst, lst; HITS_READ *r; r = reads + i; len = r->rlen; noreads += 1; if (DOHDR) { int ten; if (DAM) { char header[MAX_NAME]; fseeko(hdrs,r->coff,SEEK_SET); fgets(header,MAX_NAME,hdrs); header[strlen(header)-1] = '\0'; ten = strlen(header); } else { while (i < findx[map-1]) map -= 1; while (i >= findx[map]) map += 1; ten = strlen(flist[map]); } if (hdrmax < ten) hdrmax = ten; hdrtot += ten; } for (m = 0; m < MTOP; m++) { int64 *anno; anno = (int64 *) MTRACK[m]->anno; ten = ((anno[i+1]-anno[i]) >> 3); if (ten > trkmax[m]) trkmax[m] = ten; trktot[m] += ten; } if (substr) { fst = iter->beg; lst = iter->end; if (DOIQV) { fprintf(stderr,"%s: Cannot select subreads when -i is requested\n",Prog_Name); exit (1); } } else { fst = 0; lst = len; } if (DOSEQ | DOQVS) { int ten = lst-fst; if (ten > seqmax) seqmax = ten; seqtot += ten; } if (DOIQV) { int ten = qv_idx[i+1] - qv_idx[i]; if (ten > iqvmax) iqvmax = ten; iqvtot += ten; } } } printf("+ R %lld\n",noreads); printf("+ M %d\n",MTOP); if (DOHDR) { printf("+ H %lld\n",hdrtot); printf("@ H %lld\n",hdrmax); } for (m = 0; m < MTOP; m++) { printf("+ T%d %lld\n",m,trktot[m]); printf("@ T%d %lld\n",m,trkmax[m]); } if (DOSEQ | DOQVS) { printf("+ S %lld\n",seqtot); printf("@ S %lld\n",seqmax); } if (DOIQV) { printf("+ I %lld\n",iqvtot); printf("@ I %lld\n",iqvmax); } } // Display each read (and/or QV streams) in the active DB according to the // range pairs in pts[0..reps) and according to the display options. 
{ HITS_READ *reads; char *read, **entry; int c, b, e, i, m; int substr; int map; char qvname[5] = { 'd', 'c', 'i', 'm', 's' }; read = New_Read_Buffer(db); if (DOQVS) entry = New_QV_Buffer(db); else entry = NULL; map = 0; reads = db->reads; substr = 0; if (input_pts) iter = init_file_iterator(input); else iter = NULL; c = 0; while (1) { if (input_pts) { if (next_read(iter)) break; e = iter->read; b = e-1; substr = (iter->beg >= 0); } else { if (c >= reps) break; b = pts[c]-1; e = pts[c+1]; if (e > db->nreads) e = db->nreads; c += 2; } for (i = b; i < e; i++) { int len; int fst, lst; int flags, qv; HITS_READ *r; r = reads + i; len = r->rlen; if (DORED) printf("R %d\n",i+1); flags = r->flags; qv = (flags & DB_QV); if (DOHDR) { if (DAM) { char header[MAX_NAME]; fseeko(hdrs,r->coff,SEEK_SET); fgets(header,MAX_NAME,hdrs); header[strlen(header)-1] = '\0'; printf("H %ld %s\n",strlen(header),header); printf("L %d %d %d\n",r->origin,r->fpulse,r->fpulse+len); } else { while (i < findx[map-1]) map -= 1; while (i >= findx[map]) map += 1; printf("H %ld %s\n",strlen(flist[map]),flist[map]); printf("L %d %d %d\n",r->origin,r->fpulse,r->fpulse+len); if (qv > 0) printf("Q: %d\n",qv); } } if (DOQVS) Load_QVentry(db,i,entry,UPPER); if (DOSEQ) Load_Read(db,i,read,UPPER); for (m = 0; m < MTOP; m++) { int64 *anno; int *data; int64 s, f, j; anno = (int64 *) MTRACK[m]->anno; data = (int *) MTRACK[m]->data; s = (anno[i] >> 2); f = (anno[i+1] >> 2); printf("T%d %lld ",m,(f-s)/2); if (s < f) { for (j = s; j < f; j += 2) printf(" %d %d",data[j],data[j+1]); } printf("\n"); } if (substr) { fst = iter->beg; lst = iter->end; } else { fst = 0; lst = len; } if (DOSEQ) { printf("S %d ",lst-fst); printf("%.*s\n",lst-fst,read+fst); } if (DOIQV) { int64 k, e; k = qv_idx[i]; e = qv_idx[i+1]; printf("I %lld ",e-k); while (k < e) putchar(qv_map[qv_val[k++]]); printf("\n"); } if (DOQVS) { int k; for (k = 0; k < 5; k++) { printf("%c %d ",qvname[k],lst-fst); printf("%.*s\n",lst-fst,entry[k]+fst); } } } } } if 
(input_pts) { fclose(input); free(iter); } else free(pts); if (DAM) fclose(hdrs); else { int i; for (i = 0; i < nfiles; i++) free(flist[i]); free(flist); free(findx-1); } Close_DB(db); exit (0); } pbdagcon-0.3+20161121+ds/DAZZ_DB/QV.c0000644000175000017500000010514513026414552014637 0ustar afifafif/******************************************************************************************* * * Compressor/decompressor for .quiv files: customized Huffman codes for each stream based on * the histogram of values occuring in a given file. The two low complexity streams * (deletionQV and substitutionQV) use a Huffman coding of the run length of the prevelant * character. * * Author: Gene Myers * Date: Jan 18, 2014 * Modified: July 25, 2014 * ********************************************************************************************/ #include #include #include #include #include #include "DB.h" #undef DEBUG #define MIN_BUFFER 1000 #define HUFF_CUTOFF 16 // This cannot be larger than 16 ! /******************************************************************************************* * * Endian flipping routines * ********************************************************************************************/ static int LittleEndian; // Little-endian machine ? 
// Referred by: Decode & Decode_Run static int Flip; // Flip endian of all coded shorts and ints // Referred by: Decode & Decode_Run & Read_Scheme static void Set_Endian(int flip) { uint32 x = 3; uint8 *b = (uint8 *) (&x); Flip = flip; LittleEndian = (b[0] == 3); } static void Flip_Long(void *w) { uint8 *v = (uint8 *) w; uint8 x; x = v[0]; v[0] = v[3]; v[3] = x; x = v[1]; v[1] = v[2]; v[2] = x; } static void Flip_Short(void *w) { uint8 *v = (uint8 *) w; uint8 x; x = v[0]; v[0] = v[1]; v[1] = x; } /******************************************************************************************* * * Routines for computing a Huffman Encoding Scheme * ********************************************************************************************/ typedef struct { int type; // 0 => normal, 1 => normal but has long codes, 2 => truncated uint32 codebits[256]; // If type = 2, then code 255 is the special code for int codelens[256]; // non-Huffman exceptions int lookup[0x10000]; // Lookup table (just for decoding) } HScheme; typedef struct _HTree { struct _HTree *lft, *rgt; uint64 count; } HTree; // Establish heap property from node s down (1 is root, siblings of n are 2n and 2n+1) // assuming s is the only perturbation in the tree. static void Reheap(int s, HTree **heap, int hsize) { int c, l, r; HTree *hs, *hr, *hl; c = s; hs = heap[s]; while ((l = 2*c) <= hsize) { r = l+1; hl = heap[l]; hr = heap[r]; if (r > hsize || hr->count > hl->count) { if (hs->count > hl->count) { heap[c] = hl; c = l; } else break; } else { if (hs->count > hr->count) { heap[c] = hr; c = r; } else break; } } if (c != s) heap[c] = hs; } // Given Huffman tree build a table of codes from it, the low-order codelens[s] bits // of codebits[s] contain the code for symbol s. 
static void Build_Table(HTree *node, int code, int len, uint32 *codebits, int *codelens) { if (node->rgt == NULL) { uint64 symbol = (uint64) (node->lft); codebits[symbol] = code; codelens[symbol] = len; } else { code <<= 1; len += 1; Build_Table(node->lft,code,len,codebits,codelens); Build_Table(node->rgt,code+1,len,codebits,codelens); } } // For the non-zero symbols in hist, compute a huffman tree over them, and then // build a table of the codes. If inscheme is not NULL, then place all symbols // with code 255 or with more than HUFF_CUTOFF bits in the encoding by inscheme // as a single united entity, whose code signals that the value of these symbols // occur explicitly in 8 (values) or 16 (run lengths) bits following the code. // All the symbols in this class will have the same entry in the code table and // 255 is always in this class. static HScheme *Huffman(uint64 *hist, HScheme *inscheme) { HScheme *scheme; HTree *heap[259]; HTree node[512]; int hsize; HTree *lft, *rgt; int value, range; int i; scheme = (HScheme *) Malloc(sizeof(HScheme),"Allocating Huffman scheme record"); if (scheme == NULL) return (NULL); hsize = 0; // Load heap value = 0; if (inscheme != NULL) { node[0].count = 0; node[0].lft = (HTree *) (uint64) 255; node[0].rgt = NULL; heap[++hsize] = node+(value++); } for (i = 0; i < 256; i++) if (hist[i] > 0) { if (inscheme != NULL && (inscheme->codelens[i] > HUFF_CUTOFF || i == 255)) node[0].count += hist[i]; else { node[value].count = hist[i]; node[value].lft = (HTree *) (uint64) i; node[value].rgt = NULL; heap[++hsize] = node+(value++); } } for (i = hsize/2; i >= 1; i--) // Establish heap property Reheap(i,heap,hsize); range = value; // Merge pairs with smallest count until have a tree for (i = 1; i < value; i++) { lft = heap[1]; heap[1] = heap[hsize--]; Reheap(1,heap,hsize); rgt = heap[1]; node[range].lft = lft; node[range].rgt = rgt; node[range].count = lft->count + rgt->count; heap[1] = node+(range++); Reheap(1,heap,hsize); } for (i = 0; i < 
256; i++) // Build the code table { scheme->codebits[i] = 0; scheme->codelens[i] = 0; } Build_Table(node+(range-1),0,0,scheme->codebits,scheme->codelens); if (inscheme != NULL) // Set scheme type and if truncated (2), map truncated codes { scheme->type = 2; // to code and length for 255 for (i = 0; i < 255; i++) if (inscheme->codelens[i] > HUFF_CUTOFF || scheme->codelens[i] > HUFF_CUTOFF) { scheme->codelens[i] = scheme->codelens[255]; scheme->codebits[i] = scheme->codebits[255]; } } else { scheme->type = 0; for (i = 0; i < 256; i++) { if (scheme->codelens[i] > HUFF_CUTOFF) scheme->type = 1; } } return (scheme); } #ifdef DEBUG // For debug, show the coding table static void Print_Table(HScheme *scheme, uint64 *hist, int infosize) { uint64 total_bits; uint32 specval, mask, code, *bits; int speclen, clen, *lens; int i, k; total_bits = 0; bits = scheme->codebits; lens = scheme->codelens; if (scheme->type == 2) { specval = bits[255]; speclen = lens[255]; } else specval = speclen = 0x7fffffff; printf("\nCode Table:\n"); for (i = 0; i < 256; i++) if (lens[i] > 0) { clen = lens[i]; mask = (1 << clen); code = bits[i]; printf(" %3d: %2d ",i,clen); for (k = 0; k < clen; k++) { mask >>= 1; if (code & mask) printf("1"); else printf("0"); } if (code == specval && clen == speclen) { printf(" ***"); if (hist != NULL) total_bits += (clen+infosize)*hist[i]; } else if (hist != NULL) total_bits += clen*hist[i]; printf("\n"); } if (hist != NULL) printf("\nTotal Bytes = %lld\n",(total_bits-1)/8+1); } // For debug, show the histogram static void Print_Histogram(uint64 *hist) { int i, low, hgh; uint64 count; for (hgh = 255; hgh >= 0; hgh--) if (hist[hgh] != 0) break; for (low = 0; low < 256; low++) if (hist[low] != 0) break; count = 0; for (i = low; i <= hgh; i++) count += hist[i]; for (i = hgh; i >= low; i--) printf(" %3d: %8llu %5.1f%%\n",i,hist[i],(hist[i]*100.)/count); } #endif /******************************************************************************************* * * Read and 
Write Huffman Schemes * ********************************************************************************************/ // Write the code table to out. static void Write_Scheme(HScheme *scheme, FILE *out) { int i; uint8 x; uint32 *bits; int *lens; lens = scheme->codelens; bits = scheme->codebits; x = (uint8) (scheme->type); fwrite(&x,1,1,out); for (i = 0; i < 256; i++) { x = (uint8) (lens[i]); fwrite(&x,1,1,out); if (x > 0) fwrite(bits+i,sizeof(uint32),1,out); } } // Allocate and read a code table from in, and return a pointer to it. static HScheme *Read_Scheme(FILE *in) { HScheme *scheme; int *look, *lens; uint32 *bits, base; int i, j, powr; uint8 x; scheme = (HScheme *) Malloc(sizeof(HScheme),"Allocating Huffman scheme record"); if (scheme == NULL) return (NULL); lens = scheme->codelens; bits = scheme->codebits; look = scheme->lookup; if (fread(&x,1,1,in) != 1) { EPRINTF(EPLACE,"Could not read scheme type byte (Read_Scheme)\n"); free(scheme); return (NULL); } scheme->type = x; for (i = 0; i < 256; i++) { if (fread(&x,1,1,in) != 1) { EPRINTF(EPLACE,"Could not read length of %d'th code (Read_Scheme)\n",i); return (NULL); } lens[i] = x; if (x > 0) { if (fread(bits+i,sizeof(uint32),1,in) != 1) { EPRINTF(EPLACE,"Could not read bit encoding of %d'th code (Read_Scheme)\n",i); free(scheme); return (NULL); } } else bits[i] = 0; } if (Flip) { for (i = 0; i < 256; i++) Flip_Long(bits+i); } for (i = 0; i < 256; i++) { if (lens[i] > 0) { base = (bits[i] << (16-lens[i])); powr = (1 << (16-lens[i])); for (j = 0; j < powr; j++) look[base+j] = i; } } return (scheme); } /******************************************************************************************* * * Encoders and Decoders * ********************************************************************************************/ // Encode read[0..rlen-1] according to scheme and write to out static void Encode(HScheme *scheme, FILE *out, uint8 *read, int rlen) { uint32 x, c, ocode; int n, k, olen, llen; int *nlens; uint32 *nbits; 
uint32 nspec; int nslen; nlens = scheme->codelens; nbits = scheme->codebits; if (scheme->type == 2) { nspec = nbits[255]; nslen = nlens[255]; } else nspec = nslen = 0x7fffffff; #define OCODE(L,C) \ { int len = olen + (L); \ uint32 code = (C); \ \ llen = olen; \ if (len >= 32) \ { olen = len-32; \ ocode |= (code >> olen); \ fwrite(&ocode,sizeof(uint32),1,out); \ if (olen > 0) \ ocode = (code << (32-olen)); \ else \ ocode = 0; \ } \ else \ { olen = len; \ ocode |= (code << (32-olen));; \ } \ } llen = 0; olen = 0; ocode = 0; for (k = 0; k < rlen; k++) { x = read[k]; n = nlens[x]; c = nbits[x]; OCODE(n,c); if (c == nspec && n == nslen) OCODE(8,x); } if (olen > 0) // Tricky: must pad so decoder does not read past { fwrite(&ocode,sizeof(uint32),1,out); // last integer int the coded output. if (llen > 16 && olen > llen) fwrite(&ocode,sizeof(uint32),1,out); } else if (llen > 16) fwrite(&ocode,sizeof(uint32),1,out); } // Encode read[0..rlen-1] according to non-rchar table neme, and run-length table reme for // runs of rchar characters. Write to out. 
static void Encode_Run(HScheme *neme, HScheme *reme, FILE *out, uint8 *read, int rlen, int rchar) { uint32 x, c, ocode; int n, h, k, olen, llen; int *nlens, *rlens; uint32 *nbits, *rbits; uint32 nspec, rspec; int nslen, rslen; nlens = neme->codelens; nbits = neme->codebits; rlens = reme->codelens; rbits = reme->codebits; if (neme->type == 2) { nspec = nbits[255]; nslen = nlens[255]; } else nspec = nslen = 0x7fffffff; rspec = rbits[255]; rslen = rlens[255]; llen = 0; olen = 0; ocode = 0; k = 0; while (k < rlen) { h = k; while (k < rlen && read[k] == rchar) k += 1; if (k-h >= 255) x = 255; else x = k-h; n = rlens[x]; c = rbits[x]; OCODE(n,c); if (c == rspec && n == rslen) OCODE(16,k-h); if (k < rlen) { x = read[k]; n = nlens[x]; c = nbits[x]; OCODE(n,c); if (c == nspec && n == nslen) OCODE(8,x); k += 1; } } if (olen > 0) { fwrite(&ocode,sizeof(uint32),1,out); if (llen > 16 && olen > llen) fwrite(&ocode,sizeof(uint32),1,out); } else if (llen > 16) fwrite(&ocode,sizeof(uint32),1,out); } // Read and decode from in, the next rlen symbols into read according to scheme static int Decode(HScheme *scheme, FILE *in, char *read, int rlen) { int *look, *lens; int signal, ilen; uint64 icode; uint32 *ipart; uint16 *xpart; uint8 *cpart; int j, n, c; if (LittleEndian) { ipart = ((uint32 *) (&icode)); xpart = ((uint16 *) (&icode)) + 2; cpart = ((uint8 *) (&icode)) + 5; } else { ipart = ((uint32 *) (&icode)) + 1; xpart = ((uint16 *) (&icode)) + 1; cpart = ((uint8 *) (&icode)) + 2; } if (scheme->type == 2) signal = 255; else signal = 256; lens = scheme->codelens; look = scheme->lookup; #define GET \ if (n > ilen) \ { icode <<= ilen; \ if (fread(ipart,sizeof(uint32),1,in) != 1) \ { EPRINTF(EPLACE,"Could not read more bits (Decode)\n"); \ return (1); \ } \ ilen = n-ilen; \ icode <<= ilen; \ ilen = 32-ilen; \ } \ else \ { icode <<= n; \ ilen -= n; \ } #define GETFLIP \ if (n > ilen) \ { icode <<= ilen; \ if (fread(ipart,sizeof(uint32),1,in) != 1) \ { EPRINTF(EPLACE,"Could not read more 
bits (Decode)\n"); \ return (1); \ } \ Flip_Long(ipart); \ ilen = n-ilen; \ icode <<= ilen; \ ilen = 32-ilen; \ } \ else \ { icode <<= n; \ ilen -= n; \ } n = 16; ilen = 0; icode = 0; if (Flip) for (j = 0; j < rlen; j++) { GETFLIP c = look[*xpart]; n = lens[c]; if (c == signal) { GETFLIP c = *cpart; n = 8; } read[j] = (char) c; } else for (j = 0; j < rlen; j++) { GET c = look[*xpart]; n = lens[c]; if (c == signal) { GET c = *cpart; n = 8; } read[j] = (char) c; } return (0); } // Read and decode from in, the next rlen symbols into read according to non-rchar scheme // neme, and the rchar runlength shceme reme static int Decode_Run(HScheme *neme, HScheme *reme, FILE *in, char *read, int rlen, int rchar) { int *nlook, *nlens; int *rlook, *rlens; int nsignal, ilen; uint64 icode; uint32 *ipart; uint16 *xpart; uint8 *cpart; int j, n, c, k; if (LittleEndian) { ipart = ((uint32 *) (&icode)); xpart = ((uint16 *) (&icode)) + 2; cpart = ((uint8 *) (&icode)) + 5; } else { ipart = ((uint32 *) (&icode)) + 1; xpart = ((uint16 *) (&icode)) + 1; cpart = ((uint8 *) (&icode)) + 2; } if (neme->type == 2) nsignal = 255; else nsignal = 256; nlens = neme->codelens; nlook = neme->lookup; rlens = reme->codelens; rlook = reme->lookup; n = 16; ilen = 0; icode = 0; if (Flip) for (j = 0; j < rlen; j++) { GETFLIP c = rlook[*xpart]; n = rlens[c]; if (c == 255) { GETFLIP c = *xpart; n = 16; } for (k = 0; k < c; k++) read[j++] = (char) rchar; if (j < rlen) { GETFLIP c = nlook[*xpart]; n = nlens[c]; if (c == nsignal) { GETFLIP c = *cpart; n = 8; } read[j] = (char) c; } } else for (j = 0; j < rlen; j++) { GET c = rlook[*xpart]; n = rlens[c]; if (c == 255) { GET c = *xpart; n = 16; } for (k = 0; k < c; k++) read[j++] = (char) rchar; if (j < rlen) { GET c = nlook[*xpart]; n = nlens[c]; if (c == nsignal) { GET c = *cpart; n = 8; } read[j] = (char) c; } } return (0); } /******************************************************************************************* * * Histogrammers * 
********************************************************************************************/ // Histogram runlengths of symbol runChar in stream[0..rlen-1] into run. static void Histogram_Seqs(uint64 *hist, uint8 *stream, int rlen) { int k; for (k = 0; k < rlen; k++) hist[stream[k]] += 1; } static void Histogram_Runs(uint64 *run, uint8 *stream, int rlen, int runChar) { int k, h; k = 0; while (k < rlen) { h = k; while (k < rlen && stream[k] == runChar) k += 1; if (k-h >= 256) run[255] += 1; else run[k-h] += 1; if (k < rlen) k += 1; } } /******************************************************************************************* * * Reader * ********************************************************************************************/ static char *Read = NULL; // Referred by: QVentry, Read_Lines, QVcoding_Scan, static int Rmax = -1; // Compress_Next_QVentry static int Nline; // Referred by: QVcoding_Scan char *QVentry() { return (Read); } // If nlines == 1 trying to read a single header, nlines = 5 trying to read 5 QV/fasta lines // for a sequence. Place line j at Read+j*Rmax and the length of every line is returned // unless eof occurs in which case return -1. If any error occurs return -2. 
int Read_Lines(FILE *input, int nlines) { int i, rlen; int tmax; char *tread; char *other; if (Read == NULL) { tmax = MIN_BUFFER; tread = (char *) Malloc(5*tmax,"Allocating QV entry read buffer"); if (tread == NULL) EXIT(-2); Rmax = tmax; Read = tread; } Nline += 1; if (fgets(Read,Rmax,input) == NULL) return (-1); rlen = strlen(Read); while (Read[rlen-1] != '\n') { tmax = ((int) 1.4*Rmax) + MIN_BUFFER; tread = (char *) Realloc(Read,5*tmax,"Reallocating QV entry read buffer"); if (tread == NULL) EXIT(-2); Rmax = tmax; Read = tread; if (fgets(Read+rlen,Rmax-rlen,input) == NULL) { EPRINTF(EPLACE,"Line %d: Last line does not end with a newline !\n",Nline); EXIT(-2); } rlen += strlen(Read+rlen); } other = Read; for (i = 1; i < nlines; i++) { other += Rmax; Nline += 1; if (fgets(other,Rmax,input) == NULL) { EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline); EXIT(-2); } if (rlen != (int) strlen(other)) { EPRINTF(EPLACE,"Line %d: Lines for an entry are not the same length\n",Nline); EXIT(-2); } } return (rlen-1); } /******************************************************************************************* * * Tag compression and decompression routines * ********************************************************************************************/ // Keep only the symbols in tags[0..rlen-1] for which qvs[k] != rchar and // return the # of symbols kept. static int Pack_Tag(char *tags, char *qvs, int rlen, int rchar) { int j, k; j = 0; for (k = 0; k < rlen; k++) if (qvs[k] != rchar) tags[j++] = tags[k]; tags[j] = '\0'; return (j); } // Count the # of non-rchar symbols in qvs[0..rlen-1] static int Packed_Length(char *qvs, int rlen, int rchar) { int k, clen; clen = 0; for (k = 0; k < rlen; k++) if (qvs[k] != rchar) clen += 1; return (clen); } // Unpack tags by moving its i'th char to position k where qvs[k] is the i'th non-rchar // symbol in qvs. All other chars are set to rchar. 
rlen is the length of qvs and // the unpacked result, clen is the initial length of tags. static void Unpack_Tag(char *tags, int clen, char *qvs, int rlen, int rchar) { int j, k; j = clen-1; for (k = rlen-1; k >= 0; k--) { if (qvs[k] == rchar) tags[k] = 'n'; else tags[k] = tags[j--]; } } /******************************************************************************************* * * Statistics Scan and Scheme creation and write * ********************************************************************************************/ // Read .quiva file from input, recording stats in the histograms. If zero is set then // start the stats anew with this file. static uint64 delHist[256], insHist[256], mrgHist[256], subHist[256], delRun[256], subRun[256]; static uint64 totChar; static int delChar, subChar; // Referred by: QVcoding_Scan, Create_QVcoding int QVcoding_Scan(FILE *input) { char *slash; int rlen; // Zero histograms bzero(delHist,sizeof(uint64)*256); bzero(mrgHist,sizeof(uint64)*256); bzero(insHist,sizeof(uint64)*256); bzero(subHist,sizeof(uint64)*256); { int i; for (i = 0; i < 256; i++) delRun[i] = subRun[i] = 1; } totChar = 0; delChar = -1; subChar = -1; // Make a sweep through the .quiva entries, histogramming the relevant things // and figuring out the run chars for the deletion and substition streams Nline = 0; while (1) { int well, beg, end, qv; rlen = Read_Lines(input,1); if (rlen == -2) EXIT(1); if (rlen < 0) break; if (rlen == 0 || Read[0] != '@') { EPRINTF(EPLACE,"Line %d: Header in quiv file is missing\n",Nline); EXIT(1); } slash = index(Read+1,'/'); if (slash == NULL) { EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n", Prog_Name,Nline); EXIT(1); } if (sscanf(slash+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv) != 4) { EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n", Prog_Name,Nline); EXIT(1); } rlen = Read_Lines(input,5); if (rlen < 0) { if (rlen == -1) EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv 
file\n",Nline); EXIT(1); } Histogram_Seqs(delHist,(uint8 *) (Read),rlen); Histogram_Seqs(insHist,(uint8 *) (Read+2*Rmax),rlen); Histogram_Seqs(mrgHist,(uint8 *) (Read+3*Rmax),rlen); Histogram_Seqs(subHist,(uint8 *) (Read+4*Rmax),rlen); if (delChar < 0) { int k; char *del = Read+Rmax; for (k = 0; k < rlen; k++) if (del[k] == 'n' || del[k] == 'N') { delChar = Read[k]; break; } } if (delChar >= 0) Histogram_Runs( delRun,(uint8 *) (Read),rlen,delChar); totChar += rlen; if (subChar < 0) { if (totChar >= 100000) { int k; subChar = 0; for (k = 1; k < 256; k++) if (subHist[k] > subHist[subChar]) subChar = k; } } if (subChar >= 0) Histogram_Runs( subRun,(uint8 *) (Read+4*Rmax),rlen,subChar); } return (0); } // Using the statistics in the global stat tables, create the Huffman schemes and write // them to output. If lossy is set, then create a lossy table for the insertion and merge // QVs. QVcoding *Create_QVcoding(int lossy) { static QVcoding coding; HScheme *delScheme, *insScheme, *mrgScheme, *subScheme; HScheme *dRunScheme, *sRunScheme; delScheme = NULL; dRunScheme = NULL; insScheme = NULL; mrgScheme = NULL; subScheme = NULL; sRunScheme = NULL; // Check whether using a subtitution run char is a win if (totChar < 200000 || subHist[subChar] < .5*totChar) subChar = -1; // If lossy encryption is enabled then scale insertions and merge QVs. 
if (lossy) { int k; for (k = 0; k < 256; k += 2) { insHist[k] += insHist[k+1]; insHist[k+1] = 0; } for (k = 0; k < 256; k += 4) { mrgHist[k] += mrgHist[k+1]; mrgHist[k] += mrgHist[k+2]; mrgHist[k] += mrgHist[k+3]; mrgHist[k+1] = 0; mrgHist[k+2] = 0; mrgHist[k+3] = 0; } } // Build a Huffman scheme for each stream entity from the histograms #define SCHEME_MACRO(meme,hist,label,bits) \ scheme = Huffman( (hist), NULL); \ if (scheme == NULL) \ goto error; \ if (scheme->type) \ { (meme) = Huffman( (hist), scheme); \ free(scheme); \ } \ else \ (meme) = scheme; #ifdef DEBUG #define MAKE_SCHEME(meme,hist,label,bits) \ SCHEME_MACRO(meme,hist,label,bits) \ printf("\n%s\n", (label) ); \ Print_Histogram( (hist)); \ Print_Table( (meme), (hist), (bits)); #else #define MAKE_SCHEME(meme,hist,label,bits) \ SCHEME_MACRO(meme,hist,label,bits) #endif { HScheme *scheme; if (delChar < 0) { MAKE_SCHEME(delScheme,delHist, "Hisotgram of Deletion QVs", 8); dRunScheme = NULL; } else { delHist[delChar] = 0; MAKE_SCHEME(delScheme,delHist, "Hisotgram of Deletion QVs less run char", 8); MAKE_SCHEME(dRunScheme,delRun, "Histogram of Deletion Runs QVs", 16); #ifdef DEBUG printf("\nRun char is '%c'\n",delChar); #endif } #ifdef DEBUG { int k; uint64 count; count = 0; for (k = 0; k < 256; k++) count += delHist[k]; printf("\nDelTag will require %lld bytes\n",count/4); } #endif MAKE_SCHEME(insScheme,insHist, "Hisotgram of Insertion QVs", 8); MAKE_SCHEME(mrgScheme,mrgHist, "Hisotgram of Merge QVs", 8); if (subChar < 0) { MAKE_SCHEME(subScheme,subHist, "Hisotgram of Subsitution QVs", 8); sRunScheme = NULL; } else { subHist[subChar] = 0; MAKE_SCHEME(subScheme,subHist, "Hisotgram of Subsitution QVs less run char", 8); MAKE_SCHEME(sRunScheme,subRun, "Histogram of Substitution Run QVs", 16); #ifdef DEBUG printf("\nRun char is '%c'\n",subChar); #endif } } // Setup endian handling Set_Endian(0); coding.delScheme = delScheme; coding.insScheme = insScheme; coding.mrgScheme = mrgScheme; coding.subScheme = 
subScheme; coding.dRunScheme = dRunScheme; coding.sRunScheme = sRunScheme; coding.delChar = delChar; coding.subChar = subChar; coding.prefix = NULL; coding.flip = 0; return (&coding); error: if (delScheme != NULL) free(delScheme); if (dRunScheme != NULL) free(dRunScheme); if (insScheme != NULL) free(insScheme); if (mrgScheme != NULL) free(mrgScheme); if (subScheme != NULL) free(subScheme); if (sRunScheme != NULL) free(sRunScheme); EXIT(NULL); } // Write the encoding scheme 'coding' to 'output' void Write_QVcoding(FILE *output, QVcoding *coding) { // Write out the endian key, run chars, and prefix (if not NULL) { uint16 half; int len; half = 0x33cc; fwrite(&half,sizeof(uint16),1,output); if (coding->delChar < 0) half = 256; else half = (uint16) (coding->delChar); fwrite(&half,sizeof(uint16),1,output); if (coding->subChar < 0) half = 256; else half = (uint16) (coding->subChar); fwrite(&half,sizeof(uint16),1,output); len = strlen(coding->prefix); fwrite(&len,sizeof(int),1,output); fwrite(coding->prefix,1,len,output); } // Write out the scheme tables Write_Scheme(coding->delScheme,output); if (coding->delChar >= 0) Write_Scheme(coding->dRunScheme,output); Write_Scheme(coding->insScheme,output); Write_Scheme(coding->mrgScheme,output); Write_Scheme(coding->subScheme,output); if (coding->subChar >= 0) Write_Scheme(coding->sRunScheme,output); } // Read the encoding scheme 'coding' to 'output' QVcoding *Read_QVcoding(FILE *input) { static QVcoding coding; // Read endian key, run chars, and short name common to all headers { uint16 half; int len; if (fread(&half,sizeof(uint16),1,input) != 1) { EPRINTF(EPLACE,"Could not read flip byte (Read_QVcoding)\n"); EXIT(NULL); } coding.flip = (half != 0x33cc); if (fread(&half,sizeof(uint16),1,input) != 1) { EPRINTF(EPLACE,"Could not read deletion char (Read_QVcoding)\n"); EXIT(NULL); } if (coding.flip) Flip_Short(&half); coding.delChar = half; if (coding.delChar >= 256) coding.delChar = -1; if (fread(&half,sizeof(uint16),1,input) != 1) 
{ EPRINTF(EPLACE,"Could not read substitution char (Read_QVcoding)\n"); EXIT(NULL); } if (coding.flip) Flip_Short(&half); coding.subChar = half; if (coding.subChar >= 256) coding.subChar = -1; // Read the short name common to all headers if (fread(&len,sizeof(int),1,input) != 1) { EPRINTF(EPLACE,"Could not read header name length (Read_QVcoding)\n"); EXIT(NULL); } if (coding.flip) Flip_Long(&len); coding.prefix = (char *) Malloc(len+1,"Allocating header prefix"); if (coding.prefix == NULL) EXIT(NULL); if (len > 0) { if (fread(coding.prefix,len,1,input) != 1) { EPRINTF(EPLACE,"Could not read header name (Read_QVcoding)\n"); EXIT(NULL); } } coding.prefix[len] = '\0'; } // Setup endian handling Set_Endian(coding.flip); // Read the Huffman schemes used to compress the data coding.delScheme = NULL; coding.dRunScheme = NULL; coding.insScheme = NULL; coding.mrgScheme = NULL; coding.subScheme = NULL; coding.sRunScheme = NULL; coding.delScheme = Read_Scheme(input); if (coding.delScheme == NULL) goto error; if (coding.delChar >= 0) { coding.dRunScheme = Read_Scheme(input); if (coding.dRunScheme == NULL) goto error; } coding.insScheme = Read_Scheme(input); if (coding.insScheme == NULL) goto error; coding.mrgScheme = Read_Scheme(input); if (coding.mrgScheme == NULL) goto error; coding.subScheme = Read_Scheme(input); if (coding.subScheme == NULL) goto error; if (coding.subChar >= 0) { coding.sRunScheme = Read_Scheme(input); if (coding.sRunScheme == NULL) goto error; } return (&coding); error: if (coding.delScheme != NULL) free(coding.delScheme); if (coding.dRunScheme != NULL) free(coding.dRunScheme); if (coding.insScheme != NULL) free(coding.insScheme); if (coding.mrgScheme != NULL) free(coding.mrgScheme); if (coding.subScheme != NULL) free(coding.subScheme); if (coding.sRunScheme != NULL) free(coding.sRunScheme); EXIT(NULL); } // Free all the auxilliary storage associated with the encoding argument void Free_QVcoding(QVcoding *coding) { if (coding->subChar >= 0) 
free(coding->sRunScheme); free(coding->subScheme); free(coding->mrgScheme); free(coding->insScheme); if (coding->delChar >= 0) free(coding->dRunScheme); free(coding->delScheme); free(coding->prefix); } /******************************************************************************************* * * Encode/Decode (w.r.t. coding) next entry from input and write to output * ********************************************************************************************/ int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy) { int rlen, clen; // Get all 5 streams, compress each with its scheme, and output rlen = Read_Lines(input,5); if (rlen < 0) { if (rlen == -1) EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline); EXIT (1); } if (coding->delChar < 0) { Encode(coding->delScheme, output, (uint8 *) Read, rlen); clen = rlen; } else { Encode_Run(coding->delScheme, coding->dRunScheme, output, (uint8 *) Read, rlen, coding->delChar); clen = Pack_Tag(Read+Rmax,Read,rlen,coding->delChar); } Number_Read(Read+Rmax); Compress_Read(clen,Read+Rmax); fwrite(Read+Rmax,1,COMPRESSED_LEN(clen),output); if (lossy) { uint8 *insert = (uint8 *) (Read+2*Rmax); uint8 *merge = (uint8 *) (Read+3*Rmax); int k; for (k = 0; k < rlen; k++) { insert[k] = (uint8) ((insert[k] >> 1) << 1); merge[k] = (uint8) (( merge[k] >> 2) << 2); } } Encode(coding->insScheme, output, (uint8 *) (Read+2*Rmax), rlen); Encode(coding->mrgScheme, output, (uint8 *) (Read+3*Rmax), rlen); if (coding->subChar < 0) Encode(coding->subScheme, output, (uint8 *) (Read+4*Rmax), rlen); else Encode_Run(coding->subScheme, coding->sRunScheme, output, (uint8 *) (Read+4*Rmax), rlen, coding->subChar); return (0); } int Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen) { int clen, tlen; // Decode each stream and write to output if (coding->delChar < 0) { if (Decode(coding->delScheme, input, entry[0], rlen)) EXIT(1); clen = rlen; tlen = COMPRESSED_LEN(clen); if (tlen > 0) 
{ if (fread(entry[1],tlen,1,input) != 1) { EPRINTF(EPLACE,"Could not read deletions entry (Uncompress_Next_QVentry\n"); EXIT(1); } } Uncompress_Read(clen,entry[1]); Lower_Read(entry[1]); } else { if (Decode_Run(coding->delScheme, coding->dRunScheme, input, entry[0], rlen, coding->delChar)) EXIT(1); clen = Packed_Length(entry[0],rlen,coding->delChar); tlen = COMPRESSED_LEN(clen); if (tlen > 0) { if (fread(entry[1],tlen,1,input) != 1) { EPRINTF(EPLACE,"Could not read deletions entry (Uncompress_Next_QVentry\n"); EXIT(1); } } Uncompress_Read(clen,entry[1]); Lower_Read(entry[1]); Unpack_Tag(entry[1],clen,entry[0],rlen,coding->delChar); } if (Decode(coding->insScheme, input, entry[2], rlen)) EXIT(1); if (Decode(coding->mrgScheme, input, entry[3], rlen)) EXIT(1); if (coding->subChar < 0) { if (Decode(coding->subScheme, input, entry[4], rlen)) EXIT(1); } else { if (Decode_Run(coding->subScheme, coding->sRunScheme, input, entry[4], rlen, coding->subChar)) EXIT(1); } return (0); } pbdagcon-0.3+20161121+ds/DAZZ_DB/fasta2DAM.c0000644000175000017500000002700713026414552016013 0ustar afifafif/******************************************************************************************* * * Add .fasta files to a DB: * Adds the given fasta files in the given order to .db. If the db does not exist * then it is created. All .fasta files added to a given data base must have the same * header format and follow Pacbio's convention. A file cannot be added twice and this * is enforced. The command either builds or appends to the ..idx and ..bps * files, where the index file (.idx) contains information about each read and their offsets * in the base-pair file (.bps) that holds the sequences where each base is compessed * into 2-bits. The two files are hidden by virtue of their names beginning with a '.'. 
* .db is effectively a stub file with given name that contains an ASCII listing * of the files added to the DB and possibly the block partitioning for the DB if DBsplit * has been called upon it. * * Author: Gene Myers * Date : May 2013 * Modify: DB upgrade: now *add to* or create a DB depending on whether it exists, read * multiple .fasta files (no longer a stdin pipe). * Date : April 2014 * ********************************************************************************************/ #include #include #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif static char *Usage = "[-v] ( -f | ... )"; static char number[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; typedef struct { int argc; char **argv; FILE *input; int count; char *name; } File_Iterator; File_Iterator *init_file_iterator(int argc, char **argv, FILE *input, int first) { File_Iterator *it; it = Malloc(sizeof(File_Iterator),"Allocating file iterator"); it->argc = argc; it->argv = argv; it->input = input; if (input == NULL) it->count = first; else { it->count = 1; rewind(input); } return (it); } int next_file(File_Iterator *it) { static char nbuffer[MAX_NAME+8]; if (it->input == NULL) { if (it->count >= it->argc) return (0); it->name = it->argv[it->count++]; } else { char *eol; if (fgets(nbuffer,MAX_NAME+8,it->input) == NULL) { if (feof(it->input)) return (0); SYSTEM_ERROR; } if ((eol = index(nbuffer,'\n')) == NULL) { fprintf(stderr,"%s: Line %d in file list is longer than %d chars!\n", Prog_Name,it->count,MAX_NAME+7); it->name = NULL; } *eol = '\0'; it->count += 1; it->name = nbuffer; } 
return (1); } int main(int argc, char *argv[]) { FILE *ostub; char *dbname; char *root, *pwd; FILE *bases, *indx, *hdrs; int64 boff, hoff; int ifiles, ofiles; char **flist; HITS_DB db; int ureads; int VERBOSE; FILE *IFILE; // Process command line { int i, j, k; int flags[128]; ARG_INIT("fasta2DAM") IFILE = NULL; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("v") break; case 'f': IFILE = fopen(argv[i]+2,"r"); if (IFILE == NULL) { fprintf(stderr,"%s: Cannot open file of inputs '%s'\n",Prog_Name,argv[i]+2); exit (1); } break; } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; if ((IFILE == NULL && argc <= 2) || (IFILE != NULL && argc != 2)) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // Try to open DB file, if present then adding to DB, otherwise creating new DB. Set up // variables as follows: // dbname = full name of map index = /.dam // ostub = new image of db file (will overwrite old image at end) // bases = .bps file positioned for appending // indx = .idx file positioned for appending // ureads = # of reads currently in db // boff = offset in .bps at which to place next sequence // hoff = offset in .hdr at which to place next header prefix // ifiles = # of .fasta files to add // ofiles = # of .fasta files added so far // flist = [0..ifiles] list of file names (root only) added to db so far root = Root(argv[1],".dam"); pwd = PathTo(argv[1]); dbname = Strdup(Catenate(pwd,"/",root,".dam"),"Allocating map index name"); if (dbname == NULL) exit (1); if (IFILE == NULL) ifiles = argc-2; else { File_Iterator *ng; ifiles = 0; ng = init_file_iterator(argc,argv,IFILE,2); while (next_file(ng)) ifiles += 1; free(ng); } ofiles = 0; bases = Fopen(Catenate(pwd,PATHSEP,root,".bps"),"w"); indx = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"w"); hdrs = Fopen(Catenate(pwd,PATHSEP,root,".hdr"),"w"); if (bases == NULL || indx == NULL || hdrs == NULL) exit (1); flist = (char **) Malloc(sizeof(char 
*)*ifiles,"Allocating file list"); fwrite(&db,sizeof(HITS_DB),1,indx); ureads = 0; boff = 0; hoff = 0; ostub = Fopen(dbname,"w+"); if (ostub == NULL) exit (1); fprintf(ostub,DB_NFILE,argc-2); { int maxlen; int64 totlen, count[4]; int rmax; HITS_READ prec; char *read; int c; File_Iterator *ng; // Buffer for accumulating .fasta sequence over multiple lines rmax = MAX_NAME + 60000; read = (char *) Malloc(rmax+1,"Allocating line buffer"); if (read == NULL) goto error; totlen = 0; // total # of bases in new .fasta files maxlen = 0; // longest read in new .fasta files for (c = 0; c < 4; c++) // count of acgt in new .fasta files count[c] = 0; // For each .fasta file do: ng = init_file_iterator(argc,argv,IFILE,2); while (next_file(ng)) { FILE *input; char *path, *core; int nline, eof, rlen; if (ng->name == NULL) goto error; // Open it: /.fasta, check that core is not too long, // and checking that it is not already in flist. path = PathTo(ng->name); core = Root(ng->name,".fasta"); if ((input = Fopen(Catenate(path,"/",core,".fasta"),"r")) == NULL) goto error; free(path); { int j; for (j = 0; j < ofiles; j++) if (strcmp(core,flist[j]) == 0) { fprintf(stderr,"%s: File %s.fasta is already in database %s.db\n", Prog_Name,core,Root(argv[1],".db")); goto error; } } // Get the header of the first line. If the file is empty skip. rlen = 0; nline = 1; eof = (fgets(read,MAX_NAME,input) == NULL); if (eof || strlen(read) < 1) { fprintf(stderr,"Skipping '%s', file is empty!\n",core); fclose(input); free(core); continue; } // Add the file name to flist if (VERBOSE) { fprintf(stderr,"Adding '%s' ...\n",core); fflush(stderr); } flist[ofiles++] = core; // Check that the first line has PACBIO format, and record prolog in 'prolog'. 
if (read[strlen(read)-1] != '\n') { fprintf(stderr,"File %s.fasta, Line 1: Fasta line is too long (> %d chars)\n", core,MAX_NAME-2); goto error; } if (!eof && read[0] != '>') { fprintf(stderr,"File %s.fasta, Line 1: First header in fasta file is missing\n",core); goto error; } // Read in all the sequences until end-of-file { int i, x, n; while (!eof) { int hlen; read[rlen] = '>'; hlen = strlen(read+rlen); fwrite(read+rlen,1,hlen,hdrs); rlen = 0; while (1) { eof = (fgets(read+rlen,MAX_NAME,input) == NULL); nline += 1; x = strlen(read+rlen)-1; if (read[rlen+x] != '\n') { fprintf(stderr,"File %s.fasta, Line %d:",core,nline); fprintf(stderr," Fasta line is too long (> %d chars)\n",MAX_NAME-2); goto error; } if (eof || read[rlen] == '>') break; rlen += x; if (rlen + MAX_NAME > rmax) { rmax = ((int) (1.2 * rmax)) + 1000 + MAX_NAME; read = (char *) realloc(read,rmax+1); if (read == NULL) { fprintf(stderr,"File %s.fasta, Line %d:",core,nline); fprintf(stderr," Out of memory (Allocating line buffer)\n"); goto error; } } } read[rlen] = '\0'; n = 0; i = -1; while (i < rlen) { int pbeg, plen, clen; while (i < rlen) if (number[(int) read[++i]] < 4) break; if (i >= rlen) break; pbeg = i; prec.fpulse = pbeg; prec.origin = n++; prec.boff = boff; prec.coff = hoff; prec.flags = DB_BEST; while (i < rlen) { x = number[(int) read[i]]; if (x >= 4) break; count[x] += 1; read[i++] = (char) x; } prec.rlen = plen = i-pbeg; ureads += 1; totlen += plen; if (plen > maxlen) maxlen = plen; Compress_Read(plen,read+pbeg); clen = COMPRESSED_LEN(plen); fwrite(read+pbeg,1,clen,bases); boff += clen; fwrite(&prec,sizeof(HITS_READ),1,indx); } hoff += hlen; } fprintf(ostub,DB_FDATA,ureads,core,core); fclose(input); } } // Update relevant fields in db record db.ureads = ureads; db.treads = ureads; for (c = 0; c < 4; c++) db.freq[c] = (float) ((1.*count[c])/totlen); db.totlen = totlen; db.maxlen = maxlen; db.cutoff = -1; } rewind(indx); fwrite(&db,sizeof(HITS_DB),1,indx); // Write the finalized db record 
into .idx fclose(ostub); fclose(indx); fclose(bases); fclose(hdrs); exit (0); // Error exit: Remove the .idx, .bps, and .dam files error: fclose(ostub); fclose(indx); fclose(hdrs); fclose(bases); unlink(Catenate(pwd,PATHSEP,root,".idx")); unlink(Catenate(pwd,PATHSEP,root,".bps")); unlink(Catenate(pwd,PATHSEP,root,".hdr")); unlink(Catenate(pwd,"/",root,".dam")); exit (1); } pbdagcon-0.3+20161121+ds/DAZZ_DB/DBshow.c0000644000175000017500000003577213026414552015507 0ustar afifafif/******************************************************************************************* * * Display a specified set of reads of a database in fasta format. * * Author: Gene Myers * Date : September 2013 * Mod : With DB overhaul, made this a routine strictly for printing a selected subset * and created DB2fasta for recreating all the fasta files of a DB * Date : April 2014 * Mod : Added options to display QV streams * Date : July 2014 * ********************************************************************************************/ #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif static char *Usage[] = { "[-unqUQ] [-w] [-m]+", " [ | ... 
]" }; #define LAST_READ_SYMBOL '$' #define MAX_BUFFER 10001 typedef struct { FILE *input; int lineno; int read; int beg; int end; } File_Iterator; File_Iterator *init_file_iterator(FILE *input) { File_Iterator *it; it = Malloc(sizeof(File_Iterator),"Allocating file iterator"); it->input = input; it->lineno = 1; rewind(input); return (it); } int next_read(File_Iterator *it) { static char nbuffer[MAX_BUFFER]; char *eol; int x; if (fgets(nbuffer,MAX_BUFFER,it->input) == NULL) { if (feof(it->input)) return (1); SYSTEM_ERROR; } if ((eol = index(nbuffer,'\n')) == NULL) { fprintf(stderr,"%s: Line %d in read list is longer than %d chars!\n", Prog_Name,it->lineno,MAX_BUFFER-1); return (1); } *eol = '\0'; x = sscanf(nbuffer," %d %d %d",&(it->read),&(it->beg),&(it->end)); if (x == 1) it->beg = -1; else if (x != 3) { fprintf(stderr,"%s: Line %d of read list is improperly formatted\n",Prog_Name,it->lineno); return (1); } it->lineno += 1; return (0); } int main(int argc, char *argv[]) { HITS_DB _db, *db = &_db; FILE *hdrs = NULL; int nfiles; char **flist = NULL; int *findx = NULL; int reps, *pts; int input_pts; File_Iterator *iter = NULL; FILE *input; int TRIM, UPPER; int DOSEQ, DOQVS, QUIVA, DAM; int WIDTH; int MMAX, MTOP; char **MASK; // Process arguments { int i, j, k; int flags[128]; char *eptr; ARG_INIT("DBshow") WIDTH = 80; MTOP = 0; MMAX = 10; MASK = (char **) Malloc(MMAX*sizeof(char *),"Allocating mask track array"); if (MASK == NULL) exit (1); j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("unqUQ") break; case 'w': ARG_NON_NEGATIVE(WIDTH,"Line width") break; case 'm': if (MTOP >= MMAX) { MMAX = 1.2*MTOP + 10; MASK = (char **) Realloc(MASK,MMAX*sizeof(char *),"Reallocating mask track array"); if (MASK == NULL) exit (1); } MASK[MTOP++] = argv[i]+2; break; } else argv[j++] = argv[i]; argc = j; DAM = 0; TRIM = 1-flags['u']; UPPER = 1+flags['U']; DOQVS = flags['q']; DOSEQ = 1-flags['n']; QUIVA = flags['Q']; if (QUIVA && 
(!DOSEQ || MTOP > 0)) { fprintf(stderr,"%s: -Q (quiva) format request inconsistent with -n and -m options\n", Prog_Name); exit (1); } if (QUIVA) DOQVS = 1; if (argc <= 1) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]); exit (1); } } // Open DB or DAM, and if a DAM open also .hdr file { char *pwd, *root; int status; status = Open_DB(argv[1],db); if (status < 0) exit (1); if (status == 1) { root = Root(argv[1],".dam"); pwd = PathTo(argv[1]); hdrs = Fopen(Catenate(pwd,PATHSEP,root,".hdr"),"r"); if (hdrs == NULL) exit (1); DAM = 1; if (QUIVA || DOQVS) { fprintf(stderr,"%s: -Q and -q options not compatible with a .dam DB\n",Prog_Name); exit (1); } free(root); free(pwd); } } // Load QVs if requested if (DOQVS) { if (Load_QVs(db) < 0) { fprintf(stderr,"%s: QVs requested, but no .qvs for data base\n",Prog_Name); exit (1); } } // Check tracks and load tracks for untrimmed DB { int i, status, kind; for (i = 0; i < MTOP; i++) { status = Check_Track(db,MASK[i],&kind); if (status == -2) printf("%s: Warning: -m%s option given but no track found.\n",Prog_Name,MASK[i]); else if (status == -1) printf("%s: Warning: %s track not sync'd with db.\n",Prog_Name,MASK[i]); else if (kind != MASK_TRACK) printf("%s: Warning: %s track is not a mask track.\n",Prog_Name,MASK[i]); else if (status == 0) Load_Track(db,MASK[i]); else if (status == 1 && !TRIM) printf("%s: Warning: %s track is for a trimmed db but -u is set.\n",Prog_Name,MASK[i]); } } // If not a DAM then get prolog names and index ranges from the .db file if (!DAM) { char *pwd, *root; FILE *dstub; int i; root = Root(argv[1],".db"); pwd = PathTo(argv[1]); if (db->part > 0) *rindex(root,'.') = '\0'; dstub = Fopen(Catenate(pwd,"/",root,".db"),"r"); if (dstub == NULL) exit (1); free(pwd); free(root); if (fscanf(dstub,DB_NFILE,&nfiles) != 1) SYSTEM_ERROR flist = (char **) Malloc(sizeof(char *)*nfiles,"Allocating file list"); findx = (int *) Malloc(sizeof(int 
*)*(nfiles+1),"Allocating file index"); if (flist == NULL || findx == NULL) exit (1); findx += 1; findx[-1] = 0; for (i = 0; i < nfiles; i++) { char prolog[MAX_NAME], fname[MAX_NAME]; if (fscanf(dstub,DB_FDATA,findx+i,fname,prolog) != 3) SYSTEM_ERROR if ((flist[i] = Strdup(prolog,"Adding to file list")) == NULL) exit (1); } fclose(dstub); // If TRIM (the default) then "trim" prolog ranges and the DB if (TRIM) { int nid, oid, lid; int cutoff, allflag; HITS_READ *reads; reads = db->reads - db->ufirst; cutoff = db->cutoff; if (db->all) allflag = 0; else allflag = DB_BEST; nid = 0; oid = db->ufirst; lid = oid + db->nreads; for (i = 0; i < nfiles; i++) { while (oid < findx[i] && oid < lid) { if ((reads[oid].flags & DB_BEST) >= allflag && reads[oid].rlen >= cutoff) nid++; oid += 1; } findx[i] = nid; } } else if (db->part > 0) { for (i = 0; i < nfiles; i++) findx[i] -= db->ufirst; } } if (TRIM) { int i, status, kind; Trim_DB(db); // Load tracks for trimmed DB for (i = 0; i < MTOP; i++) { status = Check_Track(db,MASK[i],&kind); if (status < 0) continue; else if (status == 1 && kind == MASK_TRACK) Load_Track(db,MASK[i]); } } // Process read index arguments into a list of read ranges input_pts = 0; if (argc == 3) { if (argv[2][0] != LAST_READ_SYMBOL || argv[2][1] != '\0') { char *eptr, *fptr; int b, e; b = strtol(argv[2],&eptr,10); if (eptr > argv[2] && b > 0) { if (*eptr == '-') { if (eptr[1] != LAST_READ_SYMBOL || eptr[2] != '\0') { e = strtol(eptr+1,&fptr,10); input_pts = (fptr <= eptr+1 || *fptr != '\0' || e <= 0); } } else input_pts = (*eptr != '\0'); } else input_pts = 1; } } if (input_pts) { input = Fopen(argv[2],"r"); if (input == NULL) exit (1); iter = init_file_iterator(input); } else { pts = (int *) Malloc(sizeof(int)*2*(argc-1),"Allocating read parameters"); if (pts == NULL) exit (1); reps = 0; if (argc > 2) { int c, b, e; char *eptr, *fptr; for (c = 2; c < argc; c++) { if (argv[c][0] == LAST_READ_SYMBOL) { b = db->nreads; eptr = argv[c]+1; } else b = 
strtol(argv[c],&eptr,10); if (eptr > argv[c]) { if (b <= 0) { fprintf(stderr,"%s: %d is not a valid index\n",Prog_Name,b); exit (1); } if (*eptr == 0) { pts[reps++] = b; pts[reps++] = b; continue; } else if (*eptr == '-') { if (eptr[1] == LAST_READ_SYMBOL) { e = db->nreads; fptr = eptr+2; } else e = strtol(eptr+1,&fptr,10); if (fptr > eptr+1 && *fptr == 0 && e > 0) { pts[reps++] = b; pts[reps++] = e; if (b > e) { fprintf(stderr,"%s: Empty range '%s'\n",Prog_Name,argv[c]); exit (1); } continue; } } } fprintf(stderr,"%s: argument '%s' is not an integer range\n",Prog_Name,argv[c]); exit (1); } } else { pts[reps++] = 1; pts[reps++] = db->nreads; } } // Display each read (and/or QV streams) in the active DB according to the // range pairs in pts[0..reps) and according to the display options. { HITS_READ *reads; HITS_TRACK *first; char *read, **entry; int c, b, e, i; int hilight, substr; int map; int (*iscase)(int); read = New_Read_Buffer(db); if (DOQVS) { entry = New_QV_Buffer(db); first = db->tracks->next; } else { entry = NULL; first = db->tracks; } if (UPPER == 1) { hilight = 'A'-'a'; iscase = islower; } else { hilight = 'a'-'A'; iscase = isupper; } map = 0; reads = db->reads; substr = 0; c = 0; while (1) { if (input_pts) { if (next_read(iter)) break; e = iter->read; b = e-1; substr = (iter->beg >= 0); } else { if (c >= reps) break; b = pts[c]-1; e = pts[c+1]; if (e > db->nreads) e = db->nreads; c += 2; } for (i = b; i < e; i++) { int len; int fst, lst; int flags, qv; HITS_READ *r; HITS_TRACK *track; r = reads + i; len = r->rlen; flags = r->flags; qv = (flags & DB_QV); if (DAM) { char header[MAX_NAME]; fseeko(hdrs,r->coff,SEEK_SET); fgets(header,MAX_NAME,hdrs); header[strlen(header)-1] = '\0'; printf("%s :: Contig %d[%d,%d]",header,r->origin,r->fpulse,r->fpulse+len); } else { while (i < findx[map-1]) map -= 1; while (i >= findx[map]) map += 1; if (QUIVA) printf("@%s/%d/%d_%d",flist[map],r->origin,r->fpulse,r->fpulse+len); else 
printf(">%s/%d/%d_%d",flist[map],r->origin,r->fpulse,r->fpulse+len); if (qv > 0) printf(" RQ=0.%3d",qv); } printf("\n"); if (DOQVS) Load_QVentry(db,i,entry,UPPER); if (DOSEQ) Load_Read(db,i,read,UPPER); for (track = first; track != NULL; track = track->next) { int64 *anno; int *data; int64 s, f, j; int bd, ed, m; anno = (int64 *) track->anno; data = (int *) track->data; s = (anno[i] >> 2); f = (anno[i+1] >> 2); if (s < f) { for (j = s; j < f; j += 2) { bd = data[j]; ed = data[j+1]; if (DOSEQ) for (m = bd; m < ed; m++) if (iscase(read[m])) read[m] = (char) (read[m] + hilight); if (j == s) printf("> %s:",track->name); printf(" [%d,%d]",bd,ed); } printf("\n"); } } if (substr) { fst = iter->beg; lst = iter->end; } else { fst = 0; lst = len; } if (QUIVA) { int k; for (k = 0; k < 5; k++) printf("%.*s\n",lst-fst,entry[k]+fst); } else { if (DOQVS) { int j, k; printf("\n"); for (j = fst; j+WIDTH < lst; j += WIDTH) { if (DOSEQ) printf("%.*s\n",WIDTH,read+j); for (k = 0; k < 5; k++) printf("%.*s\n",WIDTH,entry[k]+j); printf("\n"); } if (j < lst) { if (DOSEQ) printf("%.*s\n",lst-j,read+j); for (k = 0; k < 5; k++) printf("%.*s\n",lst-j,entry[k]+j); printf("\n"); } } else if (DOSEQ) { int j; for (j = fst; j+WIDTH < lst; j += WIDTH) printf("%.*s\n",WIDTH,read+j); if (j < lst) printf("%.*s\n",lst-j,read+j); } } } } } if (input_pts) { fclose(input); free(iter); } else free(pts); if (DAM) fclose(hdrs); else { int i; for (i = 0; i < nfiles; i++) free(flist[i]); free(flist); free(findx-1); } Close_DB(db); exit (0); } pbdagcon-0.3+20161121+ds/DAZZ_DB/README0000644000175000017500000006410213026414552015022 0ustar afifafif *** PLEASE GO TO THE DAZZLER BLOG (https://dazzlerblog.wordpress.com) FOR TYPESET *** DOCUMENTATION, EXAMPLES OF USE, AND DESIGN PHILOSOPHY. /************************************************************************************\ UPGRADE & DEVELOPER NOTES ! ! ! 
If you have already built a big database and don't want to rebuild it, but do want to use a more recent version of the software that entails a change to the data structures (currently the updates on Sept 25, 2014 and December 31, 2014), please note the routines DBupgrade.Sep.25.2014 and DBupgrade.Dec.31.2014. These take a DB, say X, as an argument, and produce a file .X.ndx which you should then replace .X.idx with. To update a very old DB to today's version you will need to run both in sequence. Both of the upgrade programs can be made with "make" but are not by default created when make is called without an argument. For those interested in the details, on September 25, the "beg" and "end" fields went from shorts to ints, and on December 31, the "beg" and "end" fields became "fpulse" and "rlen", respectively where fpulse = beg and rlen = end-beg. Unfortunately, the .dust track formats also changed on Dec.31.2014 and Jan.1.2015. To upgrade said use DUSTupgrade.Jan.1.2015. This program takes a DB, say X as an argument and produces .X.next.anno and .X.next.data which you should then replace .X.dust.* with. Of course, it may, if the DB is not too big, be easier and simpler to just rerun DBdust. Developers should also note carefully that the calling conventions to Open_DB have changed and there are new utility routines Number_Digits and Check_Track. \************************************************************************************/ The Dazzler Database Library Author: Gene Myers First: July 17, 2013 Current: December 31, 2014 To facilitate the multiple phases of the dazzler assembler, we organize all the read data into what is effectively a "database" of the reads and their meta-information. The design goals for this data base are as follows: (1) The database stores the source Pacbio read information in such a way that it can recreate the original input data, thus permitting a user to remove the (effectively redundant) source files. 
This avoids duplicating the same data, once in the source file and once in the database. (2) The data base can be built up incrementally, that is, new sequence data can be added to the data base over time. (3) The data base flexibly allows one to store any meta-data desired for reads. This is accomplished with the concept of *tracks* that implementors can add as they need them. (4) The data is held in a compressed form equivalent to the .dexta and .dexqv files of the data extraction module. Both the .fasta and .quiva information for each read is held in the data base and can be recreated from it. The .quiva information can be added separately and later on if desired. (5) To facilitate job parallel, cluster operation of the phases of our assembler, the data base has a concept of a *current partitioning* in which all the reads that are over a given length and optionally unique to a well, are divided up into *blocks* containing roughly a given number of bases, except possibly the last block which may have a short count. Often programs can be run on blocks or pairs of blocks and each such job is reasonably well balanced as the blocks are all the same size. One must be careful about changing the partition during an assembly as doing so can void the structural validity of any interim block-based results. A Dazzler DB consists of one named, *visible* file, e.g. FOO.db, and several *invisible* secondary files encoding various elements of the DB. The secondary files are "invisible" to the UNIX OS in the sense that they begin with a "." and hence are not listed by "ls" unless one specifies the -a flag. We chose to do this so that when a user lists the contents of a directory they just see a single name, e.g. FOO.db, that is the one used to refer to the DB in commands.
The files associated with a database named, say FOO, are as follows: (a) "FOO.db": a text file containing (i) the list of input files added to the database so far, and (ii) how to partition the database into blocks (if the partition parameters have been set). (b) ".FOO.idx": a binary "index" of all the meta-data about each read allowing, for example, one to randomly access a read's sequence (in the store ".FOO.bps"). It is 28N + 88 bytes in size where N is the number of reads in the database. (c) ".FOO.bps": a binary compressed "store" of all the DNA sequences. It is M/4 bytes in size where M is the total number of base pairs in the database. (d) ".FOO.qvs": a binary compressed "store" of the 5 Pacbio quality value streams for the reads. Its size is roughly 5/3M bytes depending on the compression achieved. This file only exists if .quiva files have been added to the database. (e) ".FOO.<track>.anno", ".FOO.<track>.data": a *track* containing customized meta-data for each read. For example, the DBdust command annotates low complexity intervals of reads and records the intervals for each read in two files .FOO.dust.anno & .FOO.dust.data. Any kind of information about a read can be recorded, such as micro-sats, repeat intervals, corrected sequence, etc. Specific tracks will be described as modules that produce them are released. If one does not like the convention of the secondary files being invisible, then un-defining the constant HIDE_FILES in DB.h before compiling the library, creates commands that do not place a prefixing "." before secondary file names, e.g. FOO.idx instead of .FOO.idx. One then sees all the files realizing a DB when listing the contents of a directory with ls. While a Dazzler DB holds a collection of Pacbio reads, a Dazzler map DB or DAM holds a collection of contigs from a reference genome assembly.
This special type of DB has been introduced in order to facilitate the mapping of reads to an assembly and has been given the suffix .dam to distinguish it from an ordinary DB. It is structurally identical to a .db except: (a) there is no concept of quality values, and hence no .FOO.qvs file. (b) every .fasta scaffold (a sequence with runs of N's between contigs estimating the length of the gap) is broken into a separate contig sequence in the DB and the header for each scaffold is retained in a new .FOO.hdr file. (c) the original and first and last pulse fields in the meta-data records held in .FOO.idx, hold instead the contig number and the interval of the contig within its original scaffold sequence. A map DB can equally well be the argument of many of the commands below that operate on normal DBs. In general, a .dam can be an argument anywhere a .db can, with the exception of routines or optioned calls to routines that involve quality values, or the special routines fasta2DAM and DAM2fasta that create a DAM and reverse said, just like the pair fasta2DB and DB2fasta do for a normal DB. So in general when we refer to a database we are referring to either a DB or a DAM. The command DBsplit sets or resets the current partition for a database which is determined by 3 parameters: (i) the total number of basepairs to place in each block, (ii) the minimum read length of reads to include within a block, and (iii) whether or not to only include the longest read from a given well or all reads from a well (NB: several reads of the same insert in a given well can be produced by the Pacbio instrument). Note that the length and uniqueness parameters effectively select a subset of the reads that contribute to the size of a block. We call this subset the *trimmed* data base. Some commands operate on the entire database, others on the trimmed database, and yet others have an option flag that permits them to operate on either at the users discretion. 
Therefore, one should note carefully to which version of the database a command refers to. This is especially important for any command that identifies reads by their index (ordinal position) in the database. Once the database has been split into blocks, the commands DBshow, DBstats, and DBdust below and commands yet to come, such as the local alignment finder dalign, can take a block or blocks as arguments. On the command line this is indicated by supplying the name of the DB followed by a period and then a block number, e.g. FOO.3.db or simply FOO.3, refers to the 3'rd block of DB FOO (assuming of course it has a current partition and said partition has a 3rd block). One should note carefully that a block is a contiguous range of reads such that once it is trimmed has a given size in base pairs (as set by DBsplit). Thus like an entire database, a block can be either untrimmed or trimmed and one needs to again be careful when giving a read index to a command such as DBshow. All programs add suffixes (e.g. .db) as needed. The commands of the database library are currently as follows: 1. fasta2DB [-v] ( -f | ... ) Builds an initial data base, or adds to an existing database, the list of .fasta files following the database name argument, or if the -f option is used, the list of .fasta files in . A given .fasta file can only be added once to the DB (this is checked by the command). The .fasta headers must be in the "Pacbio" format (i.e. the output of the Pacbio tools or our dextract program) and the well, pulse interval, and read quality are extracted from the header and kept with each read record. If the files are being added to an existing database, and the partition settings of the DB have already been set (see DBsplit below), then the partitioning of the database is updated to include the new data. 2. DB2fasta [-vU] [-w] The set of .fasta files for the given DB are recreated from the DB exactly as they were input. 
That is, this is a perfect inversion, including the reconstitution of the proper .fasta headers. Because of this property, one can, if desired, delete the .fasta source files once they are in the DB as they can always be recreated from it. By default the output sequences are in lower case and 80 chars per line. The -U option specifies upper case should be used, and the characters per line, or line width, can be set to any positive value with the -w option. 3. quiva2DB [-vl] ( -f | ... ) Adds the given .quiva files on the command line or in the file specified by the -f option to an existing DB "path". The input files must be added in the same order as the .fasta files were and have the same root names, e.g. FOO.fasta and FOO.quiva. The files can be added incrementally but must be added in the same order as the .fasta files. This is enforced by the program. With the -l option set the compression scheme is a bit lossy to get more compression (see the description of dexqv in the DEXTRACTOR module). 4. DB2quiva [-vU] The set of .quiva files within the given DB are recreated from the DB exactly as they were input. That is, this is a perfect inversion, including the reconstitution of the proper .quiva headers. Because of this property, one can, if desired, delete the .quiva source files once they are in the DB as they can always be recreated from it. By .fastq convention each QV vector is output as a line without new-lines, and by default the Deletion Tag entry is in lower case letters. The -U option specifies upper case letters should be used instead. 5. fasta2DAM [-v] ( -f | ... ) Builds a map DB or DAM from the list of .fasta files following the map database name argument, or if the -f option is used, the list of .fasta files in . Any .fasta entry that has a run of N's in it will be split into separate "contig" entries and the interval of the contig in the original entry recorded. The header for each .fasta entry is saved with the contigs created from it. 6. 
DAM2fasta [-vU] [-w] The set of .fasta files for the given map DB or DAM are recreated from the DAM exactly as they were input. That is, this is a perfect inversion, including the reconstitution of the proper .fasta headers and the concatenation of contigs with the proper number of N's between them. By default the output sequences are in lower case and 80 chars per line. The -U option specifies upper case should be used, and the characters per line, or line width, can be set to any positive value with the -w option. 7. DBsplit [-a] [-x] [-s] Divide the database .db or .dam conceptually into a series of blocks referable to on the command line as .1, .2, ... If the -x option is set then all reads less than the given length are ignored, and if the -a option is not set then secondary reads from a given well are also ignored. The remaining reads, constituting what we call the trimmed DB, are split amongst the blocks so that each block is of size -s * 1Mbp except for the last which necessarily contains a smaller residual. The default value for -s is 200Mbp because blocks of this size can be compared by our "overlapper" dalign in roughly 16Gb of memory. The blocks are very space efficient in that their sub-index of the master .idx is computed on the fly when loaded, and the .bps and .qvs files (if a .db) of base pairs and quality values, respectively, is shared with the master DB. Any relevant portions of tracks associated with the DB are also computed on the fly when loading a database block. 8. DBdust [-b] [-w] [-t] [-m] Runs the symmetric DUST algorithm over the reads in the untrimmed DB .db or .dam producing a track ..dust[.anno,.data] that marks all intervals of low complexity sequence, where the scan window is of size -w, the threshold for being a low-complexity interval is -t, and only perfect intervals of size greater than -m are recorded. If the -b option is set then the definition of low complexity takes into account the frequency of a given base. 
The command is incremental: if given a DB to which new data has been added since it was last run on the DB, then it will extend the track to include the new reads. It is important to set the -b flag for genomes with a strong AT/GC bias, albeit the code is a tad slower. The dust track, if present, is understood and used by DBshow, DBstats, and dalign. DBdust can also be run over an untrimmed DB block in which case it outputs a track encoding where the trace file names contain the block number, e.g. .FOO.3.dust.anno and .FOO.3.dust.data, given FOO.3 on the command line. We call this a *block track*. This permits job parallelism in block-sized chunks, and the resulting sequence of block tracks can then be merged into a track for the entire untrimmed DB with Catrack. 9. Catrack [-v] Find all block tracks of the form ..#.... and merge them into a single track, ....., for the given DB or DAM. The block track files must all encode the same kind of track data (this is checked), and the files must exist for block 1, 2, 3, ... up to the last block number. 10. DBshow [-unqUQ] [-w] [-m]+ [ | ... ] Displays the requested reads in the database .db or .dam. By default the command applies to the trimmed database, but if -u is set then the entire DB is used. If no read arguments are given then every read in the database or database block is displayed. Otherwise the input file or the list of supplied integer ranges give the ordinal positions in the actively loaded portion of the db. In the case of a file, it should simply contain a read index, one per line. In the other case, a read range is either a lone integer or the symbol $, in which case the read range consists of just that read (the last read in the database if $). One may also give two positive integers separated by a dash to indicate a range of integers, where again a $ represents the index of the last read in the actively loaded db. For example, 1 3-5 $ displays reads 1, 3, 4, 5, and the last read in the active db.
As another example, 1-$ displays every read in the active db (the default). By default a .fasta file of the read sequences is displayed. If the -q option is set, then the QV streams are also displayed in a non-standard modification of the fasta format. If the -n option is set then the DNA sequence is *not* displayed. If the -Q option is set then a .quiva file is displayed and in this case the -n and -m options may not be set (and the -q and -w options have no effect). If one or more masks are set with the -m option then the track intervals are also displayed in an additional header line and the bases within an interval are displayed in the case opposite that used for all the other bases. By default the output sequences are in lower case and 80 chars per line. The -U option specifies upper case should be used, and the characters per line, or line width, can be set to any positive value with the -w option. The .fasta or .quiva files that are output can be converted into a DB by fasta2DB and quiva2DB (if the -q and -n options are not set and no -m options are set), giving one a simple way to make a DB of a subset of the reads for testing purposes. 11. DBdump [-rhsiq] [-uU] [-m]+ [ | ... ] Like DBshow, DBdump allows one to display a subset of the reads in the DB and select which information to show about them including any mask tracks. The difference is that the information is written in a very simple "1-code" ASCII format that makes it easy for one to read and parse the information for further use. -r requests that each read number be displayed (useful if only a subset of reads is requested). -h prints the header information which is the source file name, well #, and pulse range. -s requests the sequence be output, -i requests that the intrinsic quality values be output, -q requests that the 5 quiva sequences be output, and -m requests that mask be output.
Set -u if you want data from the untrimmed database (the default is trimmed) and set -U if you'd like upper-case letters used in the DNA sequence strings. The format is very simple. Each requested piece of information occurs on a line. The first character of every line is a "1-code" character that tells you what information to expect on the line. The rest of the line contains information where each item is separated by a single blank space. Strings are output as first an integer giving the length of the string, a blank space, and then the string terminated by a new-line. Intrinsic quality values are between 0 and 50, inclusive, and a vector of said is displayed as an alphameric string where 'a' is 0, 'b' is 1, ... 'z' is 25, 'A' is 26, 'B' is 27, ... and 'Y' is 50. R # - read number H # string - original file name string (header) L # # # - location: well, pulse start, pulse end Tx #n (#b #e)^#n - x'th track on command line, #n intervals all on same line S # string - sequence string I # string - intrinsic quality vector (as an ASCII string) d # string - Quiva deletion values (as an ASCII string) c # string - Quiva deletion character string i # string - Quiva insertion value string m # string - Quiva merge value string s # string - Quiva substitution value string + X # - Total amount of X (X = R or H or S or I or M) @ X # - Maximum amount of X (X = H or S or I) 1-code lines that begin with + or @ are always the first lines in the output. They give size information about what is contained in the output. That is '+ X #' gives the number of reads (X=R), the number of masks (X=M), or the total number of characters in all headers (X=H), sequences (X=S), or intrinsic quality vectors (X=I). And '@ X #' gives the maximum number of characters in any header (X=H), sequence (X=S), or intrinsic quality vector (X=I). The size numbers for the Quiva strings are identical to that for the sequence as they are all of the same length for any given entry. 12. 
DBstats [-nu] [-b]+ Show overview statistics for all the reads in the trimmed data base .db or .dam, including a histogram of read lengths where the bucket size is set with the -b option (default 1000). If the -u option is given then the untrimmed database is summarized. If the -n option is given then the histogram of read lengths is not displayed. Any track such as a "dust" track that gives a series of intervals along the read can be specified with the -m option in which case a summary and a histogram of the interval lengths are displayed. 13. DBrm ... Delete all the files for the given data bases. Do not use rm to remove a database, as there are at least two and often several secondary files for each DB including track files, and all of these are removed by DBrm. 14. simulator [-c] [-b] [-m] [-s] [-x] [-e] [-M] In addition to the DB commands we include here, somewhat tangentially, a simple simulator that generates synthetic reads for a random genome. simulator first generates a fake genome of size genlen*1Mb long, that has an AT-bias of -b. It then generates sample reads of mean length -m from a log-normal length distribution with standard deviation -s, but ignores reads of length less than -x. It collects enough reads to cover the genome -c times and introduces -e fraction errors into each read where the ratio of insertions, deletions, and substitutions are set by defined constants INS_RATE (default 73%) and DEL_RATE (default 20%) within generate.c. One can also control the rate at which reads are picked from the forward and reverse strands by setting the defined constant FLIP_RATE (default 50/50). The -r option seeds the random number generator for the generation of the genome so that one can reproducibly generate the same underlying genome to sample from. If this parameter is missing, then the job id of the invocation seeds the random number generator. The output is sent to the standard output (i.e. it is a UNIX pipe). 
The output is in Pacbio .fasta format suitable as input to fasta2DB. Finally, the -M option requests that the coordinates from which each read has been sampled are written to the indicated file, one line per read, ASCII encoded. This "map" file essentially tells one where every read belongs in an assembly and is very useful for debugging and testing purposes. If a read pair is say b,e then if b < e the read was sampled from [b,e] in the forward direction, and if b > e from [e,b] in the reverse direction. Example: A small complete example of most of the commands above. > simulator 1.0 -c20. >G.fasta // Generate a 20x data sets of a 1Mb genome > fasta2DB G G.fasta // Create a compressed data base of the reads, G.db > rm G.fasta // Redundant, recreate any time with "DB2fasta G" > DBsplit -s11 G // Split G into 2 parts of size ~ 11MB each > DBdust G.1 // Produce a "dust" track on each part > DBdust G.2 > Catrack G dust // Create one track for all of the DB > rm .G.*.dust.* // Clean up the sub-tracks > DBstats -mdust G // Take a look at the statistics for the database Statistics for all reads in the data set 1,836 reads out of 1,836 (100.0%) 20,007,090 base pairs out of 20,007,090 (100.0%) 10,897 average read length 2,192 standard deviation Base composition: 0.250(A) 0.250(C) 0.250(G) 0.250(T) Distribution of Read Lengths (Bin size = 1,000) Bin: Count % Reads % Bases Average 22,000: 1 0.1 0.1 22654 21,000: 0 0.1 0.1 22654 20,000: 1 0.1 0.2 21355 19,000: 0 0.1 0.2 21355 18,000: 4 0.3 0.6 19489 17,000: 8 0.8 1.3 18374 16,000: 19 1.8 2.8 17231 15,000: 43 4.1 6.2 16253 14,000: 81 8.6 12.0 15341 13,000: 146 16.5 21.9 14428 12,000: 200 27.4 34.4 13664 11,000: 315 44.6 52.4 12824 10,000: 357 64.0 71.2 12126 9,000: 306 80.7 85.8 11586 8,000: 211 92.2 94.8 11208 7,000: 95 97.3 98.4 11017 6,000: 43 99.7 99.8 10914 5,000: 6 100.0 100.0 10897 Statistics for dust-track There are 158 intervals totaling 1,820 bases (0.0% of all data) Distribution of dust intervals (Bin size = 1,000) 
Bin: Count % Intervals % Bases Average 0: 158 100.0 100.0 11 > ls -al total 66518744 drwxr-xr-x+ 177 myersg staff 6018 Mar 2 13:28 . drwxr-xr-x+ 20 myersg staff 680 Feb 26 19:52 .. -rw-r--r--+ 1 myersg staff 5002464 Mar 2 13:28 .G.bps -rw-r--r--+ 1 myersg staff 14704 Mar 2 13:28 .G.dust.anno -rw-r--r--+ 1 myersg staff 1264 Mar 2 13:28 .G.dust.data -rw-r--r--+ 1 myersg staff 73552 Mar 2 13:28 .G.idx -rw-r--r--+ 1 myersg staff 162 Mar 2 13:28 G.db > cat G.db files = 1 1836 G Sim blocks = 2 size = 11 cutoff = 0 all = 0 0 0 1011 1011 1836 1836 pbdagcon-0.3+20161121+ds/DAZZ_DB/DB.h0000644000175000017500000004767513026414552014620 0ustar afifafif/******************************************************************************************* * * Compressed data base module. Auxiliary routines to open and manipulate a data base for * which the sequence and read information are separated into two separate files, and the * sequence is compressed into 2-bits for each base. Support for tracks of additional * information, and trimming according to the current partition. Eventually will also * support compressed quality information. * * Author : Gene Myers * Date : July 2013 * Revised: April 2014 * ********************************************************************************************/ #ifndef _HITS_DB #define _HITS_DB #include #include "QV.h" #define HIDE_FILES // Auxiliary DB files start with a . so they are "hidden" // Undefine if you don't want this // For interactive applications where it is inappropriate to simply exit with an error // message to standard error, define the constant INTERACTIVE. If set, then error // messages are put in the global variable Ebuffer and the caller of a DB routine // can decide how to deal with the error. // // DB, QV, or alignment routines that can encounter errors function as before in // non-INTERACTIVE mode by exiting after printing an error message to stderr. 
In // INTERACTIVE mode the routines place a message at EPLACE and return an error // value. For such routines that were previously void, they are now int, and // return 1 if an error occured, 0 otherwise. #undef INTERACTIVE #ifdef INTERACTIVE #define EPRINTF sprintf #define EPLACE Ebuffer #define EXIT(x) return (x) #else // BATCH #define EPRINTF fprintf #define EPLACE stderr #define EXIT(x) exit (1) #endif typedef unsigned char uint8; typedef unsigned short uint16; typedef unsigned int uint32; typedef unsigned long long uint64; typedef signed char int8; typedef signed short int16; typedef signed int int32; typedef signed long long int64; typedef float float32; typedef double float64; /******************************************************************************************* * * COMMAND LINE INTERPRETATION MACROS * ********************************************************************************************/ extern char *Prog_Name; // Name of program #ifdef INTERACTIVE extern char Ebuffer[]; #endif #define SYSTEM_ERROR \ { EPRINTF(EPLACE,"%s: System error, read failed!\n",Prog_Name); \ exit (2); \ } #define ARG_INIT(name) \ Prog_Name = Strdup(name,""); \ for (i = 0; i < 128; i++) \ flags[i] = 0; #define ARG_FLAGS(set) \ for (k = 1; argv[i][k] != '\0'; k++) \ { if (index(set,argv[i][k]) == NULL) \ { fprintf(stderr,"%s: -%c is an illegal option\n",Prog_Name,argv[i][k]); \ exit (1); \ } \ flags[(int) argv[i][k]] = 1; \ } #define ARG_POSITIVE(var,name) \ var = strtol(argv[i]+2,&eptr,10); \ if (*eptr != '\0' || argv[i][2] == '\0') \ { fprintf(stderr,"%s: -%c '%s' argument is not an integer\n", \ Prog_Name,argv[i][1],argv[i]+2); \ exit (1); \ } \ if (var <= 0) \ { fprintf(stderr,"%s: %s must be positive (%d)\n",Prog_Name,name,var); \ exit (1); \ } #define ARG_NON_NEGATIVE(var,name) \ var = strtol(argv[i]+2,&eptr,10); \ if (*eptr != '\0' || argv[i][2] == '\0') \ { fprintf(stderr,"%s: -%c '%s' argument is not an integer\n", \ Prog_Name,argv[i][1],argv[i]+2); \ exit (1); \ } 
\ if (var < 0) \ { fprintf(stderr,"%s: %s must be non-negative (%d)\n",Prog_Name,name,var); \ exit (1); \ } #define ARG_REAL(var) \ var = strtod(argv[i]+2,&eptr); \ if (*eptr != '\0' || argv[i][2] == '\0') \ { fprintf(stderr,"%s: -%c '%s' argument is not a real number\n", \ Prog_Name,argv[i][1],argv[i]+2); \ exit (1); \ } /******************************************************************************************* * * UTILITIES * ********************************************************************************************/ // The following general utilities return NULL if any of their input pointers are NULL, or if they // could not perform their function (in which case they also print an error to stderr). void *Malloc(int64 size, char *mesg); // Guarded versions of malloc, realloc void *Realloc(void *object, int64 size, char *mesg); // and strdup, that output "mesg" to char *Strdup(char *string, char *mesg); // stderr if out of memory FILE *Fopen(char *path, char *mode); // Open file path for "mode" char *PathTo(char *path); // Return path portion of file name "path" char *Root(char *path, char *suffix); // Return the root name, excluding suffix, of "path" // Catenate returns concatenation of path.sep.root.suffix in a *temporary* buffer // Numbered_Suffix returns concatenation of left..right in a *temporary* buffer char *Catenate(char *path, char *sep, char *root, char *suffix); char *Numbered_Suffix(char *left, int num, char *right); // DB-related utilities void Print_Number(int64 num, int width, FILE *out); // Print readable big integer int Number_Digits(int64 num); // Return # of digits in printed number #define COMPRESSED_LEN(len) (((len)+3) >> 2) void Compress_Read(int len, char *s); // Compress read in-place into 2-bit form void Uncompress_Read(int len, char *s); // Uncompress read in-place into numeric form void Print_Read(char *s, int width); void Lower_Read(char *s); // Convert read from numbers to lowercase letters (0-3 to acgt) void Upper_Read(char *s); 
// Convert read from numbers to uppercase letters (0-3 to ACGT) void Number_Read(char *s); // Convert read from letters to numbers /******************************************************************************************* * * DB IN-CORE DATA STRUCTURES * ********************************************************************************************/ #define DB_QV 0x03ff // Mask for 3-digit quality value #define DB_CSS 0x0400 // This is the second or later of a group of reads from a given insert #define DB_BEST 0x0800 // This is the longest read of a given insert (may be the only 1) typedef struct { int origin; // Well # int rlen; // Length of the sequence (Last pulse = fpulse + rlen) int fpulse; // First pulse int64 boff; // Offset (in bytes) of compressed read in 'bases' file, or offset of // uncompressed bases in memory block int64 coff; // Offset (in bytes) of compressed quiva streams in 'quiva' file int flags; // QV of read + flags above } HITS_READ; // A track can be of 3 types: // data == NULL: there are nreads 'anno' records of size 'size'. // data != NULL && size == 4: anno is an array of nreads+1 int's and data[anno[i]..anno[i+1]) // contains the variable length data // data != NULL && size == 8: anno is an array of nreads+1 int64's and data[anno[i]..anno[i+1]) // contains the variable length data typedef struct _track { struct _track *next; // Link to next track char *name; // Symbolic name of track int size; // Size in bytes of anno records void *anno; // over [0,nreads]: read i annotation: int, int64, or 'size' records void *data; // data[anno[i] .. anno[i+1]-1] is data if data != NULL } HITS_TRACK; // The information for accessing QV streams is in a HITS_QV record that is a "pseudo-track" // named ".@qvs" and is always the first track record in the list (if present). Since normal // track names cannot begin with a . (this is enforced), this pseudo-track is never confused // with a normal track. 
typedef struct { struct _track *next; char *name; int ncodes; // # of coding tables QVcoding *coding; // array [0..ncodes-1] of coding schemes (see QV.h) uint16 *table; // for i in [0,db->nreads-1]: read i should be decompressed with // scheme coding[table[i]] FILE *quiva; // the open file pointer to the .qvs file } HITS_QV; // The DB record holds all information about the current state of an active DB including an // array of HITS_READS, one per read, and a linked list of HITS_TRACKs the first of which // is always a HITS_QV pseudo-track (if the QVs have been loaded). typedef struct { int ureads; // Total number of reads in untrimmed DB int treads; // Total number of reads in trimmed DB int cutoff; // Minimum read length in block (-1 if not yet set) int all; // Consider multiple reads from a given well float freq[4]; // frequency of A, C, G, T, respectively // Set with respect to "active" part of DB (all vs block, untrimmed vs trimmed) int maxlen; // length of maximum read (initially over all DB) int64 totlen; // total # of bases (initially over all DB) int nreads; // # of reads in actively loaded portion of DB int trimmed; // DB has been trimmed by cutoff/all int part; // DB block (if > 0), total DB (if == 0) int ufirst; // Index of first read in block (without trimming) int tfirst; // Index of first read in block (with trimming) // In order to avoid forcing users to have to rebuild all thier DBs to accommodate // the addition of fields for the size of the actively loaded trimmed and untrimmed // blocks, an additional read record is allocated in "reads" when a DB is loaded into // memory (reads[-1]) and the two desired fields are crammed into the first two // integer spaces of the record. char *path; // Root name of DB for .bps, .qvs, and tracks int loaded; // Are reads loaded in memory? void *bases; // file pointer for bases file (to fetch reads from), // or memory pointer to uncompressed block of all sequences. 
HITS_READ *reads; // Array [-1..nreads] of HITS_READ HITS_TRACK *tracks; // Linked list of loaded tracks } HITS_DB; /******************************************************************************************* * * DB STUB FILE FORMAT = NFILE FDATA^nfile NBLOCK PARAMS BDATA^nblock * ********************************************************************************************/ #define MAX_NAME 10000 // Longest file name or fasta header line #define DB_NFILE "files = %9d\n" // number of files #define DB_FDATA " %9d %s %s\n" // last read index + 1, fasta prolog, file name #define DB_NBLOCK "blocks = %9d\n" // number of blocks #define DB_PARAMS "size = %9lld cutoff = %9d all = %1d\n" // block size, len cutoff, all in well #define DB_BDATA " %9d %9d\n" // First read index (untrimmed), first read index (trimmed) /******************************************************************************************* * * DB ROUTINES * ********************************************************************************************/ // Suppose DB is the name of an original database. Then there will be files .DB.idx, .DB.bps, // .DB.qvs, and files .DB..anno and DB..data where is a track name // (not containing a . !). // A DAM is basically a DB except that: // 1. there are no QV's, instead .coff points the '\0' terminated fasta header of the read // in the file ..hdr file // 2. .origin contains the contig # of the read within a fasta entry (assembly sequences // contain N-separated contigs), and .fpulse the first base of the contig in the // fasta entry // Open the given database or dam, "path" into the supplied HITS_DB record "db". If the name has // a part # in it then just the part is opened. The index array is allocated (for all or // just the part) and read in. 
// Return status of routine: // -1: The DB could not be opened for a reason reported by the routine to EPLACE // 0: Open of DB proceeded without mishap // 1: Open of DAM proceeded without mishap int Open_DB(char *path, HITS_DB *db); // Trim the DB or part thereof and all loaded tracks according to the cutoff and all settings // of the current DB partition. Reallocate smaller memory blocks for the information kept // for the retained reads. void Trim_DB(HITS_DB *db); // Shut down an open 'db' by freeing all associated space, including tracks and QV structures, // and any open file pointers. The record pointed at by db however remains (the user // supplied it and so should free it). void Close_DB(HITS_DB *db); // If QV pseudo track is not already in db's track list, then load it and set it up. // The database must not have been trimmed yet. -1 is returned if a .qvs file is not // present, and 1 is returned if an error (reported to EPLACE) occured and INTERACTIVE // is defined. Otherwise a 0 is returned. int Load_QVs(HITS_DB *db); // Remove the QV pseudo track, all space associated with it, and close the .qvs file. void Close_QVs(HITS_DB *db); // Look up the file and header in the file of the indicated track. Return: // 1: Track is for trimmed DB // 0: Track is for untrimmed DB // -1: Track is not the right size of DB either trimmed or untrimmed // -2: Could not find the track // In addition, if opened (0 or 1 returned), then kind points at an integer indicating // the type of track as follows: // CUSTOM 0 => a custom track // MASK 1 => a mask track #define CUSTOM_TRACK 0 #define MASK_TRACK 1 int Check_Track(HITS_DB *db, char *track, int *kind); // If track is not already in the db's track list, then allocate all the storage for it, // read it in from the appropriate file, add it to the track list, and return a pointer // to the newly created HITS_TRACK record. 
If the track does not exist or cannot be // opened for some reason, then NULL is returned if INTERACTIVE is defined. Otherwise // the routine prints an error message to stderr and exits if an error occurs, and returns // with NULL only if the track does not exist. HITS_TRACK *Load_Track(HITS_DB *db, char *track); // If track is on the db's track list, then it is removed and all storage associated with it // is freed. void Close_Track(HITS_DB *db, char *track); // Allocate and return a buffer big enough for the largest read in 'db'. // **NB** free(x-1) if x is the value returned as *prefix* and suffix '\0'(4)-byte // are needed by the alignment algorithms. If cannot allocate memory then return NULL // if INTERACTIVE is defined, or print error to stderr and exit otherwise. char *New_Read_Buffer(HITS_DB *db); // Load into 'read' the i'th read in 'db'. As a lower case ascii string if ascii is 1, an // upper case ascii string if ascii is 2, and a numeric string over 0(A), 1(C), 2(G), and 3(T) // otherwise. A '\0' (or 4) is prepended and appended to the string so it has a delimeter // for traversals in either direction. A non-zero value is returned if an error occured // and INTERACTIVE is defined. int Load_Read(HITS_DB *db, int i, char *read, int ascii); // Load into 'read' the subread [beg,end] of the i'th read in 'db' and return a pointer to the // the start of the subinterval (not necessarily = to read !!! ). As a lower case ascii // string if ascii is 1, an upper case ascii string if ascii is 2, and a numeric string // over 0(A), 1(C), 2(G), and 3(T) otherwise. A '\0' (or 4) is prepended and appended to // the string holding the substring so it has a delimeter for traversals in either direction. // A NULL pointer is returned if an error occured and INTERACTIVE is defined. char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii); // Allocate a set of 5 vectors large enough to hold the longest QV stream that will occur // in the database. 
If cannot allocate memory then return NULL if INTERACTIVE is defined, // or print error to stderr and exit otherwise. #define DEL_QV 0 // The deletion QVs are x[DEL_QV] if x is the buffer returned by New_QV_Buffer #define DEL_TAG 1 // The deleted characters #define INS_QV 2 // The insertion QVs #define SUB_QV 3 // The substitution QVs #define MRG_QV 4 // The merge QVs char **New_QV_Buffer(HITS_DB *db); // Load into 'entry' the 5 QV vectors for i'th read in 'db'. The deletion tag or characters // are converted to a numeric or upper/lower case ascii string as per ascii. Return with // a zero, except when an error occurs and INTERACTIVE is defined in which case return wtih 1. int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii); // Allocate a block big enough for all the uncompressed sequences, read them into it, // reset the 'off' in each read record to be its in-memory offset, and set the // bases pointer to point at the block after closing the bases file. If ascii is // 1 then the reads are converted to lowercase ascii, if 2 then uppercase ascii, and // otherwise the reads are left as numeric strings over 0(A), 1(C), 2(G), and 3(T). // Return with a zero, except when an error occurs and INTERACTIVE is defined in which // case return wtih 1. int Read_All_Sequences(HITS_DB *db, int ascii); // For the DB or DAM "path" = "prefix/root.[db|dam]", find all the files for that DB, i.e. all // those of the form "prefix/[.]root.part" and call actor with the complete path to each file // pointed at by path, and the suffix of the path by extension. The . proceeds the root // name if the defined constant HIDE_FILES is set. Always the first call is with the // path "prefix/root.[db|dam]" and extension "db" or "dam". There will always be calls for // "prefix/[.]root.idx" and "prefix/[.]root.bps". All other calls are for *tracks* and // so this routine gives one a way to know all the tracks associated with a given DB. 
// -1 is returned if the path could not be found, and 1 is returned if an error (reported // to EPLACE) occured and INTERACTIVE is defined. Otherwise a 0 is returned. int List_DB_Files(char *path, void actor(char *path, char *extension)); #endif // _HITS_DB pbdagcon-0.3+20161121+ds/DAZZ_DB/DB2quiva.c0000644000175000017500000000706513026414552015730 0ustar afifafif/******************************************************************************************** * * Recreate all the .quiva files that have been loaded into a specified database. * * Author: Gene Myers * Date : May 2014 * ********************************************************************************************/ #include #include #include #include "DB.h" #include "QV.h" #ifdef HIDE_FILES #define PATHSEP "/." #else #define PATHSEP "/" #endif static char *Usage = "[-vU] "; int main(int argc, char *argv[]) { HITS_DB _db, *db = &_db; FILE *dbfile, *quiva; int VERBOSE, UPPER; // Process arguments { int i, j, k; int flags[128]; ARG_INIT("DB2quiva") j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') { ARG_FLAGS("vU") } else argv[j++] = argv[i]; argc = j; VERBOSE = flags['v']; UPPER = flags['U']; if (argc != 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // Open db, db stub file, and .qvs file { char *pwd, *root; int status; status = Open_DB(argv[1],db); if (status < 0) exit (1); if (status == 1) { fprintf(stderr,"%s: Cannot be called on a .dam index: %s\n",Prog_Name,argv[1]); exit (1); } if (db->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } pwd = PathTo(argv[1]); root = Root(argv[1],".db"); dbfile = Fopen(Catenate(pwd,"/",root,".db"),"r"); quiva = Fopen(Catenate(pwd,PATHSEP,root,".qvs"),"r"); free(pwd); free(root); if (dbfile == NULL || quiva == NULL) exit (1); } // For each file do: { HITS_READ *reads; int f, first, nfiles; QVcoding *coding; char **entry; if (fscanf(dbfile,DB_NFILE,&nfiles) != 1) SYSTEM_ERROR entry = 
New_QV_Buffer(db); reads = db->reads; first = 0; for (f = 0; f < nfiles; f++) { int i, last; FILE *ofile; char prolog[MAX_NAME], fname[MAX_NAME]; // Scan db image file line, create .quiva file for writing if (reads[first].coff < 0) break; if (fscanf(dbfile,DB_FDATA,&last,fname,prolog) != 3) SYSTEM_ERROR if ((ofile = Fopen(Catenate(".","/",fname,".quiva"),"w")) == NULL) exit (1); if (VERBOSE) { fprintf(stderr,"Creating %s.quiva ...\n",fname); fflush(stderr); } coding = Read_QVcoding(quiva); // For the relevant range of reads, write the header for each to the file // and then uncompress and write the quiva entry for each for (i = first; i < last; i++) { int e, flags, qv, rlen; HITS_READ *r; r = reads + i; flags = r->flags; rlen = r->rlen; qv = (flags & DB_QV); fprintf(ofile,"@%s/%d/%d_%d",prolog,r->origin,r->fpulse,r->fpulse+rlen); if (qv > 0) fprintf(ofile," RQ=0.%3d",qv); fprintf(ofile,"\n"); Uncompress_Next_QVentry(quiva,entry,coding,rlen); if (UPPER) { char *deltag = entry[1]; int j; for (j = 0; j < rlen; j++) deltag[j] -= 32; } for (e = 0; e < 5; e++) fprintf(ofile,"%.*s\n",rlen,entry[e]); } first = last; } } fclose(quiva); fclose(dbfile); Close_DB(db); exit (0); } pbdagcon-0.3+20161121+ds/DAZZ_DB/DBupgrade.Dec.31.2014.c0000644000175000017500000000411313026414552017440 0ustar afifafif/******************************************************************************************* * * Interim code: upgrade previous db to have fpulse,rlen fields * * Author: Gene Myers * Date : December 2014 * ********************************************************************************************/ #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." 
#else #define PATHSEP "/" #endif typedef struct { int origin; // Well # int beg; // First pulse int end; // Last pulse int64 boff; // Offset (in bytes) of compressed read in 'bases' file, or offset of // uncompressed bases in memory block int64 coff; // Offset (in bytes) of compressed quiva streams in 'quiva' file int flags; // QV of read + flags above } HITS_OLD; int main(int argc, char *argv[]) { HITS_DB db; FILE *nxfile, *ixfile; char *pwd, *root; int i; if (argc != 2) { fprintf(stderr,"Usage: %s \n",argv[0]); exit (1); } pwd = PathTo(argv[1]); root = Root(argv[1],".db"); ixfile = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r"); nxfile = Fopen(Catenate(pwd,PATHSEP,root,".ndx"),"w"); if (ixfile == NULL || nxfile == NULL) exit (1); free(pwd); free(root); if (fread(&db,sizeof(HITS_DB),1,ixfile) != 1) SYSTEM_ERROR fwrite(&db,sizeof(HITS_DB),1,nxfile); fprintf(stderr,"Converting %d reads\n",db.ureads); fflush(stderr); for (i = 0; i < db.ureads; i++) { HITS_OLD orec; HITS_READ nrec; if (i%10000 == 0) { fprintf(stderr," Processing %d\n",i); fflush(stderr); } if (fread(&orec,sizeof(HITS_OLD),1,ixfile) != 1) SYSTEM_ERROR nrec.origin = orec.origin; nrec.fpulse = orec.beg; nrec.rlen = orec.end-orec.beg; nrec.boff = orec.boff; nrec.coff = orec.coff; nrec.flags = orec.flags; fwrite(&nrec,sizeof(HITS_READ),1,nxfile); } fclose(ixfile); fclose(nxfile); exit (0); } pbdagcon-0.3+20161121+ds/DAZZ_DB/DAM2fasta.c0000644000175000017500000001125513026414552016011 0ustar afifafif/******************************************************************************************** * * Recreate all the .fasta files that are in a specified DAM. * * Author: Gene Myers * Date : May 2014 * ********************************************************************************************/ #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." 
#else #define PATHSEP "/" #endif static char *Usage = "[-vU] [-w] "; int main(int argc, char *argv[]) { HITS_DB _db, *db = &_db; FILE *dbfile, *hdrs; int nfiles; int VERBOSE, UPPER, WIDTH; // Process arguments { int i, j, k; int flags[128]; char *eptr; ARG_INIT("DAM2fasta") WIDTH = 80; j = 1; for (i = 1; i < argc; i++) if (argv[i][0] == '-') switch (argv[i][1]) { default: ARG_FLAGS("vU") break; case 'w': ARG_NON_NEGATIVE(WIDTH,"Line width") break; } else argv[j++] = argv[i]; argc = j; UPPER = 1 + flags['U']; VERBOSE = flags['v']; if (argc != 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); exit (1); } } // Open db { int status; status = Open_DB(argv[1],db); if (status < 0) exit (1); if (status == 0) { fprintf(stderr,"%s: Cannot be called on a .db: %s\n",Prog_Name,argv[1]); exit (1); } if (db->part > 0) { fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); exit (1); } } { char *pwd, *root; pwd = PathTo(argv[1]); root = Root(argv[1],".dam"); dbfile = Fopen(Catenate(pwd,"/",root,".dam"),"r"); hdrs = Fopen(Catenate(pwd,PATHSEP,root,".hdr"),"r"); free(pwd); free(root); if (dbfile == NULL || hdrs == NULL) exit (1); } // nfiles = # of files in data base if (fscanf(dbfile,DB_NFILE,&nfiles) != 1) SYSTEM_ERROR // For each file do: { HITS_READ *reads; char *read; int f, first; char nstring[WIDTH+1]; if (UPPER == 2) for (f = 0; f < WIDTH; f++) nstring[f] = 'N'; else for (f = 0; f < WIDTH; f++) nstring[f] = 'n'; nstring[WIDTH] = '\0'; reads = db->reads; read = New_Read_Buffer(db); first = 0; for (f = 0; f < nfiles; f++) { int i, last, wpos; FILE *ofile; char prolog[MAX_NAME], fname[MAX_NAME], header[MAX_NAME]; // Scan db image file line, create .fasta file for writing if (fscanf(dbfile,DB_FDATA,&last,fname,prolog) != 3) SYSTEM_ERROR if ((ofile = Fopen(Catenate(".","/",fname,".fasta"),"w")) == NULL) exit (1); if (VERBOSE) { fprintf(stderr,"Creating %s.fasta ...\n",fname); fflush(stdout); } // For the relevant range of reads, write each to the file // 
recreating the original headers with the index meta-data about each read wpos = 0; for (i = first; i < last; i++) { int j, len, nlen, w; HITS_READ *r; r = reads + i; len = r->rlen; if (r->origin == 0) { if (i != first && wpos != 0) { fprintf(ofile,"\n"); wpos = 0; } fseeko(hdrs,r->coff,SEEK_SET); fgets(header,MAX_NAME,hdrs); fputs(header,ofile); } if (r->fpulse != 0) { if (r->origin != 0) nlen = r->fpulse - (reads[i-1].fpulse + reads[i-1].rlen); else nlen = r->fpulse; for (j = 0; j+(w = WIDTH-wpos) <= nlen; j += w) { fprintf(ofile,"%.*s\n",w,nstring); wpos = 0; } if (j < nlen) { fprintf(ofile,"%.*s",nlen-j,nstring); if (j == 0) wpos += nlen; else wpos = nlen-j; } } Load_Read(db,i,read,UPPER); for (j = 0; j+(w = WIDTH-wpos) <= len; j += w) { fprintf(ofile,"%.*s\n",w,read+j); wpos = 0; } if (j < len) { fprintf(ofile,"%s",read+j); if (j == 0) wpos += len; else wpos = len-j; } } if (wpos > 0) fprintf(ofile,"\n"); first = last; } } fclose(hdrs); fclose(dbfile); Close_DB(db); exit (0); } pbdagcon-0.3+20161121+ds/DAZZ_DB/DB.c0000644000175000017500000013055113026414552014575 0ustar afifafif/******************************************************************************************* * * Compressed data base module. Auxiliary routines to open and manipulate a data base for * which the sequence and read information are separated into two separate files, and the * sequence is compressed into 2-bits for each base. Support for tracks of additional * information, and trimming according to the current partition. * * Author : Gene Myers * Date : July 2013 * Revised: April 2014 * ********************************************************************************************/ #include #include #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." 
#else #define PATHSEP "/" #endif /******************************************************************************************* * * GENERAL UTILITIES * ********************************************************************************************/ char *Prog_Name; #ifdef INTERACTIVE char Ebuffer[1000]; #endif void *Malloc(int64 size, char *mesg) { void *p; if ((p = malloc(size)) == NULL) { if (mesg == NULL) EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (p); } void *Realloc(void *p, int64 size, char *mesg) { if (size <= 0) size = 1; if ((p = realloc(p,size)) == NULL) { if (mesg == NULL) EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (p); } char *Strdup(char *name, char *mesg) { char *s; if (name == NULL) return (NULL); if ((s = strdup(name)) == NULL) { if (mesg == NULL) EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (s); } FILE *Fopen(char *name, char *mode) { FILE *f; char newmode[3]; if (name == NULL || mode == NULL) return (NULL); if (strcmp(mode, "r") == 0) { strcpy(newmode,"rm"); } else { strcpy(newmode,mode); } if ((f = fopen(name,newmode)) == NULL) EPRINTF(EPLACE,"%s: Cannot open %s for '%s'\n",Prog_Name,name,mode); return (f); } char *PathTo(char *name) { char *path, *find; if (name == NULL) return (NULL); if ((find = rindex(name,'/')) != NULL) { *find = '\0'; path = Strdup(name,"Extracting path from"); *find = '/'; } else path = Strdup(".","Allocating default path"); return (path); } char *Root(char *name, char *suffix) { char *path, *find, *dot; int epos; if (name == NULL) return (NULL); find = rindex(name,'/'); if (find == NULL) find = name; else find += 1; if (suffix == NULL) { dot = strchr(find,'.'); if (dot != NULL) *dot = '\0'; path = Strdup(find,"Extracting root from"); if (dot != NULL) *dot = '.'; } else { epos = strlen(find); epos 
-= strlen(suffix); if (epos > 0 && strcasecmp(find+epos,suffix) == 0) { find[epos] = '\0'; path = Strdup(find,"Extracting root from"); find[epos] = suffix[0]; } else path = Strdup(find,"Allocating root"); } return (path); } char *Catenate(char *path, char *sep, char *root, char *suffix) { static char *cat = NULL; static int max = -1; int len; if (path == NULL || root == NULL || sep == NULL || suffix == NULL) return (NULL); len = strlen(path); len += strlen(sep); len += strlen(root); len += strlen(suffix); if (len > max) { max = ((int) (1.2*len)) + 100; if ((cat = (char *) realloc(cat,max+1)) == NULL) { EPRINTF(EPLACE,"%s: Out of memory (Making path name for %s)\n",Prog_Name,root); return (NULL); } } sprintf(cat,"%s%s%s%s",path,sep,root,suffix); return (cat); } char *Numbered_Suffix(char *left, int num, char *right) { static char *suffix = NULL; static int max = -1; int len; if (left == NULL || right == NULL) return (NULL); len = strlen(left); len += strlen(right) + 40; if (len > max) { max = ((int) (1.2*len)) + 100; if ((suffix = (char *) realloc(suffix,max+1)) == NULL) { EPRINTF(EPLACE,"%s: Out of memory (Making number suffix for %d)\n",Prog_Name,num); return (NULL); } } sprintf(suffix,"%s%d%s",left,num,right); return (suffix); } #define COMMA ',' // Print big integers with commas/periods for better readability void Print_Number(int64 num, int width, FILE *out) { if (width == 0) { if (num < 1000ll) fprintf(out,"%lld",num); else if (num < 1000000ll) fprintf(out,"%lld%c%03lld",num/1000ll,COMMA,num%1000ll); else if (num < 1000000000ll) fprintf(out,"%lld%c%03lld%c%03lld",num/1000000ll, COMMA,(num%1000000ll)/1000ll,COMMA,num%1000ll); else fprintf(out,"%lld%c%03lld%c%03lld%c%03lld",num/1000000000ll, COMMA,(num%1000000000ll)/1000000ll, COMMA,(num%1000000ll)/1000ll,COMMA,num%1000ll); } else { if (num < 1000ll) fprintf(out,"%*lld",width,num); else if (num < 1000000ll) { if (width <= 4) fprintf(out,"%lld%c%03lld",num/1000ll,COMMA,num%1000ll); else 
fprintf(out,"%*lld%c%03lld",width-4,num/1000ll,COMMA,num%1000ll); } else if (num < 1000000000ll) { if (width <= 8) fprintf(out,"%lld%c%03lld%c%03lld",num/1000000ll,COMMA,(num%1000000ll)/1000ll, COMMA,num%1000ll); else fprintf(out,"%*lld%c%03lld%c%03lld",width-8,num/1000000ll,COMMA,(num%1000000ll)/1000ll, COMMA,num%1000ll); } else { if (width <= 12) fprintf(out,"%lld%c%03lld%c%03lld%c%03lld",num/1000000000ll,COMMA, (num%1000000000ll)/1000000ll,COMMA, (num%1000000ll)/1000ll,COMMA,num%1000ll); else fprintf(out,"%*lld%c%03lld%c%03lld%c%03lld",width-12,num/1000000000ll,COMMA, (num%1000000000ll)/1000000ll,COMMA, (num%1000000ll)/1000ll,COMMA,num%1000ll); } } } // Return the number of digits, base 10, of num int Number_Digits(int64 num) { int digit; digit = 0; while (num >= 1) { num /= 10; digit += 1; } return (digit); } /******************************************************************************************* * * READ COMPRESSION/DECOMPRESSION UTILITIES * ********************************************************************************************/ // Compress read into 2-bits per base (from [0-3] per byte representation void Compress_Read(int len, char *s) { int i; char c, d; char *s0, *s1, *s2, *s3; s0 = s; s1 = s0+1; s2 = s1+1; s3 = s2+1; c = s1[len]; d = s2[len]; s0[len] = s1[len] = s2[len] = 0; for (i = 0; i < len; i += 4) *s++ = (char ) ((s0[i] << 6) | (s1[i] << 4) | (s2[i] << 2) | s3[i]); s1[len] = c; s2[len] = d; } // Uncompress read form 2-bits per base into [0-3] per byte representation void Uncompress_Read(int len, char *s) { int i, tlen, byte; char *s0, *s1, *s2, *s3; char *t; s0 = s; s1 = s0+1; s2 = s1+1; s3 = s2+1; tlen = (len-1)/4; t = s+tlen; for (i = tlen*4; i >= 0; i -= 4) { byte = *t--; s0[i] = (char) ((byte >> 6) & 0x3); s1[i] = (char) ((byte >> 4) & 0x3); s2[i] = (char) ((byte >> 2) & 0x3); s3[i] = (char) (byte & 0x3); } s[len] = 4; } // Convert read in [0-3] representation to ascii representation (end with '\n') void Lower_Read(char *s) { static 
char letter[4] = { 'a', 'c', 'g', 't' }; for ( ; *s != 4; s++) *s = letter[(int) *s]; *s = '\0'; } void Upper_Read(char *s) { static char letter[4] = { 'A', 'C', 'G', 'T' }; for ( ; *s != 4; s++) *s = letter[(int) *s]; *s = '\0'; } // Convert read in ascii representation to [0-3] representation (end with 4) void Number_Read(char *s) { static char number[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; for ( ; *s != '\0'; s++) *s = number[(int) *s]; *s = 4; } /******************************************************************************************* * * DB OPEN, TRIM & CLOSE ROUTINES * ********************************************************************************************/ // Open the given database or dam, "path" into the supplied HITS_DB record "db". If the name has // a part # in it then just the part is opened. The index array is allocated (for all or // just the part) and read in. 
// Return status of routine: // -1: The DB could not be opened for a reason reported by the routine to EPLACE // 0: Open of DB proceeded without mishap // 1: Open of DAM proceeded without mishap int Open_DB(char* path, HITS_DB *db) { HITS_DB dbcopy; char *root, *pwd, *bptr, *fptr, *cat; int nreads; FILE *index, *dbvis; int status, plen, isdam; int part, cutoff, all; int ufirst, tfirst, ulast, tlast; status = -1; dbcopy = *db; plen = strlen(path); if (strcmp(path+(plen-4),".dam") == 0) root = Root(path,".dam"); else root = Root(path,".db"); pwd = PathTo(path); bptr = rindex(root,'.'); if (bptr != NULL && bptr[1] != '\0' && bptr[1] != '-') { part = strtol(bptr+1,&fptr,10); if (*fptr != '\0' || part == 0) part = 0; else *bptr = '\0'; } else part = 0; isdam = 0; cat = Catenate(pwd,"/",root,".db"); if (cat == NULL) return (-1); if ((dbvis = fopen(cat,"r")) == NULL) { cat = Catenate(pwd,"/",root,".dam"); if (cat == NULL) return (-1); if ((dbvis = fopen(cat,"r")) == NULL) { EPRINTF(EPLACE,"%s: Could not open database %s\n",Prog_Name,path); goto error; } isdam = 1; } if ((index = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r")) == NULL) goto error1; if (fread(db,sizeof(HITS_DB),1,index) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); goto error2; } { int p, nblocks, nfiles; int64 size; char fname[MAX_NAME], prolog[MAX_NAME]; nblocks = 0; if (fscanf(dbvis,DB_NFILE,&nfiles) != 1) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } for (p = 0; p < nfiles; p++) if (fscanf(dbvis,DB_FDATA,&tlast,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } if (fscanf(dbvis,DB_NBLOCK,&nblocks) != 1) if (part == 0) { cutoff = 0; all = 1; } else { EPRINTF(EPLACE,"%s: DB %s has not yet been partitioned, cannot request a block !\n", Prog_Name,root); goto error2; } else { if (fscanf(dbvis,DB_PARAMS,&size,&cutoff,&all) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is 
junk\n",Prog_Name,root); goto error2; } if (part > nblocks) { EPRINTF(EPLACE,"%s: DB %s has only %d blocks\n",Prog_Name,root,nblocks); goto error2; } } if (part > 0) { for (p = 1; p <= part; p++) if (fscanf(dbvis,DB_BDATA,&ufirst,&tfirst) != 2) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } if (fscanf(dbvis,DB_BDATA,&ulast,&tlast) != 2) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error2; } } else { ufirst = tfirst = 0; ulast = db->ureads; tlast = db->treads; } } db->trimmed = 0; db->tracks = NULL; db->part = part; db->cutoff = cutoff; db->all = all; db->ufirst = ufirst; db->tfirst = tfirst; nreads = ulast-ufirst; if (part <= 0) { db->reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*(nreads+2),"Allocating Open_DB index"); if (db->reads == NULL) goto error2; db->reads += 1; if (fread(db->reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); free(db->reads); goto error2; } } else { HITS_READ *reads; int i, r, maxlen; int64 totlen; reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*(nreads+2),"Allocating Open_DB index"); if (reads == NULL) goto error2; reads += 1; fseeko(index,sizeof(HITS_READ)*ufirst,SEEK_CUR); if (fread(reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); free(reads); goto error2; } totlen = 0; maxlen = 0; for (i = 0; i < nreads; i++) { r = reads[i].rlen; totlen += r; if (r > maxlen) maxlen = r; } db->maxlen = maxlen; db->totlen = totlen; db->reads = reads; } ((int *) (db->reads))[-1] = ulast - ufirst; // Kludge, need these for DB part ((int *) (db->reads))[-2] = tlast - tfirst; db->nreads = nreads; db->path = Strdup(Catenate(pwd,PATHSEP,root,""),"Allocating Open_DB path"); if (db->path == NULL) goto error2; db->bases = NULL; db->loaded = 0; status = isdam; error2: fclose(index); error1: fclose(dbvis); error: if (bptr != NULL) 
*bptr = '.'; free(pwd); free(root); if (status < 0) *db = dbcopy; return (status); } // Trim the DB or part thereof and all loaded tracks according to the cuttof and all settings // of the current DB partition. Reallocate smaller memory blocks for the information kept // for the retained reads. void Trim_DB(HITS_DB *db) { int i, j, r; int allflag, cutoff; int64 totlen; int maxlen, nreads; HITS_TRACK *record; HITS_READ *reads; if (db->trimmed) return; if (db->cutoff <= 0 && db->all) return; cutoff = db->cutoff; if (db->all) allflag = 0; else allflag = DB_BEST; reads = db->reads; nreads = db->nreads; for (record = db->tracks; record != NULL; record = record->next) if (strcmp(record->name,".@qvs") == 0) { uint16 *table = ((HITS_QV *) record)->table; j = 0; for (i = 0; i < db->nreads; i++) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) table[j++] = table[i]; } else { int *anno4, size; int64 *anno8; char *anno, *data; size = record->size; data = (char *) record->data; if (data == NULL) { anno = (char *) record->anno; j = 0; for (i = r = 0; i < db->nreads; i++, r += size) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) { memmove(anno+j,anno+r,size); j += size; } memmove(anno+j,anno+r,size); } else if (size == 4) { int ai; anno4 = (int *) (record->anno); j = anno4[0] = 0; for (i = 0; i < db->nreads; i++) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) { ai = anno4[i]; anno4[j+1] = anno4[j] + (anno4[i+1]-ai); memmove(data+anno4[j],data+ai,anno4[i+1]-ai); j += 1; } record->data = Realloc(record->data,anno4[j],NULL); } else // size == 8 { int64 ai; anno8 = (int64 *) (record->anno); j = anno8[0] = 0; for (i = 0; i < db->nreads; i++) if ((reads[i].flags & DB_BEST) >= allflag && reads[i].rlen >= cutoff) { ai = anno8[i]; anno8[j+1] = anno8[j] + (anno8[i+1]-ai); memmove(data+anno8[j],data+ai,anno8[i+1]-ai); j += 1; } record->data = Realloc(record->data,anno8[j],NULL); } record->anno = 
Realloc(record->anno,record->size*(j+1),NULL); } totlen = maxlen = 0; for (j = i = 0; i < nreads; i++) { r = reads[i].rlen; if ((reads[i].flags & DB_BEST) >= allflag && r >= cutoff) { totlen += r; if (r > maxlen) maxlen = r; reads[j++] = reads[i]; } } db->totlen = totlen; db->maxlen = maxlen; db->nreads = j; db->trimmed = 1; if (j < nreads) { db->reads = Realloc(reads-1,sizeof(HITS_READ)*(j+2),NULL); db->reads += 1; } } // The DB has already been trimmed, but a track over the untrimmed DB needs to be loaded. // Trim the track by rereading the untrimmed DB index from the file system. static int Late_Track_Trim(HITS_DB *db, HITS_TRACK *track, int ispart) { int i, j, r; int allflag, cutoff; int ureads; char *root; HITS_READ read; FILE *indx; if (!db->trimmed) return (0); if (db->cutoff <= 0 && db->all) return (0); cutoff = db->cutoff; if (db->all) allflag = 0; else allflag = DB_BEST; root = rindex(db->path,'/') + 2; indx = Fopen(Catenate(db->path,"","",".idx"),"r"); fseeko(indx,sizeof(HITS_DB) + sizeof(HITS_READ)*db->ufirst,SEEK_SET); if (ispart) ureads = ((int *) (db->reads))[-1]; else ureads = db->ureads; if (strcmp(track->name,".@qvs") == 0) { EPRINTF(EPLACE,"%s: Cannot load QV track after trimming\n",Prog_Name); fclose(indx); EXIT(1); } { int *anno4, size; int64 *anno8; char *anno, *data; size = track->size; data = (char *) track->data; if (data == NULL) { anno = (char *) track->anno; j = r = 0; for (i = r = 0; i < ureads; i++, r += size) { if (fread(&read,sizeof(HITS_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); fclose(indx); EXIT(1); } if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff) { memmove(anno+j,anno+r,size); j += size; } r += size; } memmove(anno+j,anno+r,size); } else if (size == 4) { int ai; anno4 = (int *) (track->anno); j = anno4[0] = 0; for (i = 0; i < ureads; i++) { if (fread(&read,sizeof(HITS_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); 
fclose(indx); EXIT(1); } if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff) { ai = anno4[i]; anno4[j+1] = anno4[j] + (anno4[i+1]-ai); memmove(data+anno4[j],data+ai,anno4[i+1]-ai); j += 1; } } track->data = Realloc(track->data,anno4[j],NULL); } else // size == 8 { int64 ai; anno8 = (int64 *) (track->anno); j = anno8[0] = 0; for (i = 0; i < ureads; i++) { if (fread(&read,sizeof(HITS_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); fclose(indx); EXIT(1); } if ((read.flags & DB_BEST) >= allflag && read.rlen >= cutoff) { ai = anno8[i]; anno8[j+1] = anno8[j] + (anno8[i+1]-ai); memmove(data+anno8[j],data+ai,anno8[i+1]-ai); j += 1; } } track->data = Realloc(track->data,anno8[j],NULL); } track->anno = Realloc(track->anno,track->size*(j+1),NULL); } fclose(indx); return (0); } // Shut down an open 'db' by freeing all associated space, including tracks and QV structures, // and any open file pointers. The record pointed at by db however remains (the user // supplied it and so should free it). 
void Close_DB(HITS_DB *db) { HITS_TRACK *t, *p; if (db->loaded) free(((char *) (db->bases)) - 1); else if (db->bases != NULL) fclose((FILE *) db->bases); if (db->reads != NULL) free(db->reads-1); free(db->path); Close_QVs(db); for (t = db->tracks; t != NULL; t = p) { p = t->next; free(t->anno); free(t->data); free(t); } } /******************************************************************************************* * * QV LOAD & CLOSE ROUTINES * ********************************************************************************************/ HITS_DB *Active_DB = NULL; // Last db/qv used by "Load_QVentry" HITS_QV *Active_QV; // Becomes invalid after closing int Load_QVs(HITS_DB *db) { FILE *quiva, *istub, *indx; char *root; uint16 *table; HITS_QV *qvtrk; QVcoding *coding, *nx; int ncodes; if (db->tracks != NULL && strcmp(db->tracks->name,".@qvs") == 0) return (0); if (db->trimmed) { EPRINTF(EPLACE,"%s: Cannot load QVs after trimming the DB\n",Prog_Name); EXIT(1); } if (db->reads[db->nreads-1].coff < 0) { EPRINTF(EPLACE,"%s: The requested QVs have not been added to the DB!\n",Prog_Name); EXIT(1); } // Open .qvs, .idx, and .db files quiva = Fopen(Catenate(db->path,"","",".qvs"),"r"); if (quiva == NULL) return (-1); istub = NULL; indx = NULL; table = NULL; coding = NULL; qvtrk = NULL; root = rindex(db->path,'/') + 2; istub = Fopen(Catenate(db->path,"/",root,".db"),"r"); if (istub == NULL) goto error; { int first, last, nfiles; char prolog[MAX_NAME], fname[MAX_NAME]; int i, j; if (fscanf(istub,DB_NFILE,&nfiles) != 1) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } if (db->part > 0) { int pfirst, plast; int fbeg, fend; int n, k; FILE *indx; // Determine first how many and which files span the block (fbeg to fend) pfirst = db->ufirst; plast = pfirst + db->nreads; first = 0; for (fbeg = 0; fbeg < nfiles; fbeg++) { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto 
error; } if (last > pfirst) break; first = last; } for (fend = fbeg+1; fend <= nfiles; fend++) { if (last >= plast) break; if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } first = last; } indx = Fopen(Catenate(db->path,"","",".idx"),"r"); ncodes = fend-fbeg; coding = (QVcoding *) Malloc(sizeof(QVcoding)*ncodes,"Allocating coding schemes"); table = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices"); if (indx == NULL || coding == NULL || table == NULL) { ncodes = 0; goto error; } // Carefully get the first coding scheme (its offset is most likely in a HITS_RECORD // in .idx that is *not* in memory). Get all the other coding schemes normally and // assign the tables # for each read in the block in "tables". rewind(istub); fscanf(istub,DB_NFILE,&nfiles); first = 0; for (n = 0; n < fbeg; n++) { fscanf(istub,DB_FDATA,&last,fname,prolog); first = last; } for (n = fbeg; n < fend; n++) { fscanf(istub,DB_FDATA,&last,fname,prolog); i = n-fbeg; if (first < pfirst) { HITS_READ read; fseeko(indx,sizeof(HITS_DB) + sizeof(HITS_READ)*first,SEEK_SET); if (fread(&read,sizeof(HITS_READ),1,indx) != 1) { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); ncodes = i; goto error; } fseeko(quiva,read.coff,SEEK_SET); nx = Read_QVcoding(quiva); if (nx == NULL) { ncodes = i; goto error; } coding[i] = *nx; } else { fseeko(quiva,db->reads[first-pfirst].coff,SEEK_SET); nx = Read_QVcoding(quiva); if (nx == NULL) { ncodes = i; goto error; } coding[i] = *nx; db->reads[first-pfirst].coff = ftello(quiva); } j = first-pfirst; if (j < 0) j = 0; k = last-pfirst; if (k > db->nreads) k = db->nreads; while (j < k) table[j++] = (uint16) i; first = last; } fclose(indx); indx = NULL; } else { // Load in coding scheme for each file, adjust .coff of first read in the file, and // record which table each read uses ncodes = nfiles; coding = (QVcoding *) 
Malloc(sizeof(QVcoding)*nfiles,"Allocating coding schemes"); table = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices"); if (coding == NULL || table == NULL) goto error; first = 0; for (i = 0; i < nfiles; i++) { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); goto error; } fseeko(quiva,db->reads[first].coff,SEEK_SET); nx = Read_QVcoding(quiva); if (nx == NULL) { ncodes = i; goto error; } coding[i] = *nx; db->reads[first].coff = ftello(quiva); for (j = first; j < last; j++) table[j] = (uint16) i; first = last; } } // Allocate and fill in the HITS_QV record and add it to the front of the // track list qvtrk = (HITS_QV *) Malloc(sizeof(HITS_QV),"Allocating QV pseudo-track"); if (qvtrk == NULL) goto error; qvtrk->name = Strdup(".@qvs","Allocating QV pseudo-track name"); if (qvtrk->name == NULL) goto error; qvtrk->next = db->tracks; db->tracks = (HITS_TRACK *) qvtrk; qvtrk->ncodes = ncodes; qvtrk->table = table; qvtrk->coding = coding; qvtrk->quiva = quiva; } fclose(istub); return (0); error: if (qvtrk != NULL) free(qvtrk); if (table != NULL) free(table); if (coding != NULL) { int i; for (i = 0; i < ncodes; i++) Free_QVcoding(coding+i); free(coding); } if (indx != NULL) fclose(indx); if (istub != NULL) fclose(istub); fclose(quiva); EXIT(1); } // Close the QV stream, free the QV pseudo track and all associated memory void Close_QVs(HITS_DB *db) { HITS_TRACK *track; HITS_QV *qvtrk; int i; Active_DB = NULL; track = db->tracks; if (track != NULL && strcmp(track->name,".@qvs") == 0) { qvtrk = (HITS_QV *) track; for (i = 0; i < qvtrk->ncodes; i++) Free_QVcoding(qvtrk->coding+i); free(qvtrk->coding); free(qvtrk->table); fclose(qvtrk->quiva); db->tracks = track->next; free(track); } return; } /******************************************************************************************* * * TRACK LOAD & CLOSE ROUTINES * 
********************************************************************************************/ // Return status of track: // 1: Track is for trimmed DB // 0: Track is for untrimmed DB // -1: Track is not the right size of DB either trimmed or untrimmed // -2: Could not find the track int Check_Track(HITS_DB *db, char *track, int *kind) { FILE *afile; int tracklen, size, ispart; int ureads, treads; afile = NULL; if (db->part > 0) { afile = fopen(Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".anno"),"r"); ispart = 1; } if (afile == NULL) { afile = fopen(Catenate(db->path,".",track,".anno"),"r"); ispart = 0; } if (afile == NULL) return (-2); if (fread(&tracklen,sizeof(int),1,afile) != 1) { fprintf(stderr,"%s: track files for %s are corrupted\n",Prog_Name,track); exit (1); } if (fread(&size,sizeof(int),1,afile) != 1) { fprintf(stderr,"%s: track files for %s are corrupted\n",Prog_Name,track); exit (1); } if (size == 0) *kind = MASK_TRACK; else if (size > 0) *kind = CUSTOM_TRACK; else { fprintf(stderr,"%s: track files for %s are corrupted\n",Prog_Name,track); exit (1); } fclose(afile); if (ispart) { ureads = ((int *) (db->reads))[-1]; treads = ((int *) (db->reads))[-2]; } else { ureads = db->ureads; treads = db->treads; } if (tracklen == ureads) return (0); else if (tracklen == treads) return (1); else return (-1); } // If track is not already in the db's track list, then allocate all the storage for it, // read it in from the appropriate file, add it to the track list, and return a pointer // to the newly created HITS_TRACK record. If the track does not exist or cannot be // opened for some reason, then NULL is returned. 
HITS_TRACK *Load_Track(HITS_DB *db, char *track)
{ FILE       *afile, *dfile;
  int         tracklen, size;
  int         nreads, ispart;
  int         treads, ureads;
  void       *anno;
  void       *data;
  char       *name;
  HITS_TRACK *record;

  if (track[0] == '.')
    { EPRINTF(EPLACE,"%s: Track name, '%s', cannot begin with a .\n",Prog_Name,track);
      EXIT(NULL);
    }

  //  Already loaded ?

  for (record = db->tracks; record != NULL; record = record->next)
    if (strcmp(record->name,track) == 0)
      return (record);

  //  Prefer a block-specific track file when db is a block, else use the whole-DB file

  afile = NULL;
  if (db->part)
    { afile  = fopen(Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".anno"),"r");
      ispart = 1;
    }
  if (afile == NULL)
    { afile  = fopen(Catenate(db->path,".",track,".anno"),"r");
      ispart = 0;
    }
  if (afile == NULL)
    { EPRINTF(EPLACE,"%s: Track '%s' does not exist\n",Prog_Name,track);
      return (NULL);
    }

  dfile  = NULL;
  anno   = NULL;
  data   = NULL;
  record = NULL;

  //  The .data file is optional: its absence (dfile == NULL) selects the anno-only layout

  if (ispart)
    name = Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".data");
  else
    name = Catenate(db->path,".",track,".data");
  if (name == NULL)
    goto error;
  dfile = fopen(name,"r");

  if (fread(&tracklen,sizeof(int),1,afile) != 1)
    { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track);
      goto error;
    }
  if (fread(&size,sizeof(int),1,afile) != 1)
    { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track);
      goto error;
    }
  if (size < 0)
    { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track);
      goto error;
    }
  if (size == 0)
    size = 8;

  if (ispart)
    { ureads = ((int *) (db->reads))[-1];
      treads = ((int *) (db->reads))[-2];
    }
  else
    { ureads = db->ureads;
      treads = db->treads;
    }

  //  Check the track length against the DB, and when a whole-DB track is read for a block,
  //    skip ahead to the block's first record

  if (db->trimmed)
    { if (tracklen != treads && tracklen != ureads)
        { EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track);
          goto error;
        }
      if ( ! ispart && db->part > 0)
        { if (tracklen == treads)
            fseeko(afile,size*db->tfirst,SEEK_CUR);
          else
            fseeko(afile,size*db->ufirst,SEEK_CUR);
        }
    }
  else
    { if (tracklen != ureads)
        { if (tracklen == treads)
            EPRINTF(EPLACE,"%s: Track '%s' is for a trimmed DB !\n",Prog_Name,track);
          else
            EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track);
          goto error;
        }
      if ( ! ispart && db->part > 0)
        fseeko(afile,size*db->ufirst,SEEK_CUR);
    }
  nreads = db->nreads;

  anno = (void *) Malloc(size*(nreads+1),"Allocating Track Anno Vector");
  if (anno == NULL)
    goto error;

  if (dfile != NULL)
    { int64 *anno8, off8, dlen;
      int   *anno4, off4;
      int    i;

      //  anno holds nreads+1 offsets into the data file; rebase them so the first is 0 and
      //    read just the slice of data they cover

      if (fread(anno,size,nreads+1,afile) != (size_t) (nreads+1))
        { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track);
          goto error;
        }

      if (size == 4)
        { anno4 = (int *) anno;
          off4  = anno4[0];
          if (off4 != 0)
            { for (i = 0; i <= nreads; i++)
                anno4[i] -= off4;
              fseeko(dfile,off4,SEEK_SET);
            }
          dlen = anno4[nreads];
          data = (void *) Malloc(dlen,"Allocating Track Data Vector");
        }
      else
        { anno8 = (int64 *) anno;
          off8  = anno8[0];
          if (off8 != 0)
            { for (i = 0; i <= nreads; i++)
                anno8[i] -= off8;
              fseeko(dfile,off8,SEEK_SET);
            }
          dlen = anno8[nreads];
          data = (void *) Malloc(dlen,"Allocating Track Data Vector");
        }
      if (data == NULL)
        goto error;
      if (dlen > 0)
        { if (fread(data,dlen,1,dfile) != 1)
            { EPRINTF(EPLACE,"%s: Track '%s' data file is junk\n",Prog_Name,track);
              goto error;
            }
        }
      fclose(dfile);
      dfile = NULL;
    }
  else
    { if (fread(anno,size,nreads,afile) != (size_t) nreads)
        { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track);
          goto error;
        }
      data = NULL;
    }

  fclose(afile);
  afile = NULL;                         //  so the error exit below cannot close it twice

  record = (HITS_TRACK *) Malloc(sizeof(HITS_TRACK),"Allocating Track Record");
  if (record == NULL)
    goto error;
  record->name = Strdup(track,"Allocating Track Name");
  if (record->name == NULL)
    goto error;
  record->data = data;
  record->anno = anno;
  record->size = size;

  if (db->trimmed && tracklen != treads)
    { if (Late_Track_Trim(db,record,ispart))
        goto error;
    }

  //  Link the track in at the head of the list, but behind the QV pseudo-track if there is
  //    one, as that must remain first

  if (db->tracks != NULL && strcmp(db->tracks->name,".@qvs") == 0)
    { record->next     = db->tracks->next;
      db->tracks->next = record;
    }
  else
    { record->next = db->tracks;
      db->tracks   = record;
    }

  return (record);

error:
  if (record != NULL)
    { free(record->name);               //  NULL or Strdup'd: set right after record is allocated
      free(record);
    }
  if (data != NULL)
    free(data);
  if (anno != NULL)
    free(anno);
  if (dfile != NULL)
    fclose(dfile);
  if (afile != NULL)
    fclose(afile);
  EXIT (NULL);
}

// Remove the track named 'track' from db's track list, if it is there, and free all the
//   storage associated with it.

void Close_Track(HITS_DB *db, char *track)
{ HITS_TRACK *record, *prev;

  prev = NULL;
  for (record = db->tracks; record != NULL; record = record->next)
    { if (strcmp(record->name,track) == 0)
        { free(record->anno);
          free(record->data);
          free(record->name);
          if (prev == NULL)
            db->tracks = record->next;
          else
            prev->next = record->next;
          free(record);
          return;
        }
      prev = record;
    }
  return;
}


/*******************************************************************************************
 *
 *  READ BUFFER ALLOCATION AND READ ACCESS
 *
 ********************************************************************************************/

// Allocate and return a buffer big enough for the largest read in 'db', leaving room
//   for an initial delimiter character.  The pointer returned is one byte past the start
//   of the allocation, so release it with free(buffer-1).

char *New_Read_Buffer(HITS_DB *db)
{ char *read;

  read = (char *) Malloc(db->maxlen+4,"Allocating New Read Buffer");
  if (read == NULL)
    EXIT(NULL);
  return (read+1);
}

// Load into 'read' the i'th read in 'db'.  As an upper case ASCII string if ascii is 2, as a
//   lower-case ASCII string if ascii is 1, and as a numeric string over 0(A), 1(C), 2(G), and
//   3(T) otherwise.
//
// **NB**, the byte before read will be set to a delimiter character!
//  Load read i of 'db' into 'read'.  The compressed bases are read from the .bps file (opened
//    on first use and cached in db->bases) at offset reads[i].boff and passed through
//    Uncompress_Read.  ascii == 1 converts with Lower_Read, ascii == 2 with Upper_Read, any
//    other value leaves the numeric form.  read[-1] is set to '\0' for the two ascii forms
//    and to 4 for the numeric form, so 'read' must have a writable byte in front of it (as
//    the pointer returned by New_Read_Buffer does).
//  Returns 0 on success and leaves through EXIT(1) after reporting an error otherwise.

int Load_Read(HITS_DB *db, int i, char *read, int ascii)
{ FILE      *bases  = (FILE *) db->bases;
  int64      off;
  int        len, clen;
  HITS_READ *r = db->reads;

  if (i < 0 || i >= db->nreads)      //  a negative index would read in front of the array
    { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Read)\n",Prog_Name);
      EXIT(1);
    }
  if (bases == NULL)
    { bases = Fopen(Catenate(db->path,"","",".bps"),"r");
      if (bases == NULL)
        EXIT(1);
      db->bases = (void *) bases;
    }

  off = r[i].boff;
  len = r[i].rlen;

  if (ftello(bases) != off)          //  only seek when not already positioned at the read
    fseeko(bases,off,SEEK_SET);
  clen = COMPRESSED_LEN(len);
  if (clen > 0)
    { if (fread(read,clen,1,bases) != 1)
        { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Read)\n",Prog_Name);
          EXIT(1);
        }
    }
  Uncompress_Read(len,read);
  if (ascii == 1)
    { Lower_Read(read);
      read[-1] = '\0';
    }
  else if (ascii == 2)
    { Upper_Read(read);
      read[-1] = '\0';
    }
  else
    read[-1] = 4;
  return (0);
}

//  Load the bases [beg,end) of read i of 'db' using 'read' as the work buffer, and return a
//    pointer to the first requested base *inside* that buffer (read + beg%4).  Only the bytes
//    beg/4 .. (end-1)/4 of the compressed record are fetched and uncompressed.  'ascii' is
//    interpreted as for Load_Read; the byte after the last requested base is set to 4 before
//    any ascii conversion, and the byte in front of the returned pointer receives the
//    delimiter ('\0' or 4).
//  Leaves through EXIT(NULL) on a bad index, a bad range, or an I/O failure.

char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii)
{ FILE      *bases  = (FILE *) db->bases;
  int64      off;
  int        len, clen;
  int        bbeg, bend;
  HITS_READ *r = db->reads;

  if (i < 0 || i >= db->nreads)
    { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Subread)\n",Prog_Name);
      EXIT(NULL);
    }
  if (beg < 0 || end < beg || end > r[i].rlen)   //  otherwise read[len] below lands outside
    { EPRINTF(EPLACE,"%s: Subread range out of bounds (Load_Subread)\n",Prog_Name);
      EXIT(NULL);
    }
  if (bases == NULL)
    { bases = Fopen(Catenate(db->path,"","",".bps"),"r");
      if (bases == NULL)
        EXIT(NULL);
      db->bases = (void *) bases;
    }

  bbeg = beg/4;                      //  first and one-past-last compressed byte needed
  bend = (end-1)/4+1;

  off = r[i].boff + bbeg;
  len = end - beg;

  if (ftello(bases) != off)
    fseeko(bases,off,SEEK_SET);
  clen = bend-bbeg;
  if (clen > 0)
    { if (fread(read,clen,1,bases) != 1)
        { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Subread)\n",Prog_Name);
          EXIT(NULL);
        }
    }
  Uncompress_Read(4*clen,read);
  read += beg%4;                     //  skip the bases of the first byte that precede 'beg'
  read[len] = 4;
  if (ascii == 1)
    { Lower_Read(read);
      read[-1] = '\0';
    }
  else if (ascii == 2)
    { Upper_Read(read);
      read[-1] = '\0';
    }
  else
    read[-1] = 4;

  return (read);
}


/*******************************************************************************************
 *
 *  QV BUFFER ALLOCATION QV READ ACCESS
 *
 ********************************************************************************************/

//   Allocate and return a buffer of 5
vectors big enough for the largest read in 'db' char **New_QV_Buffer(HITS_DB *db) { char **entry; char *qvs; int i; qvs = (char *) Malloc(db->maxlen*5,"Allocating New QV Buffer"); entry = (char **) Malloc(sizeof(char *)*5,"Allocating New QV Buffer"); if (qvs == NULL || entry == NULL) EXIT(NULL); for (i = 0; i < 5; i++) entry[i] = qvs + i*db->maxlen; return (entry); } // Load into entry the QV streams for the i'th read from db. The parameter ascii applies to // the DELTAG stream as described for Load_Read. int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii) { HITS_READ *reads; FILE *quiva; int rlen; if (db != Active_DB) { if (db->tracks == NULL || strcmp(db->tracks->name,".@qvs") != 0) { EPRINTF(EPLACE,"%s: QV's are not loaded (Load_QVentry)\n",Prog_Name); EXIT(1); } Active_QV = (HITS_QV *) db->tracks; Active_DB = db; } if (i >= db->nreads) { EPRINTF(EPLACE,"%s: Index out of bounds (Load_QVentry)\n",Prog_Name); EXIT(1); } reads = db->reads; quiva = Active_QV->quiva; rlen = reads[i].rlen; fseeko(quiva,reads[i].coff,SEEK_SET); if (Uncompress_Next_QVentry(quiva,entry,Active_QV->coding+Active_QV->table[i],rlen)) EXIT(1); if (ascii != 1) { char *deltag = entry[1]; if (ascii != 2) { char x = deltag[rlen]; deltag[rlen] = '\0'; Number_Read(deltag); deltag[rlen] = x; } else { int j; int u = 'A'-'a'; for (j = 0; j < rlen; j++) deltag[j] = (char) (deltag[j]+u); } } return (0); } /******************************************************************************************* * * BLOCK LOAD OF ALL READS (PRIMARILY FOR DALIGNER) * ********************************************************************************************/ // Allocate a block big enough for all the uncompressed sequences, read them into it, // reset the 'off' in each read record to be its in-memory offset, and set the // bases pointer to point at the block after closing the bases file. 
If ascii is // non-zero then the reads are converted to ACGT ascii, otherwise the reads are left // as numeric strings over 0(A), 1(C), 2(G), and 3(T). int Read_All_Sequences(HITS_DB *db, int ascii) { FILE *bases; int nreads = db->nreads; HITS_READ *reads = db->reads; void (*translate)(char *s); char *seq; int64 o, off; int i, len, clen; bases = Fopen(Catenate(db->path,"","",".bps"),"r"); if (bases == NULL) EXIT(1); seq = (char *) Malloc(db->totlen+nreads+4,"Allocating All Sequence Reads"); if (seq == NULL) { fclose(bases); EXIT(1); } *seq++ = 4; if (ascii == 1) translate = Lower_Read; else translate = Upper_Read; o = 0; for (i = 0; i < nreads; i++) { len = reads[i].rlen; off = reads[i].boff; if (ftello(bases) != off) fseeko(bases,off,SEEK_SET); clen = COMPRESSED_LEN(len); if (clen > 0) { if (fread(seq+o,clen,1,bases) != 1) { EPRINTF(EPLACE,"%s: Read of .bps file failed (Read_All_Sequences)\n",Prog_Name); free(seq); fclose(bases); EXIT(1); } } Uncompress_Read(len,seq+o); if (ascii) translate(seq+o); reads[i].boff = o; o += (len+1); } reads[nreads].boff = o; fclose(bases); db->bases = (void *) seq; db->loaded = 1; return (0); } int List_DB_Files(char *path, void actor(char *path, char *extension)) { int status, plen, rlen, dlen; char *root, *pwd, *name; int isdam; DIR *dirp; struct dirent *dp; status = 0; pwd = PathTo(path); plen = strlen(path); if (strcmp(path+(plen-4),".dam") == 0) root = Root(path,".dam"); else root = Root(path,".db"); rlen = strlen(root); if (root == NULL || pwd == NULL) { free(pwd); free(root); EXIT(1); } if ((dirp = opendir(pwd)) == NULL) { EPRINTF(EPLACE,"%s: Cannot open directory %s (List_DB_Files)\n",Prog_Name,pwd); status = -1; goto error; } isdam = 0; while ((dp = readdir(dirp)) != NULL) // Get case dependent root name (if necessary) { name = dp->d_name; if (strcmp(name,Catenate("","",root,".db")) == 0) break; if (strcmp(name,Catenate("","",root,".dam")) == 0) { isdam = 1; break; } if (strcasecmp(name,Catenate("","",root,".db")) == 0) { 
strncpy(root,name,rlen); break; } if (strcasecmp(name,Catenate("","",root,".dam")) == 0) { strncpy(root,name,rlen); isdam = 1; break; } } if (dp == NULL) { EPRINTF(EPLACE,"%s: Cannot find %s (List_DB_Files)\n",Prog_Name,pwd); status = -1; closedir(dirp); goto error; } if (isdam) actor(Catenate(pwd,"/",root,".dam"),"dam"); else actor(Catenate(pwd,"/",root,".db"),"db"); rewinddir(dirp); // Report each auxiliary file while ((dp = readdir(dirp)) != NULL) { name = dp->d_name; dlen = strlen(name); #ifdef HIDE_FILES if (name[0] != '.') continue; dlen -= 1; name += 1; #endif if (dlen < rlen+1) continue; if (name[rlen] != '.') continue; if (strncmp(name,root,rlen) != 0) continue; actor(Catenate(pwd,PATHSEP,name,""),name+(rlen+1)); } closedir(dirp); error: free(pwd); free(root); return (status); } void Print_Read(char *s, int width) { int i; if (s[0] < 4) { for (i = 0; s[i] != 4; i++) { if (i%width == 0 && i != 0) printf("\n"); printf("%d",s[i]); } printf("\n"); } else { for (i = 0; s[i] != '\0'; i++) { if (i%width == 0 && i != 0) printf("\n"); printf("%c",s[i]); } printf("\n"); } } pbdagcon-0.3+20161121+ds/DAZZ_DB/DBupgrade.Sep.25.2014.c0000644000175000017500000000475513026414552017513 0ustar afifafif/******************************************************************************************* * * Interim code: upgrade previous db to have int's for pulse positions. * * Author: Gene Myers * Date : September 2014 * ********************************************************************************************/ #include #include #include #include #include "DB.h" #ifdef HIDE_FILES #define PATHSEP "/." 
#else #define PATHSEP "/" #endif typedef struct { int origin; // Well # uint16 beg; // First pulse uint16 end; // Last pulse int64 boff; // Offset (in bytes) of compressed read in 'bases' file, or offset of // uncompressed bases in memory block int64 coff; // Offset (in bytes) of compressed quiva streams in 'quiva' file int flags; // QV of read + flags above } HITS_OLD; typedef struct { int origin; // Well # int beg; // First pulse int end; // Last pulse int64 boff; // Offset (in bytes) of compressed read in 'bases' file, or offset of // uncompressed bases in memory block int64 coff; // Offset (in bytes) of compressed quiva streams in 'quiva' file int flags; // QV of read + flags above } HITS_NEW; int main(int argc, char *argv[]) { HITS_DB db; FILE *nxfile, *ixfile; char *pwd, *root; int i; if (argc != 2) { fprintf(stderr,"Usage: %s \n",argv[0]); exit (1); } pwd = PathTo(argv[1]); root = Root(argv[1],".db"); ixfile = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r"); nxfile = Fopen(Catenate(pwd,PATHSEP,root,".ndx"),"w"); if (ixfile == NULL || nxfile == NULL) exit (1); free(pwd); free(root); if (fread(&db,sizeof(HITS_DB),1,ixfile) != 1) SYSTEM_ERROR fwrite(&db,sizeof(HITS_DB),1,nxfile); fprintf(stderr,"Converting %d reads\n",db.ureads); fflush(stderr); for (i = 0; i < db.ureads; i++) { HITS_OLD orec; HITS_NEW nrec; if (i%10000 == 0) { fprintf(stderr," Processing %d\n",i); fflush(stderr); } if (fread(&orec,sizeof(HITS_OLD),1,ixfile) != 1) SYSTEM_ERROR nrec.origin = orec.origin; nrec.beg = orec.beg; nrec.end = orec.end; nrec.boff = orec.boff; nrec.coff = orec.coff; nrec.flags = orec.flags; fwrite(&nrec,sizeof(HITS_NEW),1,nxfile); } fclose(ixfile); fclose(nxfile); exit (0); } pbdagcon-0.3+20161121+ds/DAZZ_DB/DBdust.c0000644000175000017500000003117413026414552015476 0ustar afifafif/******************************************************************************************* * * My implementation of the SDUST algorithm (Morgulis et al., JCB 13, 5 (2006), 1028-1040) * * Author: 
Gene Myers
 *  Date  :  September 2013
 *  Mod   :  Is now incremental
 *  Date  :  April 2014
 *
 ********************************************************************************************/

#include
#include
#include
#include
#include

#include "DB.h"

#ifdef HIDE_FILES
#define PATHSEP "/."
#else
#define PATHSEP "/"
#endif

#undef DEBUG

#ifdef DEBUG

static int Caps[4] = { 'A', 'C', 'G', 'T' };
static int Lowr[4] = { 'a', 'c', 'g', 't' };

#endif

static char *Usage = "[-b] [-w] [-t] [-m] ";

//  Node of the doubly-linked candidate list: an interval [beg,end] of triple positions and
//    its score.  Unused nodes are chained through 'next' only (the free list held in 'aptr').

typedef struct _cand
  { struct _cand *next;
    struct _cand *prev;
    int           beg;
    int           end;
    double        score;
  } Candidate;

//  DBdust driver: parse the options, open the data base, open or create the .dust.anno and
//    .dust.data track files, then compute the mask intervals of every read not yet covered
//    by the track and append them.

int main(int argc, char *argv[])
{ HITS_DB   _db, *db = &_db;
  FILE      *afile, *dfile;
  int64      indx;
  int        nreads;
  int       *mask;
  Candidate *cptr;

  int        WINDOW;
  double     THRESH;
  int        MINLEN;
  int        BIASED;

  //  Option parsing.  Defaults: window 64, threshold 2., minimum hit 9 (stored minus 1).
  //    Non-option arguments are compacted to the front of argv.

  { int   i, j, k;
    int   flags[128];
    char *eptr;

    ARG_INIT("DBdust")

    WINDOW = 64;
    THRESH = 2.;
    MINLEN = 9;

    j = 1;
    for (i = 1; i < argc; i++)
      if (argv[i][0] == '-')
        switch (argv[i][1])
        { default:
            ARG_FLAGS("b")
            break;
          case 'w':
            ARG_POSITIVE(WINDOW,"Window size")
            break;
          case 't':
            ARG_REAL(THRESH)
            if (THRESH <= 0.)
              { fprintf(stderr,"%s: Threshold must be positive (%g)\n",Prog_Name,THRESH);
                exit (1);
              }
            break;
          case 'm':
            ARG_NON_NEGATIVE(MINLEN,"Minimum hit")
            MINLEN -= 1;
            break;
        }
      else
        argv[j++] = argv[i];
    argc = j;

    BIASED = flags['b'];

    if (argc != 2)
      { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
        exit (1);
      }
  }

  //  Open .db or .dam

  { int status;

    status = Open_DB(argv[1],db);
    if (status < 0)
      exit (1);
  }

  mask = (int *) Malloc((db->maxlen+1)*sizeof(int),"Allocating mask vector");
  cptr = (Candidate *) Malloc((WINDOW+1)*sizeof(Candidate),"Allocating candidate vector");
  if (mask == NULL || cptr == NULL)
    exit (1);

  //  Open the track files.  If the .anno file cannot be opened for update, or the data base
  //    is a partition (db->part > 0), both files are created afresh: the .anno header is the
  //    read count, a size field of 0, and an initial int64 offset of 0.  Otherwise the read
  //    count already recorded in the .anno file is taken as the first read still to be done,
  //    the header is rewritten with the current read count, and both files are positioned
  //    at their ends for appending.

  { char *pwd, *root, *fname;
    int   size;

    pwd   = PathTo(argv[1]);
    root  = Root(argv[1],".db");
    size  = 0;

    fname = Catenate(pwd,PATHSEP,root,".dust.anno");
    if ((afile = fopen(fname,"r+")) == NULL || db->part > 0)
      { if (afile != NULL)
          fclose(afile);
        afile = Fopen(fname,"w");
        dfile = Fopen(Catenate(pwd,PATHSEP,root,".dust.data"),"w");
        if (dfile == NULL || afile == NULL)
          exit (1);
        fwrite(&(db->nreads),sizeof(int),1,afile);
        fwrite(&size,sizeof(int),1,afile);
        nreads = 0;
        indx = 0;
        fwrite(&indx,sizeof(int64),1,afile);
      }
    else
      { dfile = Fopen(Catenate(pwd,PATHSEP,root,".dust.data"),"r+");
        if (dfile == NULL)
          exit (1);
        if (fread(&nreads,sizeof(int),1,afile) != 1)
          SYSTEM_ERROR
        if (nreads >= db->nreads)       //  track already covers every read: nothing to do
          { fclose(afile);
            fclose(dfile);
            exit(0);
          }
        fseeko(afile,0,SEEK_SET);
        fwrite(&(db->nreads),sizeof(int),1,afile);
        fwrite(&size,sizeof(int),1,afile);
        fseeko(afile,0,SEEK_END);
        fseeko(dfile,0,SEEK_END);
        indx = ftello(dfile);
      }

    free(pwd);
    free(root);
  }

  { int       *mask1;
    char      *read, *lag2;
    int        wcount[64], lcount[64];
    Candidate *aptr;
    double     skew[64], thresh2r;
    int        thresh2i;
    int        i;

    read = New_Read_Buffer(db);
    lag2 = read-2;                      //  lag2[j] aliases read[j-2]: triples overwrite the read

    mask1 = mask+1;
    *mask = -2;                         //  sentinel in front of the interval pairs

    //  cptr[0] is the header of the circular candidate list; cptr[1..WINDOW] start out on
    //    the free list 'aptr'.

    aptr  = cptr+1;
    for (i = 1; i < WINDOW; i++)
      cptr[i].next = aptr+i;
    cptr[WINDOW].next = NULL;

    cptr->next = cptr->prev = cptr;
    cptr->beg  = -2;

    thresh2r = 2.*THRESH;
    thresh2i = (int) ceil(thresh2r);

    //  With -b, each of the 64 triples gets a weight of (1/64) / (product of the data base
    //    frequencies of its three bases).

    if (BIASED)
      { int a, b, c, p;

        p = 0;
        for (a = 0; a < 4; a++)
         for (b = 0; b < 4; b++)
          for (c = 0; c < 4; c++)
            skew[p++] = .015625 / (db->freq[a]*db->freq[b]*db->freq[c]);
      }

    for (i = nreads; i < db->nreads; i++)
      { Candidate *lptr, *jptr;
        int       *mtop;
        double     mscore;
        int        len;
        int        wb, lb;
        int        j, c, d;

        len = db->reads[i].rlen;       //  Fetch read
        Load_Read(db,i,read,0);

        c = (read[0] << 2) | read[1];           //   Convert to triple codes
        for (j = 2; j < len; j++)
          { c = ((c << 2) & 0x3f) | read[j];
            lag2[j] = (char) c;
          }
        len -= 2;

        for (j = 0; j < 64; j++)                //   Setup counter arrays
          wcount[j] = lcount[j] = 0;

        mtop = mask;                            //   The dust algorithm

        lb   = wb = -1;

        if (BIASED)

          { double lsqr, wsqr, trun;            //   Modification for high-compositional bias

            wsqr = lsqr = 0.;
            for (j = 0; j < len; j++)
              { c = read[j];

#define ADDR(e,cnt,sqr)	 sqr += (cnt[e]++) * skew[e];

#define DELR(e,cnt,sqr)	 sqr -= (--cnt[e]) * skew[e];

#define WADDR(e) ADDR(e,wcount,wsqr)
#define WDELR(e) DELR(e,wcount,wsqr)
#define LADDR(e) ADDR(e,lcount,lsqr)
#define LDELR(e) DELR(e,lcount,lsqr)

                if (j > WINDOW-3)
                  { d = read[++wb];
                    WDELR(d)
                  }
                WADDR(c)

                if (lb < wb)
                  { d = read[++lb];
                    LDELR(d)
                  }
                trun  = (lcount[c]++) * skew[c];
                lsqr += trun;
                if (trun >= thresh2r)
                  { while (lb < j)
                      { d = read[++lb];
                        LDELR(d)
                        if (d == c) break;
                      }
                  }

                //  Retire the last candidate once it starts at or before wb: merge it
                //    into the top mask interval or push a new one, then return its node
                //    to the free list.

                jptr = cptr->prev;
                if (jptr != cptr && jptr->beg <= wb)
                  { c = jptr->end + 2;
                    if (*mtop+1 >= jptr->beg)
                      { if (*mtop < c)
                          *mtop = c;
                      }
                    else
                      { *++mtop = jptr->beg;
                        *++mtop = c;
                      }
                    lptr = jptr->prev;
                    cptr->prev = lptr;
                    lptr->next = cptr;
                    jptr->next = aptr;
                    aptr = jptr;
                  }

                if (wsqr <= lsqr*THRESH)
                  continue;

                jptr   = cptr->next;
                lptr   = cptr;
                mscore = 0.;
                for (c = lb; c > wb; c--)
                  { d = read[c];
                    LADDR(d)
                    if (lsqr >= THRESH * (j-c))
                      { for ( ; jptr->beg >= c; jptr = (lptr = jptr)->next)
                          if (jptr->score > mscore)
                            mscore = jptr->score;
                        if (lsqr >= mscore * (j-c))
                          { mscore = lsqr / (j-c);
                            if (lptr->beg == c)
                              { lptr->end   = j;
                                lptr->score = mscore;
                              }
                            else
                              { aptr->beg   = c;
                                aptr->end   = j;
                                aptr->score = mscore;
                                aptr->prev  = lptr;
                                lptr = lptr->next = aptr;
                                aptr = aptr->next;
                                jptr->prev = lptr;
                                lptr->next = jptr;
                              }
                          }
                      }
                  }

                for (c++; c <= lb; c++)         //  undo the LADDR's of the scan above
                  { d = read[c];
                    LDELR(d)
                  }
              }
          }

        else

          { int lsqr, wsqr, trun;               //   Algorithm for GC-balanced sequences

            wsqr = lsqr = 0;
            for (j = 0; j < len; j++)
              { c = read[j];

#define ADDI(e,cnt,sqr)	 sqr += (cnt[e]++);

#define DELI(e,cnt,sqr)	 sqr -= (--cnt[e]);

#define WADDI(e) ADDI(e,wcount,wsqr)
#define WDELI(e) DELI(e,wcount,wsqr)
#define LADDI(e) ADDI(e,lcount,lsqr)
#define LDELI(e) DELI(e,lcount,lsqr)

                if (j > WINDOW-3)
                  { d = read[++wb];
                    WDELI(d)
                  }
                WADDI(c)

                if (lb < wb)
                  { d = read[++lb];
                    LDELI(d)
                  }
                trun  = lcount[c]++;
                lsqr += trun;
                if (trun >= thresh2i)
                  { while (lb < j)
                      { d = read[++lb];
                        LDELI(d)
                        if (d == c) break;
                      }
                  }

                jptr = cptr->prev;
                if (jptr != cptr && jptr->beg <= wb)
                  { c = jptr->end + 2;
                    if (*mtop+1 >= jptr->beg)
                      { if (*mtop < c)
                          *mtop = c;
                      }
                    else
                      { *++mtop = jptr->beg;
                        *++mtop = c;
                      }
                    lptr = jptr->prev;
                    cptr->prev = lptr;
                    lptr->next = cptr;
                    jptr->next = aptr;
                    aptr = jptr;
                  }

                if (wsqr <= lsqr*THRESH)
                  continue;

                jptr   = cptr->next;
                lptr   = cptr;
                mscore = 0.;
                for (c = lb; c > wb; c--)
                  { d = read[c];
                    LADDI(d)
                    if (lsqr >= THRESH * (j-c))
                      { for ( ; jptr->beg >= c; jptr = (lptr = jptr)->next)
                          if (jptr->score > mscore)
                            mscore = jptr->score;
                        if (lsqr >= mscore * (j-c))
                          { mscore = (1. * lsqr) / (j-c);
                            if (lptr->beg == c)
                              { lptr->end   = j;
                                lptr->score = mscore;
                              }
                            else
                              { aptr->beg   = c;
                                aptr->end   = j;
                                aptr->score = mscore;
                                aptr->prev  = lptr;
                                lptr = lptr->next = aptr;
                                aptr = aptr->next;
                                jptr->prev = lptr;
                                lptr->next = jptr;
                              }
                          }
                      }
                  }

                for (c++; c <= lb; c++)
                  { d = read[c];
                    LDELI(d)
                  }
              }
          }

        //  End of read: flush every candidate still on the list into the mask intervals.

        while ((jptr = cptr->prev) != cptr)
          { c = jptr->end + 2;
            if (*mtop+1 >= jptr->beg)
              { if (*mtop < c)
                  *mtop = c;
              }
            else
              { *++mtop = jptr->beg;
                *++mtop = c;
              }
            cptr->prev = jptr->prev;
            jptr->prev->next = cptr;
            jptr->next = aptr;
            aptr = jptr;
          }

        //  Keep the intervals with end-beg >= MINLEN, compacting them in place (the stored
        //    end is incremented by 1), then append the new running byte offset to the .anno
        //    file and the surviving int pairs to the .data file.

        { int *jtop, ntop;

          ntop = 0;
          for (jtop = mask1; jtop < mtop; jtop += 2)
            if (jtop[1] - jtop[0] >= MINLEN)
              { mask[++ntop] = jtop[0];
                mask[++ntop] = jtop[1]+1;
              }
          mtop  = mask + ntop;
          indx += ntop*sizeof(int);
          fwrite(&indx,sizeof(int64),1,afile);
          fwrite(mask1,sizeof(int),ntop,dfile);
        }

#ifdef DEBUG

        { int *jtop;

          printf("\nREAD %d\n",i);
          for (jtop = mask1; jtop < mtop; jtop += 2)
            printf(" [%5d,%5d]\n",jtop[0],jtop[1]);

          Load_Read(db,i,read,0);

          jtop = mask1;
          for (c = 0; c < len; c++)
            { while (jtop < mtop && c > jtop[1])
                jtop += 2;
              if (jtop < mtop && c >= *jtop)
                printf("%c",Caps[(int) read[c]]);
              else
                printf("%c",Lowr[(int) read[c]]);
              if ((c%80) == 79)
                printf("\n");
            }
          printf("\n");
        }

#endif
      }
  }

  fclose(afile);
  fclose(dfile);

  Close_DB(db);

  exit (0);
}
pbdagcon-0.3+20161121+ds/DAZZ_DB/simulator.c0000644000175000017500000003045613026414552016332 0ustar afifafif
/*******************************************************************************************
 *
 *  Synthetic DNA shotgun dataset simulator
 *     Generate a fake genome of size genlen*1Mb long, that has an AT-bias of -b.  Then
 *     sample reads of mean length -m from a log-normal length distribution with
 *     standard deviation -s, but ignore reads of length less than -x.  Collect enough
 *     reads to cover the genome -c times.  Introduce -e fraction errors into each
 *     read where the ratio of insertions, deletions, and substitutions are set by
 *     defined constants INS_RATE and DEL_RATE within generate.c.
One can also control * the rate at which reads are picked from the forward and reverse strands by setting * the defined constant FLIP_RATE. * * The -r parameter seeds the random number generator for the generation of the genome * so that one can reproducbile produce the same underlying genome to sample from. If * missing, then the job id of the invocation seeds the generator. The output is sent * to the standard output (i.e. it is a pipe). The output is in fasta format (i.e. it is * a UNIX pipe). The output is in Pacbio .fasta format suitable as input to fasta2DB. * * The -M option requests that the coordinates from which each read has been sampled are * written to the indicated file, one line per read, ASCII encoded. This "map" file * essentially tells one where every read belongs in an assembly and is very useful for * debugging and testing purposes. If a read pair is say b,e then if b < e the read was * sampled from [b,e] in the forward direction, and from [e,b] in the reverse direction * otherwise. * * Author: Gene Myers * Date : July 2013 * Mod : April 2014 (made independent of "mylib") * ********************************************************************************************/ #include #include #include #include #include #include "DB.h" static char *Usage[] = { " [-c] [-b] [-r]", " [-m] [-s] [-x]", " [-e] [-M]" }; static int GENOME; // -g option * 1Mbp static double COVERAGE; // -c option static double BIAS; // -b option static int HASR = 0; // -r option is set? static int SEED; // -r option static int RMEAN; // -m option static int RSDEV; // -s option static int RSHORT; // -x option static double ERROR; // -e option static FILE *MAP; // -M option #define INS_RATE .73333 // insert rate #define DEL_RATE .20000 // deletion rate #define IDL_RATE .93333 // insert + delete rate #define FLIP_RATE .5 // orientation rate (equal) // Generate a random 4 letter string of length *len* with every letter having equal probability. 
//  Returns a newly allocated numeric string of GENOME symbols over 0..3, terminated by a
//    byte of value 4.  Symbols are drawn independently: 0 and 3 each with probability BIAS/2,
//    1 and 2 each with probability (1-BIAS)/2.  The generator is seeded with SEED when -r
//    was given (HASR) and with the process id otherwise.  Exits if the allocation fails.

static char *random_genome()
{ char  *seq;
  int    i;
  double x, PRA, PRC, PRG;

  PRA = BIAS/2.;                    //  cumulative thresholds for symbols 0, 1, 2
  PRC = (1.-BIAS)/2. + PRA;
  PRG = (1.-BIAS)/2. + PRC;

  if (HASR)
    srand48(SEED);
  else
    srand48(getpid());

  if ((seq = (char *) Malloc(GENOME+1,"Allocating genome sequence")) == NULL)
    exit (1);
  for (i = 0; i < GENOME; i++)
    { x = drand48();
      if (x < PRA)
        seq[i] = 0;
      else if (x < PRC)
        seq[i] = 1;
      else if (x < PRG)
        seq[i] = 2;
      else
        seq[i] = 3;
    }
  seq[GENOME] = 4;
  return (seq);
}

//  Complement (in the DNA sense) string *s*.
//    The first elen symbols of the numeric string are reversed in place and each symbol x
//    is replaced by 3-x, i.e. this is a reverse complement.

static void complement(int elen, char *s)
{ char *t;
  int   c;

  t = s + (elen-1);
  while (s <= t)                    //  s == t handles the middle symbol of an odd length
    { c = *s;
      *s = (char) (3-*t);
      *t = (char) (3-c);
      s += 1;
      t -= 1;
    }
}

#define UNORM_LEN 60000
#define UNORM_MAX   6.0

static double unorm_table[UNORM_LEN+1];  // Upper half of cdf of N(0,1)
static double unorm_scale;

//  Fill unorm_table by summing exp(-x*x/2) in steps of UNORM_MAX/UNORM_LEN over
//    [0,UNORM_MAX), then divide entries 0..UNORM_LEN-1 by twice the total so that they run
//    from 0 up to just under .5, and force the last entry to 1.  unorm_scale is set to the
//    step width.

static void init_unorm()
{ double del, sum, x;
  int    i;

  unorm_scale = del = UNORM_MAX / UNORM_LEN;

  sum = 0;                            // Integrate pdf, x >= 0 half only.
  for (i = 0; i < UNORM_LEN; i++)
    { x = i * del;
      unorm_table[i] = sum;
      sum += exp(-.5*x*x) * del;
    }
  unorm_table[UNORM_LEN] = sum;

                /* Normalize cdf */
  sum *= 2.;
  for (i = 0; i < UNORM_LEN; i++)
    unorm_table[i] /= sum;
  unorm_table[UNORM_LEN] = 1.;

#ifdef DEBUG
  printf("Truncated tail is < %g\n",
          exp(-.5*UNORM_MAX*UNORM_MAX)/(sum*(1.-exp(-UNORM_MAX))) );
  printf("Diff between last two entries is %g\n",.5-unorm_table[UNORM_LEN-1]);

  printf("\n  CDF:\n");
  for (i = 0; i <= UNORM_LEN; i += 100)
    printf("%6.2f: %10.9f\n",i*del,unorm_table[i]);
#endif
}

//  Binary search of the ascending table tab[0..len]; see the contract stated below.

static int bin_search(int len, double *tab, double y)
{ int l, m, r;

  // Searches tab[0..len] for min { r : y < tab[r] }.
  //   Assumes y < 1, tab[0] = 0 and tab[len] = 1.
  //   So returned index is in [1,len].

  l = 0;
  r = len;
  while (l < r)
    { m = (l+r) >> 1;
      if (y < tab[m])
        r = m;
      else
        l = m+1;
    }
  return (r);
}

//  Map x, expected in [0,1), to a deviate by inverting the table built by init_unorm:
//    the distance of x from .5 is looked up in unorm_table, linearly interpolated between
//    the two bracketing entries, scaled by unorm_scale, and given a negative sign when
//    x < .5.  init_unorm must have been called first.

static double sample_unorm(double x)
{ double y;
  int    f;

  if (x >= .5)  // Map [0,1) random var to upper-half of cdf */
    y = x-.5;
  else
    y = .5-x;

  f = bin_search(UNORM_LEN,unorm_table,y);    // Bin. search upper-half cdf
#ifdef DEBUG
  printf("Normal search %g -> %g -> %d",x,y,f);
#endif

  // Linear interpolate between table points

  y = (f - (unorm_table[f]-y) / (unorm_table[f] - unorm_table[f-1]) ) * unorm_scale;

  if (x < .5) y = -y;           // Map upper-half var back to full range

#ifdef DEBUG
  printf(" -> %g\n",y);
#endif

  return (y);
}

//  Generate reads (a) whose lengths are log-normally distributed with mean RMEAN and
//    standard deviation RSDEV, (b) that are never shorter than RSHORT and never
//    longer than the string *source* (GENOME symbols).  Each read is a randomly sampled
//    interval of *source* that has insertion, deletion, and/or substitution errors
//    introduced into it, and it is reverse complemented whenever a uniform draw is
//    >= FLIP_RATE.  The number of errors introduced is the length of the read times ERROR,
//    and each error is an insertion with probability INS_RATE, a deletion with probability
//    IDL_RATE-INS_RATE, and a substitution otherwise.  Reads are generated until the sum of
//    their lengths reaches COVERAGE*GENOME.  Each read is printed to stdout as a fasta entry,
//    80 symbols per line, with header ">Sim/<read #>/0_<length> RQ=0.<qv>"; if MAP is set
//    the sampled interval is written to it, with its end points swapped for a reverse
//    complemented read.

static void shotgun(char *source)
{ int       maxlen, nreads, qv;
  int64     totlen, totbp;
  char     *rbuffer;
  double    nmean, nsdev;

  //  Convert the mean and deviation of the lengths into those of their logarithm

  nsdev = (1.*RSDEV)/RMEAN;
  nsdev = log(1.+nsdev*nsdev);
  nmean = log(1.*RMEAN) - .5*nsdev;
  nsdev = sqrt(nsdev);

  if (GENOME < RSHORT)
    { fprintf(stderr,"Genome length is less than shortest read length !\n");
      exit (1);
    }

  init_unorm();

  qv = (int) (1000 * (1.-ERROR));

  rbuffer = NULL;
  maxlen  = 0;
  totlen  = 0;
  totbp   = COVERAGE*GENOME;
  nreads  = 0;
  while (totlen < totbp)
    { int   len, sdl, ins, del, elen, rbeg, rend;
      int   j;
      char *s, *t;

      len = (int) exp(nmean + nsdev*sample_unorm(drand48()));    //  Determine length of read.
      if (len > GENOME) len = GENOME;
      if (len < RSHORT) continue;

      sdl = (int) (len*ERROR);      //  Determine number of inserts *ins*, deletions *del*,
      ins = del = 0;                //    and substitutions+deletions *sdl*.
      for (j = 0; j < sdl; j++)
        { double x = drand48();
          if (x < INS_RATE)
            ins += 1;
          else if (x < IDL_RATE)
            del += 1;
        }
      sdl -= ins;
      elen = len + (ins-del);       //  length of the read after the errors are applied

      rbeg = (int) (drand48()*((GENOME-len)+.9999999));
      rend = rbeg + len;

      if (elen > maxlen)            //  grow the output buffer with some slack
        { maxlen  = ((int) (1.2*elen)) + 1000;
          rbuffer = (char *) Realloc(rbuffer,maxlen+3,"Allocating read buffer");
          if (rbuffer == NULL)
            exit (1);
        }

      t = rbuffer;
      s = source + rbeg;

      //   Generate the string with errors.  NB that inserts occur randomly between source
      //     characters, while deletions and substitutions occur on source characters.

      while ((len+1) * drand48() < ins)
        { *t++ = (char) (4.*drand48());
          ins -= 1;
        }
      for ( ; len > 0; len--)
        { if (len * drand48() >= sdl)
            *t++ = *s;
          else if (sdl * drand48() >= del)
            { double x = 3.*drand48();    //  substitute one of the 3 *other* symbols
              if (x >= *s)
                x += 1.;
              *t++ = (char) x;
              sdl -= 1;
            }
          else
            { del -= 1;
              sdl -= 1;
            }
          s += 1;
          while (len * drand48() < ins)
            { *t++ = (char) (4.*drand48());
              ins -= 1;
            }
        }
      *t = 4;

      if (drand48() >= FLIP_RATE)    //  Complement the string with probability FLIP_RATE.
        { printf(">Sim/%d/%d_%d RQ=0.%d\n",nreads+1,0,elen,qv);
          complement(elen,rbuffer);
          j = rend;
          rend = rbeg;
          rbeg = j;
        }
      else
        printf(">Sim/%d/%d_%d RQ=0.%d\n",nreads+1,0,elen,qv);

      Lower_Read(rbuffer);
      for (j = 0; j+80 < elen; j += 80)
        printf("%.80s\n",rbuffer+j);
      if (j < elen)
        printf("%s\n",rbuffer+j);

      if (MAP != NULL)
        fprintf(MAP," %9d %9d\n",rbeg,rend);

      totlen += elen;
      nreads += 1;
    }
}

int main(int argc, char *argv[])
{ char  *source;

  //  Usage: [-c] [-b] [-r]
  //         [-m] [-s] [-x]
  //         [-e] [-M 1.)
{ fprintf(stderr,"%s: AT-bias must be in [0,1] (%g)\n",Prog_Name,BIAS); exit (1); } break; case 'r': SEED = strtol(argv[i]+2,&eptr,10); HASR = 1; if (*eptr != '\0' || argv[i][2] == '\0') { fprintf(stderr,"%s: -r argument is not an integer\n",Prog_Name); exit (1); } break; case 'M': MAP = Fopen(argv[i]+2,"w"); if (MAP == NULL) exit (1); break; case 'm': ARG_POSITIVE(RMEAN,"Mean read length") break; case 's': ARG_POSITIVE(RSDEV,"Read length standard deviation") break; case 'x': ARG_NON_NEGATIVE(RSHORT,"Read length minimum") break; case 'e': ARG_REAL(ERROR) if (ERROR < 0. || ERROR > .5) { fprintf(stderr,"%s: Error rate must be in [0,.5] (%g)\n",Prog_Name,ERROR); exit (1); } break; } else argv[j++] = argv[i]; argc = j; if (argc != 2) { fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage[0]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[1]); fprintf(stderr," %*s %s\n",(int) strlen(Prog_Name),"",Usage[2]); exit (1); } glen = strtod(argv[1],&eptr); if (*eptr != '\0') { fprintf(stderr,"%s: genome length is not a real number\n",Prog_Name); exit (1); } if (glen < 0.) { fprintf(stderr,"%s: Genome length must be positive (%g)\n",Prog_Name,glen); exit (1); } GENOME = (int) (glen*1000000.); } source = random_genome(); shotgun(source); if (MAP != NULL) fclose(MAP); exit (0); } pbdagcon-0.3+20161121+ds/DAZZ_DB/LICENSE0000644000175000017500000000531113026414552015144 0ustar afifafif Copyright (c) 2014, Dr. Eugene W. Myers (EWM). All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: · Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. · Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
· The name of EWM may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY EWM ”AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL EWM BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. For any issues regarding this software and its use, contact EWM at: Eugene W. Myers Jr. Bautzner Str. 122e 01099 Dresden GERMANY Email: gene.myers@gmail.com pbdagcon-0.3+20161121+ds/DAZZ_DB/DBrm.c0000644000175000017500000000176613026414552015141 0ustar afifafif/******************************************************************************************** * * Remove a list of .db databases * Delete all the files for the given data bases .db ... (there are a couple * of hidden . files for each DB, and these are removed too.) Do not use "rm" to * remove a database. * * Author: Gene Myers * Date : July 2013 * ********************************************************************************************/ #include #include #include #include #include "DB.h" static char *Usage = " ... 
"; static void HANDLER(char *path, char *name) { (void) name; unlink(path); } int main(int argc, char *argv[]) { int i; Prog_Name = Strdup("DBrm",""); if (argc <= 1) fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); for (i = 1; i < argc; i++) if (List_DB_Files(argv[i],HANDLER) < 0) fprintf(stderr,"%s: Could not list database %s\n",Prog_Name,argv[i]); exit (0); } pbdagcon-0.3+20161121+ds/makefile0000755000175000017500000000155213026414536014532 0ustar afifafif.PHONY: all clean test init-submodule cpp cpp-check cpp-clean THISDIR:=$(dir $(lastword ${MAKEFILE_LIST})) ROOT:=${THISDIR} -include ${CURDIR}/defines.mk SHELL = /bin/bash -e all: cpp clean: cpp-clean check: cpp-check project: init-submodule cpp init-submodule: ${MAKE} update-submodule ${MAKE} build-submodule update-submodule: git submodule update --init build-submodule: cd blasr_libcpp; NOHDF=1 NOPBBAM=1 ./configure.py ${MAKE} -C blasr_libcpp/pbdata libconfig.h ${MAKE} -C blasr_libcpp/pbdata libpbdata.a ${MAKE} -C blasr_libcpp/alignment libblasr.a submodule-clean: ${RM} -r blasr_libcpp cpp: ${MAKE} -C src/cpp cpp-check: cpp ${MAKE} -C test/cpp cpp-clean: ${MAKE} -C src/cpp clean ${MAKE} -C test/cpp clean clean-all: cpp-clean submodule-clean ${RM}r src/cpp/third-party/boost_1_58_0-headersonly ${RM}r test/cpp/googletest-release-1.7.0 pbdagcon-0.3+20161121+ds/configure.py0000755000175000017500000002614113026414536015366 0ustar afifafif#!/usr/bin/env python """Configure the build. - Fetch boost/gtest. 
- Create defines.mk """ import argparse import commands import contextlib import os import sys ROOT = os.path.abspath(os.path.dirname(__file__)) def log(msg): sys.stderr.write(msg) sys.stderr.write('\n') def shell(cmd): log(cmd) status, output = commands.getstatusoutput(cmd) if status: raise Exception('%d <-| %r' %(status, cmd)) return output def system(cmd): log(cmd) status = os.system(cmd) if status: raise Exception('%d <- %r' %(status, cmd)) return def mkdirs(path): if not os.path.isdir(path): os.makedirs(path) @contextlib.contextmanager def cd(nwd): cwd = os.getcwd() log('cd %r -> %r' %(cwd, nwd)) os.chdir(nwd) yield os.chdir(cwd) log('cd %r <- %r' %(cwd, nwd)) def fetch_gtest(build_dir): gtest_version = '1.7.0' gtest_uri = 'https://github.com/google/googletest/archive/release-%s.zip' %gtest_version gdir = os.path.join(build_dir, 'test', 'cpp', 'googletest-release-%s' % gtest_version) if not os.path.isdir(gdir): #mkdirs(gdir) zipfile = gdir + '.zip' if not os.path.isfile(zipfile): get_gtest_cmd = 'curl -L %s --output %s' %(gtest_uri, zipfile) system(get_gtest_cmd) install_gtest_cmd = 'unzip -q %s -d %s' %(zipfile, os.path.join(build_dir, 'test', 'cpp')) system(install_gtest_cmd) assert os.path.isdir(gdir) return gdir def fetch_boost_headers(build_dir): """Fetch into {build_dir}/src/cpp/third-party/ Return actual directory path, relative to subdirs. 
""" uri = 'https://www.dropbox.com/s/g22iayi83p5gbbq/boost_1_58_0-headersonly.tbz2?dl=0' hdir = os.path.join(build_dir, 'src', 'cpp', 'third-party', 'boost_1_58_0-headersonly') if not os.path.isdir(hdir): mkdirs(os.path.dirname(hdir)) #get_boost_cmd = 'curl -L %s | tar xjf -C src/cpp/third-party -' %uri tbz = os.path.join(build_dir, 'src', 'cpp', 'third-party', 'boost_1_58_0-headersonly.tbz2') if not os.path.isfile(tbz): get_boost_cmd = 'curl -L %s --output %s' %(uri, tbz) system(get_boost_cmd) install_boost_cmd = 'tar vxjf %s -C %s/src/cpp/third-party | head' %(tbz, build_dir) system(install_boost_cmd) assert os.path.isdir(hdir) return hdir def update_content(fn, content): current_content = open(fn).read() if os.path.exists(fn) else None if content != current_content: log('writing to %r' %fn) log('"""\n' + content + '"""') open(fn, 'w').write(content) def compose_defines_with_hdf_headers(HDF_HEADERS): thisdir = os.path.dirname(os.path.abspath(__file__)) return """ HDF_HEADERS:=%(HDF_HEADERS)s #HDF5_INCLUDE?=${HDF_HEADERS}/src CPPFLAGS+=-I${HDF_HEADERS}/src -I${HDF_HEADERS}/c++/src CPPFLAGS+=-I../pbdata -I../hdf -I../alignment LIBPBDATA_LIB ?=../pbdata/libpbdata.so LIBPBIHDF_LIB ?=../pbdata/libpbihdf.so LIBBLASR_LIB ?=../pbdata/libblasr.so """%(dict(thisdir=thisdir, HDF_HEADERS=HDF_HEADERS)) def compose_defines(): """ Note that our local 'hdf' subdir will not even build in this case. 
""" thisdir = os.path.dirname(os.path.abspath(__file__)) return """ LIBPBDATA_INCLUDE ?=../pbdata LIBPBIHDF_INCLUDE ?=../hdf LIBBLASR_INCLUDE ?=../alignment LIBPBDATA_LIB ?=%(thisdir)s/pbdata/libpbdata.so LIBPBIHDF_LIB ?=%(thisdir)s/pbdata/libpbihdf.so LIBBLASR_LIB ?=%(thisdir)s/pbdata/libblasr.so nohdf ?=1 """%(dict(thisdir=thisdir)) def ifenvf(env, key, func): if key in env: return env[key] else: return func() def setifenvf(envout, envin, key, func): envout[key] = ifenvf(envin, key, func) def setifenv(envout, envin, key, val): envout[key] = envin.get(key, val) def setenv(envout, key, val): envout[key] = val def update_env_if(envout, envin, keys): for key in keys: if key in envin: envout[key] = envin[key] def compose_defs_env(env): # We disallow env overrides for anything with a default from GNU make. nons = ['CXX', 'CC', 'AR'] # 'SHELL'? ovr = ['%-20s ?= %s' %(k, v) for k,v in sorted(env.items()) if k not in nons] nonovr = ['%-20s := %s' %(k, v) for k,v in sorted(env.items()) if k in nons] return '\n'.join(ovr + nonovr + ['']) def compose_defines_pacbio(envin): """ This is used by mobs via buildcntl.sh. 
""" env = dict() #setifenv(env, envin, 'LIBPBDATA_INCLUDE', '../pbdata') #setifenv(env, envin, 'LIBPBIHDF_INCLUDE', '../hdf') #setifenv(env, envin, 'LIBBLASR_INCLUDE', '../alignment') #setifenv(env, envin, 'LIBPBDATA_LIB', '../pbdata/libpbdata.so') #setifenv(env, envin, 'LIBPBIHDF_LIB', '../hdf/libpbihdf.so') #setifenv(env, envin, 'LIBBLASR_LIB', '../alignment/libblasr.so') #setifenv(env, envin, 'nohdf', '1') possibs = set([ 'CC', 'CXX', 'AR', 'GTEST_INCLUDE', 'GTEST_SRC', 'LIBBLASR_INCLUDE', 'LIBBLASR_LIB', 'LIBBLASR_LIBFLAGS', 'LIBPBDATA_INCLUDE', 'LIBPBDATA_LIB', 'LIBPBDATA_LIBFLAGS', 'LIBPBIHDF_INCLUDE', 'LIBPBIHDF_LIB', 'LIBPBIHDF_LIBFLAGS', 'HDF5_INCLUDE', 'HDF5_LIB', 'HDF5_LIBFLAGS', 'PBBAM_INCLUDE', 'PBBAM_LIB', 'PBBAM_LIBFLAGS', 'HTSLIB_INCLUDE', 'HTSLIB_LIB', 'HTSLIB_LIBFLAGS', 'BOOST_INCLUDE','PTHREAD_LIBFLAGS', 'ZLIB_LIB', 'ZLIB_LIBFLAGS', 'GCC_LIB', 'DAZZ_DB_SRC', 'DAZZ_DB_INCLUDE', 'DALIGNER_SRC', 'DALIGNER_INCLUDE', ]) update_env_if(env, envin, possibs) return compose_defs_env(env) def configure_pacbio(envin, shared, build_dir): content1 = compose_defines_pacbio(envin) if shared: content1 += 'LDLIBS+=-lrt\n' update_content(os.path.join(build_dir, 'defines.mk'), content1) def get_make_style_env(envin, args): envout = dict() for arg in args: if '=' in arg: k, v = arg.split('=') envout[k] = v envout.update(envin) return envout class OsType: Unknown, Linux, Darwin = range(3) def getOsType(): uname = shell('uname -s') log('uname=%r' %uname) if 'Darwin' in uname: return OsType.Darwin elif 'Linux' in uname: return OsType.Linux else: return OsType.Unknown def update_env_for_linux(env): env['SET_LIB_NAME'] = '-soname' env['SH_LIB_EXT'] = '.so' env['EXTRA_LDFLAGS'] = '-Wl,--no-as-needed' def update_env_for_darwin(env): env['SET_LIB_NAME'] = '-install_name' env['SH_LIB_EXT'] = '.dylib' env['EXTRA_LDFLAGS'] = '-flat_namespace' # -flat_namespace makes BSD ld act like Linux ld, finding # shared libs recursively. 
def update_env_for_unknown(env): env['SET_LIB_NAME'] = '-soname' env['SH_LIB_EXT'] = '.so' update_env_for_os = { OsType.Linux: update_env_for_linux, OsType.Darwin: update_env_for_darwin, OsType.Unknown: update_env_for_unknown, } def parse_args(args): parser = argparse.ArgumentParser() parser.add_argument('--boost-headers', action='store_true', help='Download Boost headers.') parser.add_argument('--gtest', action='store_true', help='Download google-test.') parser.add_argument('--no-pbbam', action='store_true', help='Avoid compiling anything which would need pbbam.') parser.add_argument('--submodules', action='store_true', help='Set variables to use our git-submodules, which must be pulled and built first. (Implies --no-pbbam.)') parser.add_argument('--shared', action='store_true', help='Build for dynamic linking.') parser.add_argument('--mode', default='opt', help='debug, opt, profile [default=%(default)s] CURRENTLY IGNORED') parser.add_argument('--build-dir', help='Can be different from source directory, but only when *not* also building submodule.') parser.add_argument('makevars', nargs='*', help='Variables in the style of make: FOO=val1 BAR=val2 etc.') return parser.parse_args(args) def set_defs_defaults(env, nopbbam): defaults = { 'LIBPBDATA_LIBFLAGS': '-lpbdata', 'LIBBLASR_LIBFLAGS': '-lblasr', 'SHELL': 'bash -xe', 'DAZZ_DB_SRC': os.path.join(ROOT, '..', 'DAZZ_DB'), 'DALIGNER_SRC': os.path.join(ROOT, '..', 'DALIGNER'), 'DAZZ_DB_INCLUDE': '${DAZZ_DB_SRC}', 'DALIGNER_INCLUDE': '${DALIGNER_SRC}', 'PTHREAD_LIBFLAGS': '-lpthread', } pbbam_defaults = { 'LIBPBIHDF_LIBFLAGS': '-lpbihdf', 'PBBAM_LIBFLAGS': '-lpbbam', 'HTSLIB_LIBFLAGS': '-lhts', 'HDF5_LIBFLAGS': '-lhdf5_cpp -lhdf5', 'ZLIB_LIBFLAGS': '-lz', 'PTHREAD_LIBFLAGS': '-lpthread', 'DL_LIBFLAGS': '-ldl', # neeeded by HDF5 always } if not nopbbam: defaults.update(pbbam_defaults) for k in defaults: if k not in env: env[k] = defaults[k] def set_defs_submodule_defaults(env, nopbbam): libcpp = os.path.join(ROOT, 
'blasr_libcpp') daligner = os.path.join(ROOT, 'DALIGNER') dazz_db = os.path.join(ROOT, 'DAZZ_DB') defaults = { 'LIBPBDATA_INCLUDE': os.path.join(libcpp, 'pbdata'), 'LIBBLASR_INCLUDE': os.path.join(libcpp, 'alignment'), 'LIBPBIHDF_INCLUDE': '' if nopbbam else os.path.join(libcpp, 'hdf'), 'LIBPBDATA_LIB': os.path.join(libcpp, 'pbdata'), 'LIBBLASR_LIB': os.path.join(libcpp, 'alignment'), 'LIBPBIHDF_LIB': '' if nopbbam else os.path.join(libcpp, 'hdf'), 'DALIGNER_SRC': daligner, 'DAZZ_DB_SRC': dazz_db, } for k in defaults: if k not in env: env[k] = defaults[k] def write_makefile(build_dir_root, src_dir_root, makefilename, relpath): src_dir = os.path.join(src_dir_root, relpath) build_dir = os.path.join(build_dir_root, relpath) content = """\ vpath %%.cpp %(src_dir)s vpath %%.c %(src_dir)s include %(src_dir)s/%(makefilename)s """ %dict(makefilename=makefilename, src_dir=src_dir) mkdirs(build_dir) fn = os.path.join(build_dir, makefilename) update_content(fn, content) def write_makefiles(build_dir): write_makefile(build_dir, ROOT, 'makefile', '.') write_makefile(build_dir, ROOT, 'makefile', 'src/cpp') write_makefile(build_dir, ROOT, 'makefile', 'test/cpp') def main(prog, *args): """We are still deciding what env-vars to use, if any. """ conf = parse_args(args) envin = get_make_style_env(os.environ, conf.makevars) ost = getOsType() update_env_for_os[ost](envin) if conf.build_dir is not None: write_makefiles(conf.build_dir) else: conf.build_dir = '.' 
conf.build_dir = os.path.abspath(conf.build_dir) if conf.boost_headers: envin['BOOST_INCLUDE'] = fetch_boost_headers(conf.build_dir) if conf.gtest: gtest_dir = fetch_gtest(conf.build_dir) envin['GTEST_INCLUDE'] = os.path.join(gtest_dir, 'include') envin['GTEST_SRC'] = os.path.join(gtest_dir, 'src') if conf.submodules: set_defs_submodule_defaults(envin, conf.no_pbbam) conf.no_pbbam = True set_defs_defaults(envin, conf.no_pbbam) configure_pacbio(envin, conf.shared, conf.build_dir) if __name__=="__main__": main(*sys.argv) pbdagcon-0.3+20161121+ds/travis.sh0000755000175000017500000000020313026414537014667 0ustar afifafif#!/usr/bin/env bash set -ex ./configure.py --boost --gtest --sub make -j init-submodule make --debug=b -j make --debug=v -j check pbdagcon-0.3+20161121+ds/src/0000755000175000017500000000000013026414536013613 5ustar afifafifpbdagcon-0.3+20161121+ds/src/tests/0000755000175000017500000000000013026414536014755 5ustar afifafifpbdagcon-0.3+20161121+ds/src/tests/test_aligngraph.py0000755000175000017500000001615513026414536020515 0ustar afifafiffrom nose.tools import assert_equal from nose import SkipTest import random from pbtools.pbdagcon.aligngraph import * def generate_simulated_reads(pi=None, pd=None, n = 4): import random random.seed(42) seq = "ATATTTGGC" seq1 = "ATAGCCGGC" seq2 = "ATACCCGGC" seq3 = "ATATCCGGC" seq4 = "ATATCGGC" if pi == None: pi = 0.03 if pd == None: pd = 0.03 out_seq = [] for i in range(n): c = 0 s = [] if i % 4 == 0: ss = seq1 elif i % 4 == 1: ss = seq2 elif i % 4 == 2: ss = seq3 else: ss = seq4 while 1: if random.uniform(0,1) < pi: s.append(random.choice( ("A","G","C","T") ) ) continue if random.uniform(0,1) < pd: c += 1 continue if c < len(ss): s.append(ss[c]) c += 1 else: break out_seq.append( "".join(s) ) return seq, out_seq class TestPhiCoeff: def test_phi_coeff(self): # assert_equal(expected, phi_coeff(xvec, yvec)) raise SkipTest # TODO: implement your test here class TestConvertMismatches: def test_convert_mismatches(self): 
assert_equal( ('C-AC', 'CG-C'), convert_mismatches("CAC","CGC") ) assert_equal( ('CAACAT', 'CAA--T'), convert_mismatches("CAACAT","C-A-AT" ) ) assert_equal( ('CCG--T', 'CCGACT'), convert_mismatches("-C--CGT","CCGAC-T") ) class TestAlnEdge: def test___init__(self): # aln_edge = AlnEdge(in_node, out_node) raise SkipTest # TODO: implement your test here def test___repr__(self): # aln_edge = AlnEdge(in_node, out_node) # assert_equal(expected, aln_edge.__repr__()) raise SkipTest # TODO: implement your test here def test_add_to_score(self): # aln_edge = AlnEdge(in_node, out_node) # assert_equal(expected, aln_edge.add_to_score(s)) raise SkipTest # TODO: implement your test here def test_increase_count(self): # aln_edge = AlnEdge(in_node, out_node) # assert_equal(expected, aln_edge.increase_count()) raise SkipTest # TODO: implement your test here def test_set_score(self): # aln_edge = AlnEdge(in_node, out_node) # assert_equal(expected, aln_edge.set_score(s)) raise SkipTest # TODO: implement your test here class TestAlnNode: def test___init__(self): # aln_node = AlnNode(base) raise SkipTest # TODO: implement your test here def test___repr__(self): # aln_node = AlnNode(base) # assert_equal(expected, aln_node.__repr__()) raise SkipTest # TODO: implement your test here def test_add_in_edge(self): # aln_node = AlnNode(base) # assert_equal(expected, aln_node.add_in_edge(in_edge)) raise SkipTest # TODO: implement your test here def test_addout_edge(self): # aln_node = AlnNode(base) # assert_equal(expected, aln_node.addout_edge(out_edge)) raise SkipTest # TODO: implement your test here def test_increase_weight(self): # aln_node = AlnNode(base) # assert_equal(expected, aln_node.increase_weight(w)) raise SkipTest # TODO: implement your test here class TestAlnGraph: def test___init__(self): backbone_seq, reads = generate_simulated_reads() aln_graph = AlnGraph(backbone_seq) assert len(aln_graph.nodes) == len(backbone_seq) + 2 def test_add_alignment(self): aln_graph = 
AlnGraph("ATATTAGGC") alns = [((0, 9, 'A-TAGCCGGC'), (2, 9, 'ATTA---GGC')), ((0, 10, 'ATA-TACCGAG-'), (0, 9, 'ATATTA--G-GC')), ((0, 10, 'ATCATCC--GGC'), (0, 9, 'AT-AT--TAGGC')), ((0, 9, 'ATA-TACGGC'), (0, 9, 'ATATTA-GGC'))] for aln in alns: aln_graph.add_alignment( aln ) assert len(aln_graph.nodes) != 0 assert len(aln_graph.edges) != 0 def test_add_edge(self): # aln_graph = AlnGraph(backbone_seq) # assert_equal(expected, aln_graph.add_edge(edge)) raise SkipTest # TODO: implement your test here def test_add_node(self): # aln_graph = AlnGraph(backbone_seq) # assert_equal(expected, aln_graph.add_node(node)) raise SkipTest # TODO: implement your test here def test_delete_edge(self): # aln_graph = AlnGraph(backbone_seq) # assert_equal(expected, aln_graph.delete_edge(edge)) raise SkipTest # TODO: implement your test here def test_delete_node(self): # aln_graph = AlnGraph(backbone_seq) # assert_equal(expected, aln_graph.delete_node(node)) raise SkipTest # TODO: implement your test here def test_find_best_path(self): # aln_graph = AlnGraph(backbone_seq) # assert_equal(expected, aln_graph.find_best_path()) raise SkipTest # TODO: implement your test here def test_generate_consensus(self): # aln_graph = AlnGraph(backbone_seq) # assert_equal(expected, aln_graph.generate_consensus()) raise SkipTest # TODO: implement your test here def test_merge_in_nodes(self): # aln_graph = AlnGraph(backbone_seq) # assert_equal(expected, aln_graph.merge_in_nodes(nodes, node)) raise SkipTest # TODO: implement your test here def test_merge_nodes(self): aln_graph = AlnGraph("ATAATTGGC") alns = [((0, 9, 'ATAG--CTGGC'), (0, 9, 'ATA-AT-TGGC')), ((0, 9, 'ATAG--CTGGC'), (0, 9, 'ATA-AT-TGGC')), ((0, 9, 'ATAG-TTGGC'), (0, 9, 'ATA-ATTGGC')), ((0, 9, 'ATAG-TTGGC'), (0, 9, 'ATA-ATTGGC')), ((0, 9, 'ATAG--CTGGC'), (0, 9, 'ATA-AT-TGGC'))] for aln in alns: aln_graph.add_alignment( aln ) aln_graph.merge_nodes() def test_merge_out_nodes(self): # aln_graph = AlnGraph(backbone_seq) # assert_equal(expected, 
aln_graph.merge_out_nodes(node, nodes)) raise SkipTest # TODO: implement your test here def test_output_consensus_fasta(self): # aln_graph = AlnGraph(backbone_seq) # assert_equal(expected, aln_graph.output_consensus_fasta(fn, rID)) raise SkipTest # TODO: implement your test here def test_track_path(self): # aln_graph = AlnGraph(backbone_seq) # assert_equal(expected, aln_graph.track_path(seq, node)) raise SkipTest # TODO: implement your test here class TestOutputDot: def test_output_dot(self): # assert_equal(expected, output_dot(aln_graph, fn, r)) raise SkipTest # TODO: implement your test here class TestOutputDot2: def test_output_dot_2(self): # assert_equal(expected, output_dot_2(aln_graph, fn)) raise SkipTest # TODO: implement your test here class TestGenerateSimulatedReads: def test_generate_simulated_reads(self): # assert_equal(expected, generate_simulated_reads()) raise SkipTest # TODO: implement your test here class TestSimpleTest: def test_simple_test(self): # assert_equal(expected, simple_test()) raise SkipTest # TODO: implement your test here pbdagcon-0.3+20161121+ds/src/cpp/0000755000175000017500000000000013026414536014375 5ustar afifafifpbdagcon-0.3+20161121+ds/src/cpp/ProgramOpts.hpp0000644000175000017500000000171413026414536017366 0ustar afifafif#pragma once #include #include // Normalizing *slightly* difference cmd line interfaces between pbdagcon and // dazcon. May unify someday ... struct ProgramOpts { // Common to both pbdagcon and dazcon /// Minimum alignment coverage for consensus size_t minCov; /// Minimum consensus length to output size_t minLen; /// Amount to trim alignments by on either side. 
unsigned int trim; /// Number of threads to use int threads; // Specific to pbdagcon bool align; std::string input; // Specific to dazcon /// Path to the alignment file std::string alnFile; /// Path to the sequence file std::string seqFile; /// Maximimum number of hits to include in correction unsigned int maxHits; /// Sort hits by coverage score bool sortCov; /// Use only proper overlaps for correction bool properOvls; /// Limit correction to these targets std::set targets; }; pbdagcon-0.3+20161121+ds/src/cpp/AlnGraphBoost.cpp0000644000175000017500000003563613026414536017621 0ustar afifafif#include #include #include #include #include #include #include #include #include #include #include #include #include "Alignment.hpp" #include "AlnGraphBoost.hpp" AlnGraphBoost::AlnGraphBoost(const std::string& backbone) { // initialize the graph structure with the backbone length + enter/exit // vertex size_t blen = backbone.length(); _g = G(blen+2); for (size_t i = 0; i < blen+1; i++) boost::add_edge(i, i+1, _g); VtxIter curr, last; boost::tie(curr, last) = boost::vertices(_g); _enterVtx = *curr++; _g[_enterVtx].base = '^'; _g[_enterVtx].backbone = true; for (size_t i = 0; i < blen; i++, ++curr) { VtxDesc v = *curr; _g[v].backbone = true; _g[v].weight = 1; _g[v].base = backbone[i]; _bbMap[v] = v; } _exitVtx = *curr; _g[_exitVtx].base = '$'; _g[_exitVtx].backbone = true; } AlnGraphBoost::AlnGraphBoost(const size_t blen) { _g = G(blen+2); for (size_t i = 0; i < blen+1; i++) boost::add_edge(i, i+1, _g); VtxIter curr, last; boost::tie(curr, last) = boost::vertices(_g); _enterVtx = *curr++; _g[_enterVtx].base = '^'; _g[_enterVtx].backbone = true; for (size_t i = 0; i < blen; ++i, ++curr) { VtxDesc v = *curr; _g[v].backbone = true; _g[v].weight = 1; _g[v].deleted = false; _g[v].base = 'N'; _bbMap[v] = v; } _exitVtx = *curr; _g[_exitVtx].base = '$'; _g[_exitVtx].backbone = true; } void AlnGraphBoost::addAln(dagcon::Alignment& aln) { IndexMap index = boost::get(boost::vertex_index, 
_g); // tracks the position on the backbone uint32_t bbPos = aln.start; VtxDesc prevVtx = _enterVtx; for (size_t i = 0; i < aln.qstr.length(); i++) { char queryBase = aln.qstr[i], targetBase = aln.tstr[i]; assert(queryBase != '.'); assert(targetBase != '.'); VtxDesc currVtx = index[bbPos]; // match if (queryBase == targetBase) { _g[_bbMap[currVtx]].coverage++; // NOTE: for empty backbones _g[_bbMap[currVtx]].base = targetBase; _g[currVtx].weight++; addEdge(prevVtx, currVtx); bbPos++; prevVtx = currVtx; // query deletion } else if (queryBase == '-' && targetBase != '-') { _g[_bbMap[currVtx]].coverage++; // NOTE: for empty backbones _g[_bbMap[currVtx]].base = targetBase; bbPos++; // query insertion } else if (queryBase != '-' && targetBase == '-') { // create new node and edge VtxDesc newVtx = boost::add_vertex(_g); _g[newVtx].base = queryBase; _g[newVtx].weight++; _g[newVtx].backbone = false; _g[newVtx].deleted = false; _bbMap[newVtx] = bbPos; addEdge(prevVtx, newVtx); prevVtx = newVtx; } } addEdge(prevVtx, _exitVtx); } void AlnGraphBoost::addEdge(VtxDesc u, VtxDesc v) { // Check if edge exists with prev node. If it does, increment edge counter, // otherwise add a new edge. InEdgeIter ii, ie; bool edgeExists = false; for (boost::tie(ii, ie) = boost::in_edges(v, _g); ii != ie; ++ii) { EdgeDesc e = *ii; if (boost::source(e , _g) == u) { // increment edge count _g[e].count++; edgeExists = true; } } if (! 
edgeExists) { // add new edge std::pair p = boost::add_edge(u, v, _g); _g[p.first].count++; } } void AlnGraphBoost::mergeNodes() { std::queue seedNodes; seedNodes.push(_enterVtx); while(true) { if (seedNodes.size() == 0) break; VtxDesc u = seedNodes.front(); seedNodes.pop(); mergeInNodes(u); mergeOutNodes(u); OutEdgeIter oi, oe; for (boost::tie(oi, oe) = boost::out_edges(u, _g); oi != oe; ++oi) { EdgeDesc e = *oi; _g[e].visited = true; VtxDesc v = boost::target(e, _g); InEdgeIter ii, ie; int notVisited = 0; for (boost::tie(ii, ie) = boost::in_edges(v, _g); ii != ie; ++ii) { if (_g[*ii].visited == false) notVisited++; } // move onto the boost::target node after we visit all incoming edges for // the boost::target node if (notVisited == 0) seedNodes.push(v); } } } void AlnGraphBoost::mergeInNodes(VtxDesc n) { std::map> nodeGroups; InEdgeIter ii, ie; // Group neighboring nodes by base for(boost::tie(ii, ie) = boost::in_edges(n, _g); ii != ie; ++ii) { VtxDesc inNode = boost::source(*ii, _g); if (out_degree(inNode, _g) == 1) { nodeGroups[_g[inNode].base].push_back(inNode); } } // iterate over node groups, merge an accumulate information for(auto kvp = nodeGroups.cbegin(); kvp != nodeGroups.end(); ++kvp) { std::vector nodes = (*kvp).second; if (nodes.size() <= 1) continue; std::vector::const_iterator ni = nodes.cbegin(); VtxDesc an = *ni++; OutEdgeIter anoi, anoe; boost::tie(anoi, anoe) = boost::out_edges(an, _g); // Accumulate out edge information for (; ni != nodes.cend(); ++ni) { OutEdgeIter oi, oe; boost::tie(oi, oe) = boost::out_edges(*ni, _g); _g[*anoi].count += _g[*oi].count; _g[an].weight += _g[*ni].weight; } // Accumulate in edge information, merges nodes ni = nodes.cbegin(); ++ni; for (; ni != nodes.cend(); ++ni) { InEdgeIter ii, ie; VtxDesc n = *ni; for (boost::tie(ii, ie) = boost::in_edges(n, _g); ii != ie; ++ii) { VtxDesc n1 = boost::source(*ii, _g); EdgeDesc e; bool exists; boost::tie(e, exists) = edge(n1, an, _g); if (exists) { _g[e].count += 
_g[*ii].count; } else { std::pair p = boost::add_edge(n1, an, _g); _g[p.first].count = _g[*ii].count; _g[p.first].visited = _g[*ii].visited; } } markForReaper(n); } mergeInNodes(an); } } void AlnGraphBoost::mergeOutNodes(VtxDesc n) { std::map> nodeGroups; OutEdgeIter oi, oe; for(boost::tie(oi, oe) = boost::out_edges(n, _g); oi != oe; ++oi) { VtxDesc outNode = boost::target(*oi, _g); if (in_degree(outNode, _g) == 1) { nodeGroups[_g[outNode].base].push_back(outNode); } } for(auto kvp = nodeGroups.cbegin(); kvp != nodeGroups.end(); ++kvp) { std::vector nodes = (*kvp).second; if (nodes.size() <= 1) continue; std::vector::const_iterator ni = nodes.cbegin(); VtxDesc an = *ni++; InEdgeIter anii, anie; boost::tie(anii, anie) = boost::in_edges(an, _g); // Accumulate inner edge information for (; ni != nodes.cend(); ++ni) { InEdgeIter ii, ie; boost::tie(ii, ie) = boost::in_edges(*ni, _g); _g[*anii].count += _g[*ii].count; _g[an].weight += _g[*ni].weight; } // Accumulate and merge outer edge information ni = nodes.cbegin(); ++ni; for (; ni != nodes.cend(); ++ni) { OutEdgeIter oi, oe; VtxDesc n = *ni; for (boost::tie(oi, oe) = boost::out_edges(n, _g); oi != oe; ++oi) { VtxDesc n2 = boost::target(*oi, _g); EdgeDesc e; bool exists; boost::tie(e, exists) = edge(an, n2, _g); if (exists) { _g[e].count += _g[*oi].count; } else { std::pair p = boost::add_edge(an, n2, _g); _g[p.first].count = _g[*oi].count; _g[p.first].visited = _g[*oi].visited; } } markForReaper(n); } } } void AlnGraphBoost::markForReaper(VtxDesc n) { _g[n].deleted = true; clear_vertex(n, _g); _reaperBag.push_back(n); } void AlnGraphBoost::reapNodes() { int reapCount = 0; std::sort(_reaperBag.begin(), _reaperBag.end()); std::vector::iterator curr = _reaperBag.begin(); for (; curr != _reaperBag.end(); ++curr) { assert(_g[*curr].backbone==false); remove_vertex(*curr-reapCount++, _g); } } const std::string AlnGraphBoost::consensus(int minWeight) { // get the best scoring path std::vector path = bestPath(); // consensus 
sequence std::string cns; // track the longest consensus path meeting minimum weight int offs = 0, bestOffs = 0, length = 0, idx = 0; bool metWeight = false; std::vector::iterator curr = path.begin(); for (; curr != path.end(); ++curr) { AlnNode n = *curr; if (n.base == _g[_enterVtx].base || n.base == _g[_exitVtx].base) continue; cns += n.base; // initial beginning of minimum weight section if (!metWeight && n.weight >= minWeight) { offs = idx; metWeight = true; } else if (metWeight && n.weight < minWeight) { // concluded minimum weight section, update if longest seen so far if ((idx - offs) > length) { bestOffs = offs; length = idx - offs; } metWeight = false; } idx++; } // include end of sequence if (metWeight && (idx - offs) > length) { bestOffs = offs; length = idx - offs; } return cns.substr(bestOffs, length); } void AlnGraphBoost::consensus(std::vector& seqs, int minWeight, size_t minLen) { seqs.clear(); // get the best scoring path std::vector path = bestPath(); // consensus sequence std::string cns; // track the longest consensus path meeting minimum weight int offs = 0, idx = 0; bool metWeight = false; std::vector::iterator curr = path.begin(); for (; curr != path.end(); ++curr) { AlnNode n = *curr; if (n.base == _g[_enterVtx].base || n.base == _g[_exitVtx].base) continue; cns += n.base; // initial beginning of minimum weight section if (!metWeight && n.weight >= minWeight) { offs = idx; metWeight = true; } else if (metWeight && n.weight < minWeight) { // concluded minimum weight section, add sequence to supplied vector metWeight = false; CnsResult result; result.range[0] = offs; result.range[1] = idx; size_t length = idx - offs; result.seq = cns.substr(offs, length); if (length >= minLen) seqs.push_back(result); } idx++; } // include end of sequence if (metWeight) { size_t length = idx - offs; CnsResult result; result.range[0] = offs; result.range[1] = idx; result.seq = cns.substr(offs, length); if (length >= minLen) seqs.push_back(result); } } const 
std::vector AlnGraphBoost::bestPath() { EdgeIter ei, ee; for (boost::tie(ei, ee) = edges(_g); ei != ee; ++ei) _g[*ei].visited = false; std::map bestNodeScoreEdge; std::map nodeScore; std::queue seedNodes; // start at the end and make our way backwards seedNodes.push(_exitVtx); nodeScore[_exitVtx] = 0.0f; while (true) { if (seedNodes.size() == 0) break; VtxDesc n = seedNodes.front(); seedNodes.pop(); bool bestEdgeFound = false; float bestScore = -FLT_MAX; EdgeDesc bestEdgeD = boost::initialized_value; OutEdgeIter oi, oe; for(boost::tie(oi, oe) = boost::out_edges(n, _g); oi != oe; ++oi) { EdgeDesc outEdgeD = *oi; VtxDesc outNodeD = boost::target(outEdgeD, _g); AlnNode outNode = _g[outNodeD]; float newScore, score = nodeScore[outNodeD]; if (outNode.backbone && outNode.weight == 1) { newScore = score - 10.0f; } else { AlnNode bbNode = _g[_bbMap[outNodeD]]; newScore = _g[outEdgeD].count - bbNode.coverage*0.5f + score; } if (newScore > bestScore) { bestScore = newScore; bestEdgeD = outEdgeD; bestEdgeFound = true; } } if (bestEdgeFound) { nodeScore[n]= bestScore; bestNodeScoreEdge[n] = bestEdgeD; } InEdgeIter ii, ie; for (boost::tie(ii, ie) = boost::in_edges(n, _g); ii != ie; ++ii) { EdgeDesc inEdge = *ii; _g[inEdge].visited = true; VtxDesc inNode = boost::source(inEdge, _g); int notVisited = 0; OutEdgeIter oi, oe; for (boost::tie(oi, oe) = boost::out_edges(inNode, _g); oi != oe; ++oi) { if (_g[*oi].visited == false) notVisited++; } // move onto the target node after we visit all incoming edges for // the target node if (notVisited == 0) seedNodes.push(inNode); } } // construct the final best path VtxDesc prev = _enterVtx, next; std::vector bpath; while (true) { bpath.push_back(_g[prev]); if (bestNodeScoreEdge.count(prev) == 0) { break; } else { EdgeDesc bestOutEdge = bestNodeScoreEdge[prev]; _g[prev].bestOutEdge = bestOutEdge; next = boost::target(bestOutEdge, _g); _g[next].bestInEdge = bestOutEdge; prev = next; } } return bpath; } void AlnGraphBoost::printGraph() { 
reapNodes(); boost::write_graphviz(std::cout, _g, make_label_writer(get(&AlnNode::base, _g)), make_label_writer(get(&AlnEdge::count, _g))); } bool AlnGraphBoost::danglingNodes() { VtxIter curr, last; boost::tie(curr, last) = boost::vertices(_g); bool found = false; for (;curr != last; ++curr) { if (_g[*curr].deleted) continue; if (_g[*curr].base == _g[_enterVtx].base || _g[*curr].base == _g[_exitVtx].base) continue; int indeg = out_degree(*curr, _g); int outdeg = in_degree(*curr, _g); if (outdeg > 0 && indeg > 0) continue; found = true; } return found; } AlnGraphBoost::~AlnGraphBoost(){} pbdagcon-0.3+20161121+ds/src/cpp/BlasrM5AlnProvider.hpp0000644000175000017500000000447213026414536020530 0ustar afifafif#pragma once #include "AlnProvider.hpp" /// /// Exceptions thrown by this class /// namespace M5Exception { struct FormatError { std::string msg; FormatError(std::string m) { msg = m; } }; struct SortError {}; struct FileOpenError {}; } /// /// Provides sets of alignments for a given target sequence from a blasr M5 /// file. File may be grouped by target or query. The grouping determines /// which set gets corrected. Earlier, pre-assembly reads were corrected as /// targets. However, we can avoid the sort step if we can correct the reads /// as queries, since blasr groups alignments by query. /// class BlasrM5AlnProvider : public AlnProvider { public: /// Constructs a new alignment provider. Checks the format of the file and /// throws an exception if it's malformed. /// \param fpath Path to the file containing alignments. BlasrM5AlnProvider(const std::string& fpath); /// Constructs a provider based on the given stream. Note that no checks /// are actually made on the validity of the format, caveat emptor. This /// can be used to take a piped stream of alignments straight from blasr. BlasrM5AlnProvider(std::istream* stream); /// Cleans up some stuff. ~BlasrM5AlnProvider(); /// Gets the set of alignments for the next target and puts them into the /// given vector. 
Note this function will clear the contents of the vector /// prior to adding the next set of alignments. /// \param dest reference to a vector to hold the alignments. /// \return True if there are more targets, otherwise false. bool nextTarget(std::vector& dest); /// Same as \fn bool nextTarget(std::vector& dest) except it /// also returns the target sequence we are going to correct. bool nextTarget(std::string& targetSeq, std::vector& dest); /// Called during constructor, checks that the file is formatted correctly. /// Also determines if the input is grouped by query or target. void checkFormat(); private: /// Path to the input file const std::string fpath_; /// State variables std::string currId_; dagcon::Alignment prevAln_; bool firstAln_; /// Represents an input stream to the alignments. std::ifstream fs_; std::istream* is_; }; pbdagcon-0.3+20161121+ds/src/cpp/third-party/0000755000175000017500000000000013026414536016644 5ustar afifafifpbdagcon-0.3+20161121+ds/src/cpp/third-party/easylogging++.h0000755000175000017500000110160113026414536021456 0ustar afifafif// // Easylogging++ v9.80 // Single-header only, cross-platform logging library for C++ applications // // Copyright (c) 2015 muflihun.com // // This library is released under the MIT Licence. 
// http://easylogging.muflihun.com/licence.php // // easylogging@muflihun.com // // https://github.com/easylogging/easyloggingpp // http://easylogging.muflihun.com // http://muflihun.com // #ifndef EASYLOGGINGPP_H #define EASYLOGGINGPP_H // Compilers and C++0x/C++11 Evaluation #if defined(__GNUC__) # define ELPP_COMPILER_GCC 1 # define ELPP_GCC_VERSION (__GNUC__ * 10000 \ + __GNUC_MINOR__ * 100 \ + __GNUC_PATCHLEVEL__) # if defined(__GXX_EXPERIMENTAL_CXX0X__) # define ELPP_CXX0X 1 # elif(ELPP_GCC_VERSION >= 40801) # define ELPP_CXX11 1 # endif // defined(__GXX_EXPERIMENTAL_CXX0X__) #endif // defined(__GNUC__) // Visual C++ #if defined(_MSC_VER) # define ELPP_COMPILER_MSVC 1 # define ELPP_CRT_DBG_WARNINGS 1 # if (_MSC_VER == 1600) # define ELPP_CXX0X 1 # elif(_MSC_VER >= 1700) # define ELPP_CXX11 1 # endif // (_MSC_VER == 1600) #endif // defined(_MSC_VER) // Clang++ #if defined(__clang__) && (__clang__ == 1) # define ELPP_COMPILER_CLANG 1 # define ELPP_CLANG_VERSION (__clang_major__ * 10000 \ + __clang_minor__ * 100 \ + __clang_patchlevel__) # if (ELPP_CLANG_VERSION >= 30300) # define ELPP_CXX11 1 # endif // (ELPP_CLANG_VERSION >= 30300) #endif // defined(__clang__) && (__clang__ == 1) // MinGW #if defined(__MINGW32__) || defined(__MINGW64__) # define ELPP_MINGW 1 #endif // defined(__MINGW32__) || defined(__MINGW64__) // Cygwin #if defined(__CYGWIN__) && (__CYGWIN__ == 1) # define ELPP_CYGWIN 1 #endif // defined(__CYGWIN__) && (__CYGWIN__ == 1) // Intel C++ #if defined(__INTEL_COMPILER) # define ELPP_COMPILER_INTEL 1 #endif // Operating System Evaluation // Windows #if defined(_WIN32) || defined(_WIN64) # define ELPP_OS_WINDOWS 1 #endif // defined(_WIN32) || defined(_WIN64) // Linux #if (defined(__linux) || defined(__linux__)) # define ELPP_OS_LINUX 1 #endif // (defined(__linux) || defined(__linux__)) // Mac #if defined(__APPLE__) # define ELPP_OS_MAC 1 #endif // defined(__APPLE__) // FreeBSD #if defined(__FreeBSD__) # define ELPP_OS_FREEBSD 1 #endif // Solaris #if 
defined(__sun) # define ELPP_OS_SOLARIS 1 #endif // Unix #if ((ELPP_OS_LINUX || ELPP_OS_MAC || ELPP_OS_FREEBSD || ELPP_OS_SOLARIS) && (!ELPP_OS_WINDOWS)) # define ELPP_OS_UNIX 1 #endif // ((ELPP_OS_LINUX || ELPP_OS_MAC || ELPP_OS_FREEBSD || ELPP_OS_SOLARIS) && (!ELPP_OS_WINDOWS)) // Android #if defined(__ANDROID__) # define ELPP_OS_ANDROID 1 #endif // defined(__ANDROID__) // Evaluating Cygwin as *nix OS #if !ELPP_OS_UNIX && !ELPP_OS_WINDOWS && ELPP_CYGWIN # undef ELPP_OS_UNIX # undef ELPP_OS_LINUX # define ELPP_OS_UNIX 1 # define ELPP_OS_LINUX 1 #endif // !ELPP_OS_UNIX && !ELPP_OS_WINDOWS && ELPP_CYGWIN #if !defined(ELPP_INTERNAL_DEBUGGING_OUT_INFO) # define ELPP_INTERNAL_DEBUGGING_OUT_INFO std::cout #endif // !defined(ELPP_INTERNAL_DEBUGGING_OUT) #if !defined(ELPP_INTERNAL_DEBUGGING_OUT_ERROR) # define ELPP_INTERNAL_DEBUGGING_OUT_ERROR std::cerr #endif // !defined(ELPP_INTERNAL_DEBUGGING_OUT) #if !defined(ELPP_INTERNAL_DEBUGGING_ENDL) # define ELPP_INTERNAL_DEBUGGING_ENDL std::endl #endif // !defined(ELPP_INTERNAL_DEBUGGING_OUT) #if !defined(ELPP_INTERNAL_DEBUGGING_MSG) # define ELPP_INTERNAL_DEBUGGING_MSG(msg) msg #endif // !defined(ELPP_INTERNAL_DEBUGGING_OUT) // Internal Assertions and errors #if !defined(ELPP_DISABLE_ASSERT) # if (defined(ELPP_DEBUG_ASSERT_FAILURE)) # define ELPP_ASSERT(expr, msg) if (!(expr)) { \ std::stringstream internalInfoStream; internalInfoStream << msg; \ ELPP_INTERNAL_DEBUGGING_OUT_ERROR \ << "EASYLOGGING++ ASSERTION FAILED (LINE: " << __LINE__ << ") [" #expr << "] WITH MESSAGE \"" \ << ELPP_INTERNAL_DEBUGGING_MSG(internalInfoStream.str()) << "\"" << ELPP_INTERNAL_DEBUGGING_ENDL; base::utils::abort(1, \ "ELPP Assertion failure, please define ELPP_DEBUG_ASSERT_FAILURE"); } # else # define ELPP_ASSERT(expr, msg) if (!(expr)) { \ std::stringstream internalInfoStream; internalInfoStream << msg; \ ELPP_INTERNAL_DEBUGGING_OUT_ERROR\ << "ASSERTION FAILURE FROM EASYLOGGING++ (LINE: " \ << __LINE__ << ") [" #expr << "] WITH MESSAGE \"" << 
// Internal error reporting hook.
//
// With ELPP_DEBUG_ERRORS defined, ELPP_INTERNAL_ERROR(msg, pe) streams `msg`
// to ELPP_INTERNAL_DEBUGGING_OUT_ERROR together with the current line number
// and, when `pe` is true, appends the text produced by
// ELPP_INTERNAL_DEBUGGING_WRITE_PERROR.  A definition supplied before this
// point is kept.
//
// Without ELPP_DEBUG_ERRORS the macro expands to nothing, so its arguments are
// never evaluated.  Any earlier ELPP_INTERNAL_ERROR is removed first so the
// empty definition is not a redefinition; ELPP_INTERNAL_INFO is deliberately
// left alone here because it is configured by its own ELPP_DEBUG_INFO section.
#if defined(ELPP_DEBUG_ERRORS)
#   if !defined(ELPP_INTERNAL_ERROR)
#      define ELPP_INTERNAL_ERROR(msg, pe) { \
       std::stringstream internalInfoStream; internalInfoStream << " " << msg; \
       ELPP_INTERNAL_DEBUGGING_OUT_ERROR \
       << "ERROR FROM EASYLOGGING++ (LINE: " << __LINE__ << ") " \
       << ELPP_INTERNAL_DEBUGGING_MSG(internalInfoStream.str()) << ELPP_INTERNAL_DEBUGGING_ENDL; \
       if (pe) { ELPP_INTERNAL_DEBUGGING_OUT_ERROR << " "; ELPP_INTERNAL_DEBUGGING_WRITE_PERROR; }} (void)0
#   endif
#else
#   undef ELPP_INTERNAL_ERROR
#   define ELPP_INTERNAL_ERROR(msg, pe)
#endif  // defined(ELPP_DEBUG_ERRORS)
// Function macro ELPP_FUNC
//
// Expands to the most descriptive "current function" expression the compiler
// offers.  The fallback uses __func__ directly: it is a predefined identifier,
// not a preprocessor macro, so it cannot be probed with defined() - doing so
// always selected the empty string.
#undef ELPP_FUNC
#if ELPP_COMPILER_MSVC  // Visual C++
#   define ELPP_FUNC __FUNCSIG__
#elif ELPP_COMPILER_GCC  // GCC
#   define ELPP_FUNC __PRETTY_FUNCTION__
#elif ELPP_COMPILER_INTEL  // Intel C++
#   define ELPP_FUNC __PRETTY_FUNCTION__
#elif ELPP_COMPILER_CLANG  // Clang++
#   define ELPP_FUNC __PRETTY_FUNCTION__
#else
#   define ELPP_FUNC __func__
#endif  // ELPP_COMPILER_MSVC
(!defined(ELPP_DISABLE_TRACE_LOGS) && (ELPP_LOGGING_ENABLED)) # define ELPP_TRACE_LOG 1 #else # define ELPP_TRACE_LOG 0 #endif // (!defined(ELPP_DISABLE_TRACE_LOGS) && (ELPP_LOGGING_ENABLED)) #if (!defined(ELPP_DISABLE_VERBOSE_LOGS) && (ELPP_LOGGING_ENABLED)) # define ELPP_VERBOSE_LOG 1 #else # define ELPP_VERBOSE_LOG 0 #endif // (!defined(ELPP_DISABLE_VERBOSE_LOGS) && (ELPP_LOGGING_ENABLED)) #if (!(ELPP_CXX0X || ELPP_CXX11)) # error "Easylogging++ 9.0+ is only compatible with C++0x (or higher) compliant compiler" #endif // (!(ELPP_CXX0X || ELPP_CXX11)) // Headers #if defined(ELPP_SYSLOG) # include #endif // defined(ELPP_SYSLOG) #include #include #include #include #include #include #include #include #if defined(ELPP_UNICODE) # include # if ELPP_OS_WINDOWS # include # endif // ELPP_OS_WINDOWS #endif // defined(ELPP_UNICODE) #if ELPP_STACKTRACE # include # include #endif // ELPP_STACKTRACE #if ELPP_OS_ANDROID # include #endif // ELPP_OS_ANDROID #if ELPP_OS_UNIX # include # include #elif ELPP_OS_WINDOWS # include # include # if defined(WIN32_LEAN_AND_MEAN) # if defined(ELPP_WINSOCK2) # include # else # include # endif // defined(ELPP_WINSOCK2) # endif // defined(WIN32_LEAN_AND_MEAN) #endif // ELPP_OS_UNIX #include #include #include #include #include #include #include #include #include #include #include #if ELPP_THREADING_ENABLED # if ELPP_USE_STD_THREADING # include # include # else # if ELPP_OS_UNIX # include # endif // ELPP_OS_UNIX # endif // ELPP_USE_STD_THREADING #endif // ELPP_THREADING_ENABLED #if ELPP_ASYNC_LOGGING # include # include # include #endif // ELPP_ASYNC_LOGGING #if defined(ELPP_STL_LOGGING) // For logging STL based templates # include # include # include # include # include # include # if defined(ELPP_LOG_STD_ARRAY) # include # endif // defined(ELPP_LOG_STD_ARRAY) # if defined(ELPP_LOG_UNORDERED_MAP) # include # endif // defined(ELPP_LOG_UNORDERED_MAP) # if defined(ELPP_LOG_UNORDERED_SET) # include # endif // defined(ELPP_UNORDERED_SET) #endif // 
defined(ELPP_STL_LOGGING) #if defined(ELPP_QT_LOGGING) // For logging Qt based classes & templates # include # include # include # include # include # include # include # include # include # include # include # include #endif // defined(ELPP_QT_LOGGING) #if defined(ELPP_BOOST_LOGGING) // For logging boost based classes & templates # include # include # include # include # include # include # include # include #endif // defined(ELPP_BOOST_LOGGING) #if defined(ELPP_WXWIDGETS_LOGGING) // For logging wxWidgets based classes & templates # include #endif // defined(ELPP_WXWIDGETS_LOGGING) // Forward declarations namespace el { class Logger; class LogMessage; class PerformanceTrackingData; class Loggers; class Helpers; template class Callback; class LogDispatchCallback; class PerformanceTrackingCallback; class LogDispatchData; namespace base { class Storage; class RegisteredLoggers; class PerformanceTracker; class MessageBuilder; class Writer; class PErrorWriter; class LogDispatcher; class DefaultLogBuilder; class DefaultLogDispatchCallback; #if ELPP_ASYNC_LOGGING class AsyncLogDispatchCallback; class AsyncDispatchWorker; #endif // ELPP_ASYNC_LOGGING class DefaultPerformanceTrackingCallback; } // namespace base } // namespace el /// @brief Easylogging++ entry namespace namespace el { /// @brief Namespace containing base/internal functionality used by Easylogging++ namespace base { /// @brief Data types used by Easylogging++ namespace type { #undef ELPP_LITERAL #undef ELPP_STRLEN #undef ELPP_COUT #if defined(ELPP_UNICODE) # define ELPP_LITERAL(txt) L##txt # define ELPP_STRLEN wcslen # if defined ELPP_CUSTOM_COUT # define ELPP_COUT ELPP_CUSTOM_COUT # else # define ELPP_COUT std::wcout # endif // defined ELPP_CUSTOM_COUT typedef wchar_t char_t; typedef std::wstring string_t; typedef std::wstringstream stringstream_t; typedef std::wfstream fstream_t; typedef std::wostream ostream_t; #else # define ELPP_LITERAL(txt) txt # define ELPP_STRLEN strlen # if defined ELPP_CUSTOM_COUT 
/// @brief Internal helper base that forbids copying of the deriving class
///
/// @detail The copy constructor and copy assignment operator are declared privately and never
/// defined; only default construction (from a derived class) is permitted. Inherit privately.
class NoCopy {
 private:
    NoCopy& operator=(const NoCopy&);
    NoCopy(const NoCopy&);
 protected:
    NoCopy() {}
};
/// @brief Internal helper base whose default constructor, copy constructor and copy assignment
/// are all private.
///
/// @detail A class deriving from it cannot be instantiated unless it declares an explicit
/// constructor of its own, which makes it usable as a purely static class. Inherit privately.
class StaticClass {
 private:
    StaticClass& operator=(const StaticClass&);
    StaticClass(const StaticClass&);
    StaticClass();
};
Trace = 2, /// @brief Informational events most useful for developers to debug application Debug = 4, /// @brief Severe error information that will presumably abort application Fatal = 8, /// @brief Information representing errors in application but application will keep running Error = 16, /// @brief Useful when application has potentially harmful situtaions Warning = 32, /// @brief Information that can be highly useful and vary with verbose logging level. Verbose = 64, /// @brief Mainly useful to represent current progress of application Info = 128, /// @brief Represents unknown level Unknown = 1010 }; /// @brief Static class that contains helper functions for el::Level class LevelHelper : base::StaticClass { public: /// @brief Represents minimum valid level. Useful when iterating through enum. static const base::type::EnumType kMinValid = static_cast(Level::Trace); /// @brief Represents maximum valid level. This is used internally and you should not need it. static const base::type::EnumType kMaxValid = static_cast(Level::Info); /// @brief Casts level to int, useful for iterating through enum. static base::type::EnumType castToInt(Level level) { return static_cast(level); } /// @brief Casts int(ushort) to level, useful for iterating through enum. static Level castFromInt(base::type::EnumType l) { return static_cast(l); } /// @brief Converts level to associated const char* /// @return Upper case string based level. static const char* convertToString(Level level) { // Do not use switch over strongly typed enums because Intel C++ compilers dont support them yet. 
if (level == Level::Global) return "GLOBAL"; if (level == Level::Debug) return "DEBUG"; if (level == Level::Info) return "INFO"; if (level == Level::Warning) return "WARNING"; if (level == Level::Error) return "ERROR"; if (level == Level::Fatal) return "FATAL"; if (level == Level::Verbose) return "VERBOSE"; if (level == Level::Trace) return "TRACE"; return "UNKNOWN"; } /// @brief Converts from levelStr to Level /// @param levelStr Upper case string based level. /// Lower case is also valid but providing upper case is recommended. static Level convertFromString(const char* levelStr) { if ((strcmp(levelStr, "GLOBAL") == 0) || (strcmp(levelStr, "global") == 0)) return Level::Global; if ((strcmp(levelStr, "DEBUG") == 0) || (strcmp(levelStr, "debug") == 0)) return Level::Debug; if ((strcmp(levelStr, "INFO") == 0) || (strcmp(levelStr, "info") == 0)) return Level::Info; if ((strcmp(levelStr, "WARNING") == 0) || (strcmp(levelStr, "warning") == 0)) return Level::Warning; if ((strcmp(levelStr, "ERROR") == 0) || (strcmp(levelStr, "error") == 0)) return Level::Error; if ((strcmp(levelStr, "FATAL") == 0) || (strcmp(levelStr, "fatal") == 0)) return Level::Fatal; if ((strcmp(levelStr, "VERBOSE") == 0) || (strcmp(levelStr, "verbose") == 0)) return Level::Verbose; if ((strcmp(levelStr, "TRACE") == 0) || (strcmp(levelStr, "trace") == 0)) return Level::Trace; return Level::Unknown; } /// @brief Applies specified function to each level starting from startIndex /// @param startIndex initial value to start the iteration from. This is passed as pointer and /// is left-shifted so this can be used inside function (fn) to represent current level. /// @param fn function to apply with each level. This bool represent whether or not to stop iterating through levels. 
/// @brief Applies specified function to each level starting from startIndex
/// @param startIndex initial value to start the iteration from. This is passed as pointer and
///        is left-shifted after every call, so it can be read inside fn to identify the current level.
/// @param fn function to apply with each level. Returning true stops the iteration.
///
/// @detail fn is always invoked at least once. Iteration ends when fn returns true, when the
/// shifted index exceeds LevelHelper::kMaxValid, or when the index is zero after the shift
/// (a zero index can never grow by shifting, so continuing would never terminate).
static inline void forEachLevel(base::type::EnumType* startIndex, const std::function<bool(void)>& fn) {
    const base::type::EnumType lIndexMax = LevelHelper::kMaxValid;
    do {
        if (fn()) {
            break;
        }
        *startIndex = static_cast<base::type::EnumType>(*startIndex << 1);
    } while (*startIndex != 0 && *startIndex <= lIndexMax);
}
static const base::type::EnumType kMinValid = static_cast(ConfigurationType::Enabled); /// @brief Represents maximum valid configuration type. This is used internally and you should not need it. static const base::type::EnumType kMaxValid = static_cast(ConfigurationType::MaxLogFileSize); /// @brief Casts configuration type to int, useful for iterating through enum. static base::type::EnumType castToInt(ConfigurationType configurationType) { return static_cast(configurationType); } /// @brief Casts int(ushort) to configurationt type, useful for iterating through enum. static ConfigurationType castFromInt(base::type::EnumType c) { return static_cast(c); } /// @brief Converts configuration type to associated const char* /// @returns Upper case string based configuration type. static const char* convertToString(ConfigurationType configurationType) { // Do not use switch over strongly typed enums because Intel C++ compilers dont support them yet. if (configurationType == ConfigurationType::Enabled) return "ENABLED"; if (configurationType == ConfigurationType::Filename) return "FILENAME"; if (configurationType == ConfigurationType::Format) return "FORMAT"; if (configurationType == ConfigurationType::ToFile) return "TO_FILE"; if (configurationType == ConfigurationType::ToStandardOutput) return "TO_STANDARD_OUTPUT"; if (configurationType == ConfigurationType::MillisecondsWidth) return "MILLISECONDS_WIDTH"; if (configurationType == ConfigurationType::PerformanceTracking) return "PERFORMANCE_TRACKING"; if (configurationType == ConfigurationType::MaxLogFileSize) return "MAX_LOG_FILE_SIZE"; if (configurationType == ConfigurationType::LogFlushThreshold) return "LOG_FLUSH_THRESHOLD"; return "UNKNOWN"; } /// @brief Converts from configStr to ConfigurationType /// @param configStr Upper case string based configuration type. /// Lower case is also valid but providing upper case is recommended. 
/// @brief Converts from configStr to ConfigurationType
/// @param configStr Upper case string based configuration type.
///        Lower case is also valid but providing upper case is recommended.
///        A null pointer is treated like any other unrecognised name.
/// @return The matching configuration type, or ConfigurationType::Unknown when nothing matches.
static ConfigurationType convertFromString(const char* configStr) {
    // Passing a null pointer to strcmp() is undefined behaviour; reject it up front.
    if (!configStr) {
        return ConfigurationType::Unknown;
    }
    // One row per configuration type: both accepted spellings and the value they map to.
    static const struct {
        const char* upper;
        const char* lower;
        ConfigurationType type;
    } kTypeNames[] = {
        { "ENABLED",              "enabled",              ConfigurationType::Enabled },
        { "TO_FILE",              "to_file",              ConfigurationType::ToFile },
        { "TO_STANDARD_OUTPUT",   "to_standard_output",   ConfigurationType::ToStandardOutput },
        { "FORMAT",               "format",               ConfigurationType::Format },
        { "FILENAME",             "filename",             ConfigurationType::Filename },
        { "MILLISECONDS_WIDTH",   "milliseconds_width",   ConfigurationType::MillisecondsWidth },
        { "PERFORMANCE_TRACKING", "performance_tracking", ConfigurationType::PerformanceTracking },
        { "MAX_LOG_FILE_SIZE",    "max_log_file_size",    ConfigurationType::MaxLogFileSize },
        { "LOG_FLUSH_THRESHOLD",  "log_flush_threshold",  ConfigurationType::LogFlushThreshold }
    };
    for (std::size_t i = 0; i < sizeof(kTypeNames) / sizeof(kTypeNames[0]); ++i) {
        if ((strcmp(configStr, kTypeNames[i].upper) == 0) || (strcmp(configStr, kTypeNames[i].lower) == 0)) {
            return kTypeNames[i].type;
        }
    }
    return ConfigurationType::Unknown;
}
/// @brief Applies specified function to each configuration type starting from startIndex
/// @param startIndex initial value to start the iteration from. This is passed by pointer and is
///        left-shifted after every call, so it can be read inside fn to identify the current type.
/// @param fn function to apply with each configuration type. Returning true stops the iteration.
///
/// @detail fn is always invoked at least once. Iteration ends when fn returns true, when the
/// shifted index exceeds ConfigurationTypeHelper::kMaxValid, or when the index is zero after the
/// shift (a zero index can never grow by shifting, so continuing would never terminate).
// NOTE(review): kMaxValid is ConfigurationType::MaxLogFileSize (128), so LogFlushThreshold (256)
// is never visited by this loop - confirm that is intended.
static inline void forEachConfigType(base::type::EnumType* startIndex, const std::function<bool(void)>& fn) {
    const base::type::EnumType cIndexMax = ConfigurationTypeHelper::kMaxValid;
    do {
        if (fn()) {
            break;
        }
        *startIndex = static_cast<base::type::EnumType>(*startIndex << 1);
    } while (*startIndex != 0 && *startIndex <= cIndexMax);
}
namespace consts { // Level log values - These are values that are replaced in place of %level format specifier static const base::type::char_t* kInfoLevelLogValue = ELPP_LITERAL("INFO "); static const base::type::char_t* kDebugLevelLogValue = ELPP_LITERAL("DEBUG"); static const base::type::char_t* kWarningLevelLogValue = ELPP_LITERAL("WARN "); static const base::type::char_t* kErrorLevelLogValue = ELPP_LITERAL("ERROR"); static const base::type::char_t* kFatalLevelLogValue = ELPP_LITERAL("FATAL"); static const base::type::char_t* kVerboseLevelLogValue = ELPP_LITERAL("VER"); static const base::type::char_t* kTraceLevelLogValue = ELPP_LITERAL("TRACE"); static const base::type::char_t* kInfoLevelShortLogValue = ELPP_LITERAL("I"); static const base::type::char_t* kDebugLevelShortLogValue = ELPP_LITERAL("D"); static const base::type::char_t* kWarningLevelShortLogValue = ELPP_LITERAL("W"); static const base::type::char_t* kErrorLevelShortLogValue = ELPP_LITERAL("E"); static const base::type::char_t* kFatalLevelShortLogValue = ELPP_LITERAL("F"); static const base::type::char_t* kVerboseLevelShortLogValue = ELPP_LITERAL("V"); static const base::type::char_t* kTraceLevelShortLogValue = ELPP_LITERAL("T"); // Format specifiers - These are used to define log format static const base::type::char_t* kAppNameFormatSpecifier = ELPP_LITERAL("%app"); static const base::type::char_t* kLoggerIdFormatSpecifier = ELPP_LITERAL("%logger"); static const base::type::char_t* kThreadIdFormatSpecifier = ELPP_LITERAL("%thread"); static const base::type::char_t* kSeverityLevelFormatSpecifier = ELPP_LITERAL("%level"); static const base::type::char_t* kSeverityLevelShortFormatSpecifier = ELPP_LITERAL("%levshort"); static const base::type::char_t* kDateTimeFormatSpecifier = ELPP_LITERAL("%datetime"); static const base::type::char_t* kLogFileFormatSpecifier = ELPP_LITERAL("%file"); static const base::type::char_t* kLogFileBaseFormatSpecifier = ELPP_LITERAL("%fbase"); static const base::type::char_t* 
// Full month names in calendar order (the fourth entry previously read "Apri").
static const char* kMonths[12] = { "January", "February", "March", "April", "May", "June", "July", "August",
    "September", "October", "November", "December" };
static const char* kUnknownUser = "user"; static const char* kUnknownHost = "unknown-host"; #if defined(ELPP_DEFAULT_LOG_FILE) static const char* kDefaultLogFile = ELPP_DEFAULT_LOG_FILE; #else # if ELPP_OS_UNIX # if ELPP_OS_ANDROID static const char* kDefaultLogFile = "logs/myeasylog.log"; # else static const char* kDefaultLogFile = "logs/myeasylog.log"; # endif // ELPP_OS_ANDROID # elif ELPP_OS_WINDOWS static const char* kDefaultLogFile = "logs\\myeasylog.log"; # endif // ELPP_OS_UNIX #endif // defined(ELPP_DEFAULT_LOG_FILE) #if !defined(ELPP_DISABLE_LOG_FILE_FROM_ARG) static const char* kDefaultLogFileParam = "--default-log-file"; #endif // !defined(ELPP_DISABLE_LOG_FILE_FROM_ARG) #if defined(ELPP_LOGGING_FLAGS_FROM_ARG) static const char* kLoggingFlagsParam = "--logging-flags"; #endif // defined(ELPP_LOGGING_FLAGS_FROM_ARG) #if ELPP_OS_WINDOWS static const char* kFilePathSeperator = "\\"; #else static const char* kFilePathSeperator = "/"; #endif // ELPP_OS_WINDOWS static const char* kValidLoggerIdSymbols = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._"; static const char* kConfigurationComment = "##"; static const char* kConfigurationLevel = "*"; static const char* kConfigurationLoggerId = "--"; static const std::size_t kSourceFilenameMaxLength = 100; static const std::size_t kSourceLineMaxLength = 10; static const Level kPerformanceTrackerDefaultLevel = Level::Info; const struct { double value; const base::type::char_t* unit; } kTimeFormats[] = { { 1000.0f, ELPP_LITERAL("mis") }, { 1000.0f, ELPP_LITERAL("ms") }, { 60.0f, ELPP_LITERAL("seconds") }, { 60.0f, ELPP_LITERAL("minutes") }, { 24.0f, ELPP_LITERAL("hours") }, { 7.0f, ELPP_LITERAL("days") } }; static const int kTimeFormatsCount = sizeof(kTimeFormats) / sizeof(kTimeFormats[0]); const struct { int numb; const char* name; const char* brief; const char* detail; } kCrashSignals[] = { // NOTE: Do not re-order, if you do please check CrashHandler(bool) constructor and 
CrashHandler::setHandler(..) { SIGABRT, "SIGABRT", "Abnormal termination", "Program was abnormally terminated." }, { SIGFPE, "SIGFPE", "Erroneous arithmetic operation", "Arithemetic operation issue such as division by zero or operation resulting in overflow." }, { SIGILL, "SIGILL", "Illegal instruction", "Generally due to a corruption in the code or to an attempt to execute data."}, { SIGSEGV, "SIGSEGV", "Invalid access to memory", "Program is trying to read an invalid (unallocated, deleted or corrupted) or inaccessible memory." }, { SIGINT, "SIGINT", "Interactive attention signal", "Interruption generated (generally) by user or operating system." }, }; static const int kCrashSignalsCount = sizeof(kCrashSignals) / sizeof(kCrashSignals[0]); } // namespace consts } // namespace base typedef std::function PreRollOutCallback; namespace base { static inline void defaultPreRollOutCallback(const char*, std::size_t) {} /// @brief Enum to represent timestamp unit enum class TimestampUnit : base::type::EnumType { Microsecond = 0, Millisecond = 1, Second = 2, Minute = 3, Hour = 4, Day = 5 }; /// @brief Format flags used to determine specifiers that are active for performance improvements. 
enum class FormatFlags : base::type::EnumType { DateTime = 1<<1, LoggerId = 1<<2, File = 1<<3, Line = 1<<4, Location = 1<<5, Function = 1<<6, User = 1<<7, Host = 1<<8, LogMessage = 1<<9, VerboseLevel = 1<<10, AppName = 1<<11, ThreadId = 1<<12, Level = 1<<13, FileBase = 1<<14, LevelShort = 1<<15 }; /// @brief A milliseconds width class containing actual width and offset for date/time class MillisecondsWidth { public: MillisecondsWidth(void) { init(base::consts::kDefaultMillisecondsWidth); } explicit MillisecondsWidth(int width) { init(width); } bool operator==(const MillisecondsWidth& msWidth) { return m_width == msWidth.m_width && m_offset == msWidth.m_offset; } int m_width; unsigned int m_offset; private: void init(int width) { if (width < 1 || width > 6) { width = base::consts::kDefaultMillisecondsWidth; } m_width = width; switch (m_width) { case 3: m_offset = 1000; break; case 4: m_offset = 100; break; case 5: m_offset = 10; break; case 6: m_offset = 1; break; default: m_offset = 1000; break; } } }; /// @brief Namespace containing utility functions/static classes used internally namespace utils { /// @brief Deletes memory safely and points to null template static inline typename std::enable_if::value, void>::type safeDelete(T*& pointer) { if (pointer == nullptr) return; delete pointer; pointer = nullptr; } /// @brief Gets value of const char* but if it is nullptr, a string nullptr is returned static inline const char* charPtrVal(const char* pointer) { return pointer == nullptr ? 
base::consts::kNullPointer : pointer; } /// @brief Aborts application due with user-defined status static inline void abort(int status, const std::string& reason = std::string()) { // Both status and reason params are there for debugging with tools like gdb etc ELPP_UNUSED(status); ELPP_UNUSED(reason); #if defined(ELPP_COMPILER_MSVC) && defined(_M_IX86) && defined(_DEBUG) // Ignore msvc critical error dialog - break instead (on debug mode) _asm int 3 #else ::abort(); #endif // defined(ELPP_COMPILER_MSVC) && defined(_M_IX86) && defined(_DEBUG) } /// @brief Bitwise operations for C++11 strong enum class. This casts e into Flag_T and returns value after bitwise operation /// Use these function as
flag = bitwise::Or(MyEnum::val1, flag);
namespace bitwise { template static inline base::type::EnumType And(Enum e, base::type::EnumType flag) { return static_cast(flag) & static_cast(e); } template static inline base::type::EnumType Not(Enum e, base::type::EnumType flag) { return static_cast(flag) & ~(static_cast(e)); } template static inline base::type::EnumType Or(Enum e, base::type::EnumType flag) { return static_cast(flag) | static_cast(e); } } // namespace bitwise template static inline void addFlag(Enum e, base::type::EnumType* flag) { *flag = base::utils::bitwise::Or(e, *flag); } template static inline void removeFlag(Enum e, base::type::EnumType* flag) { *flag = base::utils::bitwise::Not(e, *flag); } template static inline bool hasFlag(Enum e, base::type::EnumType flag) { return base::utils::bitwise::And(e, flag) > 0x0; } } // namespace utils namespace threading { #if ELPP_THREADING_ENABLED # if !ELPP_USE_STD_THREADING namespace internal { /// @brief A mutex wrapper for compiler that dont yet support std::mutex class Mutex : base::NoCopy { public: Mutex(void) { # if ELPP_OS_UNIX pthread_mutex_init(&m_underlyingMutex, nullptr); # elif ELPP_OS_WINDOWS InitializeCriticalSection(&m_underlyingMutex); # endif // ELPP_OS_UNIX } virtual ~Mutex(void) { # if ELPP_OS_UNIX pthread_mutex_destroy(&m_underlyingMutex); # elif ELPP_OS_WINDOWS DeleteCriticalSection(&m_underlyingMutex); # endif // ELPP_OS_UNIX } inline void lock(void) { # if ELPP_OS_UNIX pthread_mutex_lock(&m_underlyingMutex); # elif ELPP_OS_WINDOWS EnterCriticalSection(&m_underlyingMutex); # endif // ELPP_OS_UNIX } inline bool try_lock(void) { # if ELPP_OS_UNIX return (pthread_mutex_trylock(&m_underlyingMutex) == 0); # elif ELPP_OS_WINDOWS return TryEnterCriticalSection(&m_underlyingMutex); # endif // ELPP_OS_UNIX } inline void unlock(void) { # if ELPP_OS_UNIX pthread_mutex_unlock(&m_underlyingMutex); # elif ELPP_OS_WINDOWS LeaveCriticalSection(&m_underlyingMutex); # endif // ELPP_OS_UNIX } private: # if ELPP_OS_UNIX pthread_mutex_t 
m_underlyingMutex; # elif ELPP_OS_WINDOWS CRITICAL_SECTION m_underlyingMutex; # endif // ELPP_OS_UNIX }; /// @brief Scoped lock for compiler that dont yet support std::lock_guard template class ScopedLock : base::NoCopy { public: explicit ScopedLock(M& mutex) { m_mutex = &mutex; m_mutex->lock(); } virtual ~ScopedLock(void) { m_mutex->unlock(); } private: M* m_mutex; ScopedLock(void); }; } // namespace internal /// @brief Gets ID of currently running threading in windows systems. On unix, nothing is returned. static inline std::string getCurrentThreadId(void) { std::stringstream ss; # if (ELPP_OS_WINDOWS) ss << GetCurrentThreadId(); # endif // (ELPP_OS_WINDOWS) return ss.str(); } static inline void msleep(int) { // No implementation for non std::thread version } typedef base::threading::internal::Mutex Mutex; typedef base::threading::internal::ScopedLock ScopedLock; # else /// @brief Gets ID of currently running threading using std::this_thread::get_id() static inline std::string getCurrentThreadId(void) { std::stringstream ss; ss << std::this_thread::get_id(); return ss.str(); } static inline void msleep(int ms) { // Only when async logging enabled - this is because async is strict on compiler #if ELPP_ASYNC_LOGGING std::this_thread::sleep_for(std::chrono::milliseconds(ms)); #endif // ELPP_ASYNC_LOGGING } typedef std::mutex Mutex; typedef std::lock_guard ScopedLock; # endif // !ELPP_USE_STD_THREADING #else namespace internal { /// @brief Mutex wrapper used when multi-threading is disabled. class NoMutex : base::NoCopy { public: NoMutex(void) {} inline void lock(void) {} inline bool try_lock(void) { return true; } inline void unlock(void) {} }; /// @brief Lock guard wrapper used when multi-threading is disabled. 
template class NoScopedLock : base::NoCopy { public: explicit NoScopedLock(Mutex&) { } virtual ~NoScopedLock(void) { } private: NoScopedLock(void); }; } // namespace internal static inline std::string getCurrentThreadId(void) { return std::string(); } static inline void msleep(int) { // No custom implementation } typedef base::threading::internal::NoMutex Mutex; typedef base::threading::internal::NoScopedLock ScopedLock; #endif // ELPP_THREADING_ENABLED /// @brief Base of thread safe class, this class is inheritable-only class ThreadSafe { public: virtual inline void acquireLock(void) ELPP_FINAL { m_mutex.lock(); } virtual inline void releaseLock(void) ELPP_FINAL { m_mutex.unlock(); } virtual inline base::threading::Mutex& lock(void) ELPP_FINAL { return m_mutex; } protected: ThreadSafe(void) {} virtual ~ThreadSafe(void) {} private: base::threading::Mutex m_mutex; }; } // namespace threading namespace utils { class File : base::StaticClass { public: /// @brief Creates new out file stream for specified filename. 
/// @return Pointer to newly created fstream or nullptr static base::type::fstream_t* newFileStream(const std::string& filename) { base::type::fstream_t *fs = new base::type::fstream_t(filename.c_str(), base::type::fstream_t::out | base::type::fstream_t::app); #if defined(ELPP_UNICODE) std::locale elppUnicodeLocale(""); #if ELPP_OS_WINDOWS std::locale elppUnicodeLocaleWindows(elppUnicodeLocale, new std::codecvt_utf8_utf16); elppUnicodeLocale = elppUnicodeLocaleWindows; #endif fs->imbue(elppUnicodeLocale); #endif // defined(ELPP_UNICODE) if (fs->is_open()) { fs->flush(); } else { base::utils::safeDelete(fs); ELPP_INTERNAL_ERROR("Bad file [" << filename << "]", true); } return fs; } /// @brief Gets size of file provided in stream static std::size_t getSizeOfFile(base::type::fstream_t* fs) { if (fs == nullptr) { return 0; } std::streampos currPos = fs->tellg(); fs->seekg(0, fs->end); std::size_t size = static_cast(fs->tellg()); fs->seekg(currPos); return size; } /// @brief Determines whether or not provided path exist in current file system static inline bool pathExists(const char* path, bool considerFile = false) { if (path == nullptr) { return false; } #if ELPP_OS_UNIX ELPP_UNUSED(considerFile); struct stat st; return (stat(path, &st) == 0); #elif ELPP_OS_WINDOWS DWORD fileType = GetFileAttributesA(path); if (fileType == INVALID_FILE_ATTRIBUTES) { return false; } return considerFile ? true : ((fileType & FILE_ATTRIBUTE_DIRECTORY) == 0 ? false : true); #endif // ELPP_OS_UNIX } /// @brief Creates specified path on file system /// @param path Path to create. 
static bool createPath(const std::string& path) { if (path.empty()) { return false; } if (base::utils::File::pathExists(path.c_str())) { return true; } int status = -1; char* currPath = const_cast(path.c_str()); std::string builtPath = std::string(); #if ELPP_OS_UNIX if (path[0] == '/') { builtPath = "/"; } currPath = STRTOK(currPath, base::consts::kFilePathSeperator, 0); #elif ELPP_OS_WINDOWS // Use secure functions API char* nextTok_ = nullptr; currPath = STRTOK(currPath, base::consts::kFilePathSeperator, &nextTok_); ELPP_UNUSED(nextTok_); #endif // ELPP_OS_UNIX while (currPath != nullptr) { builtPath.append(currPath); builtPath.append(base::consts::kFilePathSeperator); #if ELPP_OS_UNIX status = mkdir(builtPath.c_str(), ELPP_LOG_PERMS); currPath = STRTOK(nullptr, base::consts::kFilePathSeperator, 0); #elif ELPP_OS_WINDOWS status = _mkdir(builtPath.c_str()); currPath = STRTOK(nullptr, base::consts::kFilePathSeperator, &nextTok_); #endif // ELPP_OS_UNIX } if (status == -1) { ELPP_INTERNAL_ERROR("Error while creating path [" << path << "]", true); return false; } return true; } /// @brief Extracts path of filename with leading slash static std::string extractPathFromFilename(const std::string& fullPath, const char* seperator = base::consts::kFilePathSeperator) { if ((fullPath == "") || (fullPath.find(seperator) == std::string::npos)) { return fullPath; } std::size_t lastSlashAt = fullPath.find_last_of(seperator); if (lastSlashAt == 0) { return std::string(seperator); } return fullPath.substr(0, lastSlashAt + 1); } /// @brief builds stripped filename and puts it in buff static void buildStrippedFilename(const char* filename, char buff[], std::size_t limit = base::consts::kSourceFilenameMaxLength) { std::size_t sizeOfFilename = strlen(filename); if (sizeOfFilename >= limit) { filename += (sizeOfFilename - limit); if (filename[0] != '.' && filename[1] != '.') { // prepend if not already filename += 3; // 3 = '..' 
STRCAT(buff, "..", limit); } } STRCAT(buff, filename, limit); } /// @brief builds base filename and puts it in buff static void buildBaseFilename(const std::string& fullPath, char buff[], std::size_t limit = base::consts::kSourceFilenameMaxLength, const char* seperator = base::consts::kFilePathSeperator) { const char *filename = fullPath.c_str(); std::size_t lastSlashAt = fullPath.find_last_of(seperator); filename += lastSlashAt ? lastSlashAt+1 : 0; std::size_t sizeOfFilename = strlen(filename); if (sizeOfFilename >= limit) { filename += (sizeOfFilename - limit); if (filename[0] != '.' && filename[1] != '.') { // prepend if not already filename += 3; // 3 = '..' STRCAT(buff, "..", limit); } } STRCAT(buff, filename, limit); } }; /// @brief String utilities helper class used internally. You should not use it. class Str : base::StaticClass { public: /// @brief Checks if character is digit. Dont use libc implementation of it to prevent locale issues. static inline bool isDigit(char c) { return c >= '0' && c <= '9'; } /// @brief Matches wildcards, '*' and '?' only supported. 
static bool wildCardMatch(const char* str, const char* pattern) {
    while (*pattern) {
        switch (*pattern) {
        case '?':
            // '?' consumes exactly one character
            if (!*str)
                return false;
            ++str;
            ++pattern;
            break;
        case '*':
            // '*' matches nothing (skip the star) or one more character (keep the star)
            if (wildCardMatch(str, pattern + 1))
                return true;
            if (*str && wildCardMatch(str + 1, pattern))
                return true;
            return false;
        default:
            if (*str++ != *pattern++)
                return false;
            break;
        }
    }
    return !*str && !*pattern;
}

/// @brief Trims string from start
/// @param [in,out] str String to trim
static inline std::string& ltrim(std::string& str) {
    // std::isspace requires a value representable as unsigned char, hence the parameter type.
    // A lambda replaces std::not1/std::ptr_fun, which were removed in C++17.
    str.erase(str.begin(), std::find_if(str.begin(), str.end(),
        [](unsigned char c) { return std::isspace(c) == 0; }));
    return str;
}

/// @brief Trim string from end
/// @param [in,out] str String to trim
static inline std::string& rtrim(std::string& str) {
    str.erase(std::find_if(str.rbegin(), str.rend(),
        [](unsigned char c) { return std::isspace(c) == 0; }).base(), str.end());
    return str;
}

/// @brief Trims string from left and right
/// @param [in,out] str String to trim
static inline std::string& trim(std::string& str) {
    return ltrim(rtrim(str));
}

/// @brief Determines whether or not str starts with specified string
/// @param str String to check
/// @param start String to check against
/// @return Returns true if starts with specified string, false otherwise
static inline bool startsWith(const std::string& str, const std::string& start) {
    return (str.length() >= start.length()) && (str.compare(0, start.length(), start) == 0);
}

/// @brief Determines whether or not str ends with specified string
/// @param str String to check
/// @param end String to check against
/// @return Returns true if ends with specified string, false otherwise
static inline bool endsWith(const std::string& str, const std::string& end) {
    return (str.length() >= end.length()) && (str.compare(str.length() - end.length(), end.length(), end) == 0);
}

/// @brief Replaces all instances of replaceWhat with 'replaceWith'. Original variable is changed for performance.
/// @param [in,out] str String to replace from /// @param replaceWhat Character to replace /// @param replaceWith Character to replace with /// @return Modified version of str static inline std::string& replaceAll(std::string& str, char replaceWhat, char replaceWith) { std::replace(str.begin(), str.end(), replaceWhat, replaceWith); return str; } /// @brief Replaces all instances of 'replaceWhat' with 'replaceWith'. (String version) Replaces in place /// @param str String to replace from /// @param replaceWhat Character to replace /// @param replaceWith Character to replace with /// @return Modified (original) str static inline std::string& replaceAll(std::string& str, const std::string& replaceWhat, // NOLINT const std::string& replaceWith) { if (replaceWhat == replaceWith) return str; std::size_t foundAt = std::string::npos; while ((foundAt = str.find(replaceWhat, foundAt + 1)) != std::string::npos) { str.replace(foundAt, replaceWhat.length(), replaceWith); } return str; } static void replaceFirstWithEscape(base::type::string_t& str, const base::type::string_t& replaceWhat, // NOLINT const base::type::string_t& replaceWith) { std::size_t foundAt = base::type::string_t::npos; while ((foundAt = str.find(replaceWhat, foundAt + 1)) != base::type::string_t::npos) { if (foundAt > 0 && str[foundAt - 1] == base::consts::kFormatSpecifierChar) { str.erase(foundAt > 0 ? 
foundAt - 1 : 0, 1); ++foundAt; } else { str.replace(foundAt, replaceWhat.length(), replaceWith); return; } } } #if defined(ELPP_UNICODE) static void replaceFirstWithEscape(base::type::string_t& str, const base::type::string_t& replaceWhat, // NOLINT const std::string& replaceWith) { replaceFirstWithEscape(str, replaceWhat, base::type::string_t(replaceWith.begin(), replaceWith.end())); } #endif // defined(ELPP_UNICODE) /// @brief Converts string to uppercase /// @param str String to convert /// @return Uppercase string static inline std::string& toUpper(std::string& str) { std::transform(str.begin(), str.end(), str.begin(), ::toupper); return str; } /// @brief Compares cstring equality - uses strcmp static inline bool cStringEq(const char* s1, const char* s2) { if (s1 == nullptr && s2 == nullptr) return true; if (s1 == nullptr || s2 == nullptr) return false; return strcmp(s1, s2) == 0; } /// @brief Compares cstring equality (case-insensitive) - uses toupper(char) /// Dont use strcasecmp because of CRT (VC++) static bool cStringCaseEq(const char* s1, const char* s2) { if (s1 == nullptr && s2 == nullptr) return true; if (s1 == nullptr || s2 == nullptr) return false; if (strlen(s1) != strlen(s2)) return false; while (*s1 != '\0' && *s2 != '\0') { if (::toupper(*s1) != ::toupper(*s2)) return false; ++s1; ++s2; } return true; } /// @brief Returns true if c exist in str static inline bool contains(const char* str, char c) { for (; *str; ++str) { if (*str == c) return true; } return false; } static inline char* convertAndAddToBuff(std::size_t n, int len, char* buf, const char* bufLim, bool zeroPadded = true) { char localBuff[10] = ""; char* p = localBuff + sizeof(localBuff) - 2; if (n > 0) { for (; n > 0 && p > localBuff && len > 0; n /= 10, --len) *--p = static_cast(n % 10 + '0'); } else { *--p = '0'; --len; } if (zeroPadded) while (p > localBuff && len-- > 0) *--p = static_cast('0'); return addToBuff(p, buf, bufLim); } static inline char* addToBuff(const char* str, 
char* buf, const char* bufLim) { while ((buf < bufLim) && ((*buf = *str++) != '\0')) ++buf; return buf; } static inline char* clearBuff(char buff[], std::size_t lim) { STRCPY(buff, "", lim); ELPP_UNUSED(lim); // For *nix we dont have anything using lim in above STRCPY macro return buff; } /// @brief Converst wchar* to char* /// NOTE: Need to free return value after use! static char* wcharPtrToCharPtr(const wchar_t* line) { std::size_t len_ = wcslen(line) + 1; char* buff_ = static_cast(malloc(len_ + 1)); # if ELPP_OS_UNIX || (ELPP_OS_WINDOWS && !ELPP_CRT_DBG_WARNINGS) std::wcstombs(buff_, line, len_); # elif ELPP_OS_WINDOWS std::size_t convCount_ = 0; mbstate_t mbState_; ::memset(static_cast(&mbState_), 0, sizeof(mbState_)); wcsrtombs_s(&convCount_, buff_, len_, &line, len_, &mbState_); # endif // ELPP_OS_UNIX || (ELPP_OS_WINDOWS && !ELPP_CRT_DBG_WARNINGS) return buff_; } }; /// @brief Operating System helper static class used internally. You should not use it. class OS : base::StaticClass { public: #if ELPP_OS_WINDOWS /// @brief Gets environment variables for Windows based OS. /// We are not using getenv(const char*) because of CRT deprecation /// @param varname Variable name to get environment variable value for /// @return If variable exist the value of it otherwise nullptr static const char* getWindowsEnvironmentVariable(const char* varname) { const DWORD bufferLen = 50; static char buffer[bufferLen]; if (GetEnvironmentVariableA(varname, buffer, bufferLen)) { return buffer; } return nullptr; } #endif // ELPP_OS_WINDOWS #if ELPP_OS_ANDROID /// @brief Reads android property value static inline std::string getProperty(const char* prop) { char propVal[PROP_VALUE_MAX + 1]; int ret = __system_property_get(prop, propVal); return ret == 0 ? 
std::string() : std::string(propVal); } /// @brief Reads android device name static std::string getDeviceName(void) { std::stringstream ss; std::string manufacturer = getProperty("ro.product.manufacturer"); std::string model = getProperty("ro.product.model"); if (manufacturer.empty() || model.empty()) { return std::string(); } ss << manufacturer << "-" << model; return ss.str(); } #endif // ELPP_OS_ANDROID /// @brief Runs command on terminal and returns the output. /// /// @detail This is applicable only on unix based systems, for all other OS, an empty string is returned. /// @param command Bash command /// @return Result of bash output or empty string if no result found. static const std::string getBashOutput(const char* command) { #if (ELPP_OS_UNIX && !ELPP_OS_ANDROID && !ELPP_CYGWIN) if (command == nullptr) { return std::string(); } FILE* proc = nullptr; if ((proc = popen(command, "r")) == nullptr) { ELPP_INTERNAL_ERROR("\nUnable to run command [" << command << "]", true); return std::string(); } char hBuff[4096]; if (fgets(hBuff, sizeof(hBuff), proc) != nullptr) { pclose(proc); if (hBuff[strlen(hBuff) - 1] == '\n') { hBuff[strlen(hBuff) - 1] = '\0'; } return std::string(hBuff); } return std::string(); #else ELPP_UNUSED(command); return std::string(); #endif // (ELPP_OS_UNIX && !ELPP_OS_ANDROID && !ELPP_CYGWIN) } /// @brief Gets environment variable. This is cross-platform and CRT safe (for VC++) /// @param variableName Environment variable name /// @param defaultVal If no environment variable or value found the value to return by default /// @param alternativeBashCommand If environment variable not found what would be alternative bash command /// in order to look for value user is looking for. 
E.g, for 'user' alternative command will 'whoami' static std::string getEnvironmentVariable(const char* variableName, const char* defaultVal, const char* alternativeBashCommand = nullptr) { #if ELPP_OS_UNIX const char* val = getenv(variableName); #elif ELPP_OS_WINDOWS const char* val = getWindowsEnvironmentVariable(variableName); #endif // ELPP_OS_UNIX if ((val == nullptr) || ((strcmp(val, "") == 0))) { #if ELPP_OS_UNIX && defined(ELPP_FORCE_ENV_VAR_FROM_BASH) // Try harder on unix-based systems std::string valBash = base::utils::OS::getBashOutput(alternativeBashCommand); if (valBash.empty()) { return std::string(defaultVal); } else { return valBash; } #elif ELPP_OS_WINDOWS || ELPP_OS_UNIX ELPP_UNUSED(alternativeBashCommand); return std::string(defaultVal); #endif // ELPP_OS_UNIX && defined(ELPP_FORCE_ENV_VAR_FROM_BASH) } return std::string(val); } /// @brief Gets current username. static inline std::string currentUser(void) { #if ELPP_OS_UNIX && !ELPP_OS_ANDROID return getEnvironmentVariable("USER", base::consts::kUnknownUser, "whoami"); #elif ELPP_OS_WINDOWS return getEnvironmentVariable("USERNAME", base::consts::kUnknownUser); #elif ELPP_OS_ANDROID ELPP_UNUSED(base::consts::kUnknownUser); return std::string("android"); #else return std::string(); #endif // ELPP_OS_UNIX && !ELPP_OS_ANDROID } /// @brief Gets current host name or computer name. 
/// /// @detail For android systems this is device name with its manufacturer and model seperated by hyphen static inline std::string currentHost(void) { #if ELPP_OS_UNIX && !ELPP_OS_ANDROID return getEnvironmentVariable("HOSTNAME", base::consts::kUnknownHost, "hostname"); #elif ELPP_OS_WINDOWS return getEnvironmentVariable("COMPUTERNAME", base::consts::kUnknownHost); #elif ELPP_OS_ANDROID ELPP_UNUSED(base::consts::kUnknownHost); return getDeviceName(); #else return std::string(); #endif // ELPP_OS_UNIX && !ELPP_OS_ANDROID } /// @brief Whether or not terminal supports colors static inline bool termSupportsColor(void) { std::string term = getEnvironmentVariable("TERM", ""); return term == "xterm" || term == "xterm-color" || term == "xterm-256color" || term == "screen" || term == "linux" || term == "cygwin"; } }; extern std::string s_currentUser; extern std::string s_currentHost; extern bool s_termSupportsColor; #define ELPP_INITI_BASIC_DECLR \ namespace el {\ namespace base {\ namespace utils {\ std::string s_currentUser = el::base::utils::OS::currentUser(); \ std::string s_currentHost = el::base::utils::OS::currentHost(); \ bool s_termSupportsColor = el::base::utils::OS::termSupportsColor(); \ }\ }\ } /// @brief Contains utilities for cross-platform date/time. This class make use of el::base::utils::Str class DateTime : base::StaticClass { public: /// @brief Cross platform gettimeofday for Windows and unix platform. This can be used to determine current millisecond. 
/// /// @detail For unix system it uses gettimeofday(timeval*, timezone*) and for Windows, a seperate implementation is provided /// @param [in,out] tv Pointer that gets updated static void gettimeofday(struct timeval* tv) { #if ELPP_OS_WINDOWS if (tv != nullptr) { # if ELPP_COMPILER_MSVC || defined(_MSC_EXTENSIONS) const unsigned __int64 delta_ = 11644473600000000Ui64; # else const unsigned __int64 delta_ = 11644473600000000ULL; # endif // ELPP_COMPILER_MSVC || defined(_MSC_EXTENSIONS) const double secOffSet = 0.000001; const unsigned long usecOffSet = 1000000; FILETIME fileTime; GetSystemTimeAsFileTime(&fileTime); unsigned __int64 present = 0; present |= fileTime.dwHighDateTime; present = present << 32; present |= fileTime.dwLowDateTime; present /= 10; // mic-sec // Subtract the difference present -= delta_; tv->tv_sec = static_cast(present * secOffSet); tv->tv_usec = static_cast(present % usecOffSet); } #else ::gettimeofday(tv, nullptr); #endif // ELPP_OS_WINDOWS } /// @brief Gets current date and time with milliseconds. /// @param format User provided date/time format /// @param msWidth A pointer to base::MillisecondsWidth from configuration (non-null) /// @returns string based date time in specified format. 
static inline std::string getDateTime(const char* format, const base::MillisecondsWidth* msWidth) { struct timeval currTime; gettimeofday(&currTime); struct ::tm timeInfo; buildTimeInfo(&currTime, &timeInfo); const int kBuffSize = 30; char buff_[kBuffSize] = ""; parseFormat(buff_, kBuffSize, format, &timeInfo, static_cast(currTime.tv_usec / msWidth->m_offset), msWidth); return std::string(buff_); } /// @brief Formats time to get unit accordingly, units like second if > 1000 or minutes if > 60000 etc static base::type::string_t formatTime(unsigned long long time, base::TimestampUnit timestampUnit) { double result = static_cast(time); base::type::EnumType start = static_cast(timestampUnit); const base::type::char_t* unit = base::consts::kTimeFormats[start].unit; for (base::type::EnumType i = start; i < base::consts::kTimeFormatsCount - 1; ++i) { if (result <= base::consts::kTimeFormats[i].value) { break; } result /= base::consts::kTimeFormats[i].value; unit = base::consts::kTimeFormats[i + 1].unit; } base::type::stringstream_t ss; ss << result << " " << unit; return ss.str(); } /// @brief Gets time difference in milli/micro second depending on timestampUnit static inline unsigned long long getTimeDifference(const struct timeval& endTime, const struct timeval& startTime, base::TimestampUnit timestampUnit) { if (timestampUnit == base::TimestampUnit::Microsecond) { return static_cast(static_cast(1000000 * endTime.tv_sec + endTime.tv_usec) - static_cast(1000000 * startTime.tv_sec + startTime.tv_usec)); } else { return static_cast((((endTime.tv_sec - startTime.tv_sec) * 1000000) + (endTime.tv_usec - startTime.tv_usec)) / 1000); } } private: static inline struct ::tm* buildTimeInfo(struct timeval* currTime, struct ::tm* timeInfo) { #if ELPP_OS_UNIX time_t rawTime = currTime->tv_sec; ::localtime_r(&rawTime, timeInfo); return timeInfo; #else # if ELPP_COMPILER_MSVC ELPP_UNUSED(currTime); time_t t; _time64(&t); localtime_s(timeInfo, &t); return timeInfo; # else // For any 
other compilers that don't have CRT warnings issue e.g, MinGW or TDM GCC- we use different method time_t rawTime = currTime->tv_sec; struct tm* tmInf = localtime(&rawTime); *timeInfo = *tmInf; return timeInfo; # endif // ELPP_COMPILER_MSVC #endif // ELPP_OS_UNIX } static char* parseFormat(char* buf, std::size_t bufSz, const char* format, const struct tm* tInfo, std::size_t msec, const base::MillisecondsWidth* msWidth) { const char* bufLim = buf + bufSz; for (; *format; ++format) { if (*format == base::consts::kFormatSpecifierChar) { switch (*++format) { case base::consts::kFormatSpecifierChar: // Escape break; case '\0': // End --format; break; case 'd': // Day buf = base::utils::Str::convertAndAddToBuff(tInfo->tm_mday, 2, buf, bufLim); continue; case 'a': // Day of week (short) buf = base::utils::Str::addToBuff(base::consts::kDaysAbbrev[tInfo->tm_wday], buf, bufLim); continue; case 'A': // Day of week (long) buf = base::utils::Str::addToBuff(base::consts::kDays[tInfo->tm_wday], buf, bufLim); continue; case 'M': // month buf = base::utils::Str::convertAndAddToBuff(tInfo->tm_mon + 1, 2, buf, bufLim); continue; case 'b': // month (short) buf = base::utils::Str::addToBuff(base::consts::kMonthsAbbrev[tInfo->tm_mon], buf, bufLim); continue; case 'B': // month (long) buf = base::utils::Str::addToBuff(base::consts::kMonths[tInfo->tm_mon], buf, bufLim); continue; case 'y': // year (two digits) buf = base::utils::Str::convertAndAddToBuff(tInfo->tm_year + base::consts::kYearBase, 2, buf, bufLim); continue; case 'Y': // year (four digits) buf = base::utils::Str::convertAndAddToBuff(tInfo->tm_year + base::consts::kYearBase, 4, buf, bufLim); continue; case 'h': // hour (12-hour) buf = base::utils::Str::convertAndAddToBuff(tInfo->tm_hour % 12, 2, buf, bufLim); continue; case 'H': // hour (24-hour) buf = base::utils::Str::convertAndAddToBuff(tInfo->tm_hour, 2, buf, bufLim); continue; case 'm': // minute buf = base::utils::Str::convertAndAddToBuff(tInfo->tm_min, 2, buf, bufLim); 
continue; case 's': // second buf = base::utils::Str::convertAndAddToBuff(tInfo->tm_sec, 2, buf, bufLim); continue; case 'z': // milliseconds case 'g': buf = base::utils::Str::convertAndAddToBuff(msec, msWidth->m_width, buf, bufLim); continue; case 'F': // AM/PM buf = base::utils::Str::addToBuff((tInfo->tm_hour >= 12) ? base::consts::kPm : base::consts::kAm, buf, bufLim); continue; default: continue; } } if (buf == bufLim) break; *buf++ = *format; } return buf; } }; /// @brief Command line arguments for application if specified using el::Helpers::setArgs(..) or START_EASYLOGGINGPP(..) class CommandLineArgs { public: CommandLineArgs(void) { setArgs(0, static_cast(nullptr)); } CommandLineArgs(int argc, const char** argv) { setArgs(argc, argv); } CommandLineArgs(int argc, char** argv) { setArgs(argc, argv); } virtual ~CommandLineArgs(void) {} /// @brief Sets arguments and parses them inline void setArgs(int argc, const char** argv) { setArgs(argc, const_cast(argv)); } /// @brief Sets arguments and parses them inline void setArgs(int argc, char** argv) { m_params.clear(); m_paramsWithValue.clear(); if (argc == 0 || argv == nullptr) { return; } m_argc = argc; m_argv = argv; for (int i = 1; i < m_argc; ++i) { const char* v = (strstr(m_argv[i], "=")); if (v != nullptr && strlen(v) > 0) { std::string key = std::string(m_argv[i]); key = key.substr(0, key.find_first_of('=')); if (hasParamWithValue(key.c_str())) { ELPP_INTERNAL_INFO(1, "Skipping [" << key << "] arg since it already has value [" << getParamValue(key.c_str()) << "]"); } else { m_paramsWithValue.insert(std::make_pair(key, std::string(v + 1))); } } if (v == nullptr) { if (hasParam(m_argv[i])) { ELPP_INTERNAL_INFO(1, "Skipping [" << m_argv[i] << "] arg since it already exists"); } else { m_params.push_back(std::string(m_argv[i])); } } } } /// @brief Returns true if arguments contain paramKey with a value (seperated by '=') inline bool hasParamWithValue(const char* paramKey) const { return 
m_paramsWithValue.find(std::string(paramKey)) != m_paramsWithValue.end(); } /// @brief Returns value of arguments /// @see hasParamWithValue(const char*) inline const char* getParamValue(const char* paramKey) const { return m_paramsWithValue.find(std::string(paramKey))->second.c_str(); } /// @brief Return true if arguments has a param (not having a value) i,e without '=' inline bool hasParam(const char* paramKey) const { return std::find(m_params.begin(), m_params.end(), std::string(paramKey)) != m_params.end(); } /// @brief Returns true if no params available. This exclude argv[0] inline bool empty(void) const { return m_params.empty() && m_paramsWithValue.empty(); } /// @brief Returns total number of arguments. This exclude argv[0] inline std::size_t size(void) const { return m_params.size() + m_paramsWithValue.size(); } inline friend base::type::ostream_t& operator<<(base::type::ostream_t& os, const CommandLineArgs& c) { for (int i = 1; i < c.m_argc; ++i) { os << ELPP_LITERAL("[") << c.m_argv[i] << ELPP_LITERAL("]"); if (i < c.m_argc - 1) { os << ELPP_LITERAL(" "); } } return os; } private: int m_argc; char** m_argv; std::map m_paramsWithValue; std::vector m_params; }; /// @brief Abstract registry (aka repository) that provides basic interface for pointer repository specified by T_Ptr type. /// /// @detail Most of the functions are virtual final methods but anything implementing this abstract class should implement /// unregisterAll() and deepCopy(const AbstractRegistry&) and write registerNew() method according to container /// and few more methods; get() to find element, unregister() to unregister single entry. /// Please note that this is thread-unsafe and should also implement thread-safety mechanisms in implementation. 
template class AbstractRegistry : public base::threading::ThreadSafe { public: typedef typename Container::iterator iterator; typedef typename Container::const_iterator const_iterator; /// @brief Default constructor AbstractRegistry(void) {} /// @brief Move constructor that is useful for base classes AbstractRegistry(AbstractRegistry&& sr) { if (this == &sr) { return; } unregisterAll(); m_list = std::move(sr.m_list); } bool operator==(const AbstractRegistry& other) { if (size() != other.size()) { return false; } for (std::size_t i = 0; i < m_list.size(); ++i) { if (m_list.at(i) != other.m_list.at(i)) { return false; } } return true; } bool operator!=(const AbstractRegistry& other) { if (size() != other.size()) { return true; } for (std::size_t i = 0; i < m_list.size(); ++i) { if (m_list.at(i) != other.m_list.at(i)) { return true; } } return false; } /// @brief Assignment move operator AbstractRegistry& operator=(AbstractRegistry&& sr) { if (this == &sr) { return *this; } unregisterAll(); m_list = std::move(sr.m_list); return *this; } virtual ~AbstractRegistry(void) { } /// @return Iterator pointer from start of repository virtual inline iterator begin(void) ELPP_FINAL { return m_list.begin(); } /// @return Iterator pointer from end of repository virtual inline iterator end(void) ELPP_FINAL { return m_list.end(); } /// @return Constant iterator pointer from start of repository virtual inline const_iterator cbegin(void) const ELPP_FINAL { return m_list.cbegin(); } /// @return End of repository virtual inline const_iterator cend(void) const ELPP_FINAL { return m_list.cend(); } /// @return Whether or not repository is empty virtual inline bool empty(void) const ELPP_FINAL { return m_list.empty(); } /// @return Size of repository virtual inline std::size_t size(void) const ELPP_FINAL { return m_list.size(); } /// @brief Returns underlying container by reference virtual inline Container& list(void) ELPP_FINAL { return m_list; } /// @brief Returns underlying container by 
constant reference. virtual inline const Container& list(void) const ELPP_FINAL { return m_list; } /// @brief Unregisters all the pointers from current repository. virtual void unregisterAll(void) = 0; protected: virtual void deepCopy(const AbstractRegistry&) = 0; void reinitDeepCopy(const AbstractRegistry& sr) { unregisterAll(); deepCopy(sr); } private: Container m_list; }; /// @brief A pointer registry mechanism to manage memory and provide search functionalities. (non-predicate version) /// /// @detail NOTE: This is thread-unsafe implementation (although it contains lock function, it does not use these functions) /// of AbstractRegistry. Any implementation of this class should be /// explicitly (by using lock functions) template class Registry : public AbstractRegistry> { public: typedef typename Registry::iterator iterator; typedef typename Registry::const_iterator const_iterator; Registry(void) {} /// @brief Copy constructor that is useful for base classes. Try to avoid this constructor, use move constructor. Registry(const Registry& sr) : AbstractRegistry>() { if (this == &sr) { return; } this->reinitDeepCopy(sr); } /// @brief Assignment operator that unregisters all the existing registeries and deeply copies each of repo element /// @see unregisterAll() /// @see deepCopy(const AbstractRegistry&) Registry& operator=(const Registry& sr) { if (this == &sr) { return *this; } this->reinitDeepCopy(sr); return *this; } virtual ~Registry(void) { unregisterAll(); } protected: virtual inline void unregisterAll(void) ELPP_FINAL { if (!this->empty()) { for (auto&& curr : this->list()) { base::utils::safeDelete(curr.second); } this->list().clear(); } } /// @brief Registers new registry to repository. 
virtual inline void registerNew(const T_Key& uniqKey, T_Ptr* ptr) ELPP_FINAL { unregister(uniqKey); this->list().insert(std::make_pair(uniqKey, ptr)); } /// @brief Unregisters single entry mapped to specified unique key inline void unregister(const T_Key& uniqKey) { T_Ptr* existing = get(uniqKey); if (existing != nullptr) { base::utils::safeDelete(existing); this->list().erase(uniqKey); } } /// @brief Gets pointer from repository. If none found, nullptr is returned. inline T_Ptr* get(const T_Key& uniqKey) { iterator it = this->list().find(uniqKey); return it == this->list().end() ? nullptr : it->second; } private: virtual inline void deepCopy(const AbstractRegistry>& sr) ELPP_FINAL { for (const_iterator it = sr.cbegin(); it != sr.cend(); ++it) { registerNew(it->first, new T_Ptr(*it->second)); } } }; /// @brief A pointer registry mechanism to manage memory and provide search functionalities. (predicate version) /// /// @detail NOTE: This is thread-unsafe implementation of AbstractRegistry. Any implementation of this class /// should be made thread-safe explicitly template class RegistryWithPred : public AbstractRegistry> { public: typedef typename RegistryWithPred::iterator iterator; typedef typename RegistryWithPred::const_iterator const_iterator; RegistryWithPred(void) { } virtual ~RegistryWithPred(void) { unregisterAll(); } /// @brief Copy constructor that is useful for base classes. Try to avoid this constructor, use move constructor. 
RegistryWithPred(const RegistryWithPred& sr) : AbstractRegistry>() { if (this == &sr) { return; } this->reinitDeepCopy(sr); } /// @brief Assignment operator that unregisters all the existing registeries and deeply copies each of repo element /// @see unregisterAll() /// @see deepCopy(const AbstractRegistry&) RegistryWithPred& operator=(const RegistryWithPred& sr) { if (this == &sr) { return *this; } this->reinitDeepCopy(sr); return *this; } friend inline base::type::ostream_t& operator<<(base::type::ostream_t& os, const RegistryWithPred& sr) { for (const_iterator it = sr.list().begin(); it != sr.list().end(); ++it) { os << ELPP_LITERAL(" ") << **it << ELPP_LITERAL("\n"); } return os; } protected: virtual inline void unregisterAll(void) ELPP_FINAL { if (!this->empty()) { for (auto&& curr : this->list()) { base::utils::safeDelete(curr); } this->list().clear(); } } virtual void unregister(T_Ptr*& ptr) ELPP_FINAL { if (ptr) { iterator iter = this->begin(); for (; iter != this->end(); ++iter) { if (ptr == *iter) { break; } } if (iter != this->end() && *iter != nullptr) { this->list().erase(iter); base::utils::safeDelete(*iter); } } } virtual inline void registerNew(T_Ptr* ptr) ELPP_FINAL { this->list().push_back(ptr); } /// @brief Gets pointer from repository with speicifed arguments. Arguments are passed to predicate /// in order to validate pointer. 
template inline T_Ptr* get(const T& arg1, const T2 arg2) { iterator iter = std::find_if(this->list().begin(), this->list().end(), Pred(arg1, arg2)); if (iter != this->list().end() && *iter != nullptr) { return *iter; } return nullptr; } private: virtual inline void deepCopy(const AbstractRegistry>& sr) { for (const_iterator it = sr.list().begin(); it != sr.list().end(); ++it) { registerNew(new T_Ptr(**it)); } } }; } // namespace utils } // namespace base /// @brief Base of Easylogging++ friendly class /// /// @detail After inheriting this class publicly, implement pure-virtual function `void log(std::ostream&) const` class Loggable { public: virtual ~Loggable(void) {} virtual void log(el::base::type::ostream_t&) const = 0; private: friend inline el::base::type::ostream_t& operator<<(el::base::type::ostream_t& os, const Loggable& loggable) { loggable.log(os); return os; } }; namespace base { /// @brief Represents log format containing flags and date format. This is used internally to start initial log class LogFormat : public Loggable { public: LogFormat(void) : m_level(Level::Unknown), m_userFormat(base::type::string_t()), m_format(base::type::string_t()), m_dateTimeFormat(std::string()), m_flags(0x0) { } LogFormat(Level level, const base::type::string_t& format) : m_level(level), m_userFormat(format) { parseFromFormat(m_userFormat); } LogFormat(const LogFormat& logFormat) { m_level = logFormat.m_level; m_userFormat = logFormat.m_userFormat; m_format = logFormat.m_format; m_dateTimeFormat = logFormat.m_dateTimeFormat; m_flags = logFormat.m_flags; } LogFormat(LogFormat&& logFormat) { m_level = std::move(logFormat.m_level); m_userFormat = std::move(logFormat.m_userFormat); m_format = std::move(logFormat.m_format); m_dateTimeFormat = std::move(logFormat.m_dateTimeFormat); m_flags = std::move(logFormat.m_flags); } LogFormat& operator=(const LogFormat& logFormat) { m_level = logFormat.m_level; m_userFormat = logFormat.m_userFormat; m_dateTimeFormat = 
logFormat.m_dateTimeFormat; m_flags = logFormat.m_flags; return *this; } virtual ~LogFormat(void) { } inline bool operator==(const LogFormat& other) { return m_level == other.m_level && m_userFormat == other.m_userFormat && m_format == other.m_format && m_dateTimeFormat == other.m_dateTimeFormat && m_flags == other.m_flags; } /// @brief Updates format to be used while logging. /// @param userFormat User provided format void parseFromFormat(const base::type::string_t& userFormat) { // We make copy because we will be changing the format // i.e, removing user provided date format from original format // and then storing it. base::type::string_t formatCopy = userFormat; m_flags = 0x0; auto conditionalAddFlag = [&](const base::type::char_t* specifier, base::FormatFlags flag) { std::size_t foundAt = base::type::string_t::npos; while ((foundAt = formatCopy.find(specifier, foundAt + 1)) != base::type::string_t::npos){ if (foundAt > 0 && formatCopy[foundAt - 1] == base::consts::kFormatSpecifierChar) { if (hasFlag(flag)) { // If we already have flag we remove the escape chars so that '%%' is turned to '%' // even after specifier resolution - this is because we only replaceFirst specifier formatCopy.erase(foundAt > 0 ? 
foundAt - 1 : 0, 1); ++foundAt; } } else { if (!hasFlag(flag)) addFlag(flag); } } }; conditionalAddFlag(base::consts::kAppNameFormatSpecifier, base::FormatFlags::AppName); conditionalAddFlag(base::consts::kSeverityLevelFormatSpecifier, base::FormatFlags::Level); conditionalAddFlag(base::consts::kSeverityLevelShortFormatSpecifier, base::FormatFlags::LevelShort); conditionalAddFlag(base::consts::kLoggerIdFormatSpecifier, base::FormatFlags::LoggerId); conditionalAddFlag(base::consts::kThreadIdFormatSpecifier, base::FormatFlags::ThreadId); conditionalAddFlag(base::consts::kLogFileFormatSpecifier, base::FormatFlags::File); conditionalAddFlag(base::consts::kLogFileBaseFormatSpecifier, base::FormatFlags::FileBase); conditionalAddFlag(base::consts::kLogLineFormatSpecifier, base::FormatFlags::Line); conditionalAddFlag(base::consts::kLogLocationFormatSpecifier, base::FormatFlags::Location); conditionalAddFlag(base::consts::kLogFunctionFormatSpecifier, base::FormatFlags::Function); conditionalAddFlag(base::consts::kCurrentUserFormatSpecifier, base::FormatFlags::User); conditionalAddFlag(base::consts::kCurrentHostFormatSpecifier, base::FormatFlags::Host); conditionalAddFlag(base::consts::kMessageFormatSpecifier, base::FormatFlags::LogMessage); conditionalAddFlag(base::consts::kVerboseLevelFormatSpecifier, base::FormatFlags::VerboseLevel); // For date/time we need to extract user's date format first std::size_t dateIndex = std::string::npos; if ((dateIndex = formatCopy.find(base::consts::kDateTimeFormatSpecifier)) != std::string::npos) { while (dateIndex > 0 && formatCopy[dateIndex - 1] == base::consts::kFormatSpecifierChar) { dateIndex = formatCopy.find(base::consts::kDateTimeFormatSpecifier, dateIndex + 1); } if (dateIndex != std::string::npos) { addFlag(base::FormatFlags::DateTime); updateDateFormat(dateIndex, formatCopy); } } m_format = formatCopy; updateFormatSpec(); } inline Level level(void) const { return m_level; } inline const base::type::string_t& userFormat(void) 
const { return m_userFormat; } inline const base::type::string_t& format(void) const { return m_format; } inline const std::string& dateTimeFormat(void) const { return m_dateTimeFormat; } inline base::type::EnumType flags(void) const { return m_flags; } inline bool hasFlag(base::FormatFlags flag) const { return base::utils::hasFlag(flag, m_flags); } virtual void log(el::base::type::ostream_t& os) const { os << m_format; } protected: /// @brief Updates date time format if available in currFormat. /// @param index Index where %datetime, %date or %time was found /// @param [in,out] currFormat current format that is being used to format virtual void updateDateFormat(std::size_t index, base::type::string_t& currFormat) ELPP_FINAL { if (hasFlag(base::FormatFlags::DateTime)) { index += ELPP_STRLEN(base::consts::kDateTimeFormatSpecifier); } const base::type::char_t* ptr = currFormat.c_str() + index; if ((currFormat.size() > index) && (ptr[0] == '{')) { // User has provided format for date/time ++ptr; int count = 1; // Start by 1 in order to remove starting brace std::stringstream ss; for (; *ptr; ++ptr, ++count) { if (*ptr == '}') { ++count; // In order to remove ending brace break; } ss << *ptr; } currFormat.erase(index, count); m_dateTimeFormat = ss.str(); } else { // No format provided, use default if (hasFlag(base::FormatFlags::DateTime)) { m_dateTimeFormat = std::string(base::consts::kDefaultDateTimeFormat); } } } /// @brief Updates %level from format. This is so that we dont have to do it at log-writing-time. It uses m_format and m_level virtual void updateFormatSpec(void) ELPP_FINAL { // Do not use switch over strongly typed enums because Intel C++ compilers dont support them yet. 
if (m_level == Level::Debug) { base::utils::Str::replaceFirstWithEscape(m_format, base::consts::kSeverityLevelFormatSpecifier, base::consts::kDebugLevelLogValue); base::utils::Str::replaceFirstWithEscape(m_format, base::consts::kSeverityLevelShortFormatSpecifier, base::consts::kDebugLevelShortLogValue); } else if (m_level == Level::Info) { base::utils::Str::replaceFirstWithEscape(m_format, base::consts::kSeverityLevelFormatSpecifier, base::consts::kInfoLevelLogValue); base::utils::Str::replaceFirstWithEscape(m_format, base::consts::kSeverityLevelShortFormatSpecifier, base::consts::kInfoLevelShortLogValue); } else if (m_level == Level::Warning) { base::utils::Str::replaceFirstWithEscape(m_format, base::consts::kSeverityLevelFormatSpecifier, base::consts::kWarningLevelLogValue); base::utils::Str::replaceFirstWithEscape(m_format, base::consts::kSeverityLevelShortFormatSpecifier, base::consts::kWarningLevelShortLogValue); } else if (m_level == Level::Error) { base::utils::Str::replaceFirstWithEscape(m_format, base::consts::kSeverityLevelFormatSpecifier, base::consts::kErrorLevelLogValue); base::utils::Str::replaceFirstWithEscape(m_format, base::consts::kSeverityLevelShortFormatSpecifier, base::consts::kErrorLevelShortLogValue); } else if (m_level == Level::Fatal) { base::utils::Str::replaceFirstWithEscape(m_format, base::consts::kSeverityLevelFormatSpecifier, base::consts::kFatalLevelLogValue); base::utils::Str::replaceFirstWithEscape(m_format, base::consts::kSeverityLevelShortFormatSpecifier, base::consts::kFatalLevelShortLogValue); } else if (m_level == Level::Verbose) { base::utils::Str::replaceFirstWithEscape(m_format, base::consts::kSeverityLevelFormatSpecifier, base::consts::kVerboseLevelLogValue); base::utils::Str::replaceFirstWithEscape(m_format, base::consts::kSeverityLevelShortFormatSpecifier, base::consts::kVerboseLevelShortLogValue); } else if (m_level == Level::Trace) { base::utils::Str::replaceFirstWithEscape(m_format, 
base::consts::kSeverityLevelFormatSpecifier, base::consts::kTraceLevelLogValue); base::utils::Str::replaceFirstWithEscape(m_format, base::consts::kSeverityLevelShortFormatSpecifier, base::consts::kTraceLevelShortLogValue); } if (hasFlag(base::FormatFlags::User)) { std::string s = base::utils::s_currentUser; base::utils::Str::replaceFirstWithEscape(m_format, base::consts::kCurrentUserFormatSpecifier, base::utils::s_currentUser); } if (hasFlag(base::FormatFlags::Host)) { base::utils::Str::replaceFirstWithEscape(m_format, base::consts::kCurrentHostFormatSpecifier, base::utils::s_currentHost); } // Ignore Level::Global and Level::Unknown } inline void addFlag(base::FormatFlags flag) { base::utils::addFlag(flag, &m_flags); } private: Level m_level; base::type::string_t m_userFormat; base::type::string_t m_format; std::string m_dateTimeFormat; base::type::EnumType m_flags; friend class el::Logger; // To resolve loggerId format specifier easily }; } // namespace base /// @brief Resolving function for format specifier typedef std::function FormatSpecifierValueResolver; /// @brief User-provided custom format specifier /// @see el::Helpers::installCustomFormatSpecifier /// @see FormatSpecifierValueResolver class CustomFormatSpecifier { public: CustomFormatSpecifier(const char* formatSpecifier, const FormatSpecifierValueResolver& resolver) : m_formatSpecifier(formatSpecifier), m_resolver(resolver) {} inline const char* formatSpecifier(void) const { return m_formatSpecifier; } inline const FormatSpecifierValueResolver& resolver(void) const { return m_resolver; } inline bool operator==(const char* formatSpecifier) { return strcmp(m_formatSpecifier, formatSpecifier) == 0; } private: const char* m_formatSpecifier; FormatSpecifierValueResolver m_resolver; }; /// @brief Represents single configuration that has representing level, configuration type and a string based value. 
/// /// @detail String based value means any value either its boolean, integer or string itself, it will be embedded inside quotes /// and will be parsed later. /// /// Consider some examples below: /// * el::Configuration confEnabledInfo(el::Level::Info, el::ConfigurationType::Enabled, "true"); /// * el::Configuration confMaxLogFileSizeInfo(el::Level::Info, el::ConfigurationType::MaxLogFileSize, "2048"); /// * el::Configuration confFilenameInfo(el::Level::Info, el::ConfigurationType::Filename, "/var/log/my.log"); class Configuration : public Loggable { public: Configuration(const Configuration& c) : m_level(c.m_level), m_configurationType(c.m_configurationType), m_value(c.m_value) { } Configuration& operator=(const Configuration& c) { m_level = c.m_level; m_configurationType = c.m_configurationType; m_value = c.m_value; return *this; } virtual ~Configuration(void) { } /// @brief Full constructor used to sets value of configuration Configuration(Level level, ConfigurationType configurationType, const std::string& value) : m_level(level), m_configurationType(configurationType), m_value(value) { } /// @brief Gets level of current configuration inline Level level(void) const { return m_level; } /// @brief Gets configuration type of current configuration inline ConfigurationType configurationType(void) const { return m_configurationType; } /// @brief Gets string based configuration value inline const std::string& value(void) const { return m_value; } /// @brief Set string based configuration value /// @param value Value to set. Values have to be std::string; For boolean values use "true", "false", for any integral values /// use them in quotes. 
They will be parsed when configuring inline void setValue(const std::string& value) { m_value = value; } virtual inline void log(el::base::type::ostream_t& os) const { os << LevelHelper::convertToString(m_level) << ELPP_LITERAL(" ") << ConfigurationTypeHelper::convertToString(m_configurationType) << ELPP_LITERAL(" = ") << m_value.c_str(); } /// @brief Used to find configuration from configuration (pointers) repository. Avoid using it. class Predicate { public: Predicate(Level level, ConfigurationType configurationType) : m_level(level), m_configurationType(configurationType) { } inline bool operator()(const Configuration* conf) const { return ((conf != nullptr) && (conf->level() == m_level) && (conf->configurationType() == m_configurationType)); } private: Level m_level; ConfigurationType m_configurationType; }; private: Level m_level; ConfigurationType m_configurationType; std::string m_value; }; /// @brief Thread-safe Configuration repository /// /// @detail This repository represents configurations for all the levels and configuration type mapped to a value. class Configurations : public base::utils::RegistryWithPred { public: /// @brief Default constructor with empty repository Configurations(void) : m_configurationFile(std::string()), m_isFromFile(false) { } /// @brief Constructor used to set configurations using configuration file. /// @param configurationFile Full path to configuration file /// @param useDefaultsForRemaining Lets you set the remaining configurations to default. /// @param base If provided, this configuration will be based off existing repository that this argument is pointing to. 
/// @see parseFromFile(const std::string&, Configurations* base) /// @see setRemainingToDefault() Configurations(const std::string& configurationFile, bool useDefaultsForRemaining = true, Configurations* base = nullptr) : m_configurationFile(configurationFile), m_isFromFile(false) { parseFromFile(configurationFile, base); if (useDefaultsForRemaining) { setRemainingToDefault(); } } virtual ~Configurations(void) { } /// @brief Parses configuration from file. /// @param configurationFile Full path to configuration file /// @param base Configurations to base new configuration repository off. This value is used when you want to use /// existing Configurations to base all the values and then set rest of configuration via configuration file. /// @return True if successfully parsed, false otherwise. You may define 'ELPP_DEBUG_ASSERT_FAILURE' to make sure you /// do not proceed without successful parse. inline bool parseFromFile(const std::string& configurationFile, Configurations* base = nullptr) { // We initial assertion with true because if we have assertion diabled, we want to pass this // check and if assertion is enabled we will have values re-assigned any way. bool assertionPassed = true; ELPP_ASSERT((assertionPassed = base::utils::File::pathExists(configurationFile.c_str(), true)), "Configuration file [" << configurationFile << "] does not exist!"); if (!assertionPassed) { return false; } bool success = Parser::parseFromFile(configurationFile, this, base); m_isFromFile = success; return success; } /// @brief Parse configurations from configuration string. /// /// @detail This configuration string has same syntax as configuration file contents. Make sure all the necessary /// new line characters are provided. /// @param base Configurations to base new configuration repository off. This value is used when you want to use /// existing Configurations to base all the values and then set rest of configuration via configuration text. 
/// @return True if successfully parsed, false otherwise. You may define 'ELPP_DEBUG_ASSERT_FAILURE' to make sure you /// do not proceed without successful parse. inline bool parseFromText(const std::string& configurationsString, Configurations* base = nullptr) { bool success = Parser::parseFromText(configurationsString, this, base); if (success) { m_isFromFile = false; } return success; } /// @brief Sets configuration based-off an existing configurations. /// @param base Pointer to existing configurations. inline void setFromBase(Configurations* base) { if (base == nullptr || base == this) { return; } base::threading::ScopedLock scopedLock(base->lock()); for (Configuration*& conf : base->list()) { set(conf); } } /// @brief Determines whether or not specified configuration type exists in the repository. /// /// @detail Returns as soon as first level is found. /// @param configurationType Type of configuration to check existence for. bool hasConfiguration(ConfigurationType configurationType) { base::type::EnumType lIndex = LevelHelper::kMinValid; bool result = false; LevelHelper::forEachLevel(&lIndex, [&](void) -> bool { if (hasConfiguration(LevelHelper::castFromInt(lIndex), configurationType)) { result = true; } return result; }); return result; } /// @brief Determines whether or not specified configuration type exists for specified level /// @param level Level to check /// @param configurationType Type of configuration to check existence for. inline bool hasConfiguration(Level level, ConfigurationType configurationType) { base::threading::ScopedLock scopedLock(lock()); #if ELPP_COMPILER_INTEL // We cant specify template types here, Intel C++ throws compilation error // "error: type name is not allowed" return RegistryWithPred::get(level, configurationType) != nullptr; #else return RegistryWithPred::get(level, configurationType) != nullptr; #endif // ELPP_COMPILER_INTEL } /// @brief Sets value of configuration for specified level. 
/// /// @detail Any existing configuration for specified level will be replaced. Also note that configuration types /// ConfigurationType::MillisecondsWidth and ConfigurationType::PerformanceTracking will be ignored if not set for /// Level::Global because these configurations are not dependant on level. /// @param level Level to set configuration for (el::Level). /// @param configurationType Type of configuration (el::ConfigurationType) /// @param value A string based value. Regardless of what the data type of configuration is, it will always be string /// from users' point of view. This is then parsed later to be used internally. /// @see Configuration::setValue(const std::string& value) /// @see el::Level /// @see el::ConfigurationType inline void set(Level level, ConfigurationType configurationType, const std::string& value) { base::threading::ScopedLock scopedLock(lock()); unsafeSet(level, configurationType, value); // This is not unsafe anymore as we have locked mutex if (level == Level::Global) { unsafeSetGlobally(configurationType, value, false); // Again this is not unsafe either } } /// @brief Sets single configuration based on other single configuration. /// @see set(Level level, ConfigurationType configurationType, const std::string& value) inline void set(Configuration* conf) { if (conf == nullptr) { return; } set(conf->level(), conf->configurationType(), conf->value()); } inline Configuration* get(Level level, ConfigurationType configurationType) { base::threading::ScopedLock scopedLock(lock()); return RegistryWithPred::get(level, configurationType); } /// @brief Sets configuration for all levels. 
/// @param configurationType Type of configuration /// @param value String based value /// @see Configurations::set(Level level, ConfigurationType configurationType, const std::string& value) inline void setGlobally(ConfigurationType configurationType, const std::string& value) { setGlobally(configurationType, value, false); } /// @brief Clears repository so that all the configurations are unset inline void clear(void) { base::threading::ScopedLock scopedLock(lock()); unregisterAll(); } /// @brief Gets configuration file used in parsing this configurations. /// /// @detail If this repository was set manually or by text this returns empty string. inline const std::string& configurationFile(void) const { return m_configurationFile; } /// @brief Sets configurations to "factory based" configurations. void setToDefault(void) { setGlobally(ConfigurationType::Enabled, std::string("true"), true); #if !defined(ELPP_NO_DEFAULT_LOG_FILE) setGlobally(ConfigurationType::Filename, std::string(base::consts::kDefaultLogFile), true); #else ELPP_UNUSED(base::consts::kDefaultLogFile); #endif // !defined(ELPP_NO_DEFAULT_LOG_FILE) setGlobally(ConfigurationType::ToFile, std::string("true"), true); setGlobally(ConfigurationType::ToStandardOutput, std::string("true"), true); setGlobally(ConfigurationType::MillisecondsWidth, std::string("3"), true); setGlobally(ConfigurationType::PerformanceTracking, std::string("true"), true); setGlobally(ConfigurationType::MaxLogFileSize, std::string("0"), true); setGlobally(ConfigurationType::LogFlushThreshold, std::string("0"), true); setGlobally(ConfigurationType::Format, std::string("%datetime %level [%logger] %msg"), true); set(Level::Debug, ConfigurationType::Format, std::string("%datetime %level [%logger] [%user@%host] [%func] [%loc] %msg")); // INFO and WARNING are set to default by Level::Global set(Level::Error, ConfigurationType::Format, std::string("%datetime %level [%logger] %msg")); set(Level::Fatal, ConfigurationType::Format, 
std::string("%datetime %level [%logger] %msg")); set(Level::Verbose, ConfigurationType::Format, std::string("%datetime %level-%vlevel [%logger] %msg")); set(Level::Trace, ConfigurationType::Format, std::string("%datetime %level [%logger] [%func] [%loc] %msg")); } /// @brief Lets you set the remaining configurations to default. /// /// @detail By remaining, it means that the level/type a configuration does not exist for. /// This function is useful when you want to minimize chances of failures, e.g, if you have a configuration file that sets /// configuration for all the configurations except for Enabled or not, we use this so that ENABLED is set to default i.e, /// true. If you dont do this explicitley (either by calling this function or by using second param in Constructor /// and try to access a value, an error is thrown void setRemainingToDefault(void) { base::threading::ScopedLock scopedLock(lock()); unsafeSetIfNotExist(Level::Global, ConfigurationType::Enabled, std::string("true")); #if !defined(ELPP_NO_DEFAULT_LOG_FILE) unsafeSetIfNotExist(Level::Global, ConfigurationType::Filename, std::string(base::consts::kDefaultLogFile)); #endif // !defined(ELPP_NO_DEFAULT_LOG_FILE) unsafeSetIfNotExist(Level::Global, ConfigurationType::ToFile, std::string("true")); unsafeSetIfNotExist(Level::Global, ConfigurationType::ToStandardOutput, std::string("true")); unsafeSetIfNotExist(Level::Global, ConfigurationType::MillisecondsWidth, std::string("3")); unsafeSetIfNotExist(Level::Global, ConfigurationType::PerformanceTracking, std::string("true")); unsafeSetIfNotExist(Level::Global, ConfigurationType::MaxLogFileSize, std::string("0")); unsafeSetIfNotExist(Level::Global, ConfigurationType::Format, std::string("%datetime %level [%logger] %msg")); unsafeSetIfNotExist(Level::Debug, ConfigurationType::Format, std::string("%datetime %level [%logger] [%user@%host] [%func] [%loc] %msg")); // INFO and WARNING are set to default by Level::Global unsafeSetIfNotExist(Level::Error, 
ConfigurationType::Format, std::string("%datetime %level [%logger] %msg")); unsafeSetIfNotExist(Level::Fatal, ConfigurationType::Format, std::string("%datetime %level [%logger] %msg")); unsafeSetIfNotExist(Level::Verbose, ConfigurationType::Format, std::string("%datetime %level-%vlevel [%logger] %msg")); unsafeSetIfNotExist(Level::Trace, ConfigurationType::Format, std::string("%datetime %level [%logger] [%func] [%loc] %msg")); } /// @brief Parser used internally to parse configurations from file or text. /// /// @detail This class makes use of base::utils::Str. /// You should not need this unless you are working on some tool for Easylogging++ class Parser : base::StaticClass { public: /// @brief Parses configuration from file. /// @param configurationFile Full path to configuration file /// @param sender Sender configurations pointer. Usually 'this' is used from calling class /// @param base Configurations to base new configuration repository off. This value is used when you want to use /// existing Configurations to base all the values and then set rest of configuration via configuration file. /// @return True if successfully parsed, false otherwise. You may define '_STOP_ON_FIRSTELPP_ASSERTION' to make sure you /// do not proceed without successful parse. 
static bool parseFromFile(const std::string& configurationFile, Configurations* sender, Configurations* base = nullptr) { sender->setFromBase(base); std::ifstream fileStream_(configurationFile.c_str(), std::ifstream::in); ELPP_ASSERT(fileStream_.is_open(), "Unable to open configuration file [" << configurationFile << "] for parsing."); bool parsedSuccessfully = false; std::string line = std::string(); Level currLevel = Level::Unknown; std::string currConfigStr = std::string(); std::string currLevelStr = std::string(); while (fileStream_.good()) { std::getline(fileStream_, line); parsedSuccessfully = parseLine(&line, &currConfigStr, &currLevelStr, &currLevel, sender); ELPP_ASSERT(parsedSuccessfully, "Unable to parse configuration line: " << line); } return parsedSuccessfully; } /// @brief Parse configurations from configuration string. /// /// @detail This configuration string has same syntax as configuration file contents. Make sure all the necessary /// new line characters are provided. You may define '_STOP_ON_FIRSTELPP_ASSERTION' to make sure you /// do not proceed without successful parse (This is recommended) /// @param configurationsString /// @param sender Sender configurations pointer. Usually 'this' is used from calling class /// @param base Configurations to base new configuration repository off. This value is used when you want to use /// existing Configurations to base all the values and then set rest of configuration via configuration text. /// @return True if successfully parsed, false otherwise. 
static bool parseFromText(const std::string& configurationsString, Configurations* sender, Configurations* base = nullptr) { sender->setFromBase(base); bool parsedSuccessfully = false; std::stringstream ss(configurationsString); std::string line = std::string(); Level currLevel = Level::Unknown; std::string currConfigStr = std::string(); std::string currLevelStr = std::string(); while (std::getline(ss, line)) { parsedSuccessfully = parseLine(&line, &currConfigStr, &currLevelStr, &currLevel, sender); ELPP_ASSERT(parsedSuccessfully, "Unable to parse configuration line: " << line); } return parsedSuccessfully; } private: friend class el::Loggers; static void ignoreComments(std::string* line) { std::size_t foundAt = 0; std::size_t quotesStart = line->find("\""); std::size_t quotesEnd = std::string::npos; if (quotesStart != std::string::npos) { quotesEnd = line->find("\"", quotesStart + 1); while (quotesEnd != std::string::npos && line->at(quotesEnd - 1) == '\\') { // Do not erase slash yet - we will erase it in parseLine(..) 
while loop quotesEnd = line->find("\"", quotesEnd + 2); } } if ((foundAt = line->find(base::consts::kConfigurationComment)) != std::string::npos) { if (foundAt < quotesEnd) { foundAt = line->find(base::consts::kConfigurationComment, quotesEnd + 1); } *line = line->substr(0, foundAt); } } static inline bool isLevel(const std::string& line) { return base::utils::Str::startsWith(line, std::string(base::consts::kConfigurationLevel)); } static inline bool isComment(const std::string& line) { return base::utils::Str::startsWith(line, std::string(base::consts::kConfigurationComment)); } static inline bool isConfig(const std::string& line) { std::size_t assignment = line.find('='); return line != "" && (line[0] >= 65 || line[0] <= 90 || line[0] >= 97 || line[0] <= 122) && (assignment != std::string::npos) && (line.size() > assignment); } static bool parseLine(std::string* line, std::string* currConfigStr, std::string* currLevelStr, Level* currLevel, Configurations* conf) { ConfigurationType currConfig = ConfigurationType::Unknown; std::string currValue = std::string(); *line = base::utils::Str::trim(*line); if (isComment(*line)) return true; ignoreComments(line); *line = base::utils::Str::trim(*line); if (line->empty()) { // Comment ignored return true; } if (isLevel(*line)) { if (line->size() <= 2) { return true; } *currLevelStr = line->substr(1, line->size() - 2); *currLevelStr = base::utils::Str::toUpper(*currLevelStr); *currLevelStr = base::utils::Str::trim(*currLevelStr); *currLevel = LevelHelper::convertFromString(currLevelStr->c_str()); return true; } if (isConfig(*line)) { std::size_t assignment = line->find('='); *currConfigStr = line->substr(0, assignment); *currConfigStr = base::utils::Str::toUpper(*currConfigStr); *currConfigStr = base::utils::Str::trim(*currConfigStr); currConfig = ConfigurationTypeHelper::convertFromString(currConfigStr->c_str()); currValue = line->substr(assignment + 1); currValue = base::utils::Str::trim(currValue); std::size_t quotesStart 
= currValue.find("\"", 0); std::size_t quotesEnd = std::string::npos; if (quotesStart != std::string::npos) { quotesEnd = currValue.find("\"", quotesStart + 1); while (quotesEnd != std::string::npos && currValue.at(quotesEnd - 1) == '\\') { currValue = currValue.erase(quotesEnd - 1, 1); quotesEnd = currValue.find("\"", quotesEnd + 2); } } if (quotesStart != std::string::npos && quotesEnd != std::string::npos) { // Quote provided - check and strip if valid ELPP_ASSERT((quotesStart < quotesEnd), "Configuration error - No ending quote found in [" << currConfigStr << "]"); ELPP_ASSERT((quotesStart + 1 != quotesEnd), "Empty configuration value for [" << currConfigStr << "]"); if ((quotesStart != quotesEnd) && (quotesStart + 1 != quotesEnd)) { // Explicit check in case if assertion is disabled currValue = currValue.substr(quotesStart + 1, quotesEnd - 1); } } } ELPP_ASSERT(*currLevel != Level::Unknown, "Unrecognized severity level [" << *currLevelStr << "]"); ELPP_ASSERT(currConfig != ConfigurationType::Unknown, "Unrecognized configuration [" << *currConfigStr << "]"); if (*currLevel == Level::Unknown || currConfig == ConfigurationType::Unknown) { return false; // unrecognizable level or config } conf->set(*currLevel, currConfig, currValue); return true; } }; private: std::string m_configurationFile; bool m_isFromFile; friend class el::Loggers; /// @brief Unsafely sets configuration if does not already exist void unsafeSetIfNotExist(Level level, ConfigurationType configurationType, const std::string& value) { Configuration* conf = RegistryWithPred::get(level, configurationType); if (conf == nullptr) { unsafeSet(level, configurationType, value); } } /// @brief Thread unsafe set void unsafeSet(Level level, ConfigurationType configurationType, const std::string& value) { Configuration* conf = RegistryWithPred::get(level, configurationType); if (conf == nullptr) { registerNew(new Configuration(level, configurationType, value)); } else { conf->setValue(value); } if (level == 
Level::Global) { unsafeSetGlobally(configurationType, value, false); } } /// @brief Sets configurations for all levels including Level::Global if includeGlobalLevel is true /// @see Configurations::setGlobally(ConfigurationType configurationType, const std::string& value) void setGlobally(ConfigurationType configurationType, const std::string& value, bool includeGlobalLevel) { if (includeGlobalLevel) { set(Level::Global, configurationType, value); } base::type::EnumType lIndex = LevelHelper::kMinValid; LevelHelper::forEachLevel(&lIndex, [&](void) -> bool { set(LevelHelper::castFromInt(lIndex), configurationType, value); return false; // Do not break lambda function yet as we need to set all levels regardless }); } /// @brief Sets configurations (Unsafely) for all levels including Level::Global if includeGlobalLevel is true /// @see Configurations::setGlobally(ConfigurationType configurationType, const std::string& value) void unsafeSetGlobally(ConfigurationType configurationType, const std::string& value, bool includeGlobalLevel) { if (includeGlobalLevel) { unsafeSet(Level::Global, configurationType, value); } base::type::EnumType lIndex = LevelHelper::kMinValid; LevelHelper::forEachLevel(&lIndex, [&](void) -> bool { unsafeSet(LevelHelper::castFromInt(lIndex), configurationType, value); return false; // Do not break lambda function yet as we need to set all levels regardless }); } }; namespace base { typedef std::shared_ptr FileStreamPtr; typedef std::map LogStreamsReferenceMap; /// @brief Configurations with data types. /// /// @detail el::Configurations have string based values. This is whats used internally in order to read correct configurations. /// This is to perform faster while writing logs using correct configurations. 
/// /// This is thread safe and final class containing non-virtual destructor (means nothing should inherit this class) class TypedConfigurations : public base::threading::ThreadSafe { public: /// @brief Constructor to initialize (construct) the object off el::Configurations /// @param configurations Configurations pointer/reference to base this typed configurations off. /// @param logStreamsReference Use ELPP->registeredLoggers()->logStreamsReference() TypedConfigurations(Configurations* configurations, base::LogStreamsReferenceMap* logStreamsReference) { m_configurations = configurations; m_logStreamsReference = logStreamsReference; build(m_configurations); } TypedConfigurations(const TypedConfigurations& other) { this->m_configurations = other.m_configurations; this->m_logStreamsReference = other.m_logStreamsReference; build(m_configurations); } virtual ~TypedConfigurations(void) { } const Configurations* configurations(void) const { return m_configurations; } inline bool enabled(Level level) { return getConfigByVal(level, &m_enabledMap, "enabled"); } inline bool toFile(Level level) { return getConfigByVal(level, &m_toFileMap, "toFile"); } inline const std::string& filename(Level level) { return getConfigByRef(level, &m_filenameMap, "filename"); } inline bool toStandardOutput(Level level) { return getConfigByVal(level, &m_toStandardOutputMap, "toStandardOutput"); } inline const base::LogFormat& logFormat(Level level) { return getConfigByRef(level, &m_logFormatMap, "logFormat"); } inline const base::MillisecondsWidth& millisecondsWidth(Level level = Level::Global) { return getConfigByRef(level, &m_millisecondsWidthMap, "millisecondsWidth"); } inline bool performanceTracking(Level level = Level::Global) { return getConfigByVal(level, &m_performanceTrackingMap, "performanceTracking"); } inline base::type::fstream_t* fileStream(Level level) { return getConfigByRef(level, &m_fileStreamMap, "fileStream").get(); } inline std::size_t maxLogFileSize(Level level) { return 
getConfigByVal(level, &m_maxLogFileSizeMap, "maxLogFileSize"); } inline std::size_t logFlushThreshold(Level level) { return getConfigByVal(level, &m_logFlushThresholdMap, "logFlushThreshold"); } private: Configurations* m_configurations; std::map m_enabledMap; std::map m_toFileMap; std::map m_filenameMap; std::map m_toStandardOutputMap; std::map m_logFormatMap; std::map m_millisecondsWidthMap; std::map m_performanceTrackingMap; std::map m_fileStreamMap; std::map m_maxLogFileSizeMap; std::map m_logFlushThresholdMap; base::LogStreamsReferenceMap* m_logStreamsReference; friend class el::Helpers; friend class el::base::MessageBuilder; friend class el::base::Writer; friend class el::base::DefaultLogDispatchCallback; friend class el::base::LogDispatcher; template inline Conf_T getConfigByVal(Level level, const std::map* confMap, const char* confName) { base::threading::ScopedLock scopedLock(lock()); return unsafeGetConfigByVal(level, confMap, confName); // This is not unsafe anymore - mutex locked in scope } template inline Conf_T& getConfigByRef(Level level, std::map* confMap, const char* confName) { base::threading::ScopedLock scopedLock(lock()); return unsafeGetConfigByRef(level, confMap, confName); // This is not unsafe anymore - mutex locked in scope } template inline Conf_T unsafeGetConfigByVal(Level level, const std::map* confMap, const char* confName) { ELPP_UNUSED(confName); typename std::map::const_iterator it = confMap->find(level); if (it == confMap->end()) { try { return confMap->at(Level::Global); } catch (...) 
{ ELPP_INTERNAL_ERROR("Unable to get configuration [" << confName << "] for level [" << LevelHelper::convertToString(level) << "]" << std::endl << "Please ensure you have properly configured logger.", false); return Conf_T(); } } return it->second; } template inline Conf_T& unsafeGetConfigByRef(Level level, std::map* confMap, const char* confName) { ELPP_UNUSED(confName); typename std::map::iterator it = confMap->find(level); if (it == confMap->end()) { try { return confMap->at(Level::Global); } catch (...) { ELPP_INTERNAL_ERROR("Unable to get configuration [" << confName << "] for level [" << LevelHelper::convertToString(level) << "]" << std::endl << "Please ensure you have properly configured logger.", false); } } return it->second; } template void setValue(Level level, const Conf_T& value, std::map* confMap, bool includeGlobalLevel = true) { // If map is empty and we are allowed to add into generic level (Level::Global), do it! if (confMap->empty() && includeGlobalLevel) { confMap->insert(std::make_pair(Level::Global, value)); return; } // If same value exist in generic level already, dont add it to explicit level typename std::map::iterator it = confMap->find(Level::Global); if (it != confMap->end() && it->second == value) { return; } // Now make sure we dont double up values if we really need to add it to explicit level it = confMap->find(level); if (it == confMap->end()) { // Value not found for level, add new confMap->insert(std::make_pair(level, value)); } else { // Value found, just update value confMap->at(level) = value; } } void build(Configurations* configurations) { base::threading::ScopedLock scopedLock(lock()); auto getBool = [] (std::string boolStr) -> bool { // Pass by value for trimming base::utils::Str::trim(boolStr); return (boolStr == "TRUE" || boolStr == "true" || boolStr == "1"); }; std::vector withFileSizeLimit; for (Configurations::const_iterator it = configurations->begin(); it != configurations->end(); ++it) { Configuration* conf = *it; 
// We cannot use switch on strong enums because Intel C++ dont support them yet if (conf->configurationType() == ConfigurationType::Enabled) { setValue(conf->level(), getBool(conf->value()), &m_enabledMap); } else if (conf->configurationType() == ConfigurationType::ToFile) { setValue(conf->level(), getBool(conf->value()), &m_toFileMap); } else if (conf->configurationType() == ConfigurationType::ToStandardOutput) { setValue(conf->level(), getBool(conf->value()), &m_toStandardOutputMap); } else if (conf->configurationType() == ConfigurationType::Filename) { // We do not yet configure filename but we will configure in another // loop. This is because if file cannot be created, we will force ToFile // to be false. Because configuring logger is not necessarily performance // sensative operation, we can live with another loop; (by the way this loop // is not very heavy either) } else if (conf->configurationType() == ConfigurationType::Format) { setValue(conf->level(), base::LogFormat(conf->level(), base::type::string_t(conf->value().begin(), conf->value().end())), &m_logFormatMap); } else if (conf->configurationType() == ConfigurationType::MillisecondsWidth) { setValue(Level::Global, base::MillisecondsWidth(static_cast(getULong(conf->value()))), &m_millisecondsWidthMap); } else if (conf->configurationType() == ConfigurationType::PerformanceTracking) { setValue(Level::Global, getBool(conf->value()), &m_performanceTrackingMap); } else if (conf->configurationType() == ConfigurationType::MaxLogFileSize) { setValue(conf->level(), static_cast(getULong(conf->value())), &m_maxLogFileSizeMap); #if !defined(ELPP_NO_DEFAULT_LOG_FILE) withFileSizeLimit.push_back(conf); #endif // !defined(ELPP_NO_DEFAULT_LOG_FILE) } else if (conf->configurationType() == ConfigurationType::LogFlushThreshold) { setValue(conf->level(), static_cast(getULong(conf->value())), &m_logFlushThresholdMap); } } // As mentioned early, we will now set filename configuration in separate loop to deal with 
non-existent files for (Configurations::const_iterator it = configurations->begin(); it != configurations->end(); ++it) { Configuration* conf = *it; if (conf->configurationType() == ConfigurationType::Filename) { insertFile(conf->level(), conf->value()); } } for (std::vector::iterator conf = withFileSizeLimit.begin(); conf != withFileSizeLimit.end(); ++conf) { // This is not unsafe as mutex is locked in currect scope unsafeValidateFileRolling((*conf)->level(), base::defaultPreRollOutCallback); } } unsigned long getULong(std::string confVal) { bool valid = true; base::utils::Str::trim(confVal); valid = !confVal.empty() && std::find_if(confVal.begin(), confVal.end(), [](char c) { return !base::utils::Str::isDigit(c); }) == confVal.end(); if (!valid) { valid = false; ELPP_ASSERT(valid, "Configuration value not a valid integer [" << confVal << "]"); return 0; } return atol(confVal.c_str()); } std::string resolveFilename(const std::string& filename) { std::string resultingFilename = filename; std::size_t dateIndex = std::string::npos; std::string dateTimeFormatSpecifierStr = std::string(base::consts::kDateTimeFormatSpecifierForFilename); if ((dateIndex = resultingFilename.find(dateTimeFormatSpecifierStr.c_str())) != std::string::npos) { while (dateIndex > 0 && resultingFilename[dateIndex - 1] == base::consts::kFormatSpecifierChar) { dateIndex = resultingFilename.find(dateTimeFormatSpecifierStr.c_str(), dateIndex + 1); } if (dateIndex != std::string::npos) { const char* ptr = resultingFilename.c_str() + dateIndex; // Goto end of specifier ptr += dateTimeFormatSpecifierStr.size(); std::string fmt; if ((resultingFilename.size() > dateIndex) && (ptr[0] == '{')) { // User has provided format for date/time ++ptr; int count = 1; // Start by 1 in order to remove starting brace std::stringstream ss; for (; *ptr; ++ptr, ++count) { if (*ptr == '}') { ++count; // In order to remove ending brace break; } ss << *ptr; } resultingFilename.erase(dateIndex + 
dateTimeFormatSpecifierStr.size(), count); fmt = ss.str(); } else { fmt = std::string(base::consts::kDefaultDateTimeFormatInFilename); } base::MillisecondsWidth msWidth(3); std::string now = base::utils::DateTime::getDateTime(fmt.c_str(), &msWidth); base::utils::Str::replaceAll(now, '/', '-'); // Replace path element since we are dealing with filename base::utils::Str::replaceAll(resultingFilename, dateTimeFormatSpecifierStr, now); } } return resultingFilename; } void insertFile(Level level, const std::string& fullFilename) { std::string resolvedFilename = resolveFilename(fullFilename); if (resolvedFilename.empty()) { std::cerr << "Could not load empty file for logging, please re-check your configurations for level [" << LevelHelper::convertToString(level) << "]"; } std::string filePath = base::utils::File::extractPathFromFilename(resolvedFilename, base::consts::kFilePathSeperator); if (filePath.size() < resolvedFilename.size()) { base::utils::File::createPath(filePath); } auto create = [&](Level level) { base::LogStreamsReferenceMap::iterator filestreamIter = m_logStreamsReference->find(resolvedFilename); base::type::fstream_t* fs = nullptr; if (filestreamIter == m_logStreamsReference->end()) { // We need a completely new stream, nothing to share with fs = base::utils::File::newFileStream(resolvedFilename); m_filenameMap.insert(std::make_pair(level, resolvedFilename)); m_fileStreamMap.insert(std::make_pair(level, base::FileStreamPtr(fs))); m_logStreamsReference->insert(std::make_pair(resolvedFilename, base::FileStreamPtr(m_fileStreamMap.at(level)))); } else { // Woops! we have an existing one, share it! 
m_filenameMap.insert(std::make_pair(level, filestreamIter->first)); m_fileStreamMap.insert(std::make_pair(level, base::FileStreamPtr(filestreamIter->second))); fs = filestreamIter->second.get(); } if (fs == nullptr) { // We display bad file error from newFileStream() ELPP_INTERNAL_ERROR("Setting [TO_FILE] of [" << LevelHelper::convertToString(level) << "] to FALSE", false); setValue(level, false, &m_toFileMap); } }; // If we dont have file conf for any level, create it for Level::Global first // otherwise create for specified level create(m_filenameMap.empty() && m_fileStreamMap.empty() ? Level::Global : level); } bool unsafeValidateFileRolling(Level level, const PreRollOutCallback& PreRollOutCallback) { base::type::fstream_t* fs = unsafeGetConfigByRef(level, &m_fileStreamMap, "fileStream").get(); if (fs == nullptr) { return true; } std::size_t maxLogFileSize = unsafeGetConfigByVal(level, &m_maxLogFileSizeMap, "maxLogFileSize"); std::size_t currFileSize = base::utils::File::getSizeOfFile(fs); if (maxLogFileSize != 0 && currFileSize >= maxLogFileSize) { std::string fname = unsafeGetConfigByRef(level, &m_filenameMap, "filename"); ELPP_INTERNAL_INFO(1, "Truncating log file [" << fname << "] as a result of configurations for level [" << LevelHelper::convertToString(level) << "]"); fs->close(); PreRollOutCallback(fname.c_str(), currFileSize); fs->open(fname, std::fstream::out | std::fstream::trunc); return true; } return false; } bool validateFileRolling(Level level, const PreRollOutCallback& PreRollOutCallback) { base::threading::ScopedLock scopedLock(lock()); return unsafeValidateFileRolling(level, PreRollOutCallback); } }; /// @brief Class that keeps record of current line hit for occasional logging class HitCounter { public: HitCounter(void) : m_filename(""), m_lineNumber(0), m_hitCounts(0) { } HitCounter(const char* filename, unsigned long int lineNumber) : m_filename(filename), m_lineNumber(lineNumber), m_hitCounts(0) { } HitCounter(const HitCounter& hitCounter) : 
m_filename(hitCounter.m_filename), m_lineNumber(hitCounter.m_lineNumber), m_hitCounts(hitCounter.m_hitCounts) { } HitCounter& operator=(const HitCounter& hitCounter) { m_filename = hitCounter.m_filename; m_lineNumber = hitCounter.m_lineNumber; m_hitCounts = hitCounter.m_hitCounts; return *this; } virtual ~HitCounter(void) { } /// @brief Resets location of current hit counter inline void resetLocation(const char* filename, unsigned long int lineNumber) { m_filename = filename; m_lineNumber = lineNumber; } /// @brief Validates hit counts and resets it if necessary inline void validateHitCounts(std::size_t n) { if (m_hitCounts >= base::consts::kMaxLogPerCounter) { m_hitCounts = (n >= 1 ? base::consts::kMaxLogPerCounter % n : 0); } ++m_hitCounts; } inline const char* filename(void) const { return m_filename; } inline unsigned long int lineNumber(void) const { return m_lineNumber; } inline std::size_t hitCounts(void) const { return m_hitCounts; } inline void increment(void) { ++m_hitCounts; } class Predicate { public: Predicate(const char* filename, unsigned long int lineNumber) : m_filename(filename), m_lineNumber(lineNumber) { } inline bool operator()(const HitCounter* counter) { return ((counter != nullptr) && (strcmp(counter->m_filename, m_filename) == 0) && (counter->m_lineNumber == m_lineNumber)); } private: const char* m_filename; unsigned long int m_lineNumber; }; private: const char* m_filename; unsigned long int m_lineNumber; std::size_t m_hitCounts; }; /// @brief Repository for hit counters used across the application class RegisteredHitCounters : public base::utils::RegistryWithPred { public: /// @brief Validates counter for every N, i.e, registers new if does not exist otherwise updates original one /// @return True if validation resulted in triggering hit. 
Meaning logs should be written everytime true is returned bool validateEveryN(const char* filename, unsigned long int lineNumber, std::size_t n) { base::threading::ScopedLock scopedLock(lock()); base::HitCounter* counter = get(filename, lineNumber); if (counter == nullptr) { registerNew(counter = new base::HitCounter(filename, lineNumber)); } counter->validateHitCounts(n); bool result = (n >= 1 && counter->hitCounts() != 0 && counter->hitCounts() % n == 0); return result; } /// @brief Validates counter for hits >= N, i.e, registers new if does not exist otherwise updates original one /// @return True if validation resulted in triggering hit. Meaning logs should be written everytime true is returned bool validateAfterN(const char* filename, unsigned long int lineNumber, std::size_t n) { base::threading::ScopedLock scopedLock(lock()); base::HitCounter* counter = get(filename, lineNumber); if (counter == nullptr) { registerNew(counter = new base::HitCounter(filename, lineNumber)); } // Do not use validateHitCounts here since we do not want to reset counter here // Note the >= instead of > because we are incrementing // after this check if (counter->hitCounts() >= n) return true; counter->increment(); return false; } /// @brief Validates counter for hits are <= n, i.e, registers new if does not exist otherwise updates original one /// @return True if validation resulted in triggering hit. 
Meaning logs should be written everytime true is returned bool validateNTimes(const char* filename, unsigned long int lineNumber, std::size_t n) { base::threading::ScopedLock scopedLock(lock()); base::HitCounter* counter = get(filename, lineNumber); if (counter == nullptr) { registerNew(counter = new base::HitCounter(filename, lineNumber)); } counter->increment(); // Do not use validateHitCounts here since we do not want to reset counter here if (counter->hitCounts() <= n) return true; return false; } /// @brief Gets hit counter registered at specified position inline const base::HitCounter* getCounter(const char* filename, unsigned long int lineNumber) { base::threading::ScopedLock scopedLock(lock()); return get(filename, lineNumber); } }; /// @brief Action to be taken for dispatching enum class DispatchAction : base::type::EnumType { None = 1, NormalLog = 2, SysLog = 4 }; } // namespace base template class Callback : protected base::threading::ThreadSafe { public: Callback(void) : m_enabled(true) {} inline bool enabled(void) const { return m_enabled; } inline void setEnabled(bool enabled) { base::threading::ScopedLock scopedLock(lock()); m_enabled = enabled; } protected: virtual void handle(const T* handlePtr) = 0; private: bool m_enabled; }; class LogDispatchData { public: LogDispatchData() : m_logMessage(nullptr), m_dispatchAction(base::DispatchAction::None) {} inline const LogMessage* logMessage(void) const { return m_logMessage; } inline base::DispatchAction dispatchAction(void) const { return m_dispatchAction; } private: LogMessage* m_logMessage; base::DispatchAction m_dispatchAction; friend class base::LogDispatcher; inline void setLogMessage(LogMessage* logMessage) { m_logMessage = logMessage; } inline void setDispatchAction(base::DispatchAction dispatchAction) { m_dispatchAction = dispatchAction; } }; class LogDispatchCallback : public Callback { private: friend class base::LogDispatcher; }; class PerformanceTrackingCallback : public Callback { private: 
friend class base::PerformanceTracker; }; class LogBuilder : base::NoCopy { public: virtual ~LogBuilder(void) { ELPP_INTERNAL_INFO(3, "Destroying log builder...")} virtual base::type::string_t build(const LogMessage* logMessage, bool appendNewLine) const = 0; void convertToColoredOutput(base::type::string_t* logLine, Level level) { if (!base::utils::s_termSupportsColor) return; const base::type::char_t* resetColor = ELPP_LITERAL("\x1b[0m"); if (level == Level::Error || level == Level::Fatal) *logLine = ELPP_LITERAL("\x1b[31m") + *logLine + resetColor; else if (level == Level::Warning) *logLine = ELPP_LITERAL("\x1b[33m") + *logLine + resetColor; } private: friend class el::base::DefaultLogDispatchCallback; }; typedef std::shared_ptr LogBuilderPtr; /// @brief Represents a logger holding ID and configurations we need to write logs /// /// @detail This class does not write logs itself instead its used by writer to read configuations from. class Logger : public base::threading::ThreadSafe, public Loggable { public: Logger(const std::string& id, base::LogStreamsReferenceMap* logStreamsReference) : m_id(id), m_typedConfigurations(nullptr), m_parentApplicationName(std::string()), m_isConfigured(false), m_logStreamsReference(logStreamsReference) { initUnflushedCount(); } Logger(const std::string& id, const Configurations& configurations, base::LogStreamsReferenceMap* logStreamsReference) : m_id(id), m_typedConfigurations(nullptr), m_parentApplicationName(std::string()), m_isConfigured(false), m_logStreamsReference(logStreamsReference) { initUnflushedCount(); configure(configurations); } Logger(const Logger& logger) { base::utils::safeDelete(m_typedConfigurations); m_id = logger.m_id; m_typedConfigurations = logger.m_typedConfigurations; m_parentApplicationName = logger.m_parentApplicationName; m_isConfigured = logger.m_isConfigured; m_configurations = logger.m_configurations; m_unflushedCount = logger.m_unflushedCount; m_logStreamsReference = logger.m_logStreamsReference; } 
Logger& operator=(const Logger& logger) { base::utils::safeDelete(m_typedConfigurations); m_id = logger.m_id; m_typedConfigurations = logger.m_typedConfigurations; m_parentApplicationName = logger.m_parentApplicationName; m_isConfigured = logger.m_isConfigured; m_configurations = logger.m_configurations; m_unflushedCount = logger.m_unflushedCount; m_logStreamsReference = logger.m_logStreamsReference; return *this; } virtual ~Logger(void) { base::utils::safeDelete(m_typedConfigurations); } virtual inline void log(el::base::type::ostream_t& os) const { os << m_id.c_str(); } /// @brief Configures the logger using specified configurations. void configure(const Configurations& configurations) { m_isConfigured = false; // we set it to false in case if we fail initUnflushedCount(); if (m_typedConfigurations != nullptr) { Configurations* c = const_cast(m_typedConfigurations->configurations()); if (c->hasConfiguration(Level::Global, ConfigurationType::Filename)) { // This check is definitely needed for cases like ELPP_NO_DEFAULT_LOG_FILE flush(); } } base::threading::ScopedLock scopedLock(lock()); if (m_configurations != configurations) { m_configurations.setFromBase(const_cast(&configurations)); } base::utils::safeDelete(m_typedConfigurations); m_typedConfigurations = new base::TypedConfigurations(&m_configurations, m_logStreamsReference); resolveLoggerFormatSpec(); m_isConfigured = true; } /// @brief Reconfigures logger using existing configurations inline void reconfigure(void) { ELPP_INTERNAL_INFO(1, "Reconfiguring logger [" << m_id << "]"); configure(m_configurations); } inline const std::string& id(void) const { return m_id; } inline const std::string& parentApplicationName(void) const { return m_parentApplicationName; } inline void setParentApplicationName(const std::string& parentApplicationName) { m_parentApplicationName = parentApplicationName; } inline Configurations* configurations(void) { return &m_configurations; } inline base::TypedConfigurations* 
typedConfigurations(void) { return m_typedConfigurations; } static inline bool isValidId(const std::string& id) { for (std::string::const_iterator it = id.begin(); it != id.end(); ++it) { if (!base::utils::Str::contains(base::consts::kValidLoggerIdSymbols, *it)) { return false; } } return true; } /// @brief Flushes logger to sync all log files for all levels inline void flush(void) { ELPP_INTERNAL_INFO(3, "Flushing logger [" << m_id << "] all levels"); base::threading::ScopedLock scopedLock(lock()); base::type::EnumType lIndex = LevelHelper::kMinValid; LevelHelper::forEachLevel(&lIndex, [&](void) -> bool { flush(LevelHelper::castFromInt(lIndex), nullptr); return false; }); } inline void flush(Level level, base::type::fstream_t* fs) { if (fs == nullptr && m_typedConfigurations->toFile(level)) { fs = m_typedConfigurations->fileStream(level); } if (fs != nullptr) { fs->flush(); m_unflushedCount.find(level)->second = 0; } } inline bool isFlushNeeded(Level level) { return ++m_unflushedCount.find(level)->second >= m_typedConfigurations->logFlushThreshold(level); } inline LogBuilder* logBuilder(void) const { return m_logBuilder.get(); } inline void setLogBuilder(const LogBuilderPtr& logBuilder) { m_logBuilder = logBuilder; } inline bool enabled(Level level) const { return m_typedConfigurations->enabled(level); } #if ELPP_VARIADIC_TEMPLATES_SUPPORTED # define LOGGER_LEVEL_WRITERS_SIGNATURES(FUNCTION_NAME)\ template \ inline void FUNCTION_NAME(const char*, const T&, const Args&...);\ template \ inline void FUNCTION_NAME(const T&); template inline void verbose(int, const char*, const T&, const Args&...); template inline void verbose(int, const T&); LOGGER_LEVEL_WRITERS_SIGNATURES(info) LOGGER_LEVEL_WRITERS_SIGNATURES(debug) LOGGER_LEVEL_WRITERS_SIGNATURES(warn) LOGGER_LEVEL_WRITERS_SIGNATURES(error) LOGGER_LEVEL_WRITERS_SIGNATURES(fatal) LOGGER_LEVEL_WRITERS_SIGNATURES(trace) # undef LOGGER_LEVEL_WRITERS_SIGNATURES #endif // ELPP_VARIADIC_TEMPLATES_SUPPORTED private: 
std::string m_id; base::TypedConfigurations* m_typedConfigurations; base::type::stringstream_t m_stream; std::string m_parentApplicationName; bool m_isConfigured; Configurations m_configurations; std::map m_unflushedCount; base::LogStreamsReferenceMap* m_logStreamsReference; LogBuilderPtr m_logBuilder; friend class el::LogMessage; friend class el::Loggers; friend class el::Helpers; friend class el::base::RegisteredLoggers; friend class el::base::DefaultLogDispatchCallback; friend class el::base::MessageBuilder; friend class el::base::Writer; friend class el::base::PErrorWriter; friend class el::base::Storage; friend class el::base::PerformanceTracker; friend class el::base::LogDispatcher; Logger(void); #if ELPP_VARIADIC_TEMPLATES_SUPPORTED template void log_(Level, int, const char*, const T&, const Args&...); template inline void log_(Level, int, const T&); template void log(Level, const char*, const T&, const Args&...); template inline void log(Level, const T&); #endif // ELPP_VARIADIC_TEMPLATES_SUPPORTED void initUnflushedCount(void) { m_unflushedCount.clear(); base::type::EnumType lIndex = LevelHelper::kMinValid; LevelHelper::forEachLevel(&lIndex, [&](void) -> bool { m_unflushedCount.insert(std::make_pair(LevelHelper::castFromInt(lIndex), 0)); return false; }); } inline base::type::stringstream_t& stream(void) { return m_stream; } void resolveLoggerFormatSpec(void) const { base::type::EnumType lIndex = LevelHelper::kMinValid; LevelHelper::forEachLevel(&lIndex, [&](void) -> bool { base::LogFormat* logFormat = const_cast(&m_typedConfigurations->logFormat(LevelHelper::castFromInt(lIndex))); base::utils::Str::replaceFirstWithEscape(logFormat->m_format, base::consts::kLoggerIdFormatSpecifier, m_id); return false; }); } }; namespace base { /// @brief Loggers repository class RegisteredLoggers : public base::utils::Registry { public: explicit RegisteredLoggers(const LogBuilderPtr& defaultLogBuilder) : m_defaultLogBuilder(defaultLogBuilder) { 
m_defaultConfigurations.setToDefault();
    }

    /// @brief Flushes every registered log stream on destruction
    virtual ~RegisteredLoggers(void) { flushAll(); }

    /// @brief Replaces the configurations used for loggers created from now on
    /// @param configurations Source configurations, handed to Configurations::setFromBase() under lock()
    inline void setDefaultConfigurations(const Configurations& configurations) {
        base::threading::ScopedLock scopedLock(lock());
        // setFromBase() is given a non-const pointer, hence the cast.
        // NOTE(review): the target type of const_cast is missing in this copy of the text - confirm upstream.
        m_defaultConfigurations.setFromBase(const_cast(&configurations));
    }

    /// @brief Pointer to the configurations applied to newly created loggers
    inline Configurations* defaultConfigurations(void) { return &m_defaultConfigurations; }

    /// @brief Looks a logger up by ID, optionally creating and registering it
    /// @param id Logger ID
    /// @param forceCreation When true a missing logger is created, provided @p id passes Logger::isValidId()
    /// @return The registered logger; nullptr when it does not exist and @p forceCreation is false, or when
    ///         @p id is invalid (ELPP_ASSERT is invoked in that case)
    Logger* get(const std::string& id, bool forceCreation = true) {
        base::threading::ScopedLock scopedLock(lock());
        Logger* logger_ = base::utils::Registry::get(id);
        if (logger_ == nullptr && forceCreation) {
            bool validId = Logger::isValidId(id);
            if (!validId) {
                ELPP_ASSERT(validId, "Invalid logger ID [" << id << "]. Not registering this logger.");
                return nullptr;
            }
            // New loggers start from the default configurations and share this registry's stream map
            logger_ = new Logger(id, m_defaultConfigurations, &m_logStreamsReference);
            logger_->m_logBuilder = m_defaultLogBuilder;
            registerNew(id, logger_);
        }
        return logger_;
    }

    /// @brief Unregisters the logger with the given ID
    /// @return false only for the ID "default", which is never removed; true otherwise, including when no
    ///         logger with that ID is registered
    /// @note The lookup below happens before any lock is taken; only unregister() locks.
    bool remove(const std::string& id) {
        if (id == "default") {
            return false;
        }
        Logger* logger = base::utils::Registry::get(id);
        if (logger != nullptr) {
            unregister(logger);
        }
        return true;
    }

    /// @brief Whether a logger with this ID is registered (never creates one)
    inline bool has(const std::string& id) { return get(id, false) != nullptr; }

    /// @brief Unregisters @p logger by its ID under lock(); @p logger is dereferenced and must not be null
    inline void unregister(Logger*& logger) {
        base::threading::ScopedLock scopedLock(lock());
        base::utils::Registry::unregister(logger->id());
    }

    /// @brief Pointer to the stream map handed to every logger created by get()
    inline base::LogStreamsReferenceMap* logStreamsReference(void) { return &m_logStreamsReference; }

    /// @brief Flushes every non-null stream held in the log streams reference map
    inline void flushAll(void) {
        ELPP_INTERNAL_INFO(1, "Flushing all log files");
        base::threading::ScopedLock scopedLock(lock());
        for (base::LogStreamsReferenceMap::iterator it = m_logStreamsReference.begin();
                it != m_logStreamsReference.end(); ++it) {
            if (it->second.get() == nullptr) continue;
            it->second->flush();
        }
    }
private:
    LogBuilderPtr m_defaultLogBuilder;
    Configurations m_defaultConfigurations;
    base::LogStreamsReferenceMap m_logStreamsReference;
    friend class el::base::Storage;
};
/// @brief Represents registries for verbose logging
class VRegistry : base::NoCopy, public
base::threading::ThreadSafe {
public:
    /// @param level Initial global verbose level
    /// @param pFlags Pointer to the logging flags consulted by this registry; stored, not copied
    explicit VRegistry(base::type::VerboseLevel level, base::type::EnumType* pFlags) : m_level(level), m_pFlags(pFlags) {
    }

    /// @brief Sets verbose level. Accepted range is 0-9
    ///
    /// Values above 9 are replaced by base::consts::kMaxVerboseLevel.
    inline void setLevel(base::type::VerboseLevel level) {
        base::threading::ScopedLock scopedLock(lock());
        // NOTE(review): the `level < 0` branch is only reachable if VerboseLevel is a signed type - its
        // definition is not visible here; confirm.
        if (level < 0)
            m_level = 0;
        else if (level > 9)
            m_level = base::consts::kMaxVerboseLevel;
        else
            m_level = level;
    }

    /// @brief Current global verbose level (read without taking the lock)
    inline base::type::VerboseLevel level(void) const { return m_level; }

    /// @brief Removes every per-module verbose level
    inline void clearModules(void) {
        base::threading::ScopedLock scopedLock(lock());
        m_modules.clear();
    }

    /// @brief Parses a "name=level,name=level" list and registers a verbose level per module
    /// @param modules Null-terminated specification; only read, one character at a time
    void setModules(const char* modules) {
        base::threading::ScopedLock scopedLock(lock());
        // Makes the text in `ss` end with `sfx`: a trailing `prev` (when given) and a trailing `sfx` are
        // removed first, then `sfx` is appended.
        auto addSuffix = [](std::stringstream& ss, const char* sfx, const char* prev) {
            if (prev != nullptr && base::utils::Str::endsWith(ss.str(), std::string(prev))) {
                std::string chr(ss.str().substr(0, ss.str().size() - strlen(prev)));
                ss.str(std::string(""));
                ss << chr;
            }
            if (base::utils::Str::endsWith(ss.str(), std::string(sfx))) {
                std::string chr(ss.str().substr(0, ss.str().size() - strlen(sfx)));
                ss.str(std::string(""));
                ss << chr;
            }
            ss << sfx;
        };
        // Registers `level` for the module named in `ss`. Unless DisableVModulesExtensions is set, the name is
        // registered once per known source/header extension by swapping the suffix step by step; the insertion
        // that follows the closing brace of the `if` stores the last variant (or the unchanged name).
        auto insert = [&](std::stringstream& ss, base::type::VerboseLevel level) {
            if (!base::utils::hasFlag(LoggingFlag::DisableVModulesExtensions, *m_pFlags)) {
                addSuffix(ss, ".h", nullptr);
                m_modules.insert(std::make_pair(ss.str(), level));
                addSuffix(ss, ".c", ".h");
                m_modules.insert(std::make_pair(ss.str(), level));
                addSuffix(ss, ".cpp", ".c");
                m_modules.insert(std::make_pair(ss.str(), level));
                addSuffix(ss, ".cc", ".cpp");
                m_modules.insert(std::make_pair(ss.str(), level));
                addSuffix(ss, ".cxx", ".cc");
                m_modules.insert(std::make_pair(ss.str(), level));
                addSuffix(ss, ".-inl.h", ".cxx");
                m_modules.insert(std::make_pair(ss.str(), level));
                addSuffix(ss, ".hxx", ".-inl.h");
                m_modules.insert(std::make_pair(ss.str(), level));
                addSuffix(ss, ".hpp", ".hxx");
                m_modules.insert(std::make_pair(ss.str(), level));
                addSuffix(ss, ".hh", ".hpp");
            }
m_modules.insert(std::make_pair(ss.str(), level)); }; bool isMod = true; bool isLevel = false; std::stringstream ss; int level = -1; for (; *modules; ++modules) { switch (*modules) { case '=': isLevel = true; isMod = false; break; case ',': isLevel = false; isMod = true; if (!ss.str().empty() && level != -1) { insert(ss, level); ss.str(std::string("")); level = -1; } break; default: if (isMod) { ss << *modules; } else if (isLevel) { if (isdigit(*modules)) { level = static_cast(*modules) - 48; } } break; } } if (!ss.str().empty() && level != -1) { insert(ss, level); } } bool allowed(base::type::VerboseLevel vlevel, const char* file) { base::threading::ScopedLock scopedLock(lock()); if (m_modules.empty() || file == nullptr) { return vlevel <= m_level; } else { std::map::iterator it = m_modules.begin(); for (; it != m_modules.end(); ++it) { if (base::utils::Str::wildCardMatch(file, it->first.c_str())) { return vlevel <= it->second; } } if (base::utils::hasFlag(LoggingFlag::AllowVerboseIfModuleNotSpecified, *m_pFlags)) { return true; } return false; } } inline const std::map& modules(void) const { return m_modules; } void setFromArgs(const base::utils::CommandLineArgs* commandLineArgs) { if (commandLineArgs->hasParam("-v") || commandLineArgs->hasParam("--verbose") || commandLineArgs->hasParam("-V") || commandLineArgs->hasParam("--VERBOSE")) { setLevel(base::consts::kMaxVerboseLevel); } else if (commandLineArgs->hasParamWithValue("--v")) { setLevel(atoi(commandLineArgs->getParamValue("--v"))); } else if (commandLineArgs->hasParamWithValue("--V")) { setLevel(atoi(commandLineArgs->getParamValue("--V"))); } else if ((commandLineArgs->hasParamWithValue("-vmodule")) && vModulesEnabled()) { setModules(commandLineArgs->getParamValue("-vmodule")); } else if (commandLineArgs->hasParamWithValue("-VMODULE") && vModulesEnabled()) { setModules(commandLineArgs->getParamValue("-VMODULE")); } } /// @brief Whether or not vModules enabled inline bool vModulesEnabled(void) { return 
!base::utils::hasFlag(LoggingFlag::DisableVModules, *m_pFlags); } private: base::type::VerboseLevel m_level; base::type::EnumType* m_pFlags; std::map m_modules; }; } // namespace base class LogMessage { public: LogMessage(Level level, const std::string& file, unsigned long int line, const std::string& func, base::type::VerboseLevel verboseLevel, Logger* logger) : m_level(level), m_file(file), m_line(line), m_func(func), m_verboseLevel(verboseLevel), m_logger(logger), m_message(std::move(logger->stream().str())) { } inline Level level(void) const { return m_level; } inline const std::string& file(void) const { return m_file; } inline unsigned long int line(void) const { return m_line; } // NOLINT inline const std::string& func(void) const { return m_func; } inline base::type::VerboseLevel verboseLevel(void) const { return m_verboseLevel; } inline Logger* logger(void) const { return m_logger; } inline const base::type::string_t& message(void) const { return m_message; } private: Level m_level; std::string m_file; unsigned long int m_line; std::string m_func; base::type::VerboseLevel m_verboseLevel; Logger* m_logger; base::type::string_t m_message; }; namespace base { #if ELPP_ASYNC_LOGGING class AsyncLogItem { public: explicit AsyncLogItem(const LogMessage& logMessage, const LogDispatchData& data, const base::type::string_t& logLine) : m_logMessage(logMessage), m_dispatchData(data), m_logLine(logLine) {} virtual ~AsyncLogItem() {} inline LogMessage* logMessage(void) { return &m_logMessage; } inline LogDispatchData* data(void) { return &m_dispatchData; } inline base::type::string_t logLine(void) { return m_logLine; } private: LogMessage m_logMessage; LogDispatchData m_dispatchData; base::type::string_t m_logLine; }; class AsyncLogQueue : public base::threading::ThreadSafe { public: virtual ~AsyncLogQueue() { ELPP_INTERNAL_INFO(6, "~AsyncLogQueue"); } inline AsyncLogItem next(void) { base::threading::ScopedLock scopedLock(lock()); AsyncLogItem result = m_queue.front(); 
m_queue.pop();
        return result;
    }

    /// @brief Appends a copy of @p item
    inline void push(const AsyncLogItem& item) {
        base::threading::ScopedLock scopedLock(lock());
        m_queue.push(item);
    }
    /// @brief Drops the oldest item; the queue must not be empty
    inline void pop(void) {
        base::threading::ScopedLock scopedLock(lock());
        m_queue.pop();
    }
    /// @brief Returns a copy of the oldest item without removing it; the queue must not be empty
    inline AsyncLogItem front(void) {
        base::threading::ScopedLock scopedLock(lock());
        return m_queue.front();
    }
    /// @brief Whether the queue currently holds no item
    inline bool empty(void) {
        base::threading::ScopedLock scopedLock(lock());
        return m_queue.empty();
    }
private:
    // NOTE(review): the element type of std::queue is missing in this copy of the text (the methods above
    // store and return AsyncLogItem) - confirm upstream.
    std::queue m_queue;
};
/// @brief Interface of the worker that Storage starts for asynchronous dispatching
class IWorker {
public:
    virtual ~IWorker() {}
    virtual void start() = 0;
};
#endif // ELPP_ASYNC_LOGGING
/// @brief Easylogging++ management storage
class Storage : base::NoCopy, public base::threading::ThreadSafe {
public:
    /// @brief Creates the registries and registers the default and performance loggers (plus the syslog
    ///        logger when ELPP_SYSLOG is defined)
    /// @param defaultLogBuilder Builder passed on to the logger registry
    /// (async builds additionally take the dispatch worker, which the destructor deletes)
#if ELPP_ASYNC_LOGGING
    Storage(const LogBuilderPtr& defaultLogBuilder, base::IWorker* asyncDispatchWorker) :
#else
    explicit Storage(const LogBuilderPtr& defaultLogBuilder) :
#endif // ELPP_ASYNC_LOGGING
        m_registeredHitCounters(new base::RegisteredHitCounters()),
        m_registeredLoggers(new base::RegisteredLoggers(defaultLogBuilder)),
        m_flags(0x0),
        // The verbose registry keeps a pointer to m_flags, which is declared before it
        m_vRegistry(new base::VRegistry(0, &m_flags)),
#if ELPP_ASYNC_LOGGING
        m_asyncLogQueue(new base::AsyncLogQueue()),
        m_asyncDispatchWorker(asyncDispatchWorker),
#endif // ELPP_ASYNC_LOGGING
        m_preRollOutCallback(base::defaultPreRollOutCallback) {
        // Register default logger
        m_registeredLoggers->get(std::string(base::consts::kDefaultLoggerId));
        // Register performance logger and reconfigure format
        Logger* performanceLogger = m_registeredLoggers->get(std::string(base::consts::kPerformanceLoggerId));
        performanceLogger->configurations()->setGlobally(ConfigurationType::Format, std::string("%datetime %level %msg"));
        performanceLogger->reconfigure();
#if defined(ELPP_SYSLOG)
        // Register syslog logger and reconfigure format
        Logger* sysLogLogger = m_registeredLoggers->get(std::string(base::consts::kSysLogLoggerId));
        sysLogLogger->configurations()->setGlobally(ConfigurationType::Format, std::string("%level: %msg"));
        sysLogLogger->reconfigure();
#else
ELPP_UNUSED(base::consts::kSysLogLoggerId);
#endif // defined(ELPP_SYSLOG)
        addFlag(LoggingFlag::AllowVerboseIfModuleNotSpecified);
        // Install the dispatch callback matching the build mode, then the performance tracking callback.
        // NOTE(review): these calls go to member templates; their explicit template arguments are missing in
        // this copy of the text - confirm upstream.
#if ELPP_ASYNC_LOGGING
        installLogDispatchCallback(std::string("AsyncLogDispatchCallback"));
#else
        installLogDispatchCallback(std::string("DefaultLogDispatchCallback"));
#endif // ELPP_ASYNC_LOGGING
        installPerformanceTrackingCallback(std::string("DefaultPerformanceTrackingCallback"));
        ELPP_INTERNAL_INFO(1, "Easylogging++ has been initialized");
#if ELPP_ASYNC_LOGGING
        m_asyncDispatchWorker->start();
#endif // ELPP_ASYNC_LOGGING
    }

    /// @brief Releases everything the storage owns
    ///
    /// In async builds the asynchronous dispatch callback is first swapped for the synchronous one, then the
    /// worker and the queue are deleted; hit counters, loggers and the verbose registry follow.
    virtual ~Storage(void) {
        ELPP_INTERNAL_INFO(4, "Destroying storage");
#if ELPP_ASYNC_LOGGING
        ELPP_INTERNAL_INFO(5, "Replacing log dispatch callback to synchronous");
        uninstallLogDispatchCallback(std::string("AsyncLogDispatchCallback"));
        installLogDispatchCallback(std::string("DefaultLogDispatchCallback"));
        ELPP_INTERNAL_INFO(5, "Destroying asyncDispatchWorker");
        base::utils::safeDelete(m_asyncDispatchWorker);
        ELPP_INTERNAL_INFO(5, "Destroying asyncLogQueue");
        base::utils::safeDelete(m_asyncLogQueue);
#endif // ELPP_ASYNC_LOGGING
        ELPP_INTERNAL_INFO(5, "Destroying registeredHitCounters");
        base::utils::safeDelete(m_registeredHitCounters);
        ELPP_INTERNAL_INFO(5, "Destroying registeredLoggers");
        base::utils::safeDelete(m_registeredLoggers);
        ELPP_INTERNAL_INFO(5, "Destroying vRegistry");
        base::utils::safeDelete(m_vRegistry);
    }

    /// @brief Forwards to RegisteredHitCounters::validateEveryN() for the given source position
    inline bool validateEveryNCounter(const char* filename, unsigned long int lineNumber, std::size_t occasion) {
        return hitCounters()->validateEveryN(filename, lineNumber, occasion);
    }

    /// @brief Forwards to RegisteredHitCounters::validateAfterN() for the given source position
    inline bool validateAfterNCounter(const char* filename, unsigned long int lineNumber, std::size_t n) { // NOLINT
        return hitCounters()->validateAfterN(filename, lineNumber, n);
    }

    /// @brief Forwards to RegisteredHitCounters::validateNTimes() for the given source position
    inline bool validateNTimesCounter(const char* filename, unsigned long int lineNumber, std::size_t n) { // NOLINT
        return hitCounters()->validateNTimes(filename, lineNumber, n);
    }

    /// @brief Registry of hit counters owned by this storage
    inline base::RegisteredHitCounters* hitCounters(void) const {
return m_registeredHitCounters;
    }

    /// @brief Registry of loggers owned by this storage
    inline base::RegisteredLoggers* registeredLoggers(void) const { return m_registeredLoggers; }

    /// @brief Verbose-level registry owned by this storage
    inline base::VRegistry* vRegistry(void) const { return m_vRegistry; }

#if ELPP_ASYNC_LOGGING
    /// @brief Queue feeding the asynchronous dispatch worker
    inline base::AsyncLogQueue* asyncLogQueue(void) const { return m_asyncLogQueue; }
#endif // ELPP_ASYNC_LOGGING

    /// @brief Command line arguments as stored by setApplicationArguments()
    inline const base::utils::CommandLineArgs* commandLineArgs(void) const { return &m_commandLineArgs; }

    // Logging flag helpers; all of them operate on m_flags without taking the lock.
    inline void addFlag(LoggingFlag flag) { base::utils::addFlag(flag, &m_flags); }
    inline void removeFlag(LoggingFlag flag) { base::utils::removeFlag(flag, &m_flags); }
    inline bool hasFlag(LoggingFlag flag) const { return base::utils::hasFlag(flag, m_flags); }
    inline base::type::EnumType flags(void) const { return m_flags; }
    inline void setFlags(base::type::EnumType flags) { m_flags = flags; }

    /// @brief Installs the callback handed to file rolling validation (see preRollOutCallback())
    inline void setPreRollOutCallback(const PreRollOutCallback& callback) { m_preRollOutCallback = callback; }

    /// @brief Restores base::defaultPreRollOutCallback
    inline void unsetPreRollOutCallback(void) { m_preRollOutCallback = base::defaultPreRollOutCallback; }

    inline PreRollOutCallback& preRollOutCallback(void) { return m_preRollOutCallback; }

    /// @brief Whether a custom format specifier comparing equal to @p formatSpecifier is installed
    inline bool hasCustomFormatSpecifier(const char* formatSpecifier) {
        base::threading::ScopedLock scopedLock(lock());
        return std::find(m_customFormatSpecifiers.begin(), m_customFormatSpecifiers.end(),
                         formatSpecifier) != m_customFormatSpecifiers.end();
    }

    /// @brief Installs @p customFormatSpecifier unless an equal one is already installed
    /// @note The existence check and the insertion each take the lock separately, so the pair is not atomic.
    inline void installCustomFormatSpecifier(const CustomFormatSpecifier& customFormatSpecifier) {
        if (hasCustomFormatSpecifier(customFormatSpecifier.formatSpecifier())) {
            return;
        }
        base::threading::ScopedLock scopedLock(lock());
        m_customFormatSpecifiers.push_back(customFormatSpecifier);
    }

    /// @brief Removes the installed custom format specifier matching @p formatSpecifier
    /// @return true when one was found and erased, false otherwise
    inline bool uninstallCustomFormatSpecifier(const char* formatSpecifier) {
        base::threading::ScopedLock scopedLock(lock());
        // NOTE(review): the element type of std::vector is missing in this copy of the text (the container
        // stores CustomFormatSpecifier objects, see installCustomFormatSpecifier) - confirm upstream.
        std::vector::iterator it = std::find(m_customFormatSpecifiers.begin(),
                                             m_customFormatSpecifiers.end(), formatSpecifier);
        if (it != m_customFormatSpecifiers.end() && strcmp(formatSpecifier, it->formatSpecifier()) ==
0) {
            m_customFormatSpecifiers.erase(it);
            return true;
        }
        return false;
    }

    /// @brief Pointer to the list of installed custom format specifiers (no lock is taken)
    // NOTE(review): the element type of std::vector is missing in this copy of the text.
    const std::vector* customFormatSpecifiers(void) const { return &m_customFormatSpecifiers; }

    /// @brief Stores the logging level in m_loggingLevel
    inline void setLoggingLevel(Level level) { m_loggingLevel = level; }

    // Typed wrappers around installCallback() / uninstallCallback() / callback() for the two callback maps.
    // NOTE(review): the template parameter lists after `template` and the explicit template arguments of the
    // forwarded calls are missing in this copy of the text - confirm upstream.

    /// @brief Installs a log dispatch callback under @p id; false when the ID is already taken
    template inline bool installLogDispatchCallback(const std::string& id) {
        return installCallback(id, &m_logDispatchCallbacks);
    }

    /// @brief Removes the log dispatch callback registered under @p id, if any
    template inline void uninstallLogDispatchCallback(const std::string& id) {
        uninstallCallback(id, &m_logDispatchCallbacks);
    }

    /// @brief Log dispatch callback registered under @p id, or nullptr
    template inline T* logDispatchCallback(const std::string& id) {
        return callback(id, &m_logDispatchCallbacks);
    }

    /// @brief Installs a performance tracking callback under @p id; false when the ID is already taken
    template inline bool installPerformanceTrackingCallback(const std::string& id) {
        return installCallback(id, &m_performanceTrackingCallbacks);
    }

    /// @brief Removes the performance tracking callback registered under @p id, if any
    template inline void uninstallPerformanceTrackingCallback(const std::string& id) {
        uninstallCallback(id, &m_performanceTrackingCallbacks);
    }

    /// @brief Performance tracking callback registered under @p id, or nullptr
    template inline T* performanceTrackingCallback(const std::string& id) {
        return callback(id, &m_performanceTrackingCallbacks);
    }
private:
    base::RegisteredHitCounters* m_registeredHitCounters;
    base::RegisteredLoggers* m_registeredLoggers;
    base::type::EnumType m_flags;
    base::VRegistry* m_vRegistry;
#if ELPP_ASYNC_LOGGING
    base::AsyncLogQueue* m_asyncLogQueue;
    base::IWorker* m_asyncDispatchWorker;
#endif // ELPP_ASYNC_LOGGING
    base::utils::CommandLineArgs m_commandLineArgs;
    PreRollOutCallback m_preRollOutCallback;
    // NOTE(review): template arguments of the three containers below are missing in this copy of the text.
    std::map m_logDispatchCallbacks;
    std::map m_performanceTrackingCallbacks;
    std::vector m_customFormatSpecifiers;
    // NOTE(review): not part of the constructor's initializer list shown in this file, so it appears to stay
    // uninitialized until setLoggingLevel() is called - confirm.
    Level m_loggingLevel;

    friend class el::Helpers;
    friend class el::base::DefaultLogDispatchCallback;
    friend class el::LogBuilder;
    friend class el::base::MessageBuilder;
    friend class el::base::Writer;
    friend class el::base::PerformanceTracker;
    friend class el::base::LogDispatcher;

    /// @brief Stores the command line arguments and applies the options derived from them
    ///
    /// Verbose options are applied first; the optional sections below set the default log file and the
    /// logging flags.
    void setApplicationArguments(int argc, char** argv) {
        m_commandLineArgs.setArgs(argc, argv);
        m_vRegistry->setFromArgs(commandLineArgs());
        // default log file
#if !defined(ELPP_DISABLE_LOG_FILE_FROM_ARG)
        if
(m_commandLineArgs.hasParamWithValue(base::consts::kDefaultLogFileParam)) {
            // The file name becomes the default for future loggers and is applied to every registered logger
            Configurations c;
            c.setGlobally(ConfigurationType::Filename,
                          std::string(m_commandLineArgs.getParamValue(base::consts::kDefaultLogFileParam)));
            registeredLoggers()->setDefaultConfigurations(c);
            for (base::RegisteredLoggers::iterator it = registeredLoggers()->begin();
                    it != registeredLoggers()->end(); ++it) {
                it->second->configure(c);
            }
        }
#endif // !defined(ELPP_DISABLE_LOG_FILE_FROM_ARG)
#if defined(ELPP_LOGGING_FLAGS_FROM_ARG)
        // Replaces (does not merge) the current flags with the numeric value given on the command line
        if (m_commandLineArgs.hasParamWithValue(base::consts::kLoggingFlagsParam)) {
            m_flags = atoi(m_commandLineArgs.getParamValue(base::consts::kLoggingFlagsParam));
        }
#endif // defined(ELPP_LOGGING_FLAGS_FROM_ARG)
    }

    /// @brief Overload for `const char**` argument vectors; forwards to the `char**` overload
    // NOTE(review): the target type of const_cast is missing in this copy of the text - confirm upstream.
    inline void setApplicationArguments(int argc, const char** argv) {
        setApplicationArguments(argc, const_cast(argv));
    }

    // Generic helpers behind the typed callback wrappers.
    // NOTE(review): the template parameter lists (T and TPtr are used in the bodies) and the template arguments
    // of std::map / static_cast are missing in this copy of the text - confirm upstream.

    /// @brief Creates a new T, wraps it in TPtr and stores it under @p id
    /// @return false when @p id is already present in @p mapT (nothing is created then), true otherwise
    template inline bool installCallback(const std::string& id, std::map* mapT) {
        if (mapT->find(id) == mapT->end()) {
            mapT->insert(std::make_pair(id, TPtr(new T())));
            return true;
        }
        return false;
    }

    /// @brief Erases the entry stored under @p id, if any
    template inline void uninstallCallback(const std::string& id, std::map* mapT) {
        if (mapT->find(id) != mapT->end()) {
            mapT->erase(id);
        }
    }

    /// @brief Returns the callback stored under @p id cast to T*, or nullptr when @p id is unknown
    template inline T* callback(const std::string& id, std::map* mapT) {
        typename std::map::iterator iter = mapT->find(id);
        if (iter != mapT->end()) {
            return static_cast(iter->second.get());
        }
        return nullptr;
    }
};
extern ELPP_EXPORT base::type::StoragePointer elStorage;
#define ELPP el::base::elStorage
/// @brief Synchronous dispatch callback: builds the log line and writes it to file, standard output and,
///        when ELPP_SYSLOG is defined, syslog
class DefaultLogDispatchCallback : public LogDispatchCallback {
protected:
    /// @brief Remembers @p data in m_data, builds the log line through the logger's builder and dispatches it
    ///
    /// A trailing newline is requested from the builder only for DispatchAction::NormalLog.
    void handle(const LogDispatchData* data) {
        m_data = data;
        dispatch(std::move(m_data->logMessage()->logger()->logBuilder()->build(m_data->logMessage(),
                 m_data->dispatchAction() == base::DispatchAction::NormalLog)));
    }
private:
    const LogDispatchData* m_data;

    /// @brief Writes @p logLine to the targets selected by the dispatch action and the logger configuration
    void dispatch(base::type::string_t&& logLine) {
        if (m_data->dispatchAction() == base::DispatchAction::NormalLog) {
            if
(m_data->logMessage()->logger()->m_typedConfigurations->toFile(m_data->logMessage()->level())) {
                // File target: write the line, then flush immediately or once the flush threshold is reached
                base::type::fstream_t* fs = m_data->logMessage()->logger()->m_typedConfigurations->fileStream(m_data->logMessage()->level());
                if (fs != nullptr) {
                    fs->write(logLine.c_str(), logLine.size());
                    if (fs->fail()) {
                        ELPP_INTERNAL_ERROR("Unable to write log to file ["
                                            << m_data->logMessage()->logger()->m_typedConfigurations->filename(m_data->logMessage()->level()) << "].\n"
                                            << "Few possible reasons (could be something else):\n" << " * Permission denied\n"
                                            << " * Disk full\n" << " * Disk is not writable", true);
                    } else {
                        // isFlushNeeded() also counts this entry, so it is only evaluated when the
                        // ImmediateFlush flag is not set (short-circuit)
                        if (ELPP->hasFlag(LoggingFlag::ImmediateFlush) || (m_data->logMessage()->logger()->isFlushNeeded(m_data->logMessage()->level()))) {
                            m_data->logMessage()->logger()->flush(m_data->logMessage()->level(), fs);
                        }
                    }
                } else {
                    ELPP_INTERNAL_ERROR("Log file for [" << LevelHelper::convertToString(m_data->logMessage()->level()) << "] "
                                        << "has not been configured but [TO_FILE] is configured to TRUE. [Logger ID: "
                                        << m_data->logMessage()->logger()->id() << "]", false);
                }
            }
            // Standard output target; colouring modifies logLine in place, after the file write above
            if (m_data->logMessage()->logger()->m_typedConfigurations->toStandardOutput(m_data->logMessage()->level())) {
                if (ELPP->hasFlag(LoggingFlag::ColoredTerminalOutput))
                    m_data->logMessage()->logger()->logBuilder()->convertToColoredOutput(&logLine, m_data->logMessage()->level());
                ELPP_COUT << ELPP_COUT_LINE(logLine);
            }
        }
#if defined(ELPP_SYSLOG)
        else if (m_data->dispatchAction() == base::DispatchAction::SysLog) {
            // Determine syslog priority
            int sysLogPriority = 0;
            if (m_data->logMessage()->level() == Level::Fatal)
                sysLogPriority = LOG_EMERG;
            else if (m_data->logMessage()->level() == Level::Error)
                sysLogPriority = LOG_ERR;
            else if (m_data->logMessage()->level() == Level::Warning)
                sysLogPriority = LOG_WARNING;
            else if (m_data->logMessage()->level() == Level::Info)
                sysLogPriority = LOG_INFO;
            else if (m_data->logMessage()->level() == Level::Debug)
                sysLogPriority = LOG_DEBUG;
            else
                sysLogPriority = LOG_NOTICE;
#  if defined(ELPP_UNICODE)
            // The converted buffer is released with free() right after use
            char* line = base::utils::Str::wcharPtrToCharPtr(logLine.c_str());
            syslog(sysLogPriority, "%s", line);
            free(line);
#  else
            syslog(sysLogPriority, "%s", logLine.c_str());
#  endif
        }
#endif // defined(ELPP_SYSLOG)
    }
};
#if ELPP_ASYNC_LOGGING
/// @brief Asynchronous dispatch callback: prints to standard output right away and queues file writes
class AsyncLogDispatchCallback : public LogDispatchCallback {
protected:
    /// @brief Builds the log line, writes it to standard output when configured and queues it for file output
    void handle(const LogDispatchData* data) {
        base::type::string_t logLine = data->logMessage()->logger()->logBuilder()->build(data->logMessage(),
                                       data->dispatchAction() == base::DispatchAction::NormalLog);
        if (data->dispatchAction() == base::DispatchAction::NormalLog
                && data->logMessage()->logger()->typedConfigurations()->toStandardOutput(data->logMessage()->level())) {
            // Colouring modifies logLine in place, so a coloured line is also what gets queued below
            if (ELPP->hasFlag(LoggingFlag::ColoredTerminalOutput))
                data->logMessage()->logger()->logBuilder()->convertToColoredOutput(&logLine, data->logMessage()->level());
            ELPP_COUT << ELPP_COUT_LINE(logLine);
        }
        // Save resources and only queue if we want to write to file otherwise just ignore handler
        if
(data->logMessage()->logger()->typedConfigurations()->toFile(data->logMessage()->level())) { ELPP->asyncLogQueue()->push(AsyncLogItem(*(data->logMessage()), *data, logLine)); } } }; class AsyncDispatchWorker : public base::IWorker, public base::threading::ThreadSafe { public: AsyncDispatchWorker() { setContinueRunning(false); } virtual ~AsyncDispatchWorker() { setContinueRunning(false); ELPP_INTERNAL_INFO(6, "Stopping dispatch worker - Cleaning log queue"); clean(); ELPP_INTERNAL_INFO(6, "Log queue cleaned"); } inline bool clean() { std::mutex m; std::unique_lock lk(m); cv.wait(lk, []{ return !ELPP->asyncLogQueue()->empty(); }); emptyQueue(); lk.unlock(); cv.notify_one(); return ELPP->asyncLogQueue()->empty(); } inline void emptyQueue() { while (!ELPP->asyncLogQueue()->empty()) { AsyncLogItem data = ELPP->asyncLogQueue()->next(); handle(&data); base::threading::msleep(100); } } virtual inline void start() { base::threading::msleep(5000); // Wait extra few seconds setContinueRunning(true); std::thread t1(&AsyncDispatchWorker::runner, this); t1.join(); } void handle(AsyncLogItem* logItem) { LogDispatchData* data = logItem->data(); LogMessage* logMessage = logItem->logMessage(); Logger* logger = logMessage->logger(); base::TypedConfigurations* conf = logger->typedConfigurations(); base::type::string_t logLine = logItem->logLine(); if (data->dispatchAction() == base::DispatchAction::NormalLog) { if (conf->toFile(logMessage->level())) { base::type::fstream_t* fs = conf->fileStream(logMessage->level()); if (fs != nullptr) { fs->write(logLine.c_str(), logLine.size()); if (fs->fail()) { ELPP_INTERNAL_ERROR("Unable to write log to file [" << conf->filename(logMessage->level()) << "].\n" << "Few possible reasons (could be something else):\n" << " * Permission denied\n" << " * Disk full\n" << " * Disk is not writable", true); } else { if (ELPP->hasFlag(LoggingFlag::ImmediateFlush) || (logger->isFlushNeeded(logMessage->level()))) { logger->flush(logMessage->level(), fs); } } } 
else { ELPP_INTERNAL_ERROR("Log file for [" << LevelHelper::convertToString(logMessage->level()) << "] " << "has not been configured but [TO_FILE] is configured to TRUE. [Logger ID: " << logger->id() << "]", false); } } } # if defined(ELPP_SYSLOG) else if (data->dispatchAction() == base::DispatchAction::SysLog) { // Determine syslog priority int sysLogPriority = 0; if (logMessage->level() == Level::Fatal) sysLogPriority = LOG_EMERG; else if (logMessage->level() == Level::Error) sysLogPriority = LOG_ERR; else if (logMessage->level() == Level::Warning) sysLogPriority = LOG_WARNING; else if (logMessage->level() == Level::Info) sysLogPriority = LOG_INFO; else if (logMessage->level() == Level::Debug) sysLogPriority = LOG_DEBUG; else sysLogPriority = LOG_NOTICE; # if defined(ELPP_UNICODE) char* line = base::utils::Str::wcharPtrToCharPtr(logLine.c_str()); syslog(sysLogPriority, "%s", line); free(line); # else syslog(sysLogPriority, "%s", logLine.c_str()); # endif } # endif // defined(ELPP_SYSLOG) } void run() { while (continueRunning()) { emptyQueue(); base::threading::msleep(10); // 10ms } } static void* runner(void *context) { static_cast(context)->run(); return NULL; } void setContinueRunning(bool value) { base::threading::ScopedLock scopedLock(m_continueRunningMutex); m_continueRunning = value; } bool continueRunning(void) { return m_continueRunning; } private: std::condition_variable cv; bool m_continueRunning; base::threading::Mutex m_continueRunningMutex; }; #endif // ELPP_ASYNC_LOGGING } // namespace base namespace base { class DefaultLogBuilder : public LogBuilder { public: base::type::string_t build(const LogMessage* logMessage, bool appendNewLine) const { base::TypedConfigurations* tc = logMessage->logger()->typedConfigurations(); const base::LogFormat* logFormat = &tc->logFormat(logMessage->level()); base::type::string_t logLine = logFormat->format(); char buff[base::consts::kSourceFilenameMaxLength + base::consts::kSourceLineMaxLength] = ""; const char* 
bufLim = buff + sizeof(buff);
        // Each section below runs only when the format carries the corresponding flag and hands the value for
        // one format specifier to replaceFirstWithEscape().
        if (logFormat->hasFlag(base::FormatFlags::AppName)) {
            // App name
            base::utils::Str::replaceFirstWithEscape(logLine, base::consts::kAppNameFormatSpecifier,
                    logMessage->logger()->parentApplicationName());
        }
        if (logFormat->hasFlag(base::FormatFlags::ThreadId)) {
            // Thread ID
            base::utils::Str::replaceFirstWithEscape(logLine, base::consts::kThreadIdFormatSpecifier,
                    base::threading::getCurrentThreadId());
        }
        if (logFormat->hasFlag(base::FormatFlags::DateTime)) {
            // DateTime
            base::utils::Str::replaceFirstWithEscape(logLine, base::consts::kDateTimeFormatSpecifier,
                    base::utils::DateTime::getDateTime(logFormat->dateTimeFormat().c_str(),
                            &tc->millisecondsWidth(logMessage->level())));
        }
        if (logFormat->hasFlag(base::FormatFlags::Function)) {
            // Function
            base::utils::Str::replaceFirstWithEscape(logLine, base::consts::kLogFunctionFormatSpecifier, logMessage->func());
        }
        if (logFormat->hasFlag(base::FormatFlags::File)) {
            // File
            // The stripped file name is produced in the shared scratch buffer and copied out as std::string
            char* buf = base::utils::Str::clearBuff(buff, base::consts::kSourceFilenameMaxLength);
            base::utils::File::buildStrippedFilename(logMessage->file().c_str(), buff);
            buf = base::utils::Str::addToBuff(buff, buf, bufLim);
            base::utils::Str::replaceFirstWithEscape(logLine, base::consts::kLogFileFormatSpecifier, std::string(buff));
        }
        if (logFormat->hasFlag(base::FormatFlags::FileBase)) {
            // FileBase
            char* buf = base::utils::Str::clearBuff(buff, base::consts::kSourceFilenameMaxLength);
            base::utils::File::buildBaseFilename(logMessage->file(), buff);
            buf = base::utils::Str::addToBuff(buff, buf, bufLim);
            base::utils::Str::replaceFirstWithEscape(logLine, base::consts::kLogFileBaseFormatSpecifier, std::string(buff));
        }
        if (logFormat->hasFlag(base::FormatFlags::Line)) {
            // Line
            char* buf = base::utils::Str::clearBuff(buff, base::consts::kSourceLineMaxLength);
            buf = base::utils::Str::convertAndAddToBuff(logMessage->line(), base::consts::kSourceLineMaxLength, buf, bufLim, false);
            base::utils::Str::replaceFirstWithEscape(logLine,
base::consts::kLogLineFormatSpecifier, std::string(buff));
        }
        if (logFormat->hasFlag(base::FormatFlags::Location)) {
            // Location
            // Assembled in the scratch buffer as <stripped file name>:<line number>
            char* buf = base::utils::Str::clearBuff(buff,
                                                    base::consts::kSourceFilenameMaxLength + base::consts::kSourceLineMaxLength);
            base::utils::File::buildStrippedFilename(logMessage->file().c_str(), buff);
            buf = base::utils::Str::addToBuff(buff, buf, bufLim);
            buf = base::utils::Str::addToBuff(":", buf, bufLim);
            buf = base::utils::Str::convertAndAddToBuff(logMessage->line(), base::consts::kSourceLineMaxLength, buf, bufLim, false);
            base::utils::Str::replaceFirstWithEscape(logLine, base::consts::kLogLocationFormatSpecifier, std::string(buff));
        }
        if (logMessage->level() == Level::Verbose && logFormat->hasFlag(base::FormatFlags::VerboseLevel)) {
            // Verbose level
            char* buf = base::utils::Str::clearBuff(buff, 1);
            buf = base::utils::Str::convertAndAddToBuff(logMessage->verboseLevel(), 1, buf, bufLim, false);
            base::utils::Str::replaceFirstWithEscape(logLine, base::consts::kVerboseLevelFormatSpecifier, std::string(buff));
        }
        if (logFormat->hasFlag(base::FormatFlags::LogMessage)) {
            // Log message
            base::utils::Str::replaceFirstWithEscape(logLine, base::consts::kMessageFormatSpecifier, logMessage->message());
        }
#if !defined(ELPP_DISABLE_CUSTOM_FORMAT_SPECIFIERS)
        // Custom format specifiers: each one is replaced by the text its resolver returns.
        // NOTE(review): the element type of std::vector is missing in this copy of the text - confirm upstream.
        for (std::vector::const_iterator it = ELPP->customFormatSpecifiers()->begin();
                it != ELPP->customFormatSpecifiers()->end(); ++it) {
            std::string fs(it->formatSpecifier());
            // Converted character by character so the specifier matches the (possibly wide) log line type
            base::type::string_t wcsFormatSpecifier(fs.begin(), fs.end());
            base::utils::Str::replaceFirstWithEscape(logLine, wcsFormatSpecifier, std::string(it->resolver()()));
        }
#endif // !defined(ELPP_DISABLE_CUSTOM_FORMAT_SPECIFIERS)
        if (appendNewLine) logLine += ELPP_LITERAL("\n");
        return logLine;
    }
};
/// @brief Dispatches log messages
class LogDispatcher : base::NoCopy {
public:
    /// @param proceed When false, dispatch() does nothing
    /// @param logMessage Message to dispatch; moved into the dispatcher
    /// @param dispatchAction What the callbacks are asked to do with the message
    LogDispatcher(bool proceed, LogMessage&& logMessage, base::DispatchAction dispatchAction) :
        m_proceed(proceed),
        m_logMessage(std::move(logMessage)),
m_dispatchAction(std::move(dispatchAction)) { } void dispatch(void) { if (m_proceed && m_dispatchAction == base::DispatchAction::None) { m_proceed = false; } if (!m_proceed) { return; } // We minimize the time of ELPP's lock - this lock is released after log is written base::threading::ScopedLock scopedLock(ELPP->lock()); base::TypedConfigurations* tc = m_logMessage.logger()->m_typedConfigurations; if (ELPP->hasFlag(LoggingFlag::StrictLogFileSizeCheck)) { tc->validateFileRolling(m_logMessage.level(), ELPP->preRollOutCallback()); } LogDispatchCallback* callback = nullptr; LogDispatchData data; for (const std::pair& h : ELPP->m_logDispatchCallbacks) { callback = h.second.get(); if (callback != nullptr && callback->enabled()) { data.setLogMessage(&m_logMessage); data.setDispatchAction(m_dispatchAction); callback->acquireLock(); callback->handle(&data); callback->releaseLock(); } } } private: bool m_proceed; LogMessage m_logMessage; base::DispatchAction m_dispatchAction; }; #if defined(ELPP_STL_LOGGING) /// @brief Workarounds to write some STL logs /// /// @detail There is workaround needed to loop through some stl containers. In order to do that, we need iterable containers /// of same type and provide iterator interface and pass it on to writeIterator(). /// Remember, this is passed by value in constructor so that we dont change original containers. 
/// This operation is as expensive as Big-O(std::min(class_.size(), base::consts::kMaxLogPerContainer)) namespace workarounds { /// @brief Abstract IterableContainer template that provides interface for iterable classes of type T template class IterableContainer { public: typedef typename Container::iterator iterator; typedef typename Container::const_iterator const_iterator; IterableContainer(void) {} virtual ~IterableContainer(void) {} iterator begin(void) { return getContainer().begin(); } iterator end(void) { return getContainer().end(); } private: virtual Container& getContainer(void) = 0; }; /// @brief Implements IterableContainer and provides iterable std::priority_queue class template, typename Comparator = std::less> class IterablePriorityQueue : public IterableContainer, public std::priority_queue { public: IterablePriorityQueue(std::priority_queue queue_) { std::size_t count_ = 0; while (++count_ < base::consts::kMaxLogPerContainer && !queue_.empty()) { this->push(queue_.top()); queue_.pop(); } } private: inline Container& getContainer(void) { return this->c; } }; /// @brief Implements IterableContainer and provides iterable std::queue class template> class IterableQueue : public IterableContainer, public std::queue { public: IterableQueue(std::queue queue_) { std::size_t count_ = 0; while (++count_ < base::consts::kMaxLogPerContainer && !queue_.empty()) { this->push(queue_.front()); queue_.pop(); } } private: inline Container& getContainer(void) { return this->c; } }; /// @brief Implements IterableContainer and provides iterable std::stack class template> class IterableStack : public IterableContainer, public std::stack { public: IterableStack(std::stack stack_) { std::size_t count_ = 0; while (++count_ < base::consts::kMaxLogPerContainer && !stack_.empty()) { this->push(stack_.top()); stack_.pop(); } } private: inline Container& getContainer(void) { return this->c; } }; } // namespace workarounds #endif // defined(ELPP_STL_LOGGING) // Log message 
builder class MessageBuilder { public: MessageBuilder(void) : m_logger(nullptr), m_containerLogSeperator(ELPP_LITERAL("")) {} void initialize(Logger* logger) { m_logger = logger; m_containerLogSeperator = ELPP->hasFlag(LoggingFlag::NewLineForContainer) ? ELPP_LITERAL("\n ") : ELPP_LITERAL(", "); } # define ELPP_SIMPLE_LOG(LOG_TYPE)\ inline MessageBuilder& operator<<(LOG_TYPE msg) {\ m_logger->stream() << msg;\ if (ELPP->hasFlag(LoggingFlag::AutoSpacing)) {\ m_logger->stream() << " ";\ }\ return *this;\ } inline MessageBuilder& operator<<(const std::string& msg) { return operator<<(msg.c_str()); } ELPP_SIMPLE_LOG(char) ELPP_SIMPLE_LOG(bool) ELPP_SIMPLE_LOG(signed short) ELPP_SIMPLE_LOG(unsigned short) ELPP_SIMPLE_LOG(signed int) ELPP_SIMPLE_LOG(unsigned int) ELPP_SIMPLE_LOG(signed long) ELPP_SIMPLE_LOG(unsigned long) ELPP_SIMPLE_LOG(float) ELPP_SIMPLE_LOG(double) ELPP_SIMPLE_LOG(char*) ELPP_SIMPLE_LOG(const char*) ELPP_SIMPLE_LOG(const void*) ELPP_SIMPLE_LOG(long double) inline MessageBuilder& operator<<(const std::wstring& msg) { return operator<<(msg.c_str()); } inline MessageBuilder& operator<<(const wchar_t* msg) { if (msg == nullptr) { m_logger->stream() << base::consts::kNullPointer; return *this; } # if defined(ELPP_UNICODE) m_logger->stream() << msg; # else char* buff_ = base::utils::Str::wcharPtrToCharPtr(msg); m_logger->stream() << buff_; free(buff_); # endif if (ELPP->hasFlag(LoggingFlag::AutoSpacing)) { m_logger->stream() << " "; } return *this; } // ostream manipulators inline MessageBuilder& operator<<(std::ostream& (*OStreamMani)(std::ostream&)) { m_logger->stream() << OStreamMani; return *this; } #define ELPP_ITERATOR_CONTAINER_LOG_ONE_ARG(temp) \ template \ inline MessageBuilder& operator<<(const temp& template_inst) { \ return writeIterator(template_inst.begin(), template_inst.end(), template_inst.size()); \ } #define ELPP_ITERATOR_CONTAINER_LOG_TWO_ARG(temp) \ template \ inline MessageBuilder& operator<<(const temp& template_inst) { \ return 
writeIterator(template_inst.begin(), template_inst.end(), template_inst.size()); \ } #define ELPP_ITERATOR_CONTAINER_LOG_THREE_ARG(temp) \ template \ inline MessageBuilder& operator<<(const temp& template_inst) { \ return writeIterator(template_inst.begin(), template_inst.end(), template_inst.size()); \ } #define ELPP_ITERATOR_CONTAINER_LOG_FOUR_ARG(temp) \ template \ inline MessageBuilder& operator<<(const temp& template_inst) { \ return writeIterator(template_inst.begin(), template_inst.end(), template_inst.size()); \ } #define ELPP_ITERATOR_CONTAINER_LOG_FIVE_ARG(temp) \ template \ inline MessageBuilder& operator<<(const temp& template_inst) { \ return writeIterator(template_inst.begin(), template_inst.end(), template_inst.size()); \ } #if defined(ELPP_STL_LOGGING) ELPP_ITERATOR_CONTAINER_LOG_TWO_ARG(std::vector) ELPP_ITERATOR_CONTAINER_LOG_TWO_ARG(std::list) ELPP_ITERATOR_CONTAINER_LOG_TWO_ARG(std::deque) ELPP_ITERATOR_CONTAINER_LOG_THREE_ARG(std::set) ELPP_ITERATOR_CONTAINER_LOG_THREE_ARG(std::multiset) ELPP_ITERATOR_CONTAINER_LOG_FOUR_ARG(std::map) ELPP_ITERATOR_CONTAINER_LOG_FOUR_ARG(std::multimap) template inline MessageBuilder& operator<<(const std::queue& queue_) { base::workarounds::IterableQueue iterableQueue_ = static_cast >(queue_); return writeIterator(iterableQueue_.begin(), iterableQueue_.end(), iterableQueue_.size()); } template inline MessageBuilder& operator<<(const std::stack& stack_) { base::workarounds::IterableStack iterableStack_ = static_cast >(stack_); return writeIterator(iterableStack_.begin(), iterableStack_.end(), iterableStack_.size()); } template inline MessageBuilder& operator<<(const std::priority_queue& priorityQueue_) { base::workarounds::IterablePriorityQueue iterablePriorityQueue_ = static_cast >(priorityQueue_); return writeIterator(iterablePriorityQueue_.begin(), iterablePriorityQueue_.end(), iterablePriorityQueue_.size()); } template inline MessageBuilder& operator<<(const std::pair& pair_) { m_logger->stream() << 
ELPP_LITERAL("("); operator << (static_cast(pair_.first)); m_logger->stream() << ELPP_LITERAL(", "); operator << (static_cast(pair_.second)); m_logger->stream() << ELPP_LITERAL(")"); return *this; } template inline MessageBuilder& operator<<(const std::bitset& bitset_) { m_logger->stream() << ELPP_LITERAL("["); operator << (bitset_.to_string()); m_logger->stream() << ELPP_LITERAL("]"); return *this; } # if defined(ELPP_LOG_STD_ARRAY) template inline MessageBuilder& operator<<(const std::array& array) { return writeIterator(array.begin(), array.end(), array.size()); } # endif // defined(ELPP_LOG_STD_ARRAY) # if defined(ELPP_LOG_UNORDERED_MAP) ELPP_ITERATOR_CONTAINER_LOG_FIVE_ARG(std::unordered_map) ELPP_ITERATOR_CONTAINER_LOG_FIVE_ARG(std::unordered_multimap) # endif // defined(ELPP_LOG_UNORDERED_MAP) # if defined(ELPP_LOG_UNORDERED_SET) ELPP_ITERATOR_CONTAINER_LOG_FOUR_ARG(std::unordered_set) ELPP_ITERATOR_CONTAINER_LOG_FOUR_ARG(std::unordered_multiset) # endif // defined(ELPP_LOG_UNORDERED_SET) #endif // defined(ELPP_STL_LOGGING) #if defined(ELPP_QT_LOGGING) inline MessageBuilder& operator<<(const QString& msg) { # if defined(ELPP_UNICODE) m_logger->stream() << msg.toStdWString(); # else m_logger->stream() << msg.toStdString(); # endif // defined(ELPP_UNICODE) return *this; } inline MessageBuilder& operator<<(const QByteArray& msg) { return operator << (QString(msg)); } inline MessageBuilder& operator<<(const QStringRef& msg) { return operator<<(msg.toString()); } inline MessageBuilder& operator<<(qint64 msg) { # if defined(ELPP_UNICODE) m_logger->stream() << QString::number(msg).toStdWString(); # else m_logger->stream() << QString::number(msg).toStdString(); # endif // defined(ELPP_UNICODE) return *this; } inline MessageBuilder& operator<<(quint64 msg) { # if defined(ELPP_UNICODE) m_logger->stream() << QString::number(msg).toStdWString(); # else m_logger->stream() << QString::number(msg).toStdString(); # endif // defined(ELPP_UNICODE) return *this; } inline 
MessageBuilder& operator<<(QChar msg) { m_logger->stream() << msg.toLatin1(); return *this; } inline MessageBuilder& operator<<(const QLatin1String& msg) { m_logger->stream() << msg.latin1(); return *this; } ELPP_ITERATOR_CONTAINER_LOG_ONE_ARG(QList) ELPP_ITERATOR_CONTAINER_LOG_ONE_ARG(QVector) ELPP_ITERATOR_CONTAINER_LOG_ONE_ARG(QQueue) ELPP_ITERATOR_CONTAINER_LOG_ONE_ARG(QSet) ELPP_ITERATOR_CONTAINER_LOG_ONE_ARG(QLinkedList) ELPP_ITERATOR_CONTAINER_LOG_ONE_ARG(QStack) template inline MessageBuilder& operator<<(const QPair& pair_) { m_logger->stream() << ELPP_LITERAL("("); operator << (static_cast(pair_.first)); m_logger->stream() << ELPP_LITERAL(", "); operator << (static_cast(pair_.second)); m_logger->stream() << ELPP_LITERAL(")"); return *this; } template inline MessageBuilder& operator<<(const QMap& map_) { m_logger->stream() << ELPP_LITERAL("["); QList keys = map_.keys(); typename QList::const_iterator begin = keys.begin(); typename QList::const_iterator end = keys.end(); int max_ = static_cast(base::consts::kMaxLogPerContainer); // to prevent warning for (int index_ = 0; begin != end && index_ < max_; ++index_, ++begin) { m_logger->stream() << ELPP_LITERAL("("); operator << (static_cast(*begin)); m_logger->stream() << ELPP_LITERAL(", "); operator << (static_cast(map_.value(*begin))); m_logger->stream() << ELPP_LITERAL(")"); m_logger->stream() << ((index_ < keys.size() -1) ? 
m_containerLogSeperator : ELPP_LITERAL("")); } if (begin != end) { m_logger->stream() << ELPP_LITERAL("..."); } m_logger->stream() << ELPP_LITERAL("]"); return *this; } template inline MessageBuilder& operator<<(const QMultiMap& map_) { operator << (static_cast>(map_)); return *this; } template inline MessageBuilder& operator<<(const QHash& hash_) { m_logger->stream() << ELPP_LITERAL("["); QList keys = hash_.keys(); typename QList::const_iterator begin = keys.begin(); typename QList::const_iterator end = keys.end(); int max_ = static_cast(base::consts::kMaxLogPerContainer); // prevent type warning for (int index_ = 0; begin != end && index_ < max_; ++index_, ++begin) { m_logger->stream() << ELPP_LITERAL("("); operator << (static_cast(*begin)); m_logger->stream() << ELPP_LITERAL(", "); operator << (static_cast(hash_.value(*begin))); m_logger->stream() << ELPP_LITERAL(")"); m_logger->stream() << ((index_ < keys.size() -1) ? m_containerLogSeperator : ELPP_LITERAL("")); } if (begin != end) { m_logger->stream() << ELPP_LITERAL("..."); } m_logger->stream() << ELPP_LITERAL("]"); return *this; } template inline MessageBuilder& operator<<(const QMultiHash& multiHash_) { operator << (static_cast>(multiHash_)); return *this; } #endif // defined(ELPP_QT_LOGGING) #if defined(ELPP_BOOST_LOGGING) ELPP_ITERATOR_CONTAINER_LOG_TWO_ARG(boost::container::vector) ELPP_ITERATOR_CONTAINER_LOG_TWO_ARG(boost::container::stable_vector) ELPP_ITERATOR_CONTAINER_LOG_TWO_ARG(boost::container::list) ELPP_ITERATOR_CONTAINER_LOG_TWO_ARG(boost::container::deque) ELPP_ITERATOR_CONTAINER_LOG_FOUR_ARG(boost::container::map) ELPP_ITERATOR_CONTAINER_LOG_FOUR_ARG(boost::container::flat_map) ELPP_ITERATOR_CONTAINER_LOG_THREE_ARG(boost::container::set) ELPP_ITERATOR_CONTAINER_LOG_THREE_ARG(boost::container::flat_set) #endif // defined(ELPP_BOOST_LOGGING) /// @brief Macro used internally that can be used externally to make containers easylogging++ friendly /// /// @detail This macro expands to write an 
ostream& operator<< for container. This container is expected to /// have begin() and end() methods that return respective iterators /// @param ContainerType Type of container e.g, MyList from WX_DECLARE_LIST(int, MyList); in wxwidgets /// @param SizeMethod Method used to get size of container. /// @param ElementInstance Instance of element to be fed out. Insance name is "elem". See WXELPP_ENABLED macro /// for an example usage #define MAKE_CONTAINERELPP_FRIENDLY(ContainerType, SizeMethod, ElementInstance) \ el::base::type::ostream_t& operator<<(el::base::type::ostream_t& ss, const ContainerType& container) {\ const el::base::type::char_t* sep = ELPP->hasFlag(el::LoggingFlag::NewLineForContainer) ? \ ELPP_LITERAL("\n ") : ELPP_LITERAL(", ");\ ContainerType::const_iterator elem = container.begin();\ ContainerType::const_iterator endElem = container.end();\ std::size_t size_ = container.SizeMethod; \ ss << ELPP_LITERAL("[");\ for (std::size_t i = 0; elem != endElem && i < el::base::consts::kMaxLogPerContainer; ++i, ++elem) { \ ss << ElementInstance;\ ss << ((i < size_ - 1) ? 
sep : ELPP_LITERAL(""));\ }\ if (elem != endElem) {\ ss << ELPP_LITERAL("...");\ }\ ss << ELPP_LITERAL("]");\ return ss;\ } #if defined(ELPP_WXWIDGETS_LOGGING) ELPP_ITERATOR_CONTAINER_LOG_ONE_ARG(wxVector) # define ELPP_WX_PTR_ENABLED(ContainerType) MAKE_CONTAINERELPP_FRIENDLY(ContainerType, size(), *(*elem)) # define ELPP_WX_ENABLED(ContainerType) MAKE_CONTAINERELPP_FRIENDLY(ContainerType, size(), (*elem)) # define ELPP_WX_HASH_MAP_ENABLED(ContainerType) MAKE_CONTAINERELPP_FRIENDLY(ContainerType, size(), \ ELPP_LITERAL("(") << elem->first << ELPP_LITERAL(", ") << elem->second << ELPP_LITERAL(")") #else # define ELPP_WX_PTR_ENABLED(ContainerType) # define ELPP_WX_ENABLED(ContainerType) # define ELPP_WX_HASH_MAP_ENABLED(ContainerType) #endif // defined(ELPP_WXWIDGETS_LOGGING) // Other classes template ELPP_SIMPLE_LOG(const Class&) #undef ELPP_SIMPLE_LOG #undef ELPP_ITERATOR_CONTAINER_LOG_ONE_ARG #undef ELPP_ITERATOR_CONTAINER_LOG_TWO_ARG #undef ELPP_ITERATOR_CONTAINER_LOG_THREE_ARG #undef ELPP_ITERATOR_CONTAINER_LOG_FOUR_ARG #undef ELPP_ITERATOR_CONTAINER_LOG_FIVE_ARG private: Logger* m_logger; const base::type::char_t* m_containerLogSeperator; template inline MessageBuilder& writeIterator(Iterator begin_, Iterator end_, std::size_t size_) { m_logger->stream() << ELPP_LITERAL("["); for (std::size_t i = 0; begin_ != end_ && i < base::consts::kMaxLogPerContainer; ++i, ++begin_) { operator << (*begin_); m_logger->stream() << ((i < size_ - 1) ? 
m_containerLogSeperator : ELPP_LITERAL("")); } if (begin_ != end_) { m_logger->stream() << ELPP_LITERAL("..."); } m_logger->stream() << ELPP_LITERAL("]"); if (ELPP->hasFlag(LoggingFlag::AutoSpacing)) { m_logger->stream() << " "; } return *this; } }; /// @brief Writes nothing - Used when certain log is disabled class NullWriter : base::NoCopy { public: NullWriter(void) {} // Null manipulator inline NullWriter& operator<<(std::ostream& (*)(std::ostream&)) { return *this; } template inline NullWriter& operator<<(const T&) { return *this; } }; /// @brief Main entry point of each logging class Writer : base::NoCopy { public: Writer(Level level, const char* file, unsigned long int line, const char* func, base::DispatchAction dispatchAction = base::DispatchAction::NormalLog, base::type::VerboseLevel verboseLevel = 0) : m_level(level), m_file(file), m_line(line), m_func(func), m_verboseLevel(verboseLevel), m_proceed(false), m_dispatchAction(dispatchAction) { } virtual ~Writer(void) { processDispatch(); } template inline Writer& operator<<(const T& log) { #if ELPP_LOGGING_ENABLED if (m_proceed) { m_messageBuilder << log; } #endif // ELPP_LOGGING_ENABLED return *this; } inline Writer& operator<<(std::ostream& (*log)(std::ostream&)) { #if ELPP_LOGGING_ENABLED if (m_proceed) { m_messageBuilder << log; } #endif // ELPP_LOGGING_ENABLED return *this; } Writer& construct(Logger* logger, bool needLock = true) { m_logger = logger; initializeLogger(logger->id(), false, needLock); m_messageBuilder.initialize(m_logger); return *this; } Writer& construct(int count, const char* loggerIds, ...) 
{ if (ELPP->hasFlag(LoggingFlag::MultiLoggerSupport)) { va_list loggersList; va_start(loggersList, loggerIds); const char* id = loggerIds; for (int i = 0; i < count; ++i) { m_loggerIds.push_back(std::string(id)); id = va_arg(loggersList, const char*); } va_end(loggersList); initializeLogger(m_loggerIds.at(0)); } else { initializeLogger(std::string(loggerIds)); } m_messageBuilder.initialize(m_logger); return *this; } protected: Level m_level; const char* m_file; const unsigned long int m_line; const char* m_func; base::type::VerboseLevel m_verboseLevel; Logger* m_logger; bool m_proceed; base::MessageBuilder m_messageBuilder; base::DispatchAction m_dispatchAction; std::vector m_loggerIds; friend class el::Helpers; void initializeLogger(const std::string& loggerId, bool lookup = true, bool needLock = true) { if (lookup) { m_logger = ELPP->registeredLoggers()->get(loggerId, ELPP->hasFlag(LoggingFlag::CreateLoggerAutomatically)); } if (m_logger == nullptr) { ELPP->acquireLock(); if (!ELPP->registeredLoggers()->has(std::string(base::consts::kDefaultLoggerId))) { // Somehow default logger has been unregistered. Not good! Register again ELPP->registeredLoggers()->get(std::string(base::consts::kDefaultLoggerId)); } ELPP->releaseLock(); // Need to unlock it for next writer Writer(Level::Debug, m_file, m_line, m_func).construct(1, base::consts::kDefaultLoggerId) << "Logger [" << loggerId << "] is not registered yet!"; m_proceed = false; } else { if (needLock) { m_logger->acquireLock(); // This should not be unlocked by checking m_proceed because // m_proceed can be changed by lines below } if (ELPP->hasFlag(LoggingFlag::HierarchicalLogging)) { m_proceed = m_level == Level::Verbose ? 
m_logger->enabled(m_level) : LevelHelper::castToInt(m_level) >= LevelHelper::castToInt(ELPP->m_loggingLevel); } else { m_proceed = m_logger->enabled(m_level); } } } void processDispatch() { #if ELPP_LOGGING_ENABLED if (ELPP->hasFlag(LoggingFlag::MultiLoggerSupport)) { bool firstDispatched = false; base::type::string_t logMessage; std::size_t i = 0; do { if (m_proceed) { if (firstDispatched) { m_logger->stream() << logMessage; } else { firstDispatched = true; if (m_loggerIds.size() > 1) { logMessage = m_logger->stream().str(); } } triggerDispatch(); } else if (m_logger != nullptr) { m_logger->stream().str(ELPP_LITERAL("")); m_logger->releaseLock(); } if (i + 1 < m_loggerIds.size()) { initializeLogger(m_loggerIds.at(i + 1)); } } while (++i < m_loggerIds.size()); } else { if (m_proceed) { triggerDispatch(); } else if (m_logger != nullptr) { m_logger->stream().str(ELPP_LITERAL("")); m_logger->releaseLock(); } } #else if (m_logger != nullptr) { m_logger->stream().str(ELPP_LITERAL("")); m_logger->releaseLock(); } #endif // ELPP_LOGGING_ENABLED } void triggerDispatch(void) { if (m_proceed) { base::LogDispatcher(m_proceed, LogMessage(m_level, m_file, m_line, m_func, m_verboseLevel, m_logger), m_dispatchAction).dispatch(); } if (m_logger != nullptr) { m_logger->stream().str(ELPP_LITERAL("")); m_logger->releaseLock(); } if (m_proceed && m_level == Level::Fatal && !ELPP->hasFlag(LoggingFlag::DisableApplicationAbortOnFatalLog)) { base::Writer(Level::Warning, m_file, m_line, m_func).construct(1, base::consts::kDefaultLoggerId) << "Aborting application. 
Reason: Fatal log at [" << m_file << ":" << m_line << "]"; std::stringstream reasonStream; reasonStream << "Fatal log at [" << m_file << ":" << m_line << "]" << " If you wish to disable 'abort on fatal log' please use " << "el::Helpers::addFlag(el::LoggingFlag::DisableApplicationAbortOnFatalLog)"; base::utils::abort(1, reasonStream.str()); } m_proceed = false; } }; class PErrorWriter : public base::Writer { public: PErrorWriter(Level level, const char* file, unsigned long int line, const char* func, base::DispatchAction dispatchAction = base::DispatchAction::NormalLog, base::type::VerboseLevel verboseLevel = 0) : base::Writer(level, file, line, func, dispatchAction, verboseLevel) { } virtual ~PErrorWriter(void) { if (m_proceed) { #if ELPP_COMPILER_MSVC char buff[256]; strerror_s(buff, 256, errno); m_logger->stream() << ": " << buff << " [" << errno << "]"; #else m_logger->stream() << ": " << strerror(errno) << " [" << errno << "]"; #endif } } }; } // namespace base // Logging from Logger class. Why this is here? Because we have Storage and Writer class available #if ELPP_VARIADIC_TEMPLATES_SUPPORTED template void Logger::log_(Level level, int vlevel, const char* s, const T& value, const Args&... args) { base::MessageBuilder b; b.initialize(this); while (*s) { if (*s == base::consts::kFormatSpecifierChar) { if (*(s + 1) == base::consts::kFormatSpecifierChar) { ++s; } else { if (*(s + 1) == base::consts::kFormatSpecifierCharValue) { ++s; b << value; log_(level, vlevel, ++s, args...); return; } } } b << *s++; } ELPP_INTERNAL_ERROR("Too many arguments provided. Unable to handle. 
Please provide more format specifiers", false); } template inline void Logger::log_(Level level, int vlevel, const T& log) { if (level == Level::Verbose) { if (ELPP->vRegistry()->allowed(vlevel, __FILE__)) { base::Writer(Level::Verbose, "FILE", 0, "FUNCTION", base::DispatchAction::NormalLog, vlevel).construct(this, false) << log; } else { stream().str(ELPP_LITERAL("")); } } else { base::Writer(level, "FILE", 0, "FUNCTION").construct(this, false) << log; } } template void Logger::log(Level level, const char* s, const T& value, const Args&... args) { base::threading::ScopedLock scopedLock(lock()); log_(level, 0, s, value, args...); } template inline void Logger::log(Level level, const T& log) { base::threading::ScopedLock scopedLock(lock()); log_(level, 0, log); } # if ELPP_VERBOSE_LOG template inline void Logger::verbose(int vlevel, const char* s, const T& value, const Args&... args) { base::threading::ScopedLock scopedLock(lock()); log_(el::Level::Verbose, vlevel, s, value, args...); } template inline void Logger::verbose(int vlevel, const T& log) { base::threading::ScopedLock scopedLock(lock()); log_(el::Level::Verbose, vlevel, log); } # else template inline void Logger::verbose(int, const char*, const T&, const Args&...) { return; } template inline void Logger::verbose(int, const T&) { return; } # endif // ELPP_VERBOSE_LOG # define LOGGER_LEVEL_WRITERS(FUNCTION_NAME, LOG_LEVEL)\ template \ inline void Logger::FUNCTION_NAME(const char* s, const T& value, const Args&... args) {\ log(LOG_LEVEL, s, value, args...);\ }\ template \ inline void Logger::FUNCTION_NAME(const T& value) {\ log(LOG_LEVEL, value);\ } # define LOGGER_LEVEL_WRITERS_DISABLED(FUNCTION_NAME, LOG_LEVEL)\ template \ inline void Logger::FUNCTION_NAME(const char*, const T&, const Args&...) 
{\ return;\ }\ template \ inline void Logger::FUNCTION_NAME(const T&) {\ return;\ } # if ELPP_INFO_LOG LOGGER_LEVEL_WRITERS(info, Level::Info) # else LOGGER_LEVEL_WRITERS_DISABLED(info, Level::Info) # endif // ELPP_INFO_LOG # if ELPP_DEBUG_LOG LOGGER_LEVEL_WRITERS(debug, Level::Debug) # else LOGGER_LEVEL_WRITERS_DISABLED(debug, Level::Debug) # endif // ELPP_DEBUG_LOG # if ELPP_WARNING_LOG LOGGER_LEVEL_WRITERS(warn, Level::Warning) # else LOGGER_LEVEL_WRITERS_DISABLED(warn, Level::Warning) # endif // ELPP_WARNING_LOG # if ELPP_ERROR_LOG LOGGER_LEVEL_WRITERS(error, Level::Error) # else LOGGER_LEVEL_WRITERS_DISABLED(error, Level::Error) # endif // ELPP_ERROR_LOG # if ELPP_FATAL_LOG LOGGER_LEVEL_WRITERS(fatal, Level::Fatal) # else LOGGER_LEVEL_WRITERS_DISABLED(fatal, Level::Fatal) # endif // ELPP_FATAL_LOG # if ELPP_TRACE_LOG LOGGER_LEVEL_WRITERS(trace, Level::Trace) # else LOGGER_LEVEL_WRITERS_DISABLED(trace, Level::Trace) # endif // ELPP_TRACE_LOG # undef LOGGER_LEVEL_WRITERS # undef LOGGER_LEVEL_WRITERS_DISABLED #endif // ELPP_VARIADIC_TEMPLATES_SUPPORTED #if ELPP_COMPILER_MSVC # define ELPP_VARIADIC_FUNC_MSVC(variadicFunction, variadicArgs) variadicFunction variadicArgs # define ELPP_VARIADIC_FUNC_MSVC_RUN(variadicFunction, ...) ELPP_VARIADIC_FUNC_MSVC(variadicFunction, (__VA_ARGS__)) # define el_getVALength(...) ELPP_VARIADIC_FUNC_MSVC_RUN(el_resolveVALength, 0, ## __VA_ARGS__,\ 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) #else # if ELPP_COMPILER_CLANG # define el_getVALength(...) el_resolveVALength(0, __VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) # else # define el_getVALength(...) el_resolveVALength(0, ## __VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) # endif // ELPP_COMPILER_CLANG #endif // ELPP_COMPILER_MSVC #define el_resolveVALength(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, N, ...) N #define ELPP_WRITE_LOG(writer, level, dispatchAction, ...) 
\ writer(level, __FILE__, __LINE__, ELPP_FUNC, dispatchAction).construct(el_getVALength(__VA_ARGS__), __VA_ARGS__) #define ELPP_WRITE_LOG_IF(writer, condition, level, dispatchAction, ...) if (condition) \ writer(level, __FILE__, __LINE__, ELPP_FUNC, dispatchAction).construct(el_getVALength(__VA_ARGS__), __VA_ARGS__) #define ELPP_WRITE_LOG_EVERY_N(writer, occasion, level, dispatchAction, ...) \ if (ELPP->validateEveryNCounter(__FILE__, __LINE__, occasion)) \ writer(level, __FILE__, __LINE__, ELPP_FUNC, dispatchAction).construct(el_getVALength(__VA_ARGS__), __VA_ARGS__) #define ELPP_WRITE_LOG_AFTER_N(writer, n, level, dispatchAction, ...) \ if (ELPP->validateAfterNCounter(__FILE__, __LINE__, n)) \ writer(level, __FILE__, __LINE__, ELPP_FUNC, dispatchAction).construct(el_getVALength(__VA_ARGS__), __VA_ARGS__) #define ELPP_WRITE_LOG_N_TIMES(writer, n, level, dispatchAction, ...) \ if (ELPP->validateNTimesCounter(__FILE__, __LINE__, n)) \ writer(level, __FILE__, __LINE__, ELPP_FUNC, dispatchAction).construct(el_getVALength(__VA_ARGS__), __VA_ARGS__) #undef ELPP_CURR_FILE_PERFORMANCE_LOGGER #if defined(ELPP_PERFORMANCE_LOGGER) # define ELPP_CURR_FILE_PERFORMANCE_LOGGER ELPP_PERFORMANCE_LOGGER #else # define ELPP_CURR_FILE_PERFORMANCE_LOGGER el::base::consts::kPerformanceLoggerId #endif class PerformanceTrackingData { public: enum class DataType : base::type::EnumType { Checkpoint = 1, Complete = 2 }; // Do not use constructor, will run into multiple definition error, use init(PerformanceTracker*) explicit PerformanceTrackingData(DataType dataType) : m_performanceTracker(nullptr), m_dataType(dataType), m_file(""), m_line(0), m_func("") {} inline const std::string* blockName(void) const; inline const struct timeval* startTime(void) const; inline const struct timeval* endTime(void) const; inline const struct timeval* lastCheckpointTime(void) const; inline const base::PerformanceTracker* performanceTracker(void) const { return m_performanceTracker; } inline 
PerformanceTrackingData::DataType dataType(void) const { return m_dataType; } inline bool firstCheckpoint(void) const { return m_firstCheckpoint; } inline std::string checkpointId(void) const { return m_checkpointId; } inline const char* file(void) const { return m_file; } inline unsigned long int line(void) const { return m_line; } inline const char* func(void) const { return m_func; } inline const base::type::string_t* formattedTimeTaken() const { return &m_formattedTimeTaken; } inline const std::string& loggerId(void) const; private: base::PerformanceTracker* m_performanceTracker; base::type::string_t m_formattedTimeTaken; PerformanceTrackingData::DataType m_dataType; bool m_firstCheckpoint; std::string m_checkpointId; const char* m_file; unsigned long int m_line; const char* m_func; inline void init(base::PerformanceTracker* performanceTracker, bool firstCheckpoint = false) { m_performanceTracker = performanceTracker; m_firstCheckpoint = firstCheckpoint; } friend class el::base::PerformanceTracker; }; namespace base { /// @brief Represents performanceTracker block of code that conditionally adds performance status to log /// either when goes outside the scope of when checkpoint() is called class PerformanceTracker : public base::threading::ThreadSafe, public Loggable { public: PerformanceTracker(const std::string& blockName, base::TimestampUnit timestampUnit = base::TimestampUnit::Millisecond, const std::string& loggerId = std::string(ELPP_CURR_FILE_PERFORMANCE_LOGGER), bool scopedLog = true, Level level = base::consts::kPerformanceTrackerDefaultLevel) : m_blockName(blockName), m_timestampUnit(timestampUnit), m_loggerId(loggerId), m_scopedLog(scopedLog), m_level(level), m_hasChecked(false), m_lastCheckpointId(std::string()), m_enabled(false) { #if !defined(ELPP_DISABLE_PERFORMANCE_TRACKING) && ELPP_LOGGING_ENABLED // We store it locally so that if user happen to change configuration by the end of scope // or before calling checkpoint, we still depend on state 
of configuraton at time of construction el::Logger* loggerPtr = ELPP->registeredLoggers()->get(loggerId, false); m_enabled = loggerPtr != nullptr && loggerPtr->m_typedConfigurations->performanceTracking(m_level); if (m_enabled) { base::utils::DateTime::gettimeofday(&m_startTime); } #endif // !defined(ELPP_DISABLE_PERFORMANCE_TRACKING) && ELPP_LOGGING_ENABLED } /// @brief Copy constructor PerformanceTracker(const PerformanceTracker& t) : m_blockName(t.m_blockName), m_timestampUnit(t.m_timestampUnit), m_loggerId(t.m_loggerId), m_scopedLog(t.m_scopedLog), m_level(t.m_level), m_hasChecked(t.m_hasChecked), m_lastCheckpointId(t.m_lastCheckpointId), m_enabled(t.m_enabled), m_startTime(t.m_startTime), m_endTime(t.m_endTime), m_lastCheckpointTime(t.m_lastCheckpointTime) { } virtual ~PerformanceTracker(void) { #if !defined(ELPP_DISABLE_PERFORMANCE_TRACKING) && ELPP_LOGGING_ENABLED if (m_enabled) { base::threading::ScopedLock scopedLock(lock()); if (m_scopedLog) { base::utils::DateTime::gettimeofday(&m_endTime); base::type::string_t formattedTime = getFormattedTimeTaken(); PerformanceTrackingData data(PerformanceTrackingData::DataType::Complete); data.init(this); data.m_formattedTimeTaken = formattedTime; PerformanceTrackingCallback* callback = nullptr; for (const std::pair& h : ELPP->m_performanceTrackingCallbacks) { callback = h.second.get(); if (callback != nullptr && callback->enabled()) { callback->acquireLock(); callback->handle(&data); callback->releaseLock(); } } } } #endif // !defined(ELPP_DISABLE_PERFORMANCE_TRACKING) } /// @brief A checkpoint for current performanceTracker block. void checkpoint(const std::string& id = std::string(), const char* file = __FILE__, unsigned long int line = __LINE__, const char* func = "") { #if !defined(ELPP_DISABLE_PERFORMANCE_TRACKING) && ELPP_LOGGING_ENABLED if (m_enabled) { base::threading::ScopedLock scopedLock(lock()); base::utils::DateTime::gettimeofday(&m_endTime); base::type::string_t formattedTime = m_hasChecked ? 
getFormattedTimeTaken(m_lastCheckpointTime) : ELPP_LITERAL(""); PerformanceTrackingData data(PerformanceTrackingData::DataType::Checkpoint); data.init(this); data.m_checkpointId = id; data.m_file = file; data.m_line = line; data.m_func = func; data.m_formattedTimeTaken = formattedTime; PerformanceTrackingCallback* callback = nullptr; for (const std::pair& h : ELPP->m_performanceTrackingCallbacks) { callback = h.second.get(); if (callback != nullptr && callback->enabled()) { callback->acquireLock(); callback->handle(&data); callback->releaseLock(); } } base::utils::DateTime::gettimeofday(&m_lastCheckpointTime); m_hasChecked = true; m_lastCheckpointId = id; } #endif // !defined(ELPP_DISABLE_PERFORMANCE_TRACKING) && ELPP_LOGGING_ENABLED ELPP_UNUSED(id); ELPP_UNUSED(file); ELPP_UNUSED(line); ELPP_UNUSED(func); } inline Level level(void) const { return m_level; } private: std::string m_blockName; base::TimestampUnit m_timestampUnit; std::string m_loggerId; bool m_scopedLog; Level m_level; bool m_hasChecked; std::string m_lastCheckpointId; bool m_enabled; struct timeval m_startTime, m_endTime, m_lastCheckpointTime; PerformanceTracker(void); friend class el::PerformanceTrackingData; friend class base::DefaultPerformanceTrackingCallback; const inline base::type::string_t getFormattedTimeTaken() const { return getFormattedTimeTaken(m_startTime); } const base::type::string_t getFormattedTimeTaken(struct timeval startTime) const { if (ELPP->hasFlag(LoggingFlag::FixedTimeFormat)) { base::type::stringstream_t ss; ss << base::utils::DateTime::getTimeDifference(m_endTime, startTime, m_timestampUnit) << " " << base::consts::kTimeFormats[static_cast(m_timestampUnit)].unit; return ss.str(); } return base::utils::DateTime::formatTime(base::utils::DateTime::getTimeDifference(m_endTime, startTime, m_timestampUnit), m_timestampUnit); } virtual inline void log(el::base::type::ostream_t& os) const { os << getFormattedTimeTaken(); } }; class DefaultPerformanceTrackingCallback : public 
PerformanceTrackingCallback { protected: void handle(const PerformanceTrackingData* data) { m_data = data; base::type::stringstream_t ss; if (m_data->dataType() == PerformanceTrackingData::DataType::Complete) { ss << ELPP_LITERAL("Executed [") << m_data->blockName()->c_str() << ELPP_LITERAL("] in [") << *m_data->formattedTimeTaken() << ELPP_LITERAL("]"); } else { ss << ELPP_LITERAL("Performance checkpoint"); if (!m_data->checkpointId().empty()) { ss << ELPP_LITERAL(" [") << m_data->checkpointId().c_str() << ELPP_LITERAL("]"); } ss << ELPP_LITERAL(" for block [") << m_data->blockName()->c_str() << ELPP_LITERAL("] : [") << *m_data->performanceTracker(); if (!ELPP->hasFlag(LoggingFlag::DisablePerformanceTrackingCheckpointComparison) && m_data->performanceTracker()->m_hasChecked) { ss << ELPP_LITERAL(" ([") << *m_data->formattedTimeTaken() << ELPP_LITERAL("] from "); if (m_data->performanceTracker()->m_lastCheckpointId.empty()) { ss << ELPP_LITERAL("last checkpoint"); } else { ss << ELPP_LITERAL("checkpoint '") << m_data->performanceTracker()->m_lastCheckpointId.c_str() << ELPP_LITERAL("'"); } ss << ELPP_LITERAL(")]"); } else { ss << ELPP_LITERAL("]"); } } el::base::Writer(m_data->performanceTracker()->level(), m_data->file(), m_data->line(), m_data->func()).construct(1, m_data->loggerId().c_str()) << ss.str(); } private: const PerformanceTrackingData* m_data; }; } // namespace base inline const std::string* PerformanceTrackingData::blockName() const { return const_cast(&m_performanceTracker->m_blockName); } inline const struct timeval* PerformanceTrackingData::startTime() const { return const_cast(&m_performanceTracker->m_startTime); } inline const struct timeval* PerformanceTrackingData::endTime() const { return const_cast(&m_performanceTracker->m_endTime); } inline const struct timeval* PerformanceTrackingData::lastCheckpointTime() const { return const_cast(&m_performanceTracker->m_lastCheckpointTime); } inline const std::string& 
PerformanceTrackingData::loggerId(void) const { return m_performanceTracker->m_loggerId; } namespace base { /// @brief Contains some internal debugging tools like crash handler and stack tracer namespace debug { class StackTrace : base::NoCopy { public: static const std::size_t kMaxStack = 64; static const std::size_t kStackStart = 2; // We want to skip c'tor and StackTrace::generateNew() class StackTraceEntry { public: StackTraceEntry(std::size_t index, const char* loc, const char* demang, const char* hex, const char* addr) { m_index = index; m_location = std::string(loc); m_demangled = std::string(demang); m_hex = std::string(hex); m_addr = std::string(addr); } StackTraceEntry(std::size_t index, char* loc) { m_index = index; m_location = std::string(loc); } std::size_t m_index; std::string m_location; std::string m_demangled; std::string m_hex; std::string m_addr; friend std::ostream& operator<<(std::ostream& ss, const StackTraceEntry& si) { ss << "[" << si.m_index << "] " << si.m_location << (si.m_demangled.empty() ? "" : ":") << si.m_demangled << (si.m_hex.empty() ? 
"" : "+") << si.m_hex << si.m_addr; return ss; } private: StackTraceEntry(void); }; StackTrace(void) { generateNew(); } virtual ~StackTrace(void) { } inline std::vector& getLatestStack(void) { return m_stack; } friend inline std::ostream& operator<<(std::ostream& os, const StackTrace& st) { std::vector::const_iterator it = st.m_stack.begin(); while (it != st.m_stack.end()) { os << " " << *it++ << "\n"; } return os; } private: std::vector m_stack; void generateNew(void) { #if ELPP_STACKTRACE m_stack.clear(); void* stack[kMaxStack]; std::size_t size = backtrace(stack, kMaxStack); char** strings = backtrace_symbols(stack, size); if (size > kStackStart) { // Skip StackTrace c'tor and generateNew for (std::size_t i = kStackStart; i < size; ++i) { char* mangName = nullptr; char* hex = nullptr; char* addr = nullptr; for (char* c = strings[i]; *c; ++c) { switch (*c) { case '(': mangName = c; break; case '+': hex = c; break; case ')': addr = c; break; } } // Perform demangling if parsed properly if (mangName != nullptr && hex != nullptr && addr != nullptr && mangName < hex) { *mangName++ = '\0'; *hex++ = '\0'; *addr++ = '\0'; int status = 0; char* demangName = abi::__cxa_demangle(mangName, 0, 0, &status); // if demangling is successful, output the demangled function name if (status == 0) { // Success (see http://gcc.gnu.org/onlinedocs/libstdc++/libstdc++-html-USERS-4.3/a01696.html) StackTraceEntry entry(i - 1, strings[i], demangName, hex, addr); m_stack.push_back(entry); } else { // Not successful - we will use mangled name StackTraceEntry entry(i - 1, strings[i], mangName, hex, addr); m_stack.push_back(entry); } free(demangName); } else { StackTraceEntry entry(i - 1, strings[i]); m_stack.push_back(entry); } } } free(strings); #else ELPP_INTERNAL_INFO(1, "Stacktrace generation not supported for selected compiler"); #endif // ELPP_STACKTRACE } }; static std::string crashReason(int sig) { std::stringstream ss; bool foundReason = false; for (int i = 0; i < 
base::consts::kCrashSignalsCount; ++i) { if (base::consts::kCrashSignals[i].numb == sig) { ss << "Application has crashed due to [" << base::consts::kCrashSignals[i].name << "] signal"; if (ELPP->hasFlag(el::LoggingFlag::LogDetailedCrashReason)) { ss << std::endl << " " << base::consts::kCrashSignals[i].brief << std::endl << " " << base::consts::kCrashSignals[i].detail; } foundReason = true; } } if (!foundReason) { ss << "Application has crashed due to unknown signal [" << sig << "]"; } return ss.str(); } /// @brief Logs reason of crash from sig static void logCrashReason(int sig, bool stackTraceIfAvailable, Level level, const char* logger) { std::stringstream ss; ss << "CRASH HANDLED; "; ss << crashReason(sig); #if ELPP_STACKTRACE if (stackTraceIfAvailable) { ss << std::endl << " ======= Backtrace: =========" << std::endl << base::debug::StackTrace(); } #else ELPP_UNUSED(stackTraceIfAvailable); #endif // ELPP_STACKTRACE ELPP_WRITE_LOG(el::base::Writer, level, base::DispatchAction::NormalLog, logger) << ss.str(); } static inline void crashAbort(int sig) { base::utils::abort(sig); } /// @brief Default application crash handler /// /// @detail This function writes log using 'default' logger, prints stack trace for GCC based compilers and aborts program. 
static inline void defaultCrashHandler(int sig) { base::debug::logCrashReason(sig, true, Level::Fatal, base::consts::kDefaultLoggerId); base::debug::crashAbort(sig); } /// @brief Handles unexpected crashes class CrashHandler : base::NoCopy { public: typedef void (*Handler)(int); explicit CrashHandler(bool useDefault) { if (useDefault) { setHandler(defaultCrashHandler); } } explicit CrashHandler(const Handler& cHandler) { setHandler(cHandler); } void setHandler(const Handler& cHandler) { m_handler = cHandler; #if defined(ELPP_HANDLE_SIGABRT) int i = 0; // SIGABRT is at base::consts::kCrashSignals[0] #else int i = 1; #endif // defined(ELPP_HANDLE_SIGABRT) for (; i < base::consts::kCrashSignalsCount; ++i) { m_handler = signal(base::consts::kCrashSignals[i].numb, cHandler); } } private: Handler m_handler; }; } // namespace debug } // namespace base extern base::debug::CrashHandler elCrashHandler; #define MAKE_LOGGABLE(ClassType, ClassInstance, OutputStreamInstance) \ el::base::type::ostream_t& operator<<(el::base::type::ostream_t& OutputStreamInstance, const ClassType& ClassInstance) /// @brief Initializes syslog with process ID, options and facility. 
calls closelog() on d'tor class SysLogInitializer { public: SysLogInitializer(const char* processIdent, int options = 0, int facility = 0) { #if defined(ELPP_SYSLOG) openlog(processIdent, options, facility); #else ELPP_UNUSED(processIdent); ELPP_UNUSED(options); ELPP_UNUSED(facility); #endif // defined(ELPP_SYSLOG) } virtual ~SysLogInitializer(void) { #if defined(ELPP_SYSLOG) closelog(); #endif // defined(ELPP_SYSLOG) } }; #define ELPP_INITIALIZE_SYSLOG(id, opt, fac) el::SysLogInitializer elSyslogInit(id, opt, fac) /// @brief Static helpers for developers class Helpers : base::StaticClass { public: /// @brief Shares logging repository (base::Storage) static inline void setStorage(base::type::StoragePointer storage) { ELPP = storage; } /// @return Main storage repository static inline base::type::StoragePointer storage() { return ELPP; } /// @brief Sets application arguments and figures out whats active for logging and whats not. static inline void setArgs(int argc, char** argv) { ELPP->setApplicationArguments(argc, argv); } /// @copydoc setArgs(int argc, char** argv) static inline void setArgs(int argc, const char** argv) { ELPP->setApplicationArguments(argc, const_cast(argv)); } /// @brief Overrides default crash handler and installs custom handler. /// @param crashHandler A functor with no return type that takes single int argument. 
/// Handler is a typedef with specification: void (*Handler)(int) static inline void setCrashHandler(const el::base::debug::CrashHandler::Handler& crashHandler) { el::elCrashHandler.setHandler(crashHandler); } /// @brief Abort due to crash with signal in parameter /// @param sig Crash signal static inline void crashAbort(int sig, const char* sourceFile = "", unsigned int long line = 0) { std::stringstream ss; ss << base::debug::crashReason(sig).c_str(); ss << " - [Called el::Helpers::crashAbort(" << sig << ")]"; if (sourceFile != nullptr && strlen(sourceFile) > 0) { ss << " - Source: " << sourceFile; if (line > 0) ss << ":" << line; else ss << " (line number not specified)"; } base::utils::abort(sig, ss.str()); } /// @brief Logs reason of crash as per sig /// @param sig Crash signal /// @param stackTraceIfAvailable Includes stack trace if available /// @param level Logging level /// @param logger Logger to use for logging static inline void logCrashReason(int sig, bool stackTraceIfAvailable = false, Level level = Level::Fatal, const char* logger = base::consts::kDefaultLoggerId) { el::base::debug::logCrashReason(sig, stackTraceIfAvailable, level, logger); } /// @brief Installs pre rollout callback, this callback is triggered when log file is about to be rolled out /// (can be useful for backing up) static inline void installPreRollOutCallback(const PreRollOutCallback& callback) { ELPP->setPreRollOutCallback(callback); } /// @brief Uninstalls pre rollout callback static inline void uninstallPreRollOutCallback(void) { ELPP->unsetPreRollOutCallback(); } /// @brief Installs post log dispatch callback, this callback is triggered when log is dispatched template static inline bool installLogDispatchCallback(const std::string& id) { return ELPP->installLogDispatchCallback(id); } /// @brief Uninstalls log dispatch callback template static inline void uninstallLogDispatchCallback(const std::string& id) { ELPP->uninstallLogDispatchCallback(id); } template static inline T* 
logDispatchCallback(const std::string& id) { return ELPP->logDispatchCallback(id); } /// @brief Installs post performance tracking callback, this callback is triggered when performance tracking is finished template static inline bool installPerformanceTrackingCallback(const std::string& id) { return ELPP->installPerformanceTrackingCallback(id); } /// @brief Uninstalls post performance tracking handler template static inline void uninstallPerformanceTrackingCallback(const std::string& id) { ELPP->uninstallPerformanceTrackingCallback(id); } template static inline T* performanceTrackingCallback(const std::string& id) { return ELPP->performanceTrackingCallback(id); } /// @brief Converts template to std::string - useful for loggable classes to log containers within log(std::ostream&) const template static std::string convertTemplateToStdString(const T& templ) { el::Logger* logger = ELPP->registeredLoggers()->get(el::base::consts::kDefaultLoggerId); if (logger == nullptr) { return std::string(); } base::MessageBuilder b; b.initialize(logger); logger->acquireLock(); b << templ; #if defined(ELPP_UNICODE) std::string s = std::string(logger->stream().str().begin(), logger->stream().str().end()); #else std::string s = logger->stream().str(); #endif // defined(ELPP_UNICODE) logger->stream().str(ELPP_LITERAL("")); logger->releaseLock(); return s; } /// @brief Returns command line arguments (pointer) provided to easylogging++ static inline const el::base::utils::CommandLineArgs* commandLineArgs(void) { return ELPP->commandLineArgs(); } /// @brief Installs user defined format specifier and handler static inline void installCustomFormatSpecifier(const CustomFormatSpecifier& customFormatSpecifier) { ELPP->installCustomFormatSpecifier(customFormatSpecifier); } /// @brief Uninstalls user defined format specifier and handler static inline bool uninstallCustomFormatSpecifier(const char* formatSpecifier) { return ELPP->uninstallCustomFormatSpecifier(formatSpecifier); } /// @brief 
Returns true if custom format specifier is installed static inline bool hasCustomFormatSpecifier(const char* formatSpecifier) { return ELPP->hasCustomFormatSpecifier(formatSpecifier); } static inline void validateFileRolling(Logger* logger, Level level) { if (logger == nullptr) return; logger->m_typedConfigurations->validateFileRolling(level, ELPP->preRollOutCallback()); } }; /// @brief Static helpers to deal with loggers and their configurations class Loggers : base::StaticClass { public: /// @brief Gets existing or registers new logger static inline Logger* getLogger(const std::string& identity, bool registerIfNotAvailable = true) { base::threading::ScopedLock scopedLock(ELPP->lock()); return ELPP->registeredLoggers()->get(identity, registerIfNotAvailable); } /// @brief Unregisters logger - use it only when you know what you are doing, you may unregister /// loggers initialized / used by third-party libs. static inline bool unregisterLogger(const std::string& identity) { base::threading::ScopedLock scopedLock(ELPP->lock()); return ELPP->registeredLoggers()->remove(identity); } /// @brief Whether or not logger with id is registered static inline bool hasLogger(const std::string& identity) { base::threading::ScopedLock scopedLock(ELPP->lock()); return ELPP->registeredLoggers()->has(identity); } /// @brief Reconfigures specified logger with new configurations static inline Logger* reconfigureLogger(Logger* logger, const Configurations& configurations) { if (!logger) return nullptr; logger->configure(configurations); return logger; } /// @brief Reconfigures logger with new configurations after looking it up using identity static inline Logger* reconfigureLogger(const std::string& identity, const Configurations& configurations) { return Loggers::reconfigureLogger(Loggers::getLogger(identity), configurations); } /// @brief Reconfigures logger's single configuration static inline Logger* reconfigureLogger(const std::string& identity, ConfigurationType configurationType, 
const std::string& value) { Logger* logger = Loggers::getLogger(identity); if (logger == nullptr) { return nullptr; } logger->configurations()->set(Level::Global, configurationType, value); logger->reconfigure(); return logger; } /// @brief Reconfigures all the existing loggers with new configurations static inline void reconfigureAllLoggers(const Configurations& configurations) { for (base::RegisteredLoggers::iterator it = ELPP->registeredLoggers()->begin(); it != ELPP->registeredLoggers()->end(); ++it) { Loggers::reconfigureLogger(it->second, configurations); } } /// @brief Reconfigures single configuration for all the loggers static inline void reconfigureAllLoggers(ConfigurationType configurationType, const std::string& value) { reconfigureAllLoggers(Level::Global, configurationType, value); } /// @brief Reconfigures single configuration for all the loggers for specified level static inline void reconfigureAllLoggers(Level level, ConfigurationType configurationType, const std::string& value) { for (base::RegisteredLoggers::iterator it = ELPP->registeredLoggers()->begin(); it != ELPP->registeredLoggers()->end(); ++it) { Logger* logger = it->second; logger->configurations()->set(level, configurationType, value); logger->reconfigure(); } } /// @brief Sets default configurations. 
This configuration is used for future (and conditionally for existing) loggers static inline void setDefaultConfigurations(const Configurations& configurations, bool reconfigureExistingLoggers = false) { ELPP->registeredLoggers()->setDefaultConfigurations(configurations); if (reconfigureExistingLoggers) { Loggers::reconfigureAllLoggers(configurations); } } /// @brief Returns current default static inline const Configurations* defaultConfigurations(void) { return ELPP->registeredLoggers()->defaultConfigurations(); } /// @brief Returns log stream reference pointer if needed by user static inline const base::LogStreamsReferenceMap* logStreamsReference(void) { return ELPP->registeredLoggers()->logStreamsReference(); } /// @brief Default typed configuration based on existing defaultConf static base::TypedConfigurations defaultTypedConfigurations(void) { return base::TypedConfigurations( ELPP->registeredLoggers()->defaultConfigurations(), ELPP->registeredLoggers()->logStreamsReference()); } /// @brief Populates all logger IDs in current repository. /// @param [out] targetList List of fill up. static inline std::vector* populateAllLoggerIds(std::vector* targetList) { targetList->clear(); for (base::RegisteredLoggers::iterator it = ELPP->registeredLoggers()->list().begin(); it != ELPP->registeredLoggers()->list().end(); ++it) { targetList->push_back(it->first); } return targetList; } /// @brief Sets configurations from global configuration file. 
static void configureFromGlobal(const char* globalConfigurationFilePath) { std::ifstream gcfStream(globalConfigurationFilePath, std::ifstream::in); ELPP_ASSERT(gcfStream.is_open(), "Unable to open global configuration file [" << globalConfigurationFilePath << "] for parsing."); std::string line = std::string(); std::stringstream ss; Logger* logger = nullptr; auto configure = [&](void) { ELPP_INTERNAL_INFO(8, "Configuring logger: '" << logger->id() << "' with configurations \n" << ss.str() << "\n--------------"); Configurations c; c.parseFromText(ss.str()); logger->configure(c); }; while (gcfStream.good()) { std::getline(gcfStream, line); ELPP_INTERNAL_INFO(1, "Parsing line: " << line); base::utils::Str::trim(line); if (Configurations::Parser::isComment(line)) continue; Configurations::Parser::ignoreComments(&line); base::utils::Str::trim(line); if (line.size() > 2 && base::utils::Str::startsWith(line, std::string(base::consts::kConfigurationLoggerId))) { if (!ss.str().empty() && logger != nullptr) { configure(); } ss.str(std::string("")); line = line.substr(2); base::utils::Str::trim(line); if (line.size() > 1) { ELPP_INTERNAL_INFO(1, "Getting logger: '" << line << "'"); logger = getLogger(line); } } else { ss << line << "\n"; } } if (!ss.str().empty() && logger != nullptr) { configure(); } } /// @brief Configures loggers using command line arg. Ensure you have already set command line args, /// @return False if invalid argument or argument with no value provided, true if attempted to configure logger. 
/// If true is returned that does not mean it has been configured successfully, it only means that it /// has attempeted to configure logger using configuration file provided in argument static inline bool configureFromArg(const char* argKey) { #if defined(ELPP_DISABLE_CONFIGURATION_FROM_PROGRAM_ARGS) ELPP_UNUSED(argKey); #else if (!Helpers::commandLineArgs()->hasParamWithValue(argKey)) { return false; } configureFromGlobal(Helpers::commandLineArgs()->getParamValue(argKey)); #endif // defined(ELPP_DISABLE_CONFIGURATION_FROM_PROGRAM_ARGS) return true; } /// @brief Flushes all loggers for all levels - Be careful if you dont know how many loggers are registered static inline void flushAll(void) { ELPP->registeredLoggers()->flushAll(); } /// @brief Adds logging flag used internally. static inline void addFlag(LoggingFlag flag) { ELPP->addFlag(flag); } /// @brief Removes logging flag used internally. static inline void removeFlag(LoggingFlag flag) { ELPP->removeFlag(flag); } /// @brief Determines whether or not certain flag is active static inline bool hasFlag(LoggingFlag flag) { return ELPP->hasFlag(flag); } /// @brief Adds flag and removes it when scope goes out class ScopedAddFlag { public: ScopedAddFlag(LoggingFlag flag) : m_flag(flag) { Loggers::addFlag(m_flag); } ~ScopedAddFlag(void) { Loggers::removeFlag(m_flag); } private: LoggingFlag m_flag; }; /// @brief Removes flag and add it when scope goes out class ScopedRemoveFlag { public: ScopedRemoveFlag(LoggingFlag flag) : m_flag(flag) { Loggers::removeFlag(m_flag); } ~ScopedRemoveFlag(void) { Loggers::addFlag(m_flag); } private: LoggingFlag m_flag; }; /// @brief Sets hierarchy for logging. 
Needs to enable logging flag (HierarchicalLogging) static inline void setLoggingLevel(Level level) { ELPP->setLoggingLevel(level); } /// @brief Sets verbose level on the fly static inline void setVerboseLevel(base::type::VerboseLevel level) { ELPP->vRegistry()->setLevel(level); } /// @brief Gets current verbose level static inline base::type::VerboseLevel verboseLevel(void) { return ELPP->vRegistry()->level(); } /// @brief Sets vmodules as specified (on the fly) static inline void setVModules(const char* modules) { if (ELPP->vRegistry()->vModulesEnabled()) { ELPP->vRegistry()->setModules(modules); } } /// @brief Clears vmodules static inline void clearVModules(void) { ELPP->vRegistry()->clearModules(); } }; class VersionInfo : base::StaticClass { public: /// @brief Current version number static inline const std::string version(void) { return std::string("9.80"); } /// @brief Release date of current version static inline const std::string releaseDate(void) { return std::string("08-01-2015 0850hrs"); } }; } // namespace el #undef VLOG_IS_ON /// @brief Determines whether verbose logging is on for specified level current file. #define VLOG_IS_ON(verboseLevel) (ELPP->vRegistry()->allowed(verboseLevel, __FILE__)) #undef TIMED_BLOCK #undef TIMED_SCOPE #undef TIMED_FUNC #undef ELPP_MIN_UNIT #if defined(ELPP_PERFORMANCE_MICROSECONDS) # define ELPP_MIN_UNIT el::base::TimestampUnit::Microsecond #else # define ELPP_MIN_UNIT el::base::TimestampUnit::Millisecond #endif // (defined(ELPP_PERFORMANCE_MICROSECONDS)) /// @brief Performance tracked scope. Performance gets written when goes out of scope using /// 'performance' logger. 
/// /// @detail Please note in order to check the performance at a certain time you can use obj.checkpoint(); /// @see el::base::PerformanceTracker /// @see el::base::PerformanceTracker::checkpoint // Note: Do not surround this definition with null macro because of obj instance #define TIMED_SCOPE(obj, blockname) el::base::PerformanceTracker obj(blockname, ELPP_MIN_UNIT) #define TIMED_BLOCK(obj, blockName) for (struct { int i; el::base::PerformanceTracker timer; } obj = { 0, \ el::base::PerformanceTracker(blockName, ELPP_MIN_UNIT) }; obj.i < 1; ++obj.i) /// @brief Performance tracked function. Performance gets written when goes out of scope using /// 'performance' logger. /// /// @detail Please note in order to check the performance at a certain time you can use obj.checkpoint(); /// @see el::base::PerformanceTracker /// @see el::base::PerformanceTracker::checkpoint #define TIMED_FUNC(obj) TIMED_SCOPE(obj, ELPP_FUNC) #undef PERFORMANCE_CHECKPOINT #undef PERFORMANCE_CHECKPOINT_WITH_ID #define PERFORMANCE_CHECKPOINT(obj) obj.checkpoint(std::string(), __FILE__, __LINE__, ELPP_FUNC) #define PERFORMANCE_CHECKPOINT_WITH_ID(obj, id) obj.checkpoint(id, __FILE__, __LINE__, ELPP_FUNC) #undef ELPP_COUNTER #undef ELPP_COUNTER_POS /// @brief Gets hit counter for file/line #define ELPP_COUNTER (ELPP->hitCounters()->getCounter(__FILE__, __LINE__)) /// @brief Gets hit counter position for file/line, -1 if not registered yet #define ELPP_COUNTER_POS (ELPP_COUNTER == nullptr ? 
-1 : ELPP_COUNTER->hitCounts()) // Undef levels to support LOG(LEVEL) #undef INFO #undef WARNING #undef DEBUG #undef ERROR #undef FATAL #undef TRACE #undef VERBOSE // Undef existing #undef CINFO #undef CWARNING #undef CDEBUG #undef CFATAL #undef CERROR #undef CTRACE #undef CVERBOSE #undef CINFO_IF #undef CWARNING_IF #undef CDEBUG_IF #undef CERROR_IF #undef CFATAL_IF #undef CTRACE_IF #undef CVERBOSE_IF #undef CINFO_EVERY_N #undef CWARNING_EVERY_N #undef CDEBUG_EVERY_N #undef CERROR_EVERY_N #undef CFATAL_EVERY_N #undef CTRACE_EVERY_N #undef CVERBOSE_EVERY_N #undef CINFO_AFTER_N #undef CWARNING_AFTER_N #undef CDEBUG_AFTER_N #undef CERROR_AFTER_N #undef CFATAL_AFTER_N #undef CTRACE_AFTER_N #undef CVERBOSE_AFTER_N #undef CINFO_N_TIMES #undef CWARNING_N_TIMES #undef CDEBUG_N_TIMES #undef CERROR_N_TIMES #undef CFATAL_N_TIMES #undef CTRACE_N_TIMES #undef CVERBOSE_N_TIMES // Normal logs #if ELPP_INFO_LOG # define CINFO(writer, dispatchAction, ...) ELPP_WRITE_LOG(writer, el::Level::Info, dispatchAction, __VA_ARGS__) #else # define CINFO(writer, dispatchAction, ...) el::base::NullWriter() #endif // ELPP_INFO_LOG #if ELPP_WARNING_LOG # define CWARNING(writer, dispatchAction, ...) ELPP_WRITE_LOG(writer, el::Level::Warning, dispatchAction, __VA_ARGS__) #else # define CWARNING(writer, dispatchAction, ...) el::base::NullWriter() #endif // ELPP_WARNING_LOG #if ELPP_DEBUG_LOG # define CDEBUG(writer, dispatchAction, ...) ELPP_WRITE_LOG(writer, el::Level::Debug, dispatchAction, __VA_ARGS__) #else # define CDEBUG(writer, dispatchAction, ...) el::base::NullWriter() #endif // ELPP_DEBUG_LOG #if ELPP_ERROR_LOG # define CERROR(writer, dispatchAction, ...) ELPP_WRITE_LOG(writer, el::Level::Error, dispatchAction, __VA_ARGS__) #else # define CERROR(writer, dispatchAction, ...) el::base::NullWriter() #endif // ELPP_ERROR_LOG #if ELPP_FATAL_LOG # define CFATAL(writer, dispatchAction, ...) 
ELPP_WRITE_LOG(writer, el::Level::Fatal, dispatchAction, __VA_ARGS__) #else # define CFATAL(writer, dispatchAction, ...) el::base::NullWriter() #endif // ELPP_FATAL_LOG #if ELPP_TRACE_LOG # define CTRACE(writer, dispatchAction, ...) ELPP_WRITE_LOG(writer, el::Level::Trace, dispatchAction, __VA_ARGS__) #else # define CTRACE(writer, dispatchAction, ...) el::base::NullWriter() #endif // ELPP_TRACE_LOG #if ELPP_VERBOSE_LOG # define CVERBOSE(writer, vlevel, dispatchAction, ...) if (VLOG_IS_ON(vlevel)) writer(\ el::Level::Verbose, __FILE__, __LINE__, ELPP_FUNC, dispatchAction, vlevel).construct(el_getVALength(__VA_ARGS__), __VA_ARGS__) #else # define CVERBOSE(writer, vlevel, dispatchAction, ...) el::base::NullWriter() #endif // ELPP_VERBOSE_LOG // Conditional logs #if ELPP_INFO_LOG # define CINFO_IF(writer, condition_, dispatchAction, ...) \ ELPP_WRITE_LOG_IF(writer, (condition_), el::Level::Info, dispatchAction, __VA_ARGS__) #else # define CINFO_IF(writer, condition_, dispatchAction, ...) el::base::NullWriter() #endif // ELPP_INFO_LOG #if ELPP_WARNING_LOG # define CWARNING_IF(writer, condition_, dispatchAction, ...)\ ELPP_WRITE_LOG_IF(writer, (condition_), el::Level::Warning, dispatchAction, __VA_ARGS__) #else # define CWARNING_IF(writer, condition_, dispatchAction, ...) el::base::NullWriter() #endif // ELPP_WARNING_LOG #if ELPP_DEBUG_LOG # define CDEBUG_IF(writer, condition_, dispatchAction, ...)\ ELPP_WRITE_LOG_IF(writer, (condition_), el::Level::Debug, dispatchAction, __VA_ARGS__) #else # define CDEBUG_IF(writer, condition_, dispatchAction, ...) el::base::NullWriter() #endif // ELPP_DEBUG_LOG #if ELPP_ERROR_LOG # define CERROR_IF(writer, condition_, dispatchAction, ...)\ ELPP_WRITE_LOG_IF(writer, (condition_), el::Level::Error, dispatchAction, __VA_ARGS__) #else # define CERROR_IF(writer, condition_, dispatchAction, ...) 
el::base::NullWriter() #endif // ELPP_ERROR_LOG #if ELPP_FATAL_LOG # define CFATAL_IF(writer, condition_, dispatchAction, ...)\ ELPP_WRITE_LOG_IF(writer, (condition_), el::Level::Fatal, dispatchAction, __VA_ARGS__) #else # define CFATAL_IF(writer, condition_, dispatchAction, ...) el::base::NullWriter() #endif // ELPP_FATAL_LOG #if ELPP_TRACE_LOG # define CTRACE_IF(writer, condition_, dispatchAction, ...)\ ELPP_WRITE_LOG_IF(writer, (condition_), el::Level::Trace, dispatchAction, __VA_ARGS__) #else # define CTRACE_IF(writer, condition_, dispatchAction, ...) el::base::NullWriter() #endif // ELPP_TRACE_LOG #if ELPP_VERBOSE_LOG # define CVERBOSE_IF(writer, condition_, vlevel, dispatchAction, ...) if (VLOG_IS_ON(vlevel) && (condition_)) writer( \ el::Level::Verbose, __FILE__, __LINE__, ELPP_FUNC, dispatchAction, vlevel).construct(el_getVALength(__VA_ARGS__), __VA_ARGS__) #else # define CVERBOSE_IF(writer, condition_, vlevel, dispatchAction, ...) el::base::NullWriter() #endif // ELPP_VERBOSE_LOG // Occasional logs #if ELPP_INFO_LOG # define CINFO_EVERY_N(writer, occasion, dispatchAction, ...)\ ELPP_WRITE_LOG_EVERY_N(writer, occasion, el::Level::Info, dispatchAction, __VA_ARGS__) #else # define CINFO_EVERY_N(writer, occasion, dispatchAction, ...) el::base::NullWriter() #endif // ELPP_INFO_LOG #if ELPP_WARNING_LOG # define CWARNING_EVERY_N(writer, occasion, dispatchAction, ...)\ ELPP_WRITE_LOG_EVERY_N(writer, occasion, el::Level::Warning, dispatchAction, __VA_ARGS__) #else # define CWARNING_EVERY_N(writer, occasion, dispatchAction, ...) el::base::NullWriter() #endif // ELPP_WARNING_LOG #if ELPP_DEBUG_LOG # define CDEBUG_EVERY_N(writer, occasion, dispatchAction, ...)\ ELPP_WRITE_LOG_EVERY_N(writer, occasion, el::Level::Debug, dispatchAction, __VA_ARGS__) #else # define CDEBUG_EVERY_N(writer, occasion, dispatchAction, ...) 
el::base::NullWriter() #endif // ELPP_DEBUG_LOG #if ELPP_ERROR_LOG # define CERROR_EVERY_N(writer, occasion, dispatchAction, ...)\ ELPP_WRITE_LOG_EVERY_N(writer, occasion, el::Level::Error, dispatchAction, __VA_ARGS__) #else # define CERROR_EVERY_N(writer, occasion, dispatchAction, ...) el::base::NullWriter() #endif // ELPP_ERROR_LOG #if ELPP_FATAL_LOG # define CFATAL_EVERY_N(writer, occasion, dispatchAction, ...)\ ELPP_WRITE_LOG_EVERY_N(writer, occasion, el::Level::Fatal, dispatchAction, __VA_ARGS__) #else # define CFATAL_EVERY_N(writer, occasion, dispatchAction, ...) el::base::NullWriter() #endif // ELPP_FATAL_LOG #if ELPP_TRACE_LOG # define CTRACE_EVERY_N(writer, occasion, dispatchAction, ...)\ ELPP_WRITE_LOG_EVERY_N(writer, occasion, el::Level::Trace, dispatchAction, __VA_ARGS__) #else # define CTRACE_EVERY_N(writer, occasion, dispatchAction, ...) el::base::NullWriter() #endif // ELPP_TRACE_LOG #if ELPP_VERBOSE_LOG # define CVERBOSE_EVERY_N(writer, occasion, vlevel, dispatchAction, ...)\ CVERBOSE_IF(writer, ELPP->validateEveryNCounter(__FILE__, __LINE__, occasion), vlevel, dispatchAction, __VA_ARGS__) #else # define CVERBOSE_EVERY_N(writer, occasion, vlevel, dispatchAction, ...) el::base::NullWriter() #endif // ELPP_VERBOSE_LOG // After N logs #if ELPP_INFO_LOG # define CINFO_AFTER_N(writer, n, dispatchAction, ...)\ ELPP_WRITE_LOG_AFTER_N(writer, n, el::Level::Info, dispatchAction, __VA_ARGS__) #else # define CINFO_AFTER_N(writer, n, dispatchAction, ...) el::base::NullWriter() #endif // ELPP_INFO_LOG #if ELPP_WARNING_LOG # define CWARNING_AFTER_N(writer, n, dispatchAction, ...)\ ELPP_WRITE_LOG_AFTER_N(writer, n, el::Level::Warning, dispatchAction, __VA_ARGS__) #else # define CWARNING_AFTER_N(writer, n, dispatchAction, ...) 
el::base::NullWriter() #endif // ELPP_WARNING_LOG #if ELPP_DEBUG_LOG # define CDEBUG_AFTER_N(writer, n, dispatchAction, ...)\ ELPP_WRITE_LOG_AFTER_N(writer, n, el::Level::Debug, dispatchAction, __VA_ARGS__) #else # define CDEBUG_AFTER_N(writer, n, dispatchAction, ...) el::base::NullWriter() #endif // ELPP_DEBUG_LOG #if ELPP_ERROR_LOG # define CERROR_AFTER_N(writer, n, dispatchAction, ...)\ ELPP_WRITE_LOG_AFTER_N(writer, n, el::Level::Error, dispatchAction, __VA_ARGS__) #else # define CERROR_AFTER_N(writer, n, dispatchAction, ...) el::base::NullWriter() #endif // ELPP_ERROR_LOG #if ELPP_FATAL_LOG # define CFATAL_AFTER_N(writer, n, dispatchAction, ...)\ ELPP_WRITE_LOG_AFTER_N(writer, n, el::Level::Fatal, dispatchAction, __VA_ARGS__) #else # define CFATAL_AFTER_N(writer, n, dispatchAction, ...) el::base::NullWriter() #endif // ELPP_FATAL_LOG #if ELPP_TRACE_LOG # define CTRACE_AFTER_N(writer, n, dispatchAction, ...)\ ELPP_WRITE_LOG_AFTER_N(writer, n, el::Level::Trace, dispatchAction, __VA_ARGS__) #else # define CTRACE_AFTER_N(writer, n, dispatchAction, ...) el::base::NullWriter() #endif // ELPP_TRACE_LOG #if ELPP_VERBOSE_LOG # define CVERBOSE_AFTER_N(writer, n, vlevel, dispatchAction, ...)\ CVERBOSE_IF(writer, ELPP->validateAfterNCounter(__FILE__, __LINE__, n), vlevel, dispatchAction, __VA_ARGS__) #else # define CVERBOSE_AFTER_N(writer, n, vlevel, dispatchAction, ...) el::base::NullWriter() #endif // ELPP_VERBOSE_LOG // N Times logs #if ELPP_INFO_LOG # define CINFO_N_TIMES(writer, n, dispatchAction, ...)\ ELPP_WRITE_LOG_N_TIMES(writer, n, el::Level::Info, dispatchAction, __VA_ARGS__) #else # define CINFO_N_TIMES(writer, n, dispatchAction, ...) el::base::NullWriter() #endif // ELPP_INFO_LOG #if ELPP_WARNING_LOG # define CWARNING_N_TIMES(writer, n, dispatchAction, ...)\ ELPP_WRITE_LOG_N_TIMES(writer, n, el::Level::Warning, dispatchAction, __VA_ARGS__) #else # define CWARNING_N_TIMES(writer, n, dispatchAction, ...) 
el::base::NullWriter()
#endif // ELPP_WARNING_LOG
// Remaining C<LEVEL>_N_TIMES macros: forward to ELPP_WRITE_LOG_N_TIMES when the
// level is compiled in, otherwise produce an el::base::NullWriter().
#if ELPP_DEBUG_LOG
# define CDEBUG_N_TIMES(writer, n, dispatchAction, ...)\
    ELPP_WRITE_LOG_N_TIMES(writer, n, el::Level::Debug, dispatchAction, __VA_ARGS__)
#else
# define CDEBUG_N_TIMES(writer, n, dispatchAction, ...) el::base::NullWriter()
#endif // ELPP_DEBUG_LOG
#if ELPP_ERROR_LOG
# define CERROR_N_TIMES(writer, n, dispatchAction, ...)\
    ELPP_WRITE_LOG_N_TIMES(writer, n, el::Level::Error, dispatchAction, __VA_ARGS__)
#else
# define CERROR_N_TIMES(writer, n, dispatchAction, ...) el::base::NullWriter()
#endif // ELPP_ERROR_LOG
#if ELPP_FATAL_LOG
# define CFATAL_N_TIMES(writer, n, dispatchAction, ...)\
    ELPP_WRITE_LOG_N_TIMES(writer, n, el::Level::Fatal, dispatchAction, __VA_ARGS__)
#else
# define CFATAL_N_TIMES(writer, n, dispatchAction, ...) el::base::NullWriter()
#endif // ELPP_FATAL_LOG
#if ELPP_TRACE_LOG
# define CTRACE_N_TIMES(writer, n, dispatchAction, ...)\
    ELPP_WRITE_LOG_N_TIMES(writer, n, el::Level::Trace, dispatchAction, __VA_ARGS__)
#else
# define CTRACE_N_TIMES(writer, n, dispatchAction, ...) el::base::NullWriter()
#endif // ELPP_TRACE_LOG
// The verbose variant is built on CVERBOSE_IF, using the N-times hit counter
// for this __FILE__/__LINE__ as the condition.
#if ELPP_VERBOSE_LOG
# define CVERBOSE_N_TIMES(writer, n, vlevel, dispatchAction, ...)\
    CVERBOSE_IF(writer, ELPP->validateNTimesCounter(__FILE__, __LINE__, n), vlevel, dispatchAction, __VA_ARGS__)
#else
# define CVERBOSE_N_TIMES(writer, n, vlevel, dispatchAction, ...) el::base::NullWriter()
#endif // ELPP_VERBOSE_LOG
//
// Custom Loggers - Requires (level, dispatchAction, loggerId/s)
//
// undef existing
#undef CLOG
#undef CLOG_VERBOSE
#undef CVLOG
#undef CLOG_IF
#undef CLOG_VERBOSE_IF
#undef CVLOG_IF
#undef CLOG_EVERY_N
#undef CVLOG_EVERY_N
#undef CLOG_AFTER_N
#undef CVLOG_AFTER_N
#undef CLOG_N_TIMES
#undef CVLOG_N_TIMES
// Normal logs
// CLOG pastes LEVEL onto "C" (e.g. CLOG(INFO, ...) becomes CINFO(...)) and
// selects el::base::Writer with the NormalLog dispatch action.
#define CLOG(LEVEL, ...)\
    C##LEVEL(el::base::Writer, el::base::DispatchAction::NormalLog, __VA_ARGS__)
#define CVLOG(vlevel, ...)
CVERBOSE(el::base::Writer, vlevel, el::base::DispatchAction::NormalLog, __VA_ARGS__)
// Conditional logs
#define CLOG_IF(condition, LEVEL, ...)\
    C##LEVEL##_IF(el::base::Writer, condition, el::base::DispatchAction::NormalLog, __VA_ARGS__)
#define CVLOG_IF(condition, vlevel, ...)\
    CVERBOSE_IF(el::base::Writer, condition, vlevel, el::base::DispatchAction::NormalLog, __VA_ARGS__)
// Hit counts based logs
#define CLOG_EVERY_N(n, LEVEL, ...)\
    C##LEVEL##_EVERY_N(el::base::Writer, n, el::base::DispatchAction::NormalLog, __VA_ARGS__)
#define CVLOG_EVERY_N(n, vlevel, ...)\
    CVERBOSE_EVERY_N(el::base::Writer, n, vlevel, el::base::DispatchAction::NormalLog, __VA_ARGS__)
#define CLOG_AFTER_N(n, LEVEL, ...)\
    C##LEVEL##_AFTER_N(el::base::Writer, n, el::base::DispatchAction::NormalLog, __VA_ARGS__)
#define CVLOG_AFTER_N(n, vlevel, ...)\
    CVERBOSE_AFTER_N(el::base::Writer, n, vlevel, el::base::DispatchAction::NormalLog, __VA_ARGS__)
#define CLOG_N_TIMES(n, LEVEL, ...)\
    C##LEVEL##_N_TIMES(el::base::Writer, n, el::base::DispatchAction::NormalLog, __VA_ARGS__)
#define CVLOG_N_TIMES(n, vlevel, ...)\
    CVERBOSE_N_TIMES(el::base::Writer, n, vlevel, el::base::DispatchAction::NormalLog, __VA_ARGS__)
//
// Default Loggers macro using CLOG(), CLOG_VERBOSE() and CVLOG() macros
//
// undef existing
#undef LOG
#undef VLOG
#undef LOG_IF
#undef VLOG_IF
#undef LOG_EVERY_N
#undef VLOG_EVERY_N
#undef LOG_AFTER_N
#undef VLOG_AFTER_N
#undef LOG_N_TIMES
#undef VLOG_N_TIMES
#undef ELPP_CURR_FILE_LOGGER_ID
// Logger id used by the short-form macros below: ELPP_DEFAULT_LOGGER when the
// including code defines it, otherwise the library's default logger id.
#if defined(ELPP_DEFAULT_LOGGER)
# define ELPP_CURR_FILE_LOGGER_ID ELPP_DEFAULT_LOGGER
#else
# define ELPP_CURR_FILE_LOGGER_ID el::base::consts::kDefaultLoggerId
#endif
#undef ELPP_TRACE
#define ELPP_TRACE CLOG(TRACE, ELPP_CURR_FILE_LOGGER_ID)
// Normal logs
#define LOG(LEVEL) CLOG(LEVEL, ELPP_CURR_FILE_LOGGER_ID)
#define VLOG(vlevel) CVLOG(vlevel, ELPP_CURR_FILE_LOGGER_ID)
// Conditional logs
#define LOG_IF(condition, LEVEL) CLOG_IF(condition, LEVEL, ELPP_CURR_FILE_LOGGER_ID)
#define VLOG_IF(condition,
vlevel) CVLOG_IF(condition, vlevel, ELPP_CURR_FILE_LOGGER_ID)
// Hit counts based logs
#define LOG_EVERY_N(n, LEVEL) CLOG_EVERY_N(n, LEVEL, ELPP_CURR_FILE_LOGGER_ID)
#define VLOG_EVERY_N(n, vlevel) CVLOG_EVERY_N(n, vlevel, ELPP_CURR_FILE_LOGGER_ID)
#define LOG_AFTER_N(n, LEVEL) CLOG_AFTER_N(n, LEVEL, ELPP_CURR_FILE_LOGGER_ID)
#define VLOG_AFTER_N(n, vlevel) CVLOG_AFTER_N(n, vlevel, ELPP_CURR_FILE_LOGGER_ID)
#define LOG_N_TIMES(n, LEVEL) CLOG_N_TIMES(n, LEVEL, ELPP_CURR_FILE_LOGGER_ID)
#define VLOG_N_TIMES(n, vlevel) CVLOG_N_TIMES(n, vlevel, ELPP_CURR_FILE_LOGGER_ID)
// Generic PLOG()
#undef CPLOG
#undef CPLOG_IF
#undef PLOG
#undef PLOG_IF
#undef DCPLOG
#undef DCPLOG_IF
#undef DPLOG
#undef DPLOG_IF
// The PLOG family mirrors CLOG/LOG but routes through el::base::PErrorWriter.
#define CPLOG(LEVEL, ...)\
    C##LEVEL(el::base::PErrorWriter, el::base::DispatchAction::NormalLog, __VA_ARGS__)
#define CPLOG_IF(condition, LEVEL, ...)\
    C##LEVEL##_IF(el::base::PErrorWriter, condition, el::base::DispatchAction::NormalLog, __VA_ARGS__)
// Debug-only variants: gated on ELPP_DEBUG_LOG, either with a leading `if` or
// by folding the switch into the condition.
#define DCPLOG(LEVEL, ...)\
    if (ELPP_DEBUG_LOG) C##LEVEL(el::base::PErrorWriter, el::base::DispatchAction::NormalLog, __VA_ARGS__)
#define DCPLOG_IF(condition, LEVEL, ...)\
    C##LEVEL##_IF(el::base::PErrorWriter, (ELPP_DEBUG_LOG) && (condition), el::base::DispatchAction::NormalLog, __VA_ARGS__)
#define PLOG(LEVEL) CPLOG(LEVEL, ELPP_CURR_FILE_LOGGER_ID)
#define PLOG_IF(condition, LEVEL) CPLOG_IF(condition, LEVEL, ELPP_CURR_FILE_LOGGER_ID)
#define DPLOG(LEVEL) DCPLOG(LEVEL, ELPP_CURR_FILE_LOGGER_ID)
#define DPLOG_IF(condition, LEVEL) DCPLOG_IF(condition, LEVEL, ELPP_CURR_FILE_LOGGER_ID)
// Generic SYSLOG()
#undef CSYSLOG
#undef CSYSLOG_IF
#undef CSYSLOG_EVERY_N
#undef CSYSLOG_AFTER_N
#undef CSYSLOG_N_TIMES
#undef SYSLOG
#undef SYSLOG_IF
#undef SYSLOG_EVERY_N
#undef SYSLOG_AFTER_N
#undef SYSLOG_N_TIMES
#undef DCSYSLOG
#undef DCSYSLOG_IF
#undef DCSYSLOG_EVERY_N
#undef DCSYSLOG_AFTER_N
#undef DCSYSLOG_N_TIMES
#undef DSYSLOG
#undef DSYSLOG_IF
#undef DSYSLOG_EVERY_N
#undef DSYSLOG_AFTER_N
#undef DSYSLOG_N_TIMES
#if defined(ELPP_SYSLOG)
// SYSLOG family (ELPP_SYSLOG defined): same shape as the CLOG macros, but with
// the SysLog dispatch action. The short forms use the syslog logger id.
# define CSYSLOG(LEVEL, ...)\
    C##LEVEL(el::base::Writer, el::base::DispatchAction::SysLog, __VA_ARGS__)
# define CSYSLOG_IF(condition, LEVEL, ...)\
    C##LEVEL##_IF(el::base::Writer, condition, el::base::DispatchAction::SysLog, __VA_ARGS__)
# define CSYSLOG_EVERY_N(n, LEVEL, ...) C##LEVEL##_EVERY_N(el::base::Writer, n, el::base::DispatchAction::SysLog, __VA_ARGS__)
# define CSYSLOG_AFTER_N(n, LEVEL, ...) C##LEVEL##_AFTER_N(el::base::Writer, n, el::base::DispatchAction::SysLog, __VA_ARGS__)
# define CSYSLOG_N_TIMES(n, LEVEL, ...) C##LEVEL##_N_TIMES(el::base::Writer, n, el::base::DispatchAction::SysLog, __VA_ARGS__)
# define SYSLOG(LEVEL) CSYSLOG(LEVEL, el::base::consts::kSysLogLoggerId)
# define SYSLOG_IF(condition, LEVEL) CSYSLOG_IF(condition, LEVEL, el::base::consts::kSysLogLoggerId)
# define SYSLOG_EVERY_N(n, LEVEL) CSYSLOG_EVERY_N(n, LEVEL, el::base::consts::kSysLogLoggerId)
# define SYSLOG_AFTER_N(n, LEVEL) CSYSLOG_AFTER_N(n, LEVEL, el::base::consts::kSysLogLoggerId)
# define SYSLOG_N_TIMES(n, LEVEL) CSYSLOG_N_TIMES(n, LEVEL, el::base::consts::kSysLogLoggerId)
# define DCSYSLOG(LEVEL, ...)
if (ELPP_DEBUG_LOG) C##LEVEL(el::base::Writer, el::base::DispatchAction::SysLog, __VA_ARGS__) # define DCSYSLOG_IF(condition, LEVEL, ...)\ C##LEVEL##_IF(el::base::Writer, (ELPP_DEBUG_LOG) && (condition), el::base::DispatchAction::SysLog, __VA_ARGS__) # define DCSYSLOG_EVERY_N(n, LEVEL, ...)\ if (ELPP_DEBUG_LOG) C##LEVEL##_EVERY_N(el::base::Writer, n, el::base::DispatchAction::SysLog, __VA_ARGS__) # define DCSYSLOG_AFTER_N(n, LEVEL, ...)\ if (ELPP_DEBUG_LOG) C##LEVEL##_AFTER_N(el::base::Writer, n, el::base::DispatchAction::SysLog, __VA_ARGS__) # define DCSYSLOG_N_TIMES(n, LEVEL, ...)\ if (ELPP_DEBUG_LOG) C##LEVEL##_EVERY_N(el::base::Writer, n, el::base::DispatchAction::SysLog, __VA_ARGS__) # define DSYSLOG(LEVEL) DCSYSLOG(LEVEL, el::base::consts::kSysLogLoggerId) # define DSYSLOG_IF(condition, LEVEL) DCSYSLOG_IF(condition, LEVEL, el::base::consts::kSysLogLoggerId) # define DSYSLOG_EVERY_N(n, LEVEL) DCSYSLOG_EVERY_N(n, LEVEL, el::base::consts::kSysLogLoggerId) # define DSYSLOG_AFTER_N(n, LEVEL) DCSYSLOG_AFTER_N(n, LEVEL, el::base::consts::kSysLogLoggerId) # define DSYSLOG_N_TIMES(n, LEVEL) DCSYSLOG_N_TIMES(n, LEVEL, el::base::consts::kSysLogLoggerId) #else # define CSYSLOG(LEVEL, ...) el::base::NullWriter() # define CSYSLOG_IF(condition, LEVEL, ...) el::base::NullWriter() # define CSYSLOG_EVERY_N(n, LEVEL, ...) el::base::NullWriter() # define CSYSLOG_AFTER_N(n, LEVEL, ...) el::base::NullWriter() # define CSYSLOG_N_TIMES(n, LEVEL, ...) el::base::NullWriter() # define SYSLOG(LEVEL) el::base::NullWriter() # define SYSLOG_IF(condition, LEVEL) el::base::NullWriter() # define SYSLOG_EVERY_N(n, LEVEL) el::base::NullWriter() # define SYSLOG_AFTER_N(n, LEVEL) el::base::NullWriter() # define SYSLOG_N_TIMES(n, LEVEL) el::base::NullWriter() # define DCSYSLOG(LEVEL, ...) el::base::NullWriter() # define DCSYSLOG_IF(condition, LEVEL, ...) el::base::NullWriter() # define DCSYSLOG_EVERY_N(n, LEVEL, ...) el::base::NullWriter() # define DCSYSLOG_AFTER_N(n, LEVEL, ...) 
el::base::NullWriter()
# define DCSYSLOG_N_TIMES(n, LEVEL, ...) el::base::NullWriter()
# define DSYSLOG(LEVEL) el::base::NullWriter()
# define DSYSLOG_IF(condition, LEVEL) el::base::NullWriter()
# define DSYSLOG_EVERY_N(n, LEVEL) el::base::NullWriter()
# define DSYSLOG_AFTER_N(n, LEVEL) el::base::NullWriter()
# define DSYSLOG_N_TIMES(n, LEVEL) el::base::NullWriter()
#endif // defined(ELPP_SYSLOG)
//
// Custom Debug Only Loggers - Requires (level, loggerId/s)
//
// undef existing
#undef DCLOG
#undef DCVLOG
#undef DCLOG_IF
#undef DCVLOG_IF
#undef DCLOG_EVERY_N
#undef DCVLOG_EVERY_N
#undef DCLOG_AFTER_N
#undef DCVLOG_AFTER_N
#undef DCLOG_N_TIMES
#undef DCVLOG_N_TIMES
// Each debug-only macro wraps its C* counterpart in `if (ELPP_DEBUG_LOG)`.
// Normal logs
#define DCLOG(LEVEL, ...) if (ELPP_DEBUG_LOG) CLOG(LEVEL, __VA_ARGS__)
// NOTE(review): DCLOG_VERBOSE expands to CLOG_VERBOSE, for which no #define is
// visible in this part of the header, and DCLOG_VERBOSE is absent from the
// #undef list above - confirm against the full header before relying on it.
#define DCLOG_VERBOSE(vlevel, ...) if (ELPP_DEBUG_LOG) CLOG_VERBOSE(vlevel, __VA_ARGS__)
#define DCVLOG(vlevel, ...) if (ELPP_DEBUG_LOG) CVLOG(vlevel, __VA_ARGS__)
// Conditional logs
#define DCLOG_IF(condition, LEVEL, ...) if (ELPP_DEBUG_LOG) CLOG_IF(condition, LEVEL, __VA_ARGS__)
#define DCVLOG_IF(condition, vlevel, ...) if (ELPP_DEBUG_LOG) CVLOG_IF(condition, vlevel, __VA_ARGS__)
// Hit counts based logs
#define DCLOG_EVERY_N(n, LEVEL, ...) if (ELPP_DEBUG_LOG) CLOG_EVERY_N(n, LEVEL, __VA_ARGS__)
#define DCVLOG_EVERY_N(n, vlevel, ...) if (ELPP_DEBUG_LOG) CVLOG_EVERY_N(n, vlevel, __VA_ARGS__)
#define DCLOG_AFTER_N(n, LEVEL, ...) if (ELPP_DEBUG_LOG) CLOG_AFTER_N(n, LEVEL, __VA_ARGS__)
#define DCVLOG_AFTER_N(n, vlevel, ...) if (ELPP_DEBUG_LOG) CVLOG_AFTER_N(n, vlevel, __VA_ARGS__)
#define DCLOG_N_TIMES(n, LEVEL, ...) if (ELPP_DEBUG_LOG) CLOG_N_TIMES(n, LEVEL, __VA_ARGS__)
#define DCVLOG_N_TIMES(n, vlevel, ...)
if (ELPP_DEBUG_LOG) CVLOG_N_TIMES(n, vlevel, __VA_ARGS__)
//
// Default Debug Only Loggers macro using CLOG(), CLOG_VERBOSE() and CVLOG() macros
//
// undef existing
#undef DLOG
#undef DVLOG
#undef DLOG_IF
#undef DVLOG_IF
#undef DLOG_EVERY_N
#undef DVLOG_EVERY_N
#undef DLOG_AFTER_N
#undef DVLOG_AFTER_N
#undef DLOG_N_TIMES
#undef DVLOG_N_TIMES
// Short forms of the DC* macros, bound to ELPP_CURR_FILE_LOGGER_ID.
// Normal logs
#define DLOG(LEVEL) DCLOG(LEVEL, ELPP_CURR_FILE_LOGGER_ID)
#define DVLOG(vlevel) DCVLOG(vlevel, ELPP_CURR_FILE_LOGGER_ID)
// Conditional logs
#define DLOG_IF(condition, LEVEL) DCLOG_IF(condition, LEVEL, ELPP_CURR_FILE_LOGGER_ID)
#define DVLOG_IF(condition, vlevel) DCVLOG_IF(condition, vlevel, ELPP_CURR_FILE_LOGGER_ID)
// Hit counts based logs
#define DLOG_EVERY_N(n, LEVEL) DCLOG_EVERY_N(n, LEVEL, ELPP_CURR_FILE_LOGGER_ID)
#define DVLOG_EVERY_N(n, vlevel) DCVLOG_EVERY_N(n, vlevel, ELPP_CURR_FILE_LOGGER_ID)
#define DLOG_AFTER_N(n, LEVEL) DCLOG_AFTER_N(n, LEVEL, ELPP_CURR_FILE_LOGGER_ID)
#define DVLOG_AFTER_N(n, vlevel) DCVLOG_AFTER_N(n, vlevel, ELPP_CURR_FILE_LOGGER_ID)
#define DLOG_N_TIMES(n, LEVEL) DCLOG_N_TIMES(n, LEVEL, ELPP_CURR_FILE_LOGGER_ID)
#define DVLOG_N_TIMES(n, vlevel) DCVLOG_N_TIMES(n, vlevel, ELPP_CURR_FILE_LOGGER_ID)
// Check macros
#undef CCHECK
#undef CPCHECK
#undef CCHECK_EQ
#undef CCHECK_NE
#undef CCHECK_LT
#undef CCHECK_GT
#undef CCHECK_LE
#undef CCHECK_GE
#undef CCHECK_BOUNDS
#undef CCHECK_NOTNULL
#undef CCHECK_STRCASEEQ
#undef CCHECK_STRCASENE
#undef CHECK
#undef PCHECK
#undef CHECK_EQ
#undef CHECK_NE
#undef CHECK_LT
#undef CHECK_GT
#undef CHECK_LE
#undef CHECK_GE
#undef CHECK_BOUNDS
#undef CHECK_NOTNULL
#undef CHECK_STRCASEEQ
#undef CHECK_STRCASENE
// CCHECK emits a FATAL-level conditional log when `condition` is false. The
// stringified condition is streamed first, and the caller may append more
// text with operator<<.
#define CCHECK(condition, ...) CLOG_IF(!(condition), FATAL, __VA_ARGS__) << "Check failed: [" << #condition << "] "
#define CPCHECK(condition, ...)
CPLOG_IF(!(condition), FATAL, __VA_ARGS__) << "Check failed: [" << #condition << "] " #define CHECK(condition) CCHECK(condition, ELPP_CURR_FILE_LOGGER_ID) #define PCHECK(condition) CPCHECK(condition, ELPP_CURR_FILE_LOGGER_ID) #define CCHECK_EQ(a, b, ...) CCHECK(a == b, __VA_ARGS__) #define CCHECK_NE(a, b, ...) CCHECK(a != b, __VA_ARGS__) #define CCHECK_LT(a, b, ...) CCHECK(a < b, __VA_ARGS__) #define CCHECK_GT(a, b, ...) CCHECK(a > b, __VA_ARGS__) #define CCHECK_LE(a, b, ...) CCHECK(a <= b, __VA_ARGS__) #define CCHECK_GE(a, b, ...) CCHECK(a >= b, __VA_ARGS__) #define CCHECK_BOUNDS(val, min, max, ...) CCHECK(val >= min && val <= max, __VA_ARGS__) #define CHECK_EQ(a, b) CCHECK_EQ(a, b, ELPP_CURR_FILE_LOGGER_ID) #define CHECK_NE(a, b) CCHECK_NE(a, b, ELPP_CURR_FILE_LOGGER_ID) #define CHECK_LT(a, b) CCHECK_LT(a, b, ELPP_CURR_FILE_LOGGER_ID) #define CHECK_GT(a, b) CCHECK_GT(a, b, ELPP_CURR_FILE_LOGGER_ID) #define CHECK_LE(a, b) CCHECK_LE(a, b, ELPP_CURR_FILE_LOGGER_ID) #define CHECK_GE(a, b) CCHECK_GE(a, b, ELPP_CURR_FILE_LOGGER_ID) #define CHECK_BOUNDS(val, min, max) CCHECK_BOUNDS(val, min, max, ELPP_CURR_FILE_LOGGER_ID) namespace el { namespace base { namespace utils { template static T* checkNotNull(T* ptr, const char* name, const char* loggers, ...) { CLOG_IF(ptr == nullptr, FATAL, loggers) << "Check failed: [" << name << " != nullptr]"; return ptr; } } // namespace utils } // namespace base } // namespace el #define CCHECK_NOTNULL(ptr, ...) el::base::utils::checkNotNull(ptr, #ptr, __VA_ARGS__) #define CCHECK_STREQ(str1, str2, ...) CLOG_IF(!el::base::utils::Str::cStringEq(str1, str2), FATAL, __VA_ARGS__) \ << "Check failed: [" << #str1 << " == " << #str2 << "] " #define CCHECK_STRNE(str1, str2, ...) CLOG_IF(el::base::utils::Str::cStringEq(str1, str2), FATAL, __VA_ARGS__) \ << "Check failed: [" << #str1 << " != " << #str2 << "] " #define CCHECK_STRCASEEQ(str1, str2, ...) 
CLOG_IF(!el::base::utils::Str::cStringCaseEq(str1, str2), FATAL, __VA_ARGS__) \ << "Check failed: [" << #str1 << " == " << #str2 << "] " #define CCHECK_STRCASENE(str1, str2, ...) CLOG_IF(el::base::utils::Str::cStringCaseEq(str1, str2), FATAL, __VA_ARGS__) \ << "Check failed: [" << #str1 << " != " << #str2 << "] " #define CHECK_NOTNULL(ptr) CCHECK_NOTNULL(ptr, ELPP_CURR_FILE_LOGGER_ID) #define CHECK_STREQ(str1, str2) CCHECK_STREQ(str1, str2, ELPP_CURR_FILE_LOGGER_ID) #define CHECK_STRNE(str1, str2) CCHECK_STRNE(str1, str2, ELPP_CURR_FILE_LOGGER_ID) #define CHECK_STRCASEEQ(str1, str2) CCHECK_STRCASEEQ(str1, str2, ELPP_CURR_FILE_LOGGER_ID) #define CHECK_STRCASENE(str1, str2) CCHECK_STRCASENE(str1, str2, ELPP_CURR_FILE_LOGGER_ID) #undef DCCHECK #undef DCCHECK_EQ #undef DCCHECK_NE #undef DCCHECK_LT #undef DCCHECK_GT #undef DCCHECK_LE #undef DCCHECK_GE #undef DCCHECK_BOUNDS #undef DCCHECK_NOTNULL #undef DCCHECK_STRCASEEQ #undef DCCHECK_STRCASENE #undef DCPCHECK #undef DCHECK #undef DCHECK_EQ #undef DCHECK_NE #undef DCHECK_LT #undef DCHECK_GT #undef DCHECK_LE #undef DCHECK_GE #undef DCHECK_BOUNDS_ #undef DCHECK_NOTNULL #undef DCHECK_STRCASEEQ #undef DCHECK_STRCASENE #undef DPCHECK #define DCCHECK(condition, ...) if (ELPP_DEBUG_LOG) CCHECK(condition, __VA_ARGS__) #define DCCHECK_EQ(a, b, ...) if (ELPP_DEBUG_LOG) CCHECK_EQ(a, b, __VA_ARGS__) #define DCCHECK_NE(a, b, ...) if (ELPP_DEBUG_LOG) CCHECK_NE(a, b, __VA_ARGS__) #define DCCHECK_LT(a, b, ...) if (ELPP_DEBUG_LOG) CCHECK_LT(a, b, __VA_ARGS__) #define DCCHECK_GT(a, b, ...) if (ELPP_DEBUG_LOG) CCHECK_GT(a, b, __VA_ARGS__) #define DCCHECK_LE(a, b, ...) if (ELPP_DEBUG_LOG) CCHECK_LE(a, b, __VA_ARGS__) #define DCCHECK_GE(a, b, ...) if (ELPP_DEBUG_LOG) CCHECK_GE(a, b, __VA_ARGS__) #define DCCHECK_BOUNDS(val, min, max, ...) if (ELPP_DEBUG_LOG) CCHECK_BOUNDS(val, min, max, __VA_ARGS__) #define DCCHECK_NOTNULL(ptr, ...) if (ELPP_DEBUG_LOG) CCHECK_NOTNULL(ptr, __VA_ARGS__) #define DCCHECK_STREQ(str1, str2, ...) 
if (ELPP_DEBUG_LOG) CCHECK_STREQ(str1, str2, __VA_ARGS__)
#define DCCHECK_STRNE(str1, str2, ...) if (ELPP_DEBUG_LOG) CCHECK_STRNE(str1, str2, __VA_ARGS__)
#define DCCHECK_STRCASEEQ(str1, str2, ...) if (ELPP_DEBUG_LOG) CCHECK_STRCASEEQ(str1, str2, __VA_ARGS__)
#define DCCHECK_STRCASENE(str1, str2, ...) if (ELPP_DEBUG_LOG) CCHECK_STRCASENE(str1, str2, __VA_ARGS__)
#define DCPCHECK(condition, ...) if (ELPP_DEBUG_LOG) CPCHECK(condition, __VA_ARGS__)
// Short forms of the debug-only checks, bound to ELPP_CURR_FILE_LOGGER_ID.
#define DCHECK(condition) DCCHECK(condition, ELPP_CURR_FILE_LOGGER_ID)
#define DCHECK_EQ(a, b) DCCHECK_EQ(a, b, ELPP_CURR_FILE_LOGGER_ID)
#define DCHECK_NE(a, b) DCCHECK_NE(a, b, ELPP_CURR_FILE_LOGGER_ID)
#define DCHECK_LT(a, b) DCCHECK_LT(a, b, ELPP_CURR_FILE_LOGGER_ID)
#define DCHECK_GT(a, b) DCCHECK_GT(a, b, ELPP_CURR_FILE_LOGGER_ID)
#define DCHECK_LE(a, b) DCCHECK_LE(a, b, ELPP_CURR_FILE_LOGGER_ID)
#define DCHECK_GE(a, b) DCCHECK_GE(a, b, ELPP_CURR_FILE_LOGGER_ID)
#define DCHECK_BOUNDS(val, min, max) DCCHECK_BOUNDS(val, min, max, ELPP_CURR_FILE_LOGGER_ID)
#define DCHECK_NOTNULL(ptr) DCCHECK_NOTNULL(ptr, ELPP_CURR_FILE_LOGGER_ID)
#define DCHECK_STREQ(str1, str2) DCCHECK_STREQ(str1, str2, ELPP_CURR_FILE_LOGGER_ID)
#define DCHECK_STRNE(str1, str2) DCCHECK_STRNE(str1, str2, ELPP_CURR_FILE_LOGGER_ID)
#define DCHECK_STRCASEEQ(str1, str2) DCCHECK_STRCASEEQ(str1, str2, ELPP_CURR_FILE_LOGGER_ID)
#define DCHECK_STRCASENE(str1, str2) DCCHECK_STRCASENE(str1, str2, ELPP_CURR_FILE_LOGGER_ID)
#define DPCHECK(condition) DCPCHECK(condition, ELPP_CURR_FILE_LOGGER_ID)
// Value passed to the CrashHandler constructor by the INIT/SHARE macros below:
// false when ELPP_DISABLE_DEFAULT_CRASH_HANDLING is defined, true otherwise.
#if defined(ELPP_DISABLE_DEFAULT_CRASH_HANDLING)
# define ELPP_USE_DEF_CRASH_HANDLER false
#else
# define ELPP_USE_DEF_CRASH_HANDLER true
#endif // defined(ELPP_DISABLE_DEFAULT_CRASH_HANDLING)
#define ELPP_CRASH_HANDLER_INIT
// Defines the el::base::elStorage and el::elCrashHandler objects; `val`
// initialises the storage pointer.
#define ELPP_INIT_EASYLOGGINGPP(val)\
    ELPP_INITI_BASIC_DECLR\
    namespace el {\
    namespace base {\
    el::base::type::StoragePointer elStorage(val);\
    }\
    el::base::debug::CrashHandler elCrashHandler(ELPP_USE_DEF_CRASH_HANDLER);\
    }
#if ELPP_ASYNC_LOGGING
#
define INITIALIZE_EASYLOGGINGPP\ ELPP_INIT_EASYLOGGINGPP(new el::base::Storage(el::LogBuilderPtr(new el::base::DefaultLogBuilder()),\ new el::base::AsyncDispatchWorker()))\ #else # define INITIALIZE_EASYLOGGINGPP\ ELPP_INIT_EASYLOGGINGPP(new el::base::Storage(el::LogBuilderPtr(new el::base::DefaultLogBuilder()))) #endif // ELPP_ASYNC_LOGGING #define INITIALIZE_NULL_EASYLOGGINGPP\ ELPP_INITI_BASIC_DECLR\ namespace el {\ namespace base {\ el::base::type::StoragePointer elStorage;\ }\ el::base::debug::CrashHandler elCrashHandler(ELPP_USE_DEF_CRASH_HANDLER);\ } // NOTE: no ELPP_INITI_BASIC_DECLR when sharing - causes double free corruption on external symbols #define SHARE_EASYLOGGINGPP(initializedStorage)\ namespace el {\ namespace base {\ el::base::type::StoragePointer elStorage(initializedStorage);\ }\ el::base::debug::CrashHandler elCrashHandler(ELPP_USE_DEF_CRASH_HANDLER);\ } #if defined(ELPP_UNICODE) # define START_EASYLOGGINGPP(argc, argv) el::Helpers::setArgs(argc, argv); std::locale::global(std::locale("")) #else # define START_EASYLOGGINGPP(argc, argv) el::Helpers::setArgs(argc, argv) #endif // defined(ELPP_UNICODE) #endif // EASYLOGGINGPP_H pbdagcon-0.3+20161121+ds/src/cpp/DazAlnProvider.hpp0000644000175000017500000001170413026414536017775 0ustar afifafif#pragma once #include #include #include #include #include #include "ProgramOpts.hpp" #include "Alignment.hpp" #include "AlnProvider.hpp" // Dazzler headers extern "C" { #include "DB.h" #include "align.h" } // Represents one record from the LAS file, essentially a thin container for // a dazzler overlap so we can manage things on the stack. 
struct Record { Overlap ovl; std::vector trace; Record() {} ~Record() {} Record(Record &&o) noexcept : ovl(std::move(o.ovl)), trace(std::move(o.trace)) { o.ovl.path.trace = NULL; } Record& operator=(Record &&o) noexcept { ovl = std::move(o.ovl); trace = std::move(o.trace); o.ovl.path.trace = NULL; return *this; } Record(const Record& o) : ovl(o.ovl), trace(o.trace) { ovl.path.trace = (void *) &trace.front(); } Record& operator=(const Record& o) { ovl = o.ovl; trace = o.trace; ovl.path.trace = (void *) &trace.front(); return *this; } }; // Holds information for all the a,b overlaps in a particular direction, // either forward or reverse. Overlaps for a particular a,b,strand // combination may come in as multiple overlaps. This class allows us to // handle them as a unit. class TargetHit { public: TargetHit(); TargetHit(Record& rec); // Returns true if given overlap belongs to this hit, otherwise false bool belongs(Overlap& ovl); // Adds the next overlap to this hit set (a,b,strand combination) void add(Record& rec); void computeOvlScore(bool proper=false); int abeg(); int aend(); int alen; int blen; // Tracks the overlap score, (pct id) x (a-read aln len) float ovlScore; // Tracks the coverage score sum(1/depth for aln len) float covScore; // information promoted from the overlap struct int aread; int bread; uint32 flags; // container holding records related to this hit std::vector records; }; std::ostream& operator<<(std::ostream& ostrm, TargetHit& hit); // Re-usable class container for a target and its overlaps class Target { public: Target(); Target(HITS_DB& db, int tspace, int small); ~Target(); // Initializes this target based on the given record, possibly scoring as // a 'proper' overlap (more stringent). void firstRecord(Record& rec, bool proper=false); // Adds the next overlap record to this target, possibly scoring as // a 'proper' overlap (more stringent). 
void addRecord(Record& rec, bool proper=false); // Sorts overlaps based on a two-phase scoring system void sortHits(bool sortCov); void getAlignments(std::vector &alns, unsigned int max, bool sortCov); // ID of the target int id; // Length of the target int length; std::vector hits; private: HITS_DB db_; char* abuffer_, *bbuffer_; Work_Data* work_; int tspace_, small_; std::vector coverage_; bool needsFree_; // track if we need to free memory on destruct }; /// /// Provides sets of alignments for a given target sequence from a daligner /// output file. /// class DazAlnProvider : public AlnProvider { public: /// Constructs a new alignment provider. Checks the format of the file and /// throws an exception if it's malformed. /// \param popts options passed to the program on the command line DazAlnProvider(const ProgramOpts& popts); /// Cleans up some stuff. ~DazAlnProvider(); /// Gets the set of alignments for the next target and puts them into the /// given vector. Note this function will clear the contents of the vector /// prior to adding the next set of alignments. /// In dazzler parlance, this will correct the A reads as targets. /// \param dest reference to a vector to hold the alignments. /// \return True if there are more targets, otherwise false. bool nextTarget(std::vector& dest); bool nextTarget(std::string& targSeq, std::vector& dest); private: // maintains the previous record Record prevRec_; Target* trg_; const ProgramOpts popts_; // Dazzler-related data HITS_DB db_; int64 novl_, covl_; int tbytes_; FILE* input_; char* targSeqBuf_; /// Obtain data from the las file for the next overlap record void nextRecord(Record& rec); }; /// Compares the hits based on (percent id) x (query alignment length) bool cmpHitOvlScore(const TargetHit& l, const TargetHit& r); /// Compares based on coverage score bool cmpHitCovScore(const TargetHit& l, const TargetHit& r); float invertedSum(float x, unsigned int y); /// Convert dazzler alignment into a dagcon alignment. 
Eventually, we /// should update the alignment graph to process the dazzler alignment /// directly, but this will be useful for debugging purposes. void decodeAlignment(Alignment* src, dagcon::Alignment& dest); pbdagcon-0.3+20161121+ds/src/cpp/AlnGraphBoost.hpp0000644000175000017500000001242513026414536017615 0ustar afifafif#pragma once /// Alignment graph representation and consensus caller. Based on the original /// Python implementation, pbdagcon. This class is modelled after its /// aligngraph.py component, which accumulates alignment information into a /// partial-order graph and then calls consensus. Used to error-correct pacbio /// on pacbio reads. /// /// Implemented using the boost graph library. // forward declaration //struct Alignment; // this allows me to forward-declare properties with graph descriptors as // members types typedef boost::adjacency_list graphTraits; /// Graph vertex property. An alignment node, which represents one base position /// in the alignment graph. struct AlnNode { char base; ///< DNA base: [ACTG] int coverage; ///< Number of reads align to this position, but not ///< necessarily match int weight; ///< Number of reads that align to this node *with the same base*, but not ///< necessarily represented in the target. bool backbone; ///< Is this node based on the reference bool deleted; ///< mark for removed as part of the merging process graphTraits::edge_descriptor bestInEdge; ///< Best scoring in edge graphTraits::edge_descriptor bestOutEdge; ///< Best scoring out edge AlnNode() { base = 'N'; coverage = 0; weight = 0; backbone = false; deleted = false; } }; /// Graph edge property. Represents an edge between alignment nodes. struct AlnEdge { int count; ///< Number of times this edge was confirmed by an alignment bool visited; ///< Tracks a visit during algorithm processing AlnEdge() { count = 0; visited = false; } }; // Boost-related typedefs // XXX: listS, listS? 
// NOTE(review): the template arguments of adjacency_list, graph_traits and
// property_map are missing in this copy of the file - restore them from the
// upstream source.
typedef boost::adjacency_list G;
typedef boost::graph_traits::vertex_descriptor VtxDesc;
typedef boost::graph_traits::vertex_iterator VtxIter;
typedef boost::graph_traits::edge_descriptor EdgeDesc;
typedef boost::graph_traits::edge_iterator EdgeIter;
typedef boost::graph_traits::in_edge_iterator InEdgeIter;
typedef boost::graph_traits::out_edge_iterator OutEdgeIter;
typedef boost::property_map::type IndexMap;

///
/// Simple consensus interface datastructure
///
struct CnsResult {
    int range[2];    ///< Range on the target
    std::string seq; ///< Consensus fragment
};

///
/// Core alignments into consensus algorithm, implemented using the boost graph
/// library. Takes a set of alignments to a reference and builds a higher
/// accuracy (~ 99.9) consensus sequence from it. Designed for use in the HGAP
/// pipeline as a long read error correction step.
///
class AlnGraphBoost {
public:
    /// Constructor. Initialize graph based on the given sequence. Graph is
    /// annotated with the bases from the backbone.
    /// \param backbone the reference sequence.
    AlnGraphBoost(const std::string& backbone);

    /// Constructor. Initialize graph to a given backbone length. Base
    /// information is filled in as alignments are added.
    /// \param blen length of the reference sequence.
    AlnGraphBoost(const size_t blen);

    /// Add alignment to the graph.
    /// \param aln an alignment record (see Alignment.hpp)
    void addAln(dagcon::Alignment& aln);

    /// Adds a new or increments an existing edge between two aligned bases.
    /// \param u the 'from' vertex descriptor
    /// \param v the 'to' vertex descriptor
    void addEdge(VtxDesc u, VtxDesc v);

    /// Collapses degenerate nodes (vertices). Must be called before
    /// consensus(). Calls mergeInNodes() followed by mergeOutNodes().
    void mergeNodes();

    /// Recursive merge of 'in' nodes.
    /// \param n the base node to merge around.
    void mergeInNodes(VtxDesc n);

    /// Non-recursive merge of 'out' nodes.
    /// \param n the base node to merge around.
void mergeOutNodes(VtxDesc n); /// Mark a given node for removal from graph. Doesn't not modify graph. /// \param n the node to remove. void markForReaper(VtxDesc n); /// Removes the set of nodes that have been marked. Modifies graph. /// Prohibitively expensive when using vecS as the vertex container. void reapNodes(); /// Generates the consensus from the graph. Must be called after /// mergeNodes(). Returns the longest contiguous consensus sequence where /// each base meets the minimum weight requirement. /// \param minWeight sets the minimum weight for each base in the consensus. /// default = 0 const std::string consensus(int minWeight=0); /// Generates all consensus sequences from a target that meet the minimum /// weight requirement. void consensus(std::vector& seqs, int minWeight=0, size_t minLength=500); /// Locates the optimal path through the graph. Called by consensus() const std::vector bestPath(); /// Emits the current graph, in dot format, to stdout. void printGraph(); /// Locate nodes that are missing either in or out edges. bool danglingNodes(); /// Destructor. 
virtual ~AlnGraphBoost(); private: G _g; VtxDesc _enterVtx; VtxDesc _exitVtx; std::map _bbMap; std::vector _reaperBag; }; pbdagcon-0.3+20161121+ds/src/cpp/DazAlnProvider.cpp0000644000175000017500000002673413026414536020001 0ustar afifafif#include #include #include #include #include #include #include #include #include #include "DazAlnProvider.hpp" #undef DEBUG using namespace PacBio::DagCon; // for IOException IOException::IOException(const char* desc) : desc_(desc) {} static char ToU[8] = { 'A', 'C', 'G', 'T', '.', '[', ']', '-' }; static int BORDER = 10; // Should write my own, but for reasons of expediency, this borrows heavily // from LAshow.c DazAlnProvider::DazAlnProvider(const ProgramOpts& popts) : popts_(popts), novl_(0), covl_(0) { // Initialize the sequence DB accessor // Open_DB takes a mutable char pointer, booo char* path = new char[popts_.seqFile.length()+1]; std::strcpy(path,popts_.seqFile.c_str()); path[popts_.seqFile.length()] = '\0'; int status = Open_DB(path, &db_); if (status < 0) throw IOException("Failed to open DB"); delete[] path; Trim_DB(&db_); // Initialize the alignment file reader input_ = fopen(popts_.alnFile.c_str(),"r"); if (input_ == NULL) { std::string msg = "Open failed: " + popts_.alnFile; throw IOException(msg.c_str()); } if (fread(&novl_,sizeof(int64),1,input_) != 1) throw IOException("Failed to read novl"); int tspace; if (fread(&tspace,sizeof(int),1,input_) != 1) throw IOException("Failed to read tspace"); int small; if (tspace <= TRACE_XOVR) { small = 1; tbytes_ = sizeof(uint8); } else { small = 0; tbytes_ = sizeof(uint16); } trg_ = new Target(db_, tspace, small); targSeqBuf_ = New_Read_Buffer(&db_); // Read in the first record nextRecord(prevRec_); } DazAlnProvider::~DazAlnProvider() { fclose(input_); Close_DB(&db_); free(targSeqBuf_-1); delete trg_; } bool DazAlnProvider::nextTarget(std::vector &dest) { dest.clear(); // constructor initializes the first prevRec_ struct trg_->firstRecord(prevRec_, popts_.properOvls); std::set 
tfilt = popts_.targets; unsigned int filter = tfilt.size(); bool skipTarget = false; while(covl_++ < novl_) { Record rec; nextRecord(rec); if (rec.ovl.aread != trg_->id || covl_ == novl_) { int tid = trg_->id + 1; if (filter == 0 || tfilt.find(tid) != tfilt.end()) { trg_->getAlignments(dest, popts_.maxHits, popts_.sortCov); if (dest.size() < popts_.minCov) { dest.clear(); skipTarget = true; } } else { skipTarget = true; } if (skipTarget && covl_ != novl_) { trg_->firstRecord(rec, popts_.properOvls); skipTarget = false; continue; } // This will initialize the target object on the next call prevRec_ = rec; break; } trg_->addRecord(rec, popts_.properOvls); } return covl_ != novl_; } bool DazAlnProvider::nextTarget(std::string& targSeq, std::vector& dest) { bool hasNext = nextTarget(dest); targSeq.resize(trg_->length); char* seq; seq = Load_Subread(&db_, trg_->id, 0, trg_->length, targSeqBuf_, 0); int i; for (i = 0; i < trg_->length; i++) targSeq[i] = ToU[(int)seq[i]]; return hasNext; } void DazAlnProvider::nextRecord(Record& rec) { Read_Overlap(input_,&rec.ovl); int tmax = ((int)1.2*rec.ovl.path.tlen) + 100; rec.trace.resize(tmax,0); rec.ovl.path.trace = (void *) &rec.trace.front(); Read_Trace(input_, &rec.ovl, tbytes_); } TargetHit::TargetHit() : ovlScore(0.0f), covScore(0.0f), aread(-1), bread(0), flags(0) {} TargetHit::TargetHit(Record& rec) : TargetHit() { aread = rec.ovl.aread; bread = rec.ovl.bread; flags = rec.ovl.flags; add(rec); } std::ostream& operator<<(std::ostream& ostrm, TargetHit& hit) { ostrm << "target: " << (hit.aread+1) << " query: " << (hit.bread+1); ostrm << " flags: " << hit.flags << " tstart: " << hit.abeg(); ostrm << " tend: " << hit.aend() << " gaps: "; ostrm << hit.records.size() << " ovlscore: " << hit.ovlScore; ostrm << " covScore: " << hit.covScore << std::endl; return ostrm; } bool TargetHit::belongs(Overlap& ovl) { return aread == ovl.aread && bread == ovl.bread && flags == ovl.flags; } void TargetHit::add(Record& rec) { if 
(records.size() == 0) { records.push_back(std::move(rec)); } else { Path prev = records.back().ovl.path; int prevLen = prev.aepos - prev.abpos; Path curr = rec.ovl.path; int currLen = curr.aepos - curr.abpos; if (curr.abpos > prev.aepos) { records.push_back(std::move(rec)); } else if (currLen > prevLen) { records.pop_back(); records.push_back(std::move(rec)); } } } void TargetHit::computeOvlScore(bool proper) { int ahlen = 0, bhlen = 0, diff = 0; // XXX: penalize for gaps between records? for (auto& rec: records) { Path p = rec.ovl.path; ahlen += p.aepos - p.abpos; bhlen += p.bepos - p.bbpos; diff += std::abs(ahlen - bhlen) + p.diffs; } ovlScore = (1 - diff/(float)ahlen) * ahlen; if (proper) { const Path& f = records.front().ovl.path; const Path& b = records.back().ovl.path; if (f.abpos != 0 && b.bbpos != 0) ovlScore = 0.0f; if (f.aepos != alen && b.bepos != blen) ovlScore = 0.0f; } } int TargetHit::abeg() { return records.front().ovl.path.abpos; } int TargetHit::aend() { return records.back().ovl.path.aepos; } // Simplify unit testing, don't burden with malloc'd // daligner structures. 
Target::Target(): needsFree_(false) { } Target::Target(HITS_DB& db, int tspace, int small) : db_(db), tspace_(tspace), small_(small), needsFree_(true) { work_ = New_Work_Data(); abuffer_ = New_Read_Buffer(&db_); bbuffer_ = New_Read_Buffer(&db_); } Target::~Target() { if (needsFree_) { free(abuffer_-1); free(bbuffer_-1); Free_Work_Data(work_); } } void Target::firstRecord(Record& rec, bool proper) { id = rec.ovl.aread; length = db_.reads[id].rlen; hits.clear(); coverage_.clear(); if (coverage_.size() < (unsigned int) length) coverage_.resize(length); auto beg = coverage_.begin(); std::for_each(beg, beg+length, [](unsigned int& x){x=0;}); TargetHit hit(rec); hit.alen = length; hit.blen = db_.reads[rec.ovl.bread].rlen; hit.computeOvlScore(proper); hits.push_back(std::move(hit)); } void Target::addRecord(Record& rec, bool proper) { // collapse into target hits, maintaining only the best hit from either // forward or reverse. if (hits.size() > 0) { TargetHit& prev = hits.back(); // Add overlap to previous hit, instead of storing a new one if (prev.belongs(rec.ovl)) { prev.add(rec); prev.computeOvlScore(proper); return; } } TargetHit hit(rec); hit.alen = length; hit.blen = db_.reads[rec.ovl.bread].rlen; hit.computeOvlScore(proper); hits.push_back(std::move(hit)); } void Target::sortHits(bool sortCov) { // sort descending based on ovl length x percent id std::sort(hits.begin(), hits.end(), cmpHitOvlScore); if (! sortCov) return; // Coverage based scoring for (auto& hit : hits) { for (auto const& rec : hit.records) { auto beg = coverage_.begin() + rec.ovl.path.abpos; auto end = coverage_.begin() + rec.ovl.path.aepos; std::for_each(beg, end, [](unsigned int& x){++x;}); hit.covScore = std::accumulate(beg, end, 0.0, invertedSum); } } std::sort(hits.begin(), hits.end(), cmpHitCovScore); } void Target::getAlignments(std::vector &alns, unsigned int max, bool sortCov) { sortHits(sortCov); auto hbeg = hits.begin(); auto hend = hits.size() > max ? 
/// Accumulation step that adds the reciprocal of a count to a running
/// total. Used with std::accumulate over per-base coverage counts.
///
/// \param x running total so far.
/// \param y count whose reciprocal is added.
/// \return x + 1/y, computed in float.
float invertedSum(float x, unsigned int y) {
    const float reciprocal = 1 / static_cast<float>(y);
    return x + reciprocal;
}
XXX: slow. pre-allocate string length or work directly with // encoded strings. for (c = 0; c < tlen; c++) { if ((p = trace[c]) < 0) { p = -p; while (i != p) { dest.tstr += ToU[(int)a[i++]]; dest.qstr += ToU[(int)b[j++]]; } dest.tstr += ToU[7]; dest.qstr += ToU[(int)b[j++]]; } else { while (j != p) { dest.tstr += ToU[(int)a[i++]]; dest.qstr += ToU[(int)b[j++]]; } dest.tstr += ToU[(int)a[i++]]; dest.qstr += ToU[7]; } } p = src->path->aepos; while(i <= p) { dest.tstr += ToU[(int)a[i++]]; dest.qstr += ToU[(int)b[j++]]; } } pbdagcon-0.3+20161121+ds/src/cpp/main.cpp0000644000175000017500000002046313026414536016032 0ustar afifafif#include #include #include #include #include #include #include #include #include #include #include #include #include #define ELPP_CUSTOM_COUT std::cerr #define ELPP_THREAD_SAFE 1 #include "easylogging++.h" #include "tclap/CmdLine.h" #include "Alignment.hpp" #include "AlnGraphBoost.hpp" #include "BlasrM5AlnProvider.hpp" #include "BoundedBuffer.hpp" #include "tuples/TupleMetrics.hpp" #include "SimpleAligner.hpp" #include "ProgramOpts.hpp" INITIALIZE_NULL_EASYLOGGINGPP ProgramOpts popts; bool AlignFirst = false; typedef std::vector AlnVec; typedef BoundedBuffer AlnBuf; typedef BoundedBuffer CnsBuf; class Reader { AlnBuf* alnBuf_; const std::string fpath_; size_t minCov_; int nCnsThreads_; public: Reader(AlnBuf* b, const std::string fpath, size_t minCov) : alnBuf_(b), fpath_(fpath), minCov_(minCov) { } void setNumCnsThreads(int n) { nCnsThreads_ = n; } void operator()() { el::Logger* logger = el::Loggers::getLogger("Reader"); try { AlnProvider* ap; if (fpath_ == "-") { ap = new BlasrM5AlnProvider(&std::cin); } else { ap = new BlasrM5AlnProvider(fpath_); } AlnVec alns; bool hasNext = true; while (hasNext) { hasNext = ap->nextTarget(alns); size_t cov = alns.size(); if (cov == 0) continue; if (cov < minCov_) { logger->debug("Coverage requirement not met for %v, coverage: %v", alns[0].id, alns.size()); continue; } boost::format msg("Consensus candidate: 
%s"); msg % alns[0].id; logger->debug(msg.str()); alnBuf_->push(alns); } } catch (M5Exception::FileOpenError) { logger->error("Error opening file: %s", fpath_); } catch (M5Exception::FormatError err) { logger->error("Format error. Input: %s, Error: %s", fpath_, err.msg); } catch (M5Exception::SortError err) { logger->error("Input file is not sorted by either target or query."); } // write out sentinals, one per consensus thread AlnVec sentinel; for (int i=0; i < nCnsThreads_; i++) alnBuf_->push(sentinel); } }; class Consensus { AlnBuf* alnBuf_; CnsBuf* cnsBuf_; size_t minLen_; int minWeight_; SimpleAligner aligner; public: Consensus(AlnBuf* ab, CnsBuf* cb, size_t minLen, int minWeight) : alnBuf_(ab), cnsBuf_(cb), minLen_(minLen), minWeight_(minWeight) { } void operator()() { el::Logger* logger = el::Loggers::getLogger("Consensus"); AlnVec alns; alnBuf_->pop(&alns); std::vector seqs; while (alns.size() > 0) { if (alns.size() < popts.minCov) { alnBuf_->pop(&alns); continue; } boost::format msg("Consensus calling: %s Alignments: %d"); msg % alns[0].id; msg % alns.size(); logger->info(msg.str()); if (AlignFirst) for_each(alns.begin(), alns.end(), aligner); AlnGraphBoost ag(alns[0].tlen); for (auto it = alns.begin(); it != alns.end(); ++it) { if (it->qstr.length() < minLen_) continue; dagcon::Alignment aln = normalizeGaps(*it); trimAln(aln, popts.trim); ag.addAln(aln); } ag.mergeNodes(); ag.consensus(seqs, minWeight_, minLen_); for (auto it = seqs.begin(); it != seqs.end(); ++it) { CnsResult result = *it; boost::format fasta(">%s/%d_%d\n%s\n"); fasta % alns[0].id % result.range[0] % result.range[1]; fasta % result.seq; cnsBuf_->push(fasta.str()); } alnBuf_->pop(&alns); } // write out a sentinal cnsBuf_->push(""); } }; class Writer { CnsBuf* cnsBuf_; int nCnsThreads_; public: Writer(CnsBuf* cb) : cnsBuf_(cb) {} void setNumCnsThreads(int n) { nCnsThreads_ = n; } void operator()() { std::string cns; cnsBuf_->pop(&cns); int sentinelCount = 0; while (true) { std::cout << 
/// Parses the command line with TCLAP and stores the results in the global
/// popts. Prints the offending argument and the error to stderr and exits
/// with status 1 if TCLAP reports an argument error.
///
/// \param argc argument count as passed to main.
/// \param argv argument vector as passed to main.
void parseArgs(int argc, char **argv) {

    try {
        TCLAP::CmdLine cmd("PBI consensus module", ' ', "0.3");
        // NOTE(review): the ValueArg/UnlabeledValueArg declarations below
        // carry no template argument in this copy of the file - presumably
        // lost in extraction; confirm against the upstream source.
        TCLAP::ValueArg threadArg(
            "j","threads",                 // short, long name
            "Number of consensus threads", // description
             false, 4,                     // required, default
             "int", cmd);

        TCLAP::ValueArg minCovArg(
            "c","min-coverage",
            "Minimum coverage for correction",
            false, 6, "uint", cmd);

        TCLAP::ValueArg minLenArg(
            "m","min-length",
            "Minimum length for correction",
            false, 500, "uint", cmd);

        // NOTE(review): the help text below says "either size"; "either
        // side" is presumably what was meant.
        TCLAP::ValueArg trimArg(
            "t","trim",
            "Trim alignments on either size",
            false, 50, "uint", cmd);

        TCLAP::SwitchArg alignArg("a","align",
            "Align sequences before adding to consensus", cmd, false);

        // Accepted on the command line, but its value is not copied into
        // popts below.
        TCLAP::SwitchArg verboseArg("v","verbose",
            "Turns on verbose logging", cmd, false);

        TCLAP::UnlabeledValueArg inputArg(
            "input", "Input data",
            true, "-","either file path or stdin", cmd);

        cmd.parse(argc, argv);

        // Copy the parsed values into the global options object
        popts.minCov  = minCovArg.getValue();
        popts.minLen  = minLenArg.getValue();
        popts.trim    = trimArg.getValue();
        popts.threads = threadArg.getValue();
        popts.align   = alignArg.getValue();
        popts.input   = inputArg.getValue();
    } catch (TCLAP::ArgException& e) {
        std::cerr << "Error " << e.argId() << ": " << e.error() << std::endl;
        exit(1);
    }
}
///
/// Exceptions thrown by derived classes
///
namespace PacBio {
namespace DagCon {
/// I/O failure raised by alignment providers.
///
/// Holds its own copy of the description, so what() remains valid even
/// when the caller built the message in a local string that is destroyed
/// while the exception propagates.
class IOException : public std::exception {
public:
    /// \param desc NUL-terminated description of the failure; must not be
    ///        null.
    IOException(const char* desc);

    /// \return the description given at construction.
    virtual const char* what() const throw() {
        return desc_.c_str();
    }
private:
    std::string desc_;
};

/// Raised when an allocation fails; what() is a fixed message.
class MemoryException : public std::exception {
public:
    virtual const char* what() const throw() {
        return "Failed to allocate memory";
    }
};
}}
///
/// Templated, thread-safe FIFO buffer backed by a std::deque and bounded by
/// a capacity specified by the caller. When the buffer is full, push waits
/// for an open spot. When the buffer is empty, pop waits for an item to be
/// present. Condition variables are used to signal the state of the buffer.
///
template <class T>
class BoundedBuffer {
public:
    typedef std::deque<T> buffer_type;

    /// \param max maximum number of items held at once.
    BoundedBuffer(size_t max) : max_(max) { }

    /// Appends an item, blocking while the buffer is at capacity.
    void push(T item) {
        std::unique_lock<std::mutex> lock(mutex_);
        not_full_.wait(lock, [this](){return buffer_.size() != max_;});
        buffer_.push_front(item);
        not_empty_.notify_one();
    }

    /// Removes the oldest item into *pItem, blocking while the buffer is
    /// empty.
    /// \param pItem destination for the removed item; must not be null.
    void pop(T* pItem) {
        std::unique_lock<std::mutex> lock(mutex_);
        not_empty_.wait(lock, [this](){return buffer_.size() != 0U;});
        *pItem = buffer_.back();
        buffer_.pop_back();
        not_full_.notify_one();
    }

    /// \return the number of items currently buffered. Taken under the
    ///         mutex so it does not race with a concurrent push or pop; the
    ///         value may be stale by the time the caller uses it.
    unsigned int size() {
        std::unique_lock<std::mutex> lock(mutex_);
        return static_cast<unsigned int>(buffer_.size());
    }
private:
    size_t const max_;
    buffer_type buffer_;
    std::mutex mutex_;
    std::condition_variable not_empty_;
    std::condition_variable not_full_;
};
${BOOST_INCLUDE} \ ${THISDIR}/third-party LIBDIRS := \ ${LIBBLASR_LIB} \ ${LIBPBDATA_LIB} \ ${LIBPBIHDF_LIB} \ ${PBBAM_LIB} \ ${HDF5_LIB} \ ${HTSLIB_LIB} \ ${GCC_LIB} \ ${ZLIB_LIB} LDLIBS+= \ ${LIBBLASR_LIBFLAGS} \ ${LIBPBDATA_LIBFLAGS} \ ${LIBPBIHDF_LIBFLAGS} \ ${PBBAM_LIBFLAGS} \ ${HDF5_LIBFLAGS} \ ${HTSLIB_LIBFLAGS} \ ${ZLIB_LIBFLAGS} \ ${PTHREAD_LIBFLAGS} \ ${DL_LIBFLAGS} CPPFLAGS+=$(patsubst %,-I%,${INCDIRS}) $(patsubst %,-isystem%,${SYS_INCDIRS}) LDFLAGS+=$(patsubst %,-L %,${LIBDIRS}) LDFLAGS += ${EXTRA_LDFLAGS} vpath align.c ${DALIGNER_SRC} vpath DB.c ${DAZZ_DB_SRC} vpath QV.c ${DAZZ_DB_SRC} all: pbdagcon dazcon # Technically does not need pbdata or blasr, but so what? dazcon: $(COMMON_OBJECTS) $(DAZCON_OBJECTS) $(CXX) -o $@ $^ $(LDFLAGS) $(LDLIBS) pbdagcon: $(COMMON_OBJECTS) $(PBDAGCON_OBJECTS) $(CXX) -o $@ $^ $(LDFLAGS) $(LDLIBS) $(COMMON_OBJECTS) $(PBDAGCON_OBJECTS): clean: $(RM) *.d $(RM) *.o $(RM) pbdagcon $(RM) dazcon .PHONY: all clean SRCS:= $(notdir $(wildcard ${THISDIR}/*.c)) CPP_SRCS:=$(notdir $(wildcard ${THISDIR}/*.cpp)) DEPS:=$(patsubst %.c,%.d,${SRCS}) CPP_DEPS:=$(patsubst %.cpp,%.d,${CPP_SRCS}) -include ${DEPS} ${CPP_DEPS} pbdagcon-0.3+20161121+ds/src/cpp/SimpleAligner.cpp0000644000175000017500000000373513026414536017644 0ustar afifafif#include #include #include #include #include #include #include "Alignment.hpp" #include "SimpleAligner.hpp" SimpleAligner::SimpleAligner() { config_.indelRate = 0.3; config_.indel = 5; config_.match = 0; config_.sdpIndel = 5; config_.sdpIns = 5; config_.sdpDel = 10; config_.kmer = 11; config_.bandSize = 10; tupleMetrics_.Initialize(config_.kmer); distScoreFn_.del = config_.indel; distScoreFn_.ins = 4; distScoreFn_.InitializeScoreMatrix(SMRTDistanceMatrix); } void SimpleAligner::align(dagcon::Alignment& aln) { // This alignment type defined in blasr code base blasr::Alignment initialAln, refinedAln; FASTQSequence query; query.seq = (Nucleotide*)aln.qstr.c_str(); query.length = aln.qstr.length(); DNASequence 
target; target.seq = (Nucleotide*)aln.tstr.c_str(); target.length = aln.tstr.length(); SDPAlign(query, target, distScoreFn_, tupleMetrics_.tupleSize, config_.sdpIndel, config_.sdpIndel, config_.indelRate*2, initialAln, Local); GuidedAlign(query, target, initialAln, distScoreFn_, config_.bandSize, refinedAln); std::string queryStr, alignStr, targetStr; //StickPrintAlignment(initialAln, query, target, std::cout); //StickPrintAlignment(refinedAln, query, target, std::cout); CreateAlignmentStrings(refinedAln, query.seq, target.seq, targetStr, alignStr, queryStr, query.length, target.length); // alignment coordinates may change, update alignment object aln.start += refinedAln.GenomicTBegin(); aln.end = aln.start + refinedAln.GenomicTEnd(); if (aln.strand == '-') { aln.start = aln.tlen - aln.end; aln.qstr = revComp(queryStr); aln.tstr = revComp(targetStr); } else { aln.qstr = queryStr; aln.tstr = targetStr; } aln.start++; } void SimpleAligner::operator() (dagcon::Alignment& aln) { align(aln); } pbdagcon-0.3+20161121+ds/src/cpp/Alignment.hpp0000644000175000017500000000405313026414536017026 0ustar afifafif#pragma once #include /// /// Super-simple alignment representation. Represents an alignment between two /// PacBio reads, one of which we're trying to correct. The read to correct /// may be either the target or the query, depending on how the alignment was /// done. 
/// namespace dagcon { class Alignment { public: typedef void (*ParseFunc)(std::istream&, Alignment* aln); // May correct the target or the query, default is target static bool groupByTarget; // length of the sequence we are trying to correct uint32_t tlen; // conforming offsets are 1-based uint32_t start; uint32_t end; // ID of the read we're trying to correct (target) std::string id; // ID of the supporting read (query) std::string sid; char strand; // query and target strings must be equal length std::string qstr; std::string tstr; Alignment(); static ParseFunc parse; }; } std::istream& operator>>(std::istream& instrm, dagcon::Alignment& data); std::ostream& operator<<(std::ostream& ostrm, dagcon::Alignment& data); void parseM5(std::istream& stream, dagcon::Alignment* aln); void parsePre(std::istream& stream, dagcon::Alignment* aln); /// Simplifies the alignment by normalizing gaps. Converts mismatches into /// indels ... /// query: CAC query: C-AC /// | | ---> | | /// target: CGC target: CG-C /// /// Shifts equivalent gaps to the right in the reference ... /// query: CAACAT query: CAACAT /// | | || ---> ||| | /// target: C-A-AT target: CAA--T /// /// Shifts equivalent gaps to the right in the read ... /// query: -C--CGT query: CCG--T /// | | | ---> ||| | /// target: CCGAC-T target: CCGACT /// Allow optional gap pushing, some aligners may not need it and I'd like /// to get rid of it anyway. 
dagcon::Alignment normalizeGaps(dagcon::Alignment& aln, bool push=true); void trimAln(dagcon::Alignment& aln, int trimLen=50); std::string revComp(std::string& seq); pbdagcon-0.3+20161121+ds/src/cpp/SimpleAligner.hpp0000644000175000017500000000145513026414536017646 0ustar afifafif#pragma once #include "Types.h" #include "Enumerations.h" #include "DNASequence.hpp" #include "datastructures/alignment/Alignment.hpp" #include "algorithms/alignment/AlignmentUtils.hpp" #include "algorithms/alignment/SDPAlign.hpp" #include "algorithms/alignment/GuidedAlign.hpp" #include "format/StickAlignmentPrinter.hpp" #include "FASTQSequence.hpp" namespace Aligner { struct Config { float indelRate; int indel; int match; int sdpIndel; int sdpIns; int sdpDel; int kmer; int bandSize; }; } class SimpleAligner { public: SimpleAligner(); void align(dagcon::Alignment& aln); void operator() (dagcon::Alignment& aln); private: Aligner::Config config_; TupleMetrics tupleMetrics_; DistanceMatrixScoreFunction distScoreFn_; }; pbdagcon-0.3+20161121+ds/src/cpp/BlasrM5AlnProvider.cpp0000644000175000017500000000436313026414536020522 0ustar afifafif#include #include #include #include #include #include #include "Alignment.hpp" #include "BlasrM5AlnProvider.hpp" BlasrM5AlnProvider::BlasrM5AlnProvider(const std::string& fpath) : fpath_(fpath), currId_(""), firstAln_(true), fs_() { //checkFormat(); fs_.open(fpath_); is_ = &fs_; } BlasrM5AlnProvider::BlasrM5AlnProvider(std::istream* stream) : fpath_(""), currId_(""), firstAln_(true), fs_(), is_(stream) { } BlasrM5AlnProvider::~BlasrM5AlnProvider() { delete is_; } bool BlasrM5AlnProvider::nextTarget(std::vector& dest) { // first clear any previous alignments dest.clear(); // process up to EOF or next target // need to maintain state in between calls if (! 
///
/// Reverse complements a sequence.
///
/// Upper-case A/C/G/T are swapped with their complements; every other
/// character (gaps, N, lower case, ...) is passed through unchanged.
///
/// Side effect: seq itself is complemented in place (but not reversed);
/// only the returned string is both complemented and reversed.
///
std::string revComp(std::string& seq) {
    // Pass 1: complement the caller's string in place.
    for (std::string::size_type i = 0; i < seq.size(); ++i) {
        switch (seq[i]) {
            case 'A': seq[i] = 'T'; break;
            case 'C': seq[i] = 'G'; break;
            case 'G': seq[i] = 'C'; break;
            case 'T': seq[i] = 'A'; break;
            default: break;
        }
    }

    // Pass 2: build the reversed copy that is handed back.
    std::string reversed;
    reversed.reserve(seq.size());
    for (std::string::size_type i = seq.size(); i > 0; --i)
        reversed.push_back(seq[i - 1]);
    return reversed;
}
bool Alignment::groupByTarget = true; Alignment::Alignment() : tlen(0), start(0), end(0), id(""), sid(""), strand('+'), qstr(""), tstr("") { } // Parses blasr m5 output grouped either by target or query. void parseM5(std::istream& stream, Alignment* aln) { std::string line; std::getline(stream, line); std::stringstream row(line); std::string col; std::vector fields; while(std::getline(row, col, ' ')) { if (col == "") continue; fields.push_back(col); } // avoids *some* empty lines if (fields.size() == 0) return; // base query id (without the last '/'), allows us to // group properly by query when asked. std::string baseQid = fields[0].substr(0,fields[0].find_last_of("/")); aln->sid = fields[0]; aln->id = Alignment::groupByTarget ? fields[5] : baseQid; std::istringstream ssLen(Alignment::groupByTarget ? fields[6] : fields[1]); ssLen >> aln->tlen; std::istringstream ssStart(Alignment::groupByTarget ? fields[7] : fields[2]); ssStart >> aln->start; aln->start++; // the target is always reversed. aln->strand = fields[9][0]; if (aln->strand == '-' && Alignment::groupByTarget) { // only need to reverse complement when correcting targets aln->qstr = revComp(fields[16]); aln->tstr = revComp(fields[18]); } else { aln->qstr = Alignment::groupByTarget ? fields[16] : fields[18]; aln->tstr = Alignment::groupByTarget ? 
fields[18] : fields[16]; } } void parsePre(std::istream& stream, Alignment* aln) { std::string line; std::getline(stream, line); std::stringstream row(line); std::string col; std::vector fields; while(std::getline(row, col, ' ')) { if (col == "") continue; fields.push_back(col); } // avoids *some* empty lines if (fields.size() == 0) return; // qid, tid, strand, tlen, tstart, tend, qstr, tstr aln->sid = fields[0]; aln->id = fields[1]; aln->strand = fields[2][0]; std::istringstream ssLen(fields[3]); ssLen >> aln->tlen; std::istringstream ssStart(fields[4]); ssStart >> aln->start; std::istringstream ssEnd(fields[5]); ssEnd >> aln->end; aln->qstr = fields[6]; aln->tstr = fields[7]; } // default to parsing m5 Alignment::ParseFunc Alignment::parse = parseM5; std::istream& operator>>(std::istream& instrm, Alignment& data) { Alignment::parse(instrm, &data); return instrm; } std::ostream& operator<<(std::ostream& ostrm, Alignment& data) { ostrm << "target: " << data.id << ", query: " << data.sid; ostrm << ", start: " << data.start << ", end: " << data.end; ostrm << ", length: " << data.tlen << std::endl; ostrm << "tstr(50): " << data.tstr.substr(0,50) << std::endl; ostrm << "qstr(50): " << data.qstr.substr(0,50) << std::endl; return ostrm; } Alignment normalizeGaps(Alignment& aln, bool push) { // XXX: optimize this assert(aln.qstr.length() == aln.tstr.length()); size_t len = aln.qstr.length(); std::string qNorm, tNorm; qNorm.reserve(len+100); tNorm.reserve(len+100); std::string qstr = aln.qstr; std::string tstr = aln.tstr; // convert dots to dashes for (size_t i=0; i < len; i++) { if ('.' == qstr[i]) qstr[i] = '-'; if ('.' 
== tstr[i]) tstr[i] = '-'; } // convert mismatches to indels for (size_t i=0; i < len; i++) { char qb = qstr[i], tb = tstr[i]; if (qb != tb && qb != '-' && tb != '-') { qNorm += '-'; qNorm += qb; tNorm += tb; tNorm += '-'; } else { qNorm += qb; tNorm += tb; } } // update length assert(qNorm.length() == tNorm.length()); len = qNorm.length(); if (push) { // push gaps to the right, but not past the end for (size_t i=0; i < len-1; i++) { // pushing target gaps if (tNorm[i] == '-') { size_t j = i; while (++j < len) { char c = tNorm[j]; if (c != '-') { if (c == qNorm[i]) { tNorm[i] = c; tNorm[j] = '-'; } break; } } } // pushing query gaps if (qNorm[i] == '-') { size_t j = i; while (++j < len) { char c = qNorm[j]; if (c != '-') { if (c == tNorm[i]) { qNorm[i] = c; qNorm[j] = '-'; } break; } } } } } assert(qNorm.length() == tNorm.length()); assert(len == tNorm.length()); // generate the final, normalized alignment strings Alignment finalNorm; finalNorm.id = aln.id; finalNorm.sid = aln.sid; finalNorm.start = aln.start; finalNorm.tlen = aln.tlen; finalNorm.strand = aln.strand; for (size_t i=0; i < len; i++) { if (qNorm[i] != '-' || tNorm[i] != '-') { finalNorm.qstr += qNorm[i]; finalNorm.tstr += tNorm[i]; } } return finalNorm; } void trimAln(Alignment& aln, int trimLen) { int lbases, rbases; size_t loffs, roffs; auto const len = aln.tstr.length(); lbases = 0; loffs = 0U; while(lbases < trimLen && loffs < len) { if (aln.tstr[loffs++] != '-') { lbases++; } } rbases = 0; roffs = len; while (rbases < trimLen && roffs > loffs) { if (aln.tstr[--roffs] != '-') { rbases++; } } aln.start += lbases; aln.qstr = aln.qstr.substr(loffs, roffs - loffs); aln.tstr = aln.tstr.substr(loffs, roffs - loffs); } pbdagcon-0.3+20161121+ds/src/cpp/dazcon.cpp0000644000175000017500000001601213026414536016357 0ustar afifafif#include #include #include #include #include #include #include #include #include #include #include #include #define ELPP_CUSTOM_COUT std::cerr #define ELPP_THREAD_SAFE 1 #include 
"easylogging++.h" #include "tclap/CmdLine.h" #include "ProgramOpts.hpp" #include "Alignment.hpp" #include "AlnGraphBoost.hpp" #include "DazAlnProvider.hpp" #include "BoundedBuffer.hpp" INITIALIZE_NULL_EASYLOGGINGPP ProgramOpts popts; typedef std::vector AlnVec; struct TargetData { std::string targSeq; AlnVec alns; }; typedef BoundedBuffer TrgBuf; typedef BoundedBuffer CnsBuf; void Reader(TrgBuf& trgBuf, AlnProvider* ap) { try { TargetData td; bool hasNext = true; do { hasNext = ap->nextTarget(td.targSeq, td.alns); //for (auto& aln : td.alns) // std::cerr << aln; if (! td.alns.empty()) trgBuf.push(td); } while (hasNext); } catch (PacBio::DagCon::IOException& e) { std::cerr << e.what(); exit(1); } // write out sentinals, one per consensus thread TargetData sentinel; for (int i=0; i < popts.threads; i++) trgBuf.push(sentinel); } void Consensus(int id, TrgBuf& trgBuf, CnsBuf& cnsBuf) { int fake_well_counter; // just to avoid too many reads in the same bin TargetData td; trgBuf.pop(&td); std::vector seqs; el::Loggers::getLogger("Consensus"); while (td.alns.size() > 0) { if (td.alns.size() < popts.minCov) { trgBuf.pop(&td); continue; } boost::format msg("(%d) calling: %s Alignments: %d"); CLOG(INFO, "Consensus") << msg % id % td.alns[0].id % td.alns.size(); AlnGraphBoost ag(td.targSeq); AlnVec alns = td.alns; for (auto it = alns.begin(); it != alns.end(); ++it) { if (it->qstr.length() < popts.minLen) continue; dagcon::Alignment aln = normalizeGaps(*it); // XXX: Shouldn't be needed for dazcon, but causes some infinite // loops in the current consensus code. 
trimAln(aln, popts.trim); ag.addAln(aln); } CVLOG(3, "Consensus") << "Merging nodes"; ag.mergeNodes(); CVLOG(3, "Consensus") << "Generating consensus"; ag.consensus(seqs, popts.minCov, popts.minLen); for (auto it = seqs.begin(); it != seqs.end(); ++it) { CnsResult result = *it; boost::format fasta(">%s/%d/%d_%d\n%s\n"); fasta % alns[0].id % fake_well_counter % result.range[0] % result.range[1] % result.seq; cnsBuf.push(fasta.str()); ++fake_well_counter; } trgBuf.pop(&td); } boost::format msg("(%d) ending ..."); CLOG(INFO, "Consensus") << msg % id; // write out a sentinal cnsBuf.push(""); } void Writer(CnsBuf& cnsBuf) { std::string cns; cnsBuf.pop(&cns); int sentinelCount = 0; while (true) { std::cout << cns; if (cns == "" && ++sentinelCount == popts.threads) break; cnsBuf.pop(&cns); } } void parseArgs(int argc, char **argv) { try { TCLAP::CmdLine cmd("PBI consensus module", ' ', "0.3"); TCLAP::ValueArg threadArg( "j","threads", // short, long name "Number of consensus threads", // description false, 4, // required, default "int", cmd); TCLAP::ValueArg minCovArg( "c","min-coverage", "Minimum coverage for correction", false, 6, "uint", cmd); TCLAP::ValueArg minLenArg( "l","min-len", "Minimum length for correction", false, 500, "uint", cmd); TCLAP::ValueArg trimArg( "t","trim", "Trim alignments on either size", false, 10, "uint", cmd); TCLAP::ValueArg alnFileArg( "a","align-file", "Path to the alignments file", true,"","string", cmd); TCLAP::ValueArg seqFileArg( "s","seq-file", "Path to the sequences file", true,"","string", cmd); TCLAP::ValueArg maxHitArg( "m","max-hit", "Maximum number of hits to pass to consensus", false,85,"uint", cmd); TCLAP::SwitchArg sortCovArg("x","coverage-sort", "Sort hits by coverage", cmd, false); TCLAP::SwitchArg properOvlArg("o","only-proper-overlaps", "Use only 'proper overlaps', i.e., align to the ends", cmd, false); TCLAP::SwitchArg verboseArg("v","verbose", "Turns on verbose logging", cmd, false); TCLAP::UnlabeledMultiArg targetArgs( 
"targets", "Limit consensus to list of target ids", false, "list of ints", cmd); cmd.parse(argc, argv); popts.minCov = minCovArg.getValue(); popts.minLen = minLenArg.getValue(); popts.trim = trimArg.getValue(); popts.alnFile = alnFileArg.getValue(); popts.seqFile = seqFileArg.getValue(); popts.threads = threadArg.getValue(); popts.maxHits = maxHitArg.getValue(); popts.sortCov = sortCovArg.getValue(); popts.properOvls = properOvlArg.getValue(); std::vector tgs = targetArgs.getValue(); popts.targets.insert(tgs.begin(), tgs.end()); } catch (TCLAP::ArgException& e) { std::cerr << "Error " << e.argId() << ": " << e.error() << std::endl; exit(1); } } int main(int argc, char* argv[]) { parseArgs(argc, argv); #if ELPP_ASYNC_LOGGING el::base::elStorage.reset( new el::base::Storage(el::LogBuilderPtr(new el::base::DefaultLogBuilder()), new el::base::AsyncDispatchWorker()) ); #else el::base::elStorage.reset( new el::base::Storage(el::LogBuilderPtr(new el::base::DefaultLogBuilder())) ); #endif // ELPP_ASYNC_LOGGING START_EASYLOGGINGPP(argc, argv); LOG(INFO) << "Initializing alignment provider"; DazAlnProvider* ap; ap = new DazAlnProvider(popts); TrgBuf trgBuf(20); CnsBuf cnsBuf(10); std::thread writerThread(Writer, std::ref(cnsBuf)); std::vector cnsThreads; for (int i=0; i < popts.threads; i++) { std::thread ct(Consensus, i, std::ref(trgBuf), std::ref(cnsBuf)); cnsThreads.push_back(std::move(ct)); } std::thread readerThread(Reader, std::ref(trgBuf), ap); writerThread.join(); std::vector::iterator it; for (it = cnsThreads.begin(); it != cnsThreads.end(); ++it) it->join(); readerThread.join(); delete ap; return 0; } pbdagcon-0.3+20161121+ds/src/cpp/pbdagcon_wf.sh0000755000175000017500000000217113026414536017206 0ustar afifafif#!/bin/bash # Simple pbdagcon workflow script. Written for the benefit of running via # smrtpipe so I can communicate pipe errors to the task. 
# We're overcoming
# the limitation of smrtpipe forcing tasks to run serially, enabling a new
# level of pipelining that's extremely efficient in an imperfect world ...
# However, direct file I/O is faster by default.
#
# Inputs are taken from the environment: mym4, allm4 and subreads are passed
# to m4topre.py; tmp, bestn, cov, nproc, fasta and fastq fall back to the
# defaults written below.  Expansions are quoted so that paths containing
# spaces or glob characters reach the tools as single arguments.

tmp=${tmp-"/tmp"}

# single quotes: the path is expanded (and quoted) when the trap runs
trap 'rm -f "$tmp/aln.$$.pre"' EXIT SIGINT

echo "Generating pre-alignments"
echo "m4topre.py $mym4 $allm4 $subreads ${bestn-24} > $tmp/aln.$$.pre"

# generate pre-alignments to a tmp directory
m4topre.py "$mym4" "$allm4" "$subreads" "${bestn-24}" > "$tmp/aln.$$.pre" || exit $?

echo "Correcting reads"
# pipe it to consensus and generate fasta,
# then generate a fastq (every base gets quality character "9")
pbdagcon -c "${cov-8}" -a -j "${nproc-15}" "$tmp/aln.$$.pre" | tee "${fasta-corrected.fa}" | \
awk '{if($0~/>/){sub(/>/,"@",$0);print;}else{l=length($0);q="";while(l--){q=q "9"}printf("%s\n+\n%s\n",$0,q)}}' > "${fastq-corrected.fq}"

# check the status of each piped command and exit non-zero if found
# (PIPESTATUS must be read by the very next command after the pipeline)
for exitval in "${PIPESTATUS[@]}"
do
    if [ "$exitval" -gt 0 ]
    then
        exit "$exitval"
    fi
done

exit 0;
pbdagcon-0.3+20161121+ds/src/m4topre.py0000755000175000017500000001347413026414536015573 0ustar afifafif#!/usr/bin/env python
"""Super-simple converter from blasr m4 alignments to pbdagcon 'pre'
alignments. For use in the pre-assembler dagcon workflow.
"""

import sys
import heapq
import string # pylint: disable=W0402
from itertools import ifilter
from collections import namedtuple, defaultdict
import numpy as np
from pbcore.io.FastaIO import FastaReader

# qname tname score pctsimilarity qstrand qstart qend qseqlength tstrand tstart
# ...
tend tseqlength mapqv # # store only fields we need __m4fields__ = [0, 1, 2, 5, 6, 8, 9, 10, 11] M4RECORD = namedtuple( 'M4RECORD', 'qname tname score qstart qend tstrand tstart tend tseqlength') __tuplfy__ = M4RECORD._make # pylint: disable=W0212 # dna compliment __rc__ = string.maketrans('actgACTG', 'tgacTGAC') def parse_m4(rec): """Parse in the m4 file, returning a list of records""" return [y for (x, y) in enumerate(rec.split()) if x in __m4fields__] def rating(rec): """Rates the alignment for by length and score (revisit at some point)""" score = -int(rec.score) alen = int(rec.tend) - int(rec.tstart) return score + alen def schwartzian(rec): """Provides a schwartzian transform for the given record, used for sorting """ flds = rec.split() return (flds[1], float(flds[2]), rec) def sort_targ_score(recs): """Sorts the list in place by target id (string), then score (float)""" recs[:] = [schwartzian(x) for x in recs] recs.sort() recs[:] = [rec for (target, score, rec) in recs] # pylint: disable=W0612 def rescore(recs): """Rescore alignments using coverage based statistics""" prev = "" cov = np.zeros(1) for idx, rec in enumerate(recs): fields = rec.split() rec = __tuplfy__(fields) if rec.tname != prev: prev = rec.tname cov = np.zeros(int(rec.tseqlength), dtype=np.float16) if rec.tstrand: start = int(rec.tseqlength) - int(rec.tend) end = int(rec.tseqlength) - int(rec.tstart) else: start = int(rec.tstart) end = int(rec.tend) cov[start:end] += 1 score = np.sum(1/cov[start:end]) fields[2] = str(-score) recs[idx] = " ".join(fields) def bestn_true(recstr, myq): """Checks if the record falls inside bestn (used when blasr is chunked)""" rec = __tuplfy__(recstr.split()) rate = rating(rec) return rate in myq[rec.qname[32:]].top class AlnLimiter(object): # pylint: disable=R0903 """Functor that returns alignments until some count is reached. Alignments should be sorted. 
""" def __init__(self, limit=76): self.count = 0 self.target = '' self.limit = limit def __call__(self, rec): target = rec.split()[1] if target != self.target: self.count = 0 self.target = target self.count += 1 return self.count < self.limit class TopAlignments(object): # pylint: disable=R0903 """Tracks the top alignments for a given query, used for bestn calc""" bestn = 10 def __init__(self): self.top = [0] * TopAlignments.bestn def __call__(self): return # noop def add(self, aln): """Adds an alignment to a bounded list, kicking out another if necessary """ heapq.heappushpop(self.top, aln) def main(): # pylint: disable=R0914 """Drives the program""" mym4 = sys.argv[1] allm4 = sys.argv[2] reads = sys.argv[3] TopAlignments.bestn = int(sys.argv[4]) # tracks bestn my_queries = defaultdict(TopAlignments) my_m4recs = [] # load my m4 chunk m4h = open(mym4) rec_add = my_m4recs.append for line in m4h: flds = parse_m4(line) rec = __tuplfy__(flds) rate = rating(rec) my_queries[rec.qname[32:]].add(rate) rec_add(' '.join(flds)) m4h.close() # if we're chunked locate relevant alignments if mym4 != allm4: # assuming fofn here m4files = [x.rstrip() for x in open(allm4) if x.rstrip() != mym4] for m4f in m4files: m4h = open(m4f) for recstr in m4h: rec = __tuplfy__(parse_m4(recstr)) if rec.qname[32:] in my_queries: rate = rating(rec) my_queries[rec.qname[32:]].add(rate) m4h.close() # remove alignments that fall outside of bestn my_m4recs[:] = [x for x in my_m4recs if bestn_true(x, my_queries)] # sort by target name/score sort_targ_score(my_m4recs) # rescore based on coverage rescore(my_m4recs) # sort one more time be new score sort_targ_score(my_m4recs) # take a max number of alignments for each target limiter = AlnLimiter() my_m4recs[:] = [x for x in ifilter(limiter, my_m4recs)] # load only related sequences seqs = {} frea = FastaReader(reads) for fent in frea: if fent.name[32:] in my_queries: seqs[fent.name] = fent.sequence # may or may not help del my_queries # generate 
pre-alignments for recstr in my_m4recs: rec = __tuplfy__(recstr.split()) # Bug 24538, rare case missing self hit if rec.tname not in seqs: msg = "Warning: skipping query %s target %s\n" sys.stderr.write(msg % (rec.qname, rec.tname)) continue qst = int(rec.qstart) qnd = int(rec.qend) qseq = seqs[rec.qname][qst:qnd] strand = '-' if rec.tstrand == '1' else '+' tst = int(rec.tstart) tnd = int(rec.tend) if strand == '+': tseq = seqs[rec.tname][tst:tnd] else: tseq = seqs[rec.tname].translate(__rc__)[::-1][tst:tnd] print ' '.join([rec.qname, rec.tname, strand, rec.tseqlength, str(tst), str(tnd), qseq, tseq]) if __name__ == '__main__': sys.exit(main()) pbdagcon-0.3+20161121+ds/src/q-sense.py0000755000175000017500000001556113026414536015553 0ustar afifafif#!/usr/bin/env python import sys import os import logging import pkg_resources from pbcore.util.ToolRunner import PBMultiToolRunner from pbcore.io import FastaReader from pbtools.pbdagcon.q_sense import * try: __p4revision__ = "$Revision: #15 $" __p4change__ = "$Change: 115421 $" revNum = int(__p4revision__.strip("$").split(" ")[1].strip("#")) changeNum = int(__p4change__.strip("$").split(":")[-1]) __version__ = "%s-r%d-c%d" % ( pkg_resources.require("pbtools.pbdagcon")[0].version, revNum, changeNum ) except: __version__ = "pbtools.pbdagcon-github" class Consensus(PBMultiToolRunner): def __init__(self): desc = ["Making consensus sequence from a group reads storing in a fasta file. " "All of the reads are expected to cover most of the target templates that gets sequenced. " "If the reads have broad read length distribution and not all of them cover the same region of " "a template, this code won't generate correct result. " "This code is designed for getting consensus up to the length of reads (~10k). 
" "It is not optimized for getting consensus for larger templates."] super(Consensus, self).__init__('\n'.join(desc)) subparsers = self.subParsers desc = ['using self-self alignment for getting the seed sequence for constructing consensus'] parser_d = subparsers.add_parser('d', help = "generate consensus using the alignments between the input sequneces to find seed sequence", description = "\n".join(desc)) parser_d.add_argument('input', metavar = 'input.fasta', help = 'an input fasta file') desc = ['using a reference file as seed for consensus'] parser_r = subparsers.add_parser('r', help = "using a reference fasta as the seed sequence", description = "\n".join(desc)) parser_r.add_argument('input', metavar = 'input.fasta', help = 'an input fasta file') parser_r.add_argument('ref', metavar = 'ref.fasta', help = 'a reference fasta file') for subp in (parser_r, parser_d): subp.add_argument('-o', '--output', metavar = 'file-name', dest = 'out_file_name', default = "g_consensus", help = 'consensus output filename') subp.add_argument('-d', '--output_dir', metavar = 'directory-name', dest = 'out_dir_name', default = "./", help = 'consensus output working directory') subp.add_argument('--cname', metavar = 'consensus-seq-name', dest = 'consensus_seq_name', default = "consensus", help = 'consensus sequence name') subp.add_argument('--enable_hp_correction', action='store_true', default = False, dest="enable_hp_corr", help = 'enable aggressive homopolymer missing errot detection and correction') subp.add_argument('--mark_lower_case', action='store_true', default = False, dest="mark_lower_case", help = 'mark low quality consensus base with lower case letter') subp.add_argument('--hp_correction_th', dest = 'entropy_th', default = 0.65, help = 'homopolymer missing correction entropy threshold') subp.add_argument('--n_iter', default = 4, dest = 'niter', help = 'number of iteration of consensus correction') subp.add_argument('--min_cov', default = 8, dest = 'min_cov', help = 
'minimum coverage for generate consensus') subp.add_argument('--max_cov', default = 60, dest = 'max_cov', help = 'maximum coverage for generate consensus') subp.add_argument('--max_n_reads', default = 150, dest = 'max_num_reads', help = 'the maximum number of reads used for consensus') subp.add_argument('--nproc', default = 4, dest = 'nproc', help = 'the number cpu core used by blasr, default to 4 cores') subp.add_argument('--dump_dag_info', action='store_true', default = False, dest="dump_dag_info", help = 'dump the information of the dag, including a pileup view of the alignments') def getVersion(self): return __version__ def denovoConsensus(self): prefix = self.args.out_file_name.split(".") input_fasta_name = self.args.input rid,s =best_template_by_blasr(input_fasta_name) if len(prefix) > 1: prefix = ".".join(prefix[:-1]) else: prefix = ".".join(prefix) full_prefix = os.path.join(self.args.out_dir_name, prefix) with open("%s_ref.fa" % full_prefix, "w") as f: print >>f ,">%s_ref" % self.args.consensus_seq_name print >>f, s hp_corr = True if self.args.enable_hp_corr else False mark_lower_case = True if self.args.mark_lower_case else False generate_consensus(input_fasta_name, "%s_ref.fa" % full_prefix, full_prefix, self.args.consensus_seq_name, hp_corr, int(self.args.niter), int(self.args.max_num_reads), float(self.args.entropy_th), self.args.dump_dag_info, int(self.args.min_cov), int(self.args.max_cov), mark_lower_case, int(self.args.nproc)) def refConsensus(self): input_fasta_name = self.args.input prefix = self.args.out_file_name.split(".") if len(prefix) > 1: prefix = ".".join(prefix[:-1]) else: prefix = ".".join(prefix) full_prefix = os.path.join(self.args.out_dir_name, prefix) hp_corr = True if self.args.enable_hp_corr else False mark_lower_case = True if self.args.mark_lower_case else False generate_consensus(input_fasta_name, self.args.ref, full_prefix, self.args.consensus_seq_name, hp_corr, int(self.args.niter), int(self.args.max_num_reads), 
float(self.args.entropy_th), self.args.dump_dag_info, int(self.args.min_cov), int(self.args.max_cov), mark_lower_case, int(self.args.nproc)) def run(self): logging.debug("Arguments" + str(self.args)) if self.args.subCommand == 'd': self.denovoConsensus() elif self.args.subCommand == 'r': self.refConsensus() if __name__ == '__main__': sys.exit(Consensus().start()) pbdagcon-0.3+20161121+ds/src/filterm4.py0000755000175000017500000000306413026414536015721 0ustar afifafif#!/usr/bin/env python # Filters for unique, highest scoring subread query/target pairs from an m4 # file. Helps get rid of chimeras, at the cost of some yield. import sys from collections import namedtuple M4Record = namedtuple('M4Record', ('qname tname score pctsimilarity qstrand ' 'qstart qend qseqlength tstrand tstart ' 'tend tseqlength mapqv')) class Count(object): """Tracks record count for original and filtered""" def __init__(self): self.orig = 0 self.filt = 0 def __repr__(self): return "Record count: original=%i, filtered=%i\n" % \ (self.orig, self.filt) def printUniq(qgroup, count): top = dict() for q in qgroup: m = M4Record._make(q.split()) k = "%s%s" % (m.qname, m.tname) if k in top: n = M4Record._make(top[k].split()) if int(m.score) < int(n.score): top[k] = q else: top[k] = q for r in top.values(): count.filt += 1 print r, qgroup[:] = [] def main(): m4file = sys.argv[1] m4Hndl = open(m4file) qgroup = [] curr = '' count = Count() for rec in m4Hndl: count.orig += 1 m = M4Record._make(rec.split()) if curr != m.qname: printUniq(qgroup, count) qgroup.append(rec) curr = m.qname else: qgroup.append(rec) printUniq(qgroup, count) sys.stderr.write(str(count)) if __name__ == '__main__': sys.exit(main()) pbdagcon-0.3+20161121+ds/.travis.yml0000644000175000017500000000057513026414536015144 0ustar afifafiflanguage: cpp script: - ./travis.sh compiler: - gcc - clang install: - if [ "$CXX" = "g++" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi addons: apt: sources: - ubuntu-toolchain-r-test packages: - 
libyajl-dev - libxml2-dev - gcc-4.8 - g++-4.8 - clang # - libxqilla-dev # missing, but not needed? notifications: email: false sudo: false pbdagcon-0.3+20161121+ds/LICENSE0000644000175000017500000000355213026414536014036 0ustar afifafif#################################################################################$$ # Copyright (c) 2011-2016, Pacific Biosciences of California, Inc. # # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted (subject to the limitations in the # disclaimer below) provided that the following conditions are met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # # * Redistributions in binary form must reproduce the above # copyright notice, this list of conditions and the following # disclaimer in the documentation and/or other materials provided # with the distribution. # # * Neither the name of Pacific Biosciences nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE # GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC # BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. 
IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. #################################################################################$$