flexbar_v2.4_src/0000751000651600065100000000000012175470061015207 5ustar jroehrbioinformaticsflexbar_v2.4_src/README.txt0000640000651600065100000001077212175327143016716 0ustar jroehrbioinformaticsFlexbar — flexible barcode and adapter removal, version 2.4 Bioinformatics in Quantitative Biology at BIMSB, GPLv3 === Installation === To run binaries, make sure that the TBB library (Intel Threading Building Blocks) is available to the system. Flexbar binaries are provided for Linux 64, Mac OSX ≥ 10.7.4, Windows 32 and 64 bit systems on: flexbar.sourceforge.net ### Linux One possibility is to put the file libtbb.so.2 in your working directory. To use it permanently, install the library runtime files from the repository of your distribution, or copy libtbb.so.2 to the shared library directory with the following command, or ask the administrator to install it: sudo cp FLEXBAR_DIR/libtbb.so.2 /usr/local/lib Or adjust the lib search path to include the directory of the lib file for the current terminal session: export LD_LIBRARY_PATH=FLEXBAR_DIR ### Mac OSX It applies the same as for Linux. Make the file libtbb.dylib available: sudo cp FLEXBAR_DIR/libtbb.dylib /usr/local/lib Or set the lib search path accordingly: export DYLD_LIBRARY_PATH=FLEXBAR_DIR ### Windows Keep the file tbb.dll in the directory of the Flexbar executable. Visual Studio 10 sp1 has to be installed. Alternatively, those who have not this program version can download the Visual Studio 10 sp1 redistributable package from Microsoft. Win 32: www.microsoft.com/en-us/download/details.aspx?id=8328 Win 64: www.microsoft.com/en-us/download/details.aspx?id=13523 === Program usage === Flexbar needs at least one file with sequencing reads in fasta/q or csfasta/q format as input. Additionally, the target name, quality format of reads and further options can be specified. For barcode based read seperation and adapter removal, a file in fasta format with barcode or adapter sequences should be provided. 
Please refer to the help screen (flexbar -h) or documentation on: sourceforge.net/p/flexbar/wiki SYNOPSIS flexbar -r reads [-t target] [-b barcodes] [-a adapters] [options] EXAMPLES flexbar -r reads.fq -f i1.8 -t target -b brc.fa -a adap.fa flexbar -r reads.csfastq.gz -a adap.fa -ao 5 -ae LEFT -c In the first example, barcoded reads in illumina version 1.8 fastq format are demultiplexed by specifying a file with barcodes in fasta format. After read seperation based on barcodes, adapters given in fasta format are removed from the right side if they align at the read beginning or downstream. After removal the left side of reads is kept. Remaining reads are written to the file target.fastq in same format. The second example, shows how to remove adapters in fasta format from left side of gzip compressed color-space (c) reads with quality scores (csfastq), if the overlap of adapter and read has at least length five. For left trim-end type the right side of reads is retained. Although default parameters of Flexbar are optimized to deliver good results in a large number of scenarios, the adjustment of parameters might improve results, e.g. --adapter-min-overlap and --adapter-threshold. === Building from SVN === 1) Check out the SVN repository to a local directory FLEXBAR_DIR: svn co http://svn.code.sf.net/p/flexbar/code/trunk FLEXBAR_DIR 2) Download TBB library, if you dont have Linux, Windows or Mac OSX running. For these systems the lib files are supplied together with binaries. Download the latest stable source release. It should work with version >= 3.0, then unpack the archive and run gmake in the unpacked folder. http://www.threadingbuildingblocks.org/file.php?fid=77 3) Make the TBB library available in your library searchpath. For Linux 64, Mac OSX or Windows follow the steps for binaries above. 
If you compiled TBB yourself copy the compiled lib to your library searchpath and change the line "LINK_DIRECTORIES(${FLEXBAR_SOURCE_DIR}/lib/linux64)" in file FLEXBAR_DIR/src/CMakeLists.txt to include your TBB_INSTALL_DIR/build/release folder. 4) Get cmake from cmake.org and install it. Change to FLEXBAR_DIR on command line and type the following: cmake . 5) Compile source code by issuing make in FLEXBAR_DIR. In general, the seqan and tbb library (in FLEXBAR_DIR/lib) need to be available to the compiler and linker. In case of eclipse, import the project from FLEXBAR_DIR and compile Flexbar after setting the lib path in the project settings. === Project folders === lib: shared tbb libs for Linux 64, Mac OSX, Windows include: versions of SeqAn and tbb libraries test: small test datasets To run Flexbar with the test dataset, make sure flexbar is reachable via the path variable and run flexbar_validate.sh within the test folder. flexbar_v2.4_src/CMakeLists.txt0000640000651600065100000000053212173312106017740 0ustar jroehrbioinformaticscmake_minimum_required( VERSION 2.8.0 ) project( FLEXBAR ) #file( MAKE_DIRECTORY build ) set( EXECUTABLE_OUTPUT_PATH ${FLEXBAR_BINARY_DIR} ) add_subdirectory( src ) if( NOT CMAKE_BUILD_TYPE ) set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." 
FORCE ) endif() flexbar_v2.4_src/test/0000751000651600065100000000000012175466027016175 5ustar jroehrbioinformaticsflexbar_v2.4_src/test/correct_result_right.csfastq0000640000651600065100000000164412170341604024010 0ustar jroehrbioinformatics@AA do nothing T00000000000000000000000000000000000 + &$''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ @AB read should start with 1 in alignment T01000000003303011033000220000000330 + &$''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ @AD right:5 bases match(end with 0201) - left:discarded T00000133023300002022321330201 + &$''44(#/5(&&&%-$7-%%*%#$.''$ @AF right: adapter not aligned, reported - left: partyally maps T0031310002010202002010321110100233 + $''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ @AG right: adapter not aligned, reported - left: partyally maps,10bp T0031310020222221 + %%*%#$.''$#/5(-$ @AH right: adapter not aligned, reported - left: partyally maps,9bp/discarded T003131002022222 + %*%#$.''$#/5(-$ @AI right: 10bp, left: empty read T0200211022 + -%%*%#$.'' @AL right:match - left:match,(discarded, ends with 1) T000221211102210012233020 + ''44(#/5(&&&%-$7-%%*%#$. 
@AM right:match partly T0000022220011012201121330201 + ''44(#/5(&&&%-$7-%%*%#$.''$# flexbar_v2.4_src/test/adapters_cs.fasta0000640000651600065100000000001512071003712021460 0ustar jroehrbioinformatics>ad1 TAATGCA flexbar_v2.4_src/test/correct_result_left.csfasta0000640000651600065100000000111212170341604023573 0ustar jroehrbioinformatics>AA do nothing T00000000000000000000000000000000000 >AB read should start with 1 in alignment T01000000003303011033000220000000330 >AC both:right part remains T00323001021310330101 >AD right:5 bases match(end with 0201) - left:discarded T00000133023300002022321330201030313 >AE right: map, empty read remains - left:removed T032223233302010300133012011 >AF right: adapter not aligned, reported - left: partyally maps T0002010202002010321110100233 >AG right: adapter not aligned, reported - left: partyally maps,10bp T0020222221 >AM right:match partly T000002222001101220112133020103031 flexbar_v2.4_src/test/correct_result_left.fastq0000640000651600065100000000167112150154574023305 0ustar jroehrbioinformatics@left_tail:shouldnt work - right_tail:end with CAC - right:end_with_A_left:discarded_empty_read(N) TCACCGGGTGGAAACTAGCCCCCCCCCCCACCGTCT + BSSMNXUTVX``[````\`___^_^_`_`_``^_^X @left:begin_with_G-left_tail:should_work(begin with G) - right_tail:shouldnt work - right:discarded GAAAAAAACCCCCCCCCCTTTTTTTTTTTT + UTVX``[````\`___^_^_`_`_``^_^X @left:begin_with_G-left_tail:should_work(begin with A) - right_tail:shouldnt work - right:discarded AAAAAAACCCCCCCCCCTTTTTTTTTTTTTT + XUTVX``[````\`___^_^_`_`_``^_^X @left_tail:sholdnt_work - right_tail:shouldnt work - right:10bp_remain CATTATACAGAACACAGCAT + `\`___^_^_`_`_``^_^X @left_tail:shouldnt work - right_tail:shouldnt work - right:discarded CATTATACAGAACACAGCAT + ``\`___^_^_`_`_``^_^ @left_tail:shouldnt work - right_tail:shouldnt work - both:right_remains AAAAAATTTTTTAAAAAA + `___^_^_`_`_``^_^X @left_tail:shouldnt work - right_tail:shouldnt work - both:left_remains-left:10bp AAAAAATTTT + 
`_`_``^_^X flexbar_v2.4_src/test/correct_result_right_tail.fastq0000640000651600065100000000362412150154574024501 0ustar jroehrbioinformatics@left_tail:shouldnt work - right_tail:shouldnt work - rigth:end with ATGC - left:discarded(C) TGAGATCGTTCAGTACGGCAATCGTATGCCGTCTTC + BSSMNXUTVX``[````\`___^_^_`_`_``^_^X @left_tail:shouldnt work - right_tail:end with ATGC - right:end_with_ATGC - left:discarded_empty_read(N) TGAGATCGTTCAGTACGGCAATTCGTATGC + BSSMNXUTVX``[````\`___^_^_`_`_ @left_tail:shouldnt work - right_tail:shouldnt work - right:should_end_with_TGC-left:Discarded(T) TGAGATCGTTCAGTACGGCAATCGTATGCCGTCTTT + BSSMNXUTVX``[````\`___^_^_`_`_``^_^X @left_tail:shouldnt work - right_tail:end with CAC - right:end_with_A_left:discarded_empty_read(N) TCACCGGGTGGAAACTAGCCCCCCCCCCCAC + BSSMNXUTVX``[````\`___^_^_`_`_` @left:begin_with_G-left_tail:should_work(begin with G) - right_tail:shouldnt work - right:discarded CGTCTTGAAAAAAACCCCCCCCCCTTTTTTTTTTTT + BSSMNXUTVX``[````\`___^_^_`_`_``^_^X @left:begin_with_G-left_tail:should_work(begin with A) - right_tail:shouldnt work - right:discarded GTCTTAAAAAAACCCCCCCCCCTTTTTTTTTTTTTT + BSSMNXUTVX``[````\`___^_^_`_`_``^_^X @left:begin_with_G, discarded - left_tail:should_work,but discarded! 
- right_tail:shouldnt work - right:discarded TCTTGAAAAAAAA + BSSMNXUTVX``[ @left_tail:sholdnt_work - right_tail:shouldnt work - right:10bp_remain AAAAAAAAAACGTCTTCATTATACAGAACACAGCAT + BSSMNXUTVX``[````\`___^_^_`_`_``^_^X @left_tail:shouldnt work - right_tail:shouldnt work - right:discarded AAAAAAAAACGTCTTCATTATACAGAACACAGCAT + BSSMNXUTVX``[````\`___^_^_`_`_``^_^ @left_tail:shouldnt work - right_tail:shouldnt work - both:right_remains TGGAAGCCCCAGCGTCTTAAAAAATTTTTTAAAAAA + BSSMNXUTVX``[````\`___^_^_`_`_``^_^X @left_tail:shouldnt work - right_tail:shouldnt work - both:left_remains-left:10bp TGGAAAAAAAAAAGCCCCAGCGTCTTAAAAAATTTT + BSSMNXUTVX``[````\`___^_^_`_`_``^_^X @left_tail:shouldnt work - right_tail:shouldnt work - both:left_remains-left:Discarded_9bp TGGAAAAAAAAAAAGCCCCAGCGTCTTAAAAATTTT + BSSMNXUTVX``[````\`___^_^_`_`_``^_^X flexbar_v2.4_src/test/correct_result_right_tail.csfasta0000640000651600065100000000150712170341604024777 0ustar jroehrbioinformatics>AA do nothing T00000000000000000000000000000000000 >AB read should start with 1 in alignment T01000000003303011033000220000000330 >AC both:right part remains T00000010230313120323001021310330101 >AD right:5 bases match(end with 0201) - left:discarded T00000133023300002022321330201 >AE right: map, empty read remains - left:removed T0303131332223233302010300133012011 >AF right: adapter not aligned, reported - left: partyally maps T0031310002010202002010321110100233 >AG right: adapter not aligned, reported - left: partyally maps,10bp T0031310020222221 >AH right: adapter not aligned, reported - left: partyally maps,9bp/discarded T003131002022222 >AI right: 10bp, left: empty read T0200211022 >AL right:match - left:match,(discarded, ends with 1) T000221211102210012233020130313101 >AM right:match partly T0000022220011012201121330201 flexbar_v2.4_src/test/barcodes_wildcardN.fasta0000640000651600065100000000004512114635642022757 0ustar jroehrbioinformatics>Barcode1 AANNAAA >Barcode2 TCGTTCAG 
flexbar_v2.4_src/test/flexbar_test_csfasta.sh0000750000651600065100000000371212173312106022706 0ustar jroehrbioinformatics flexbar --reads test.csfasta --target result_right --format csfasta --adapter-min-overlap 4 --adapters adapters_cs.fasta --min-read-length 10 --adapter-threshold 1 --adapter-trim-end RIGHT > /dev/null a=`diff correct_result_right.csfasta result_right.csfasta` l1=`expr length "$a"` if [ $l1 != 0 ]; then echo "error testing mode csfasta, right" echo $a exit -1 else echo "Test 1 OK" fi flexbar --reads test.csfasta --target result_left --format csfasta --adapter-min-overlap 4 --adapters adapters_cs.fasta --min-read-length 10 --adapter-threshold 1 --adapter-trim-end LEFT > /dev/null a=`diff correct_result_left.csfasta result_left.csfasta` l1=`expr length "$a"` if [ $l1 != 0 ]; then echo "error testing mode csfasta, left" echo $a exit -1 else echo "Test 2 OK" fi flexbar --reads test.csfasta --target result_any --format csfasta --adapter-min-overlap 4 --adapters adapters_cs.fasta --min-read-length 10 --adapter-threshold 1 --adapter-trim-end ANY > /dev/null a=`diff correct_result_any.csfasta result_any.csfasta` l1=`expr length "$a"` if [ $l1 != 0 ]; then echo "error testing mode csfasta, any" echo $a exit -1 else echo "Test 3 OK" fi flexbar --reads test.csfasta --target result_left_tail --format csfasta --adapter-min-overlap 4 --adapters adapters_cs.fasta --min-read-length 10 --adapter-threshold 1 --adapter-trim-end LEFT_TAIL > /dev/null a=`diff correct_result_left_tail.csfasta result_left_tail.csfasta` l1=`expr length "$a"` if [ $l1 != 0 ]; then echo "error testing mode csfasta, left_tail" echo $a exit -1 else echo "Test 4 OK" fi flexbar --reads test.csfasta --target result_right_tail --format csfasta --adapter-min-overlap 4 --adapters adapters_cs.fasta --min-read-length 10 --adapter-threshold 1 --adapter-trim-end RIGHT_TAIL > /dev/null a=`diff correct_result_right_tail.csfasta result_right_tail.csfasta` l1=`expr length "$a"` if [ $l1 != 0 ]; then 
echo "error testing mode csfasta, right_tail" echo $a exit -1 else echo "Test 5 OK" fi echo "" flexbar_v2.4_src/test/correct_result_left_tail.csfasta0000640000651600065100000000143212170341604024611 0ustar jroehrbioinformatics>AA do nothing T00000000000000000000000000000000000 >AB read should start with 1 in alignment T01000000003303011033000220000000330 >AC both:right part remains T00000010230313120323001021310330101 >AD right:5 bases match(end with 0201) - left:discarded T00000133023300002022321330201030313 >AE right: map, empty read remains - left:removed T032223233302010300133012011 >AF right: adapter not aligned, reported - left: partyally maps T0002010202002010321110100233 >AG right: adapter not aligned, reported - left: partyally maps,10bp T0020222221 >AI right: 10bp, left: empty read T02002110223303131 >AJ right: 9bp, left: empty read T0230021220303131 >AL right:match - left:match,(discarded, ends with 1) T000221211102210012233020130313101 >AM right:match partly T000002222001101220112133020103031 flexbar_v2.4_src/test/flexbar_test_fasta.sh0000750000651600065100000000361012173312106022355 0ustar jroehrbioinformatics flexbar --reads test.fasta --target result_right --format fasta --adapter-min-overlap 4 --adapters adapters.fasta --min-read-length 10 --adapter-threshold 1 --adapter-trim-end RIGHT > /dev/null a=`diff correct_result_right.fasta result_right.fasta` l1=`expr length "$a"` if [ $l1 != 0 ]; then echo "error testing mode fasta, right" echo $a exit -1 else echo "Test 1 OK" fi flexbar --reads test.fasta --target result_left --format fasta --adapter-min-overlap 4 --adapters adapters.fasta --min-read-length 10 --adapter-threshold 1 --adapter-trim-end LEFT > /dev/null a=`diff correct_result_left.fasta result_left.fasta` l1=`expr length "$a"` if [ $l1 != 0 ]; then echo "error testing mode fasta, left" echo $a exit -1 else echo "Test 2 OK" fi flexbar --reads test.fasta --target result_any --format fasta --adapter-min-overlap 4 --adapters adapters.fasta 
--min-read-length 10 --adapter-threshold 1 --adapter-trim-end ANY > /dev/null a=`diff correct_result_any.fasta result_any.fasta` l1=`expr length "$a"` if [ $l1 != 0 ]; then echo "error testing mode any, left" echo $a exit -1 else echo "Test 3 OK" fi flexbar --reads test.fasta --target result_left_tail --format fasta --adapter-min-overlap 4 --adapters adapters.fasta --min-read-length 10 --adapter-threshold 1 --adapter-trim-end LEFT_TAIL > /dev/null a=`diff correct_result_left_tail.fasta result_left_tail.fasta` l1=`expr length "$a"` if [ $l1 != 0 ]; then echo "error testing mode fasta, left_tail" echo $a exit -1 else echo "Test 4 OK" fi flexbar --reads test.fasta --target result_right_tail --format fasta --adapter-min-overlap 4 --adapters adapters.fasta --min-read-length 10 --adapter-threshold 1 --adapter-trim-end RIGHT_TAIL > /dev/null a=`diff correct_result_right_tail.fasta result_right_tail.fasta` l1=`expr length "$a"` if [ $l1 != 0 ]; then echo "error testing mode fasta, right_tail" echo $a exit -1 else echo "Test 5 OK" fi echo "" flexbar_v2.4_src/test/correct_result_right.fastq0000640000651600065100000000262612150154574023471 0ustar jroehrbioinformatics@left_tail:shouldnt work - right_tail:shouldnt work - rigth:end with ATGC - left:discarded(C) TGAGATCGTTCAGTACGGCAATCGTATGC + BSSMNXUTVX``[````\`___^_^_`_` @left_tail:shouldnt work - right_tail:end with ATGC - right:end_with_ATGC - left:discarded_empty_read(N) TGAGATCGTTCAGTACGGCAATTCGTATGC + BSSMNXUTVX``[````\`___^_^_`_`_ @left_tail:shouldnt work - right_tail:shouldnt work - right:should_end_with_TGC-left:Discarded(T) TGAGATCGTTCAGTACGGCAATCGTATGC + BSSMNXUTVX``[````\`___^_^_`_` @left_tail:shouldnt work - right_tail:end with CAC - right:end_with_A_left:discarded_empty_read(N) TCACCGGGTGGAAACTAGCCCCCCCCCCCAC + BSSMNXUTVX``[````\`___^_^_`_`_` @left:begin_with_G-left_tail:should_work(begin with A) - right_tail:shouldnt work - right:discarded GTCTTAAAAAAACCCCCCCCCCTTTTTTTTTTTTTT + BSSMNXUTVX``[````\`___^_^_`_`_``^_^X 
@left:begin_with_G, discarded - left_tail:should_work,but discarded! - right_tail:shouldnt work - right:discarded TCTTGAAAAAAAA + BSSMNXUTVX``[ @left_tail:sholdnt_work - right_tail:shouldnt work - right:10bp_remain AAAAAAAAAA + BSSMNXUTVX @left_tail:shouldnt work - right_tail:shouldnt work - both:right_remains TGGAAGCCCCAG + BSSMNXUTVX`` @left_tail:shouldnt work - right_tail:shouldnt work - both:left_remains-left:10bp TGGAAAAAAAAAAGCCCCAG + BSSMNXUTVX``[````\`_ @left_tail:shouldnt work - right_tail:shouldnt work - both:left_remains-left:Discarded_9bp TGGAAAAAAAAAAAGCCCCAG + BSSMNXUTVX``[````\`__ flexbar_v2.4_src/test/correct_result_right.fasta0000640000651600065100000000222112150154574023440 0ustar jroehrbioinformatics>left_tail:shouldnt work - right_tail:shouldnt work - rigth:end with ATGC - left:discarded(C) TGAGATCGTTCAGTACGGCAATCGTATGC >left_tail:shouldnt work - right_tail:end with ATGC - right:end_with_ATGC - left:discarded_empty_read(N) TGAGATCGTTCAGTACGGCAATTCGTATGC >left_tail:shouldnt work - right_tail:shouldnt work - right:should_end_with_TGC-left:Discarded(T) TGAGATCGTTCAGTACGGCAATCGTATGC >left_tail:shouldnt work - right_tail:end with CAC - right:end_with_A_left:discarded_empty_read(N) TCACCGGGTGGAAACTAGCCCCCCCCCCCAC >left:begin_with_G-left_tail:should_work(begin with A) - right_tail:shouldnt work - right:discarded GTCTTAAAAAAACCCCCCCCCCTTTTTTTTTTTTTT >left:begin_with_G, discarded - left_tail:should_work,but discarded! 
- right_tail:shouldnt work - right:discarded TCTTGAAAAAAAA >left_tail:sholdnt_work - right_tail:shouldnt work - right:10bp_remain AAAAAAAAAA >left_tail:shouldnt work - right_tail:shouldnt work - both:right_remains TGGAAGCCCCAG >left_tail:shouldnt work - right_tail:shouldnt work - both:left_remains-left:10bp TGGAAAAAAAAAAGCCCCAG >left_tail:shouldnt work - right_tail:shouldnt work - both:left_remains-left:Discarded_9bp TGGAAAAAAAAAAAGCCCCAG flexbar_v2.4_src/test/correct_result_left_tail.csfastq0000640000651600065100000000215312170341604024632 0ustar jroehrbioinformatics@AA do nothing T00000000000000000000000000000000000 + &$''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ @AB read should start with 1 in alignment T01000000003303011033000220000000330 + &$''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ @AC both:right part remains T00000010230313120323001021310330101 + &$''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ @AD right:5 bases match(end with 0201) - left:discarded T00000133023300002022321330201030313 + &$''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ @AE right: map, empty read remains - left:removed T032223233302010300133012011 + $5(&&&%-$7-%%*%#$.''$#/5(-$ @AF right: adapter not aligned, reported - left: partyally maps T0002010202002010321110100233 + $/5(&&&%-$7-%%*%#$.''$#/5(-$ @AG right: adapter not aligned, reported - left: partyally maps,10bp T0020222221 + %''$#/5(-$ @AI right: 10bp, left: empty read T02002110223303131 + -%%*%#$.''$#/5(-$ @AJ right: 9bp, left: empty read T0230021220303131 + %%*%#$.''$#/5(-$ @AL right:match - left:match,(discarded, ends with 1) T000221211102210012233020130313101 + ''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ @AM right:match partly T000002222001101220112133020103031 + ''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ flexbar_v2.4_src/test/adapters.fasta0000640000651600065100000000001411471257472021013 0ustar jroehrbioinformatics>ad1 CGTCTT flexbar_v2.4_src/test/flexbar_test_fastq.sh0000750000651600065100000000364112173312106022401 0ustar jroehrbioinformatics flexbar --reads test.fastq --target result_right 
--format fastq-i1.5 --adapter-min-overlap 4 --adapters adapters.fasta --min-read-length 10 --adapter-threshold 1 --adapter-trim-end RIGHT > /dev/null a=`diff correct_result_right.fastq result_right.fastq` l1=`expr length "$a"` if [ $l1 != 0 ]; then echo "error testing mode fasta, right" echo $a exit -1 else echo "Test 1 OK" fi flexbar --reads test.fastq --target result_left --format fastq-i1.5 --adapter-min-overlap 4 --adapters adapters.fasta --min-read-length 10 --adapter-threshold 1 --adapter-trim-end LEFT > /dev/null a=`diff correct_result_left.fastq result_left.fastq` l1=`expr length "$a"` if [ $l1 != 0 ]; then echo "error testing mode fasta, left" echo $a exit -1 else echo "Test 2 OK" fi flexbar --reads test.fastq --target result_any --format fastq-i1.5 --adapter-min-overlap 4 --adapters adapters.fasta --min-read-length 10 --adapter-threshold 1 --adapter-trim-end ANY > /dev/null a=`diff correct_result_any.fastq result_any.fastq` l1=`expr length "$a"` if [ $l1 != 0 ]; then echo "error testing mode any, left" echo $a exit -1 else echo "Test 3 OK" fi flexbar --reads test.fastq --target result_left_tail --format fastq-i1.5 --adapter-min-overlap 4 --adapters adapters.fasta --min-read-length 10 --adapter-threshold 1 --adapter-trim-end LEFT_TAIL > /dev/null a=`diff correct_result_left_tail.fastq result_left_tail.fastq` l1=`expr length "$a"` if [ $l1 != 0 ]; then echo "error testing mode fasta, left_tail" echo $a exit -1 else echo "Test 4 OK" fi flexbar --reads test.fastq --target result_right_tail --format fastq-i1.5 --adapter-min-overlap 4 --adapters adapters.fasta --min-read-length 10 --adapter-threshold 1 --adapter-trim-end RIGHT_TAIL > /dev/null a=`diff correct_result_right_tail.fastq result_right_tail.fastq` l1=`expr length "$a"` if [ $l1 != 0 ]; then echo "error testing mode fasta, right_tail" echo $a exit -1 else echo "Test 5 OK" fi echo "" flexbar_v2.4_src/test/flexbar_test_csfastq.sh0000750000651600065100000000371212173312106022726 0ustar 
jroehrbioinformatics flexbar --reads test.csfastq --target result_right --format csfastq --adapter-min-overlap 4 --adapters adapters_cs.fasta --min-read-length 10 --adapter-threshold 1 --adapter-trim-end RIGHT > /dev/null a=`diff correct_result_right.csfastq result_right.csfastq` l1=`expr length "$a"` if [ $l1 != 0 ]; then echo "error testing mode csfastq, right" echo $a exit -1 else echo "Test 1 OK" fi flexbar --reads test.csfastq --target result_left --format csfastq --adapter-min-overlap 4 --adapters adapters_cs.fasta --min-read-length 10 --adapter-threshold 1 --adapter-trim-end LEFT > /dev/null a=`diff correct_result_left.csfastq result_left.csfastq` l1=`expr length "$a"` if [ $l1 != 0 ]; then echo "error testing mode csfastq, left" echo $a exit -1 else echo "Test 2 OK" fi flexbar --reads test.csfastq --target result_any --format csfastq --adapter-min-overlap 4 --adapters adapters_cs.fasta --min-read-length 10 --adapter-threshold 1 --adapter-trim-end ANY > /dev/null a=`diff correct_result_any.csfastq result_any.csfastq` l1=`expr length "$a"` if [ $l1 != 0 ]; then echo "error testing mode csfastq, any" echo $a exit -1 else echo "Test 3 OK" fi flexbar --reads test.csfastq --target result_left_tail --format csfastq --adapter-min-overlap 4 --adapters adapters_cs.fasta --min-read-length 10 --adapter-threshold 1 --adapter-trim-end LEFT_TAIL > /dev/null a=`diff correct_result_left_tail.csfastq result_left_tail.csfastq` l1=`expr length "$a"` if [ $l1 != 0 ]; then echo "error testing mode csfastq, left_tail" echo $a exit -1 else echo "Test 4 OK" fi flexbar --reads test.csfastq --target result_right_tail --format csfastq --adapter-min-overlap 4 --adapters adapters_cs.fasta --min-read-length 10 --adapter-threshold 1 --adapter-trim-end RIGHT_TAIL > /dev/null a=`diff correct_result_right_tail.csfastq result_right_tail.csfastq` l1=`expr length "$a"` if [ $l1 != 0 ]; then echo "error testing mode csfastq, right_tail" echo $a exit -1 else echo "Test 5 OK" fi echo "" 
flexbar_v2.4_src/test/flexbar_validate.sh0000750000651600065100000000031612071004114022003 0ustar jroehrbioinformatics echo "" echo "Testing fasta:" ./flexbar_test_fasta.sh echo "Testing csfasta:" ./flexbar_test_csfasta.sh echo "Testing fastq:" ./flexbar_test_fastq.sh echo "Testing csfastq:" ./flexbar_test_csfastq.sh flexbar_v2.4_src/test/correct_result_right_tail.fasta0000640000651600065100000000274312150154574024462 0ustar jroehrbioinformatics>left_tail:shouldnt work - right_tail:shouldnt work - rigth:end with ATGC - left:discarded(C) TGAGATCGTTCAGTACGGCAATCGTATGCCGTCTTC >left_tail:shouldnt work - right_tail:end with ATGC - right:end_with_ATGC - left:discarded_empty_read(N) TGAGATCGTTCAGTACGGCAATTCGTATGC >left_tail:shouldnt work - right_tail:shouldnt work - right:should_end_with_TGC-left:Discarded(T) TGAGATCGTTCAGTACGGCAATCGTATGCCGTCTTT >left_tail:shouldnt work - right_tail:end with CAC - right:end_with_A_left:discarded_empty_read(N) TCACCGGGTGGAAACTAGCCCCCCCCCCCAC >left:begin_with_G-left_tail:should_work(begin with G) - right_tail:shouldnt work - right:discarded CGTCTTGAAAAAAACCCCCCCCCCTTTTTTTTTTTT >left:begin_with_G-left_tail:should_work(begin with A) - right_tail:shouldnt work - right:discarded GTCTTAAAAAAACCCCCCCCCCTTTTTTTTTTTTTT >left:begin_with_G, discarded - left_tail:should_work,but discarded! 
- right_tail:shouldnt work - right:discarded TCTTGAAAAAAAA >left_tail:sholdnt_work - right_tail:shouldnt work - right:10bp_remain AAAAAAAAAACGTCTTCATTATACAGAACACAGCAT >left_tail:shouldnt work - right_tail:shouldnt work - right:discarded AAAAAAAAACGTCTTCATTATACAGAACACAGCAT >left_tail:shouldnt work - right_tail:shouldnt work - both:right_remains TGGAAGCCCCAGCGTCTTAAAAAATTTTTTAAAAAA >left_tail:shouldnt work - right_tail:shouldnt work - both:left_remains-left:10bp TGGAAAAAAAAAAGCCCCAGCGTCTTAAAAAATTTT >left_tail:shouldnt work - right_tail:shouldnt work - both:left_remains-left:Discarded_9bp TGGAAAAAAAAAAAGCCCCAGCGTCTTAAAAATTTT flexbar_v2.4_src/test/correct_result_right_tail.csfastq0000640000651600065100000000223012170341604025011 0ustar jroehrbioinformatics@AA do nothing T00000000000000000000000000000000000 + &$''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ @AB read should start with 1 in alignment T01000000003303011033000220000000330 + &$''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ @AC both:right part remains T00000010230313120323001021310330101 + &$''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ @AD right:5 bases match(end with 0201) - left:discarded T00000133023300002022321330201 + &$''44(#/5(&&&%-$7-%%*%#$.''$ @AE right: map, empty read remains - left:removed T0303131332223233302010300133012011 + $''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ @AF right: adapter not aligned, reported - left: partyally maps T0031310002010202002010321110100233 + $''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ @AG right: adapter not aligned, reported - left: partyally maps,10bp T0031310020222221 + %%*%#$.''$#/5(-$ @AH right: adapter not aligned, reported - left: partyally maps,9bp/discarded T003131002022222 + %*%#$.''$#/5(-$ @AI right: 10bp, left: empty read T0200211022 + -%%*%#$.'' @AL right:match - left:match,(discarded, ends with 1) T000221211102210012233020130313101 + ''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ @AM right:match partly T0000022220011012201121330201 + ''44(#/5(&&&%-$7-%%*%#$.''$# 
flexbar_v2.4_src/test/correct_result_left.fasta0000640000651600065100000000137712150154574023270 0ustar jroehrbioinformatics>left_tail:shouldnt work - right_tail:end with CAC - right:end_with_A_left:discarded_empty_read(N) TCACCGGGTGGAAACTAGCCCCCCCCCCCACCGTCT >left:begin_with_G-left_tail:should_work(begin with G) - right_tail:shouldnt work - right:discarded GAAAAAAACCCCCCCCCCTTTTTTTTTTTT >left:begin_with_G-left_tail:should_work(begin with A) - right_tail:shouldnt work - right:discarded AAAAAAACCCCCCCCCCTTTTTTTTTTTTTT >left_tail:sholdnt_work - right_tail:shouldnt work - right:10bp_remain CATTATACAGAACACAGCAT >left_tail:shouldnt work - right_tail:shouldnt work - right:discarded CATTATACAGAACACAGCAT >left_tail:shouldnt work - right_tail:shouldnt work - both:right_remains AAAAAATTTTTTAAAAAA >left_tail:shouldnt work - right_tail:shouldnt work - both:left_remains-left:10bp AAAAAATTTT flexbar_v2.4_src/test/correct_result_left.csfastq0000640000651600065100000000150112170341604023615 0ustar jroehrbioinformatics@AA do nothing T00000000000000000000000000000000000 + &$''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ @AB read should start with 1 in alignment T01000000003303011033000220000000330 + &$''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ @AC both:right part remains T00323001021310330101 + &$7-%%*%#$.''$#/5(-$ @AD right:5 bases match(end with 0201) - left:discarded T00000133023300002022321330201030313 + &$''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ @AE right: map, empty read remains - left:removed T032223233302010300133012011 + $5(&&&%-$7-%%*%#$.''$#/5(-$ @AF right: adapter not aligned, reported - left: partyally maps T0002010202002010321110100233 + $/5(&&&%-$7-%%*%#$.''$#/5(-$ @AG right: adapter not aligned, reported - left: partyally maps,10bp T0020222221 + %''$#/5(-$ @AM right:match partly T000002222001101220112133020103031 + ''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ flexbar_v2.4_src/test/correct_result_right.csfasta0000640000651600065100000000124712170341604023767 0ustar jroehrbioinformatics>AA do nothing 
T00000000000000000000000000000000000 >AB read should start with 1 in alignment T01000000003303011033000220000000330 >AD right:5 bases match(end with 0201) - left:discarded T00000133023300002022321330201 >AF right: adapter not aligned, reported - left: partyally maps T0031310002010202002010321110100233 >AG right: adapter not aligned, reported - left: partyally maps,10bp T0031310020222221 >AH right: adapter not aligned, reported - left: partyally maps,9bp/discarded T003131002022222 >AI right: 10bp, left: empty read T0200211022 >AL right:match - left:match,(discarded, ends with 1) T000221211102210012233020 >AM right:match partly T0000022220011012201121330201 flexbar_v2.4_src/test/correct_result_left_tail.fasta0000640000651600065100000000266312150154574024300 0ustar jroehrbioinformatics>left_tail:shouldnt work - right_tail:shouldnt work - rigth:end with ATGC - left:discarded(C) TGAGATCGTTCAGTACGGCAATCGTATGCCGTCTTC >left_tail:shouldnt work - right_tail:end with ATGC - right:end_with_ATGC - left:discarded_empty_read(N) TGAGATCGTTCAGTACGGCAATTCGTATGCCGTCTT >left_tail:shouldnt work - right_tail:shouldnt work - right:should_end_with_TGC-left:Discarded(T) TGAGATCGTTCAGTACGGCAATCGTATGCCGTCTTT >left_tail:shouldnt work - right_tail:end with CAC - right:end_with_A_left:discarded_empty_read(N) TCACCGGGTGGAAACTAGCCCCCCCCCCCACCGTCT >left:begin_with_G-left_tail:should_work(begin with G) - right_tail:shouldnt work - right:discarded GAAAAAAACCCCCCCCCCTTTTTTTTTTTT >left:begin_with_G-left_tail:should_work(begin with A) - right_tail:shouldnt work - right:discarded AAAAAAACCCCCCCCCCTTTTTTTTTTTTTT >left:should_work right:discarded! 
- right_tail:works,discarded AAAAAAAACGTCTT >left_tail:sholdnt_work - right_tail:shouldnt work - right:10bp_remain AAAAAAAAAACGTCTTCATTATACAGAACACAGCAT >left_tail:shouldnt work - right_tail:shouldnt work - right:discarded AAAAAAAAACGTCTTCATTATACAGAACACAGCAT >left_tail:shouldnt work - right_tail:shouldnt work - both:right_remains TGGAAGCCCCAGCGTCTTAAAAAATTTTTTAAAAAA >left_tail:shouldnt work - right_tail:shouldnt work - both:left_remains-left:10bp TGGAAAAAAAAAAGCCCCAGCGTCTTAAAAAATTTT >left_tail:shouldnt work - right_tail:shouldnt work - both:left_remains-left:Discarded_9bp TGGAAAAAAAAAAAGCCCCAGCGTCTTAAAAATTTT flexbar_v2.4_src/test/correct_result_any.csfasta0000640000651600065100000000127512170341604023442 0ustar jroehrbioinformatics>AA do nothing T00000000000000000000000000000000000 >AB read should start with 1 in alignment T01000000003303011033000220000000330 >AC both:right part remains T00323001021310330101 >AD right:5 bases match(end with 0201) - left:discarded T00000133023300002022321330201 >AE right: map, empty read remains - left:removed T032223233302010300133012011 >AF right: adapter not aligned, reported - left: partyally maps T0002010202002010321110100233 >AG right: adapter not aligned, reported - left: partyally maps,10bp T0020222221 >AI right: 10bp, left: empty read T0200211022 >AL right:match - left:match,(discarded, ends with 1) T000221211102210012233020 >AM right:match partly T0000022220011012201121330201 flexbar_v2.4_src/test/test.fasta0000640000651600065100000000307611471257472020202 0ustar jroehrbioinformatics>left_tail:shouldnt work - right_tail:shouldnt work - rigth:end with ATGC - left:discarded(C) TGAGATCGTTCAGTACGGCAATCGTATGCCGTCTTC >left_tail:shouldnt work - right_tail:end with ATGC - right:end_with_ATGC - left:discarded_empty_read(N) TGAGATCGTTCAGTACGGCAATTCGTATGCCGTCTT >left_tail:shouldnt work - right_tail:shouldnt work - right:should_end_with_TGC-left:Discarded(T) TGAGATCGTTCAGTACGGCAATCGTATGCCGTCTTT >left_tail:shouldnt work - right_tail:end 
with CAC - right:end_with_A_left:discarded_empty_read(N) TCACCGGGTGGAAACTAGCCCCCCCCCCCACCGTCT >left:begin_with_G-left_tail:should_work(begin with G) - right_tail:shouldnt work - right:discarded CGTCTTGAAAAAAACCCCCCCCCCTTTTTTTTTTTT >left:begin_with_G-left_tail:should_work(begin with A) - right_tail:shouldnt work - right:discarded GTCTTAAAAAAACCCCCCCCCCTTTTTTTTTTTTTT >left:begin_with_G, discarded - left_tail:should_work,but discarded! - right_tail:shouldnt work - right:discarded TCTTGAAAAAAAA >left:should_work right:discarded! - right_tail:works,discarded AAAAAAAACGTCTT >left_tail:sholdnt_work - right_tail:shouldnt work - right:10bp_remain AAAAAAAAAACGTCTTCATTATACAGAACACAGCAT >left_tail:shouldnt work - right_tail:shouldnt work - right:discarded AAAAAAAAACGTCTTCATTATACAGAACACAGCAT >left_tail:shouldnt work - right_tail:shouldnt work - both:right_remains TGGAAGCCCCAGCGTCTTAAAAAATTTTTTAAAAAA >left_tail:shouldnt work - right_tail:shouldnt work - both:left_remains-left:10bp TGGAAAAAAAAAAGCCCCAGCGTCTTAAAAAATTTT >left_tail:shouldnt work - right_tail:shouldnt work - both:left_remains-left:Discarded_9bp TGGAAAAAAAAAAAGCCCCAGCGTCTTAAAAATTTT flexbar_v2.4_src/test/correct_result_any.fastq0000640000651600065100000000306212150154574023136 0ustar jroehrbioinformatics@left_tail:shouldnt work - right_tail:shouldnt work - rigth:end with ATGC - left:discarded(C) TGAGATCGTTCAGTACGGCAATCGTATGC + BSSMNXUTVX``[````\`___^_^_`_` @left_tail:shouldnt work - right_tail:end with ATGC - right:end_with_ATGC - left:discarded_empty_read(N) TGAGATCGTTCAGTACGGCAATTCGTATGC + BSSMNXUTVX``[````\`___^_^_`_`_ @left_tail:shouldnt work - right_tail:shouldnt work - right:should_end_with_TGC-left:Discarded(T) TGAGATCGTTCAGTACGGCAATCGTATGC + BSSMNXUTVX``[````\`___^_^_`_` @left_tail:shouldnt work - right_tail:end with CAC - right:end_with_A_left:discarded_empty_read(N) TCACCGGGTGGAAACTAGCCCCCCCCCCCAC + BSSMNXUTVX``[````\`___^_^_`_`_` @left:begin_with_G-left_tail:should_work(begin with G) - right_tail:shouldnt 
work - right:discarded GAAAAAAACCCCCCCCCCTTTTTTTTTTTT + UTVX``[````\`___^_^_`_`_``^_^X @left:begin_with_G-left_tail:should_work(begin with A) - right_tail:shouldnt work - right:discarded AAAAAAACCCCCCCCCCTTTTTTTTTTTTTT + XUTVX``[````\`___^_^_`_`_``^_^X @left_tail:sholdnt_work - right_tail:shouldnt work - right:10bp_remain CATTATACAGAACACAGCAT + `\`___^_^_`_`_``^_^X @left_tail:shouldnt work - right_tail:shouldnt work - right:discarded CATTATACAGAACACAGCAT + ``\`___^_^_`_`_``^_^ @left_tail:shouldnt work - right_tail:shouldnt work - both:right_remains AAAAAATTTTTTAAAAAA + `___^_^_`_`_``^_^X @left_tail:shouldnt work - right_tail:shouldnt work - both:left_remains-left:10bp TGGAAAAAAAAAAGCCCCAG + BSSMNXUTVX``[````\`_ @left_tail:shouldnt work - right_tail:shouldnt work - both:left_remains-left:Discarded_9bp TGGAAAAAAAAAAAGCCCCAG + BSSMNXUTVX``[````\`__ flexbar_v2.4_src/test/test.csfastq0000640000651600065100000000254211471257472020545 0ustar jroehrbioinformatics@discarded, uncalled T00.30110022333302.00303331113120113 + &$''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ @AA do nothing T00000000000000000000000000000000000 + &$''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ @AB read should start with 1 in alignment T01000000003303011033000220000000330 + &$''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ @AC both:right part remains T00000010230313120323001021310330101 + &$''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ @AD right:5 bases match(end with 0201) - left:discarded T00000133023300002022321330201030313 + &$''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ @AE right: map, empty read remains - left:removed T0303131332223233302010300133012011 + $''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ @AF right: adapter not aligned, reported - left: partyally maps T0031310002010202002010321110100233 + $''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ @AG right: adapter not aligned, reported - left: partyally maps,10bp T0031310020222221 + %%*%#$.''$#/5(-$ @AH right: adapter not aligned, reported - left: partyally maps,9bp/discarded T003131002022222 + %*%#$.''$#/5(-$ @AI right: 
10bp, left: empty read T02002110223303131 + -%%*%#$.''$#/5(-$ @AJ right: 9bp, left: empty read T0230021220303131 + %%*%#$.''$#/5(-$ @AL right:match - left:match,(discarded, ends with 1) T000221211102210012233020130313101 + ''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ @AM right:match partly T000002222001101220112133020103031 + ''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ flexbar_v2.4_src/test/test.csfasta0000640000651600065100000000170611471257472020526 0ustar jroehrbioinformatics>discarded, uncalled T00.30110022333302.00303331113120113 >AA do nothing T00000000000000000000000000000000000 >AB read should start with 1 in alignment T01000000003303011033000220000000330 >AC both:right part remains T00000010230313120323001021310330101 >AD right:5 bases match(end with 0201) - left:discarded T00000133023300002022321330201030313 >AE right: map, empty read remains - left:removed T0303131332223233302010300133012011 >AF right: adapter not aligned, reported - left: partyally maps T0031310002010202002010321110100233 >AG right: adapter not aligned, reported - left: partyally maps,10bp T0031310020222221 >AH right: adapter not aligned, reported - left: partyally maps,9bp/discarded T003131002022222 >AI right: 10bp, left: empty read T02002110223303131 >AJ right: 9bp, left: empty read T0230021220303131 >AL right:match - left:match,(discarded, ends with 1) T000221211102210012233020130313101 >AM right:match partly T000002222001101220112133020103031 flexbar_v2.4_src/test/correct_result_any.fasta0000640000651600065100000000237212150154574023121 0ustar jroehrbioinformatics>left_tail:shouldnt work - right_tail:shouldnt work - rigth:end with ATGC - left:discarded(C) TGAGATCGTTCAGTACGGCAATCGTATGC >left_tail:shouldnt work - right_tail:end with ATGC - right:end_with_ATGC - left:discarded_empty_read(N) TGAGATCGTTCAGTACGGCAATTCGTATGC >left_tail:shouldnt work - right_tail:shouldnt work - right:should_end_with_TGC-left:Discarded(T) TGAGATCGTTCAGTACGGCAATCGTATGC >left_tail:shouldnt work - right_tail:end with CAC - 
right:end_with_A_left:discarded_empty_read(N) TCACCGGGTGGAAACTAGCCCCCCCCCCCAC >left:begin_with_G-left_tail:should_work(begin with G) - right_tail:shouldnt work - right:discarded GAAAAAAACCCCCCCCCCTTTTTTTTTTTT >left:begin_with_G-left_tail:should_work(begin with A) - right_tail:shouldnt work - right:discarded AAAAAAACCCCCCCCCCTTTTTTTTTTTTTT >left_tail:sholdnt_work - right_tail:shouldnt work - right:10bp_remain CATTATACAGAACACAGCAT >left_tail:shouldnt work - right_tail:shouldnt work - right:discarded CATTATACAGAACACAGCAT >left_tail:shouldnt work - right_tail:shouldnt work - both:right_remains AAAAAATTTTTTAAAAAA >left_tail:shouldnt work - right_tail:shouldnt work - both:left_remains-left:10bp TGGAAAAAAAAAAGCCCCAG >left_tail:shouldnt work - right_tail:shouldnt work - both:left_remains-left:Discarded_9bp TGGAAAAAAAAAAAGCCCCAG flexbar_v2.4_src/test/test.fasta.gz0000640000651600065100000000060412171336507020606 0ustar jroehrbioinformaticsbQtest.fasta;o0w~HXHW-w P D(å摨ʵ}ͱPB%Ey}-J[|=iS>?XV*'Ū}OLf!N[ʼn ajftU"y/$ s4?qhV:8**+|SW9|]kXnBs݇rĩ<e)\Yban5>oz  c A/(q}Xnww'zIAMn|yN/sRTjKh|aacNitO' 6 ٙBwcom;JN=E1A">flexbar_v2.4_src/test/test.fasta.bz20000640000651600065100000000065212171336507020666 0ustar jroehrbioinformaticsBZh91AY&SY~߀`f`1,ޠ@XDC 4hIH @hM 1% j4 4z +I 'HMlYkEa1j|%WlF BqC7N@čhSkgv)Q t)"v)DRKe mVp$ H hi]'<@p0\&* Pj M0.T ŢZȴCVLHNة.HRj t &C%7MK@%lڜ0 PuuRՏ8 QX5T7+4H ԸJje2%!0ch$j8!X.7`u1 jA946U/ zOw$S flexbar_v2.4_src/test/test.fastq0000640000651600065100000000401311471257472020212 0ustar jroehrbioinformatics@left_tail:shouldnt work - right_tail:shouldnt work - rigth:end with ATGC - left:discarded(C) TGAGATCGTTCAGTACGGCAATCGTATGCCGTCTTC + BSSMNXUTVX``[````\`___^_^_`_`_``^_^X @left_tail:shouldnt work - right_tail:end with ATGC - right:end_with_ATGC - left:discarded_empty_read(N) TGAGATCGTTCAGTACGGCAATTCGTATGCCGTCTT + BSSMNXUTVX``[````\`___^_^_`_`_``^_^X @left_tail:shouldnt work - right_tail:shouldnt work - right:should_end_with_TGC-left:Discarded(T) 
TGAGATCGTTCAGTACGGCAATCGTATGCCGTCTTT + BSSMNXUTVX``[````\`___^_^_`_`_``^_^X @left_tail:shouldnt work - right_tail:end with CAC - right:end_with_A_left:discarded_empty_read(N) TCACCGGGTGGAAACTAGCCCCCCCCCCCACCGTCT + BSSMNXUTVX``[````\`___^_^_`_`_``^_^X @left:begin_with_G-left_tail:should_work(begin with G) - right_tail:shouldnt work - right:discarded CGTCTTGAAAAAAACCCCCCCCCCTTTTTTTTTTTT + BSSMNXUTVX``[````\`___^_^_`_`_``^_^X @left:begin_with_G-left_tail:should_work(begin with A) - right_tail:shouldnt work - right:discarded GTCTTAAAAAAACCCCCCCCCCTTTTTTTTTTTTTT + BSSMNXUTVX``[````\`___^_^_`_`_``^_^X @left:begin_with_G, discarded - left_tail:should_work,but discarded! - right_tail:shouldnt work - right:discarded TCTTGAAAAAAAA + BSSMNXUTVX``[ @left:should_work right:discarded! - right_tail:works,discarded AAAAAAAACGTCTT + BSSMNXUTVX``[` @left_tail:sholdnt_work - right_tail:shouldnt work - right:10bp_remain AAAAAAAAAACGTCTTCATTATACAGAACACAGCAT + BSSMNXUTVX``[````\`___^_^_`_`_``^_^X @left_tail:shouldnt work - right_tail:shouldnt work - right:discarded AAAAAAAAACGTCTTCATTATACAGAACACAGCAT + BSSMNXUTVX``[````\`___^_^_`_`_``^_^ @left_tail:shouldnt work - right_tail:shouldnt work - both:right_remains TGGAAGCCCCAGCGTCTTAAAAAATTTTTTAAAAAA + BSSMNXUTVX``[````\`___^_^_`_`_``^_^X @left_tail:shouldnt work - right_tail:shouldnt work - both:left_remains-left:10bp TGGAAAAAAAAAAGCCCCAGCGTCTTAAAAAATTTT + BSSMNXUTVX``[````\`___^_^_`_`_``^_^X @left_tail:shouldnt work - right_tail:shouldnt work - both:left_remains-left:Discarded_9bp TGGAAAAAAAAAAAGCCCCAGCGTCTTAAAAATTTT + BSSMNXUTVX``[````\`___^_^_`_`_``^_^X flexbar_v2.4_src/test/barcodes.fasta0000640000651600065100000000004512057343516020772 0ustar jroehrbioinformatics>Barcode1 AAAAAAA >Barcode2 TCGTTCAG flexbar_v2.4_src/test/correct_result_left_tail.fastq0000640000651600065100000000354512150154574024320 0ustar jroehrbioinformatics@left_tail:shouldnt work - right_tail:shouldnt work - rigth:end with ATGC - left:discarded(C) 
TGAGATCGTTCAGTACGGCAATCGTATGCCGTCTTC + BSSMNXUTVX``[````\`___^_^_`_`_``^_^X @left_tail:shouldnt work - right_tail:end with ATGC - right:end_with_ATGC - left:discarded_empty_read(N) TGAGATCGTTCAGTACGGCAATTCGTATGCCGTCTT + BSSMNXUTVX``[````\`___^_^_`_`_``^_^X @left_tail:shouldnt work - right_tail:shouldnt work - right:should_end_with_TGC-left:Discarded(T) TGAGATCGTTCAGTACGGCAATCGTATGCCGTCTTT + BSSMNXUTVX``[````\`___^_^_`_`_``^_^X @left_tail:shouldnt work - right_tail:end with CAC - right:end_with_A_left:discarded_empty_read(N) TCACCGGGTGGAAACTAGCCCCCCCCCCCACCGTCT + BSSMNXUTVX``[````\`___^_^_`_`_``^_^X @left:begin_with_G-left_tail:should_work(begin with G) - right_tail:shouldnt work - right:discarded GAAAAAAACCCCCCCCCCTTTTTTTTTTTT + UTVX``[````\`___^_^_`_`_``^_^X @left:begin_with_G-left_tail:should_work(begin with A) - right_tail:shouldnt work - right:discarded AAAAAAACCCCCCCCCCTTTTTTTTTTTTTT + XUTVX``[````\`___^_^_`_`_``^_^X @left:should_work right:discarded! - right_tail:works,discarded AAAAAAAACGTCTT + BSSMNXUTVX``[` @left_tail:sholdnt_work - right_tail:shouldnt work - right:10bp_remain AAAAAAAAAACGTCTTCATTATACAGAACACAGCAT + BSSMNXUTVX``[````\`___^_^_`_`_``^_^X @left_tail:shouldnt work - right_tail:shouldnt work - right:discarded AAAAAAAAACGTCTTCATTATACAGAACACAGCAT + BSSMNXUTVX``[````\`___^_^_`_`_``^_^ @left_tail:shouldnt work - right_tail:shouldnt work - both:right_remains TGGAAGCCCCAGCGTCTTAAAAAATTTTTTAAAAAA + BSSMNXUTVX``[````\`___^_^_`_`_``^_^X @left_tail:shouldnt work - right_tail:shouldnt work - both:left_remains-left:10bp TGGAAAAAAAAAAGCCCCAGCGTCTTAAAAAATTTT + BSSMNXUTVX``[````\`___^_^_`_`_``^_^X @left_tail:shouldnt work - right_tail:shouldnt work - both:left_remains-left:Discarded_9bp TGGAAAAAAAAAAAGCCCCAGCGTCTTAAAAATTTT + BSSMNXUTVX``[````\`___^_^_`_`_``^_^X flexbar_v2.4_src/test/correct_result_any.csfastq0000640000651600065100000000172112170341604023456 0ustar jroehrbioinformatics@AA do nothing T00000000000000000000000000000000000 + 
&$''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ @AB read should start with 1 in alignment T01000000003303011033000220000000330 + &$''44(#/5(&&&%-$7-%%*%#$.''$#/5(-$ @AC both:right part remains T00323001021310330101 + &$7-%%*%#$.''$#/5(-$ @AD right:5 bases match(end with 0201) - left:discarded T00000133023300002022321330201 + &$''44(#/5(&&&%-$7-%%*%#$.''$ @AE right: map, empty read remains - left:removed T032223233302010300133012011 + $5(&&&%-$7-%%*%#$.''$#/5(-$ @AF right: adapter not aligned, reported - left: partyally maps T0002010202002010321110100233 + $/5(&&&%-$7-%%*%#$.''$#/5(-$ @AG right: adapter not aligned, reported - left: partyally maps,10bp T0020222221 + %''$#/5(-$ @AI right: 10bp, left: empty read T0200211022 + -%%*%#$.'' @AL right:match - left:match,(discarded, ends with 1) T000221211102210012233020 + ''44(#/5(&&&%-$7-%%*%#$. @AM right:match partly T0000022220011012201121330201 + ''44(#/5(&&&%-$7-%%*%#$.''$# flexbar_v2.4_src/src/0000751000651600065100000000000012175474107016003 5ustar jroehrbioinformaticsflexbar_v2.4_src/src/CMakeLists.txt0000640000651600065100000000132012175474107020537 0ustar jroehrbioinformaticscmake_minimum_required( VERSION 2.8.0 ) # include_directories( ${FLEXBAR_SOURCE_DIR}/include ) # link_directories( ${FLEXBAR_SOURCE_DIR}/lib ) add_executable( flexbar Flexbar.cpp ) target_link_libraries( flexbar tbb ) find_package( ZLIB ) if( ZLIB_FOUND ) include_directories( ${ZLIB_INCLUDE_DIRS} ) target_link_libraries( flexbar ${ZLIB_LIBRARIES} ) add_definitions( "-DSEQAN_HAS_ZLIB=1" ) else() message( STATUS "Build will not support zlib!" ) endif() find_package( BZip2 ) if( BZIP2_FOUND ) include_directories( ${BZIP2_INCLUDE_DIRS} ) target_link_libraries( flexbar ${BZIP2_LIBRARIES} ) add_definitions( "-DSEQAN_HAS_BZIP2=1" ) else() message( STATUS "Build will not support bzip2!" 
) endif() flexbar_v2.4_src/src/AlignmentFilter.h0000640000651600065100000002743312175327143021246 0ustar jroehrbioinformatics/* * AlignmentFilter.h * * Authors: mat and jtr */ #ifndef FLEXBAR_ALIGNMENTFILTER_H_ #define FLEXBAR_ALIGNMENTFILTER_H_ #include #include #include #include #include "Enums.h" #include "SequencingRead.h" #include "AdapterLoader.h" /* Performs alignment via passed Algorithm. Aligns each adapter or barcode (queries) to read. The one with the highest score is used. */ template class AlignmentFilter { private: const flexbar::TrimEnd m_trimEnd; const flexbar::LogLevel m_verb; const flexbar::FileFormat m_format; const bool m_isBarcoding, m_writeTag, m_randTag; const int m_minLength, m_minOverlap, m_tailLength; const float m_threshold; tbb::atomic m_nPreShortReads, m_modified; tbb::concurrent_vector *m_queries; tbb::concurrent_vector *m_rmOverlaps; std::ostream *m_out; TAlgorithm *algo; public: AlignmentFilter(tbb::concurrent_vector *queries, const Options &o, int minOverlap, float threshold, const int tailLength, const int match, const int mismatch, const int gapCost, const flexbar::TrimEnd end, const bool isBarcoding): m_minOverlap(minOverlap), m_threshold(threshold), m_tailLength(tailLength), m_trimEnd(end), m_isBarcoding(isBarcoding), m_randTag(o.randTag), m_minLength(o.min_readLen), m_verb(o.logLevel), m_format(o.format), m_writeTag(o.useRemovalTag), m_out(o.out){ m_queries = queries; m_nPreShortReads = 0; m_modified = 0; algo = new TAlgorithm(o, match, mismatch, gapCost, m_trimEnd); m_rmOverlaps = new tbb::concurrent_vector(flexbar::MAX_READLENGTH + 1, 0); }; virtual ~AlignmentFilter(){ delete algo; delete m_rmOverlaps; }; // aligns all query sequences to read and optionally removes best one int align(void* item, const bool performRemoval){ using namespace std; using namespace flexbar; using seqan::prefix; using seqan::suffix; using seqan::infix; SequencingRead &myRead = *static_cast< SequencingRead* >(item); int fmismatches, fgapsR, fgapsA, 
foverlapLength, fqueryLength, ftailLength; int fstartPos, fstartPosA, fstartPosS, fendPos, fendPosS, fendPosA; int qIndex = -1; int scoreMax = -1000000; float fallowedErrors; stringstream ss; TString read, quality, finalAliStr, finalRandTag; TString readTag = myRead.getSequenceTag(); switch(m_format){ case CSFASTQ: read = suffix(myRead.getSequence(), 2); quality = suffix(myRead.getQuality(), 1); break; case FASTQ: read = myRead.getSequence(); quality = myRead.getQuality(); break; case CSFASTA: read = suffix(myRead.getSequence(), 2); quality = ""; break; case FASTA: read = myRead.getSequence(); quality = ""; break; } TString sequence = read; int readLength = length(read); if(! m_isBarcoding && readLength < m_minLength){ ++m_nPreShortReads; return ++qIndex; } // align each query sequence and keep track of best one for(unsigned int i = 0; i < m_queries->size(); ++i){ TString query = m_queries->at(i).first->getSequence(); int queryLength = length(query); int tailLength = (m_tailLength > 0) ? m_tailLength : queryLength; // align only read tail in tail modes if(m_trimEnd == LEFT_TAIL || m_trimEnd == RIGHT_TAIL){ if(tailLength < readLength){ if(m_trimEnd == LEFT_TAIL){ sequence = prefix(read, tailLength); }else{ sequence = suffix(read, readLength - tailLength); } if(m_verb == ALL || m_verb == MOD) ss << "Read tail length: " << tailLength << "\n\n"; } } int startPos = 0, endPos = 0, startPosA = 0, endPosA = 0, startPosS = 0, endPosS = 0; int aliScore = 0, mismatches = 0, gapsR = 0, gapsA = 0; TString randTag = ""; stringstream aliString; // align query with specified algorithm algo->align(query, sequence, gapsR, gapsA, mismatches, startPos, endPos, startPosA, endPosA, startPosS, endPosS, aliScore, aliString, randTag); int overlapLength = endPos - startPos; float allowedErrors = m_threshold * overlapLength / 10.0f; float madeErrors = static_cast(mismatches + gapsR + gapsA); int minOverlapValue = (m_isBarcoding && m_minOverlap == 0) ? 
queryLength : m_minOverlap; bool validAli = true; if(((m_trimEnd == RIGHT_TAIL || m_trimEnd == RIGHT) && startPosA < startPosS) || ((m_trimEnd == LEFT_TAIL || m_trimEnd == LEFT) && endPosA > endPosS) || overlapLength < 1){ validAli = false; } // check if alignment is valid and score is max as well as if number of errors and overlap length are allowed if(validAli && aliScore > scoreMax && madeErrors <= allowedErrors && overlapLength >= minOverlapValue){ qIndex = i; scoreMax = aliScore; fstartPos = startPos; fstartPosA = startPosA; fstartPosS = startPosS; fendPos = endPos; fendPosA = endPosA; fendPosS = endPosS; fgapsR = gapsR; fgapsA = gapsA; finalRandTag = randTag; ftailLength = tailLength; foverlapLength = overlapLength; fqueryLength = queryLength; if(m_verb != NONE){ fmismatches = mismatches; finalAliStr = aliString.str(); fallowedErrors = allowedErrors; } } } // valid alignment if(qIndex >= 0){ TrimEnd trimEnd = m_trimEnd; // cut read according to best alignment if(performRemoval){ if(trimEnd == ANY){ if(fstartPosA <= fstartPosS && fendPosS <= fendPosA){ myRead.setSequence(""); if(m_format == FASTQ || m_format == CSFASTQ) myRead.setQuality(""); } else if(fstartPosA - fstartPosS >= fendPosS - fendPosA){ trimEnd = RIGHT; } else{ trimEnd = LEFT; } } switch(trimEnd){ int rCutPos; case LEFT_TAIL: sequence = read; case LEFT: rCutPos = fendPos; // translate alignment end pos to read idx if(fstartPosS > 0) rCutPos -= fstartPosS; // adjust to inner read gaps rCutPos -= fgapsR; if(rCutPos > readLength) rCutPos = readLength; if(m_format == FASTA || m_format == FASTQ){ erase(sequence, 0, rCutPos); myRead.setSequence(sequence); if(m_format == FASTQ){ erase(quality, 0, rCutPos); myRead.setQuality(quality); } } else { // colorspace if(rCutPos < readLength) ++rCutPos; erase(sequence, 0, rCutPos); insert(sequence, 0, prefix(myRead.getSequence(), 2)); myRead.setSequence(sequence); if(m_format == CSFASTQ){ erase(quality, 0, rCutPos); insert(quality, 0, prefix(myRead.getQuality(), 
1)); myRead.setQuality(quality); } } break; case RIGHT_TAIL: sequence = read; // adjust cut pos to original read length fstartPos += readLength - ftailLength; case RIGHT: rCutPos = fstartPos; if(m_format == FASTA || m_format == FASTQ){ erase(sequence, rCutPos, readLength); myRead.setSequence(sequence); if(m_format == FASTQ){ erase(quality, rCutPos, readLength); myRead.setQuality(quality); } } else { // colorspace if(rCutPos > 0) --rCutPos; erase(sequence, rCutPos, readLength); insert(sequence, 0, prefix(myRead.getSequence(), 2)); myRead.setSequence(sequence); if(m_format == CSFASTQ){ erase(quality, rCutPos, readLength); insert(quality, 0, prefix(myRead.getQuality(), 1)); myRead.setQuality(quality); } } break; case ANY:; } ++m_modified; // count for each query number of removals m_queries->at(qIndex).second.first++; if(foverlapLength == fqueryLength){ m_queries->at(qIndex).second.second++; } if(m_writeTag){ TString newTag = myRead.getSequenceTag(); append(newTag, "_Flexbar_removal"); myRead.setSequenceTag(newTag); } // store overlap occurrences for min, max, mean and median if(foverlapLength <= MAX_READLENGTH) m_rmOverlaps->at(foverlapLength)++; else cerr << "\nCompile Flexbar with larger max read length to get correct overlap stats.\n" << endl; } // valid alignment, not neccesarily removal if(m_randTag && finalRandTag != ""){ TString newTag = myRead.getSequenceTag(); append(newTag, "_"); append(newTag, finalRandTag); myRead.setSequenceTag(newTag); } // alignment stats TString queryTag = m_queries->at(qIndex).first->getSequenceTag(); if(m_verb == ALL || (m_verb == MOD && performRemoval)){ if(performRemoval){ ss << "Sequence removal:"; if(trimEnd == LEFT || trimEnd == LEFT_TAIL) ss << " left side\n"; else if(trimEnd == RIGHT || trimEnd == RIGHT_TAIL) ss << " right side\n"; else ss << " any side\n"; } else{ ss << "Sequence detection, no removal:\n"; } ss << " query tag " << queryTag << "\n" << " read tag " << readTag << "\n" << " read " << read << "\n" << " read pos " 
<< fstartPosS << "-" << fendPosS << "\n" << " query pos " << fstartPosA << "-" << fendPosA << "\n" << " score " << scoreMax << "\n" << " overlap " << foverlapLength << "\n" << " errors " << fgapsR + fgapsA + fmismatches << "\n" << " allowed errors " << fallowedErrors << "\n"; if(performRemoval){ ss << " remaining read " << myRead.getSequence() << "\n"; if(m_format == FASTQ || m_format == CSFASTQ) ss << " remaining qual " << myRead.getQuality() << "\n"; } ss << "\n Alignment:\n" << endl << finalAliStr; } else if(m_verb == TAB){ ss << readTag << "\t" << queryTag << "\t" << fstartPosA << "\t" << fendPosA << "\t" << foverlapLength << "\t" << fmismatches << "\t" << fgapsR + fgapsA << "\t" << fallowedErrors << endl; } } else if(m_verb == ALL){ ss << "No valid alignment:" << "\n" << "read tag " << readTag << "\n" << "read " << read << "\n\n" << endl; } // bundeled output for multi-threading if(m_verb != NONE) *m_out << ss.str(); return ++qIndex; } std::string getOverlapStatsString(){ unsigned long nValues = 0, halfValues = 0, cumValues = 0, lenSum = 0; int min = 1000000, max = 0, median = 0, mean = 0; for (int i = 0; i <= flexbar::MAX_READLENGTH; ++i){ unsigned long lenCount = m_rmOverlaps->at(i); if(lenCount > 0 && i < min) min = i; if(lenCount > 0 && i > max) max = i; nValues += lenCount; lenSum += lenCount * i; } halfValues = nValues / 2; for (int i = 0; i <= flexbar::MAX_READLENGTH; ++i){ cumValues += m_rmOverlaps->at(i); if(cumValues >= halfValues){ median = i; break; } } if(m_modified > 0) mean = lenSum / m_modified; std::stringstream ss; ss << "Min, max, mean and median adapter overlap: "; ss << min << " / " << max << " / " << mean << " / " << median; return ss.str(); } unsigned long getNrPreShortReads() const { return m_nPreShortReads; } unsigned long getNrModifiedReads() const { return m_modified; } }; #endif /* FLEXBAR_ALIGNMENTFILTER_H_ */ flexbar_v2.4_src/src/FlexbarIO.h0000640000651600065100000001043412175327143017766 0ustar jroehrbioinformatics/* * 
FlexbarIO.h * * Author: jtr */ #ifndef FLEXBAR_FLEXBARIO_H_ #define FLEXBAR_FLEXBARIO_H_ #include #include #include #include #include #include #include #include #include "Enums.h" #if SEQAN_HAS_ZLIB #include #endif #if SEQAN_HAS_BZIP2 #include #endif void openInputFile(std::fstream &strm, std::string path){ using namespace std; strm.open(path.c_str(), ios::in | ios::binary); if(! strm.good()){ cerr << "Error opening file: " << path << "\n" << endl; exit(1); } } void openOutputFile(std::fstream &strm, std::string path){ using namespace std; strm.open(path.c_str(), ios::out | ios::binary); if(! strm.good()){ cerr << "Error opening file: " << path << "\n" << endl; exit(1); } } void closeFile(std::fstream &strm){ strm.close(); } // void openInputFile(std::istream &strm, std::string path){} // void closeFile(std::istream &strm){} #if SEQAN_HAS_ZLIB void openInputFile(seqan::Stream &strm, std::string path){ using namespace std; // if(path == "-.gz") path = "-"; if(! open(strm, path.c_str(), "rb")){ cerr << "Error opening gzip file: " << path << "\n" << endl; exit(1); } } void openOutputFile(seqan::Stream &strm, std::string path){ using namespace std; // bool ok; // // if(path == "-") ok = open(strm, path.c_str(), "w"); // else ok = open(strm, path.c_str(), "wb"); if(! open(strm, path.c_str(), "wb")){ cerr << "Error opening gzip file: " << path << "\n" << endl; exit(1); } } void closeFile(seqan::Stream &strm){} #endif #if SEQAN_HAS_BZIP2 void openInputFile(seqan::Stream &strm, std::string path){ using namespace std; // if(path == "-.bz2") path = "-"; if(! open(strm, path.c_str(), "rb")){ cerr << "Error opening bz2 file: " << path << "\n" << endl; exit(1); } } void openOutputFile(seqan::Stream &strm, std::string path){ using namespace std; if(! 
open(strm, path.c_str(), "wb")){ cerr << "Error opening bz2 file: " << path << "\n" << endl; exit(1); } } void closeFile(seqan::Stream &strm){} #endif void checkFileCompression(std::string path, flexbar::CompressionType &cmprsType){ using namespace std; using namespace flexbar; using seqan::CharString; using seqan::suffix; using seqan::length; cmprsType = UNCOMPRESSED; if(length(path) > 3){ CharString ending = suffix(path, length(path) - 3); if(ending == ".gz"){ #if SEQAN_HAS_ZLIB cmprsType = GZ; #else cerr << "Input file decompression canceled.\n"; cerr << "This build does not support zlib!\n" << endl; exit(1); #endif } else if(length(path) > 4){ ending = suffix(path, length(path) - 4); if(ending == ".bz2"){ #if SEQAN_HAS_BZIP2 cmprsType = BZ2; #else cerr << "Input file decompression canceled.\n"; cerr << "This build does not support bzip2!\n" << endl; exit(1); #endif } } } } template void checkInputType(std::string path, flexbar::FileFormat &format){ using namespace std; using namespace flexbar; char c; if(path == "-"){ if(cin) c = cin.peek(); else{ cerr << "Standard input reading error.\n" << endl; exit(1); } } else{ TStream fstrm; openInputFile(fstrm, path); // streamPeek(c, fstrm); seqan::RecordReader > reader(fstrm); if(! 
atEnd(reader)){ c = value(reader); // seqan::CharString text; // if(readDigits(text, reader) != 0){ // cerr << "File reading error occured.\n" << endl; // exit(1); }; cout << text << endl; } else{ cerr << "Reads file seems to be empty.\n" << endl; exit(1); } closeFile(fstrm); } if(c == '>') format = FASTA; else if(c == '@') format = FASTQ; else{ cerr << "Reads file type not conform.\n"; cerr << "Neither fasta nor fastq header.\n" << endl; exit(1); } } std::string toFormatString(flexbar::FileFormat format){ using namespace flexbar; switch(format){ case FASTA: return ".fasta"; case FASTQ: return ".fastq"; case CSFASTA: return ".csfasta"; case CSFASTQ: return ".csfastq"; } return ".unknown"; } #endif /* FLEXBAR_FLEXBARIO_H_ */ flexbar_v2.4_src/src/Flexbar.h0000640000651600065100000002754412175457453017560 0ustar jroehrbioinformatics/* * Flexbar.h * * Author: jtr */ #ifndef FLEXBAR_FLEXBAR_H_ #define FLEXBAR_FLEXBAR_H_ #include #include #include #include #include #include #include #include #include #include #include "Enums.h" #include "Options.h" #include "FlexbarIO.h" #include "AdapterLoader.h" #include "SequencingRead.h" #include "SequenceConverter.h" #include "SequenceInputFilter.h" #include "MultiplexedInputFilter.h" #include "MultiplexedOutputFilter.h" #include "MultiplexedAlignmentFilter.h" void loadBarcodesAndAdapters(Options &o){ using namespace std; using namespace flexbar; using seqan::CharString; if(o.barDetect != BOFF){ tbb::task_scheduler_init init_serial(1); tbb::pipeline bpipeline; SequenceInputFilter adapter_filter(o, o.barcodeFile, true, false, false); bpipeline.add_filter(adapter_filter); AdapterLoader adapterLoader(o); bpipeline.add_filter(adapterLoader); bpipeline.run(1); o.barcodes = adapterLoader.getAdapters(); adapterLoader.printAdapters("Barcode"); if(o.barcodes.size() == 0){ cerr << "No barcodes found in file!\n" << endl; exit(1); } } if(o.adapRm != AOFF){ AdapterLoader adapterLoader(o); if(o.useAdapterFile){ tbb::task_scheduler_init 
init_serial(1); tbb::pipeline prepipeline; SequenceInputFilter adapter_filter(o, o.adapterFile, true, false, false); prepipeline.add_filter(adapter_filter); prepipeline.add_filter(adapterLoader); prepipeline.run(1); o.adapters = adapterLoader.getAdapters(); if(o.adapters.size() == 0){ cerr << "No adapters found in file!\n" << endl; exit(1); } } else { SequencingRead *myRead; CharString adapterSeq = o.adapterSeq; if(o.format == flexbar::CSFASTA || o.format == flexbar::CSFASTQ){ adapterSeq = SequenceConverter::getInstance()->bpToColorSpace(adapterSeq); } myRead = new SequencingRead(adapterSeq, "cmdline"); TAdapter adap; adap.first = myRead; o.adapters.push_back(adap); adapterLoader.setAdapters(o.adapters); } adapterLoader.printAdapters("Adapter"); } } void printComputationTime(Options &o, const time_t start){ using namespace std; time_t end; time(&end); int totalTime = int(difftime(end, start)); int hours = div(totalTime, 3600).quot; int rest = div(totalTime, 3600).rem; int minutes = div(rest, 60).quot; int seconds = div(rest, 60).rem; ostream *out = o.out; *out << "Computation time: "; if(hours > 0) *out << hours << " h "; if(hours > 0 || minutes > 0) *out << minutes << " min "; if(hours > 0 || minutes > 0 || seconds > 0) *out << seconds << " sec\n\n\n"; else *out << "< 1 sec\n\n\n"; } std::string alignValue(const int refLength, const unsigned long value){ using namespace std; stringstream s; s << value; int wSpaceLen = refLength - s.str().length(); if(wSpaceLen < 0) wSpaceLen = 0; return string(wSpaceLen, ' ') + s.str(); } void printCompletedMessage(Options &o){ using namespace std; using namespace flexbar; stringstream s; s << "Flexbar completed "; if(o.barDetect != BOFF) s << "barcode"; if(o.barDetect == WITHIN_READ_REMOVAL) s << " removal within reads"; if(o.barDetect == WITHIN_READ) s << " detection within reads"; if(o.barDetect == BARCODE_READ) s << " detection with separate reads"; if(o.barDetect != BOFF && o.adapRm != AOFF) s << " and "; if(o.barDetect == 
BOFF && o.adapRm == AOFF) s << "basic processing"; if(o.adapRm != AOFF) s << "adapter removal"; *o.out << s.str() << ".\n" << endl; if(o.useStdout) closeFile(o.fstrmOut); } template void startProcessing(Options &o){ using namespace std; using namespace flexbar; typedef seqan::CharString TString; typedef seqan::CharString TIDString; time_t start; time(&start); ostream *out = o.out; *out << "\nProcessing reads ..." << flush; if(o.logLevel != NONE) *out << "\n\nLog level " << o.logLevelStr << " output generation:\n\n" << endl; MultiplexedInputFilter inputFilter(o); MultiplexedAlignmentFilter alignFilter(o); MultiplexedOutputFilter outputFilter(o); tbb::task_scheduler_init init_serial(o.nThreads); tbb::pipeline pipe; pipe.add_filter(inputFilter); pipe.add_filter(alignFilter); pipe.add_filter(outputFilter); pipe.run(o.nThreads); if(o.logLevel == TAB) *out << "\n"; *out << "done.\n" << endl; printComputationTime(o, start); // barcode and adapter removal statistics if(o.writeLengthDist) outputFilter.writeLengthDist(); if(o.adapRm != AOFF){ outputFilter.printAdapterRemovalStats(); alignFilter.printAdapterOverlapStats(); } outputFilter.printFileSummary(); const unsigned long nReads = inputFilter.getNrProcessedReads(); const unsigned long nGoodReads = outputFilter.getNrGoodReads(); const unsigned long uncalled = inputFilter.getNrUncalledReads(); const unsigned long uPairs = inputFilter.getNrUncalledPairedReads(); stringstream s; s << nReads; int len = s.str().length(); *out << "Filtering statistics\n"; *out << "====================\n"; *out << "Processed reads " << nReads << endl; *out << " skipped due to uncalled bases "; if(o.isPaired){ *out << alignValue(len, 2 * uPairs); if(uncalled > 0) *out << " (" << uncalled << " uncalled in " << uPairs << " pairs)"; *out << endl; } else *out << alignValue(len, uncalled) << endl; if(o.phred_preQual > 0) *out << " trimmed due to low quality " << alignValue(len, inputFilter.getNrLowPhredReads()) << endl; if(o.barDetect != BOFF && ! 
o.writeUnassigned) *out << " skipped unassigned reads " << alignValue(len, alignFilter.getNrUnassignedReads()) << endl; if(o.adapRm != AOFF) *out << " short prior adapter removal " << alignValue(len, alignFilter.getNrPreShortReads()) << endl; *out << " finally skipped short reads " << alignValue(len, outputFilter.getNrShortReads()) << endl; if(o.isPaired && ! o.writeSingleReads) *out << " skipped single paired reads " << alignValue(len, outputFilter.getNrSingleReads()) << endl; *out << "Discarded reads overall " << alignValue(len, nReads - nGoodReads) << endl; *out << "Remaining reads " << alignValue(len, nGoodReads); if(nReads > 0) *out << " (" << fixed << setprecision(2) << 100 * nGoodReads / nReads << "% of input reads)"; *out << "\n\n" << endl; } template void startProcessing(Options &o){ using namespace std; using namespace flexbar; if(o.cmprsType == GZ){ #if SEQAN_HAS_ZLIB startProcessing >(o); #else o.outCompression = ""; o.cmprsType = UNCOMPRESSED; cerr << "Output file compression inactive.\n" << "This build does not support zlib!\n" << endl; #endif } else if(o.cmprsType == BZ2){ #if SEQAN_HAS_BZIP2 startProcessing >(o); #else o.outCompression = ""; o.cmprsType = UNCOMPRESSED; cerr << "Output file compression inactive.\n" << "This build does not support bzip2!\n" << endl; #endif } if(o.cmprsType == UNCOMPRESSED){ startProcessing(o); } } template void startProcessing(Options &o){ using namespace flexbar; CompressionType cmprsType; checkFileCompression(o.barReadsFile, cmprsType); #if SEQAN_HAS_ZLIB if(cmprsType == GZ){ startProcessing >(o); } #endif #if SEQAN_HAS_BZIP2 if(cmprsType == BZ2){ startProcessing >(o); } #endif if(cmprsType == UNCOMPRESSED){ startProcessing(o); } } template void startProcessing(Options &o){ using namespace flexbar; CompressionType cmprsType; checkFileCompression(o.readsFile2, cmprsType); #if SEQAN_HAS_ZLIB if(cmprsType == GZ){ startProcessing >(o); } #endif #if SEQAN_HAS_BZIP2 if(cmprsType == BZ2){ startProcessing >(o); } #endif 
if(cmprsType == UNCOMPRESSED){ startProcessing(o); } } void startProcessing(Options &o, const bool start){ using namespace flexbar; CompressionType cmprsType; checkFileCompression(o.readsFile, cmprsType); #if SEQAN_HAS_ZLIB if(cmprsType == GZ){ if(start) startProcessing >(o); else checkInputType >(o.readsFile, o.format); } #endif #if SEQAN_HAS_BZIP2 if(cmprsType == BZ2){ if(start) startProcessing >(o); else checkInputType >(o.readsFile, o.format); } #endif if(cmprsType == UNCOMPRESSED){ if(start) startProcessing(o); else checkInputType(o.readsFile, o.format); } } void initOptions(Options &o, seqan::ArgumentParser &parser){ using namespace std; bool stdout = isSet(parser, "stdout-reads"); bool paired = isSet(parser, "reads2"); if(stdout && ! paired){ string s; getOptionValue(s, parser, "target"); openOutputFile(o.fstrmOut, s + ".out"); o.out = &o.fstrmOut; o.useStdout = true; *o.out << endl; } else{ o.out = &cout; if(stdout) *o.out << endl; } getOptionValue(o.readsFile, parser, "reads"); startProcessing(o, false); } // #include // #include void performTest(){ using namespace std; using namespace flexbar; using seqan::CharString; // typedef seqan::String > TMMapString; // TMMapString mmapStr; // // if(! open(mmapStr, "test/test.fasta", seqan::OPEN_RDONLY)){ // cout << "Error opening File." 
<< std::endl; // exit(1); // } // seqan::RecordReader > mmReader(mmapStr); // string text2 = ""; // readLine(text2, mmReader); // cout << text2 << endl; // CharString haystack = "ATGGATTGCG", needle = "ATGCAT"; // // seqan::Finder finder(haystack); // seqan::Pattern > pattern(needle, seqan::SimpleScore(0, -1, -7)); // // while (find(finder, pattern, -2)){ // while (findBegin(finder, pattern, getScore(pattern))){ // cout << '[' << beginPosition(finder) << ',' << endPosition(finder) << ")\t" << infix(finder) << endl; // // cout << end(finder) << endl; //',' << position(pattern) << endl; // } } // clear(finder); // seqan::Pattern pattern2(needle, -2); // // //seqan::Score sc(0,-3,-2); // = scoringScheme(pattern2); // //setScoringScheme(pattern2, sc); // // while (find(finder, pattern2)){ // while (findBegin(finder, pattern2, getScore(pattern2))){ // cout << '[' << beginPosition(finder) << ',' << endPosition(finder) << ")\t" << infix(finder) << endl; // } // } // fstream out; // openOutputFile(out, "test.out"); // // ostream &outStrm; // outStrm = out; // *outStrm << "Test output file.\n\n" << endl; } void startComputation(Options &o){ using namespace std; // performTest(); startProcessing(o, true); } #endif /* FLEXBAR_FLEXBAR_H_ */ flexbar_v2.4_src/src/MultiplexedRead.h0000640000651600065100000000141212173312106021226 0ustar jroehrbioinformatics/* * MultiplexedRead.h * * Author: mat */ #ifndef FLEXBAR_MULTIPLEXEDREAD_H_ #define FLEXBAR_MULTIPLEXEDREAD_H_ /* Class represents either a single read or a paired read. In both cases a barcode-read can be also present. 
*/ template class MultiplexedRead { public: typedef SequencingRead TSequencingRead; TSequencingRead *m_r1; TSequencingRead *m_r2; TSequencingRead *m_b; TString m_randTag; int m_barcode_id; MultiplexedRead(TSequencingRead *r1, TSequencingRead *r2, TSequencingRead *b) : m_r1(r1), m_r2(r2), m_b(b), m_barcode_id(0), m_randTag(""){ }; virtual ~MultiplexedRead(){ delete m_r1; delete m_r2; delete m_b; }; }; #endif /* FLEXBAR_MULTIPLEXEDREAD_H_ */ flexbar_v2.4_src/src/Enums.h0000640000651600065100000000141712171617400017235 0ustar jroehrbioinformatics/* * Enums.h * * Authors: mat and jtr */ #ifndef FLEXBAR_ENUMS_H_ #define FLEXBAR_ENUMS_H_ // These enums are used by almost every class. namespace flexbar{ const unsigned int MAX_READLENGTH = 2048; enum LogLevel { NONE, ALL, TAB, MOD }; enum CompressionType { UNCOMPRESSED, GZ, BZ2 }; enum TrimEnd { ANY, LEFT, RIGHT, LEFT_TAIL, RIGHT_TAIL }; enum FileFormat { FASTA, FASTQ, CSFASTA, CSFASTQ }; enum QualityType { SANGER, SOLEXA, ILLUMINA13 }; enum BarcodeDetect { BARCODE_READ, WITHIN_READ, WITHIN_READ_REMOVAL, BOFF }; enum AdapterRemoval { NORMAL, AONE, ATWO, AOFF }; enum RunType { SINGLE, PAIRED, SINGLE_BARCODED, PAIRED_BARCODED }; } #endif /* FLEXBAR_ENUMS_H_ */ flexbar_v2.4_src/src/SequenceConverter.h0000640000651600065100000000305312173312106021601 0ustar jroehrbioinformatics/* * SequenceConverter.h * * Authors: mat and jtr */ #ifndef FLEXBAR_SEQUENCECONVERTER_H_ #define FLEXBAR_SEQUENCECONVERTER_H_ // This class converts sequences from basepair space to colorspace. 
template class SequenceConverter { private: static SequenceConverter* instance; SequenceConverter(){}; public: static SequenceConverter* getInstance(){ if(instance == NULL) instance = new SequenceConverter(); return instance; } TString bpToColorSpace(TString bpSequence){ TString result = ""; TString substr = "XX"; for(size_t i = 1; i < length(bpSequence); ++i){ substr[0] = bpSequence[i - 1]; substr[1] = bpSequence[i]; if(substr=="TT") append(result, "0"); if(substr=="TG") append(result, "1"); if(substr=="TC") append(result, "2"); if(substr=="TA") append(result, "3"); if(substr=="CC") append(result, "0"); if(substr=="CA") append(result, "1"); if(substr=="CT") append(result, "2"); if(substr=="CG") append(result, "3"); if(substr=="GG") append(result, "0"); if(substr=="GT") append(result, "1"); if(substr=="GA") append(result, "2"); if(substr=="GC") append(result, "3"); if(substr=="AA") append(result, "0"); if(substr=="AC") append(result, "1"); if(substr=="AG") append(result, "2"); if(substr=="AT") append(result, "3"); } return result; } virtual ~SequenceConverter(){}; }; template SequenceConverter* SequenceConverter::instance = 0; #endif /* FLEXBAR_SEQUENCECONVERTER_H_ */ flexbar_v2.4_src/src/SequenceInputFilter.h0000640000651600065100000002267412173312106022111 0ustar jroehrbioinformatics/* * SequenceInputFilter.h * * Authors: mat and jtr */ #ifndef FLEXBAR_SEQUENCEINPUTFILTER_H_ #define FLEXBAR_SEQUENCEINPUTFILTER_H_ #include #include #include #include #include #include #include #include #include "Enums.h" #include "Options.h" #include "FlexbarIO.h" #include "SequencingRead.h" // This class reads a (CS)FASTA/Q file and builds instances for each SequencingRead. 
template class SequenceInputFilter : public tbb::filter { private: typedef seqan::RecordReader > TRecordReader; TRecordReader *reader; TStream fstrm; typedef seqan::RecordReader > TRecordReaderCin; TRecordReaderCin *readerCin; // typedef seqan::String > TMMapString; // typedef seqan::RecordReader > TRecordReaderStr; // TRecordReaderStr *strReader; const flexbar::QualityType m_qualType; flexbar::FileFormat m_format; TIDString m_nextTag; const bool m_switch2Fasta, m_preProcess, m_useStdin; const int m_maxUncalled, m_preTrimBegin, m_preTrimEnd, m_prePhredTrim; tbb::atomic m_nrReads, m_nLowPhred; public: SequenceInputFilter(const Options &o, const std::string filePath, const bool fastaFormat, const bool preProcess, const bool useStdin) : filter(serial_in_order), m_preProcess(preProcess), m_useStdin(useStdin), m_qualType(o.qual), m_switch2Fasta(o.switch2Fasta), m_maxUncalled(o.maxUncalled), m_preTrimBegin(o.cutLen_begin), m_preTrimEnd(o.cutLen_end), m_prePhredTrim(o.phred_preQual), m_format(o.format){ m_nextTag = ""; m_nrReads = 0; m_nLowPhred = 0; using namespace std; using namespace flexbar; if(fastaFormat){ m_format = FASTA; } else if(m_switch2Fasta){ if(m_format == FASTA) m_format = FASTQ; if(m_format == CSFASTA) m_format = CSFASTQ; } if(m_useStdin) readerCin = new TRecordReaderCin(cin); else{ openInputFile(fstrm, filePath); reader = new TRecordReader(fstrm); // istream &f = fstrm; } // TMMapString mmapStr; // if(! open(mmapStr, filePath.c_str(), seqan::OPEN_RDONLY)){ // cout << "Error opening File: " << filePath << endl; } // strReader = new TRecordReaderStr(mmapStr); }; virtual ~SequenceInputFilter(){ if(m_useStdin) delete readerCin; else{ delete reader; closeFile(fstrm); } }; unsigned long getNrLowPhredReads() const { return m_nLowPhred; } unsigned long getNrProcessedReads() const { return m_nrReads; } bool atStreamEnd(){ if(m_useStdin) return atEnd(*readerCin); else return atEnd(*reader); } std::string readOneLine(){ using namespace std; string text; if(! 
atStreamEnd()){ if(m_useStdin){ if(readLine(text, *readerCin) != 0){ cerr << "File reading error occured.\n" << endl; exit(1); } } else{ if(readLine(text, *reader) != 0){ cerr << "File reading error occured.\n" << endl; exit(1); } } } else{ text = ""; } return text; } // Core method for reading and parsing FASTA/FASTQ input files. // @return: single SequencingRead or NULL if no more reads in file or error. void* getRead(bool &isUncalled){ using namespace std; using namespace flexbar; using seqan::prefix; using seqan::suffix; using seqan::length; SequencingRead *myRead = NULL; TString source = "", quality = "", dummy = ""; TIDString tag = ""; if(! atStreamEnd()){ isUncalled = false; try{ // FastA parsing if(m_format == FASTA || m_format == CSFASTA){ // tag line is read in previous iteration if(m_nextTag == "") tag = readOneLine(); else tag = m_nextTag; if(length(tag) > 0){ if(getValue(tag, 0) != '>'){ stringstream error; error << "Incorrect FASTA entry, missing > on new line. Input: " << tag << endl; throw runtime_error(error.str()); } else tag = suffix(tag, 1); if(length(tag) == 0){ stringstream error; error << "Incorrect FASTA entry, missing read name after > symbol." << endl; throw runtime_error(error.str()); } } else return NULL; source = readOneLine(); if(length(source) < 1){ stringstream error; error << "Empty FASTA entry, found tag without read! Tag: " << tag << endl; throw runtime_error(error.str()); } m_nextTag = readOneLine(); // fasta files with sequences splitted over several lines while(! 
atStreamEnd() && length(m_nextTag) > 0 && getValue(m_nextTag, 0) != '>'){ append(source, m_nextTag); m_nextTag = readOneLine(); } if(m_preProcess){ isUncalled = isUncalledSequence(source); if(m_preTrimBegin > 0 && length(source) > 3){ int idx = m_preTrimBegin; if(idx >= length(source) - 2) idx = length(source) - 3; if(m_format == FASTA) erase(source, 0, idx); else erase(source, 2, idx + 2); } if(m_preTrimEnd > 0 && length(source) > 3){ int idx = m_preTrimEnd; if(idx >= length(source) - 2) idx = length(source) - 3; source = prefix(source, length(source) - idx); } } myRead = new SequencingRead(source, tag); ++m_nrReads; } // FastQ parsing else{ source = readOneLine(); if(length(source) > 0){ if(getValue(source, 0) != '@'){ stringstream error; error << "Incorrect FASTQ entry, missing @ on new line. Input: " << source << endl; throw runtime_error(error.str()); } else tag = suffix(source, 1); if(length(tag) == 0){ stringstream error; error << "Incorrect FASTQ entry, missing read name after @ symbol." << endl; throw runtime_error(error.str()); } } else return NULL; source = readOneLine(); if(length(source) < 1){ stringstream error; error << "Empty FASTQ entry, found tag without read! Tag: " << tag << endl; throw runtime_error(error.str()); } dummy = readOneLine(); if(length(dummy) == 0 || seqan::isNotEqual(getValue(dummy, 0), '+')){ stringstream error; error << "Incorrect FASTQ entry, missing + line. Tag: " << tag << endl; throw runtime_error(error.str()); } quality = readOneLine(); // in case CSFASTQ format has same quality and read length it will be trimmed if(m_format == CSFASTQ){ if(length(quality) == length(source)){ quality = suffix(quality, 1); } } if(length(quality) < 1){ stringstream error; error << "Empty FASTQ entry, found read without quality values! 
Tag: " << tag << endl; throw runtime_error(error.str()); } if(m_preProcess){ isUncalled = isUncalledSequence(source); if(m_preTrimBegin > 0 && length(source) > 3){ int idx = m_preTrimBegin; if(idx >= length(source) - 2) idx = length(source) - 3; if(m_format == FASTQ){ erase(source, 0, idx); erase(quality, 0, idx); } else{ erase(source, 2, idx + 2); erase(quality, 1, idx + 1); } } if(m_preTrimEnd > 0 && length(source) > 3){ int idx = m_preTrimEnd; if(idx >= length(source) - 2) idx = length(source) - 3; source = prefix(source, length(source) - idx); quality = prefix(quality, length(quality) - idx); } // filtering based on phred quality if(m_prePhredTrim > 0){ typename seqan::Iterator::Type it = seqan::begin(quality); typename seqan::Iterator::Type itEnd = seqan::end(quality); --itEnd; unsigned int n = length(quality); bool nChanged = false; while(itEnd != it){ if(static_cast(*itEnd) >= m_prePhredTrim) break; --n; --itEnd; if(! nChanged){ m_nLowPhred++; nChanged = true; } } source = prefix(source, n); if(m_format == CSFASTQ) --n; quality = prefix(quality, n); } } if(m_switch2Fasta) myRead = new SequencingRead(source, tag); else myRead = new SequencingRead(source, tag, quality); ++m_nrReads; } return myRead; } catch(exception &e){ cerr << "\n\n" << e.what() << "\nProgram execution aborted.\n" << endl; if(m_useStdin) delete readerCin; else{ delete reader; closeFile(fstrm); } exit(1); } } // end of stream else return NULL; } // returns TRUE if read contains too many uncalled bases bool isUncalledSequence(TString source){ int n = 0; typename seqan::Iterator::Type it, itEnd; it = seqan::begin(source); itEnd = seqan::end(source); while(it != itEnd){ if(*it == '.' 
|| *it == 'N') n++; ++it; } return(n > m_maxUncalled); } // override void* operator()(void*){ bool isUncalled = false; return getRead(isUncalled); } }; #endif /* FLEXBAR_SEQUENCEINPUTFILTER_H_ */ flexbar_v2.4_src/src/SequenceOutputFilter.h0000640000651600065100000001047312175327143022315 0ustar jroehrbioinformatics/* * SequenceOutputFilter.h * * Authors: mat and jtr */ #ifndef FLEXBAR_SEQUENCEOUTPUTFILTER_H_ #define FLEXBAR_SEQUENCEOUTPUTFILTER_H_ #include #include #include "Enums.h" #include "FlexbarIO.h" #include "SequencingRead.h" // This class writes sequencing reads in specified format to a file. template class SequenceOutputFilter { private: TStream m_targetStream; const bool m_writeLenDist, m_useStdout; const unsigned int m_minLength, m_cutLen_read; const std::string m_filePath; const TIDString m_tagStr; const flexbar::FileFormat m_format; const flexbar::CompressionType m_cmprsType; tbb::atomic m_countGood; tbb::concurrent_vector *m_lengthDist; public: SequenceOutputFilter(const std::string& filePath, const TIDString tagStr, const Options &o) : m_format(o.format), m_tagStr(tagStr), m_minLength(o.min_readLen), m_cutLen_read(o.cutLen_read), m_writeLenDist(o.writeLengthDist), m_useStdout(o.useStdout), m_cmprsType(o.cmprsType), m_filePath(filePath + o.outCompression){ using namespace flexbar; m_countGood = 0; m_lengthDist = new tbb::concurrent_vector(MAX_READLENGTH + 1, 0); if(! m_useStdout) openOutputFile(m_targetStream, m_filePath); // if(m_useStdout && m_cmprsType != UNCOMPRESSED) openOutputFile(m_targetStream, "-"); // else if(! m_useStdout) openOutputFile(m_targetStream, m_filePath); }; virtual ~SequenceOutputFilter(){ if(! m_useStdout) closeFile(m_targetStream); delete m_lengthDist; }; const std::string getFileName() const { if(! m_useStdout) return m_filePath; else return "stdout"; } void writeLengthDist() const { using namespace std; string fname = m_filePath + ".lengthdist"; fstream lstream; lstream.open(fname.c_str(), ios::out | ios::binary); if(! 
lstream.is_open()){ cerr << "Error opening File: " << fname << "\n"; } else{ lstream << "Readlength\tCount" << "\n"; for (int i = 0; i <= flexbar::MAX_READLENGTH; ++i){ if(m_lengthDist->at(i) > 0) lstream << i << "\t" << m_lengthDist->at(i) << "\n"; } lstream.close(); } } void writeFastString(const SequencingRead& myRead){ using namespace std; using namespace flexbar; seqan::CharString s = ""; switch(m_format){ case FASTQ: case CSFASTQ: append(s, "@"); append(s, myRead.getSequenceTag()); if(m_useStdout){ append(s, "_"); append(s, m_tagStr); } append(s, "\n"); append(s, myRead.getSequence()); append(s, "\n+\n"); append(s, myRead.getQuality()); append(s, "\n"); break; case FASTA: case CSFASTA: append(s, ">"); append(s, myRead.getSequenceTag()); if(m_useStdout){ append(s, "_"); append(s, m_tagStr); } append(s, "\n"); append(s, myRead.getSequence()); append(s, "\n"); } // if(m_useStdout && m_cmprsType == UNCOMPRESSED) cout << s; if(m_useStdout) cout << s; else{ if(streamPut(m_targetStream, s) != 0){ cerr << "File writing error occured!\n" << endl; exit(1); } } } unsigned long getNrGoodReads() const { return m_countGood; } void *writeRead(void *item){ using namespace std; using namespace flexbar; if(item){ SequencingRead *myRead = static_cast< SequencingRead* >(item); unsigned int readLength = length(myRead->getSequence()); if(m_cutLen_read > 1 && m_cutLen_read >= m_minLength && m_cutLen_read < readLength){ myRead->setSequence(prefix(myRead->getSequence(), m_cutLen_read)); if(m_format == FASTQ){ myRead->setQuality(prefix(myRead->getQuality(), m_cutLen_read)); } else if(m_format == CSFASTQ){ myRead->setQuality(prefix(myRead->getQuality(), m_cutLen_read - 1)); } } ++m_countGood; // store read length distribution if(m_writeLenDist && readLength <= MAX_READLENGTH) m_lengthDist->at(readLength)++; else if(m_writeLenDist) cerr << "\nCompile Flexbar with larger max read length to get correct length dist.\n" << endl; writeFastString(*myRead); } return NULL; } }; #endif /* 
FLEXBAR_SEQUENCEOUTPUTFILTER_H_ */ flexbar_v2.4_src/src/AdapterLoader.h0000640000651600065100000000373412175327143020667 0ustar jroehrbioinformatics/* * AdapterLoader.h * * Authors: mat and jtr */ #ifndef FLEXBAR_ADAPTERLOADER_H_ #define FLEXBAR_ADAPTERLOADER_H_ #include #include #include #include #include #include "Enums.h" #include "Options.h" #include "SequencingRead.h" #include "SequenceConverter.h" // This class will store each processed read plus it's ID in a vector. template class AdapterLoader : public tbb::filter{ private: std::ostream *out; flexbar::FileFormat m_format; tbb::concurrent_vector adapters; public: AdapterLoader(const Options &o) : filter(serial), m_format(o.format), out(o.out){ }; virtual ~AdapterLoader(){}; void* operator()( void* item ){ SequencingRead *myRead = static_cast< SequencingRead* >(item); if(m_format == flexbar::CSFASTA || m_format == flexbar::CSFASTQ){ TString csRead = SequenceConverter::getInstance()->bpToColorSpace(myRead->getSequence()); myRead->setSequence(csRead); } TAdapter adap; adap.first = myRead; adapters.push_back(adap); return NULL; }; tbb::concurrent_vector getAdapters(){ return adapters; } void setAdapters(tbb::concurrent_vector &adapterVec){ adapters = adapterVec; } void printAdapters(std::string adapterName) const { using namespace std; const unsigned int maxSpaceLen = 23; *out << adapterName << ":" << string(maxSpaceLen - 8, ' ') << "Sequence:" << "\n"; for(unsigned int i=0; i < adapters.size(); ++i){ TString seqTag = adapters.at(i).first->getSequenceTag(); int whiteSpaceLen = maxSpaceLen - length(seqTag); if(whiteSpaceLen < 2) whiteSpaceLen = 2; string whiteSpace = string(whiteSpaceLen, ' '); *out << seqTag << whiteSpace << adapters.at(i).first->getSequence() << "\n"; } *out << endl; } }; #endif /* FLEXBAR_ADAPTERLOADER_H_ */ flexbar_v2.4_src/src/SequencingRead.h0000640000651600065100000000232412173312106021036 0ustar jroehrbioinformatics/* * SequencingRead.h * * Author: mat and jtr */ #ifndef 
FLEXBAR_SEQUENCINGREAD_H_ #define FLEXBAR_SEQUENCINGREAD_H_ #include /* A Sequencing read consists of a nucleotide sequence (color or basepair space), a sequence name and optionally a quality string plus the quality scaling. */ template class SequencingRead { private: TString m_seq; TIDString m_tag, m_qual; public: SequencingRead() : m_tag(), m_seq(){ } SequencingRead(const TString& source, const TIDString& sequence_tag) : m_tag(sequence_tag), m_seq(source){ } SequencingRead(const TString& source, const TIDString& sequence_tag, const TIDString& qual) : m_tag(sequence_tag), m_seq(source), m_qual(qual){ } void setSequenceTag(const TString& tag){ m_tag = tag; } void setSequence(const TString& seq){ m_seq = seq; } void setQuality(const TString& qual){ m_qual = qual; } const TIDString& getSequenceTag() const { return m_tag; } const TString& getSequence() const { return m_seq; } const TIDString& getQuality() const{ return m_qual; } virtual ~SequencingRead(){}; }; #endif /* FLEXBAR_SEQUENCINGREAD_H_ */ flexbar_v2.4_src/src/Options.h0000640000651600065100000006034512175457453017624 0ustar jroehrbioinformatics/* * Options.h * * Created on: Jul 31, 2012 * Author: jtr */ #ifndef FLEXBAR_OPTIONS_H_ #define FLEXBAR_OPTIONS_H_ #include #include #include #include #include #include "Enums.h" #include "SequencingRead.h" typedef std::pair< SequencingRead*, std::pair< tbb::atomic, tbb::atomic > > TAdapter; struct Options{ std::string readsFile, readsFile2, barReadsFile, barcodeFile, adapterFile; std::string adapterSeq, targetName, logLevelStr, outCompression; bool isColorSpace, isPaired, useAdapterFile, useNumberTag, useRemovalTag, useStdin, useStdout; bool switch2Fasta, writeUnassigned, writeSingleReads, writeLengthDist, randTag; int cutLen_begin, cutLen_end, phred_preQual, cutLen_read, a_tail_len, b_tail_len; int maxUncalled, min_readLen, a_min_overlap, b_min_overlap, nThreads; int match, mismatch, gapCost, b_match, b_mismatch, b_gapCost; float a_threshold, b_threshold; 
flexbar::TrimEnd end, b_end; flexbar::FileFormat format; flexbar::QualityType qual; flexbar::LogLevel logLevel; flexbar::CompressionType cmprsType; flexbar::RunType runType; flexbar::BarcodeDetect barDetect; flexbar::AdapterRemoval adapRm; tbb::concurrent_vector adapters, barcodes; std::ostream *out; std::fstream fstrmOut; Options(){ readsFile = ""; readsFile2 = ""; barReadsFile = ""; barcodeFile = ""; adapterFile = ""; outCompression = ""; isColorSpace = false; isPaired = false; useAdapterFile = false; useNumberTag = false; useRemovalTag = false; writeUnassigned = false; writeSingleReads = false; writeLengthDist = false; switch2Fasta = false; randTag = false; useStdin = false; useStdout = false; cutLen_begin = 0; cutLen_end = 0; cutLen_read = 0; phred_preQual = 0; a_tail_len = 0; b_tail_len = 0; b_min_overlap = 0; format = flexbar::FASTA; qual = flexbar::SANGER; logLevel = flexbar::NONE; cmprsType = flexbar::UNCOMPRESSED; barDetect = flexbar::BOFF; adapRm = flexbar::AOFF; } }; const std::string getFlexbarBanner(const std::string version){ std::string banner = ""; banner += " ________ __ \n"; banner += " / ____/ /__ _ __/ /_ ____ ______\n"; banner += " / /_ / / _ \\| |/ / __ \\/ __ `/ ___/\n"; banner += " / __/ / / __/> = read start", false); addText(parser._toolDoc, "LEFT_TAIL: consider first n bases of reads in alignment", false); addText(parser._toolDoc, "RIGHT_TAIL: use only last n bases, see tail-length options", false); hideOption(parser, "barcode-tail-length"); hideOption(parser, "barcode-keep"); hideOption(parser, "barcode-match"); hideOption(parser, "barcode-mismatch"); hideOption(parser, "barcode-gap"); hideOption(parser, "adapter-tail-length"); hideOption(parser, "adapter-read-set"); hideOption(parser, "adapter-match"); hideOption(parser, "adapter-mismatch"); hideOption(parser, "adapter-gap"); hideOption(parser, "version"); hideOption(parser, "stdout-reads"); hideOption(parser, "length-dist"); hideOption(parser, "number-tags"); hideOption(parser, 
"random-tags"); // setRequired(parser, "reads"); // setMinValue(parser, "threads", "1"); // setValidValues(parser, "format", "sanger solexa i1.3 i1.5 i1.8"); // // setValidValues(parser, "barcode-trim-end", "ANY LEFT RIGHT LEFT_TAIL RIGHT_TAIL"); // setMinValue(parser, "barcode-tail-length", "1"); // setMinValue(parser, "barcode-min-overlap", "1"); // setMinValue(parser, "barcode-threshold", "0"); // setMaxValue(parser, "barcode-threshold", "10"); // // setValidValues(parser, "adapter-trim-end", "ANY LEFT RIGHT LEFT_TAIL RIGHT_TAIL"); // setMinValue(parser, "adapter-tail-length", "1"); // setMinValue(parser, "adapter-min-overlap", "1"); // setMinValue(parser, "adapter-threshold", "0"); // setMaxValue(parser, "adapter-threshold", "10"); // // setMinValue(parser, "max-uncalled", "0"); // setMinValue(parser, "pre-trim-left", "1"); // setMinValue(parser, "pre-trim-right", "1"); // setMinValue(parser, "pre-trim-phred", "0"); // setMinValue(parser, "post-trim-length", "1"); // setMinValue(parser, "min-read-length", "1"); setValidValues(parser, "log-level", "ALL MOD TAB"); setValidValues(parser, "zip-output", "GZ BZ2"); setValidValues(parser, "adapter-read-set", "1 2"); setDefaultValue(parser, "target", "flexbar"); setDefaultValue(parser, "threads", "1"); setDefaultValue(parser, "max-uncalled", "0"); setDefaultValue(parser, "min-read-length", "18"); setDefaultValue(parser, "barcode-trim-end", "ANY"); setDefaultValue(parser, "barcode-threshold", "1.0"); setDefaultValue(parser, "barcode-match", "1"); setDefaultValue(parser, "barcode-mismatch", "-1"); setDefaultValue(parser, "barcode-gap", "-9"); setDefaultValue(parser, "adapter-trim-end", "RIGHT"); setDefaultValue(parser, "adapter-min-overlap", "1"); setDefaultValue(parser, "adapter-threshold", "3.0"); setDefaultValue(parser, "adapter-match", "1"); setDefaultValue(parser, "adapter-mismatch", "-1"); setDefaultValue(parser, "adapter-gap", "-7"); addTextSection(parser, "EXAMPLES"); addText(parser._toolDoc, "\\fBflexbar\\fP 
\\fB-r\\fP reads.fq \\fB-f\\fP i1.8 \\fB-t\\fP target \\fB-b\\fP brc.fa \\fB-a\\fP adap.fa", false); addText(parser._toolDoc, "\\fBflexbar\\fP \\fB-r\\fP reads.csfastq.gz \\fB-a\\fP adap.fa \\fB-ao\\fP 5 \\fB-ae\\fP LEFT \\fB-c\\fP"); } void printLocalTime(Options &o){ time_t t_current; time(&t_current); *o.out << "Local time: " << asctime(localtime(&t_current)) << "\n"; } void parseCommandLine(seqan::ArgumentParser &parser, std::string version, int argc, char const ** argv){ using namespace std; using seqan::ArgumentParser; bool useStdout = false; for (int i=0; i 0){ *out << "pre-trim-phred: " << o.phred_preQual; switch(o.qual){ case SANGER: o.phred_preQual += 33; break; case SOLEXA: o.phred_preQual += 59; break; case ILLUMINA13: o.phred_preQual += 64; } *out << " (" << o.phred_preQual << ")" << endl; } } if(isSet(parser, "post-trim-length")){ getOptionValue(o.cutLen_read, parser, "post-trim-length"); *out << "post-trim-length: " << o.cutLen_read << endl; } getOptionValue(o.min_readLen, parser, "min-read-length"); *out << "min-read-length: " << o.min_readLen << endl; if(o.isColorSpace) o.min_readLen++; // logging and tagging options if(isSet(parser, "log-level")){ getOptionValue(o.logLevelStr, parser, "log-level"); if(o.logLevelStr == "ALL") o.logLevel = ALL; else if(o.logLevelStr == "TAB") o.logLevel = TAB; else if(o.logLevelStr == "MOD") o.logLevel = MOD; } if(isSet(parser, "zip-output")){ getOptionValue(o.outCompression, parser, "zip-output"); if(o.outCompression == "GZ"){ o.cmprsType = GZ; o.outCompression = ".gz"; } else if(o.outCompression == "BZ2"){ o.cmprsType = BZ2; o.outCompression = ".bz2"; } } if(isSet(parser, "fasta-output")){ if(o.format == FASTQ){ o.format = FASTA; o.switch2Fasta = true; } else if(o.format == CSFASTQ){ o.format = CSFASTA; o.switch2Fasta = true; } } if(isSet(parser, "single-reads")) o.writeSingleReads = true; if(isSet(parser, "length-dist")) o.writeLengthDist = true; if(isSet(parser, "number-tags")) o.useNumberTag = true; 
if(isSet(parser, "removal-tags")) o.useRemovalTag = true; if(isSet(parser, "random-tags")) o.randTag = true; *out << endl; // barcode options if(o.barDetect != BOFF){ string b_trim_end; getOptionValue(b_trim_end, parser, "barcode-trim-end"); if(b_trim_end == "LEFT") o.b_end = LEFT; else if(b_trim_end == "RIGHT") o.b_end = RIGHT; else if(b_trim_end == "ANY") o.b_end = ANY; else if(b_trim_end == "LEFT_TAIL") o.b_end = LEFT_TAIL; else if(b_trim_end == "RIGHT_TAIL") o.b_end = RIGHT_TAIL; else{ cerr << "Specified barcode trim-end is unknown!\n" << endl; exit(1); } *out << "barcode-trim-end: " << b_trim_end << endl; if(isSet(parser, "barcode-tail-length")){ getOptionValue(o.b_tail_len, parser, "barcode-tail-length"); *out << "barcode-tail-length: " << o.b_tail_len << endl; } if(isSet(parser, "barcode-min-overlap")){ getOptionValue(o.b_min_overlap, parser, "barcode-min-overlap"); *out << "barcode-min-overlap: " << o.b_min_overlap << endl; } getOptionValue(o.b_threshold, parser, "barcode-threshold"); *out << "barcode-threshold: " << o.b_threshold << endl; if(isSet(parser, "barcode-unassigned")) o.writeUnassigned = true; getOptionValue(o.b_match, parser, "barcode-match"); getOptionValue(o.b_mismatch, parser, "barcode-mismatch"); getOptionValue(o.b_gapCost, parser, "barcode-gap"); *out << "barcode-match: "; if(o.b_match >= 0) *out << " "; *out << o.b_match << endl; *out << "barcode-mismatch: "; if(o.b_mismatch >= 0) *out << " "; *out << o.b_mismatch << endl; *out << "barcode-gap: "; if(o.b_gapCost >= 0) *out << " "; *out << o.b_gapCost << "\n" << endl; } // adapter options if(o.adapRm != AOFF){ string a_trim_end; getOptionValue(a_trim_end, parser, "adapter-trim-end"); if (a_trim_end == "LEFT") o.end = LEFT; else if(a_trim_end == "RIGHT") o.end = RIGHT; else if(a_trim_end == "ANY") o.end = ANY; else if(a_trim_end == "LEFT_TAIL") o.end = LEFT_TAIL; else if(a_trim_end == "RIGHT_TAIL") o.end = RIGHT_TAIL; else { cerr << "Specified adapter trim-end is unknown!\n" << endl; 
exit(1); } *out << "adapter-trim-end: " << a_trim_end << endl; if(isSet(parser, "adapter-tail-length")){ getOptionValue(o.a_tail_len, parser, "adapter-tail-length"); *out << "adapter-tail-length: " << o.a_tail_len << endl; } if(isSet(parser, "adapter-read-set") && o.isPaired){ string a_read_set; getOptionValue(a_read_set, parser, "adapter-read-set"); *out << "adapter-read-set: " << a_read_set << endl; if(a_read_set == "1") o.adapRm = AONE; else if(a_read_set == "2") o.adapRm = ATWO; } getOptionValue(o.a_min_overlap, parser, "adapter-min-overlap"); *out << "adapter-min-overlap: " << o.a_min_overlap << endl; getOptionValue(o.a_threshold, parser, "adapter-threshold"); *out << "adapter-threshold: " << o.a_threshold << endl; getOptionValue(o.match, parser, "adapter-match"); getOptionValue(o.mismatch, parser, "adapter-mismatch"); getOptionValue(o.gapCost, parser, "adapter-gap"); *out << "adapter-match: "; if(o.match >= 0) *out << " "; *out << o.match << endl; *out << "adapter-mismatch: "; if(o.mismatch >= 0) *out << " "; *out << o.mismatch << endl; *out << "adapter-gap: "; if(o.gapCost >= 0) *out << " "; *out << o.gapCost << "\n" << endl; } // option compatibility tests if(o.cutLen_read != 0 && o.cutLen_read < o.min_readLen){ o.cutLen_read = 0; cerr << "\nOption post-trim-length omitted, as it is shorter than min read length.\n" << endl; } } #endif /* FLEXBAR_OPTIONS_H_ */ flexbar_v2.4_src/src/MultiplexedOutputFilter.h0000640000651600065100000002622712175327143023045 0ustar jroehrbioinformatics/* * MultiplexedOutputFilter.h * * Authors: mat and jtr */ #ifndef FLEXBAR_MULTIPLEXEDOUTPUTFILTER_H_ #define FLEXBAR_MULTIPLEXEDOUTPUTFILTER_H_ #include #include #include #include "Enums.h" #include "Options.h" #include "FlexbarIO.h" #include "MultiplexedRead.h" #include "SequenceOutputFilter.h" #include "OutputFileStruct.h" #include "AdapterLoader.h" /* This class will process a MultiplexedRead and write it to a file depending on the runtype: single-end, paired-end and/or 
barcoded. */
// NOTE(review): template parameter and argument lists (e.g. after "template" and
// "tbb::atomic") appear to have been lost in this copy of the file; those tokens are
// reproduced exactly as found.
template class MultiplexedOutputFilter : public tbb::filter {

private:
	
	int m_mapsize;                          // number of slots allocated in m_outputMap
	const int m_minLength, m_cutLen_read;   // copied from o.min_readLen and o.cutLen_read
	
	const bool m_isPaired, m_writeUnassigned, m_writeSingleReads;
	
	tbb::atomic m_nSingleReads;             // pairs in which exactly one mate reached m_minLength
	
	const std::string m_target;             // common prefix of every output file name
	
	const flexbar::FileFormat    m_format;
	const flexbar::RunType       m_runType;
	const flexbar::BarcodeDetect m_barDetect;
	
	typedef SequenceOutputFilter TOutputFilter;
	typedef OutputFileStruct filters;
	
	// Slot 0 is the only slot of non-barcoded runs. In barcoded runs slot i + 1 belongs
	// to barcode i, and slot 0 holds the "unassigned" files, which are only created
	// when m_writeUnassigned is set (otherwise its filter pointers stay NULL).
	filters *m_outputMap;
	std::ostream *out;
	
	tbb::concurrent_vector *m_adapters, *m_barcodes;
	
public:
	
	// Creates one output filter per file this run can write. File names are composed
	// of the target name, an optional "_barcode_<tag>" part, a "_1"/"_2" mate suffix,
	// an optional "_single" suffix and the string returned by toFormatString().
	MultiplexedOutputFilter(Options &o) :
		
		filter(serial_in_order),
		m_target(o.targetName),
		m_format(o.format),
		m_runType(o.runType),
		m_barDetect(o.barDetect),
		m_minLength(o.min_readLen),
		m_cutLen_read(o.cutLen_read),
		m_isPaired(o.isPaired),
		m_writeUnassigned(o.writeUnassigned),
		m_writeSingleReads(o.writeSingleReads),
		out(o.out){
		
		using namespace std;
		using namespace flexbar;
		
		m_adapters = &o.adapters;
		m_barcodes = &o.barcodes;
		
		m_nSingleReads = 0;
		m_mapsize      = 0;
		
		// Keeps delete[] in the destructor well defined should no case below match.
		m_outputMap = NULL;
		
		switch(m_runType){
			
			case PAIRED_BARCODED:{
				
				m_mapsize = m_barcodes->size() + 1;
				m_outputMap = new filters[m_mapsize];
				
				stringstream ss;
				
				// one pair of files (plus optional single-read files) per barcode
				for(unsigned int i = 0; i < m_barcodes->size(); ++i){
					
					ss << m_target << "_barcode_" << m_barcodes->at(i).first->getSequenceTag() << "_1" << toFormatString(m_format);
					TOutputFilter *of1 = new TOutputFilter(ss.str(), "", o);
					ss.str("");
					ss.clear();
					
					ss << m_target << "_barcode_" << m_barcodes->at(i).first->getSequenceTag() << "_2"<< toFormatString(m_format);
					TOutputFilter *of2 = new TOutputFilter(ss.str(), "", o);
					ss.str("");
					ss.clear();
					
					filters& f = m_outputMap[i + 1];
					f.f1 = of1;
					f.f2 = of2;
					
					if(m_writeSingleReads){
						ss << m_target << "_barcode_" << m_barcodes->at(i).first->getSequenceTag() << "_1_single" << toFormatString(m_format);
						TOutputFilter *osingle1 = new TOutputFilter(ss.str(), "", o);
						ss.str("");
						ss.clear();
						
						ss << m_target << "_barcode_" << m_barcodes->at(i).first->getSequenceTag() << "_2_single"<< toFormatString(m_format);
						TOutputFilter *osingle2 = new TOutputFilter(ss.str(), "", o);
						ss.str("");
						ss.clear();
						
						f.single1 = osingle1;
						f.single2 = osingle2;
					}
				}
				
				// files for reads without barcode go to slot 0
				if(m_writeUnassigned){
					
					ss << m_target << "_barcode_unassigned_1"<< toFormatString(m_format);
					TOutputFilter *of1 = new TOutputFilter(ss.str(), "", o);
					ss.str("");
					ss.clear();
					
					ss << m_target << "_barcode_unassigned_2"<< toFormatString(m_format);
					TOutputFilter *of2 = new TOutputFilter(ss.str(), "", o);
					ss.str("");
					ss.clear();
					
					filters& f = m_outputMap[0];
					f.f1 = of1;
					f.f2 = of2;
					
					if(m_writeSingleReads){
						ss << m_target << "_barcode_unassigned_1_single"<< toFormatString(m_format);
						TOutputFilter *osingle1 = new TOutputFilter(ss.str(), "", o);
						ss.str("");
						ss.clear();
						
						ss << m_target << "_barcode_unassigned_2_single"<< toFormatString(m_format);
						TOutputFilter *osingle2 = new TOutputFilter(ss.str(), "", o);
						
						f.single1 = osingle1;
						f.single2 = osingle2;
					}
				}
				break;
			}
			
			case PAIRED:{
				
				m_mapsize = 1;
				m_outputMap = new filters[m_mapsize];
				
				stringstream ss;
				
				ss << m_target << "_1"<< toFormatString(m_format);
				TOutputFilter *of1 = new TOutputFilter(ss.str(), "", o);
				ss.str("");
				ss.clear();
				
				ss << m_target << "_2"<< toFormatString(m_format);
				TOutputFilter *of2 = new TOutputFilter(ss.str(), "", o);
				ss.str("");
				ss.clear();
				
				filters& f = m_outputMap[0];
				f.f1 = of1;
				f.f2 = of2;
				
				if(m_writeSingleReads){
					ss << m_target << "_1_single" << toFormatString(m_format);
					TOutputFilter *osingle1 = new TOutputFilter(ss.str(), "", o);
					ss.str("");
					ss.clear();
					
					ss << m_target << "_2_single"<< toFormatString(m_format);
					TOutputFilter *osingle2 = new TOutputFilter(ss.str(), "", o);
					
					f.single1 = osingle1;
					f.single2 = osingle2;
				}
				break;
			}
			
			case SINGLE:{
				
				m_mapsize = 1;
				m_outputMap = new filters[m_mapsize];
				
				stringstream ss;
				ss << m_target << toFormatString(m_format);
				TOutputFilter *of1 = new TOutputFilter(ss.str(), "", o);
				
				filters& f = m_outputMap[0];
				f.f1 = of1;
				break;
			}
			
			case SINGLE_BARCODED:{
				
				m_mapsize = m_barcodes->size() + 1;
				m_outputMap = new filters[m_mapsize];
				
				for(unsigned int i=0; i < m_barcodes->size(); ++i){
					
					TIDString barcode = m_barcodes->at(i).first->getSequenceTag();
					
					stringstream ss;
					ss << m_target << "_barcode_" << barcode << toFormatString(m_format);
					TOutputFilter *of1 = new TOutputFilter(ss.str(), barcode, o);
					
					filters& f = m_outputMap[i + 1];
					f.f1 = of1;
				}
				
				if(m_writeUnassigned){
					stringstream ss;
					ss << m_target << "_barcode_unassigned" << toFormatString(m_format);
					TOutputFilter *of1 = new TOutputFilter(ss.str(), "unassigned", o);
					
					filters& f = m_outputMap[0];
					f.f1 = of1;
				}
				break;
			}
		}
	}
	
	
	// Releases the slot array allocated by the constructor.
	virtual ~MultiplexedOutputFilter(){
		delete[] m_outputMap;
	};
	
	
	// Pipeline stage: takes ownership of the MultiplexedRead passed in `item`, writes
	// its read(s) to the slot selected by m_barcode_id if they reach m_minLength,
	// updates the short/single read counters, deletes the item and returns NULL.
	// In barcoded runs, reads with m_barcode_id 0 are skipped unless m_writeUnassigned
	// is set.
	void* operator()(void* item) {
		
		using namespace flexbar;
		
		MultiplexedRead *myRead = static_cast< MultiplexedRead* >(item);
		
		bool l1ok = false, l2ok = false;
		
		switch(m_runType){
			
			case SINGLE:
			case SINGLE_BARCODED:{
				
				if(myRead->m_r1 != NULL){
					if(m_runType == SINGLE || m_writeUnassigned || myRead->m_barcode_id > 0){
						
						if(length(myRead->m_r1->getSequence()) >= m_minLength){
							m_outputMap[myRead->m_barcode_id].f1->writeRead(myRead->m_r1);
						}
						else m_outputMap[myRead->m_barcode_id].m_nShort_1++;
					}
				}
				break;
			}
			
			case PAIRED:
			case PAIRED_BARCODED:{
				
				if(myRead->m_r1 != NULL && myRead->m_r2 != NULL){
					if(m_runType == PAIRED || m_writeUnassigned || myRead->m_barcode_id > 0){
						
						// now check if both reads have min length
						if(length(myRead->m_r1->getSequence()) >= m_minLength) l1ok = true;
						if(length(myRead->m_r2->getSequence()) >= m_minLength) l2ok = true;
						
						if(l1ok && l2ok){
							m_outputMap[myRead->m_barcode_id].f1->writeRead(myRead->m_r1);
							m_outputMap[myRead->m_barcode_id].f2->writeRead(myRead->m_r2);
						}
						// only one mate is long enough: counted always, written on request
						else if(l1ok && ! l2ok){
							m_nSingleReads++;
							
							if(m_writeSingleReads){
								m_outputMap[myRead->m_barcode_id].single1->writeRead(myRead->m_r1);
							}
						}
						else if(! l1ok && l2ok){
							m_nSingleReads++;
							
							if(m_writeSingleReads){
								m_outputMap[myRead->m_barcode_id].single2->writeRead(myRead->m_r2);
							}
						}
						
						if(! l1ok) m_outputMap[myRead->m_barcode_id].m_nShort_1++;
						if(! l2ok) m_outputMap[myRead->m_barcode_id].m_nShort_2++;
					}
				}
			}
		}
		
		delete myRead;
		
		return NULL;
	}
	
	
	// Asks every existing output filter to write its length distribution. Slots
	// without filters are skipped: slot 0 stays empty in barcoded runs that do not
	// write unassigned reads, and dereferencing its NULL f1 would be undefined.
	void writeLengthDist(){
		
		for(unsigned int i = 0; i < m_mapsize; i++){
			if(m_outputMap[i].f1 != NULL) m_outputMap[i].f1->writeLengthDist();
			if(m_outputMap[i].f2 != NULL) m_outputMap[i].f2->writeLengthDist();
		}
	}
	
	
	// Number of pairs in which exactly one mate reached the minimum length.
	unsigned long getNrSingleReads() const {
		return m_nSingleReads;
	}
	
	
	// Sum of getNrGoodReads() over the filters of all slots in use (slot 0 is left
	// out in barcoded runs without unassigned output); single-read filters are
	// included for paired slots when m_writeSingleReads is set.
	unsigned long getNrGoodReads(){
		
		using namespace flexbar;
		
		unsigned long nGood = 0;
		
		for(unsigned int i = 0; i < m_mapsize; i++){
			if(m_barDetect == BOFF || m_writeUnassigned || i > 0){
				
				nGood += m_outputMap[i].f1->getNrGoodReads();
				
				if(m_outputMap[i].f2 != NULL){
					nGood += m_outputMap[i].f2->getNrGoodReads();
					
					if(m_writeSingleReads){
						nGood += m_outputMap[i].single1->getNrGoodReads();
						nGood += m_outputMap[i].single2->getNrGoodReads();
					}
				}
			}
		}
		return nGood;
	}
	
	
	// Sum of the too-short counters of all slots in use; the second-mate counter is
	// only added for paired runs.
	unsigned long getNrShortReads(){
		
		using namespace flexbar;
		
		unsigned long nShort = 0;
		
		for(unsigned int i = 0; i < m_mapsize; i++){
			if(m_barDetect == BOFF || m_writeUnassigned || i > 0){
				
				nShort += m_outputMap[i].m_nShort_1;
				if(m_isPaired) nShort += m_outputMap[i].m_nShort_2;
			}
		}
		return nShort;
	}
	
	
	// Prints a three-column table (adapter tag and the two counters stored with each
	// adapter) to *out. Columns are padded to maxSpaceLen characters, with at least
	// two spaces between them.
	void printAdapterRemovalStats(){
		
		using namespace std;
		
		*out << "Adapter removal statistics\n";
		*out << "==========================\n";
		
		// signed on purpose: the padding widths below become negative for long entries
		const int maxSpaceLen = 20;
		
		*out << "Adapter:" << string(maxSpaceLen - 8, ' ') << "Overlap removal:" << string(maxSpaceLen - 16, ' ') << "Full length:" << "\n";
		
		for(unsigned int i = 0; i < m_adapters->size(); i++){
			
			seqan::CharString seqTag = m_adapters->at(i).first->getSequenceTag();
			
			int wsLen = maxSpaceLen - static_cast<int>(length(seqTag));
			if(wsLen < 2) wsLen = 2;
			string whiteSpace = string(wsLen, ' ');
			
			unsigned long nAdapOvl  = m_adapters->at(i).second.first;
			unsigned long nAdapFull = m_adapters->at(i).second.second;
			
			stringstream ss;
			ss << nAdapOvl;
			
			int wsLen2 = maxSpaceLen - static_cast<int>(ss.str().length());
			if(wsLen2 < 2) wsLen2 = 2;
			string whiteSpace2 = string(wsLen2, ' ');
			
			*out << seqTag << whiteSpace << nAdapOvl << whiteSpace2 << nAdapFull << "\n";
		}
		*out << endl;
	}
	
	
	// return type of printFileSummary(), whose declarator begins the following source line
	void
printFileSummary(){ using namespace std; using namespace flexbar; *out << "Output file statistics\n"; *out << "======================\n"; for(unsigned int i = 0; i < m_mapsize; i++){ if(m_barDetect == BOFF || m_writeUnassigned || i > 0){ *out << "Read file: " << m_outputMap[i].f1->getFileName() << "\n"; *out << " written reads " << m_outputMap[i].f1->getNrGoodReads() << "\n"; *out << " skipped short reads " << m_outputMap[i].m_nShort_1 << "\n"; if(m_isPaired){ *out << "Read file 2: " << m_outputMap[i].f2->getFileName() << "\n"; *out << " written reads " << m_outputMap[i].f2->getNrGoodReads() << "\n"; *out << " too short reads " << m_outputMap[i].m_nShort_2 << "\n"; if(m_writeSingleReads){ *out << "Single read file: " << m_outputMap[i].single1->getFileName() << "\n"; *out << " written reads " << m_outputMap[i].single1->getNrGoodReads() << "\n"; *out << "Single read file 2: " << m_outputMap[i].single2->getFileName() << "\n"; *out << " written reads " << m_outputMap[i].single2->getNrGoodReads() << "\n"; } } *out << endl; } } *out << endl; } }; #endif /* FLEXBAR_MULTIPLEXEDOUTPUTFILTER_H_ */ flexbar_v2.4_src/src/AlignmentAlgorithm.h0000640000651600065100000001201612150134372021727 0ustar jroehrbioinformatics/* * AlignmentAlgorithm.h * * Authors: mat and jtr */ #ifndef FLEXBAR_ALIGNMENTALGORITHM_H_ #define FLEXBAR_ALIGNMENTALGORITHM_H_ #include #include #include #include template class AlignmentAlgorithm { private: typedef typename seqan::Dna5 TChar; typedef typename seqan::Value::Type TStringChar; // typedef seqan::SimpleType > TChar; typedef seqan::Align TAlign; typedef typename seqan::Row::Type TRow; typedef typename seqan::Iterator::Type TRowIterator; typedef seqan::Score > TScoreDna5; TScoreDna5 m_scoreDna5; seqan::Score m_score; const bool m_isColorSpace, m_randTag; const flexbar::LogLevel m_verb; const flexbar::TrimEnd m_trimEnd; public: AlignmentAlgorithm(const Options &o, const int match, const int mismatch, const int gapCost, const flexbar::TrimEnd trimEnd): 
m_randTag(o.randTag), m_isColorSpace(o.isColorSpace), m_verb(o.logLevel), m_trimEnd(trimEnd){ using namespace std; using namespace seqan; m_score = Score(match, mismatch, gapCost); m_scoreDna5 = TScoreDna5(gapCost); for (unsigned i = 0; i < ValueSize::VALUE; ++i){ for (unsigned j = 0; j < ValueSize::VALUE; ++j){ if(i == j || TChar(i) == 'N' || TChar(j) == 'N'){ setScore(m_scoreDna5, TChar(i), TChar(j), match); } else{ setScore(m_scoreDna5, TChar(i), TChar(j), mismatch); } // cout << i << "\t" << TChar(i) << endl; // cout << j << "\t" << TChar(j) << endl; // cout << ValueSize::VALUE << endl << endl; } } // cout << endl; // for (unsigned i = 0; i < ValueSize::VALUE; ++i) // cout << "\t" << TChar(i); // cout << endl; // // for (unsigned i = 0; i < ValueSize::VALUE; ++i) { // cout << TChar(i); // for (unsigned j = 0; j < ValueSize::VALUE; ++j){ // cout << "\t" << score(m_scoreDna5, TChar(i), TChar(j)); // } // cout << endl; // } }; virtual ~AlignmentAlgorithm(){ }; void align(const TString &querySeq, const TString &read, int &gapsR, int &gapsA, int &mismatches, int &startPos, int &endPos, int &startPosA, int &endPosA, int &startPosS, int &endPosS, int &aliScore, std::stringstream &aliString, TString &tagSeq){ using namespace std; using namespace seqan; using namespace flexbar; TAlign align; resize(rows(align), 2); assignSource(row(align, 0), read); assignSource(row(align, 1), querySeq); if(m_trimEnd == RIGHT || m_trimEnd == RIGHT_TAIL){ AlignConfig ac; if(m_isColorSpace) aliScore = globalAlignment(align, m_score, ac); else aliScore = globalAlignment(align, m_scoreDna5, ac); } else if(m_trimEnd == LEFT || m_trimEnd == LEFT_TAIL){ AlignConfig ac; if(m_isColorSpace) aliScore = globalAlignment(align, m_score, ac); else aliScore = globalAlignment(align, m_scoreDna5, ac); } else{ AlignConfig ac; if(m_isColorSpace) aliScore = globalAlignment(align, m_score, ac); else aliScore = globalAlignment(align, m_scoreDna5, ac); } TRow &row1 = row(align, 0); TRow &row2 = row(align, 1); 
startPosS = toViewPosition(row1, 0); startPosA = toViewPosition(row2, 0); endPosS = toViewPosition(row1, length(source(row1))); endPosA = toViewPosition(row2, length(source(row2))); // calculate overlap start and end if(startPosA > startPosS) startPos = startPosA; else startPos = startPosS; if(endPosA > endPosS) endPos = endPosS; else endPos = endPosA; // cout << endl << endl << startPosS << endl << startPosA << endl << endPosS << endl << endPosA; // int fstartPosS = toViewPosition(row1, 0); // int fstartPosA = toViewPosition(row2, 0); // int fendPosS = toViewPosition(row1, length(source(row1))); // int fendPosA = toViewPosition(row2, length(source(row2))); // cout << endl << endl << fstartPosS << endl << fstartPosA << endl << fendPosS << endl << fendPosA; // cout << align << endl << aliScore << endl; if(m_verb != flexbar::NONE) aliString << align; // compute number of mismatches and gaps TRowIterator it1 = begin(row1); TRowIterator it2 = begin(row2); int aliPos = 0; gapsR = 0; gapsA = 0; mismatches = 0; for(; it1 != end(row1); ++it1){ if(startPos <= aliPos && aliPos < endPos){ if(isGap(it1)) ++gapsR; else if(isGap(it2)) ++gapsA; else if(*it1 != *it2 && *it1 != 'N' && *it2 != 'N') ++mismatches; else if(m_randTag && *it2 == 'N') append(tagSeq, (TStringChar) *it1); } ++aliPos; ++it2; } // cout << endl << endl << gapsR << endl << gapsA << endl << mismatches << endl << align; } }; #endif /* FLEXBAR_ALIGNMENTALGORITHM_H_ */ flexbar_v2.4_src/src/OutputFileStruct.h0000640000651600065100000000205312171336175021457 0ustar jroehrbioinformatics/* * OutputFileStruct.h * * Author: mat and jtr */ #ifndef FLEXBAR_OUTPUTFILESTRUCT_H_ #define FLEXBAR_OUTPUTFILESTRUCT_H_ #include "SequenceOutputFilter.h" /* Structure to store statistics for each generated FASTQ file (how many reads were discarded due to being to short, etc.) 
*/ template class OutputFileStruct { public: typedef SequenceOutputFilter TOutputFilter; TOutputFilter *f1, *f2, *single1, *single2; tbb::atomic m_nShort_1, m_nShort_2; OutputFileStruct() : f1(0), f2(0), single1(0), single2(0){ m_nShort_1 = 0; m_nShort_2 = 0; }; virtual ~OutputFileStruct(){ delete f1; delete f2; delete single1; delete single2; }; private: // forbid copying this object since we want to call the destructor only once (pointing to unique objects) OutputFileStruct(OutputFileStruct&); OutputFileStruct& operator =(const OutputFileStruct& rhs); }; #endif /* FLEXBAR_OUTPUTFILESTRUCT_H_ */ flexbar_v2.4_src/src/MultiplexedAlignmentFilter.h0000640000651600065100000000673012175327143023460 0ustar jroehrbioinformatics/* * MultiplexedAlignmentFilter.h * * Authors: mat and jtr */ #ifndef FLEXBAR_MULTIPLEXEDALIGNMENTFILTER_H_ #define FLEXBAR_MULTIPLEXEDALIGNMENTFILTER_H_ #include #include #include #include "Enums.h" #include "Options.h" #include "MultiplexedRead.h" #include "AlignmentFilter.h" #include "AlignmentAlgorithm.h" #include "AdapterLoader.h" // Processes MultiplexedRead and assigns barcode to read or removes adapters. 
template class MultiplexedAlignmentFilter : public tbb::filter {

private:
	
	const bool m_writeUnassigned;
	
	const flexbar::LogLevel       m_verb;
	const flexbar::RunType        m_runType;
	const flexbar::BarcodeDetect  m_barType;
	const flexbar::AdapterRemoval m_adapRem;
	
	tbb::atomic m_unassigned;              // reads whose barcode id came back as 0
	
	tbb::concurrent_vector *m_adapters;
	tbb::concurrent_vector *m_barcodes;
	
	typedef AlignmentFilter > AliFilter;
	AliFilter *m_afilter, *m_bfilter;      // adapter filter and barcode filter
	
	std::ostream *out;
	
public:
	
	// Builds the adapter and the barcode alignment filter from the respective
	// settings in `o` and, for the TAB log level, prints the column header line.
	MultiplexedAlignmentFilter(Options &o) :
		
		filter(parallel),
		m_writeUnassigned(o.writeUnassigned),
		m_verb(o.logLevel),
		m_runType(o.runType),
		m_barType(o.barDetect),
		m_adapRem(o.adapRm),
		out(o.out){
		
		m_unassigned = 0;
		
		m_adapters = &o.adapters;
		m_barcodes = &o.barcodes;
		
		m_afilter = new AliFilter(m_adapters, o, o.a_min_overlap, o.a_threshold, o.a_tail_len,
		                          o.match, o.mismatch, o.gapCost, o.end, false);
		
		m_bfilter = new AliFilter(m_barcodes, o, o.b_min_overlap, o.b_threshold, o.b_tail_len,
		                          o.b_match, o.b_mismatch, o.b_gapCost, o.b_end, true);
		
		if(m_verb == flexbar::TAB){
			*out << "ReadTag\tQueryTag\tQueryStart\tQueryEnd\tOverlapLength\tMismatches\tIndels\tAllowedErrors" << std::endl;
		}
	}
	
	
	// Frees both alignment filters.
	virtual ~MultiplexedAlignmentFilter(){
		delete m_afilter;
		delete m_bfilter;
	};
	
	
	// Pipeline stage: stores the barcode id found for the read (if barcode detection
	// is on), then hands the read(s) to the adapter filter.
	void* operator()(void* item){
		
		using namespace flexbar;
		
		if(item != NULL){
			
			MultiplexedRead *myRead = static_cast< MultiplexedRead* >(item);
			
			bool skipAdapRem = false;
			
			// barcode detection
			if(m_barType != BOFF){
				
				// The barcode is searched in the separate barcode read or in read 1.
				// NOTE(review): the second argument presumably requests removal of
				// the matched barcode - confirm in AlignmentFilter::align.
				if(m_barType == BARCODE_READ)
					myRead->m_barcode_id = m_bfilter->align(myRead->m_b, false);
				
				else if(m_barType == WITHIN_READ_REMOVAL)
					myRead->m_barcode_id = m_bfilter->align(myRead->m_r1, true);
				
				else if(m_barType == WITHIN_READ)
					myRead->m_barcode_id = m_bfilter->align(myRead->m_r1, false);
				
				// id 0 is counted as unassigned; such reads only go through adapter
				// removal when they are going to be written
				if(myRead->m_barcode_id == 0){
					++m_unassigned;
					skipAdapRem = ! m_writeUnassigned;
				}
			}
			
			// adapter removal
			if(m_adapRem != AOFF && !
skipAdapRem){ if(m_adapRem != ATWO) m_afilter->align(myRead->m_r1, true); if(myRead->m_r2 != NULL && m_adapRem != AONE) m_afilter->align(myRead->m_r2, true); } return myRead; } else return NULL; } unsigned long getNrUnassignedReads() const { if(m_runType == flexbar::PAIRED_BARCODED) return m_unassigned * 2; else return m_unassigned; } unsigned long getNrPreShortReads() const { return m_afilter->getNrPreShortReads(); } void printAdapterOverlapStats(){ if(m_afilter->getNrModifiedReads() > 0){ *out << m_afilter->getOverlapStatsString() << "\n\n"; } *out << std::endl; } }; #endif /* FLEXBAR_MULTIPLEXEDALIGNMENTFILTER_H_ */ flexbar_v2.4_src/src/MultiplexedInputFilter.h0000640000651600065100000001044412173312106022625 0ustar jroehrbioinformatics/* * MultiplexedInputFilter.h * * Authors: mat and jtr */ #ifndef FLEXBAR_MULTIPLEXEDINPUTFILTER_H_ #define FLEXBAR_MULTIPLEXEDINPUTFILTER_H_ #include #include "Options.h" #include "MultiplexedRead.h" #include "SequenceInputFilter.h" /* This class handles up to 3 file sources (paired and barcode reads) and creates a MultiplexedRead depending on the run type. 
*/
// NOTE(review): template parameter and argument lists (e.g. after "template" and
// "tbb::atomic") appear to have been lost in this copy of the file; those tokens are
// reproduced exactly as found.
template class MultiplexedInputFilter : public tbb::filter {

private:
	
	const bool m_isPaired, m_useBarcodeRead, m_useNumberTag;
	
	// m_uncalled counts dropped reads, m_uncalledPairs dropped pairs, and
	// m_tagCounter numbers the reads handed on when m_useNumberTag is set
	tbb::atomic m_uncalled, m_uncalledPairs, m_tagCounter;
	
	SequenceInputFilter *m_f1;   // first reads file (always present)
	SequenceInputFilter *m_f2;   // second reads file, NULL unless paired
	SequenceInputFilter *m_b;    // barcode reads file, NULL unless barDetect is BARCODE_READ
	
public:
	
	// Opens the input filter for the first reads file and, depending on the run
	// settings in `o`, those for the second reads file and the barcode reads file.
	MultiplexedInputFilter(const Options &o) :
		
		filter(serial_in_order),
		m_useNumberTag(o.useNumberTag),
		m_isPaired(o.isPaired),
		m_useBarcodeRead(o.barDetect == flexbar::BARCODE_READ){
		
		m_tagCounter    = 0;
		m_uncalled      = 0;
		m_uncalledPairs = 0;
		
		m_f1 = new SequenceInputFilter(o, o.readsFile, false, true, o.useStdin);
		
		m_f2 = NULL;
		m_b  = NULL;
		
		if(m_isPaired){
			m_f2 = new SequenceInputFilter(o, o.readsFile2, false, true, false);
		}
		
		if(m_useBarcodeRead){
			m_b = new SequenceInputFilter(o, o.barReadsFile, false, false, false);
		}
	}
	
	
	// Frees the input filters (deleting the NULL ones is a no-op).
	virtual ~MultiplexedInputFilter(){
		delete m_f1;
		delete m_f2;
		delete m_b;
	}
	
	
	// Pipeline source: fetches the next read (or read pair) together with its barcode
	// read, dropping and counting entries that getRead() flags as uncalled, and
	// returns them wrapped in a new MultiplexedRead. Returns NULL once the reads
	// input is exhausted. Exits the program if the inputs run out of step.
	void* operator()(void*){
		
		using namespace std;
		
		SequencingRead *myRead1 = NULL, *myRead2 = NULL, *myBarcode = NULL;
		
		bool uncalled = true, uncalled2 = true, uBR = true;
		
		// single read input
		if(! m_isPaired){
			
			while(uncalled){
				
				myRead1 = static_cast< SequencingRead* >(m_f1->getRead(uncalled));
				
				if(m_useBarcodeRead) myBarcode = static_cast< SequencingRead* >(m_b->getRead(uBR));
				
				if(myRead1 == NULL){
					// end of reads: do not leak a barcode read fetched in this round
					delete myBarcode;
					return NULL;
				}
				else if(m_useBarcodeRead && myBarcode == NULL){
					cerr << "Error: read without barcode read, or file reading error!\n" << endl;
					exit(1);
				}
				
				if(uncalled){
					++m_uncalled;
					delete myRead1;
					delete myBarcode;
				}
			}
		}
		
		// paired read input
		else{
			
			// find paired read without uncalled bases
			while(uncalled || uncalled2){
				
				myRead1 = static_cast< SequencingRead* >(m_f1->getRead(uncalled));
				myRead2 = static_cast< SequencingRead* >(m_f2->getRead(uncalled2));
				
				if(m_useBarcodeRead) myBarcode = static_cast< SequencingRead* >(m_b->getRead(uBR));
				
				// end of files reached
				if(myRead1 == NULL && myRead2 == NULL){
					// do not leak a barcode read fetched in this round
					delete myBarcode;
					return NULL;
				}
				else if(myRead1 == NULL || myRead2 == NULL){
					cerr << "Error: single read in paired mode, or file reading error!\n" << endl;
					exit(1);
				}
				else if(m_useBarcodeRead && myBarcode == NULL){
					cerr << "Error: reads without barcode read or file reading error!\n" << endl;
					exit(1);
				}
				
				// the whole pair is dropped if either mate is uncalled
				if(uncalled || uncalled2){
					
					++m_uncalledPairs;
					
					if(uncalled)  ++m_uncalled;
					if(uncalled2) ++m_uncalled;
					
					delete myRead1;
					delete myRead2;
					delete myBarcode;
				}
			}
		}
		
		// replace the sequence tags by a running number shared by all parts of the read
		if(m_useNumberTag){
			
			stringstream converter;
			converter << ++m_tagCounter;
			TString tagCount = converter.str();
			
			myRead1->setSequenceTag(tagCount);
			
			if(m_isPaired)       myRead2->setSequenceTag(tagCount);
			if(m_useBarcodeRead) myBarcode->setSequenceTag(tagCount);
		}
		
		return new MultiplexedRead(myRead1, myRead2, myBarcode);
	}
	
	
	// Number of single reads dropped as uncalled (both mates of a pair count separately).
	unsigned long getNrUncalledReads() const{
		return m_uncalled;
	}
	
	
	// Number of read pairs dropped because at least one mate was uncalled.
	unsigned long getNrUncalledPairedReads() const{
		return m_uncalledPairs;
	}
	
	
	// Processed-read count of the reads input filter(s); both files are summed for paired runs.
	unsigned long getNrProcessedReads() const{
		
		if(m_isPaired) return m_f1->getNrProcessedReads() + m_f2->getNrProcessedReads();
		else           return m_f1->getNrProcessedReads();
	}
	
	
	// Low-phred read count of the reads input filter(s); both files are summed for paired runs.
	unsigned long getNrLowPhredReads() const {
		
		if(m_isPaired) return m_f1->getNrLowPhredReads() + m_f2->getNrLowPhredReads();
		else return
m_f1->getNrLowPhredReads(); } }; #endif /* FLEXBAR_MULTIPLEXEDINPUTFILTER_H_ */ flexbar_v2.4_src/src/Flexbar.cpp0000640000651600065100000000203112175457453020073 0ustar jroehrbioinformatics/*======================================================================= Name: Flexbar.cpp Authors: Matthias Dodt and Johannes Roehr Description: Flexbar - flexible barcode and adapter removal Version: 2.4 Copyright: GPL version 3 SeqAn lib: release 1.4.1, revision 14262 on July 11, 2013 TBB lib: version 4.0 update 5, stable release June 13, 2012 ========================================================================*/ #include "Flexbar.h" #include "Options.h" #include "Enums.h" int main(int argc, const char* argv[]){ using namespace std; using namespace flexbar; using seqan::ArgumentParser; const string version = "2.4"; const string date = "July 29, 2013"; ArgumentParser parser("flexbar"); defineOptionsAndHelp(parser, version, date); parseCommandLine(parser, version, argc, argv); Options o; initOptions(o, parser); loadProgramOptions(o, parser, version); loadBarcodesAndAdapters(o); startComputation(o); printCompletedMessage(o); return 0; }