pax_global_header00006660000000000000000000000064132115570050014511gustar00rootroot0000000000000052 comment=133292b5092f376eabbec2e3021049b7dc920d99 HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/000077500000000000000000000000001321155700500202065ustar00rootroot00000000000000HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/CHANGELOG.md000066400000000000000000000033141321155700500220200ustar00rootroot00000000000000HiLive - Live Mapping of Illumina reads ======================================= Changelog ----------- v1.1 ----- General functionality: * Continue a hilive run in a specified cycle if the temporary file is available (--continue ARG) * Keep temporary alignment files of specified cycles (--keep-files ARG) * New option to keep all temporary alignment files (--keep-all-files) Output: * Multithreaded SAM/BAM output * Different alignment modes are supported for intermediate sequencing cycles * New output mode UNIQUE to report only unique alignments * Output mode can be changed when using hilive-out executable without losing information * Option to specify a minimum alignment score ratio for output (--min-as-ratio). v1.0 ----- General functionality: * Support of gapped k-mers * Support of arbitrary sequence structures (e.g. for paired-end sequencing) * Support of dual barcodes * Extensive algorithmic optimizations Usability: * K-mers are defined for each index individually instead of defining at compile time * K-mer definition is loaded from the index when running HiLive * Read sequencing information from Illuminas RunInfo.xml * Load HiLive settings from a settings file Index: * Slightly changed index. Index files of versions <1.0 are no longer supported. 
Output: * Completely revised output structure (one file per barcode; no separation in lanes or tiles) * Support BAM output * Real-time SAM/BAM output for intermediate sequencing cycles * Optional: Extended CIGAR format * Executable "hilive-out" to create SAM/BAM output from existing temporary alignment files v0.3 ----- * Live demultiplexing * Code optimization v0.2 ----- * Code optimization v0.1 ----- * Initial version HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/CMakeLists.txt000066400000000000000000000054461321155700500227570ustar00rootroot00000000000000############## ### Header ### cmake_minimum_required (VERSION 2.8) project (HiLive) # Set the version number add_definitions(-DHiLive_VERSION_MAJOR=1) add_definitions(-DHiLive_VERSION_MINOR=1) # Set flags for compilation set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -g -pthread -W -Wall -std=gnu++14 -O0") # add the binary tree to the search path for include files include_directories("${PROJECT_BINARY_DIR}") ############################# ### setup Boost libraries ### set(Boost_USE_STATIC_LIBS ON) set(Boost_USE_MULTITHREADED ON) set(Boost_USE_STATIC_RUNTIME ON) find_package( Boost COMPONENTS system filesystem program_options REQUIRED ) include_directories( ${Boost_INCLUDE_DIR} ) ############################ ### setup Zlib and lz4 libraries ### find_package( ZLIB REQUIRED ) #set (LZ4_PATH /usr/local/lib) # possibly adjust this to [pathToLz4]/lib if using downloaded lz4 source code include_directories(${LZ4_PATH}) link_directories(${LZ4_PATH}) set(CompressionLibs "${ZLIB_LIBRARIES};lz4") ############################# ### setup seqan libraries ### needs to be done AFTER searching for Zlib set (CMAKE_MODULE_PATH "/usr/local/lib/seqan/util/cmake") # adjust this to [pathToSeqanCode]/util/cmake set (SEQAN_INCLUDE_PATH "/usr/local/lib/seqan/include/") # adjust this to [pathToSeqanCode]/include # Configure SeqAn, enabling features for libbz2 and zlib. 
#set (SEQAN_FIND_DEPENDENCIES ZLIB BZip2) # original version from seqan tutorial set (SEQAN_FIND_DEPENDENCIES BZip2) find_package (SeqAn REQUIRED) # Add include directories, defines, and flags for SeqAn (and its dependencies). include_directories (${SEQAN_INCLUDE_DIRS}) add_definitions (${SEQAN_DEFINITIONS}) set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SEQAN_CXX_FLAGS}") ############################## ### setup HiLive libraries ### include_directories("${PROJECT_SOURCE_DIR}/lib") # make a list of HiLive libraries set (LIB_NAMES tools_static tools alnread alnstream illumina_parsers kindex parallel argument_parser) set(LIB_LIST "") foreach (x ${LIB_NAMES}) list(APPEND LIB_LIST "lib/${x}.cpp") endforeach() add_library(HiLiveLibs ${LIB_LIST}) ############################# ### Build the executables ### add_executable (hilive tools/hilive.cpp) target_link_libraries (hilive HiLiveLibs ${CompressionLibs} ${Boost_LIBRARIES} ${SEQAN_LIBRARIES}) add_executable(hilive-build tools/build_index.cpp ) target_link_libraries(hilive-build HiLiveLibs ${CompressionLibs} ${Boost_LIBRARIES} ${SEQAN_LIBRARIES}) add_executable(hilive-out tools/hilive_out.cpp ) target_link_libraries(hilive-out HiLiveLibs ${CompressionLibs} ${Boost_LIBRARIES} ${SEQAN_LIBRARIES}) ##################### ### for debugging ### #get_cmake_property(_variableNames VARIABLES) #foreach (_variableName ${_variableNames}) #message(STATUS "${_variableName}=${${_variableName}}") #endforeach() HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/CONTRIBUTORS000066400000000000000000000010101321155700500220560ustar00rootroot00000000000000Bernhard Y. Renard * Project head Martin S. Lindner * Project founder, implemented versions 0.1 and 0.2 Jakob M. Schulze * development v0.2 -> v0.3 * development support v0.3 -> v1.0 Tobias P. Loka * development v0.3 -> v1.0 * development v1.0 -> v1.1 Simon H. 
Tausch * continuous development support Kristina Kirsten * development support v0.3 -> v1.0 (real-time SAM/BAM output)HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/LICENSE000066400000000000000000000030221321155700500212100ustar00rootroot00000000000000Copyright (c) 2015-2017, Martin S. Lindner and the HiLive contributors. See CONTRIBUTORS for more info. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/README.md000066400000000000000000000124051321155700500214670ustar00rootroot00000000000000HiLive - Live Mapping of Illumina reads ======================================= Description ----------- HiLive is a read mapping tool that maps Illumina HiSeq (or comparable) reads right in the moment when they are produced. This means, read mapping is finished as soon as the sequencer is finished. Website ------- The HiLive project website is https://gitlab.com/SimonHTausch/HiLive There you can find the latest version of HiLive, source code, documentation, and examples. Installation ------------ If you are using a Debian based system you can directly install the Debian package from the official [Debian repository](https://packages.debian.org/sid/hilive "HiLive Debian package") If this does not work for you, you can still compile HiLive from source. Make sure that the following dependencies are installed: * cmake (>= 2.8) * boost (system, filesystem, program\_options) * zlib * lz4 If using a local version of lz4 then adjust path in CMakeLists.txt line 32. --- You also need to download seqan. Cloning the repository makes it possible to switch between different versions: git clone https://github.com/seqan/seqan.git HiLive 1.0 was tested with SeqAn version 2.3.2. --- Check out the HiLive source code from the project website and adjust the paths of the seqan module in the file CMakeLists.txt in the hilive folder (lines 41 and 42). Then, compile HiLive with: cd [hilive-code] mkdir build && cd build cmake .. make Usage ----- HiLive has three components: * ``hilive-build`` builds the k-mer index of the reference genome * ``hilive`` the read mapper itself * ``hilive-out`` executable to produce output files --- #### Using hilive-build: Building a k-mer index from FASTA file input.fa to output file input.fa.kix with k-mer weight 15: hilive-build input.fa 15 Building an index from a large reference genome. 
Here it makes sense to use trimming, i.e. removing k-mers from the index that occur more than 1000 times (for example) in the index. The index is written into the file trimmed.kix hilive-build -t 1000 -o trimmed.kix input.fa 15 For gapped k-mers, use the -p parameter to specify the gap positions. For example, for a gap pattern of 1101110011, type: hilive-build -p 3 7 8 input.fa 7 With the current index structure, we strongly recommend to use a maximum k-mer weight of 15 because of huge disk space and memory requirements for large k-mers. --- #### Using hilive: To map reads in a 100bp run using default settings: hilive /path/to/BaseCalls /path/to/index.kix 100 /path/to/outputFolder For an overview of additional parameters, type hilive --help To prevent errors during argument parsing we recommend to set optional parameters AFTER the positional options: hilive BC_DIR INDEX CYCLES OUTDIR [options] However, if unexpected parsing errors occur, please try to specify all parameters with the "--"-syntax (e.g. --BCDIR /path/to/BaseCalls) instead of using positional arguments. This is also necessary when loading (some of the) positional arguments from a settings file instead of using the command line. --- #### Using hilive-out: To create a SAM or BAM alignment output from existing temporary files in HiLive, type: hilive-out --settings /path/to/temp/dir/hilive_settings.xml This will output the alignment results of the last cycle based on the settings that were specified for the related HiLive run. To produce output files for other cycles, e.g. 50, 70, 90, type: hilive-out --settings /path/to/temp/dir/hilive_settings.xml --output-cycles 50 70 90 Please note, that the temporary files for the respective cycles must be present in the temp folder. This is only the case if the --keep-files parameter and/or the --output-cycles parameter for the respective cycles was activated in the HiLive run. 
#### Demultiplexing: To map reads from multiplexed sequencing runs, you can provide HiLive with the barcode sequences from your Sample Sheet. In default cases, barcode sequences are read after the (first) read, such that demultiplexing is carried out after the mapping is completed. If you use double indexing, please concatenate both indices in the correct order and provide them as one sequence. Please take care that the number of cycles is exactly the read length from your Sample Sheet plus that of your complete barcode sequence. All entered indices must be of the same length. To provide multiple indices, enter the -b argument for every barcode or barcode combination, e.g.: hilive /path/to/BaseCalls /path/to/index.kix 107 /path/to/outputFolder -b barcode1 -b barcode2 ... One output file will be produced for each barcode. To get alignments with undetermined barcodes, activate the --keep-all-barcodes parameter. Dual barcodes must be delimited with "-" (e.g., -b ATCGTGAT-TAGTTAGC for a 2x8bp barcode). --- Please consult the project website for more details on the parameters! License ------- See the file LICENSE for licensing information. Contact ------- Please consult the HiLive project website for questions! If this does not help, please feel free to consult: * Technical support (technical contact) * Bernhard Y. 
Renard (project head) also see CONTRIBUTORS for a complete list of contributors and their contact information HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/lib/000077500000000000000000000000001321155700500207545ustar00rootroot00000000000000HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/lib/alignmentSettings.h000066400000000000000000000612401321155700500246270ustar00rootroot00000000000000#ifndef ALIGNMENTSETTINGS_H #define ALIGNMENTSETTINGS_H #include "headers.h" #include "definitions.h" #include "tools_static.h" // Data structure to store the alignment settings class AlignmentSettings { private: // kmer gap positions Unmodifiable> kmer_gaps; // reverse gap positions Unmodifiable> rev_kmer_gaps; // PARAMETER: kmer span (automatically computed from kmer_weight and kmer_gaps) Unmodifiable kmer_span; // PARAMETER: Weight of the k-mers Unmodifiable kmer_weight; // VARIABLE: maximum number of consecutive gaps in the gap pattern (will be computed at runtime) Unmodifiable max_consecutive_gaps; // PARAMETER: Base Call quality cutoff, treat BC with quality < bc_cutoff as miscall Unmodifiable min_qual; // PARAMETER: max. 
insert/deletion size Unmodifiable window; // PARAMETER: minimum number of errors allowed in alignment Unmodifiable min_errors; // SWITCH: discard One-hit-wonders Unmodifiable discard_ohw; // PARAMETER: first cycle to discard one-hit-wonders Unmodifiable start_ohw; // PARAMETER: All-Best-N-Scores-Mode::N Unmodifiable best_n; // PARAMETER: temporary directory for the streamed alignment Unmodifiable temp_dir; // SWITCH: write sam/bam output or not Unmodifiable write_bam; // PARAMETER: Cycles for intermediate SAM/BAM output Unmodifiable> output_cycles; // PARAMETER: Cycles for which the temporary alignment files are kept Unmodifiable> keep_aln_files; // PARAMETER: Memory block size for the input and output buffer in the streamed alignment Unmodifiable block_size; // PARAMETER: Compression format for alignment files Unmodifiable compression_format; // PARAMETER: list of lanes to process Unmodifiable> lanes; // PARAMETER: list of tiles to process Unmodifiable> tiles; // PARAMETER: root directory of hilive run Unmodifiable root; // PARAMETER: path to the index file Unmodifiable index_fname; // PARAMETER: the first cycle to handle. Should be 1 by default. Unmodifiable start_cycle; // PARAMETER: read length of all reads (including barcodes) Unmodifiable cycles; //PARAMETER: Stores the barcodes defined by the user. The inner vector contains the single fragments of multi-barcodes. Unmodifiable>> barcodeVector; // PARAMETER: directory in which to create the output directory structure Unmodifiable out_dir; // PARAMETER: number of threads to use Unmodifiable num_threads; // PARAMETER: max. amount of threads used for output Unmodifiable num_out_threads; // SWITCH: activate extended CIGAR annotation Unmodifiable extended_cigar; /** * Contains the read information of the sequencing machine (as SequenceElement objects). Includes sequence reads and barcodes. * Arbitrary numbers and orders of reads are supported. The summed length of all elements must equal the number of sequencing cycles. 
* @author Tobias Loka */ Unmodifiable> seqs; // Number of mates (information taken from the seqLengths parameter), (Hint: corresponding indices are 1-based) Unmodifiable mates; // PARAMETER: number of allowed errors for the single barcodes Unmodifiable> barcode_errors; // SWITCH: if true, keep all barcodes (disables barcode filtering). Unmodifiable keep_all_barcodes; Unmodifiable mode; Unmodifiable min_as_ratio; Unmodifiable force_resort; /* Tries to set the wrapped value; returns false (silently) when set() throws unmodifiable_error, true otherwise. */ template bool set_unmodifiable(Unmodifiable & unmodifiable, T value, std::string variable_name) { try { unmodifiable.set(value); } catch (unmodifiable_error& e) { // std::cerr << e.what() << " (" << variable_name << ")." << std::endl; variable_name.length(); // TODO: just to remove compiler warnings. Remove variable_name string when finished. return false ; } return true; } /* Returns the wrapped value; when get() throws unmodifiable_error the message is printed to stderr and a default-constructed T is returned. */ template T get_unmodifiable(Unmodifiable unmodifiable, std::string variable_name, bool allow_unset = false) { try { return unmodifiable.get(allow_unset); } catch (unmodifiable_error& e) { std::cerr << e.what() << " (" << variable_name << ")." << std::endl; return T(); } } std::vector xmlParse_barcodeVector() { std::vector bc_strings; for ( CountType i = 0; i < get_barcodeVector().size(); i++ ) { bc_strings.push_back( get_barcodeString(i) ); } return bc_strings; } std::vector xmlParse_seqs() { std::vector seq_vector; for ( auto el : get_seqs() ) { std::string seq_string; seq_string += std::to_string(el.length); seq_string += el.mate == 0 ? 
"B" : "R"; seq_vector.push_back(seq_string); } return seq_vector; } void set_barcodeVector(std::vector > value) { set_unmodifiable(barcodeVector, value, "barcodeVector"); } void set_seqs(std::vector value) { set_unmodifiable(seqs, value, "seqs"); } void set_block_size(uint64_t value) { set_unmodifiable(block_size, value, "block_size"); } void set_mates(uint16_t value) { set_unmodifiable(mates, value, "mates"); } AlignmentMode get_mode() { return get_unmodifiable(mode, "mode"); } void set_mode(AlignmentMode value, CountType bestn = 0) { set_unmodifiable(mode, value, "mode"); set_unmodifiable(best_n, bestn, "best_n"); } public: /** * Create a property tree that is filled with all (relevant) settings. * @return Property tree containing all settings. * @author Tobias Loka */ boost::property_tree::ptree to_ptree() { boost::property_tree::ptree xml_out; // General settings xml_out.add_child("settings.lanes", getXMLnode_vector ( get_lanes() )); xml_out.add_child("settings.tiles", getXMLnode_vector ( get_tiles() )); xml_out.add_child("settings.min_errors", getXMLnode (get_min_errors() )); xml_out.add_child("settings.cycles", getXMLnode ( get_cycles() )); xml_out.add_child("settings.sequences", getXMLnode_vector ( xmlParse_seqs() )); // Barcode settings xml_out.add_child("settings.barcodes.sequences", getXMLnode_vector( xmlParse_barcodeVector() )); xml_out.add_child("settings.barcodes.errors", getXMLnode_vector ( get_barcode_errors() )); xml_out.add_child("settings.barcodes.keep_all", getXMLnode ( get_keep_all_barcodes() )); // Alignment mode std::string mode = std::string(1, char(get_mode())); if ( get_mode() == AlignmentMode::BESTN ) mode += std::to_string(get_best_n()); xml_out.add_child("settings.mode", getXMLnode ( mode )); // Paths xml_out.add_child("settings.paths.temp_dir", getXMLnode ( get_temp_dir() )); xml_out.add_child("settings.paths.out_dir", getXMLnode ( get_out_dir() )); xml_out.add_child("settings.paths.root", getXMLnode ( get_root() )); 
xml_out.add_child("settings.paths.index", getXMLnode ( get_index_fname() )); // Output settings xml_out.add_child("settings.out.bam", getXMLnode ( get_write_bam() )); xml_out.add_child("settings.out.cycles", getXMLnode_vector ( get_output_cycles() )); xml_out.add_child("settings.out.extended_cigar", getXMLnode ( get_extended_cigar() )); xml_out.add_child("settings.out.min_as_ratio", getXMLnode ( get_min_as_ratio()) ); // Technical settings xml_out.add_child("settings.technical.num_threads", getXMLnode ( get_num_threads() )); xml_out.add_child("settings.technical.num_out_threads", getXMLnode ( get_num_out_threads() )); xml_out.add_child("settings.technical.keep_aln_files", getXMLnode_vector ( get_keep_aln_files() )); xml_out.add_child("settings.technical.block_size", getXMLnode ( get_block_size() )); xml_out.add_child("settings.technical.compression_format", getXMLnode ( get_compression_format() )); // Alignment algorithm settings xml_out.add_child("settings.align.min_qual", getXMLnode (get_min_qual() )); xml_out.add_child("settings.align.window", getXMLnode (get_window() )); xml_out.add_child("settings.align.discard_ohw", getXMLnode ( get_discard_ohw() )); xml_out.add_child("settings.align.start_ohw", getXMLnode ( get_start_ohw() )); return xml_out; } void set_barcodes ( std::vector< std::string > barcodeArg ) { // Get the barcode length(s) from the seqs vector std::vector barcode_lengths; for ( uint16_t seq_num = 0; seq_num < get_seqs().size(); seq_num++ ) { if ( getSeqById(seq_num).isBarcode() ) barcode_lengths.push_back( getSeqById(seq_num).length ); } // Fill 2D-vector for internal storage std::vector > barcodeVector; for ( auto barcode = barcodeArg.begin(); barcode != barcodeArg.end(); ++barcode) { // Check if all characters in the current barcode are valid std::string valid_chars = seq_chars + "-"; for(CountType i = 0; i != (*barcode).length(); i++){ char c = (*barcode)[i]; if ( valid_chars.find(c) == std::string::npos ) throw std::runtime_error("Invalid 
character '" + std::string(1,c) + "' in barcode sequence " + *barcode + "."); } // Split barcode string into fragments std::vector fragments; split(*barcode, '-', fragments); // Check correct number of fragments if ( barcode_lengths.size() != fragments.size()) { throw std::runtime_error("Wrong number of fragments for barcode " + *barcode + " (should have " + std::to_string(barcode_lengths.size()) + " fragments)."); } // Check correct length of all fragments for ( uint16_t num = 0; num != fragments.size(); num++ ) { if ( fragments[num].length() != barcode_lengths[num] ) { throw std::runtime_error("Wrong fragment length in barcode " + *barcode); } } // Push barcode to the final 2D vector barcodeVector.push_back(fragments); } set_barcodeVector(barcodeVector); } std::vector > get_barcodeVector() { return get_unmodifiable(barcodeVector, "barcodeVector", true); } std::string format_barcode(std::string unformatted_barcode) { CountType pos = 0; for ( auto el : get_seqs() ) { if ( el.mate == 0 ) { pos+=el.length; if ( unformatted_barcode.length() >= pos ) unformatted_barcode.insert(pos++, "-"); else break; } } return unformatted_barcode.substr(0,pos-1); } std::string get_barcodeString(CountType index) { // invalid index if ( index >= get_barcodeVector().size() ) { return ""; } else { std::vector bc_vec = get_barcodeVector()[index]; std::stringstream ss; for ( auto fragment : bc_vec ) { ss << fragment; } std::string barcode_string = ss.str(); return format_barcode(barcode_string); } } void set_read_structure ( std::vector read_argument ) { // Init variables CountType lenSum = 0; CountType length = 0; std::string length_string = ""; char type; unsigned mates = 0; std::vector temp; // Iterate through input vector for ( auto read = read_argument.begin(); read != read_argument.end(); ++read ) { // Split string into fragment length and fragment type (B or R) length_string = (*read).substr(0,(*read).length()-1); type = (*(*read).rbegin()); if ( 
length_string.find_first_not_of("0123456789")!=std::string::npos ) { throw std::runtime_error("Invalid length for read fragment " + *read + ". Please only use unsigned integer values."); } /* NOTE(review): std::atol does not throw, so the std::bad_cast handler below looks unreachable - confirm. */ try{ length = CountType(std::atol(length_string.c_str())); } catch( std::bad_cast & ex ){ std::cerr << "Error while casting length " << length_string << " to type uint16_t." << std::endl; throw ex; } if ( type!='B' && type!='R' ) { std::stringstream ss; ss << "\'" << type << "\' is no valid read type. Please use \'R\' for sequencing reads or \'B\' for barcode reads."; throw std::runtime_error(ss.str()); } temp.push_back(SequenceElement(temp.size(), (type == 'R') ? ++mates : 0, length)); lenSum += length; } set_seqs(temp); set_mates(mates); if ( lenSum!=get_cycles() ) { throw std::runtime_error("Sum of defined reads does not equal the given number of cycles."); } } /** * Get a SequenceElement object from the seqs vector by using the id * @param id The id of the SequenceElement. * @return The respective SequenceElement object for the given id. * @author Tobias Loka */ SequenceElement getSeqById(CountType id) {return seqs.get()[id];} /** * Get a SequenceElement object from the seqs vector by using the mate number * @param mate The mate number of the SequenceElement. * @return The respective SequenceElement object for the given mate number. NULLSEQ if mate==0 (barcodes). 
* @author Tobias Loka */ SequenceElement getSeqByMate(CountType mate) { if ( mate == 0 ) return NULLSEQ; auto the_seq = seqs.get(); for (uint16_t i = 0; i != the_seq.size(); i++) { if(the_seq[i].mate == mate) return the_seq[i]; } return NULLSEQ; } std::vector get_kmer_gaps() { return get_unmodifiable(kmer_gaps, "kmer_gaps", true); } std::vector get_rev_kmer_gaps() { return get_unmodifiable(rev_kmer_gaps, "rev_kmer_gaps", true); } bool set_kmer( uint8_t kmer_weight, std::vector gaps ) { if ( gaps.size() > 0 ) { // Prepare user-defined list of gap positions (sort and erase duplicates) std::sort(gaps.begin(), gaps.end()); gaps.erase( std::unique(gaps.begin(), gaps.end()), gaps.end()); // Weight and gap positions not consistent if ( kmer_weight + gaps.size() <= *(std::max_element(gaps.begin(), gaps.end())) || *(std::min_element(gaps.begin(), gaps.end())) <= 1 ) { /* NOTE(review): the two string literals below are streamed back to back, so the message reads "andthe" (missing space). */ std::cerr << "Warning: k-mer weight and gap pattern not consistent. Ensure that the first gap positions is >1 and" << "the maximal gap positions is lower than the total length of the k-mer pattern." 
<< std::endl; return false; } } // Set k-mer variables set_unmodifiable(this->kmer_weight, kmer_weight, "kmer_weight"); set_unmodifiable(this->kmer_gaps, gaps, "kmer_gaps"); set_unmodifiable(this->kmer_span, uint8_t(kmer_weight + gaps.size()), "kmer_span"); std::vector rev_kmer_gaps; for ( auto gap:gaps ) { rev_kmer_gaps.push_back(this->kmer_span - gap + 1); } std::reverse(rev_kmer_gaps.begin(), rev_kmer_gaps.end()); set_unmodifiable(this->rev_kmer_gaps, rev_kmer_gaps, "rev_kmer_gaps"); // Compute maximal consecutive gaps in gap pattern CountType current_consecutive_gaps = 0; CountType last_gap = 0; CountType temp_max_consecutive_gaps = 0; for ( unsigned el : this->get_kmer_gaps() ) { // init first gap if ( last_gap == 0 ) { current_consecutive_gaps = 1; last_gap = el; continue; } // handle consecutive gaps else if ( el == unsigned( last_gap + 1 ) ){ current_consecutive_gaps += 1; last_gap = el; } // handle end of gap region else { temp_max_consecutive_gaps = std::max ( temp_max_consecutive_gaps, current_consecutive_gaps ); current_consecutive_gaps = 1; last_gap = el; } } set_unmodifiable(this->max_consecutive_gaps, std::max ( temp_max_consecutive_gaps, current_consecutive_gaps ), "max_consecutive_gaps"); return true; } uint8_t get_kmer_span() { return get_unmodifiable(kmer_span, "kmer_span"); } uint8_t get_kmer_weight() { return get_unmodifiable(kmer_weight, "kmer_weight"); } void set_min_qual(CountType value) { set_unmodifiable(min_qual, value, "min_qual"); } CountType get_min_qual() { return get_unmodifiable(min_qual, "min_qual"); } void set_window(DiffType value) { set_unmodifiable(window, value, "window"); } DiffType get_window() { return get_unmodifiable(window, "window"); } void set_min_errors(CountType value) { set_unmodifiable(min_errors, value, "min_errors"); } CountType get_min_errors() { return get_unmodifiable(min_errors, "min_errors"); } /* Note the inversion: discard_ohw is stored as !value. 
*/ void disable_ohw(bool value) { set_unmodifiable(discard_ohw, !value, "discard_ohw"); } bool get_discard_ohw() { 
return get_unmodifiable(discard_ohw, "discard_ohw"); } void set_start_ohw(CountType value) { set_unmodifiable(start_ohw, value, "start_ohw"); } CountType get_start_ohw() { return get_unmodifiable(start_ohw, "start_ohw"); } bool get_any_best_hit_mode() { return (get_unmodifiable(mode, "mode")==AlignmentMode::ANYBEST); } bool get_all_hit_mode() { return (get_mode()==AlignmentMode::ALL); } bool get_all_best_hit_mode() { return (get_mode()==AlignmentMode::ALLBEST); } bool get_all_best_n_scores_mode() { return (get_mode()==AlignmentMode::BESTN); } bool get_unique_hit_mode() { return (get_mode()==AlignmentMode::UNIQUE); } CountType get_best_n() { return get_unmodifiable(best_n, "best_n", true); } void set_temp_dir(std::string value) { set_unmodifiable(temp_dir, value, "temp_dir"); } /* Falls back to the root directory when temp_dir is the empty string. */ std::string get_temp_dir() { std::string dir = get_unmodifiable(temp_dir, "temp_dir"); dir = dir!="" ? dir : get_unmodifiable(root, "root"); return dir; } void set_write_bam(bool value) { set_unmodifiable(write_bam, value, "write_bam"); } bool get_write_bam() { return get_unmodifiable(write_bam, "write_bam"); } /* Cycles greater than get_cycles() are replaced by get_cycles(); the list is then sorted and de-duplicated before it is stored. 
*/ void set_output_cycles(std::vector cycles) { std::vector the_cycles; for ( auto it = cycles.begin(); it != cycles.end(); ++it ) { if ( *it > get_cycles() ) the_cycles.push_back(get_cycles()); else the_cycles.push_back(*it); } std::sort(the_cycles.begin(), the_cycles.end()); the_cycles.erase( std::unique(the_cycles.begin(), the_cycles.end()), the_cycles.end()); set_unmodifiable(output_cycles, the_cycles, "output_cycles"); } std::vector get_output_cycles() { return get_unmodifiable(output_cycles, "output_cycles", true); } bool is_output_cycle(CountType cycle) { auto out_cycles = get_output_cycles(); if ( std::find(out_cycles.begin(), out_cycles.end(), cycle) == out_cycles.end() ) return false; return true; } void set_keep_aln_files(std::vector value) { set_unmodifiable(keep_aln_files, value, "keep_aln_files"); } std::vector get_keep_aln_files() { return get_unmodifiable(keep_aln_files, 
"keep_aln_files"); } bool is_keep_aln_files_cycle(CountType cycle) { auto aln_files_cycles = get_keep_aln_files(); if ( std::find(aln_files_cycles.begin(), aln_files_cycles.end(), cycle) == aln_files_cycles.end() ) return false; return true; } void set_block_size(std::string value) { uint64_t size; char type = 'B'; // Split value to size and type if ( value.find_first_of("BKM") != std::string::npos ) { type = *value.rbegin(); value = value.substr(0,value.length()-1); } if ( value.find_first_not_of("0123456789")!=std::string::npos ) { throw std::runtime_error("Invalid block size " + value + ". Please only use unsigned integer values."); } /* NOTE(review): std::atol does not throw, so the std::bad_cast handler below looks unreachable - confirm. */ try{ size = uint64_t(std::atol(value.c_str())); } catch( std::bad_cast & ex ){ std::cerr << "Error while casting length " << value << " to type uint16_t." << std::endl; throw ex; } if ( type == 'B' ) set_block_size(size); else if ( type == 'K' ) set_block_size(size*1024); else if ( type == 'M' ) set_block_size(size*1024*1024); else throw std::runtime_error("Invalid block size type. 
Only 'B' (Bytes), 'K' (Kilobytes) or 'M' (Megabytes) are permitted."); } uint64_t get_block_size() { return get_unmodifiable(block_size, "block_size"); } /* Values above 2 are clamped to 2 before being stored as a single byte. */ void set_compression_format(uint16_t value) { if ( value > 2 ) value = 2; uint8_t one_byte_value = value; set_unmodifiable(compression_format, one_byte_value, "compression_format"); } uint8_t get_compression_format() { return get_unmodifiable(compression_format, "compression_format"); } void set_lanes(std::vector value) { std::sort( value.begin(), value.end() ); value.erase( std::unique( value.begin(), value.end() ), value.end() ); set_unmodifiable(lanes, value, "lanes"); } std::vector get_lanes() { return get_unmodifiable(lanes, "lanes", true); } void set_mode(std::string value) { // All hit mode if ( value == "ALL" || value == "A" ) { set_mode(AlignmentMode::ALL); } // Unique mode else if ( value == "UNIQUE" || value == "U" ) { set_mode(AlignmentMode::UNIQUE); } // Best N scores mode else if ( value.substr(0,5) == "BESTN" || value.substr(0,1) == "N" ) { std::string bestn = value.substr(0,5) == "BESTN" ? value.substr(5) : value.substr(1); if ( bestn.find_first_not_of("0123456789")!=std::string::npos ) { throw std::runtime_error("Invalid alignment mode: " + value + "."); } try{ set_mode(AlignmentMode::BESTN, CountType(std::atol(bestn.c_str()))); } catch( std::bad_cast & ex ){ std::cerr << "Error while casting length " << bestn << " to type uint16_t." 
<< std::endl; throw ex; } } // All best mode else if ( value == "ALLBEST" || value == "H" ) { set_mode(AlignmentMode::ALLBEST); } // Any best mode else if ( value == "ANYBEST" || value == "B" ) { set_mode(AlignmentMode::ANYBEST); } // Unknown mode else { throw std::runtime_error("Invalid alignment mode: " + value + "."); } } void set_tiles(std::vector value) { std::sort( value.begin(), value.end() ); value.erase( std::unique( value.begin(), value.end() ), value.end() ); set_unmodifiable(tiles, value, "tiles"); } std::vector get_tiles() { return get_unmodifiable(tiles, "tiles", true); } void set_root(std::string value) { set_unmodifiable(root, value, "root"); } std::string get_root() { return get_unmodifiable(root, "root"); } void set_index_fname(std::string value) { set_unmodifiable(index_fname, value, "index_fname"); } std::string get_index_fname() { return get_unmodifiable(index_fname, "index_fname"); } void set_cycles(CountType value) { set_unmodifiable(cycles, value, "cycles"); } CountType get_cycles() { return get_unmodifiable(cycles, "cycles"); } void set_start_cycle(CountType value) { set_unmodifiable(start_cycle, value, "start_cycle"); } CountType get_start_cycle() { CountType ret = get_unmodifiable(start_cycle, "start_cycle", true); // Value if not set. 
if ( ret == 0 ) return 1; return ret; } void set_out_dir(std::string value) { set_unmodifiable(out_dir, value, "out_dir"); } std::string get_out_dir() { return get_unmodifiable(out_dir, "out_dir"); } void set_num_threads(CountType value) { set_unmodifiable(num_threads, value, "num_threads"); } CountType get_num_threads() { return get_unmodifiable(num_threads, "num_threads"); } void set_num_out_threads(CountType value) { set_unmodifiable(num_out_threads, value, "num_out_threads"); } CountType get_num_out_threads() { return get_unmodifiable(num_out_threads, "num_out_threads"); } std::vector get_seqs() { return get_unmodifiable(seqs, "seqs", true); } uint16_t get_mates() { return get_unmodifiable(mates, "mates", true); } void set_barcode_errors(std::vector value) { set_unmodifiable(barcode_errors, value, "barcode_errors"); } std::vector get_barcode_errors() { return get_unmodifiable(barcode_errors, "barcode_errors", true); } void set_keep_all_barcodes(bool value) { set_unmodifiable(keep_all_barcodes, value, "keep_all_barcodes"); } /* Always returns true when no barcodes are defined (empty barcode vector). */ bool get_keep_all_barcodes() { if ( get_barcodeVector().size() == 0 ) return true; return get_unmodifiable(keep_all_barcodes, "keep_all_barcodes"); } void set_extended_cigar(bool value) { set_unmodifiable(extended_cigar, value, "extended_cigar"); } bool get_extended_cigar() { return get_unmodifiable(extended_cigar, "extended_cigar"); } CountType get_max_consecutive_gaps() { return get_unmodifiable(max_consecutive_gaps, "max_consecutive_gaps"); } float get_min_as_ratio() { return get_unmodifiable(min_as_ratio, "min_as_ratio"); } /* Clamps value to the range [0.0, 1.0] before storing it. 
*/ void set_min_as_ratio(float value) { if ( value > 1.0f ) value = 1.0f; if ( value < 0.0f ) value = 0.0f; set_unmodifiable(min_as_ratio, value, "min_as_ratio"); } bool get_force_resort() { return get_unmodifiable(force_resort, "force_resort"); } void set_force_resort(bool value) { set_unmodifiable(force_resort, value, "force_resort"); } }; #endif 
HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/lib/alnread.cpp000066400000000000000000001144061321155700500230740ustar00rootroot00000000000000#include "alnread.h" seqan::String > Seed::returnSeqanCigarString(unsigned* nm_i, unsigned* as_i) { typedef seqan::String > TSeqanCigarString; TSeqanCigarString seqanCigarString; seqan::CigarElement<> cigarElem; int last_offset = 0; for (CigarVector::const_iterator it = cigar_data.begin(); it != cigar_data.end(); ++it) { // Alignment begins with NO_MATCH region => Softclipped start if (it == cigar_data.begin() && (*it).offset==NO_MATCH ) { cigarElem.operation='S'; cigarElem.count=(*it).length; seqan::appendValue(seqanCigarString, cigarElem); continue; } // Alignment ends with NO_MATCH region => Softclipped end if (it == --cigar_data.end() && (*it).offset==NO_MATCH ) { cigarElem.operation='S'; cigarElem.count=(*it).length; seqan::appendValue(seqanCigarString, cigarElem); continue; } // Alignment ends with NO_MATCH + TRIMMED_MATCH region => Softclipped end if ( it == --(--cigar_data.end()) && (*it).offset==NO_MATCH && (--cigar_data.end())->offset==TRIMMED_MATCH ) { cigarElem.operation='S'; cigarElem.count=( (*it).length + (*(++it)).length ); seqan::appendValue(seqanCigarString, cigarElem); continue; } // Mismatch region if ((*it).offset==NO_MATCH) { cigarElem.operation = globalAlignmentSettings.get_extended_cigar() ? 'X' : 'M'; cigarElem.count=(*it).length; (*nm_i) += (*it).length; seqan::appendValue(seqanCigarString, cigarElem); continue; } // Match region if ((*it).offset!=NO_MATCH) { // no offset change if (last_offset == (*it).offset) { cigarElem.operation = globalAlignmentSettings.get_extended_cigar() ? 
'=' : 'M'; cigarElem.count=(*it).length; seqan::appendValue(seqanCigarString, cigarElem); (*as_i) += (*it).length; last_offset = (*it).offset; continue; } // offset gets bigger => reference in alignment is longer than read => deletion in read (and thereby cigar string) if (last_offset < (*it).offset) { cigarElem.operation='D'; cigarElem.count=(*it).offset - last_offset; (*nm_i) += (*it).offset - last_offset; seqan::appendValue(seqanCigarString, cigarElem); cigarElem.operation = globalAlignmentSettings.get_extended_cigar() ? '=' : 'M'; cigarElem.count=(*it).length; seqan::appendValue(seqanCigarString, cigarElem); (*as_i) += (*it).length; (*as_i) -= ( (*it).offset - last_offset ); last_offset = (*it).offset; continue; } // offset gets smaller => reference in alignment is smaller than read => insertion in read (and thereby cigar string) if (last_offset > (*it).offset) { cigarElem.operation='I'; cigarElem.count=last_offset - (*it).offset; (*nm_i) += last_offset - (*it).offset; seqan::appendValue(seqanCigarString, cigarElem); cigarElem.operation = globalAlignmentSettings.get_extended_cigar() ? 
'=' : 'M'; cigarElem.count=(*it).length; seqan::appendValue(seqanCigarString, cigarElem); (*as_i) += (*it).length; last_offset = (*it).offset; continue; } } } // collapse Neighboring identical regions for (unsigned k = 1; kgid << std::endl; std::cout << "start_pos: " << this->start_pos << std::endl; std::cout << "num_matches: " << this->num_matches << std::endl; std::cout << "CIGAR: "; for ( auto el : this->cigar_data ) { std::cout << el.length; if ( el.offset == NO_MATCH ) { std::cout << "X "; } else if (el.offset == TRIMMED_MATCH ) { std::cout << "T"; } else { std::cout << "M(" << el.offset << ") "; } } std::cout << std::endl << "------ SEED END ------" << std::endl; } uint16_t Seed::serialize_size() { // calculate total size uint16_t total_size = 0; total_size += sizeof(GenomeIdType); // the target genome ID total_size += sizeof(PositionType); // the start position total_size += sizeof(CountType); // the number of matching positions if (cigar_data.size() >= 256) throw std::overflow_error("CIGAR information contains more than 255 elements!"); uint8_t cigar_len = cigar_data.size(); total_size += sizeof(uint8_t); // the size of the cigar information total_size += cigar_len*(sizeof(CountType) + sizeof(DiffType)); // the cigar information itself return total_size; } std::vector Seed::serialize() { // get the total size of the serialization uint16_t total_size = serialize_size(); uint8_t cigar_len = (uint8_t) cigar_data.size(); // create the vector to store the data std::vector data (total_size); char* d = data.data(); // write the target Genome ID memcpy(d,&gid,sizeof(GenomeIdType)); d += sizeof(GenomeIdType); // write the start position memcpy(d,&start_pos,sizeof(PositionType)); d += sizeof(PositionType); // write the number of matches memcpy(d,&num_matches,sizeof(CountType)); d += sizeof(CountType); // write the number of cigar elements memcpy(d,&cigar_len,sizeof(uint8_t)); d += sizeof(uint8_t); // write the seeds for (auto it = cigar_data.begin(); it != 
cigar_data.end(); ++it) { memcpy(d,&(it->length),sizeof(CountType)); d += sizeof(CountType); memcpy(d,&(it->offset),sizeof(DiffType)); d += sizeof(DiffType); } return data; } uint16_t Seed::deserialize(char* d) { // the total number of bytes read uint16_t bytes = 0; // read the target Genome ID memcpy(&gid,d,sizeof(GenomeIdType)); bytes += sizeof(GenomeIdType); // read the start position memcpy(&start_pos,d+bytes,sizeof(PositionType)); bytes += sizeof(PositionType); // read the number of matches memcpy(&num_matches,d+bytes,sizeof(CountType)); bytes += sizeof(CountType); // read the number of cigar elements uint8_t cigar_len = 0; memcpy(&cigar_len,d+bytes,sizeof(uint8_t)); bytes += sizeof(uint8_t); // read the cigar elements cigar_data.clear(); for (uint8_t i = 0; i < cigar_len; ++i) { CigarElement cig; memcpy(&(cig.length),d+bytes,sizeof(CountType)); bytes += sizeof(CountType); memcpy(&(cig.offset),d+bytes,sizeof(DiffType)); bytes += sizeof(DiffType); cigar_data.emplace_back(cig); } return bytes; } void ReadAlignment::set_total_cycles(CountType c) { total_cycles = c; } uint64_t ReadAlignment::serialize_size() { // calculate total size first uint64_t total_size = 0; total_size += 1; // the flag total_size += sizeof(CountType); // the cycle number total_size += sizeof(CountType); // the last_invalid cycle total_size += sizeof(CountType); // the sequence length total_size += sequenceStoreVector.size()*(sizeof(uint8_t)); // the sequence information itself total_size += sizeof(CountType); // The barcode length total_size += barcodeStoreVector.size()*(sizeof(uint8_t)); // the barcode sequence information // total number of seeds total_size += sizeof(uint32_t); // size of the single seeds for (auto & s : seeds) { total_size += sizeof(uint16_t) + s->serialize_size(); } return total_size; } std::vector ReadAlignment::serialize() { // get the total size of the serialization uint64_t total_size = serialize_size(); uint32_t num_seeds = (uint32_t) seeds.size(); // create the 
vector to store the data std::vector data (total_size); char* d = data.data(); // write the flag memcpy(d,&flags,1); d++; // write the cycle memcpy(d,&cycle,sizeof(CountType)); d += sizeof(CountType); // write the last invalid cycle memcpy(d,&last_invalid,sizeof(CountType)); d += sizeof(CountType); // write the sequence length memcpy(d,&sequenceLen,sizeof(CountType)); d += sizeof(CountType); // write the sequenceStoreVector for (auto it = sequenceStoreVector.begin(); it != sequenceStoreVector.end(); ++it) { memcpy(d,&(*it),sizeof(uint8_t)); d += sizeof(uint8_t); } // write the barcode length memcpy(d,&barcodeLen,sizeof(CountType)); d += sizeof(CountType); // write the barcodeStoreVector for (auto it = barcodeStoreVector.begin(); it != barcodeStoreVector.end(); ++it) { memcpy(d,&(*it),sizeof(uint8_t)); d += sizeof(uint8_t); } // write the number of seeds memcpy(d,&num_seeds,sizeof(uint32_t)); d += sizeof(uint32_t); // write the seeds for (auto it = seeds.begin(); it != seeds.end(); ++it) { std::vector seed_data = (*it)->serialize(); uint16_t seed_size = seed_data.size(); memcpy(d,&seed_size,sizeof(uint16_t)); d += sizeof(uint16_t); memcpy(d,seed_data.data(),seed_size); d += seed_size; } return data; } uint64_t ReadAlignment::deserialize(char* d) { // the total number of bytes read uint64_t bytes = 0; // read the flag memcpy(&flags,d,1); bytes++; // read the cycle memcpy(&cycle,d+bytes,sizeof(CountType)); bytes += sizeof(CountType); // read the last invalid cycle memcpy(&last_invalid,d+bytes,sizeof(CountType)); bytes += sizeof(CountType); // read the sequence length sequenceLen = 0; memcpy(&sequenceLen,d+bytes,sizeof(CountType)); bytes += sizeof(CountType); // read the sequence unsigned seqVec_size = sequenceLen; sequenceStoreVector.clear(); sequenceStoreVector.reserve(seqVec_size); for (unsigned i = 0; i seed_data (seed_size,0); memcpy(seed_data.data(),d+bytes,seed_size); bytes += seed_size; USeed s (new Seed); s->deserialize(seed_data.data()); // insert into sorted 
list. The data read in is already sorted!!! // therefore I only push back seeds.push_back(std::move(s)); } return bytes; } // convert and return sequence of the seed as string std::string ReadAlignment::getSequenceString() { std::string seq = ""; uint8_t two_bit_mask = 3; // iterate through all sequence bytes for (unsigned i = 0; i N-call seq.append("N"); } else { // two-bit qual > 0 --> write nucleotide seq += revtwobit_repr(next & two_bit_mask); } } // return barcode sequence return seq; } // convert and return sequence of the seed as string std::string ReadAlignment::getQualityString() { std::string qual = ""; // iterate through all sequence bytes for (unsigned i = 0; i> 2; qual += (to_phred_quality(next_qual)); } // return PHRED quality sequence return qual; } std::string ReadAlignment::getBarcodeString() { std::string seq = ""; uint8_t two_bit_mask = 3; // iterate through all sequence bytes for (unsigned i = 0; i N-call seq.append("N"); } else { // two-bit qual > 0 --> write nucleotide seq += revtwobit_repr(next & two_bit_mask); } } // return barcode sequence return seq; } // append one nucleotide to sequenceStoreVector void ReadAlignment::appendNucleotideToSequenceStoreVector(char bc, bool appendToBarCode) { // Store byte CountType & len = appendToBarCode ? barcodeLen : sequenceLen; std::vector & seqVector = appendToBarCode ? 
barcodeStoreVector : sequenceStoreVector; seqVector.push_back(bc); ++len; return; } // helper function for add_new_seeds bool seed_compare_pos (const USeed & i, const USeed & j) { if ( i->start_pos == j->start_pos ) return i->gid < j->gid; return (i->start_pos < j->start_pos); } // helper function for alignment output bool seed_compare_errors (const USeed & i, const USeed & j) { // CountType i_err = min_errors(i); // CountType j_err = min_errors(j); // if ( i_err == j_err ) // return i->num_matches >= j->num_matches; // return ( i_err < j_err ); if ( i->num_matches == j->num_matches ) return min_errors(i) <= min_errors(j); return i->num_matches > j->num_matches; } void ReadAlignment::sort_seeds_by_errors() { seeds.sort(seed_compare_errors); } // Create new seeds from a list of kmer positions and add to current seeds void ReadAlignment::add_new_seeds(GenomePosListType& pos, std::vector & posWasUsedForExtension) { SeedVecIt sit = seeds.begin(); CigarVector front; CountType num_matches_placeholder = 0; // If PLACEHOLDER exist, start with its CIGAR vector if ( seeds.size() > 0 && (*sit)->gid == TRIMMED ) { front = (*sit)->cigar_data; num_matches_placeholder = (*sit)->num_matches; seeds.pop_front(); sit = seeds.begin(); } // If no PLACEHOLDER exist, create initial CIGAR vector else { if ( cycle > globalAlignmentSettings.get_kmer_span() ) { front.emplace_back(cycle-globalAlignmentSettings.get_kmer_span(), NO_MATCH); } front.emplace_back(0,0); } for(GenomePosListIt it = pos.begin(); it != pos.end(); ++it) { if (posWasUsedForExtension[it - pos.begin()]) // if current reference hit was used at least once for seed extension continue; USeed s (new Seed); s->gid = it->gid; s->start_pos = it->pos - (cycle-globalAlignmentSettings.get_kmer_span()); s->num_matches = globalAlignmentSettings.get_kmer_weight() + num_matches_placeholder; s->cigar_data = front; // set correct matches and mismatches depending on kmer mask std::vector gapVec = s->start_pos > 0 ? 
globalAlignmentSettings.get_kmer_gaps() : globalAlignmentSettings.get_rev_kmer_gaps(); gapVec.push_back(globalAlignmentSettings.get_kmer_span()+1); unsigned lastProcessedGapPosition = 0; for (unsigned gapIndex = 0, nextGap; gapIndex < gapVec.size(); ++gapIndex) { nextGap = gapVec[gapIndex]; // Join first kmer match region with the existing match region at the end of the CIGAR string if ( lastProcessedGapPosition == 0 ) s->cigar_data.back().length += (nextGap - lastProcessedGapPosition - 1); else if (nextGap - lastProcessedGapPosition - 1 > 0) s->cigar_data.emplace_back(nextGap - lastProcessedGapPosition - 1,0); lastProcessedGapPosition = nextGap; s->cigar_data.emplace_back(1,NO_MATCH); } s->cigar_data.pop_back(); // remove tailing NO_MATCH from the for-loop // insert seed into sorted list of seeds // PLACEHOLDER does not have to be considered here because it was converted during seed creation while (sit != seeds.end() && seed_compare_pos(*sit, s)) // if seed exists and elem has larger starting position than (*sit) ++sit; sit = seeds.insert(sit, std::move(s)); } } // Extend or create a placeholder seed for read with only trimmed matches void ReadAlignment::create_placeholder_seed() { // Don't create PLACEHOLDER if already exist if ( minErrors_in_region( cycle - globalAlignmentSettings.get_kmer_span(), 1) > globalAlignmentSettings.get_min_errors() ) { return; } // Don't create PLACEHOLDER if already existing if ( seeds.size() > 0 && (*seeds.begin())->gid == TRIMMED ) { return; } USeed s (new Seed); s->gid = TRIMMED; s->num_matches = 1; s->cigar_data.clear(); if ( cycle > globalAlignmentSettings.get_kmer_span() ) s->cigar_data.emplace_back(cycle - globalAlignmentSettings.get_kmer_span(), NO_MATCH); s->cigar_data.emplace_back(1,0); // Put PLACEHOLDER to the first position of the vector seeds.push_front(std::move(s)); } CountType minErrors_in_region(CountType region_length, CountType border, CountType offset_change) { // Border must not be larger than 2 if ( border > 2 
) return 0; // Estimate no errors for regions of length 0 if ( region_length == 0 ) { return offset_change; } // Magic formula to estimate the minimal number of matches (supports gapped/spaced kmers) int minErr = ( 1 - border ) + ( ( region_length + border + globalAlignmentSettings.get_kmer_span() + globalAlignmentSettings.get_max_consecutive_gaps() - 2 ) / (globalAlignmentSettings.get_kmer_span() + globalAlignmentSettings.get_max_consecutive_gaps()) ); minErr = std::max( minErr, int(offset_change) ); // Catch negative values if ( minErr < 0 ) return 0; return CountType(minErr); } CountType min_errors(const USeed & s) { CigarVector* c = &(s->cigar_data); // Catch elements with length 1 beforehand to save runtime. There can't be any errors in this case. if ( (*c).size() <= 1 ) return 0; CountType minErr = 0; CountType border = 0; CountType region_length = 0; DiffType last_offset = 0; // Iterate through all CIGAR elements for ( auto cig_el = (*c).begin(); cig_el != (*c).end(); ++cig_el ) { if ( cig_el == (--(*c).end()) && cig_el->offset == TRIMMED_MATCH ) { border += 1; continue; } // Finish error region if CIGAR MATCH element spans a complete k-mer if ( cig_el->offset != NO_MATCH && cig_el->length >= ( globalAlignmentSettings.get_kmer_span() -1 ) ) { CountType offset_change = ( cig_el->offset > last_offset ) ? 
( cig_el->offset - last_offset ) : ( last_offset - cig_el->offset ); minErr += minErrors_in_region(region_length, border, offset_change); region_length = 0; border = 0; last_offset = cig_el->offset; continue; } // Init or continue error region for NO_MATCH and too short MATCH elements else { region_length += cig_el->length; border += ( cig_el == (*c).begin() ); border += ( cig_el == ( --( (*c).end() ) ) ); } } DiffType final_offset = NO_MATCH; // Compute offset of the last match CIGAR element for ( auto cig_el = (*c).rbegin(); final_offset == NO_MATCH || final_offset == TRIMMED_MATCH; ++cig_el ) { final_offset = (*cig_el).offset; } CountType offset_change = ( final_offset > last_offset ) ? ( final_offset - last_offset ) : ( last_offset - final_offset ); // Finish last region minErr += minErrors_in_region(region_length, border, offset_change); return minErr; } // filter seeds based on filtering mode and q gram lemma. Also calls add_new_seeds. void ReadAlignment::filterAndCreateNewSeeds(GenomePosListType & pos, std::vector & posWasUsedForExtension) { // Compute the number of maximum estimated number of errors for the remaining cycles CountType possible_remaining_errors = minErrors_in_region( total_cycles - cycle, 1); CountType min_num_errors = globalAlignmentSettings.get_min_errors(); CountType max_num_matches = 0; // only required for any best mode in last cycle // Compute the number of errors to remove a seed in any_best and all_best mode if ( globalAlignmentSettings.get_all_best_hit_mode() || globalAlignmentSettings.get_any_best_hit_mode() ) { for(SeedVecIt sd = seeds.begin() ; sd !=seeds.end(); ++sd) { // Ignore PLACEHOLDER seed if ( (*sd)->gid == TRIMMED ) { continue; } // If seed has the lowest maximal number of errors set the values CountType max_seed_errors = min_errors(*sd) + possible_remaining_errors; if ( max_seed_errors < min_num_errors ) { min_num_errors = max_seed_errors; max_num_matches = (*sd)->num_matches; } else if ( max_seed_errors == 
min_num_errors ) { max_num_matches = std::max( max_num_matches, (*sd)->num_matches ); } } } // Fill the vector containing the best n scores for seed filtering decisions in all_best_n mode else if ( globalAlignmentSettings.get_all_best_n_scores_mode() && globalAlignmentSettings.get_best_n() > 0 ) { std::set all_min_errors; for(SeedVecIt sd = seeds.begin() ; sd !=seeds.end(); ++sd) { if ( (*sd)->gid == TRIMMED ) { continue; } all_min_errors.insert( min_errors(*sd) ); } auto it = all_min_errors.begin(); if ( all_min_errors.size() > 0 ) { std::advance(it, std::min( int(globalAlignmentSettings.get_best_n() - 1 ) , int (all_min_errors.size() - 1 ) ) ); min_num_errors = (*it) + possible_remaining_errors; } } // All hit mode: Only consider the min_errors parameter else { min_num_errors = globalAlignmentSettings.get_min_errors(); } // delete all seeds which do not reach threshold SeedVecIt it=seeds.begin(); // bool foundHit = false; while ( it!=seeds.end()) { // Handle PLACEHOLDER seeds separately if ( (*it)->gid == TRIMMED ) { // Filter if last cycle if ( cycle == total_cycles ) { it = seeds.erase(it); continue; } // Keep it otherwise ++it; continue; } CountType seed_errors = min_errors(*it); // Filter all seeds that have more errors than the threshold if ( seed_errors > min_num_errors ) { it = seeds.erase(it); continue; } // Filter One-hit-Wonders else if ( globalAlignmentSettings.get_discard_ohw() && (cycle>globalAlignmentSettings.get_start_ohw()) && ((*it)->num_matches <= globalAlignmentSettings.get_kmer_weight()) && ( (*it)->cigar_data.back().length > globalAlignmentSettings.get_max_consecutive_gaps() ) ) { it = seeds.erase(it); continue; } // Don't do that any longer since we re-filter during output. 
// Filter suboptimal alignments in the last cycle for any_best mode // else if (cycle == total_cycles && globalAlignmentSettings.get_any_best_hit_mode() && ( (*it)->num_matches < max_num_matches || foundHit ) ) { // it = seeds.erase(it); // continue; // } else ++it; // If not filtered, a hit was found // foundHit = true; } // Create new seeds if they have a chance to stay below the error threshold (Consider the number of matches given by a PLACEHOLDER seed) CountType placeholder_matches = 0; if ( seeds.size() > 0 && (*seeds.begin())->gid == TRIMMED ) { placeholder_matches = (*seeds.begin())->num_matches; } if ( pos.size() != 0 && cycle < total_cycles && minErrors_in_region( cycle - placeholder_matches - globalAlignmentSettings.get_kmer_span(), 1) <= min_num_errors ) { add_new_seeds(pos, posWasUsedForExtension); } } // updates cigar_data accordingly to a new matching kmer void ReadAlignment::addMatchingKmer(USeed & s, DiffType offset) { s->cigar_data.emplace_back(1,offset); s->num_matches += 1; //////////////////////////////////////////////////////////// //// determine last occurred offset //////////////////////// int last_offset = 0; if ((*prev(prev(s->cigar_data.end()))).offset != NO_MATCH) // if last CigarElement is Match last_offset = (*prev(prev(s->cigar_data.end()))).offset; else // then the one before has to be a match last_offset = (*prev(prev(prev(s->cigar_data.end())))).offset; assert(last_offset != NO_MATCH); //////////////////////////////////////////////////////////// //// split last kmer-span bases in single CigarElements //// CigarVector::iterator it = --(s->cigar_data.end()); unsigned summedLength = 1; while (summedLength < globalAlignmentSettings.get_kmer_span()) { ++summedLength; --it; if ((*it).length > 1) { CigarElement elem1(1, (*it).offset); CigarElement elem2((*it).length-1, (*it).offset); s->cigar_data.insert(it, elem2); s->cigar_data.insert(it, elem1); it = s->cigar_data.erase(it); // points now at element after former it --it; // points now 
// Extend an existing seed (to be precise, extend the CIGAR vector / data).
//
// s      : the seed whose cigar_data (and num_matches) is updated in place.
// offset : offset of the k-mer hit that extends the seed, or one of the
//          sentinels NO_MATCH / TRIMMED_MATCH.
// Returns true if the k-mer was taken as a (trimmed) match, false if the seed
// was extended by a mismatch. The caller in extend_alignment() uses a true
// result to mark the index position as used for extension.
//
// The handling depends on the kind of the incoming k-mer (TRIMMED_MATCH,
// NO_MATCH or a real offset) and on the kind of the last CIGAR element.
bool ReadAlignment::extendSeed(USeed & s, DiffType offset){

    // TODO: Do we need to handle this?
    // Extend placeholder seed only with trimmed k-mer
    if ( s->gid == TRIMMED ) {

        // Extend PLACEHOLDER when last k-mer is trimmed
        if ( offset == TRIMMED_MATCH ) {
            s->cigar_data.back().length += 1;
            s->num_matches += 1;
            return true;
        }

        // Everything else cannot happen, but if so, just return false.
        return false;
    }

    // Extend CIGAR for TRIMMED k-mers
    if ( offset == TRIMMED_MATCH ) {

        // TRIMMED_MATCH --> TRIMMED_MATCH: lengthen the trimmed region
        if ( s->cigar_data.back().offset == TRIMMED_MATCH ) {
            s->cigar_data.back().length += 1;
        }
        // NO_MATCH --> TRIMMED_MATCH: open a new trimmed region
        else if ( s->cigar_data.back().offset == NO_MATCH ){
            s->cigar_data.emplace_back(1,TRIMMED_MATCH);
        }
        // MATCH --> TRIMMED_MATCH: treated as a match with the current offset
        else {
            addMatchingKmer(s, s->cigar_data.back().offset);
        }
        return true;
    }

    // Extend CIGAR for NO_MATCH k-mer
    else if ( offset == NO_MATCH ) {

        // NO_MATCH --> NO_MATCH
        if ( s->cigar_data.back().offset == NO_MATCH ) {
            s->cigar_data.back().length += 1;
            return false;
        }

        // TRIMMED_MATCH --> NO_MATCH
        // The trimmed region is removed and its length (plus the current k-mer) is added
        // to the element before it. This branch has no return of its own and ends in
        // the final "return false" of the function.
        else if ( s->cigar_data.back().offset == TRIMMED_MATCH ) {
            CountType trimmed_length = s->cigar_data.back().length;
            s->cigar_data.erase( (--(s->cigar_data.end())) );
            s->cigar_data.back().length += (trimmed_length + 1);
        }

        // MATCH --> NO_MATCH
        else {
            s->cigar_data.emplace_back(1, NO_MATCH);
            return false;
        }
    }

    // Extend CIGAR for MATCH k-mer
    else {

        // NO_MATCH --> MATCH
        if ( s->cigar_data.back().offset == NO_MATCH ) {

            // The element before the trailing NO_MATCH region is expected to be a real match
            assert((++(s->cigar_data.rbegin()))->offset != TRIMMED_MATCH && (++(s->cigar_data.rbegin()))->offset != NO_MATCH);
            int offset_change = offset - (++(s->cigar_data.rbegin()))->offset;

            // If there is an offset change, I need to have seen the appropriate mismatches before.
            if ( offset_change == 0
                    || ((offset_change < 0) && (s->cigar_data.back().length >= -offset_change + globalAlignmentSettings.get_kmer_span() - 1)) // Insertion in read
                    || ((offset_change > 0) && (s->cigar_data.back().length >= globalAlignmentSettings.get_kmer_span() - 1 )) ) { // Deletion in read
                addMatchingKmer(s, offset);
                return true;
            // Appropriate mismatches not existing: Extend mismatch area
            } else {
                s->cigar_data.back().length += 1;
                return false;
            }
        }

        // TRIMMED_MATCH --> MATCH
        else if ( s->cigar_data.back().offset == TRIMMED_MATCH ) {

            // Remove the trimmed region; its k-mers are re-added below as matches or mismatches
            CountType trimmed_length = s->cigar_data.back().length;
            s->cigar_data.erase( (--(s->cigar_data.end())) );

            int offset_change = offset - (++(s->cigar_data.rbegin()))->offset;

            // Insertion or deletion in read
            if ( offset_change != 0 ) {

                // Offset change is only considered for Insertions (negative offset change)
                int considered_offsetChange = (offset_change < 0) ? offset_change : 0;

                // Move TRIMMED_MATCHES from TRIMMED_MATCH region to previous NO_MATCH region such that the offset criteria are fulfilled.
                // If this is not possible, count all trimmed MATCHes and current MATCH as NO_MATCH.
                if ( (s->cigar_data.back().length < globalAlignmentSettings.get_kmer_span() - considered_offsetChange - 1) ) {
                    CountType required_nomatches = std::max(0, int(globalAlignmentSettings.get_kmer_span()) - considered_offsetChange - 1 - s->cigar_data.back().length);
                    if ( trimmed_length >= required_nomatches ) {
                        trimmed_length -= required_nomatches;
                        s->cigar_data.back().length += required_nomatches;
                    } else {
                        s->cigar_data.back().length += (trimmed_length + 1);
                        return false;
                    }
                }
            }

            // Add all previous TRIMMED k-mers as MATCH k-mers.
            // (trimmed_length + 1 calls: the remaining trimmed k-mers plus the current one)
            for ( CountType i = 0; i < trimmed_length + 1; i++ ) {
                addMatchingKmer(s, offset);
            }
            return true;
        }

        // MATCH --> MATCH
        else {
            int offset_change = offset - s->cigar_data.rbegin()->offset;

            // without any mismatch in between there cannot be a valid offset_change other than 0
            if (offset_change!=0) {
                // new mismatch region
                s->cigar_data.emplace_back(1,NO_MATCH);
                return false;
            }

            // else: extend current match region
            addMatchingKmer(s, offset);
            return true;
        }
    }

    // Default: only reached by the TRIMMED_MATCH --> NO_MATCH branch above.
    return false;
}
last_kmer); pos = index->retrieve_positions(sequence.substr(sequence.length()-globalAlignmentSettings.get_kmer_span())); posWasUsedForExtension.resize(pos.size(), false); // check if the current k-mer was trimmed in the index if ( (pos.size() == 1) && ((*pos.begin()).gid == TRIMMED) ) { // pretend that all existing seeds could be extended for(auto sd = seeds.begin() ; sd !=seeds.end(); ++sd) extendSeed(*sd, TRIMMED_MATCH); if ( seeds.size() == 0 || (*seeds.begin())->gid != TRIMMED ) create_placeholder_seed(); // clear the pos list so nothing bad happens in the next steps pos.clear(); } // not trimmed in the index --> try to extend existing seeds else { // find support for each candidate: iterate over seed candidates and positions simultaneously auto cPos1 = pos.begin(), cPos2 = pos.begin(); // sliding window [cPos1, cPos2) // if ( pos.size() > 0 ) { // std::cout << cPos1->pos << std::endl; // } for (auto cSeed = seeds.begin(); cSeed!=seeds.end(); ++cSeed ) { // Don't handle PLACEHOLDER seed if ( (*cSeed)->gid == TRIMMED ) { continue; } // Compute the last offset of the current seed PositionType last_offset = prev((*cSeed)->cigar_data.end())->offset; if(last_offset == NO_MATCH) { last_offset = prev(prev((*cSeed)->cigar_data.end()))->offset; } else if ( last_offset == TRIMMED_MATCH ) { last_offset = prev(prev(prev((*cSeed)->cigar_data.end())))->offset; } // Compute the optimal match position for the next k-mer PositionType seed_pos = (*cSeed)->start_pos + cycle - globalAlignmentSettings.get_kmer_span() + last_offset; // adjust the window in the position list while( (cPos1!=pos.end()) && (cPos1->pos < seed_pos - globalAlignmentSettings.get_window()) ) ++cPos1; while( (cPos2!=pos.end()) && (cPos2->pos <= seed_pos + globalAlignmentSettings.get_window()) ) ++cPos2; // search all positions in the window for the best matching extension of the seed DiffType best_offset = globalAlignmentSettings.get_window()+1; // set larger than search window GenomePosListIt best_match = 
cPos2; // set behind the last element of the window for(GenomePosListIt kmerHitIt = cPos1; kmerHitIt!=cPos2; ++kmerHitIt) if (kmerHitIt->gid == (*cSeed)->gid){ // if offset gets bigger => Deletion in read int offset = kmerHitIt->pos - seed_pos; if ((best_match==cPos2)||(abs(offset) < abs(best_offset))) { best_match = kmerHitIt; best_offset = offset; } } // check if a best match was found for this seed if (best_match != cPos2) { if(extendSeed(*cSeed, best_offset + last_offset)) // if pos was used as match, mark it so that later it does not get converted into a new seed posWasUsedForExtension[best_match-pos.begin()] = true; } else{ // no position found to extend the current seed extendSeed(*cSeed, NO_MATCH); } } // END: for(seeds...) } // END: not trimmed } // END: if last kmer is valid filterAndCreateNewSeeds(pos, posWasUsedForExtension); if ( testRead ) { for ( auto seed = seeds.begin(); seed != seeds.end(); ++seed ) { (*seed)->cout(); std::cout << "Seed's min errors: " << min_errors((*seed)) << std::endl;; } } return; } CountType ReadAlignment::getBarcodeIndex() { // Get the barcodes of the read std::string read_bc = getBarcodeString(); if ( read_bc.length() == 0 ) return NO_MATCH; uint16_t fragment_errors = 0; uint16_t fragment_pos = 0; uint16_t fragment_num = 0; uint16_t matching_bc = NO_MATCH; // Iterate through all user-defined (multi-)barcodes // That's quite complicated since the read barcodes are consecutive and the user barcodes are divided in vectors. // TODO: change that? 
for ( uint16_t barcodeIndex = 0; barcodeIndex < globalAlignmentSettings.get_barcodeVector().size(); barcodeIndex++ ) { // reset values for the barcode fragment_errors = 0; fragment_pos = 0; fragment_num = 0; matching_bc = barcodeIndex; // for each base of the read barcode for ( uint16_t nucl = 0; nucl < read_bc.length(); nucl++ ) { // reset values for each barcode fragment if ( fragment_pos >= (globalAlignmentSettings.get_barcodeVector()[barcodeIndex])[fragment_num].length() ) { fragment_pos = 0; fragment_num += 1; fragment_errors = 0; assert( fragment_num < (globalAlignmentSettings.get_barcodeVector()[barcodeIndex]).size() ); } // compare nucleotides and increase the number of fragment errors if not equal if ( read_bc.at(nucl) != (globalAlignmentSettings.get_barcodeVector()[barcodeIndex])[fragment_num].at(fragment_pos) ) { fragment_errors++; } // if too many errors in a fragment, break the loop for the barcode if ( fragment_errors > globalAlignmentSettings.get_barcode_errors()[fragment_num] ) { matching_bc = NO_MATCH; break; } fragment_pos += 1; // increment the fragment position } // if one barcode fulfilled the criteria, we can stop. if ( matching_bc != NO_MATCH ) break; } return matching_bc; } // disable this alignment, i.e. delete all seeds and set the last_invalid indicator to the // end of the read. --> This read will not be aligned and consumes almost no space. 
void ReadAlignment::disable() { last_invalid = total_cycles; seeds.clear(); flags = 0; sequenceLen=0; sequenceStoreVector.clear(); } // obtain start position of a seed according to SAM (leftmost) PositionType ReadAlignment::get_SAM_start_pos(USeed & sd) { PositionType pos = sd->start_pos; if (pos < 0) { if (sd->cigar_data.back().offset == NO_MATCH) pos = -pos - total_cycles + globalAlignmentSettings.get_kmer_span() - (++sd->cigar_data.rbegin())->offset; else pos = -pos - total_cycles + globalAlignmentSettings.get_kmer_span() - (sd->cigar_data.rbegin())->offset; } return pos; } // Calculate the mapping quality for all alignments of the read based on the other alignments and the number of matching positions. int16_t MAPQ(const SeedVec &sv){ return sv.size(); } HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/lib/alnread.h000066400000000000000000000154211321155700500225360ustar00rootroot00000000000000#ifndef ALNREAD_H #define ALNREAD_H #include "headers.h" #include "definitions.h" #include "kindex.h" #include "tools.h" #include #include //-------------------------------------------------------------------// //------ The Seed data structure ----------------------------------// //-------------------------------------------------------------------// // a Seed stores the alignment of a read to a target genome struct Seed { // internal sequence ID of taget genome GenomeIdType gid; // (estimated) start position of the read on the target PositionType start_pos; // number of matching bases CountType num_matches; // Information about matches/mismatches (similar to CIGAR). 
The last element is the current one CigarVector cigar_data; // return Seqans String of CigarElement seqan::String > returnSeqanCigarString(unsigned* nm_i, unsigned* as_i); // get the size of the serialized object uint16_t serialize_size(); // serialize the object std::vector serialize(); // deserialize (read) data from a char vector uint16_t deserialize(char* d); void cout(); }; typedef std::unique_ptr USeed; // compare function to sort Seed objects by position bool seed_compare_pos (const USeed & i, const USeed & j); // std::list of Seed pointers is much faster typedef std::list SeedVec; // a SeedVec Iterator typedef SeedVec::iterator SeedVecIt; /** * This function is the modified pigeonhole principle holding for both spaced and unspaced kmers. * It computes the minimum number of errors in an error region of a given CIGAR vector. * An error region is a region that is surrounded by MATCH elements of length >= ( kmer_span - 1 ). * The error region cannot contain MATCH elements of length >= ( kmer_span - 1 ). * * @param region_length Sum of all (!) elements within the error region, including involved MATCH elements. * @param border Number of included borders of the CIGAR vector (begin and/or end). Must be in [0,2]. * @param Absolute number (positive) of the offset change during a region * @return Minimum number of errors that caused a region of the given length. * @author Tobias Loka, Jakob Schulze */ CountType minErrors_in_region(CountType region_length, CountType border, CountType offset_change=0 ); /** * Compute the minimum number of errors for a seed by using the modified pigeonhole principle implemented in ReadAlignment::minErrors_in_region. * * @param s The seed. * @return The minimum number of errors for the given seed. 
* @author Tobias Loka, Jakob Schulze */ CountType min_errors(const USeed & s); // compare function to sort Seed objects by errors bool seed_compare_errors (const USeed & i, const USeed & j); //-------------------------------------------------------------------// //------ The Read-Alignment class ---------------------------------// //-------------------------------------------------------------------// class ReadAlignment { private: // read length CountType total_cycles; // sequence of the read so far, saved as vector so interpretation is not that trivial. CountType sequenceLen=0; std::vector sequenceStoreVector; // sequence of the barcode so far, saved as vector so interpretation is not that trivial CountType barcodeLen=0; std::vector barcodeStoreVector; // Extend or create a placeholder seed for read with only trimmed matches void create_placeholder_seed(); // Create new seeds from a list of kmer positions and add to current seeds void add_new_seeds(GenomePosListType& pos, std::vector & posWasUsedForExtension); // filter seeds based on filtering mode and q gram lemma. Also calls add_new_seeds. void filterAndCreateNewSeeds(GenomePosListType & pos, std::vector & posWasUsedForExtension); // updates cigar_data accordingly to a new matching kmer void addMatchingKmer(USeed & s, DiffType offset); // Extend an existing CIGAR string for a seed based on a new basecall. return false if last CIGAR element after extension is mismatch area (NO_MATCH), true otherwise. 
bool extendSeed(USeed & s, DiffType offset); public: // have everything public until the apropriate access functions are available // Flags for this read; 1 = read is valid (illumina flag) unsigned char flags = 1; // the last invalid cycle CountType last_invalid; // the current cycle CountType cycle; // a list of all found seeds SeedVec seeds; // max number of matches for this read CountType max_num_matches; // set the read_length void set_total_cycles(CountType c); // get the size of the serialized object uint64_t serialize_size(); // serialize the object std::vector serialize(); // deserialize (read) data from a char vector uint64_t deserialize(char* d); // convert and return sequence of the read as string (without barcode) std::string getSequenceString(); // convert and return sequence of the read as string (without barcode) std::string getQualityString(); /** * Convert and return sequence of the barcode. Multiple barcodes are concatenated (without delimiter). * @return The Barcode as string * @author Tobias Loka */ std::string getBarcodeString(); /** * Check whether the barcode of this read fulfills the criteria of at least one user-defined barcode. * The nucleotides are only compared pairwise, not allowing for Indels. * @return The index of the matching barcode in globalAlignmentSettings.multiBarcodeVector. NO_MATCH, if none. * Also return NO_MATCH, if demultiplexing is not activated. * @author Tobias Loka */ CountType getBarcodeIndex() ; /** * Append one nucleotide to sequenceStoreVector * @param nucl The nucleotide. Must be 2-bit-formatted. * @param appendToBarcode If true, the nucleotide is appended to the barcode instead of the read sequence (default: false). 
* @return * @author Jakob Schulze */ void appendNucleotideToSequenceStoreVector(char bc, bool appendToBarcode=false); // extend the alignment by one basecall using reference database index void extend_alignment(char bc, KixRun* index, bool testRead=false); // disable this alignment void disable(); // obtain start position of a seed according to SAM (leftmost) PositionType get_SAM_start_pos(USeed & sd); /** * Sort the seeds by num_errors instead by position. * Attention: In-place sorting. Only use for output, not during alignment! * @author Tobias Loka */ void sort_seeds_by_errors(); }; // END class ReadAlignment //-------------------------------------------------------------------// //------ Other helper functions -----------------------------------// //-------------------------------------------------------------------// int16_t MAPQ(const SeedVec &sv); #endif /* ALNREAD_H */ HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/lib/alnstream.cpp000066400000000000000000001042461321155700500234550ustar00rootroot00000000000000#include "alnstream.h" //-------------------------------------------------------------------// //------ The output Alignment Stream class ------------------------// //-------------------------------------------------------------------// oAlnStream::oAlnStream(uint16_t ln, uint16_t tl, uint16_t cl, CountType rl, uint32_t nr, uint64_t bs, uint8_t fmt): lane(ln), tile(tl), cycle(cl), rlen(rl), num_reads(nr), num_written(0), buffer(bs,0), buf_size(bs), buf_pos(0), format(fmt), fstream(NULL), zfstream(Z_NULL), fname(""), flocked(false) {} oAlnStream::~oAlnStream() { funlock(); } uint64_t oAlnStream::lz4write(const char* source, uint64_t size) { // allocate buffer for the compressed data std::vector buf (LZ4_COMPRESSBOUND(size),0); // compress the data uint32_t compressed_size = LZ4_compress (source, buf.data(), size); if (!compressed_size) throw std::runtime_error("Error compressing data with LZ4."); // write the block size if ( 
!fwrite(&compressed_size, 1, sizeof(uint32_t), fstream) ) throw std::runtime_error("Error writing block size to file while compressing data with LZ4."); // write the data chunk if ( !fwrite(buf.data(), 1, compressed_size, fstream) ) throw std::runtime_error("Error writing data to file while compressing with LZ4."); return size; } uint64_t oAlnStream::open(std::string f_name) { fname = f_name; flock(); // open the new Alignment file switch (format) { case 0: case 2: fstream = fopen(fname.c_str(), "wb"); if (!fstream) { funlock(); throw file_open_error( "Error opening file " + fname + " for writing."); return 0; } break; case 1: zfstream = gzopen(fname.c_str(), "wb1"); //Don't compress too much, not enough bang for the buck if (zfstream == Z_NULL) { funlock(); throw file_open_error( "Error opening file " + fname + " for writing."); return 0; } break; default: funlock(); throw file_format_error("Output file format not recognized."); } // write the header: // calculate total size first unsigned long int total_size = 0; total_size += sizeof(uint16_t); // lane total_size += sizeof(uint16_t); // tile total_size += sizeof(CountType); // cycle // read length total_size += sizeof(CountType); // number of reads total_size += sizeof(uint32_t); // create the vector to store the data std::vector data (total_size); char* d = data.data(); // write the lane memcpy(d,&lane,sizeof(uint16_t)); d += sizeof(uint16_t); // write the tile memcpy(d,&tile,sizeof(uint16_t)); d += sizeof(uint16_t); // write the cycle memcpy(d,&cycle,sizeof(CountType)); d += sizeof(CountType); // write the read length memcpy(d,&rlen,sizeof(CountType)); d += sizeof(CountType); // write the number of reads memcpy(d,&num_reads,sizeof(uint32_t)); d += sizeof(int32_t); // write all data uint64_t written = 0; switch (format) { case 0: case 2: written = fwrite(data.data(), 1, data.size(), fstream); break; case 1: written = gzwrite(zfstream, data.data(), data.size()); break; } return written; } uint64_t 
oAlnStream::write_alignment(ReadAlignment * al) { if ( (!fstream && (format == 0 || format == 2)) || (zfstream == Z_NULL && format == 1) ){ throw std::runtime_error("Could not write alignment to file. File handle not valid."); } if (num_written >= num_reads) { throw std::length_error("Could not write alignment to file. All alignments were already written."); } std::vector data = al->serialize(); uint64_t al_size = data.size(); // first, write the size of the serialized alignment (uint32_t = 4 bytes) if (buf_pos+sizeof(uint32_t) <= buf_size) { // directly copy if all 4 bytes have space in the buffer (should be almost always the case) memcpy(buffer.data()+buf_pos,&al_size,sizeof(uint32_t)); buf_pos += sizeof(uint32_t); } else { // copy the first bytes into temporary buffer to compose the alignment size std::vector temp (sizeof(uint32_t),0); memcpy(temp.data(),&al_size,sizeof(uint32_t)); uint64_t first_part = buf_size-buf_pos; memcpy(buffer.data()+buf_pos,temp.data(),first_part); // write out buffer uint64_t written = 0; switch (format) { case 0: written = fwrite(buffer.data(), 1, buffer.size(), fstream); break; case 1: written = gzwrite(zfstream, buffer.data(), buffer.size()); break; case 2: written = lz4write(buffer.data(), buffer.size()); break; } if(written != buf_size) throw std::runtime_error("Could not write out buffer 1 in oAlnStream::write_alignment."); // copy remaining data memcpy(buffer.data(),temp.data()+first_part,sizeof(uint32_t)-first_part); buf_pos = sizeof(uint32_t)-first_part; } // finally, write the serialized data uint64_t copied = 0; while (copied < al_size) { uint64_t to_copy = std::min(al_size-copied,buf_size-buf_pos); memcpy(buffer.data()+buf_pos, data.data()+copied, to_copy); buf_pos += to_copy; copied += to_copy; // write buffer to disk if full if(buf_pos >= buf_size){ uint64_t written = 0; switch (format) { case 0: written = fwrite(buffer.data(), 1, buffer.size(), fstream); break; case 1: written = gzwrite(zfstream, buffer.data(), 
buffer.size()); break; case 2: written = lz4write(buffer.data(), buffer.size()); break; } if(written != buf_size) throw std::runtime_error("Could not write out buffer 2 in oAlnStream::write_alignment."); buf_pos = 0; } } num_written++; return num_written; } bool oAlnStream::close() { if ( ((format == 0 || format == 2) && fstream) || (format == 1 && zfstream != Z_NULL) ) { // write remaining buffer content to file uint64_t written = 0; switch (format) { case 0: written = fwrite(buffer.data(), 1, buf_pos, fstream); break; case 1: written = gzwrite(zfstream, buffer.data(), buf_pos); break; case 2: written = lz4write(buffer.data(), buf_pos); break; } if(written != buf_pos) throw std::runtime_error("Could not write out buffer in oAlnStream::close."); buf_pos = 0; if (num_written == num_reads) { switch (format) { case 0: case 2: fclose(fstream); break; case 1: gzclose(zfstream); break; } funlock(); return true; } else { std::cerr << "Error: Could not close output alignment file! "<< num_reads - num_written <<" alignments missing." << std::endl; return false; } } else { std::cerr << "Error: Could not close output alignment file! File handle not valid." 
<< std::endl; return false; } } void oAlnStream::flock() { fileLocks.lock(fname); flocked = true; } void oAlnStream::funlock() { if ( flocked ) { fileLocks.unlock(fname); } } //-------------------------------------------------------------------// //------ The input Alignment Stream class -------------------------// //-------------------------------------------------------------------// iAlnStream::iAlnStream(uint64_t bs, uint8_t fmt): lane(0), tile(0), cycle(0), rlen(0), num_reads(0), num_loaded(0), buffer(bs,0), buf_size(bs), buf_pos(bs), format(fmt), fstream(NULL), zfstream(Z_NULL), fname(""), flocked(false) {} iAlnStream::~iAlnStream() { funlock(); } // read function for lz4 decompression, reads one block of data uint64_t iAlnStream::lz4read_block() { // get the size of the next block uint32_t compressed_size = 0; if ( !fread(&compressed_size,sizeof(uint32_t),1,fstream) ) return 0; // allocate buffer for the compressed data std::vector cbuf (compressed_size,0); // read the data if ( !fread(cbuf.data(),compressed_size,1,fstream) ) throw std::runtime_error("Malformed input file. 
Could not read next block."); // decompress the data int64_t r_size = LZ4_decompress_safe (cbuf.data(), buffer.data(), compressed_size, buffer.size()); if ( r_size < 0 ) throw std::runtime_error("Error while decompressing LZ4 compressed block."); // update the current buffer size buf_size = r_size; return (uint64_t)r_size; } uint64_t iAlnStream::open(std::string f_name) { if ( !file_exists(f_name) ) { throw file_not_exist_error( " File " + fname + " does not exist."); } fname = f_name; flock(); // open the new Alignment file switch (format) { case 0: case 2: fstream = fopen(fname.c_str(), "rb"); if (!fstream) { funlock(); throw file_open_error( "Error opening file " + fname + " for reading."); return 0; } break; case 1: zfstream = gzopen(fname.c_str(), "rb"); if (zfstream == Z_NULL) { funlock(); throw file_open_error( "Error opening file " + fname + " for reading."); return 0; } break; default: funlock(); throw file_format_error("Input file format not recognized."); } // load the header: uint64_t bytes = 0; switch (format) { case 0: case 2: { // read the lane bytes += fread(&lane,sizeof(uint16_t),1,fstream); // read the tile bytes += fread(&tile,sizeof(uint16_t),1,fstream); // read the cycle bytes += fread(&cycle,sizeof(CountType),1,fstream); // read the read length bytes += fread(&rlen,sizeof(CountType),1,fstream); // read the number of reads bytes += fread(&num_reads,sizeof(uint32_t),1,fstream); break; } case 1: { // read the lane bytes += gzread(zfstream,&lane,sizeof(uint16_t)); // read the tile bytes += gzread(zfstream,&tile,sizeof(uint16_t)); // read the cycle bytes += gzread(zfstream,&cycle,sizeof(CountType)); // read the read length bytes += gzread(zfstream,&rlen,sizeof(CountType)); // read the number of reads bytes += gzread(zfstream,&num_reads,sizeof(uint32_t)); break; } } return bytes; } ReadAlignment* iAlnStream::get_alignment() { if ( (format==0 && !fstream) || (format==1 && zfstream == Z_NULL) ){ throw std::runtime_error("Could not load alignment from 
file. File handle not valid."); } if (num_loaded >= num_reads) { throw std::length_error("Could not load alignment from file. All alignments were already loaded."); } // first, get the size of the serialized alignment (uint32_t = 4 bytes) uint32_t al_size = 0; if (buf_pos+sizeof(uint32_t) <= buf_size) { // directly copy if all 4 bytes are in the buffer (should be almost always the case) memcpy(&al_size,buffer.data()+buf_pos,sizeof(uint32_t)); buf_pos += sizeof(uint32_t); } else { // copy the first bytes into temporary buffer to compose the alignment size std::vector temp (sizeof(uint32_t),0); uint64_t first_part = buf_size-buf_pos; memcpy(temp.data(),buffer.data()+buf_pos,first_part); // load new buffer switch (format) { case 0: fread(buffer.data(),1,buf_size,fstream); break; case 1: gzread(zfstream,buffer.data(),buf_size); break; case 2: lz4read_block(); break; } // copy remaining data and copy to variable memcpy(temp.data()+first_part,buffer.data(),sizeof(uint32_t)-first_part); buf_pos = sizeof(uint32_t)-first_part; memcpy(&al_size,temp.data(),sizeof(uint32_t)); } // then, copy the content to the data vector std::vector data(al_size,0); uint64_t copied = 0; while (copied < al_size) { uint64_t to_copy = std::min(al_size-copied,buf_size-buf_pos); memcpy(data.data()+copied, buffer.data()+buf_pos, to_copy); buf_pos += to_copy; copied += to_copy; // read new buffer from disk if necessary if(buf_pos >= buf_size){ switch (format) { case 0: fread(buffer.data(),1,buf_size,fstream); break; case 1: gzread(zfstream,buffer.data(),buf_size); break; case 2: lz4read_block(); break; } buf_pos = 0; } } // finally, deserialize the alignment ReadAlignment* ra = new ReadAlignment(); ra->set_total_cycles(rlen); ra->deserialize(data.data()); num_loaded++; return ra; } bool iAlnStream::close() { if ( ((format==0 || format==2) && fstream) || (format==1 && zfstream != Z_NULL)) { if (num_loaded == num_reads) { switch (format) { case 0: case 2: fclose(fstream); break; case 1: 
gzclose(zfstream); break; } funlock(); return true; } else { std::cerr << "Error: Could not close alignment file! "<< num_reads - num_loaded <<" alignments missing." << std::endl; return false; } } else { throw std::runtime_error("Could not close alignment file. File handle not valid."); } } void iAlnStream::flock() { fileLocks.lock(fname); flocked = true; } void iAlnStream::funlock() { if ( flocked ) { fileLocks.unlock(fname); } } //-------------------------------------------------------------------// //------ The StreamedAlignment class ------------------------------// //-------------------------------------------------------------------// std::string StreamedAlignment::get_bcl_file(uint16_t cycle, uint16_t mate) { std::ostringstream path_stream; path_stream << globalAlignmentSettings.get_root() << "/L00" << lane << "/C" << getSeqCycle(cycle, mate) << ".1/s_"<< lane <<"_" << tile << ".bcl"; return path_stream.str(); } std::string StreamedAlignment::get_alignment_file(uint16_t cycle, uint16_t mate, std::string base){ if (base == "") { base = globalAlignmentSettings.get_root(); } std::ostringstream path_stream; path_stream << base << "/L00" << lane << "/s_"<< lane << "_" << tile << "." 
<< mate << "."<< cycle << ".align"; return path_stream.str(); } std::string StreamedAlignment::get_filter_file() { std::ostringstream path_stream; path_stream << globalAlignmentSettings.get_root() << "/L00" << lane << "/s_"<< lane << "_" << tile << ".filter"; return path_stream.str(); } void StreamedAlignment::create_directories() { std::ostringstream path_stream; if (globalAlignmentSettings.get_temp_dir() == "") { path_stream << globalAlignmentSettings.get_root(); } else { path_stream << globalAlignmentSettings.get_temp_dir(); } path_stream << "/L00" << lane; boost::filesystem::create_directories(path_stream.str()); boost::filesystem::create_directories(globalAlignmentSettings.get_out_dir()); } void StreamedAlignment::init_alignment(uint16_t mate) { std::string out_fname = get_alignment_file(0, mate, globalAlignmentSettings.get_temp_dir()); // get the number of reads in this tile by looking in the first bcl file std::string first_cycle = get_bcl_file(1, 0); // extract the number of reads uint32_t num_reads = num_reads_from_bcl(first_cycle); // open output alignment stream oAlnStream output (lane, tile, 0, rlen, num_reads, globalAlignmentSettings.get_block_size(), globalAlignmentSettings.get_compression_format()); output.open(out_fname); // write empty read alignments for each read for (uint32_t i = 0; i < num_reads; ++i) { ReadAlignment * ra = new ReadAlignment(); ra->set_total_cycles(rlen); output.write_alignment(ra); delete ra; } if(!output.close()) { std::cerr << "Error: Could not create initial alignment file." << std::endl; } } uint64_t StreamedAlignment::extend_alignment(uint16_t cycle, uint16_t read_no, uint16_t mate, KixRun* index) { // 1. 
Open the input file //----------------------- std::string in_fname = get_alignment_file(cycle-1, mate, globalAlignmentSettings.get_temp_dir()); std::string bcl_fname = get_bcl_file(cycle, read_no); std::string filter_fname = get_filter_file(); iAlnStream input ( globalAlignmentSettings.get_block_size(), globalAlignmentSettings.get_compression_format() ); input.open(in_fname); assert(input.get_cycle() == cycle-1); assert(input.get_lane() == lane); assert(input.get_tile() == tile); assert(input.get_rlen() == rlen); uint32_t num_reads = input.get_num_reads(); // 2. Open output stream //---------------------------------------------------------- std::string out_fname = get_alignment_file(cycle, mate, globalAlignmentSettings.get_temp_dir()); oAlnStream output (lane, tile, cycle, rlen, num_reads, globalAlignmentSettings.get_block_size(), globalAlignmentSettings.get_compression_format()); output.open(out_fname); // 3. Read the full BCL file (this is not too much) //------------------------------------------------- BclParser basecalls; basecalls.open(bcl_fname); // 4. Load the filter flags if filter file is available // ---------------------------------------------------- FilterParser filters; if (file_exists(filter_fname)) { filters.open(filter_fname); // extract the number of reads from the filter file uint32_t num_reads_filter = filters.size(); if (num_reads != num_reads_filter){ std::string msg = std::string("Number of reads in filter file (") + std::to_string(num_reads_filter) + ") does not match the number of reads in the BCL file (" + std::to_string(num_reads) + ")."; throw std::length_error(msg.c_str()); } } // 5. 
Extend alignments 1 by 1 //------------------------------------------------- uint64_t num_seeds = 0; for (uint64_t i = 0; i < num_reads; ++i) { // iAlnStream input2( globalAlignmentSettings.get_block_size(), globalAlignmentSettings.get_compression_format() ); // input2.open(in_fname); bool testRead = false; ReadAlignment* ra = input.get_alignment(); if (filters.size() > 0 && filters.has_next()) { // filter file was found -> apply filter if(filters.next()) { ra->extend_alignment(basecalls.next(), index, testRead); num_seeds += ra->seeds.size(); } else { basecalls.next(); ra->disable(); } } // filter file was not found -> treat every alignment as valid else { ra->extend_alignment(basecalls.next(), index, testRead); num_seeds += ra->seeds.size(); } output.write_alignment(ra); delete ra; } // 6. Close files //------------------------------------------------- if (!(input.close() && output.close())) { std::cerr << "Could not finish alignment!" << std::endl; } // 7. Delete old alignment file, if requested //------------------------------------------- if ( ! ( globalAlignmentSettings.is_keep_aln_files_cycle(getSeqCycle(cycle, globalAlignmentSettings.getSeqByMate(mate).id)-1) || globalAlignmentSettings.is_output_cycle(getSeqCycle(cycle, globalAlignmentSettings.getSeqByMate(mate).id)-1)) ) { std::remove(in_fname.c_str()); } return num_seeds; } void StreamedAlignment::extend_barcode(uint16_t bc_cycle, uint16_t read_cycle, uint16_t read_no, uint16_t mate) { // 1. 
Open the input file //----------------------- std::string in_fname = get_alignment_file(read_cycle, mate, globalAlignmentSettings.get_temp_dir()); std::string bcl_fname = get_bcl_file(bc_cycle, read_no); std::string filter_fname = get_filter_file(); iAlnStream input ( globalAlignmentSettings.get_block_size(), globalAlignmentSettings.get_compression_format() ); input.open(in_fname); assert(input.get_cycle() == read_cycle); assert(input.get_lane() == lane); assert(input.get_tile() == tile); uint32_t num_reads = input.get_num_reads(); // 2. Open output stream //---------------------------------------------------------- std::string out_fname = in_fname + ".temp"; oAlnStream output (lane, tile, read_cycle, input.get_rlen(), num_reads, globalAlignmentSettings.get_block_size(), globalAlignmentSettings.get_compression_format()); output.open(out_fname); // 3. Read the full BCL file (this is not too much) //------------------------------------------------- BclParser basecalls; basecalls.open(bcl_fname); // 4. Extend barcode sequence //------------------------------------------------- for (uint64_t i = 0; i < num_reads; ++i) { char bc = basecalls.next(); ReadAlignment* ra = input.get_alignment(); ra->appendNucleotideToSequenceStoreVector(bc, true); // filter invalid barcodes if new barcode fragment is completed // TODO: Is done for each mate. Check if it's worth to change it (runtime should not be too high?) if ( !globalAlignmentSettings.get_keep_all_barcodes() && bc_cycle == globalAlignmentSettings.get_seqs()[read_no].length && ra->getBarcodeIndex() == NO_MATCH ) { ra->disable(); } output.write_alignment(ra); delete ra; } // 5. Close files //------------------------------------------------- if (!(input.close() && output.close())) { std::cerr << "Could not finish alignment!" << std::endl; } // 6. Move temp out file to the original file. 
//------------------------------------------- atomic_rename(out_fname.c_str(), in_fname.c_str()); } StreamedAlignment& StreamedAlignment::operator=(const StreamedAlignment& other) { if(&other == this) return *this; lane = other.lane; tile = other.tile; rlen = other.rlen; return *this; } //-------------------------------------------------------------------// //------ Streamed SAM generation -----------------------------------// //-------------------------------------------------------------------// AlnOut::AlnOut(std::vector lns, std::vector tls, CountType cycl, KixRun* idx ) : cycle(cycl),index(idx) { // Fill list of specified barcodes for ( unsigned i = 0; i < globalAlignmentSettings.get_barcodeVector().size(); i++ ) { barcodes.push_back(globalAlignmentSettings.get_barcodeString(i)); } // Get the finished cycles and minimal as:i score for each mate for ( CountType mate = 1; mate <= globalAlignmentSettings.get_mates(); mate++ ) { CountType mateCycle = getMateCycle( mate, cycle ); mateCycles.push_back( mateCycle ); min_as_scores.push_back( mateCycle * globalAlignmentSettings.get_min_as_ratio() ); } // Add a waiting task for all lanes and tiles. for ( auto ln : lns ) { for ( auto tl : tls ) { add_task(Task(ln,tl,cycle), WAITING); } } }; AlnOut::~AlnOut() { if ( !is_finalized() && !finalize() ) { std::cerr << "Could not finish output for cycle " << cycle << "." 
<< std::endl; } } void AlnOut::init() { std::lock_guard lock(if_lock); if ( initialized ) return; // Init the bamIOContext (the same object can be used for all output streams) bfos.set_context(index->seq_names, index->seq_lengths); // Init the header (the same object can be used for all output streams) seqan::BamHeader header = getBamHeader(); // Init output stream for each barcode (plus undetermined if keep_all_barcodes is set) for ( unsigned barcode=0; barcode < (barcodes.size() + 1); barcode ++) { if ( barcode < barcodes.size() || globalAlignmentSettings.get_keep_all_barcodes() ) { std::string barcode_string = ( barcode == barcodes.size() ) ? "undetermined" : barcodes[barcode]; // Open file in Bam output stream and write the header bfos.emplace_back( getBamTempFileName(barcode_string, cycle).c_str() ); bfos[barcode].writeHeader(header); } } initialized = true; } bool AlnOut::set_task_status( Task t, ItemStatus status ) { std::lock_guard lock(tasks_lock); if ( tasks.find(t) == tasks.end() ) return false; tasks[t] = status; return true; } bool AlnOut::set_task_status_from_to( Task t, ItemStatus oldStatus, ItemStatus newStatus ) { std::lock_guard lock(tasks_lock); if ( tasks.find(t) == tasks.end() ) return false; if ( tasks[t] == oldStatus ) { tasks[t] = newStatus; return true; } return false; } Task AlnOut::get_next( ItemStatus getStatus, ItemStatus setToStatus ) { std::lock_guard lock(tasks_lock); for ( auto it = tasks.begin(); it != tasks.end(); ++it ) { if ( it->second == getStatus ) { tasks[it->first] = setToStatus; return it->first; } } return NO_TASK; } bool AlnOut::add_task( Task t, ItemStatus status ) { std::lock_guard lock(tasks_lock); if ( tasks.find(t) != tasks.end() ) return false; tasks[t] = status; return true; } bool AlnOut::sort_tile( CountType ln, CountType tl, CountType mate, CountType cycle, bool overwrite ) { std::string in_fname = alignment_name(ln, tl, cycle, mate); std::string out_fname = alignment_name(ln, tl, cycle, mate) + ".sorted"; // 
Stop if sorted file already exist if ( file_exists( out_fname ) && !overwrite ) return true; iAlnStream input ( globalAlignmentSettings.get_block_size(), globalAlignmentSettings.get_compression_format() ); input.open(in_fname); assert(input.get_cycle() == cycle); assert(input.get_lane() == ln); assert(input.get_tile() == tl); uint32_t num_reads = input.get_num_reads(); oAlnStream output(ln, tl, cycle, input.get_rlen(), num_reads, globalAlignmentSettings.get_block_size(), globalAlignmentSettings.get_compression_format()); output.open(out_fname); for ( uint32_t i = 0; i < num_reads; i++ ) { try { ReadAlignment * ra = input.get_alignment(); ra->sort_seeds_by_errors(); output.write_alignment(ra); delete ra; } catch (const std::exception & ex) { return false; } } if (!(input.close() && output.close())) return false; return true; } void AlnOut::write_tile_to_bam ( Task t ) { if ( !is_initialized() ) init(); try { __write_tile_to_bam__ (t); set_task_status( t, FINISHED ); } catch ( const std::exception& e) { set_task_status( t, FAILED ); std::cerr << "Writing of task " << t << " failed: " << e.what() << std::endl; } } void AlnOut::__write_tile_to_bam__ ( Task t) { std::vector> locks; CountType lane = t.lane; CountType tile = t.tile; //////////////////////////////////////////////////// // Main loop ////////////////////////////////////// //////////////////////////////////////////////////// // set the filter file std::string filter_fname = filter_name(lane, tile); FilterParser filters; if (file_exists(filter_fname)) { filters.open(filter_fname); } // set the alignment files std::vector alignmentFiles; unsigned numberOfAlignments = 0; for (unsigned mateIndex = 1; mateIndex <= mateCycles.size(); mateIndex++) { if ( globalAlignmentSettings.getSeqByMate(mateIndex) == NULLSEQ ) return; CountType mateCycle = mateCycles[mateIndex-1]; if ( !sort_tile( lane, tile, mateIndex, mateCycle, globalAlignmentSettings.get_force_resort()) ) { std::cout << "Couldn't sort" << std::endl; 
continue; } // Open sorted alignment file std::string alignment_fname = alignment_name(lane, tile, mateCycle, mateIndex) + ".sorted"; if ( !file_exists(alignment_fname) ) { continue; } iAlnStream* input = new iAlnStream( globalAlignmentSettings.get_block_size(), globalAlignmentSettings.get_compression_format() ); input->open(alignment_fname); // compare number of reads in alignment file with number of reads in filter file, if filter file exists if (file_exists(filter_fname) && input->get_num_reads() != filters.size()) { throw std::length_error("Unequal number of reads (.filer vs .align)"); } // compare number of reads in alignment file with number of reads in previous alignment file if (mateIndex != 1 && input->get_num_reads() != numberOfAlignments) { throw std::length_error("Unequal number of reads (between mates)"); } numberOfAlignments = input->get_num_reads(); // set this after last if-then construct alignmentFiles.push_back(input); } // for all reads in a tile ///////////////////////////////////////////////////////////////////////////// for (uint64_t i = 0; i < numberOfAlignments; i++) { std::vector records; std::vector mateAlignments; for (auto e:alignmentFiles) { mateAlignments.push_back(e->get_alignment()); } // if the filter file is available and the filter flag is 0 then skip if (filters.size() != 0 && filters.next() == false) continue; // compute barcode sequence as it should be written to BC tag std::string barcode = globalAlignmentSettings.format_barcode(mateAlignments[0]->getBarcodeString()); // Barcode index for the read CountType barcodeIndex = mateAlignments[0]->getBarcodeIndex(); // If read has undetermined barcode and keep_all_barcodes is not set, skip this read if ( barcodeIndex == NO_MATCH && !globalAlignmentSettings.get_keep_all_barcodes() ) continue; else if ( barcodeIndex == NO_MATCH ) barcodeIndex = barcodes.size(); // this is the index for the "undetermined" output stream // setup QNAME // Read name format :::::: // readname << ":::" << ln 
<< ":" << tl << ":::" << i; //TODO: where do we get the Illumina read coordinate from? std::stringstream readname; readname << "lane." << lane << "|tile." << tile << "|read." << i; // for all mates ///////////////////////////////////////////////////////////////////////////// for (unsigned mateAlignmentIndex=0; mateAlignmentIndex < mateAlignments.size(); ++mateAlignmentIndex) { readname << "|mate." << mateAlignmentIndex+1; // Variables for output modes CountType first_seed_score = 0; CountType last_seed_score = 0; CountType num_diff_scores = 0; // Number of printed alignments for the current mate. unsigned printedMateAlignments = 0; // Unique mode interruption if ( mateAlignments[mateAlignmentIndex]->seeds.size() > 1 && globalAlignmentSettings.get_unique_hit_mode() ) continue; // for all seeds ///////////////////////////////////////////////////////////////////////////// for (SeedVecIt it = mateAlignments[mateAlignmentIndex]->seeds.begin(); it != mateAlignments[mateAlignmentIndex]->seeds.end(); ++it) { // Skip completely trimmed seeds if ( (*it)->gid == TRIMMED ) { continue; } // Any best mode interruption if ( printedMateAlignments > 0 && globalAlignmentSettings.get_any_best_hit_mode() ) break; // Get next number of errors CountType curr_seed_score = (*it)->num_matches; if ( min_as_scores[mateAlignmentIndex] > curr_seed_score ) break; // No seed printed yet -> current seed is the best one. 
if ( num_diff_scores == 0 ) first_seed_score = curr_seed_score; // All best mode interruption if ( first_seed_score > curr_seed_score && globalAlignmentSettings.get_all_best_hit_mode() ) break; // All best n mode interruption if ( globalAlignmentSettings.get_all_best_n_scores_mode() && globalAlignmentSettings.get_best_n() == 0 ) break; if ( curr_seed_score < last_seed_score && globalAlignmentSettings.get_all_best_n_scores_mode() && globalAlignmentSettings.get_best_n() <= num_diff_scores ) break; seqan::BamAlignmentRecord record; record.beginPos = mateAlignments[mateAlignmentIndex]->get_SAM_start_pos(*it)-1; if (record.beginPos < 0) { continue; } unsigned nm_i = 0; unsigned as_score = 0; record.cigar = (*it)->returnSeqanCigarString(&nm_i, &as_score); as_score = curr_seed_score; record.qName = readname.str(); record.rID = (*it)->gid; // Sequence of the current read std::string seq; seq.reserve(mateCycles[mateAlignmentIndex]); // Quality of the current read std::string qual; qual.reserve(mateCycles[mateAlignmentIndex]); // Only obtain sequence and quality if no alignment was printed yet if ( printedMateAlignments == 0 ) { seq = mateAlignments[mateAlignmentIndex]->getSequenceString(); qual = mateAlignments[mateAlignmentIndex]->getQualityString(); } // flag and seq record.flag = 0; if (printedMateAlignments >= 1) { // if current seed is secondary alignment record.flag |= 256; seqan::clear(record.seq); seqan::clear(record.qual); } else { record.seq = seq == "" ? "*" : seq; record.qual = qual == "" ? 
"*" : qual; } if ((*it)->start_pos < 0) { // if read matched reverse complementary seqan::reverseComplement(record.seq); seqan::reverse(record.qual); record.flag |= 16; } if (globalAlignmentSettings.get_mates() > 1) { // if there are more than two mates record.flag |= 1; if (mateAlignmentIndex == 0) { record.flag |= 64; } else if (mateAlignmentIndex == mateAlignments.size()-1) { record.flag |= 128; } else { record.flag |= 192; // 64 + 128 } bool eachMateAligned = true; for (auto e:mateAlignments) eachMateAligned = eachMateAligned && e->seeds.size() > 0; if (eachMateAligned) record.flag |= 2; } // tags seqan::BamTagsDict dict; seqan::appendTagValue(dict, "AS", ( as_score ) ); if (barcode!="") { // if demultiplexing is on seqan::appendTagValue(dict, "BC", barcode); } seqan::appendTagValue(dict, "NM", nm_i); record.tags = seqan::host(dict); // fill records list records.push_back(record); // set variables for mode selection if ( last_seed_score != curr_seed_score || num_diff_scores == 0 ) ++num_diff_scores; last_seed_score = curr_seed_score; ++printedMateAlignments; } } // Write all records as a group to keep suboptimal alignments and paired reads together. bfos[barcodeIndex].writeRecords(records); for (auto e:mateAlignments) delete e; } for (auto e:alignmentFiles) delete e; return; } Task AlnOut::write_next ( ) { Task t = get_next ( BCL_AVAILABLE, RUNNING ); if ( t != NO_TASK ) { write_tile_to_bam ( t ); return t; } else { return NO_TASK; } } CountType AlnOut::get_task_status_num ( ItemStatus getStatus ) { CountType num = 0; std::lock_guard lock(tasks_lock); for ( auto it = tasks.begin(); it != tasks.end(); ++it ) { if ( it->second == getStatus ) { num += 1; } } return num; } bool AlnOut::finalize() { std::lock_guard lock(if_lock); if ( finalized ) return true; // Don't finish if there are unfinished tasks. if ( !is_finished() ) return false; bool success = true; bfos.clear(); // Move all output files to their final location. 
for ( unsigned barcode=0; barcode < barcodes.size() + 1; barcode ++) { if ( barcode < barcodes.size() || globalAlignmentSettings.get_keep_all_barcodes() ) { std::string barcode_string = ( barcode == barcodes.size() ) ? "undetermined" : barcodes[barcode]; int rename = atomic_rename(getBamTempFileName(barcode_string, cycle).c_str(), getBamFileName(barcode_string, cycle).c_str()); if ( rename == -1 ) { std::cerr << "Renaming temporary output file " << getBamTempFileName(barcode_string, cycle).c_str() << " to " << getBamFileName(barcode_string, cycle).c_str() << " failed." << std::endl; success = false; } } } // If it comes here, it counts as finalized independently from the success state (otherwise, contradictory error messages may occur). finalized = true; return success; } HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/lib/alnstream.h000066400000000000000000000422231321155700500231160ustar00rootroot00000000000000#ifndef ALNSTREAM_H #define ALNSTREAM_H #include "headers.h" #include "definitions.h" #include "global_variables.h" #include "kindex.h" #include "tools.h" #include "alnread.h" #include "illumina_parsers.h" #include "parallel.h" /** * Output stream to write temporary .align files. */ class oAlnStream { /** Lane for the output. */ uint16_t lane; /** Tile for the output. */ uint16_t tile; /** Output cycle. */ uint16_t cycle; /** Total read length. */ CountType rlen; /** Total number of reads for this lane/tile. */ uint32_t num_reads; /** Number of reads written to file. */ uint32_t num_written; /** Data buffer. */ std::vector buffer; /** Size of the data buffer. */ uint64_t buf_size; /** Current position in the buffer. */ uint64_t buf_pos; /** Output file compression [0: None; 1: zlib (lvl1); 2: lz4 (lvl1)] */ uint8_t format; /** Standard file handler. */ FILE* fstream; /** zlib file handler. */ gzFile zfstream; /** Name of the file that is currently streamed. */ std::string fname; /** File lock flag. True, if the file was locked by this alignment stream. 
*/ bool flocked; /** * Write with lz4 compression. * @param buf Pointer to the buffer data. * @param size Size of the buffer data. * @return New size of the buffer data. */ uint64_t lz4write(const char* buf, uint64_t size); /** Lock the file (globally in the program). */ void flock(); /** Unlock the file (globally in the program). */ void funlock(); public: /** * Constructor. * @param ln Lane for the output. * @param tl Tile for the output. * @param cl Output cycle. * @param rl Total read length. * @param nr Total number of reads. * @param bs Buffer size. * @param fmt Compression format (0: None, 1: zlib, 2: lz4) */ oAlnStream(uint16_t ln, uint16_t tl, uint16_t cl, CountType rl, uint32_t nr, uint64_t bs, uint8_t fmt); /** Default destructor. Unlocks the global fileLock if it was locked by this stream. */ ~oAlnStream(); /** * Open alignment stream for a file and write the header. * The file will be locked globally. * @param f_name Name of the file to be written (will be overridden if already exists). * @return Number of written bytes. */ uint64_t open(std::string f_name); /** * Write a read alignment to the current output stream. * @param al Pointer to the read alignment object that is written to the output stream. * @return Number of written bytes. */ uint64_t write_alignment(ReadAlignment * al); /** * Close the file stream if all alignments were written. * Unlocks the global file lock. * @return true, if file stream was closed successfully. */ bool close(); }; /** * Input stream to read temporary .align files. */ class iAlnStream { /** Lane for the output. */ uint16_t lane; /** Tile for the output. */ uint16_t tile; /** Output cycle. */ uint16_t cycle; /** Total read length. */ CountType rlen; /** Total number of reads for this lane/tile. */ uint32_t num_reads; /** Number of reads loaded from the file. */ uint32_t num_loaded; /** Data buffer. */ std::vector buffer; /** Size of the data buffer. */ uint64_t buf_size; /** Current position in the buffer. 
*/
	uint64_t buf_pos;

	/** Compression format of the file [0: None; 1: zlib (lvl1); 2: lz4 (lvl1)] */
	uint8_t format;

	/** Standard file handler. */
	FILE* fstream;

	/** zlib file handler. */
	gzFile zfstream;

	/** Name of the file that is currently streamed. */
	std::string fname;

	/** File lock flag. True, if the file was locked by this alignment stream. */
	bool flocked;

	/**
	 * Load a lz4-compressed block to the buffer.
	 * @return The new buffer size.
	 */
	uint64_t lz4read_block();

	/** Lock the file (globally in the program). */
	void flock();

	/** Unlock the file (globally in the program). */
	void funlock();

public:

	/**
	 * Constructor.
	 * @param bs Buffer size.
	 * @param fmt Compression format (0: None, 1: zlib, 2: lz4)
	 */
	iAlnStream(uint64_t bs, uint8_t fmt);

	/** Default destructor. Unlocks the global fileLock if it was locked by this stream. */
	~iAlnStream();

	/**
	 * Open alignment stream for a file and load the header.
	 * The file will be locked globally.
	 * @param f_name Name of the file to be loaded.
	 * @return Number of loaded bytes.
	 */
	uint64_t open(std::string f_name);

	/**
	 * Load a read alignment from the current input stream.
	 * @return The read alignment object that was loaded from the file.
	 */
	ReadAlignment* get_alignment();

	/**
	 * Close the file stream if all alignments were loaded.
	 * Unlocks the global file lock.
	 * @return true, if file stream was closed successfully.
	 */
	bool close();

	// Getters: each one returns the value of the member of the same name.

	/** @return Value of the lane member. */
	uint16_t get_lane() {return lane;};
	/** @return Value of the tile member. */
	uint16_t get_tile() {return tile;};
	/** @return Value of the cycle member. */
	uint16_t get_cycle() {return cycle;};
	/** @return Total read length. */
	CountType get_rlen() {return rlen;};
	/** @return Total number of reads for this lane/tile. */
	uint32_t get_num_reads() {return num_reads;};
	/** @return Number of reads loaded from the file so far. */
	uint32_t get_num_loaded() {return num_loaded;};
};


/**
 * Streamer for new base calls to the alignment algorithm.
 */
class StreamedAlignment {

	/** The lane to be handled. */
	uint16_t lane;

	/** The tile to be handled. */
	uint16_t tile;

	/** Total read length. */
	CountType rlen;

	/**
	 * Get the path to the bcl file of a given cycle.
	 * @param cycle The current read cycle.
	 * @param mate Number of the current mate.
	 * @return Path to the bcl file.
	 */
	std::string get_bcl_file(uint16_t cycle, uint16_t mate);

	/**
	 * Get the path to the align file.
	 * @param cycle The current read cycle.
	 * @param mate Number of the current mate.
	 * @param base Base of the path to the align files (defaults to an empty string).
	 * @return Path to the align file.
	 */
	std::string get_alignment_file(uint16_t cycle, uint16_t mate, std::string base = "");

	/**
	 * Get the path to the filter file.
	 * @return Path to the filter file.
	 */
	std::string get_filter_file();

public:

	/**
	 * Constructor. Only stores the three values; no files are touched here.
	 * @param ln The lane to be handled.
	 * @param tl The tile to be handled.
	 * @param rl Total read length.
	 */
	StreamedAlignment(uint16_t ln, uint16_t tl, CountType rl): lane(ln), tile(tl), rlen(rl) {};

	/**
	 * Create the underlying directories of the align files.
	 */
	void create_directories();

	/**
	 * Initialize empty alignments for the current mate (stored as output of a virtual cycle 0).
	 * @param mate Number of the current mate.
	 */
	void init_alignment(uint16_t mate);

	/**
	 * Extend the alignments for all reads of the specified lane and tile by one cycle.
	 * @param cycle Current cycle, i.e. the cycle that will be extended.
	 * @param read_no Total number of reads.
	 *                NOTE(review): extend_barcode describes its read_no as an index into
	 *                globalAlignmentSettings.seqs - confirm which meaning applies here.
	 * @param mate Number of the current mate.
	 * @param index Pointer to the reference index.
	 * @return Total number of seeds (for all reads).
	 */
	uint64_t extend_alignment(uint16_t cycle, uint16_t read_no, uint16_t mate, KixRun* index);

	/**
	 * Extend the barcode for all reads with the information of the current sequencing cycle.
	 * @param bc_cycle The cycle of the barcode read.
	 * @param read_cycle The last handled cycle for the respective mate (should always be 0 or the full length)
	 * @param read_no The number of the sequence read for which the barcode will be extended (:= index in globalAlignmentSettings.seqs).
	 * @param mate The read mate to extend the barcode.
*/ void extend_barcode(uint16_t bc_cycle, uint16_t read_cycle, uint16_t read_no, uint16_t mate); StreamedAlignment& operator=(const StreamedAlignment& other); }; /** * Extension of SeqAn's BamFileOut data type supporting multithreading via an atomic flag. * @author Tobias Loka */ struct Atomic_bfo { private: /** The BamFileOut stream.*/ seqan::BamFileOut bfo; /** The atomic flag to perform a spinlock while writing.*/ std::atomic_flag flag = ATOMIC_FLAG_INIT; /** * Lock the atomic flag. */ void lock() { while ( flag.test_and_set(std::memory_order_acquire)) ; // spin } /** * Unlock the atomic flag. */ void unlock() { flag.clear(std::memory_order_release); } public: /** * Default constructor that also initializes the BamFileOut stream. * @param f_name Output file name. * */ Atomic_bfo( std::string f_name) : bfo(f_name.c_str()) { } /** Destructor. */ ~Atomic_bfo ( ) { } /** * Set the context of the BamFileOut stream. * @param context The new context. */ void setContext ( seqan::BamIOContext > & context ) { bfo.context = context; } /** * Write the header to the output file. * @param header The header for the output file. */ void writeHeader ( seqan::BamHeader & header ) { seqan::writeHeader( bfo, header ); } /** * Write records to the output file in a "thread-safe" manner. * @param records Reference to a vector containing a set of records. */ void writeRecords ( std::vector & records ) { if ( records.size() == 0 ) return; lock(); seqan::writeRecords(bfo, records); unlock(); } }; /** * Extends a deque of atomic BamFileOut streams. * Store the context, refNames and refNamesCache such that it exist only once and will not be destructed as long as it is needed. * @author Tobias Loka */ class BamFileOutDeque { /** Deque of bfos. */ std::deque bfos; /** All fields needed for the context of the BamFileOut streams. */ seqan::BamIOContext > context; seqan::StringSet refNames; seqan::NameStoreCache > refNamesCache; public: /** * Set the context of the list. 
* @param seq_names Names of all sequences in the database * @param seq_length Lengths of all sequences in the database */ void set_context(StringListType & seq_names, std::vector seq_lengths) { seqan::NameStoreCache > rnc(refNames); refNamesCache = rnc; seqan::BamIOContext > cxt(refNames, refNamesCache); context = cxt; seqan::contigNames(context) = seq_names; seqan::contigLengths(context) = seq_lengths; } /** * Add a Bam Output stream for the given file name. * The stream will be created in-place. * @param f_name File name for the Bam output stream. */ void emplace_back ( std::string f_name ) { bfos.emplace_back ( f_name.c_str() ); bfos.back().setContext(context); } /** * Get a reference to the last atomic bfo of the deque. * @return Reference to the last atomic bfo of the deque. */ Atomic_bfo & back() { return bfos.back(); } /** * Get the Atomic_bfo at a certain position of the deque. * @param i Index of the Atomic_bfo in the deque. * @return reference to the Atomic_bfo at a certain position of the deque. */ Atomic_bfo & operator [](int i) { return bfos[i]; } /** * Clear the deque of Atomic bfos. */ void clear () { bfos.clear(); } }; /** * Class to organize the output for a specific cycle. * @author Tobias Loka */ class AlnOut { private: /** Map of tasks and the status. */ std::map tasks; /** Mutex to lock tasks when their status is getting modified. */ std::mutex tasks_lock; /** True if the output was finalized. */ bool finalized = false; /** True if the output was initialized. */ bool initialized = false; /** Mutex to lock the initialization and finalization. */ std::mutex if_lock; /** Cycle for the output. */ CountType cycle; /** Deque of output streams. */ BamFileOutDeque bfos; /** Vector of barcodes. */ std::vector barcodes; /** Vector containing the current cycle of all mates. */ std::vector mateCycles; /** Minimal alignment score for a certain cycle to print an alignment. */ std::vector min_as_scores; /** The underlying index for the output. 
*/ KixRun* index; /** * Set the status of a task (only if the task exists). * @param t The task. * @param status The new status. * @return true, if the task exists and the status was successfully changed. false otherwise. */ bool set_task_status( Task t, ItemStatus status ); /** * Change the status of a task if it had a specified status before. * @param t The task. * @param oldStatus The previous status of the task. * @param newStatus The new status of the task. * @return true, if the task exists, has the correct previous status and the status was successfully changed. false otherwise. */ bool set_task_status_from_to( Task t, ItemStatus oldStatus, ItemStatus newStatus ); /** * Change the status of the next task with a given status. * @param getStatus Status to be changed. * @param setToStatus Status the task is set to. * @return The task for that the status was changed. */ Task get_next( ItemStatus getStatus, ItemStatus setToStatus ); /** * Add a new task with a certain status. If the task already exists, the status will not be changed. * @param t The new task. * @param status The status. * @return true, if the task didn't exist before and was successfully created. false otherwise. */ bool add_task( Task t, ItemStatus status ); /** * Create a temporary align file that is sorted by score. * @param ln The lane. * @param tl The tile. * @param mate The mate. * @param cycle The mate cycle (not the sequencing cycle). * @param overwrite If true and a sorted file already exists, it will be sorted again and the old file will be overridden (default: false) * @return true, if sorting was successful. */ bool sort_tile ( CountType ln, CountType tl, CountType mate, CountType cycle, bool overwrite = false ); /** * Calls __write_tile_to_bam__(Task t) to start the output of a task with handled exceptions. * @param t Task that contains the information about lane and tile. */ void write_tile_to_bam ( Task t ); /** * Start the output of a task. 
Should be called by write_tile_to_bam(Task t) to handle exceptions in an appropriate manner. * @param t Task that contains the information about lane and tile. */ void __write_tile_to_bam__ ( Task t ); /** * Initalize the output streams. */ void init(); public: /** * Constructor. * Includes opening all output streams and writing the header. * @param lns The lanes to consider. * @param tls The tiles to consider. * @param cycl The sequencing cycle. * @param idx The underlying index. */ AlnOut (std::vector lns, std::vector tls, CountType cycl, KixRun* idx); /** * Destructor. * The destructor will wait for all running threads to be finished. * The temporary output files are moved to their final locations. */ ~AlnOut (); /** * Check if the output writing is finished. * @return true, if all tasks are finished or failed. false otherwise. * TODO: maybe change the return value to the number of failed tasks!? */ bool is_finished() { if ( is_finalized() ) return true; return ( get_task_status_num( FINISHED ) + get_task_status_num( FAILED ) ) == tasks.size(); }; /** * Set that the alignment file of a task is available. * @param t The task containing information about lane and tile. * @return true if the task was set to available status. */ bool set_task_available ( Task t ) { return set_task_status_from_to( t, WAITING, BCL_AVAILABLE ); } /** * Check if a task is contained in the list of tasks. * @param t The task of interest. * @return true, if the task is in the list of tasks (the status is not considered). False if not. */ bool has_task ( Task t ) { if ( tasks.find(t) != tasks.end() ) return true; return false; } /** * Check if the status of a task is FINISHED. * @param t The task of interest. * @return true, if the task has status FINISHED. false, if the task has a status other than FINISHED or does not exist. */ bool is_task_finished ( Task t ) { if ( !has_task(t) ) return false; return tasks[t] == FINISHED; } /** * Check if the status of a task is FAILED. 
* @param t The task of interest. * @return true, if the task has status FAILED. false, if the task has a status other than FAILED or does not exist. */ bool is_task_failed ( Task t ) { if ( !has_task(t) ) return false; return tasks[t] == FAILED; } /** * Write the next available task. This function is implemented in a thread-safe manner. * @return The task that was written. NO_TASK if no task was written. */ Task write_next ( ); /** * Get the number of tasks with a given status. * @param getStatus Requested status. * @return Number of tasks with the requested status. */ CountType get_task_status_num ( ItemStatus getStatus ); /** * Renames the temporary sam files if all tasks are finished. * @return true, if finished with success. */ bool finalize(); /** * Check if this output controller was already finalized. * @return true, if output controller was already finalized. */ bool is_finalized () { return finalized; } /** * Check if the output controller was already initialized (i.e., the output streams are created). * @return true, if the output controller was already initialized. */ bool is_initialized(){ return initialized; } /** * Get the cycle number for this output controller. * @return The sequencing cycle. */ CountType get_cycle() { return cycle; } /** * Get the cycle for a certain mate. * @param Mate number. * @return Mate cycle. 
*/ CountType get_mate_cycle( CountType mate ) { return mateCycles[mate-1]; } }; #endif /* ALNSTREAM_H */ HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/lib/argument_parser.cpp000066400000000000000000000724201321155700500246630ustar00rootroot00000000000000#include "argument_parser.h" namespace po = boost::program_options; //||||||||||||||||||||||||||||||||||||||||||||||||||||||| //-----ArgumentParser------------------------------------ //||||||||||||||||||||||||||||||||||||||||||||||||||||||| ArgumentParser::ArgumentParser(int argC, char const ** argV):argc(argC),argv(argV){} //||||||||||||||||||||||||||||||||||||||||||||||||||||||| //-----BuildIndexArgumentParser-------------------------- //||||||||||||||||||||||||||||||||||||||||||||||||||||||| po::options_description BuildIndexArgumentParser::general_options() { po::options_description general("General"); general.add_options() ("help,h", "Print this help message and exit") ("license", "Print licensing information and exit"); return general; } po::options_description BuildIndexArgumentParser::positional_options() { po::options_description parameters("Required"); parameters.add_options() ("INPUT", po::value()->required(), "Reference genomes in (multi-) FASTA format.") ("KMER_WEIGHT", po::value()->required(), "Number of non-gap positions in a k-mer (For ungapped k-mers this is the k-mer size)."); return parameters; } po::options_description BuildIndexArgumentParser::build_options() { po::options_description options("Options"); options.add_options() ("outfile,o", po::value(), "Set output file name [Default: INPUT.kix]") ("trim,t", po::value(&trim)->default_value(0), "Ignore k-mers with more than t occurrences. [Default: no limit]") ("gap-positions,p", po::value< std::vector >()->multitoken()->composing(), "Gap positions in the k-mer pattern (example: -p 3 6 7 for 1101100111 with k=7). 
[Default: ungapped]") ("do-not-convert-spaces", po::bool_switch(&do_not_convert_spaces)->default_value(false), "Do not convert all spaces in reference ids to underscores [Default: converting is on]") ("trim-after-space", po::bool_switch(&trim_ids)->default_value(false), "Trim all reference ids after first space [Default: false]"); return options; } void BuildIndexArgumentParser::init_help(po::options_description visible_options) { std::stringstream help_message; help_message << "Copyright (c) 2015-2017, Martin S. Lindner and the HiLive contributors. See CONTRIBUTORS for more info." << std::endl; help_message << "All rights reserved" << std::endl << std::endl; help_message << "HiLive is open-source software. Check with --license for details." << std::endl << std::endl; help_message << "Usage: " << std::endl << " hilive-build INPUT KMER_WEIGHT [options]" << std::endl << std::endl; help_message << "Required:" << std::endl; help_message << " INPUT Reference genomes in (multi-) FASTA format." << std::endl; help_message << " KMER_WEIGHT Number of non-gap positions in a k-mer (For ungapped k-mers this is the k-mer size)." << std::endl; help_message << visible_options; help = help_message.str(); } bool BuildIndexArgumentParser::set_positional_variables(po::variables_map vm) { // Name of the input fasta file fasta_name = vm["INPUT"].as(); // User-defined k-mer weight kmer_weight = vm["KMER_WEIGHT"].as(); // Check if input file exists if ( !file_exists(fasta_name) ){ std::cerr << "Input error: Could not find input file " << fasta_name << std::endl; return false; } // Check for maximal k-mer size CountType maxKmerWeight = sizeof(HashIntoType)*4; if ( kmer_weight > maxKmerWeight ) { std::cerr << "K-mer weight is too high. Maximal k-mer weight is " << maxKmerWeight << "." 
<< std::endl; return false; } return true; } bool BuildIndexArgumentParser::set_build_variables(po::variables_map vm) { if ( vm.count("gap-positions") ) { gap_positions = vm["gap-positions"].as< std::vector >(); } // Init the k-mer structure if ( ! globalAlignmentSettings.set_kmer(kmer_weight, gap_positions) ) { return false; } // Name of the ouput .kix file if (vm.count("outfile")) { index_name = vm["outfile"].as(); } else { index_name = fasta_name + std::string(".kix"); } return true; } void BuildIndexArgumentParser::report() { std::cout << "K-mer weight: " << (uint16_t) globalAlignmentSettings.get_kmer_weight() << std::endl; std::cout << "K-mer span: " << (uint16_t) globalAlignmentSettings.get_kmer_span() << std::endl; std::cout << "K-mer gap positions: "; if ( globalAlignmentSettings.get_kmer_gaps().size() > 0 ) { for ( auto pos : globalAlignmentSettings.get_kmer_gaps() ) { if ( pos != *(globalAlignmentSettings.get_kmer_gaps().begin()) ) std::cout << ","; std::cout << (uint16_t) pos; } std::cout << std::endl; } else { std::cout << "-" << std::endl; } std::cout << std::endl; } int BuildIndexArgumentParser::parseCommandLineArguments() { po::options_description gen_opt = general_options(); po::options_description pos_opt = positional_options(); po::options_description build_opt = build_options(); po::options_description cmdline_options; cmdline_options.add(pos_opt).add(gen_opt).add(build_opt); po::options_description visible_options; visible_options.add(gen_opt).add(build_opt); init_help(visible_options); po::positional_options_description p; p.add("INPUT", 1); p.add("KMER_WEIGHT", 1); po::variables_map vm; try { // parse arguments po::store(po::command_line_parser(argc, argv). 
options(cmdline_options).positional(p).run(), vm); // first check if -h or --help was called if (vm.count("help")) { printHelp(); return 1; } // first check if --license was called if (vm.count("license")) { printLicense(); return 1; } // then check arguments po::notify(vm); } catch ( po::required_option& e ) { std::cerr << "Missing Parameter: " << e.what() << std::endl; return -1; } catch( po::error& e) { std::cerr << "Error while parsing command line options: " << e.what() << std::endl; return -1; } if ( !set_positional_variables(vm) ) { return -1; } if ( !set_build_variables(vm) ) { return -1; } report(); return 0; } //||||||||||||||||||||||||||||||||||||||||||||||||||||||| //-----HiLiveArgumentParser------------------------------ //||||||||||||||||||||||||||||||||||||||||||||||||||||||| po::options_description HiLiveArgumentParser::general_options() { po::options_description general("General"); general.add_options() ("help,h", "Print this help message and exit") ("license", "Print licensing information and exit") ("settings,s", po::value(), "Load settings from file. If command line arguments are given additionally, they are prefered.") ("runinfo", po::value(), "Path to runInfo.xml for parsing read and index lengths [Default (if activated): BC_DIR/../../RunInfo.xml]") ("continue", po::value(), "Continue an interrupted HiLive run from a specified cycle. 
We strongly recommend to load the settings from the previous run using the -s option."); return general; } po::options_description HiLiveArgumentParser::positional_options() { po::options_description parameters("Parameters"); parameters.add_options() ("BC_DIR", po::value(), "Illumina BaseCalls directory") ("INDEX", po::value(), "Path to k-mer index") ("CYCLES", po::value(), "Number of cycles") ("OUTDIR", po::value(), "Directory to store sam files in [Default: ./out"); return parameters; } po::options_description HiLiveArgumentParser::io_options() { po::options_description io_settings("IO settings"); io_settings.add_options() ("temp", po::value(), "Temporary directory for the alignment files [Default: ./temp]") ("bam,B", po::bool_switch(), "Create BAM files instead of SAM files [Default: false]") ("output-cycles,O", po::value>()->multitoken()->composing(), "Cycles for alignment output. The respective temporary files are kept. [Default: last cycle]") ("extended-cigar", po::bool_switch(), "Activate extended CIGAR format (= and X instead of only M) in output files [Default: false]") ("keep-files,k", po::value>()->multitoken()->composing(), "Keep intermediate alignment files for these cycles. The last cycle is always kept. [Default: None]") ("keep-all-files,K", po::bool_switch(), "Keep all intermediate alignment files [Default: false]") ("min-as-ratio", po::value(), "Minimum alignment score (relative to the current read length) for alignments to be reported (0-1) [Default: 0 - Report all alignments]") ("force-resort", po::bool_switch(), "If set, the align files are always sorted before output. Existing sorted align files are overwritten [Default: false]") ("lanes,l", po::value< std::vector >()->multitoken()->composing(), "Select lane [Default: all lanes]") ("tiles,t", po::value< std::vector >()->multitoken()->composing(), "Select tile numbers [Default: all tiles]") ("reads,r", po::value< std::vector >()->multitoken()->composing(), "Enumerate read lengths and type. 
Example: -r 101R 8B 8B 101R equals paired-end sequencing with 2x101bp reads and 2x8bp barcodes. Overwrites information of runInfo.xml. [Default: single end reads without barcodes]"); return io_settings; } po::options_description HiLiveArgumentParser::alignment_options() { po::options_description alignment("Alignment settings"); alignment.add_options() ("min-errors,e", po::value(), "Number of errors tolerated in read alignment [Default: 2]") ("mode,m", po::value(), "Alignment mode. [ALL|A]: Report all alignments; [BESTN#|N#]: Report alignments of the best # scores; " "[ALLBEST|H]: Report all alignments with the best score (similar to N1); [UNIQUE|U]: Report only unique alignments; [ANYBEST|B]: Report one best alignment (default)") ("disable-ohw-filter", po::bool_switch(), "Disable the One-Hit Wonder filter [Default: false]") ("start-ohw", po::value(), "First cycle to apply One-Hit Wonder filter [Default: 20]") ("window,w", po::value(), "Set the window size to search for alignment extension, i.e. maximum total insertion/deletion size [Default: 5]") ("min-quality", po::value(), "Minimum allowed basecall quality [Default: 1]") ("barcodes,b", po::value< std::vector >()->multitoken()->composing(), "Enumerate barcodes (must have same length) for demultiplexing, e.g. -b AGGATC -b CCCTTT [Default: no demultiplexing]") ("barcode-errors,E", po::value< std::vector >()->multitoken()->composing(), "Enumerate the number of tolerated errors (only SNPs) for each barcode fragment, e.g. -E 2 2 [Default: 1 per fragment]") ("keep-all-barcodes", po::bool_switch()->default_value(false), "Align and output all barcodes [Default: false]"); return alignment; } po::options_description HiLiveArgumentParser::technical_options() { po::options_description technical("Technical settings"); technical.add_options() ("block-size", po::value(), "Block size for the alignment input/output stream in Bytes. Append 'K' or 'M' to specify in Kilobytes or Megabytes, respectively (e.g. 
'--block-size 64M' for 64 Megabytes)") ("compression,c", po::value(), "Compress alignment files. 0: no compression 1: Deflate (smaller) 2: LZ4 (faster; default)") ("num-threads,n", po::value(), "Number of threads to spawn [Default: all available]") ("num-out-threads,N", po::value(), "Maximum number of threads to use for output if threads are not idle [Default: half of -n]"); return technical; } void HiLiveArgumentParser::init_help(po::options_description visible_options) { std::stringstream help_message; help_message << "Copyright (c) 2015-2017, Martin S. Lindner and the HiLive contributors. See CONTRIBUTORS for more info." << std::endl; help_message << "All rights reserved" << std::endl << std::endl; help_message << "HiLive is open-source software. Check with --license for details." << std::endl << std::endl; help_message << "Usage: " << std::endl << " hilive BC_DIR INDEX CYCLES OUTDIR [options]" << std::endl << std::endl; help_message << "Required:" << std::endl; help_message << " BC_DIR Illumina BaseCalls directory of the sequencing run to analyze" << std::endl; help_message << " INDEX Path to k-mer index file (*.kix)" << std::endl; help_message << " CYCLES Total number of sequencing cycles" << std::endl; help_message << " OUTDIR Output directory" << std::endl; help_message << visible_options; help = help_message.str(); } bool HiLiveArgumentParser::checkPaths() { if (!file_exists(globalAlignmentSettings.get_index_fname())){ std::cerr << "Input error: Could not find k-mer index file " << globalAlignmentSettings.get_index_fname() << std::endl; return false; } std::size_t found = globalAlignmentSettings.get_root().find("BaseCalls"); if (!(found != std::string::npos && found >= globalAlignmentSettings.get_root().size()-10)) { std::cerr << "Warning: BaseCalls directory seems to be invalid: " << globalAlignmentSettings.get_root() << std::endl; } if (!is_directory(globalAlignmentSettings.get_root())){ std::cerr << "Input error: Could not find BaseCalls directory " << 
globalAlignmentSettings.get_root() << std::endl; return false; } for ( uint16_t ln : globalAlignmentSettings.get_lanes() ) { std::string ln_dir = globalAlignmentSettings.get_root(); if ( ln < 10 ) ln_dir += "/L00"; else if ( ln < 100 ) ln_dir += "/L0"; else ln_dir += "/L"; ln_dir += std::to_string(ln); if (!is_directory(ln_dir)){ std::cerr << "Input error: Could not find location of Lane " << ln << ": " << ln_dir << std::endl; return false; } } if ( !is_directory(globalAlignmentSettings.get_temp_dir())) { boost::filesystem::create_directories(globalAlignmentSettings.get_temp_dir()); } if ( !is_directory(globalAlignmentSettings.get_out_dir())) { boost::filesystem::create_directories(globalAlignmentSettings.get_out_dir()); } return true; } void HiLiveArgumentParser::report() { std::cout << "Running HiLive with " << globalAlignmentSettings.get_num_threads() << " thread(s)." << std::endl; std::cout << "BaseCalls directory: " << globalAlignmentSettings.get_root() << std::endl; if (globalAlignmentSettings.get_temp_dir() != "") { std::cout << "Temporary directory: " << globalAlignmentSettings.get_temp_dir() << std::endl; } if (!globalAlignmentSettings.get_write_bam()) std::cout << "SAM output directory: " << globalAlignmentSettings.get_out_dir() << std::endl; else std::cout << "BAM output directory: " << globalAlignmentSettings.get_out_dir() << std::endl; std::cout << "Lanes: "; for ( uint16_t ln : globalAlignmentSettings.get_lanes() ) std::cout << ln << " "; std::cout << std::endl; std::cout << "K-mer index: " << globalAlignmentSettings.get_index_fname() << std::endl; std::cout << "Read lengths: "; std::string barcode_suffix; for ( uint16_t read = 0; read != globalAlignmentSettings.get_seqs().size(); read ++) { std::cout << globalAlignmentSettings.getSeqById(read).length; barcode_suffix = globalAlignmentSettings.getSeqById(read).isBarcode() ? 
"B" : "R"; std::cout << barcode_suffix << " "; } std::cout << std::endl; std::cout << "Mapping error: " << globalAlignmentSettings.get_min_errors() << std::endl; if (globalAlignmentSettings.get_any_best_hit_mode()) std::cout << "Mapping mode: Any-Best-Hit-Mode" << std::endl; else if (globalAlignmentSettings.get_all_best_hit_mode()) std::cout << "Mapping mode: All-Best-Hit-Mode" << std::endl; else if (globalAlignmentSettings.get_all_best_n_scores_mode()) std::cout << "Mapping mode: All-Best-N-Scores-Mode with N=" << globalAlignmentSettings.get_best_n() << std::endl; else if (globalAlignmentSettings.get_unique_hit_mode()) std::cout << "Mapping mode: Unique-Hits-Mode" << std::endl; else std::cout << "Mapping mode: All-Hits-Mode" << std::endl; if ( globalAlignmentSettings.get_start_cycle() > 1 ) { std::cout << std::endl; std::cout << "----- CONTINUE RUN FROM CYCLE " << cmd_settings.at("continue").as() << " -----" << std::endl; } std::cout << std::endl; } bool HiLiveArgumentParser::parseRunInfo(po::variables_map vm) { if ( ! vm.count("runinfo")) return false; boost::property_tree::ptree tree; read_xml(tree, vm["runinfo"].as()); using boost::property_tree::ptree; if (!tree.empty() && tree.count("RunInfo")!=0) { ptree ptree_RunInfo = tree.get_child("RunInfo"); if (ptree_RunInfo.count("Run")!=0) { ptree ptree_Run = ptree_RunInfo.get_child("Run"); if (ptree_Run.count("Reads")!=0) { ptree ptree_Reads = ptree_Run.get_child("Reads"); // Get the sequence structure and total number of cycles std::vector sequences; CountType num_cycles = 0; for (const auto &read : ptree_Reads) { std::string sequence = ""; sequence += read.second.get(".NumCycles"); sequence += read.second.get(".IsIndexedRead") == "N" ? 
"R" : "B"; sequences.push_back(sequence); num_cycles += read.second.get(".NumCycles"); } if ( sequences.size() > 0 ) runInfo_settings.add_child("settings.sequences", getXMLnode_vector(sequences)); runInfo_settings.put("settings.cycles", num_cycles); if (ptree_Run.count("FlowcellLayout")!=0) { ptree ptree_FlowcellLayout = ptree_Run.get_child("FlowcellLayout"); // Get the lanes std::vector lanes_vec(ptree_FlowcellLayout.get(".LaneCount")); std::iota(lanes_vec.begin(), lanes_vec.end(), 1); runInfo_settings.add_child("settings.lanes", getXMLnode_vector(lanes_vec)); // Get the tiles std::vector tiles_vec = flowcell_layout_to_tile_numbers( ptree_FlowcellLayout.get(".SurfaceCount"), ptree_FlowcellLayout.get(".SwathCount"), ptree_FlowcellLayout.get(".TileCount") ); runInfo_settings.add_child("settings.tiles", getXMLnode_vector(tiles_vec)); } } } } return true; } int HiLiveArgumentParser::parseCommandLineArguments() { this->set_required_parameters(); // Init general options po::options_description gen_opt = general_options(); po::variables_map vm; // Init all other options po::options_description pos_opt = positional_options(); po::options_description io_opt = io_options(); po::options_description align_opt = alignment_options(); po::options_description tech_opt = technical_options(); // All command line options po::options_description cmdline_options; cmdline_options.add(gen_opt).add(pos_opt).add(io_opt).add(align_opt).add(tech_opt); // Options visible in the help po::options_description visible_options; visible_options.add(gen_opt).add(io_opt).add(align_opt).add(tech_opt); init_help(visible_options); // First parameter iteration for general options (includes help, license and input settings file. 
try { po::store(po::command_line_parser(argc, argv).options(gen_opt).allow_unregistered().run(), vm); // first check if -h or --help was called if (vm.count("help")) { printHelp(); return 1; } // first check if --license was called if (vm.count("license")) { printLicense(); return 1; } vm.notify(); // Load input settings if exist if ( vm.count("settings")) { if ( ! read_xml(input_settings, vm["settings"].as()) ) { std::cerr << "Input settings file not found: " << vm["settings"].as() << std::endl; return -1; } } else if ( isRequired("settings") ) { throw po::required_option("settings"); } if ( vm.count("runinfo") ) { if ( ! parseRunInfo(vm) ) { std::cerr << "Error while parsing Run Info file: " << vm["runinfo"].as() << std::endl; return -1; } } else if ( isRequired("runinfo") ) { throw po::required_option("runinfo"); } } catch( po::error& e) { std::cerr << "Error while parsing command line options: " << std::endl << e.what() << std::endl; return -1; } po::positional_options_description p; p.add("BC_DIR", 1); p.add("INDEX", 1); p.add("CYCLES", 1); p.add("OUTDIR", 1); // Parse all command line arguments to cmd_settings try { // parse arguments po::store(po::command_line_parser(argc, argv).options(cmdline_options).positional(p).run(), cmd_settings); // then check arguments po::notify(cmd_settings); } catch ( po::required_option& e ) { std::cerr << "Missing Parameter: " << e.what() << std::endl; return -1; } catch( po::error& e) { std::cerr << "Error while parsing command line options: " << e.what() << std::endl; return -1; } // Set all options from command line and input files if ( !set_options() ) { return -1; } if ( !checkPaths() ) { return -1; } // Report the basic settings report(); return 0; } bool HiLiveArgumentParser::set_options() { try { // Set continue cycle if given by the user if ( cmd_settings.count("continue") ) { globalAlignmentSettings.set_start_cycle(cmd_settings.at("continue").as()); } else { globalAlignmentSettings.set_start_cycle(1); } // Set 
positional arguments set_option("BC_DIR", "settings.paths.root", "", &AlignmentSettings::set_root); set_option("INDEX", "settings.paths.index", "", &AlignmentSettings::set_index_fname); set_option("CYCLES", "settings.cycles", 0, &AlignmentSettings::set_cycles); set_option("OUTDIR", "settings.paths.out_dir", "./out", &AlignmentSettings::set_out_dir); // Set I/O options set_option("temp", "settings.paths.temp_dir", "./temp", &AlignmentSettings::set_temp_dir); set_option("bam", "settings.out.bam", false, &AlignmentSettings::set_write_bam); std::vector output_cycles = {globalAlignmentSettings.get_cycles()}; set_option>("output-cycles", "settings.out.cycles", output_cycles, &AlignmentSettings::set_output_cycles); set_option("extended-cigar", "settings.out.extended_cigar", false, &AlignmentSettings::set_extended_cigar); if ( cmd_settings.at("keep-all-files").as() ) { std::vectorkeep_all_files (globalAlignmentSettings.get_cycles()); std::iota(keep_all_files.begin(), keep_all_files.end(), 1); globalAlignmentSettings.set_keep_aln_files(keep_all_files); } else { set_option>("keep-files", "settings.technical.keep_aln_files", std::vector(), &AlignmentSettings::set_keep_aln_files); } set_option("min-as-ratio", "settings.out.min_as_ratio", 0.0f, &AlignmentSettings::set_min_as_ratio); set_option>("lanes", "settings.lanes", all_lanes(), &AlignmentSettings::set_lanes); set_option>("tiles", "settings.tiles", all_tiles(), &AlignmentSettings::set_tiles); set_option("force-resort", "settings.out.force-resort", false, &AlignmentSettings::set_force_resort); // Set alignment options std::vector default_read_structure; default_read_structure.push_back(std::to_string(globalAlignmentSettings.get_cycles()) + "R"); set_option>("reads", "settings.sequences", default_read_structure, &AlignmentSettings::set_read_structure); set_option("min-errors", "settings.min_errors", 2, &AlignmentSettings::set_min_errors); set_option("mode", "settings.mode", "ANYBEST", &AlignmentSettings::set_mode); 
set_option("disable-ohw-filter", "settings.align.discard_ohw", false, &AlignmentSettings::disable_ohw); set_option("start-ohw", "settings.align.start_ohw", 20, &AlignmentSettings::set_start_ohw); set_option("window", "settings.align.window", 5, &AlignmentSettings::set_window); set_option("min-quality", "settings.align.min_qual", 1, &AlignmentSettings::set_min_qual); std::vector barcode_sequences_default; set_option>("barcodes", "settings.barcodes.sequences", barcode_sequences_default, &AlignmentSettings::set_barcodes); std::vector barcode_errors_default = {2}; set_option>("barcode-errors", "settings.barcodes.errors", barcode_errors_default, &AlignmentSettings::set_barcode_errors); set_option("keep-all-barcodes", "settings.barcodes.keep_all", false, &AlignmentSettings::set_keep_all_barcodes); // Set technical options set_option("block-size", "settings.technical.block_size", "64M", &AlignmentSettings::set_block_size); set_option("compression", "settings.technical.compression_format", 2, &AlignmentSettings::set_compression_format); CountType n_cpu = std::thread::hardware_concurrency(); CountType n_threads_default = 1; if (n_cpu > 1) n_threads_default = std::min( n_cpu, CountType( globalAlignmentSettings.get_lanes().size() * globalAlignmentSettings.get_tiles().size() ) ) ; set_option("num-threads", "settings.technical.num_threads", n_threads_default, &AlignmentSettings::set_num_threads); set_option("num-out-threads", "settings.technical.num_out_threads", globalAlignmentSettings.get_num_threads()/2, &AlignmentSettings::set_num_out_threads); } catch ( std::exception & ex ) { std::cerr << "Error while parsing options: " << std::endl << ex.what() << std::endl; return false; } return true; } //||||||||||||||||||||||||||||||||||||||||||||||||||||||| //-----HiLiveOutArgumentParser-------------------------- //||||||||||||||||||||||||||||||||||||||||||||||||||||||| void HiLiveOutArgumentParser::init_help(po::options_description visible_options) { std::stringstream help_message; 
help_message << "Copyright (c) 2015-2017, Martin S. Lindner and the HiLive contributors. See CONTRIBUTORS for more info." << std::endl; help_message << "All rights reserved" << std::endl << std::endl; help_message << "HiLive is open-source software. Check with --license for details." << std::endl << std::endl; help_message << "Usage: " << std::endl << " hilive-out --settings /path/to/settings/file [options]" << std::endl << std::endl; help_message << "Required:" << std::endl; help_message << " settings Path to a HiLive settings file (by default, the file is in the temp directory of the respective run)" << std::endl; help_message << std::endl << "All parameters can be set as for the HiLive main program." << std::endl; help_message << "By default, only output files for the last cycle are produced." << std::endl; help_message << "Use the --output-cycles parameter to declare different cycle numbers (will only work if --keep-files was activated for the respective HiLive run)" << std::endl; help_message << visible_options; help = help_message.str(); } void HiLiveOutArgumentParser::report() { if (globalAlignmentSettings.get_temp_dir() != "") { std::cout << "Temporary directory: " << globalAlignmentSettings.get_temp_dir() << std::endl; } if (!globalAlignmentSettings.get_write_bam()) std::cout << "SAM output directory: " << globalAlignmentSettings.get_out_dir() << std::endl; else std::cout << "BAM output directory: " << globalAlignmentSettings.get_out_dir() << std::endl; std::cout << "Lanes: "; for ( uint16_t ln : globalAlignmentSettings.get_lanes() ) std::cout << ln << " "; std::cout << std::endl; std::cout << "K-mer index: " << globalAlignmentSettings.get_index_fname() << std::endl; std::cout << "Total Read lengths: "; std::string barcode_suffix; for ( uint16_t read = 0; read != globalAlignmentSettings.get_seqs().size(); read ++) { std::cout << globalAlignmentSettings.getSeqById(read).length; barcode_suffix = globalAlignmentSettings.getSeqById(read).isBarcode() ? 
"B" : "R"; std::cout << barcode_suffix << " "; } std::cout << std::endl; std::cout << "Mapping error: " << globalAlignmentSettings.get_min_errors() << std::endl; if (globalAlignmentSettings.get_any_best_hit_mode()) std::cout << "Mapping mode: Any-Best-Hit-Mode" << std::endl; else if (globalAlignmentSettings.get_all_best_hit_mode()) std::cout << "Mapping mode: All-Best-Hit-Mode" << std::endl; else if (globalAlignmentSettings.get_all_best_n_scores_mode()) std::cout << "Mapping mode: All-Best-N-Scores-Mode with N=" << globalAlignmentSettings.get_best_n() << std::endl; else if (globalAlignmentSettings.get_unique_hit_mode()) std::cout << "Mapping mode: Unique-Hits-Mode" << std::endl; else std::cout << "Mapping mode: All-Hits-Mode" << std::endl; std::cout << "Output Cycles: "; for ( auto cycle : globalAlignmentSettings.get_output_cycles() ) { std::cout << cycle << " "; } std::cout << std::endl; std::cout << std::endl; } HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/lib/argument_parser.h000066400000000000000000000343531321155700500243330ustar00rootroot00000000000000#include #include "headers.h" #include "definitions.h" #include "global_variables.h" #include "parallel.h" namespace po = boost::program_options; /** * Interface for the argument parsers of the different executables. * @author Tobias Loka */ class ArgumentParser { protected: /** Number of command line arguments. */ int argc; /** List of command line arguments. */ char const ** argv; /** Vector of required options. */ std::vector required_options; /** Map of command line arguments. */ po::variables_map cmd_settings; /** Settings property tree obtained from the runInfo. */ boost::property_tree::ptree runInfo_settings; /** Settings property tree obtained from an settings input file. */ boost::property_tree::ptree input_settings; /** Bool describing whether an input settings file was specified. */ bool has_input_settings = false; /** Program license. 
*/ std::string license = "Copyright (c) 2015-2017, Martin S. Lindner and the HiLive contributors. See CONTRIBUTORS for more info.\n" "All rights reserved.\n" "\n" "Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:\n" "\n" "1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.\n" "\n" "2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.\n" "\n" "3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.\n" "\n" "THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE."; /** Help output (differs for each executable). */ std::string help; /** * (virtual) function that is called from the executable's main() to parse the command line arguments. * @return Error code ( 0: continue program; 1: exit with success; -1: exit with error) * @author Tobias Loka */ virtual int parseCommandLineArguments() = 0; /** * Print the license. 
* @author Tobias Loka */ void printLicense(){ std::cout << license << std::endl; }; /** * Init the help text by using the options that should be visible to the user. * @param visible_options The options that shall be visible for the user. * @return help message as string * @author Tobias Loka */ virtual void init_help(po::options_description visible_options) = 0; /** * Print the license. * @author Tobias Loka */ void printHelp(){ std::cout << help << std::endl; }; /** * Report the most important settings to the console. * @author Martin Lindner */ virtual void report() = 0; virtual void set_required_parameters() {}; /** * Check if an option is required. * @param vm_key Key for the virtual map for the parameter to check (equals cmd line option name). * @return true, if the option is required. */ bool isRequired( std::string vm_key) { if ( std::find(required_options.begin(), required_options.end(), vm_key) == required_options.end() ) return false; return true; } /** * Set an option in the globalAlignmentSettings. * Thereby, the different input sources have the following priority: * 1. Command line argument * 2. RunInfo file * 3. Input settings file * @param vm_key Key for the command line's variables map. * @param settings_key Key for the RunInfo and Input settings property tree (must be similar for both files) * @param default_value A default value that is set if the variable is set in none of the input sources (this is not considered if "required" is true) * @param function Function that is called to set the variable in globalAlignmentSettings. Must be of type void &AlignmentSettings::*(T). 
* @param required true, if the option must be set by the user (from one of the input sources) * @author Tobias Loka */ template void set_option(std::string vm_key, std::string settings_key, T default_value, void (AlignmentSettings::*function)(T)) { set_option_impl(vm_key, settings_key, default_value, function, isRequired(vm_key), static_cast(0)); } /** * General implementation of set_option(...). */ template void set_option_impl(std::string vm_key, std::string settings_key, T default_value, void (AlignmentSettings::*function)(T), bool required, T*) { T value = default_value; bool was_set = false; boost::optional isv = input_settings.get_optional(settings_key); boost::optional rsv = runInfo_settings.get_optional(settings_key); // User parameter -> first priority if ( cmd_settings.count(vm_key) ) { value = cmd_settings[vm_key].as(); was_set = true; } // Settings file -> second priority else if ( isv ) { value = isv.get(); was_set = true; } // RunInfo file -> third priority else if ( rsv ) { value = rsv.get(); was_set = true; } // Throw exception if unset if ( required && !was_set ) throw po::required_option(vm_key); // Otherwise set value auto binded_function = std::bind(function, &globalAlignmentSettings, std::placeholders::_1); binded_function(value); } /** * Overload of set_option_impl for bool data type. 
*/ void set_option_impl(std::string vm_key, std::string settings_key, bool default_value, void (AlignmentSettings::*function)(bool), bool required, bool *) { bool value = default_value; bool was_set = false; boost::optional isv = input_settings.get_optional(settings_key); boost::optional rsv = runInfo_settings.get_optional(settings_key); if ( cmd_settings[vm_key].as() != default_value ) { value = !default_value; was_set = true; } else if ( isv && isv.get() != default_value ) { value = !default_value; was_set = true; } else if ( rsv && rsv.get() != default_value ) { value = !default_value; was_set = true; } if ( required && !was_set ) throw po::required_option(vm_key); // Otherwise set value auto binded_function = std::bind(function, &globalAlignmentSettings, std::placeholders::_1); binded_function(value); } /** Overload of set_option_impl for std::vector data types. */ template void set_option_impl(std::string vm_key, std::string settings_key, std::vector default_value, void (AlignmentSettings::*function)(std::vector), bool required, std::vector *) { std::vector value; bool was_set = false; auto sub_isv = input_settings.get_child_optional(settings_key); auto sub_rsv = runInfo_settings.get_child_optional(settings_key); // User parameter -> first priority if ( cmd_settings.count(vm_key) ) { if ( vm_key == "reads") { std::cout << "USED PARAMETER" << std::endl; } value = cmd_settings[vm_key].as>(); was_set = true; } // Settings file -> third priority else if ( sub_isv && sub_isv.get().count("el") ) { for ( auto& v : sub_isv.get() ) { if ( v.first == "el" ) value.push_back(v.second.get_value()); } was_set = true; } // RunInfo file -> second priority else if ( sub_rsv && sub_rsv.get().count("el") ) { for ( auto& v : sub_rsv.get() ) { if ( v.first == "el" ) value.push_back(v.second.get_value()); } was_set = true; } // Throw exception if unset if ( required && !was_set ) throw po::required_option(vm_key); if ( !was_set) { value = default_value; } // Otherwise set value 
auto binded_function = std::bind(function, &globalAlignmentSettings, std::placeholders::_1); binded_function(value); } public: /** * Default constructor. * @argC Number of command line arguments. * @argV List of command line arguments. * @author Tobias Loka */ explicit ArgumentParser(int argC, char const ** argV); /** * Virtual destructor. * @author Tobias Loka */ virtual ~ArgumentParser(){}; }; /** * Class to parse arguments for HiLive build. */ class BuildIndexArgumentParser : public ArgumentParser { uint16_t kmer_weight; std::vector gap_positions = {}; /** * Use the constructor of the inherited ArgumentParser class. */ using ArgumentParser::ArgumentParser; /** * General options of HiLive build. * @return Option descriptor containing all general options that can be set by the user. * @author Martin Lindner */ po::options_description general_options(); /** * Positional options of HiLive build. * @return Option descriptor containing all positional options that must be set by the user. * @author Martin Lindner */ po::options_description positional_options(); /** * Build options of HiLive build. * @return Option descriptor containing all positional options that must be set by the user. * @author Martin Lindner */ po::options_description build_options(); /** * Set all variables for the positional command arguments. * @param vm The variables map containing the user parameters. * @return true on success, false otherwise * @author Tobias Loka */ bool set_positional_variables(po::variables_map vm); /** * Set all variables for the build arguments. * @param vm The variables map containing the user parameters. 
* @return true on success, false otherwise * @author Tobias Loka */ bool set_build_variables(po::variables_map vm); void report() override; void init_help(po::options_description visible_options) override; public: // name of the index file std::string index_name; // name of the input fasta file std::string fasta_name; // trimming parameter unsigned trim; // do_not_convert_spaces_switch bool do_not_convert_spaces; // trim_ids switch bool trim_ids; int parseCommandLineArguments() override; }; /** * Class to parse arguments for HiLive. */ class HiLiveArgumentParser : public ArgumentParser { protected: /** * Use the constructor of the inherited ArgumentParser class. */ using ArgumentParser::ArgumentParser; /** * General options of HiLive. * @return Option descriptor containing all general options that can be set by the user. * @author Martin Lindner */ po::options_description general_options(); /** * Positional options of HiLive. * @return Option descriptor containing all positional options that must be set by the user. * @author Martin Lindner */ po::options_description positional_options(); /** * I/O options of HiLive. * @return Option descriptor containing all I/O options that can be set by the user. * @author Martin Lindner */ po::options_description io_options(); /** * Alignment options of HiLive. * @return Option descriptor containing all alignment options that can be set by the user. * @author Martin Lindner */ po::options_description alignment_options(); /** * Technical options of HiLive. * @return Option descriptor containing all technical options that can be set by the user. * @author Martin Lindner */ po::options_description technical_options(); /** * Check all paths that are relevant for the functionality of HiLive. * @return true if all paths and files are accessible * @author Jakob Schulze */ bool checkPaths(); /** * Parse Lanes, Tiles and read fragments from a RunInfo.xml file. * @param vm The variables map containing the user parameters. 
* @return true on success, false otherwise * @author Jakob Schulze */ bool parseRunInfo(po::variables_map vm); virtual void report() override; void init_help(po::options_description visible_options) override; bool set_options(); virtual void set_required_parameters() override { required_options = {"BC_DIR", "INDEX", "CYCLES"}; } public: int parseCommandLineArguments() override; }; /** * Class to parse arguments for HiLive out. */ class HiLiveOutArgumentParser : public HiLiveArgumentParser { using HiLiveArgumentParser::HiLiveArgumentParser; void init_help(po::options_description visible_options) override; void report() override; void set_required_parameters() override { required_options = {"settings", "INDEX"}; }; }; //class HiLiveOutArgumentParser : public ArgumentParser { // // /** // * Use the constructor of the inherited ArgumentParser class. // */ // using ArgumentParser::ArgumentParser; // // /** // * General options of HiLive build. // * @return Option descriptor containing all general options that can be set by the user. // * @author Martin Lindner // */ // po::options_description general_options(); // // /** // * Positional options of HiLive build. // * @return Option descriptor containing all positional options that must be set by the user. // * @author Martin Lindner // */ // po::options_description positional_options(); // // /** // * Build options of HiLive build. // * @return Option descriptor containing all positional options that must be set by the user. // * @author Martin Lindner // */ // po::options_description output_options(); // // /** // * Set all variables for the positional command arguments. // * @param vm The variables map containing the user parameters. // * @return true on success, false otherwise // * @author Tobias Loka // */ // bool set_positional_variables(po::variables_map vm); // // /** // * Set all variables for the build arguments. // * @param vm The variables map containing the user parameters. 
// * @return true on success, false otherwise // * @author Tobias Loka // */ // bool set_output_variables(po::variables_map vm); // // void report() override; // // void init_help(po::options_description visible_options) override; // // bool set_options() override; // //public: // // int parseCommandLineArguments() override; // //}; HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/lib/definitions.h000066400000000000000000000244011321155700500234410ustar00rootroot00000000000000#ifndef DEFINITIONS_H #define DEFINITIONS_H #include "headers.h" ///////////////////////////////////////////// ////////// Sequences / Nucleotides ////////// ///////////////////////////////////////////// /** * Two-bit representation of a nucleotide. * @param ch Nucleotide as char * @return 2-bit representation */ #define twobit_repr(ch) ((toupper(ch)) == 'A' ? 0LL : \ (toupper(ch)) == 'C' ? 1LL : \ (toupper(ch)) == 'G' ? 2LL : 3LL) /** * Complementary two-bit representation of a nucleotide. * @param ch Nucleotide as char * @return Complementary 2-bit representation */ #define twobit_comp(ch) ((toupper(ch)) == 'A' ? 3LL : \ (toupper(ch)) == 'C' ? 2LL : \ (toupper(ch)) == 'G' ? 1LL : 0LL) /** * Nucleotide of a 2-bit representation. * @param n 2-bit representation of a nucleotide. * @return Nucleotide as char */ #define revtwobit_repr(n) ((n) == 0 ? 'A' : \ (n) == 1 ? 'C' : \ (n) == 2 ? 'G' : 'T') /** * Supported nucleotides. */ const std::string seq_chars = "ACGTacgt"; //////////////////////////////////////// ////////// Genome Identifiers ////////// //////////////////////////////////////// /** * Type for the identifier of genomes (gid). */ typedef uint32_t GenomeIdType; /** * Constant variable to tag a k-mer as "trimmed". 
*/ const GenomeIdType TRIMMED = std::numeric_limits::max(); /** * A list of Genome Ids */ typedef std::vector GenomeIdListType; ////////////////////////////////////// ////////// Genome Positions ////////// ////////////////////////////////////// /** * Type for positions in a genome. */ typedef int32_t PositionType; /** * A pair of genome ID and position. */ struct GenomePosType { GenomeIdType gid; PositionType pos; GenomePosType()=default; GenomePosType(GenomeIdType g, PositionType p): gid(g), pos(p) {}; }; /** * Size of a pair of genome ID and position (in bytes) */ const uint64_t GenomePos_size = sizeof(GenomeIdType) + sizeof(PositionType); /** * A vector of GenomePosTypes. */ typedef std::vector GenomePosListType; /** * Iterator on GenomePosListType. */ typedef GenomePosListType::iterator GenomePosListIt; ///////////////////////////////// ////////// K-mer index ////////// ///////////////////////////////// /** * Type to hash k-mers into. * This type also limits the k-mer weight (currently to 32). */ typedef uint64_t HashIntoType; /** * K-mer index type. */ typedef std::vector KmerIndexType; /** * A lightweight type for storing the index. */ typedef std::vector KixRunDB; //////////////////////////////////////// ////////// Integer data types ////////// //////////////////////////////////////// /** * Type for small counters. */ typedef uint16_t CountType; /** * Difference between k-mer position in the read and matching position in the reference. */ typedef int16_t DiffType; //////////////////////////////////////// ////////// Offset definitions ////////// //////////////////////////////////////// /** * Define a mismatch as maximum value of DiffType. */ const DiffType NO_MATCH = std::numeric_limits::max(); /** * Define a trimmed match maximum value of DiffType -1. */ const DiffType TRIMMED_MATCH = std::numeric_limits::max()-1; //////////////////////////////////// ////////// CIGAR elements ////////// //////////////////////////////////// /** * One (internal) CIGAR element. 
*/ struct CigarElement { /** Length of the region. */ CountType length; /** Offset of the region to the original start pos (created by InDels). */ DiffType offset; CigarElement (CountType l, DiffType o): length(l), offset(o) {}; CigarElement (): length(0), offset(NO_MATCH) {}; }; /** * Vector of CIGAR elements, representing the alignment information for one seed. */ typedef std::list CigarVector; /////////////////////////////////////// ////////// Sequence Elements ////////// /////////////////////////////////////// /** * Information about the sequences. * One element can be a read or a barcode. * @author Tobias Loka */ struct SequenceElement { /** The id of the read. Equals the position in the argument list and in the AlignmentSettings::seqs vector (0-based). */ CountType id; /** The mate number. 0 for barcodes, increasing for sequence reads in the given order (1-based). */ CountType mate; /** The length of the respective read. */ CountType length; /** * Constructor of a SequenceElement NULL object. * @author Tobias Loka */ SequenceElement () : id(0), mate(0), length(0) {}; /** * Constructor of a valid SequenceElement object. * @param id The id of the read. * @param m The mate number of the read (0 for barcodes, incrementing for sequence reads) * @param l The length of the read * @author Tobias Loka */ SequenceElement (CountType id, CountType m, CountType l): id(id), mate(m), length(l) {}; /** * Check whether the SequenceElement object is a barcode or not. * @return true, if SequenceElement is a barcode. False if not. * @author Tobias Loka */ bool isBarcode() { return (mate==0);} }; /** * Check if two Sequence elements are equal. */ inline bool operator==(const SequenceElement l, const SequenceElement r) {return (l.length==r.length) && (l.mate==r.mate) && (l.id==r.id);} /** * Checks if two sequence elements are not equal. */ inline bool operator!=(const SequenceElement l, const SequenceElement r) {return !(l==r);} /** * An undefined sequence element (NULL element). 
*/ const SequenceElement NULLSEQ = SequenceElement(); //////////////////////////////////////////// ////////// Unmodifiable variables ////////// //////////////////////////////////////////// /** * Exception specialization for Unmodifiable data types. * @author Tobias Loka */ class unmodifiable_error : public std::logic_error { public: using std::logic_error::logic_error; }; /** * Template to define data types that can only be set once. * @type T Data type of the unmodifiable object. * @author Tobias Loka */ template class Unmodifiable { private: /** The unmodifiable object. */ T unmodifiable_object; /** Flag to check if the object was already set once. */ bool setFlag = false; public: /** Constructor without setting the object (to only declare the object).*/ Unmodifiable(){ } /** Constructor with setting the object (to init the object).*/ Unmodifiable(T object) { unmodifiable_object = object; } /** Automatic cast to of the unmodifiable to the object type. */ operator T() { return unmodifiable_object; } /** * Set the unmodifiable object (will only work once!). * @param object The object to be copied to this unmodifiable data type. * @return true if setting was successful * @author Tobias Loka */ void set(T object) { if ( isSet() ) { throw unmodifiable_error("Tried to modify unmodifiable object"); } unmodifiable_object = object; setFlag = true; } /** * Check if the object was already set. * @return true if the object was already set. * @author Tobias Loka */ bool isSet() { return setFlag; } /** * Return a copy of the unmodifiable object. * @param allow_unset if false, an exception is thrown when the object was not set before. Should only be true for * objects that require access to certain properties before their initialization (e.g. to check a container's size * without knowing if the container was already set). * @return (copy/value of) the unmodifiable object * @author Tobias Loka */ T get(bool allow_unset = false ) { if ( ! isSet() && ! 
allow_unset) { throw unmodifiable_error("Tried to access uninitialized object"); } return unmodifiable_object; } }; //////////////////////////////// ////////// Exceptions ////////// //////////////////////////////// class io_error : public std::runtime_error { public: using std::runtime_error::runtime_error; }; class file_open_error : public io_error { public: using io_error::io_error; }; class file_not_exist_error : public io_error { public: using io_error::io_error; }; class file_format_error : public io_error { public: using io_error::io_error; }; /////////////////////////////////////// ////////// Other definitions ////////// /////////////////////////////////////// /** * A list of strings */ typedef std::vector StringListType; /** * The different alignment modes. * @author Tobias Loka */ enum AlignmentMode:char { ALL='A', ALLBEST='H', ANYBEST='B', BESTN='N', UNIQUE='U', UNKNOWN='Z' }; /** * Template to store a map of mutexes. * Ensure that a locked mutex gets always unlocked (on destruction, if necessary). If possible, use a combination of std::lock_guard and get_reference(T). */ template class mutex_map { private: std::map map; std::mutex mut; typename std::map::size_type count(K k) { return map.count(k); } std::mutex& try_emplace(K k) { { std::lock_guard lock(mut); if ( !count(k) ) map.emplace(std::piecewise_construct, std::forward_as_tuple(k), std::forward_as_tuple()); return map.at(k); } } public: void unlock(K k) { if ( count(k) ) map.at(k).unlock(); } void lock(K k) { try_emplace(k); map.at(k).lock(); } std::mutex& at(K k){ return try_emplace(k); } }; /** * A data type that increments an arithmetic field for the time of it's existance. * This functionality can be used to block one slot of a certain capacity. 
*/ template< typename T, typename = typename std::enable_if::value, T>::type > class block_guard { T& val; T blocked_value; public: block_guard( T& value ) : val(value), blocked_value(++val){ } ~block_guard() { --val; } T get_blocked_value(){ return blocked_value; } }; /** * A data type that increments an atomic arithmetic field for the time of it's existance. * This functionality can be used to block one slot of a certain capacity. */ template< typename T, typename = typename std::enable_if::value, T>::type > class atomic_block_guard { std::atomic& val; T blocked_value; public: atomic_block_guard( std::atomic& value ) : val(value), blocked_value(++val) { } ~atomic_block_guard() { --val; } T get_blocked_value(){ return blocked_value; } }; #endif /* DEFINITIONS_H */ HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/lib/global_variables.h000066400000000000000000000003221321155700500244120ustar00rootroot00000000000000#ifndef GLOBAL_VARIABLES_H #define GLOBAL_VARIABLES_H #include "../lib/headers.h" class AlignmentSettings; extern AlignmentSettings globalAlignmentSettings; extern mutex_map fileLocks; #endif HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/lib/headers.h000066400000000000000000000013771321155700500225500ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/lib/illumina_parsers.cpp000066400000000000000000000032511321155700500250320ustar00rootroot00000000000000#include "illumina_parsers.h" // Constructor takes filename and directly loads the whole file uint64_t BclParser::open (std::string fname) { // read the whole file as a chunk data = read_binary_file(fname); // extract the number of reads 
memcpy(&num_reads,data.data(),4); // set the position pointer to the beginning of the data block position = 4; return data.size(); } // Get the next base call char BclParser::next() { if ( position < data.size() ) { position++; return *(data.data()+position-1); } else { throw std::runtime_error("Error reading BCL file: requested position is beyond EOF." ); } } // Check if there are base calls left bool BclParser::has_next() { return (position < data.size()); } // Returns the total number of base calls in the file uint32_t BclParser::size() { return num_reads; } // Constructor takes filename and directly loads the whole file uint64_t FilterParser::open (std::string fname) { // read the whole file as a chunk data = read_binary_file(fname); // extract the number of reads memcpy(&num_reads,data.data()+8,4); // set the position pointer to the beginning of the data block position = 12; return data.size(); } // Get the next filter flag bool FilterParser::next() { if ( position < data.size() ) { position++; return (*(data.data()+position-1) > 0); } else { throw std::runtime_error("Error reading filter file: requested position is beyond EOF." 
); } } // Check if there are filter flags left bool FilterParser::has_next() { return (position < data.size()); } // Returns the total number of filter flags in the file uint32_t FilterParser::size() { return num_reads; } HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/lib/illumina_parsers.h000066400000000000000000000022171321155700500245000ustar00rootroot00000000000000#ifndef ILLUMINA_PARSERS_H #define ILLUMINA_PARSERS_H #include "headers.h" #include "definitions.h" #include "tools.h" // BCL file parser class BclParser { // storage for the raw binary data std::vector data; // current position in data uint32_t position; // number of reads in this bcl file uint32_t num_reads; public: // open file and directly load all data uint64_t open(std::string fname); // Get the next base call char next(); // Check if there are base calls left bool has_next(); // Returns the total number of base calls in the file uint32_t size(); }; // filter file parser class FilterParser { // storage for the raw binary data std::vector data; // current position in data uint32_t position; // number of reads in this filter file uint32_t num_reads; public: // open file and directly load all data uint64_t open(std::string fname); // Get the next filter flag bool next(); // Check if there are filter flags left bool has_next(); // Returns the total number of filter flags in the file uint32_t size(); }; // clocs file parser #endif /* ILLUMINA_PARSERS_H */ HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/lib/kindex.cpp000066400000000000000000000335601321155700500227510ustar00rootroot00000000000000#include "kindex.h" KixBuild::KixBuild() { // default constructor this->db.resize(pow(4,globalAlignmentSettings.get_kmer_weight())); } int KixBuild::add_fasta(const std::string &fname, bool convert_spaces, bool trim_ids) { GenomeIdListType added_ids; return add_fasta(fname, added_ids, convert_spaces, trim_ids); } int KixBuild::add_fasta(const std::string &fname, GenomeIdListType &ids, bool 
convert_spaces, bool trim_ids) { std::ios::sync_with_stdio(false); std::ifstream::sync_with_stdio(false); std::ifstream infile (fname.c_str()); assert(infile.is_open()); std::string line; GenomeIdType seq_id = 0; std::string seq_name; GenomeIdListType added_ids; PositionType sequencePosition; bool startNewSequence = false; std::string tailingKmer; while(getline(infile, line)) { if (line.length() == 0) {continue;}; if (line[line.length()-1] == '\r'){ // check for Windows newline characters (just in case the fasta comes from Windows --> thanks Simon) line.erase(line.length()-1); } if (line[0] == '>') { // header line // initialize a new sequence num_seq += 1; seq_id = num_seq - 1; if (convert_spaces) std::replace( line.begin(), line.end(), ' ', '_'); if (trim_ids) seq_name = line.substr(1,line.find(' ')-1); else seq_name = line.substr(1,line.length()-1); added_ids.push_back(seq_id); seq_names.push_back(seq_name); seq_lengths.push_back(0); // gets later corrected to globalAlignmentSettings.get_kmer_span() - 1 assert(seq_names.size() == num_seq); startNewSequence = true; } else { // sequence line if (startNewSequence) { if (line.length() < globalAlignmentSettings.get_kmer_span()) continue; // ignore sequences shorter than K start_sequence(line, tailingKmer, sequencePosition); startNewSequence = false; } else continue_sequence(line, tailingKmer, sequencePosition); } } infile.close(); ids = added_ids; return added_ids.size(); } /* Start adding all k-mers in a sequence string s to the database. A new ID is created for this sequence. 
Return: num_seq, the number of sequences added so far. Note that this is one larger
        than the 0-based genome ID (num_seq-1) that is stored with the k-mers below.

Parameters:
  s                 first sequence line of the current FASTA record; must be at least
                    kmer_span characters long (asserted)
  tailingKmer       output: the last kmer_span characters of s, used by continue_sequence()
                    to build the k-mers that overlap the next line
  sequencePosition  output: reset to 0 here and advanced by one for each handled start position
*/
GenomeIdType KixBuild::start_sequence(const std::string &s, std::string& tailingKmer, PositionType& sequencePosition) {
    assert(seq_names.size() == num_seq);
    assert(s.length() >= globalAlignmentSettings.get_kmer_span());

    // add sequence kmers to index
    sequencePosition = 0; // use 1-based positions (to allow for negative positions)
    std::string::const_iterator it_s = s.begin();
    std::string::const_iterator last_invalid;
    HashIntoType fw; // forward k-mer

    // The sequence length starts at kmer_span-1 and grows by one per handled start position,
    // so after a line without invalid characters it equals the line length.
    seq_lengths.back()=globalAlignmentSettings.get_kmer_span()-1;

    // Visit every position where a complete k-mer window still fits into s.
    for (; it_s < s.end()-globalAlignmentSettings.get_kmer_span()+1; ++it_s) {
        ++sequencePosition; // use 1-based positions (to allow for negative positions)
        seq_lengths.back()+=1;

        // hash_fw is defined elsewhere. From its use here it presumably returns the position of the
        // last character inside the window that cannot be hashed, or a position before it_s if the
        // whole window is valid -- TODO confirm against hash_fw.
        last_invalid = hash_fw(it_s, s.end(), fw);

        // add k-mer to database
        if (last_invalid < it_s)
            add_kmer(fw,num_seq-1,sequencePosition);
        else {
            // Skip all windows that would contain the invalid character. The jump is capped at the
            // last possible window start so position and length are not advanced past the handled range.
            unsigned jumplength = std::min(last_invalid - it_s, s.end() - globalAlignmentSettings.get_kmer_span() - it_s);
            sequencePosition += jumplength;
            seq_lengths.back() += jumplength;
            it_s = last_invalid; // the loop increment then moves to the first position behind it
        }
    }

    // Remember the last kmer_span characters for the continuation with the next line.
    tailingKmer = s.substr(s.length()-globalAlignmentSettings.get_kmer_span());
    return num_seq;
}


/* Continue adding all k-mers in a sequence string s to the database.
Return: num_seq, the number of sequences added so far. Note that this is one larger
        than the 0-based genome ID (num_seq-1) that is stored with the k-mers below.

Parameters:
  s                 next sequence line of the current FASTA record
  tailingKmer       in/out: the last kmer_span characters handled so far; it is prepended to s
                    and replaced by the last kmer_span characters of the concatenation
  sequencePosition  in/out: running position counter, continued from the previous line
*/
GenomeIdType KixBuild::continue_sequence(const std::string &s, std::string& tailingKmer, PositionType& sequencePosition) {
    assert(seq_names.size() == num_seq);

    // Prepend the tail of the previous line so that k-mers spanning the line break are found.
    std::string concatString = tailingKmer + s;

    // add sequence kmers to index
    // Start at the second character: the window starting at the first character is the tail
    // itself, which was stored by the caller of the previous line.
    std::string::const_iterator it_s = concatString.begin() + 1;
    std::string::const_iterator last_invalid;
    HashIntoType fw; // forward k-mer

    for (; it_s < concatString.end()-globalAlignmentSettings.get_kmer_span()+1; ++it_s) {
        ++sequencePosition; // use 1-based positions (to allow for negative positions)
        seq_lengths.back()+=1;

        // hash_fw is defined elsewhere. From its use here it presumably returns the position of the
        // last character inside the window that cannot be hashed, or a position before it_s if the
        // whole window is valid -- TODO confirm against hash_fw.
        last_invalid = hash_fw(it_s, concatString.end(), fw);

        // add k-mer to database
        if (last_invalid < it_s)
            add_kmer(fw,num_seq-1,sequencePosition);
        else {
            // Skip all windows that would contain the invalid character (capped at the last window start).
            unsigned jumplength = std::min(last_invalid - it_s, concatString.end()- globalAlignmentSettings.get_kmer_span() - it_s);
            sequencePosition += jumplength;
            seq_lengths.back() += jumplength;
            it_s = last_invalid; // the loop increment then moves to the first position behind it
        }
    }

    // Remember the last kmer_span characters for the continuation with the next line.
    tailingKmer = concatString.substr(concatString.length()-globalAlignmentSettings.get_kmer_span());
    return num_seq;
}


/* Add a single k-mer occurrence to the database.
   kmer: hash value of the k-mer; must be smaller than db.size() (asserted)
   id:   0-based genome ID; must be smaller than num_seq (asserted)
   pos:  position of the k-mer in the genome
   Return: always 1
*/
int KixBuild::add_kmer(HashIntoType kmer, GenomeIdType id, PositionType pos) {
    assert(kmer < db.size());
    assert(id < num_seq);
    GenomePosType gp;
    gp.gid = id;
    gp.pos = pos;
    db[kmer].push_back(gp);
    return 1;
}


/* Trim the k-mer index: removes all k-mers with more than max_count occurrences in the reference genomes. Trimmed k-mers are marked by the GenomeIdType TRIMMED (from definitions.h).
*/ uint64_t KixBuild::trim(uint64_t max_count) { uint64_t trimmed = 0; GenomePosType gp_trimmed (TRIMMED,0); for (auto it = db.begin(); it != db.end(); ++it) if ((*it).size() > max_count) { trimmed += (*it).size(); (*it).clear(); (*it).push_back(gp_trimmed); } return trimmed; } std::vector KixBuild::serialize() { // first of all, sort the database entries by position for (auto it = db.begin(); it != db.end(); ++it) std::sort(it->begin(), it->end(), gp_compare); // calculate total size unsigned long int total_size = 0; // The k-mer weight itself total_size += 1; // The number of gaps total_size += 1; // The gaps themselves total_size += globalAlignmentSettings.get_kmer_gaps().size(); // total number of sequences in database total_size += sizeof(GenomeIdType); // sequence names for (uint32_t i = 0; i < seq_names.size(); i++) { uint16_t nm_length = seq_names.size(); total_size += sizeof(uint16_t); total_size += nm_length; } // reference sequence lengths for (uint32_t i = 0; i < seq_lengths.size(); i++) { total_size += sizeof(uint64_t); } // database entries for (auto it = db.begin(); it != db.end(); ++it) { total_size += sizeof(uint32_t); // number of positions uint32_t num_positions = (*it).size(); total_size += num_positions*(sizeof(GenomeIdType) + sizeof(PositionType)); } // create the vector to store the data std::vector data (total_size); char* d = data.data(); // write K uint8_t kk = globalAlignmentSettings.get_kmer_weight(); memcpy(d,&kk,1); d++; // number of gaps std::vector kmer_gaps = globalAlignmentSettings.get_kmer_gaps(); uint8_t gap_num = kmer_gaps.size(); memcpy(d,&gap_num,1); d++; // The gaps themselves for ( uint8_t gap : kmer_gaps) { memcpy(d,&gap,1); d++; } // total number of sequences in database memcpy(d,&num_seq,sizeof(GenomeIdType)); d += sizeof(GenomeIdType); // sequence names for (uint32_t i = 0; i < seq_names.size(); i++) { uint16_t nm_length = seq_names[i].size(); memcpy(d,&nm_length,sizeof(uint16_t)); d += sizeof(uint16_t); 
memcpy(d,seq_names[i].c_str(),nm_length); d += nm_length; } for (uint32_t i = 0; i < seq_lengths.size(); i++) { uint64_t seq_len = seq_lengths[i]; memcpy(d,&seq_len,sizeof(uint64_t)); d += sizeof(uint64_t); } // database entries for (auto it = db.begin(); it != db.end(); ++it) { // number of positions uint32_t num_positions = (*it).size(); //db is an array of GenomePosType structs memcpy(d,&num_positions,sizeof(uint32_t)); d += sizeof(uint32_t); // genome ID and position for(GenomePosListIt entry=(*it).begin(); entry!=(*it).end(); ++entry) { GenomeIdType gid = (*entry).gid; PositionType pos = (*entry).pos; memcpy(d,&gid,sizeof(GenomeIdType)); d += sizeof(GenomeIdType); memcpy(d,&pos,sizeof(PositionType)); d += sizeof(PositionType); } } return data; } uint64_t KixBuild::serialize_file(std::string f) { std::string fname = f; // serialize data std::vector sdata = serialize(); // open binary file FILE* ofile; ofile = fopen(fname.c_str(), "wb"); if (!ofile) { std::cerr << "Error serializing object to file " << fname << ": Could not open file for writing." << std::endl; return 0; } // write all data uint64_t written = fwrite(sdata.data(), 1, sdata.size(), ofile); // close file fclose(ofile); if (written != sdata.size()){ std::cerr << "Error serializing object to file " << fname << ": Total size: " << sdata.size() << " bytes. Written: " << written << " bytes." 
<< std::endl; } return written; } uint64_t KixRun::deserialize(char* d) { // the total number of bytes read uint64_t bytes = 0; // read k-mer weight uint8_t kk; memcpy(&kk,d+bytes,1); bytes++; this->kmer_weight = kk; // read number of gaps in k-mer pattern uint8_t gap_num; memcpy(&gap_num,d+bytes,1); bytes++; // read k-mer pattern for ( uint8_t i = 0; i < gap_num; i++ ) { uint8_t gap; memcpy(&gap, d+bytes, 1); kmer_gaps.push_back(gap); bytes++; } // globalAlignmentSettings.set_kmer_weight(this->kmer_weight); store_kmer(); this->db.resize(pow(4,globalAlignmentSettings.get_kmer_weight())); // read total number of sequences in database memcpy(&num_seq,d+bytes,sizeof(GenomeIdType)); bytes += sizeof(GenomeIdType); // sequence names seq_names.clear(); seq_lengths.clear(); for (uint32_t i = 0; i < num_seq; i++) { uint16_t nm_length; memcpy(&nm_length,d+bytes,sizeof(uint16_t)); bytes += sizeof(uint16_t); char * tmp = new char[nm_length+1]; memcpy(tmp,d+bytes,nm_length); tmp[nm_length] = 0; // make the string null-terminated seq_names.push_back(tmp); delete tmp; bytes += nm_length; } // sequence lengths for (uint32_t i = 0; i < num_seq; i++) { uint64_t seq_len; memcpy(&seq_len,d+bytes,sizeof(uint64_t)); bytes += sizeof(uint64_t); seq_lengths.push_back(seq_len); } // database entries for (auto it = db.begin(); it != db.end(); ++it) { // number of positions uint32_t* num_positions = (uint32_t*)(d+bytes); // total size of data block for this k-mer uint32_t total_size = sizeof(uint32_t) + (*num_positions)*GenomePos_size; // allocate the memory (*it) = d+bytes; // increase pointer bytes += total_size; } return bytes; } uint64_t KixRun::deserialize_file(std::string f) { std::string fname = f; sdata = read_binary_file(f); deserialize(sdata.data()); return sdata.size(); } // return k-mer weight read from index uint8_t KixRun::get_kmer_weight() { return(this->kmer_weight); } /* Retrieve all occurrences (fwd & rc) of kmer in the reference from the index */ GenomePosListType 
KixRun::retrieve_positions(std::string kmerSpan) { // get the reverse complement of the kmer HashIntoType fwHashValue; HashIntoType rcHashValue; hash(kmerSpan.c_str(), fwHashValue, rcHashValue); // std::cout << "FW: " << fwHashValue << std::endl; // std::cout << "REV: " << rcHashValue << std::endl; // obtain the list of positions for each k-mer char* fwd_begin = db[fwHashValue]; uint32_t fwd_len; memcpy(&fwd_len,fwd_begin,sizeof(uint32_t)); char* rev_begin = db[rcHashValue]; uint32_t rev_len; memcpy(&rev_len,rev_begin,sizeof(uint32_t)); // the position list: all positions in all genomes, where the current k-mer was found GenomePosListType pos; pos.reserve(fwd_len+rev_len); // indicate reverse complement hits by negative position and append in reverse order for(uint64_t i = 0; i < rev_len; i++) { GenomePosType rev; memcpy(&rev.gid,rev_begin+sizeof(uint32_t)+(rev_len-1-i)*GenomePos_size, sizeof(GenomeIdType)); memcpy(&rev.pos,rev_begin+sizeof(uint32_t)+(rev_len-1-i)*GenomePos_size+sizeof(GenomeIdType), sizeof(PositionType)); rev.pos = - rev.pos; if (rev.gid == TRIMMED) { pos.clear(); pos.push_back(GenomePosType(TRIMMED,0)); return pos; } pos.push_back(rev); } // then add the forward hits with positive position in normal order --> position list is sorted if index was sorted! 
for(uint64_t i = 0; i < fwd_len; i++) { GenomePosType fwd; memcpy(&fwd.gid,fwd_begin+sizeof(uint32_t)+i*GenomePos_size, sizeof(GenomeIdType)); memcpy(&fwd.pos,fwd_begin+sizeof(uint32_t)+i*GenomePos_size+sizeof(GenomeIdType), sizeof(PositionType)); if (fwd.gid == TRIMMED) { pos.clear(); pos.push_back(GenomePosType(TRIMMED,0)); return pos; } pos.push_back(fwd); } return pos; } /** * get kix-header information only to not load the complete index * structure: * 8 bit (k-mer size) * 32 bit (# sequences) * for each sequence: 16 bit (sequence name length) + sequence name * for each sequence: 32 bit (length of sequence) */ uint64_t KixRun::get_header_information(std::string f) { std::string fname = f; // open binary file FILE* fi; fi = fopen(fname.c_str(), "rb"); if (!fi) { std::cerr << "Error reading binary file " << fname << ": Could not open file." << std::endl; return 0; } uint16_t seq_name_len; uint64_t seq_amount, seq_len; int c; // sequence names & lengths seq_names.clear(); seq_lengths.clear(); fread(&kmer_weight,1,1,fi); // Ignore k-mer gaps uint8_t gap_num; fread(&gap_num,1,sizeof(uint8_t),fi); // read k-mer pattern for ( uint8_t i = 0; i < gap_num; i++ ) { uint8_t gap; fread(&gap, 1, sizeof(uint8_t), fi); } fread(&seq_amount,4,1,fi); //get the names for every sequence for( unsigned a = 0; a < seq_amount; a = a + 1 ){ fread(&seq_name_len,2,1,fi); std::string seq_name; for( unsigned b = 0; b < seq_name_len; b = b + 1 ){ c = fgetc(fi); seq_name.push_back(c); } seq_names.push_back(seq_name); } for( unsigned a = 0; a < seq_amount; a = a + 1 ) { fread(&seq_len,4,1,fi); } seq_lengths.push_back(seq_len); fclose(fi); return seq_amount; } HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/lib/kindex.h000066400000000000000000000061131321155700500224100ustar00rootroot00000000000000#ifndef KINDEX_H #define KINDEX_H #include "headers.h" #include "definitions.h" #include "global_variables.h" #include "tools.h" 
//-------------------------------------------------------------------//
//------  The k-mer index builder: KixBuild -------------------------//
//-------------------------------------------------------------------//

// Builds a k-mer index from FASTA input; the result can be trimmed and
// serialized to a byte buffer or a file.
// NOTE(review): the element types of the std::vector members and return
// values in this header appear to have been lost (bare 'std::vector');
// restore them from the upstream sources before compiling.
class KixBuild {

    // add a single k-mer to the database
    // Note: the index uses 1-based positions (to allow for negative positions)
    int add_kmer(HashIntoType kmer, GenomeIdType id, PositionType pos);

 public:

    // constructor resizing db (see below) to match the number of possible k-mers
    KixBuild();

    // add k-mers of all sequences in FASTA file
    int add_fasta(const std::string &fname, GenomeIdListType &ids, bool convert_spaces, bool trim_ids);
    int add_fasta(const std::string &fname, bool convert_spaces, bool trim_ids);

    // add all k-mers in a string sequence to the database
    GenomeIdType start_sequence(const std::string &s, std::string& tailingKmer, PositionType& sequencePosition);
    GenomeIdType continue_sequence(const std::string &s, std::string& tailingKmer, PositionType& sequencePosition);

    // trim the database: remove kmers with more than max_count occurrences
    uint64_t trim(uint64_t max_count);

    // serialize the KixBuild
    std::vector serialize();

    // serialize and store the KixBuild to a file
    uint64_t serialize_file(std::string f);

    GenomeIdType num_seq=0; // total number of sequences in the database
    KmerIndexType db; // the database structure itself
    StringListType seq_names; // names of the sequences in the database
    std::vector seq_lengths; // lengths of the sequences in the database

}; // END class KixBuild


//-------------------------------------------------------------------//
//------  The k-mer runtime index: KixRun ---------------------------//
//-------------------------------------------------------------------//

// Read-only index used at runtime; db points into the raw buffer sdata.
class KixRun {

 private:

    uint8_t kmer_weight; // k-mer weight read from file
    std::vector kmer_gaps; // k-mer gap pattern, handed to globalAlignmentSettings by store_kmer()

 public:

    // pointer to the matching positions for a k-mer
    char* kmer(HashIntoType kmer);

    // retrieve all fwd and rc occurrences of kmer in the index
    // (reverse complement hits are reported with negated position)
    GenomePosListType retrieve_positions(std::string kmerSpan);

    // deserialize Kix, also sets kmer_weight and globalAlignmentSettings.kmer_weight
    uint64_t deserialize(char* d);

    // load and deserialize Kix from file
    uint64_t deserialize_file(std::string f);

    // return k-mer weight of the k-mers in the index
    uint8_t get_kmer_weight();

    // read only the header of an index file (k-mer weight, sequence names and lengths);
    // returns the number of sequences, 0 if the file could not be opened
    uint64_t get_header_information(std::string f);

    // publish the k-mer weight and gap pattern of this index in the global settings
    void store_kmer() { globalAlignmentSettings.set_kmer(kmer_weight, kmer_gaps); };

    // Database content
    GenomeIdType num_seq; // total number of sequences in the database
    StringListType seq_names; // names of the sequences in the database
    std::vector seq_lengths; // lengths of the sequences in the database
    KixRunDB db; // the lightweight database structure itself, pointing to sdata
    std::vector sdata; // actual chunk of data

}; // END class KixRun

#endif /* KINDEX_H */
HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/lib/parallel.cpp000066400000000000000000000213341321155700500232570ustar00rootroot00000000000000
#include "parallel.h"

// Print a task as "Lane <l> Tile <t> Cycle <mate>.<cycle>";
// a mate number of 0 is printed as "b".
std::ostream& operator<<(std::ostream& os, const Task& t) {
    std::string mate = t.seqEl.mate == 0 ? "b" : std::to_string(t.seqEl.mate);
    os << "Lane " << t.lane << " Tile " << t.tile << " Cycle " << mate << "." << t.cycle;
    return os;
}

// Add element to the task list (holds the queue mutex while pushing)
void TaskQueue::push(Task t) {
    std::lock_guard lk(m);
    tasks.push(t);
}

// Get element from the task list. If TaskList is empty, return NO_TASK.
Task TaskQueue::pop() { std::lock_guard lk(m); if (!tasks.empty()) { Task t = tasks.front(); tasks.pop(); return t; } else { return NO_TASK; } } // return the size of the queue uint64_t TaskQueue::size() { std::lock_guard lk(m); return tasks.size(); } // create a vector with all lane numbers std::vector all_lanes() { std::vector ln; for (uint16_t l=0; l < 8; l++) ln.push_back(l+1); return ln; } // create a vector with one lane number std::vector one_lane(uint16_t l) { return std::vector (1,l); } // create a vector with all tile numbers std::vector all_tiles() { std::vector tl; for (uint16_t l = 0; l < 2; l++) { for (uint16_t s = 0; s < 3; s++) { for (uint16_t t = 0; t < 16; t++) { // construct tile number tl.push_back( (l+1)*1000 + (s+1)*100 + (t+1) ); } } } return tl; } // create a vector with one tile number std::vector one_tile(uint16_t t) { return std::vector (1,t); } // initialize agenda with read length only (all lanes, all tiles) Agenda::Agenda (uint16_t rl) { // add lanes 1-8 to the list std::vector ln = all_lanes(); // call the tiles constructor Agenda(rl, ln); } // initialize agenda with read length and lanes (all tiles) Agenda::Agenda (uint16_t rl, std::vector ln) { // add all tiles to the list std::vector tl = all_tiles(); // call the full constructor Agenda (rl, ln, tl); } // initialize agenda with read length, lanes, and tiles Agenda::Agenda (uint16_t rl, std::vector ln, std::vector tl) { Agenda(rl, ln, tl, 1); } // initialize agenda with read length, lanes, and tiles Agenda::Agenda (uint16_t rl, std::vector ln, std::vector tl, CountType start_cycle) { rlen = rl; lanes = ln; tiles = tl; // set up the agenda items.clear(); for (uint16_t ln_id = 0; ln_id < lanes.size(); ln_id++) { std::vector > lane_status; for (uint16_t tl_id = 0; tl_id < tiles.size(); tl_id++) { // Status for finished cycles if "--continue" was used. 
std::vector tile_status (start_cycle-1, FINISHED); // Waiting cycles std::vector waiting_status (rlen-(start_cycle-1), WAITING); // Merge vectors tile_status.insert(tile_status.end(), waiting_status.begin(), waiting_status.end()); // Push back lane_status.push_back(tile_status); } items.push_back(lane_status); } } // check for BCL files and update item status void Agenda::update_status () { // iterate over lanes for (uint16_t ln_id = 0; ln_id < items.size(); ++ln_id) { // iterate over all tiles for (uint16_t tl_id = 0; tl_id < items[ln_id].size(); ++tl_id) { // get the first cycle that is not in the FINISHED status uint16_t first_unfinished = 0; while ( (first_unfinished < items[ln_id][tl_id].size()) && (items[ln_id][tl_id][first_unfinished] == FINISHED)) { first_unfinished++; } // std::cout << ln_id << ";" << tl_id << ";" << first_unfinished << std::endl; // if there is one, check if there is a BCL file available if ((first_unfinished != items[ln_id][tl_id].size()) && (items[ln_id][tl_id][first_unfinished] == WAITING)) { std::string this_fname = bcl_name(lanes[ln_id], tiles[tl_id], first_unfinished+1); // only change the status if the file exists if ( file_exists(this_fname) ) { // TODO: probably find a way to check if the machine currently writes to that file items[ln_id][tl_id][first_unfinished] = BCL_AVAILABLE; } } } } } // generate a new task from the agenda Task Agenda::get_task(){ // iterate over lanes for (uint16_t ln_id = 0; ln_id < items.size(); ++ln_id) { // iterate over all tiles for (uint16_t tl_id = 0; tl_id < items[ln_id].size(); ++tl_id) { // check if there is a cycle with an unprocessed BCL file uint16_t unprocessed = 0; while ( (unprocessed < items[ln_id][tl_id].size()) && (items[ln_id][tl_id][unprocessed] != BCL_AVAILABLE)) { unprocessed++; } // generate a new task if there is an unprocessed BCL file if ( unprocessed != items[ln_id][tl_id].size() ) { uint16_t cycle = unprocessed + 1; uint16_t read_no = 0; while ( cycle > 
globalAlignmentSettings.getSeqById(read_no).length) { cycle -= globalAlignmentSettings.getSeqById(read_no).length; read_no += 1; } Task t (lanes[ln_id], tiles[tl_id], globalAlignmentSettings.getSeqById(read_no), cycle); return t; } } } // return indicator that no new task could be created return NO_TASK; } // set a status void Agenda::set_status(Task t, ItemStatus status) { // get the lane index uint64_t diff = std::find(lanes.begin(), lanes.end(), t.lane) - lanes.begin(); if ( diff >= lanes.size() ) { throw std::out_of_range("Lane ID out of range."); } uint16_t ln_id = diff; // get the tile index diff = std::find(tiles.begin(), tiles.end(), t.tile) - tiles.begin(); if ( diff >= tiles.size() ) { throw std::out_of_range("Tile ID out of range."); } uint16_t tl_id = diff; // get the cycle index if ( (t.cycle > rlen) || (t.cycle == 0) ) { throw std::out_of_range("Cycle out of range."); } uint16_t cl_id = getSeqCycle(t.cycle,t.seqEl.id) -1; items[ln_id][tl_id][cl_id] = status; } // get the status of a task ItemStatus Agenda::get_status(Task t) { // get the lane index uint64_t diff = std::find(lanes.begin(), lanes.end(), t.lane) - lanes.begin(); if ( diff >= lanes.size() ) { throw std::out_of_range("Lane ID out of range."); } uint16_t ln_id = diff; // get the tile index diff = std::find(tiles.begin(), tiles.end(), t.tile) - tiles.begin(); if ( diff >= tiles.size() ) { throw std::out_of_range("Tile ID out of range."); } uint16_t tl_id = diff; // get the cycle index if ( (t.cycle > rlen) || (t.cycle == 0) ) { throw std::out_of_range("Cycle out of range."); } uint16_t cl_id = t.cycle -1; return items[ln_id][tl_id][cl_id]; } // check if all items of the agenda were processed, if possible bool Agenda::finished() { // check for each tile if either all cycles are finished OR there is a failed status item for (uint16_t ln_id = 0; ln_id < items.size(); ++ln_id) { for (uint16_t tl_id = 0; tl_id < items[ln_id].size(); ++tl_id) { for (uint16_t cl_id = 0; cl_id < 
items[ln_id][tl_id].size(); ++cl_id) { ItemStatus s = items[ln_id][tl_id][cl_id]; if ( s == FAILED ) { // the rest of the tile is "allowed" to be unprocessed --> skip continue; } else if (s != FINISHED) { // otherwise any other status means that the agenda is not finished return false; } } } } return true; } // check if all items of the agenda were processed, if possible bool Agenda::finished( CountType cycle ) { // check for each tile if either all cycles are finished OR there is a failed status item for (uint16_t ln_id = 0; ln_id < items.size(); ++ln_id) { for (uint16_t tl_id = 0; tl_id < items[ln_id].size(); ++tl_id) { for (uint16_t cl_id = 0; cl_id < cycle; ++cl_id) { ItemStatus s = items[ln_id][tl_id][cl_id]; if ( s == FAILED ) { // the rest of the tile is "allowed" to be unprocessed --> skip continue; } else if (s != FINISHED) { // otherwise any other status means that the agenda is not finished return false; } } } } return true; } bool Agenda::cycle_available( CountType cycle ) { if ( cycle == 0 || cycle > rlen ) return false; for (uint16_t ln_id = 0; ln_id < items.size(); ++ln_id) { for (uint16_t tl_id = 0; tl_id < items[ln_id].size(); ++tl_id) { if ( items[ln_id][tl_id][cycle-1] == WAITING ) return false; } } return true; } // the total number of tasks on the agenda uint32_t Agenda::task_count() { return lanes.size() * tiles.size() * rlen; } // the total number of finished tasks on the agenda uint32_t Agenda::tasks_finished() { uint32_t num_finished = 0; // iterate over all items and count the finished tasks for (uint16_t ln_id = 0; ln_id < items.size(); ++ln_id) { for (uint16_t tl_id = 0; tl_id < items[ln_id].size(); ++tl_id) { for (uint16_t cl_id = 0; cl_id < items[ln_id][tl_id].size(); ++cl_id) { if (items[ln_id][tl_id][cl_id] == FINISHED) { num_finished++; } } } } return num_finished; } HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/lib/parallel.h000066400000000000000000000122521321155700500227230ustar00rootroot00000000000000#ifndef PARALLEL_H 
#define PARALLEL_H #include "headers.h" #include "definitions.h" #include "tools.h" #include "kindex.h" //------ Threading tools --------------------------------------------// /** * Task data structure. Contains all information for a thread to process a BCL file. * @author Martin Lindner */ struct Task { /** The lane of the task. */ uint16_t lane; /** The tile of the task. */ uint16_t tile; /** Struct containing the read properties (Barcode vs. sequence; length; mate). */ SequenceElement seqEl; /** Current cycle of the particular read (in general, this does NOT equal the sequencing cycle!). Must be <=seqEl.length. */ uint16_t cycle; /** * Constructor for a NULL task. * @author Tobias Loka */ Task() : lane(255), tile(0), seqEl(NULLSEQ), cycle(0) {}; /** * Constructor for a valid task. * @param ln The lane number. * @param tl The tile number. * @param seq The respective seqEl element for the current read containing information about length, type (barcode vs. sequence), mate number ... * @param cl The cycle of the current read (in general, this does NOT equal the sequencing cycle!). Must be <=seqEl.length. * @author Martin Lindner */ Task(uint16_t ln, uint16_t tl, SequenceElement seq, uint16_t cl): lane(ln), tile(tl), seqEl(seq), cycle(cl) {}; /** Constructor for a task without seqEl information. */ Task(uint16_t ln, uint16_t tl, uint16_t cl) : lane(ln), tile(tl), seqEl(NULLSEQ), cycle(cl) {}; /** * Overload of the << operator. Defines the cout form of a task. * @author Martin Lindner */ friend std::ostream& operator<<(std::ostream& os, const Task& t); }; /** * Overload of the == operator. * @return true, if all fields/variables of the compared tasks equal. * @author Martin Lindner */ inline bool operator==(const Task& l, const Task& r){ return (r.lane==l.lane)&&(r.tile==l.tile)&&(r.cycle==l.cycle)&&(r.seqEl==l.seqEl); } /** * Overload of the != operator. * @return true, if at least one field/variable of the compared tasks is different. 
* @author Martin Lindner */ inline bool operator!=(const Task& l, const Task& r){ return !(l==r); } inline bool operator<(const Task& l, const Task& r){ if ( l.cycle == r.cycle) { if ( l.lane == r.lane ) { if ( l.tile == r.tile ) { return l.seqEl.mate < r.seqEl.mate; } else { return l.tile < r.tile; } } else { return l.lane < r.lane; } } else { return l.cycle < r.cycle; } } /** * Definition of a NULL task. * @author Martin Lindner */ const Task NO_TASK (255,0,NULLSEQ,0); // Task queue data structure. Manages a list of task objects in a thread safe way. class TaskQueue { // the internal queue std::queue tasks; // mutex to ensure that only one process can access the queue at once std::mutex m; public: // Add element to the task list void push(Task t); // Get element from the task list Task pop(); // return the size of the queue uint64_t size(); }; // Agenda item status typedef uint8_t ItemStatus; const ItemStatus WAITING = 0; const ItemStatus BCL_AVAILABLE = 1; const ItemStatus RUNNING = 2; const ItemStatus FINISHED = 3; const ItemStatus RETRY = 4; const ItemStatus FAILED = 5; const ItemStatus ERROR = std::numeric_limits::max(); // Agenda: monitors the sequencing process and manages the alignment // - Monitors BCL files // - generates new tasks // - receive finished/fail signals class Agenda { // list of items on the agenda. 
items[lane][tile][cycle] std::vector< std::vector< std::vector > > items; // dataset information uint16_t rlen; std::vector lanes; std::vector tiles; public: // initialize agenda with read length only (all lanes, all tiles) Agenda (uint16_t rl); // initialize agenda with read length and lanes (all tiles) Agenda (uint16_t rl, std::vector ln); // initialize agenda with read length, lanes and tiles Agenda (uint16_t rl, std::vector ln, std::vector tl); Agenda (uint16_t rl, std::vector ln, std::vector tl, CountType start_cycle); // check for BCL files and update item status void update_status(); // generate a new task from the agenda Task get_task(); // set the status of a task void set_status(Task t, ItemStatus status); // get the status of a task ItemStatus get_status(Task t); // check if all items of the agenda were processed, if possible bool finished(); bool finished( CountType cycle ); bool cycle_available( CountType cycle ); // the total number of tasks on the agenda uint32_t task_count(); // the total number of finished tasks on the agenda uint32_t tasks_finished(); // generate a complete TaskQueue with tasks to generate SAM files // SAM files can only be generated for tiles where all cycles are completed std::vector get_SAM_tasks(); }; // create a vector with all lane numbers std::vector all_lanes(); // create a vector with one lane number std::vector one_lane(uint16_t l); // create a vector with all tile numbers std::vector all_tiles(); // create a vector with one tile number std::vector one_tile(uint16_t t); #endif /* PARALLEL_H */ HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/lib/tools.cpp000066400000000000000000000170751321155700500226320ustar00rootroot00000000000000#include "tools.h" /////////////////////////////////// ////////// K-mer Hashing ////////// /////////////////////////////////// HashIntoType hash(const char * kmer, HashIntoType& _h, HashIntoType& _r) { assert(strlen(kmer) >= globalAlignmentSettings.get_kmer_span()); HashIntoType h = 0, r 
= 0; h |= twobit_repr(kmer[0]); r |= twobit_comp(kmer[globalAlignmentSettings.get_kmer_span()-1]); for (unsigned int i = 1, j = globalAlignmentSettings.get_kmer_span()-2; i < globalAlignmentSettings.get_kmer_span(); i++, j--) { // if i not gap position auto gaps_vec = globalAlignmentSettings.get_kmer_gaps(); if (std::find(gaps_vec.begin(), gaps_vec.end(), i+1) == gaps_vec.end()) { h = h << 2; h |= twobit_repr(kmer[i]); r = r << 2; r |= twobit_comp(kmer[j]); } } _h = h; _r = r; return (h)<(r)?h:r; } std::string::const_iterator hash_fw(std::string::const_iterator it, std::string::const_iterator end, HashIntoType& _h) { if (!(it+globalAlignmentSettings.get_kmer_span()-1 < end)) { std::cerr << "Error: hash_fw was called using an begin position which had not at least kmer_span bases behind it." << std::endl; } HashIntoType h = 0; std::string::const_iterator last_invalid = it-1; h |= twobit_repr(*it); std::string::const_iterator kmerEnd = it+globalAlignmentSettings.get_kmer_span(); ++it; int positionInKmer = 2; auto kmer_gaps = globalAlignmentSettings.get_kmer_gaps(); for (; it != kmerEnd; ++it, ++positionInKmer) { if (std::find(kmer_gaps.begin(), kmer_gaps.end(), positionInKmer) != kmer_gaps.end()) continue; h = h << 2; h |= twobit_repr(*it); if ( seq_chars.find(*it) == std::string::npos ) { last_invalid = it+globalAlignmentSettings.get_kmer_span()-1; } } _h = h; return last_invalid; } std::string unhash(HashIntoType myHash, unsigned hashLen) { std::string kmer = ""; unsigned mask = 3; for (unsigned i = 1; i> 2; } std::reverse(kmer.begin(), kmer.end()); return kmer; } //////////////////////////////////////////// ////////// File name construction ////////// //////////////////////////////////////////// std::string bcl_name(uint16_t ln, uint16_t tl, uint16_t cl) { std::ostringstream path_stream; path_stream << globalAlignmentSettings.get_root() << "/L00" << ln << "/C" << cl << ".1/s_"<< ln <<"_" << tl << ".bcl"; return path_stream.str(); } std::string 
alignment_name(uint16_t ln, uint16_t tl, uint16_t cl, uint16_t mt){ std::ostringstream path_stream; std::string base = globalAlignmentSettings.get_temp_dir() != "" ? globalAlignmentSettings.get_temp_dir() : globalAlignmentSettings.get_root(); path_stream << base << "/L00" << ln << "/s_"<< ln << "_" << tl << "." << mt << "."<< cl << ".align"; return path_stream.str(); } uint16_t getSeqCycle(uint16_t cycle, uint16_t seq_id) { uint16_t seq_cycle = cycle; for ( int i = 0; i < seq_id; i++ ) seq_cycle += globalAlignmentSettings.getSeqById(i).length; return seq_cycle; } uint16_t getMateCycle( uint16_t mate_number, uint16_t seq_cycle ) { // Invalid mate if ( mate_number == 0 || mate_number > globalAlignmentSettings.get_mates() ) return 0; // Iterate through all sequence elements (including barcodes) for ( CountType id = 0; id < globalAlignmentSettings.get_seqs().size(); id++ ) { // Current sequence element SequenceElement seq = globalAlignmentSettings.getSeqById(id); // Seq is mate of interest if ( seq.mate == mate_number ) return ( seq.length > seq_cycle ? seq_cycle : seq.length ); // Not enough cycles left to reach mate of interest else if ( seq.length >= seq_cycle ) return 0; // Reduce number of cycles by the Seq length else seq_cycle -= seq.length; } // Should not be reached return 0; } std::string filter_name(uint16_t ln, uint16_t tl) { std::ostringstream path_stream; path_stream << globalAlignmentSettings.get_root() << "/L00" << ln << "/s_"<< ln << "_" << tl << ".filter"; return path_stream.str(); } std::string position_name(uint16_t ln, uint16_t tl) { std::ostringstream path_stream; path_stream << globalAlignmentSettings.get_root() << "../L00" << ln << "/s_"<< ln << "_" << tl << ".clocs"; return path_stream.str(); } std::string get_settings_name() { std::ostringstream path_stream; std::string base = globalAlignmentSettings.get_temp_dir() != "" ? 
globalAlignmentSettings.get_temp_dir() : globalAlignmentSettings.get_root(); path_stream << base << "/hilive_settings.xml"; return path_stream.str(); } std::string get_out_log_name() { return ( globalAlignmentSettings.get_out_dir() + "/hilive_out.log" ); } //////////////////////////////////// ////////// SAM/BAM output ////////// //////////////////////////////////// seqan::BamHeader getBamHeader() { std::stringstream ss; ss.str(std::string()); ss << HiLive_VERSION_MAJOR << "." << HiLive_VERSION_MINOR; seqan::BamHeader header; resize(header, 2); // @HD header. seqan::resize(header[0].tags, 2); header[0].type = seqan::BAM_HEADER_FIRST; header[0].tags[0].i1 = "VN"; header[0].tags[0].i2 = "1.5"; header[0].tags[1].i1 = "GO"; header[0].tags[1].i2 = "query"; // @PG header. seqan::resize(header[1].tags, 3); header[1].type = seqan::BAM_HEADER_PROGRAM; header[1].tags[0].i1 = "ID"; header[1].tags[0].i2 = "hilive"; header[1].tags[1].i1 = "PN"; header[1].tags[1].i2 = "HiLive"; header[1].tags[2].i1 = "VN"; header[1].tags[2].i2 = ss.str(); return header; } std::string getTileBamTempFileName(CountType ln, CountType tl, std::string barcode, CountType cycle) { std::ostringstream fname; std::string file_suffix = globalAlignmentSettings.get_write_bam() ? ".bam" : ".sam"; fname << globalAlignmentSettings.get_temp_dir() << "/L00" << ln << "/s_" << std::to_string(ln) << "_" << std::to_string(tl) << "." << std::to_string(cycle) << "." << barcode << ".temp" << file_suffix; return fname.str(); } std::string getTileBamFileName(CountType ln, CountType tl, std::string barcode, CountType cycle) { std::ostringstream fname; std::string file_suffix = globalAlignmentSettings.get_write_bam() ? ".bam" : ".sam"; fname << globalAlignmentSettings.get_temp_dir() << "/L00" << ln << "/s_" << std::to_string(ln) << "_" << std::to_string(tl) << "." << std::to_string(cycle) << "." 
<< barcode << file_suffix; return fname.str(); } std::string getBamTempFileName(std::string barcode, CountType cycle) { std::ostringstream fname; std::string file_suffix = globalAlignmentSettings.get_write_bam() ? ".bam" : ".sam"; fname << globalAlignmentSettings.get_out_dir() << "/hilive_out_" << "cycle" << std::to_string(cycle) << "_" << barcode << ".temp" << file_suffix; return fname.str(); } std::string getBamFileName(std::string barcode, CountType cycle) { std::ostringstream fname; std::string file_suffix = globalAlignmentSettings.get_write_bam() ? ".bam" : ".sam"; fname << globalAlignmentSettings.get_out_dir() << "/hilive_out_" << "cycle" << std::to_string(cycle) << "_" << barcode << file_suffix; return fname.str(); } int atomic_rename( const char *oldname, const char *newname ) { if ( !file_exists(oldname) ) throw file_not_exist_error("Can't rename file: " + std::string(oldname) + ". File does not exist."); std::lock_guard old_lock(fileLocks.at(std::string(oldname))); std::lock_guard new_lock(fileLocks.at(std::string(newname))); return std::rename(oldname, newname); } ///////////////////////////////// ////////// Other stuff ////////// ///////////////////////////////// char to_phred_quality ( uint8_t bc_qual ) { char phred_score = '!'; phred_score += bc_qual; return phred_score; } HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/lib/tools.h000066400000000000000000000122631321155700500222710ustar00rootroot00000000000000/** * This class provides functions that are dependent of the alignmentSettings! * For functions that are not dependent of any other HiLive class, please use the tools_static class! * Please do NOT add further includes to this file since this will lead to unwanted dependencies! 
*/

#ifndef TOOLS_H
#define TOOLS_H

/* DONT ADD ANY INCLUDES */
#include "tools_static.h"
#include "alignmentSettings.h"
#include "global_variables.h"
/* DONT ADD ANY INCLUDES */


///////////////////////////////////
////////// K-mer Hashing //////////
///////////////////////////////////

/**
 * Calculate the first forward and reverse complement k-mer in the string kmer.
 * @param kmer Input sequence.
 * @param _h Reference to forward hash variable.
 * @param _r Reference to reverse hash variable.
 * @return The smaller of the two hash values.
 */
HashIntoType hash(const char * kmer, HashIntoType& _h, HashIntoType& _r);

/**
 * Calculates the first forward k-mer in the string starting at it.
 * @param it Iterator of the input sequence.
 * @param end End of the iterator of the input sequence.
 * @param _h Reference to the forward hash variable.
 * @return it-1 if no invalid base was found; otherwise the iterator kmer_span-1 positions behind the last invalid base.
 */
std::string::const_iterator hash_fw(std::string::const_iterator it, std::string::const_iterator end, HashIntoType& _h);

/**
 * Calculate the sequence from a hash value.
 * @param myHash The input hash value.
 * @param hashLen Length (weight) of the hashed sequence. Defaults to the k-mer weight of the global settings.
 * @return The unhashed sequence.
 */
std::string unhash(HashIntoType myHash, unsigned hashLen=globalAlignmentSettings.get_kmer_weight());


////////////////////////////////////////////
////////// File name construction //////////
////////////////////////////////////////////

/**
 * Get the name of a bcl file.
 * @param ln The lane number.
 * @param tl The tile number.
 * @param cl The sequencing cycle.
 * @return Path to the bcl file.
 */
std::string bcl_name(uint16_t ln, uint16_t tl, uint16_t cl);

/**
 * Get the name of an alignment file.
 * @param ln The lane number.
 * @param tl The tile number.
 * @param cl The cycle for the respective mate.
 * @param mt The mate number.
 * @return Path to the alignment file.
 */
std::string alignment_name(uint16_t ln, uint16_t tl, uint16_t cl, uint16_t mt);

/**
 * Get the name of a filter file.
 * @param ln The lane number.
* @param tl The tile number.
 * @return Path to the filter file.
 */
std::string filter_name(uint16_t ln, uint16_t tl);

/**
 * Get the name of a clocs file.
 * @param ln The lane number.
 * @param tl The tile number.
 * @return Path to the clocs file.
 */
std::string position_name(uint16_t ln, uint16_t tl);

/**
 * Get the name of the settings file.
 * @return Path to the settings file.
 */
std::string get_settings_name();

/**
 * Get the name of the output log file.
 * @return Path to the output log file.
 */
std::string get_out_log_name();

/** Get the current sequencing cycle using the current alignment cycle and read number.
 * The lengths of all sequence elements with an id smaller than seq_id are added to the read cycle.
 * @param cycle The read cycle.
 * @param seq_id The sequence id (:= id of the respective element in globalAlignmentSettings::seqs)
 * @return The sequencing cycle.
 * @author Tobias Loka
 */
uint16_t getSeqCycle(uint16_t cycle, uint16_t seq_id=1);

/**
 * Get the cycle of a mate for a given sequencing cycle.
 * When the mate is completely finished in the given cycle, return its total sequence length.
 * @param mate_number Mate of interest.
 * @param seq_cycle The sequencing cycle.
 * @return Cycle of the mate in the given sequencing cycle; 0 if the mate number is invalid or the mate is not reached yet.
 * @author Tobias Loka
 */
uint16_t getMateCycle( uint16_t mate_number, uint16_t seq_cycle );


////////////////////////////////////
////////// SAM/BAM output //////////
////////////////////////////////////

/**
 * Name of a temporary per-tile SAM/BAM file (located in the temp directory).
 * @param ln The lane number.
 * @param tl The tile number.
 * @param barcode Barcode of the output file.
 * @param cycle The output cycle.
 * @return Name of the temporary per-tile output file.
 */
std::string getTileBamTempFileName(CountType ln, CountType tl, std::string barcode, CountType cycle);

/**
 * Name of a per-tile SAM/BAM file (located in the temp directory).
 * @param ln The lane number.
 * @param tl The tile number.
 * @param barcode Barcode of the output file.
 * @param cycle The output cycle.
 * @return Name of the per-tile output file.
 */
std::string getTileBamFileName(CountType ln, CountType tl, std::string barcode, CountType cycle);

/**
 * Get the header for a SAM/BAM output file.
 * @return The BAM header.
 * @author Tobias Loka
 */
seqan::BamHeader getBamHeader();

/**
 * Name of a temporary SAM/BAM file (for the time it is written).
 * @param barcode Barcode of the output file (or "undetermined" for undetermined reads)
 * @param cycle The output cycle.
 * @return Name of the temporary output file for writing.
* @author Tobias Loka */ std::string getBamTempFileName(std::string barcode, CountType cycle); /** * Final name of a SAM/BAM file. * @param barcode Barcode of the output file (or "undetermined" for undetermined reads) * @param cycle The output cycle. * @return Name of the final output file. * @author Tobias Loka */ std::string getBamFileName(std::string barcode, CountType cycle); /** * Copy a file while locking them in the global fileLocks. */ int atomic_rename( const char *oldname, const char *newname ); ///////////////////////////////// ////////// Other stuff ////////// ///////////////////////////////// /** * Convert a base call quality value to the respective char in PHRED syntax. * This function considers the settings of full quality or 2-bit quality in the globalAlignmentSettings. * @param bc_qual The base call quality as stored in HiLive. * @return PHRED char ( "!" - "I" ) */ char to_phred_quality ( uint8_t bc_qual ); #endif /* TOOLS_H */ HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/lib/tools_static.cpp000066400000000000000000000110611321155700500241660ustar00rootroot00000000000000#include "tools_static.h" ///////////////////////////////// ////////// Comparators ////////// ///////////////////////////////// bool gp_compare (GenomePosType i,GenomePosType j) { if ( i.pos == j.pos ) return i.gid < j.gid; return (i.pos < j.pos); } ///////////////////////////////////// ////////// Type convertion ////////// ///////////////////////////////////// void split(const std::string &s, char delim, std::vector &elems) { std::stringstream ss; ss.str(s); std::string item; while (std::getline(ss, item, delim)) { elems.push_back(item); } } /////////////////////////////////// ////////// File handling ////////// /////////////////////////////////// std::ifstream::pos_type get_filesize(const std::string &fname) { std::ifstream in(fname, std::ios::binary | std::ios::ate); return in.tellg(); } bool is_directory(const std::string &path) { if ( boost::filesystem::exists(path) ) { 
if ( boost::filesystem::is_directory(path) ) { return true; } else { return false; } } else { return false; } } bool file_exists(const std::string &fname) { return boost::filesystem::exists(fname); } std::string absolute_path(std::string fname) { boost::filesystem::path input_path(fname); return boost::filesystem::canonical(fname).string(); } std::vector read_binary_file(const std::string &fname) { // get file size uint64_t size = get_filesize(fname); // open binary file FILE* f; f = fopen(fname.c_str(), "rb"); if (!f) { std::cerr << "Error reading binary file " << fname << ": Could not open file." << std::endl; return std::vector(); } // allocate memory std::vector data (size); // read all data at once uint64_t read = fread(data.data(), 1, size, f); if (read != size){ std::cerr << "Error reading binary file " << fname << ": File size: " << size << " bytes. Read: " << read << " bytes." << std::endl; return std::vector(); } fclose(f); return data; } uint64_t write_binary_file(const std::string &fname, const std::vector & data) { // open binary file FILE* ofile; ofile = fopen(fname.c_str(), "wb"); if (!ofile) { std::cerr << "Error serializing object to file " << fname << ": Could not open file for writing." << std::endl; return 1; } // write all data uint64_t written = fwrite(data.data(), 1, data.size(), ofile); // close file fclose(ofile); if (written != data.size()){ std::cerr << "Error serializing object to file " << fname << ": Total size: " << data.size() << " bytes. Written: " << written << " bytes." 
<< std::endl; } return written; } //////////////////////////////////////////////// ////////// Property trees / XML files ////////// //////////////////////////////////////////////// bool read_xml(boost::property_tree::ptree & xml_in, std::string xml_fname) { if ( !file_exists(xml_fname) ) { std::cout << "XML file not found: " << xml_fname << std::endl; return false; } try { boost::property_tree::read_xml (xml_fname, xml_in); } catch ( const std::exception &ex) { std::cerr << "Error loading xml file " << xml_fname << ": " << std::endl << ex.what() << std::endl; return false; } return true; } bool write_xml(boost::property_tree::ptree & xml_out, std::string xml_fname) { try { boost::property_tree::write_xml( xml_fname, xml_out ); } catch ( const std::exception &ex ) { std::cerr << "Error writing xml file " << xml_fname << ": " << std::endl << ex.what() << std::endl; return false; } return true; } ///////////////////////////////// ////////// Other stuff ////////// ///////////////////////////////// uint32_t num_reads_from_bcl(std::string bcl) { // open BCL file of first cycle FILE* ifile; ifile = fopen(bcl.c_str(), "rb"); if (!ifile) { std::cerr << "Error reading BCL file " << bcl << ": Could not open file." 
<< std::endl; return 0; } // extract the number of reads uint32_t num_reads; bool res = fread(&num_reads, 1, sizeof(uint32_t), ifile); if (!res) { std::cerr << "Error extracting number of reads from BCL file " << bcl << std::endl; return 0; } // close file fclose (ifile); return num_reads; } std::vector flowcell_layout_to_tile_numbers( CountType surfaceCount, CountType swathCount, CountType tileCount ) { std::vector tiles_vec; for (uint16_t surf = 1; surf <= surfaceCount; surf++) for (uint16_t swath = 1; swath <= swathCount; swath++) for (uint16_t tile = 1; tile <= tileCount; tile++) tiles_vec.push_back(surf*1000 + swath*100 + tile); return tiles_vec; } HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/lib/tools_static.h000066400000000000000000000120031321155700500236300ustar00rootroot00000000000000/** * This class provides functions that are independent of any other HiLive class. * Please do NOT add further includes to this file since this will lead to unwanted dependencies! */ #ifndef TOOLS_STATIC_H #define TOOLS_STATIC_H /* DONT ADD ANY INCLUDES */ #include "headers.h" #include "definitions.h" /* DONT ADD ANY INCLUDES */ ///////////////////////////////// ////////// Comparators ////////// ///////////////////////////////// /** * Compare function to sort GenomePosType objects by position. * If position if equal, compare by gid. * @param i First position to compare * @param j Second position to compare * @return true, if first position is "smaller" than second position. */ bool gp_compare (GenomePosType i,GenomePosType j); ///////////////////////////////////// ////////// Type convertion ////////// ///////////////////////////////////// /** * Split a std::string to a std::vector. * @param s Reference to the input string. * @param delim A split delimiter. * @param elems The target vector. 
* @author Tobias Loka */ void split(const std::string &s, char delim, std::vector &elems); /////////////////////////////////// ////////// File handling ////////// /////////////////////////////////// /** * Get total size of a file (in bytes) * @param fname Name of the file. * @return Size of the file. */ std::ifstream::pos_type get_filesize(const std::string &fname); /** * Check if a given path is a directory. * @param Path of interest. * @return true, if the given path is a directory. */ bool is_directory(const std::string &path); /** * Check if a given path is a file. * @param Path of interest. * @return true, if the given path is a file. */ bool file_exists(const std::string &fname); /** * Convert a relative to an absolute path. * @param fname Input path. * @return Absolute path to fname. * @author Tobias Loka * TODO: Not tested and used yet. */ std::string absolute_path(std::string fname); /** * Read a binary file and stores its content in a char vector. * @param fname Path to the file. * @return All data from the file as char vector. */ std::vector read_binary_file(const std::string &fname); /** * Write data from a char vector into a binary file. * @param fname Path to the file. * @param data Data to be saved in the file. * @return Number of written bytes. */ uint64_t write_binary_file(const std::string &fname, const std::vector & data); //////////////////////////////////////////////// ////////// Property trees / XML files ////////// //////////////////////////////////////////////// /** * Read a file in XML format. Results are stored as property tree. * @param xml_in Reference to the property tree to store the XML data. * @param xml_fname Name of the input file. * @return true on success * @author Tobias Loka */ bool read_xml(boost::property_tree::ptree & xml_in, std::string xml_fname); /** * Write a property tree to an XML file. * @param xml_out Property tree that contains the data. * @param xml_fname Name of the output file. 
* @return true on success * @author Tobias Loka */ bool write_xml(boost::property_tree::ptree & xml_out, std::string xml_fname); /** * Convert a variable of a non-vector type to a property tree. * @param variable The variable to convert. * @return The property tree for the input variable * @author Tobias Loka * TODO: check if the exception handling makes sense. */ /** Convert a variable to an XML node. T must be a data type that can be cast to a string-like output format. */ template boost::property_tree::ptree getXMLnode (T variable) { boost::property_tree::ptree node; try { node.put("", variable); } catch ( const std::exception &ex ) { std::cerr << "Failed to convert variable to XML output format." << std::endl; } return node; } /** * Convert a variable of a vector type to a property tree. * The subnodes have key "el". * @param vector The vector to convert. * @return The property tree for the input variable * @author Tobias Loka */ /** Convert a vector to an XML node. T must be a data type that can be cast to a string-like output format. */ template boost::property_tree::ptree getXMLnode_vector (std::vector vector) { boost::property_tree::ptree node; for ( auto el = vector.begin(); el != vector.end(); ++el ) { node.add_child("el", getXMLnode ( *el )); } return node; } ///////////////////////////////// ////////// Other stuff ////////// ///////////////////////////////// /** * Extract the number of reads from a BCL file. * @param bcl Path to the bcl file. * @return Number of reads in the bcl file. */ uint32_t num_reads_from_bcl(std::string bcl); /** * Convert the flowcell layout data to a plain vector of tile numbers. * @param surfaceCount The surface count. * @param swatchCount The swath count. * @param tileCount The tile count. * @return A vector of tile numbers. 
*/ std::vector flowcell_layout_to_tile_numbers( CountType surfaceCount, CountType swathCount, CountType tileCount ); #endif /* TOOLS_STATIC_H */ HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/tools/000077500000000000000000000000001321155700500213465ustar00rootroot00000000000000HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/tools/build_index.cpp000066400000000000000000000035421321155700500243440ustar00rootroot00000000000000#include #include "../lib/headers.h" #include "../lib/definitions.h" #include "../lib/kindex.h" #include "../lib/argument_parser.h" AlignmentSettings globalAlignmentSettings; mutex_map fileLocks; /** * Main function that organizes the overall structure of the program. * @param argc Number of arguments * @param argv Argument array * @return 0 on success, other numbers on error */ int main(int argc, const char* argv[]) { // Program start output. std::cout << std::endl << "------" << std::endl << "HiLive Index Builder v" << HiLive_VERSION_MAJOR << "." << HiLive_VERSION_MINOR << " - Build Index for Realtime Alignment of Illumina Reads" << std::endl << "------" << std::endl << std::endl; // Init argument parser BuildIndexArgumentParser argumentParser(argc, argv); // Parse command line arguments int parser_returnStatus = argumentParser.parseCommandLineArguments(); // Successful execution of "help" or "license" if ( parser_returnStatus == 1 ) { exit(EXIT_SUCCESS); } // Parsing error else if ( parser_returnStatus == -1 ) { std::cout << "Parsing of command line options failed. For help, type 'hilive-build --help'." << std::endl; exit(EXIT_FAILURE); } // Build index std::cout << "Create index from file " << argumentParser.fasta_name << " ..." 
<< std::endl; KixBuild* index = new KixBuild(); index->add_fasta(argumentParser.fasta_name, !argumentParser.do_not_convert_spaces, argumentParser.trim_ids); // Trim index if ( argumentParser.trim > 0) { uint64_t trimmed = index->trim(argumentParser.trim); std::cout << "Removed " << trimmed << " k-mer positions from the database." << std::endl; } // Write index to file std::cout << "Writing index to file " << argumentParser.index_name << std::endl; index->serialize_file(argumentParser.index_name); delete index; return EXIT_SUCCESS; } HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/tools/hilive.cpp000066400000000000000000000265671321155700500233520ustar00rootroot00000000000000#include "../lib/headers.h" #include "../lib/definitions.h" #include "../lib/global_variables.h" #include "../lib/kindex.h" #include "../lib/alnstream.h" #include "../lib/parallel.h" #include "../lib/argument_parser.h" #include "../lib/tools_static.h" AlignmentSettings globalAlignmentSettings; mutex_map fileLocks; /** * If a thread is used for output, call this function to start the next available output task. * @param alnout The deque of output controllers for each output cycle. * @return The written task. NO_TASK if no task was written (e.g., when no task is available). */ Task writeNextTaskToBam ( std::deque & alnouts ) { // Search for the next task to write for ( auto& alnout : alnouts ) { // Only loop through non-finished output deques if ( !alnout.is_finished() ) { // Try to write the next task from the deque Task return_status = alnout.write_next(); // Proceed with next deque if no task was written if ( return_status == NO_TASK ) { continue; } // Return the written task else { // Finalize the output for this cycle if this was the last task. if ( alnout.is_finished() ) { alnout.finalize(); } return return_status; } } } return NO_TASK; } /** * Worker function for the alignment threads. 
* @param tasks Reference to the "to do" task queue * @param finished Reference to the "finished" task queue * @param failed Reference to the "failed" task queue * @param idx Pointer to the index object * @param surrender Control flag (threads stop if true) */ void worker (TaskQueue & tasks, TaskQueue & finished, TaskQueue & failed, KixRun* idx, std::deque & alnouts, std::atomic & writing_threads, bool & surrender ) { // Continue until surrender flag is set while ( !surrender ) { { // scope for block guard atomic_block_guard block( writing_threads ); // Start an output task if output threads and tasks available. if ( block.get_blocked_value() <= globalAlignmentSettings.get_num_out_threads() ) { Task written_task = writeNextTaskToBam( alnouts ); if ( written_task != NO_TASK ) { continue; } } } // Try to obtain a new task Task t = tasks.pop(); // If "to do" task was found if ( t != NO_TASK ) { // Execute the task bool success = true; std::stringstream ss; try { StreamedAlignment s (t.lane, t.tile, t.seqEl.length); uint64_t num_seeds; // Seed extension if current read is sequence fragment. if ( !t.seqEl.isBarcode() ) { num_seeds = s.extend_alignment(t.cycle,t.seqEl.id,t.seqEl.mate,idx); ss << "Task [" << t << "]: Found " << num_seeds << " seeds." << std::endl; } // Barcode extension if current read is barcode fragment else { CountType mate = 1; for ( ; mate <= globalAlignmentSettings.get_mates(); mate++ ) { SequenceElement seqEl = globalAlignmentSettings.getSeqByMate(mate); CountType current_mate_cycle = t.seqEl.id < seqEl.id ? 0 : seqEl.length; s.extend_barcode(t.cycle, current_mate_cycle, t.seqEl.id, mate); } ss << "Task [" << t << "]: Extended barcode of " << --mate << " mates." 
<< std::endl; } std::cout << ss.str(); } catch (const std::exception &e) { ss << "Failed to finish task [" << t << "]: " << e.what() << std::endl; std::cerr << ss.str(); success = false; } // Push the task in the correct Task Queue (Finished or Failed) if (success) { // Make previous cycle available for output. If current cycle is the last one, make current cycle available. CountType seqCycle = getSeqCycle(t.cycle, t.seqEl.id); CountType output_cycle = seqCycle - 1; bool is_last_cycle = seqCycle == globalAlignmentSettings.get_cycles(); if ( globalAlignmentSettings.is_output_cycle( output_cycle ) || is_last_cycle ) { for ( auto& alnout : alnouts ) { if ( output_cycle == alnout.get_cycle() ) { alnout.set_task_available( Task(t.lane, t.tile, output_cycle)); } if ( is_last_cycle && alnout.get_cycle() == globalAlignmentSettings.get_cycles() ) { alnout.set_task_available( Task(t.lane, t.tile, globalAlignmentSettings.get_cycles())); } } } finished.push(t); } else { failed.push(t); } } // Thread is idle --> Also use it for output if the maximum number of output threads is exceeded. else { atomic_block_guard block( writing_threads ); writeNextTaskToBam( alnouts ); } // send this thread to sleep for a second std::this_thread::sleep_for (std::chrono::milliseconds(100)); } } /** * Main function that organizes the overall structure of the program. * @param argc Number of arguments * @param argv Argument array * @return 0 on success, other numbers on error */ int main(int argc, const char* argv[]) { // Variable for runtime measurement time_t t_start = time(NULL); // Program start output std::cout << std::endl << "------" << std::endl << "HiLive v"<< HiLive_VERSION_MAJOR << "." 
<< HiLive_VERSION_MINOR << " - Realtime Alignment of Illumina Reads" << std::endl << "------" << std::endl<< std::endl; // Parse command line arguments HiLiveArgumentParser argumentParser(argc, argv); int parser_returnStatus = argumentParser.parseCommandLineArguments(); // Successful execution of "help" or "license" if ( parser_returnStatus == 1 ) { exit(EXIT_SUCCESS); } // Parsing error else if ( parser_returnStatus == -1 ) { std::cout << "Parsing of command line options failed. For help, type 'hilive --help'." << std::endl; exit(EXIT_FAILURE); } // Load the index std::cout << "Loading Index ... " << std::endl; KixRun* index = new KixRun(); index->deserialize_file(globalAlignmentSettings.get_index_fname()); // Report loaded k-mer properties std::cout << std::endl; std::cout << "kmer span: " << std::to_string(globalAlignmentSettings.get_kmer_span()) << std::endl; std::cout << "kmer weight: " << std::to_string(globalAlignmentSettings.get_kmer_weight()) << std::endl; std::cout << "kmer gaps: "; for ( auto gap : globalAlignmentSettings.get_kmer_gaps() ) { std::cout << gap << " "; } std::cout << std::endl << std::endl; // Write the alignment settings to an XML file boost::property_tree::ptree xml_out = globalAlignmentSettings.to_ptree(); if ( ! write_xml(xml_out, get_settings_name()) ) exit(EXIT_FAILURE); // Create the overall agenda Agenda agenda (globalAlignmentSettings.get_cycles(), globalAlignmentSettings.get_lanes(), globalAlignmentSettings.get_tiles(), globalAlignmentSettings.get_start_cycle()); // Wait for the first cycle to be written std::cout << "Waiting for the first cycle to finish..." << std::endl; while ( ! agenda.cycle_available(globalAlignmentSettings.get_start_cycle()) ) { agenda.update_status(); std::this_thread::sleep_for(std::chrono::milliseconds(1000)); } // Write empty alignment file for each tile and for each sequence read std::cout << "Initializing Alignment files..." 
<< std::endl; for (uint16_t ln : globalAlignmentSettings.get_lanes()) { for (uint16_t tl : globalAlignmentSettings.get_tiles()) { CountType mate = 1; for ( ; mate <= globalAlignmentSettings.get_mates(); mate++ ) { // Don't init files if "--continue" was used to start in a later cycle. if ( getMateCycle(mate, globalAlignmentSettings.get_start_cycle()) > 1 ) continue; StreamedAlignment s (ln, tl, globalAlignmentSettings.getSeqByMate(mate).length); s.create_directories(); s.init_alignment(mate); } } } std::cout << "First cycle complete. Starting alignment." << std::endl; // Set up the queues TaskQueue toDoQ; TaskQueue finishedQ; TaskQueue failedQ; // Init output controller for each output cycle. TODO: check if it is possible to replace the deque by a map (cycle, alnout). std::deque alnouts; for ( CountType cycle : globalAlignmentSettings.get_output_cycles() ) { if ( cycle >= globalAlignmentSettings.get_start_cycle() ) alnouts.emplace_back(globalAlignmentSettings.get_lanes(), globalAlignmentSettings.get_tiles(), cycle, index); } // Number of threads currently used for writing output. std::atomic writing_threads(0); // Flag to stop the threads. bool surrender = false; // Create the threads std::cout << "Creating " << globalAlignmentSettings.get_num_threads() << " threads." 
<< std::endl; std::vector workers; for (int i = 0; i < globalAlignmentSettings.get_num_threads(); i++) { workers.push_back(std::thread(worker, std::ref(toDoQ), std::ref(finishedQ), std::ref(failedQ), index, std::ref(alnouts), std::ref(writing_threads), std::ref(surrender))); } // Process all tasks on the agenda while ( !agenda.finished() ) { // check for new BCL files and update the agenda status agenda.update_status(); // fill the To Do queue with tasks from the agenda while(true) { Task t = agenda.get_task(); if (t == NO_TASK) break; toDoQ.push(t); agenda.set_status(t,RUNNING); } // take a look in the finished queue and process finished tasks while(true) { Task t = finishedQ.pop(); if (t == NO_TASK) break; agenda.set_status(t,FINISHED); } // take a look in the failed queue and process failed tasks while(true) { Task t = failedQ.pop(); if (t == NO_TASK) break; if (agenda.get_status(t) == RUNNING) { // give it one more chance agenda.set_status(t,RETRY); toDoQ.push(t); } else { agenda.set_status(t,FAILED); std::cout << "Task failed! " << t << std::endl; } } // take a small break std::this_thread::sleep_for (std::chrono::milliseconds(100)); } std::cout << "Finished all alignments." << std::endl; std::cout << "Waiting for output tasks..." << std::endl; for ( auto& alnout : alnouts ) { while ( !alnout.is_finalized() ) { ; // wait } } // Clear the vector will destruct all elements. alnouts.clear(); std::cout << "Finished output tasks." << std::endl; // Halt the threads surrender = true; for (auto& w : workers) { w.join(); } std::cout << "All threads joined." 
<< std::endl; // std::cout << "Total mapping time: " << time(NULL) - t_start << " s" << std::endl << std::endl; delete index; std::cout << "Total run time: " << time(NULL) - t_start << " s" << std::endl; exit(EXIT_SUCCESS); } HiLive-v1.1-133292b5092f376eabbec2e3021049b7dc920d99/tools/hilive_out.cpp000066400000000000000000000065771321155700500242400ustar00rootroot00000000000000#include #include "../lib/headers.h" #include "../lib/definitions.h" #include "../lib/kindex.h" #include "../lib/alnstream.h" #include "../lib/parallel.h" #include "../lib/argument_parser.h" namespace po = boost::program_options; AlignmentSettings globalAlignmentSettings; mutex_map fileLocks; /** * Main function that organizes the overall structure of the program. * @param argc Number of arguments * @param argv Argument array * @return 0 on success, other numbers on error */ int main(int argc, const char* argv[]) { // Program start output std::cout << std::endl << "------" << std::endl << "HiLive Output Tool v"<< HiLive_VERSION_MAJOR << "." << HiLive_VERSION_MINOR << " - Output of Realtime Alignments of Illumina Reads" << std::endl << "------" << std::endl<< std::endl; // Parse the command line arguments HiLiveOutArgumentParser argumentParser(argc, argv); int parser_returnStatus = argumentParser.parseCommandLineArguments(); // Successful execution of "help" or "license" if ( parser_returnStatus == 1 ) { exit(EXIT_SUCCESS); } // Parsing error else if ( parser_returnStatus == -1 ) { std::cout << "Parsing of command line options failed. For help, type 'hilive-out --help'." << std::endl; exit(EXIT_FAILURE); } // load the index std::cout << "Loading Index Header..." << std::endl; KixRun* index = new KixRun(); index->get_header_information(globalAlignmentSettings.get_index_fname()); index->store_kmer(); std::cout << "Start writing ouput." << std::endl; // Maximum number of output threads. 
CountType max_output_threads = std::max(CountType(1), globalAlignmentSettings.get_num_out_threads()); std::cout << "Using " << max_output_threads << " threads." << std::endl; std::deque alnouts; std::deque threads; bool all_finished = false; for ( CountType cycle : globalAlignmentSettings.get_output_cycles() ) { alnouts.emplace_back(globalAlignmentSettings.get_lanes(), globalAlignmentSettings.get_tiles(), cycle, index); } while ( !all_finished ) { all_finished = true; CountType num_active_threads = 0; for ( auto& alnout : alnouts ) { num_active_threads += alnout.get_task_status_num( RUNNING ); } for ( auto& alnout : alnouts ) { // Assume that all tasks are available. for ( auto& lane : globalAlignmentSettings.get_lanes() ) { for ( auto& tile : globalAlignmentSettings.get_tiles() ) { alnout.set_task_available( Task(lane, tile, alnout.get_cycle()) ); } } if ( !alnout.is_finished() ) { all_finished = false; if ( num_active_threads < max_output_threads ) { CountType newThreads = max_output_threads - num_active_threads; for ( auto i=0; i < newThreads; i++) { threads.emplace_back(&AlnOut::write_next, &alnout); ++num_active_threads; } } } else if ( !alnout.is_finalized() ) { alnout.finalize(); std::cout << "Finished output of cycle " << alnout.get_cycle() << " (" << alnout.get_task_status_num( FINISHED ) << " finished, " << alnout.get_task_status_num( FAILED ) << " failed)." << std::endl; } } std::this_thread::sleep_for (std::chrono::milliseconds(1000)); } // Clear the vector will destruct all elements. alnouts.clear(); // Ensure that all threads are finished. for ( auto& thread : threads ) thread.join(); std::cout << "Finished." << std::endl; delete index; return EXIT_SUCCESS; }