MindTheGap-2.3.0/ 0000755 0001750 0001750 00000000000 14230013106 013010 5 ustar nilesh nilesh MindTheGap-2.3.0/CMakeLists.txt 0000644 0001750 0001750 00000014171 14230013106 015554 0 ustar nilesh nilesh ################################################################################
# MindTheGap: Integrated detection and assembly of insertion variants
# A tool from the GATB (Genome Assembly Tool Box)
# Copyright (C) 2014 INRIA
# Authors: C.Lemaitre, G. Rizk
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
################################################################################
# cmake_minimum_required() has to be the first command so that the policy
# defaults are in place before project() sets up the project and its languages.
cmake_minimum_required(VERSION 3.1)
project(MindTheGap)
################################################################################
# The version number.
################################################################################
# Default version: the latest official build.
set(gatb-tool_VERSION_MAJOR 2)
set(gatb-tool_VERSION_MINOR 3)
set(gatb-tool_VERSION_PATCH 0)

# A local build may override each component through MAJOR, MINOR and PATCH
# (e.g. cmake -DMAJOR=9 -DMINOR=9 -DPATCH=9 ..).
if(DEFINED MAJOR)
  set(gatb-tool_VERSION_MAJOR ${MAJOR})
endif()
if(DEFINED MINOR)
  set(gatb-tool_VERSION_MINOR ${MINOR})
endif()
if(DEFINED PATCH)
  set(gatb-tool_VERSION_PATCH ${PATCH})
endif()

# Full "major.minor.patch" version string.
set(gatb-tool-version
    ${gatb-tool_VERSION_MAJOR}.${gatb-tool_VERSION_MINOR}.${gatb-tool_VERSION_PATCH})

# Continuous integration has the last word: JENKINS_TAG, when defined,
# replaces the whole version string.
if(DEFINED JENKINS_TAG)
  set(gatb-tool-version ${JENKINS_TAG})
endif()
################################################################################
# Define cmake modules directory
################################################################################
# Root of the bundled GATB-CORE sources.
set(GATB_CORE_HOME ${PROJECT_SOURCE_DIR}/thirdparty/gatb-core/gatb-core)
# CMake modules are looked up in the GATB-CORE 'cmake' directory.
set(CMAKE_MODULE_PATH ${GATB_CORE_HOME}/cmake)
################################################################################
# SUPPORTED KMER SIZES
################################################################################
# One can uncomment this line and set the wanted values
#set (KSIZE_LIST "32 64 96 128 160 192 224 256")
################################################################################
# THIRD PARTIES
################################################################################
# Some GATB-CORE artifacts are not wanted in this build: tests and examples
# are excluded, the tools exclusion is left disabled.
#set(GATB_CORE_EXCLUDE_TOOLS 1)
set(GATB_CORE_EXCLUDE_TESTS 1)
set(GATB_CORE_EXCLUDE_EXAMPLES 1)

# GATB-CORE itself: the GatbCore module is searched for in CMAKE_MODULE_PATH.
include(GatbCore)
################################################################################
# TOOL
################################################################################
# Flag for TR1 management: USE_NEW_CXX is defined when use_new_cxx is set.
if(use_new_cxx)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_NEW_CXX ")
endif()
# STATUS mode prints the usual "-- " prefix itself.
message(STATUS "cxx: ${CMAKE_CXX_FLAGS}")

# Compilation definitions coming from the gatb-core part.
add_definitions(${gatb-core-flags})

# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0")

# Header directories:
#  - the project sources
#  - the GATB-CORE sources
set(PROGRAM_SOURCE_DIR ${PROJECT_SOURCE_DIR}/src)
include_directories(${PROGRAM_SOURCE_DIR} ${gatb-core-includes})

# All executables are written to <build>/bin.
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

# Files to be compiled: every entry directly under src/.
# The glob is only evaluated when CMake is (re)run.
file(GLOB ProjectFiles src/*)

# Artifacts to be built: the project binary and the nwalign utility.
add_executable(${PROJECT_NAME} src/main.cpp ${ProjectFiles})
add_executable(nwalign src/nwAlign/nwalign.cpp)

# Libraries linked into the binaries. Both targets are executables, so the
# dependency is PRIVATE (nothing links against them).
target_link_libraries(${PROJECT_NAME} PRIVATE ${gatb-core-libraries})
target_link_libraries(nwalign PRIVATE ${gatb-core-libraries})
################################################################################
# PACKAGING
################################################################################
# Package description and version number.
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "gatb-tool ${PROJECT_NAME}")
set(CPACK_PACKAGE_VENDOR "Genscale team (INRIA)")
set(CPACK_PACKAGE_VERSION_MAJOR "${gatb-tool_VERSION_MAJOR}")
set(CPACK_PACKAGE_VERSION_MINOR "${gatb-tool_VERSION_MINOR}")
set(CPACK_PACKAGE_VERSION_PATCH "${gatb-tool_VERSION_PATCH}")
set(CPACK_PACKAGE_VERSION "${gatb-tool-version}")

# Kind of archive produced for binary and source packages.
set(CPACK_GENERATOR "TGZ")
set(CPACK_SOURCE_GENERATOR "TGZ")

# Unwanted files are left out of the source archive. Each entry is a regular
# expression; the entries are plain whitespace-separated list elements.
set(CPACK_SOURCE_IGNORE_FILES
  "^${PROJECT_SOURCE_DIR}/\\.git/"
  "^${PROJECT_SOURCE_DIR}/\\.gitmodules"
  "^${PROJECT_SOURCE_DIR}/\\.gitignore"
  "^${PROJECT_SOURCE_DIR}/build/"
  "^${GATB_CORE_HOME}/\\.cproject"
  "^${GATB_CORE_HOME}/\\.git/"
  "^${GATB_CORE_HOME}/\\.project"
  "^${GATB_CORE_HOME}/\\.gitignore"
  "^${GATB_CORE_HOME}/doc/design"
  "^${GATB_CORE_HOME}/DELIVERY.md"
)

# The project binary goes to the 'bin' directory; tests, data and the
# top-level documents go to the package root.
install(TARGETS ${PROJECT_NAME} DESTINATION bin)
install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/test DESTINATION .)
install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/data DESTINATION .)
#install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/doc DESTINATION .)
install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/README.md DESTINATION .)
install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/CHANGELOG.md DESTINATION .)
install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE DESTINATION .)

# The "bin" tag is included in the binary archive file name.
set(CPACK_PACKAGE_FILE_NAME ${PROJECT_NAME}-${CPACK_PACKAGE_VERSION}-bin-${CMAKE_SYSTEM_NAME})
include(CPack)
MindTheGap-2.3.0/INSTALL 0000644 0001750 0001750 00000001050 14230013106 014035 0 ustar nilesh nilesh # CMake is required to compile software (http://www.cmake.org/cmake/resources/software.html)
# you can install software by executing this file: sh INSTALL
#
# Prepare GATB sub-module.
# These two steps are best effort: a failure here (for instance when the tree
# is not a git checkout) does not stop the script.
git submodule init
git submodule update
# Prepare directories: always start from a fresh 'build' directory
rm -rf build
mkdir build || exit 1
# Go in the 'build' directory; abort on failure so that cmake is never run
# from the wrong directory
cd build || exit 1
# Prepare the makefile
cmake .. || exit 1
# Run the newly created makefile (abort if the build fails, so the tests
# below never run against a missing binary):
make || exit 1
# To compile faster, use:
# make -j8
# Go back at the installation root directory
cd ..
# run tests
echo "Running simple test..."
cd test || exit 1
. ./simple_test.sh
cd ..
MindTheGap-2.3.0/README.md 0000644 0001750 0001750 00000034124 14230013106 014273 0 ustar nilesh nilesh # MindTheGap
| **Linux** | **Mac OSX** |
|-----------|-------------|
[](https://ci.inria.fr/gatb-core/view/MindTheGap/job/tool-mindthegap-build-debian7-64bits-gcc-4.7/) | [](https://ci.inria.fr/gatb-core/view/MindTheGap/job/tool-mindthegap-build-macos-10.9.5-gcc-4.2.1/)
[](http://bioconda.github.io/recipes/mindthegap/README.html)
[](http://www.gnu.org/licenses/agpl-3.0.en.html)
# What is MindTheGap ?
MindTheGap performs detection and assembly of **DNA insertion variants** in NGS read datasets with respect to a reference genome. It is designed to call insertions of any size, whether they are novel or duplicated, homozygous or heterozygous in the donor genome. It takes as input a set of reads and a reference genome. It outputs two sets of FASTA sequences: one is the set of breakpoints of detected insertion sites, the other is the set of assembled insertions for each breakpoint.
**New !** MindTheGap can also be used as a **genome assembly finishing tool**: it can fill the gaps between a set of input contigs without any a priori on their relative order and orientation. It outputs the results in a gfa file. It is notably integrated as an essential step in the targeted assembly tool **MinYS** (MineYourSymbiont in metagenomics datasets, see [https://github.com/cguyomar/MinYS](https://github.com/cguyomar/MinYS)).
MindTheGap is a [Genscale](http://team.inria.fr/genscale/) tool, built upon the [GATB](http://gatb.inria.fr/) C++ library, and developed by:
* Claire Lemaitre
* Cervin Guyomar
* Wesley Delage
* Guillaume Rizk
* Former developers: Rayan Chikhi, Pierre Marijon.
# Installation instructions
## Requirements
CMake 3.1+; see http://www.cmake.org/cmake/resources/software.html
C++/11 capable compiler (e.g. gcc 4.7+, clang 3.5+, Apple/clang 6.0+)
## Getting the latest source code with git
# get a local copy of MindTheGap source code
git clone --recursive https://github.com/GATB/MindTheGap.git
# compile the code
cd MindTheGap
sh INSTALL
# the binary file is located in directory build/bin/
./build/bin/MindTheGap -help
Note: when updating your local repository with `git pull`, if you see that thirdparty/gatb-core has changed, you also have to run: `git submodule update`.
## Installing a stable release
Retrieve a binary archive file from one of the official MindTheGap releases (see "Releases" tab on the Github web page); file name is `MindTheGap-vX.Y.Z-bin-Linux.tar.gz` (for Linux) or `MindTheGap-vX.Y.Z-bin-Darwin.tar.gz` (for MacOs).
tar -zxf MindTheGap-vX.Y.Z-bin-Darwin.tar.gz
cd MindTheGap-vX.Y.Z-bin-Darwin
chmod u+x bin/MindTheGap
./bin/MindTheGap -help
If the software does not run properly on your system, you should consider installing it from its source code. Retrieve the source archive file `MindTheGap-vX.Y.Z-Source.tar.gz`.
tar -zxf MindTheGap-vX.Y.Z-Source.tar.gz
cd MindTheGap-vX.Y.Z-Source
sh INSTALL
# the binary file is located in directory build/bin/
./build/bin/MindTheGap -help
## Using conda or docker
MindTheGap is also distributed as a [Bioconda package](https://anaconda.org/bioconda/mindthegap):
conda install -c bioconda mindthegap
Or pull the docker image of MindTheGap (warning: the image needs to be updated with the latest releases):
docker pull clemaitr/mindthegap
## Small run example
```
MindTheGap find -in data/reads_r1.fastq,data/reads_r2.fastq -ref data/reference.fasta -out example
MindTheGap fill -graph example.h5 -bkpt example.breakpoints -out example
```
# USER MANUAL
## Description
MindTheGap is a software that performs integrated detection and assembly of **genomic insertion variants** in NGS read datasets with respect to a reference genome. It is designed to call insertions of any size, whether they are novel or duplicated, homozygous or heterozygous in the donor genome.
Alternatively, since release 2.1.0, MindTheGap can also be used as a **genome assembly finishing tool**. It is integrated as an essential step in the **targeted assembly** tool [MinYS (MineYourSymbiont in metagenomics datasets)](https://github.com/cguyomar/MinYS). It is also part of a gap-filling pipeline dedicated to linked-read data (10X Genomics): [MTG-link](https://github.com/anne-gcd/MTG-Link).
**Insertion variant detection**
It takes as input a set of reads and a reference genome. Its main output is a VCF file, giving for each insertion variant, its insertion site location on the reference genome, a single insertion sequence or a set of candidate insertion sequences (when there are assembly ambiguities), and its genotype in the sample.
For a detailed user manual specific to insertion variants see [doc/MindTheGap_insertion_caller.md](doc/MindTheGap_insertion_caller.md).
**Genome assembly gap-filling** (New feature !)
When given a set of reads and a set of contigs as input, MindTheGap tries to fill the gaps between all pairs of contigs by de novo local assembly without any a priori knowledge of their relative order and orientation. It outputs the results in a GFA file.
For a detailed user manual specific to contig gap-filling see [doc/MindTheGap_assembly.md](doc/MindTheGap_assembly.md).
**Performances**
MindTheGap performs de novo assembly using the [GATB](http://gatb.inria.fr) C++ library and is inspired by algorithms from Minia. Hence, the computational resources required to run MindTheGap are significantly lower than those of other assemblers (for instance, it uses less than 6 GB of main memory to analyze a full human NGS dataset).
For more details on the method and some recent results, see the [web page](http://gatb.inria.fr/software/mind-the-gap/).
## Usage and examples
MindTheGap is composed of two main modules : breakpoint detection (`find` module) and the local assembly of insertions or gaps (`fill` module). Both steps are implemented in a single executable, MindTheGap, and can be run independently by specifying the module name as follows :
    MindTheGap <module> [module options]
1. **Basic command lines**
        #Find module:
        MindTheGap find (-in <reads.fq> | -graph <graph.h5>) -ref <reference.fa> [options]
        #To get help:
        MindTheGap find -help
        #Fill module:
        MindTheGap fill (-in <reads.fq> | -graph <graph.h5>) (-bkpt <breakpoints.fa> | -contig <contigs.fa>) [options]
        #To get help:
        MindTheGap fill -help
2. **Examples**
These examples can be run with the small datasets in directory `data/`
**Example for insertion variant calling:**
#find
build/bin/MindTheGap find -in data/reads_r1.fastq,data/reads_r2.fastq -ref data/reference.fasta -out example
# 3 files are generated:
# example.h5 (de bruijn graph),
# example.othervariants.vcf (SNPs and deletion variants),
# example.breakpoints (breakpoints of insertion variants).
#fill
build/bin/MindTheGap fill -graph example.h5 -bkpt example.breakpoints -out example
# 3 files are generated:
# example.insertions.fasta (insertion sequences)
# example.insertions.vcf (insertion variants)
# example.info.txt (log file)
**Example for gap-filling between contigs:**
```
build/bin/MindTheGap fill -in data/contig-reads.fasta.gz -contig data/contigs.fasta -abundance-min 3 -out contig_example
# 4 files are generated
# contig_example.h5 (de bruijn graph)
# contig_example.insertions.fasta (gap-filling sequences)
# contig_example.gfa (genome graph)
# contig_example.info.txt (log file)
```
The usage of the `fill` module is slightly different depending on the type of gap-filling: assembling insertion variants (using the `-bkpt` option with a breakpoint file) or gap-filling between contigs (using the `-contig` option with a contig fasta file).
## Details
1. **Input sequencing read data**
For both modules, read dataset(s) are first indexed in a De Bruijn graph. The input format of read dataset(s) is either the read files themselves (option `-in`), or the already computed de bruijn graph in hdf5 format (.h5) (option `-graph`).
NOTE: options `-in` and `-graph` are mutually exclusive, and one of these is mandatory.
If the input is composed of several read files, they can be provided as a list of file paths separated by a comma or as a "file of file" (fof), that is a text file containing on each line the path to each read file. All read files will be treated as if concatenated in a single sample. The read file format can be fasta, fastq or gzipped.
2. **de Bruijn graph creation options**
In addition to input read set(s), the de Bruijn graph creation uses two main parameters, `-kmer-size` and `-abundance-min`:
* `-kmer-size`: the k-mer size [default '31']. By default, the largest kmer-size allowed is 128. To use k>128, you will need to re-compile MindTheGap as follows:
```
cd build/
cmake -DKSIZE_LIST="32 64 96 256" ..
make
```
To go back to default, replace 256 by 128. Note that increasing the range between two consecutive kmer-sizes in the list can have an impact on the size of the output h5 files (but none on the results).
* `-abundance-min`: the minimal abundance threshold, k-mers having less than this number of occurrences are discarded from the graph [default 'auto', ie. automatically inferred from the dataset].
* `-abundance-max`: the maximal abundance threshold, k-mers having more than this number of occurrences are discarded from the graph [default '2147483647' ie. no limit].
3. **Computational resources options**
Additional options are related to computational runtime and memory:
* `-nb-cores`: number of cores to be used for computation [default '0', ie. all available cores will be used].
* `-max-memory`: max RAM memory for the graph creation (in MBytes) [default '2000']. Increasing the memory will speed up the graph creation phase.
* `-max-disk`: max usable disk space for the graph creation (in MBytes) [default '0', ie. automatically set]. Kmers are counted by writing temporary files on the disk, to speed up the counting you can increase the usable disk space.
4. **MindTheGap Output**
All the output files are prefixed either by a default name: "MindTheGap_Expe-[date:YY:MM:DD-HH:mm]" or by a user defined prefix (option `-out` of MindTheGap).
The main results files are output by the Fill module, these are:
* an **insertion variant file** (`.insertions.vcf`) in vcf format, in the case of insertion variant detection (for insertions >2 bp).
* an **assembly graph file** (`.gfa`) in GFA format, in the case of contig gap-filling. It contains the original contigs and the obtained gap-fill sequences (nodes of the graph), together with their overlapping relationships (arcs of the graph).
Additional output files are:
* a graph file (`.h5`), output by both MindTheGap modules. This is a binary file containing the de Bruijn graph data structure. To obtain information stored in it, you can use the utility program `dbginfo` located in your bin directory or in ext/gatb-core/bin/.
* Files output specifically by `MindTheGap find`:
* a breakpoint file (`.breakpoints`) in fasta format.
* a variant file (`.othervariants.vcf`) in vcf format. It contains SNPs, deletions and very small insertions (1-2 bp).
* Files output specifically by `MindTheGap fill`:
* a sequence file (`.insertions.fasta`) in fasta format. It contains the inserted sequences (for insertions >2 bp) or contig gap-fills that were successfully assembled.
* a log file (`.info.txt`), a tabular file with some information about the filling process for each breakpoint/gap-fill.
* with option `-extend`, an additional sequence file (`.extensions.fasta`) in fasta format. It contains sequence extensions for failed insertion or gap-filling assemblies, ie. when the target kmer was not found, the first contig immediately after the source kmer is output.
Other optional parameters and details on input and output file formats are given in [doc/MindTheGap_insertion_caller.md](doc/MindTheGap_insertion_caller.md) and [doc/MindTheGap_assembly.md](doc/MindTheGap_assembly.md), depending on the usage.
## Utility programs
Either in your `bin/` directory or in `ext/gatb-core/bin/`, you can find additional utility programs :
* `dbginfo` : to get information about a graph stored in a .h5 file
* `dbgh5` : to build a graph from read set(s) and obtain a .h5 file
* `h5dump` : to extract data stored in a .h5 file
## Reference
If you use MindTheGap, please cite:
MindTheGap: integrated detection and assembly of short and long insertions. Guillaume Rizk, Anaïs Gouin, Rayan Chikhi and Claire Lemaitre. Bioinformatics 2014 30(24):3451-3457. http://bioinformatics.oxfordjournals.org/content/30/24/3451
[Web page](https://gatb.inria.fr/software/mind-the-gap/) with some updated results.
MindTheGap was also evaluated in a recent benchmark exploring many different genomic features (size, nature, repeat context, junctional homology at breakpoints) of human insertion variants. Among other tested SV callers, MindTheGap was the only tool able to output sequence-resolved insertions for many types of insertions. Read more: [Towards a better understanding of the low recall of insertion variants with short-read based variant callers.](https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-020-07125-5) Delage W, Thevenon J, Lemaitre C. *BMC Genomics* **2020**, 21(1):762.
# Contact
To contact a developer, request help, or for any feedback on MindTheGap, please use the issue form of github: https://github.com/GATB/MindTheGap/issues
You can see all issues concerning MindTheGap [here](https://github.com/GATB/MindTheGap/issues) and GATB [here](https://www.biostars.org/t/GATB/).
If you do not have any github account, you can also send an email to claire dot lemaitre at inria dot fr
MindTheGap-2.3.0/thirdparty/ 0000755 0001750 0001750 00000000000 14230013106 015202 5 ustar nilesh nilesh MindTheGap-2.3.0/thirdparty/gatb-core/ 0000755 0001750 0001750 00000000000 14230013106 017045 5 ustar nilesh nilesh MindTheGap-2.3.0/docker/ 0000755 0001750 0001750 00000000000 14230013106 014257 5 ustar nilesh nilesh MindTheGap-2.3.0/docker/Dockerfile 0000644 0001750 0001750 00000001531 14230013106 016251 0 ustar nilesh nilesh FROM debian:wheezy
# Image metadata (LABEL replaces the deprecated MAINTAINER instruction)
LABEL maintainer="Claire Lemaitre claire.lemaitre@inria.fr"
# Set MindTheGap version: the git tag v${MTG_VERSION} is checked out below
# NOTE(review): this pins 2.1.0 - confirm whether it should follow the current release
ENV MTG_VERSION=2.1.0
# Set noninteractive mode
ENV DEBIAN_FRONTEND=noninteractive
# Packages needed to fetch and build the sources
ENV PACKAGES="wget gcc g++ make cmake zlib1g-dev libboost-dev git"
# Layout: sources are cloned in ${DIR}/${SOURCE} and built in ${DIR}/${SOURCE}/${BUILD}
ENV DIR=/opt
ENV SOURCE=MindTheGap
ENV BUILD=build
WORKDIR ${DIR}
RUN apt-get update -y && \
    apt-get install -y --no-install-recommends ${PACKAGES}
# NOTE(review): this disables TLS certificate verification for every git
# operation below - confirm it is still required with the chosen base image
RUN git config --global http.sslVerify false
# clone the github repo
RUN git clone --recursive https://github.com/GATB/MindTheGap.git
WORKDIR ${DIR}/${SOURCE}
RUN git submodule init
# Using an official release
RUN git checkout v${MTG_VERSION}
# Bring the sub-modules in line with the checked-out release
RUN git submodule update
RUN mkdir ${BUILD}
WORKDIR ${DIR}/${SOURCE}/${BUILD}
RUN cmake ..
RUN make
# symlink binary in /usr/local/bin
RUN ln -s ${DIR}/${SOURCE}/${BUILD}/bin/MindTheGap /usr/local/bin
MindTheGap-2.3.0/src/ 0000755 0001750 0001750 00000000000 14230013106 013577 5 ustar nilesh nilesh MindTheGap-2.3.0/src/FindDeletion.hpp 0000644 0001750 0001750 00000013421 14230013106 016655 0 ustar nilesh nilesh /*****************************************************************************
* MindTheGap: Integrated detection and assembly of insertion variants
* A tool from the GATB (Genome Assembly Tool Box)
* Copyright (C) 2014 INRIA
* Authors: C.Lemaitre, G.Rizk, P.Marijon
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*****************************************************************************/
#ifndef _TOOL_FindDeletion_HPP_
#define _TOOL_FindDeletion_HPP_
/*****************************************************************************/
#include
#include
/**
* \brief Observer that recognises deletions among the gaps reported by a FindBreakpoints instance.
*
* update() reads the current gap from the finder (begin/end k-mers, gap size,
* position) and, when the gap is explained by a deletion, writes it through
* FindBreakpoints::writeVcfVariant().
*
* NOTE(review): in this copy the `template` headers, the `#include` lines and
* several type names carry no `<...>` argument list; they look stripped -
* confirm against the upstream file.
*/
template
class FindDeletion : public IFindObserver
{
public :
// Shorthand names for the GATB k-mer types used by this observer
typedef typename gatb::core::kmer::impl::Kmer Kmer;
typedef typename Kmer::ModelCanonical KmerModel;
typedef typename KmerModel::Iterator KmerIterator;
public:
/** Constructor
* \param[in] find finder instance, handed over to the IFindObserver base class
*/
FindDeletion(FindBreakpoints * find);
/** Test whether the gap currently described by the finder is a deletion
* \return true when a deletion was reported with writeVcfVariant(), false otherwise
*/
bool update();
private:
/** Detect whether the end of one k-mer is equal to the beginning of another
* \param[in] begin first k-mer (its suffix is compared)
* \param[in] end the other k-mer (its prefix is compared)
* \return The size of the repeat, at most max_repeat(); 0 when there is none
*/
unsigned int fuzzy_site(std::string begin, std::string end);
};
/** Constructor: does nothing but forward \p find to the IFindObserver base class. */
template
FindDeletion::FindDeletion(FindBreakpoints * find) : IFindObserver(find){}
/**
* Decide whether the gap currently held by the finder is a deletion and, if so,
* report it.
*
* Steps:
*  1. reject the gap unless both flanking k-mers are valid and the gap is at
*     least kmer_size() - max_repeat() long;
*  2. look for a repeat shared by the end of the begin k-mer and the start of
*     the end k-mer (fuzzy_site());
*  3. build the junction sequence and check that each of its k-mers passes
*     this->contains(); when that fails with a repeat, retry without the repeat;
*  4. extract the deleted sequence from the reference, write the VCF variant and
*     update the finder counters.
*
* \return true when a deletion was written, false otherwise
*/
template
bool FindDeletion::update()
{
// Both flanking k-mers of the gap must be valid
if((this->_find->kmer_begin().isValid() && this->_find->kmer_end().isValid()) == false)
{
return false;
}
// The gap must be at least kmer_size() - max_repeat() long
// NOTE(review): operands mix uint64_t, size_t and int - presumably kmer_size() > max_repeat(); confirm
if( this->_find->gap_stretch_size() < (this->_find->kmer_size() - this->_find->max_repeat() ) )
{
return false;
}
// Test if the deletion is a fuzzy deletion, i.e. the end of the begin k-mer repeats the start of the end k-mer
std::string begin = this->_find->model().toString(this->_find->kmer_begin().forward());
std::string end = this->_find->model().toString(this->_find->kmer_end().forward());
unsigned int repeat_size = this->fuzzy_site(begin, end);
// NOTE(review): fuzzy_site() only returns values up to max_repeat(), so this guard looks redundant - confirm
if(repeat_size > (unsigned)this->_find->max_repeat())
{
return false;
}
// Drop the repeated suffix from begin so that the repeat appears only once in the junction sequence
if(repeat_size != 0)
{
begin = begin.substr(0, begin.length() - repeat_size);
}
// Compute del_size
int del_size = (int) this->_find->gap_stretch_size() - (int) this->_find->kmer_size() + (int) repeat_size + 1;
// del_size is deliberately a signed int: it was a size_t before, which caused a computation bug
// Build the junction sequence, which may or may not be in the graph
std::string seq = begin + end;
// Create the objects required to iterate over the k-mers of seq
KmerModel local_m(this->_find->kmer_size());
KmerIterator local_it(local_m);
Data local_d(const_cast(seq.c_str()));
// Point the data object at seq and attach it to the iterator
local_d.setRef(const_cast(seq.c_str()), (size_t)seq.length());
local_it.setData(local_d);
// The gap is accepted as a deletion only if each k-mer of seq passes this->contains()
// (contains() is inherited and not visible here; presumably a graph membership test - confirm)
bool is_deletion = true;
for(local_it.first(); !local_it.isDone(); local_it.next())
{
if(!this->contains(local_it->forward()))
{
is_deletion = false;
break;
}
}
if(is_deletion == false)
{
if(repeat_size == 0)
{
return false;
}
else // Maybe it is not a fuzzy deletion: retry with the complete begin k-mer
{
seq = this->_find->model().toString(this->_find->kmer_begin().forward()) + end;
local_d.setRef(const_cast(seq.c_str()), (size_t)seq.length());
local_it.setData(local_d);
for(local_it.first(); !local_it.isDone(); local_it.next())
{
if(!this->contains(local_it->forward()))
{
return false;
}
}
// The deletion is clean: take the repeat back out of the size computed above
del_size -= repeat_size;
repeat_size = 0;
}
}
//printf("FindDeletion repeat_size %u del_size %i %i %llu\n",repeat_size,del_size,this->_find->position(),this->_find->position());
if(del_size<=0) return false; //just in case
// Write the breakpoint
//this->_find->writeBreakpoint(this->_find->breakpoint_id(), this->_find->chrom_name(), this->_find->position() - del_size - 1, begin, end, repeat_size, STR_DEL_TYPE);
//NOTE : position will always be the left-most when repeat_size>0.
// NOTE(review): unsigned arithmetic - this assumes position() >= del_size + 2; confirm
size_t del_start_pos = this->_find->position() - 2 - del_size; //beginning position of the deletion -1 (0-based): because in VCF we need to put the letter just before the deleted sequence
//cout << "start pos = " << del_start_pos << "size = " << del_size << endl;
// REF allele: the base before the deletion followed by the deleted bases (del_size+1 characters plus the terminator)
char *del_sequence = new char[del_size+2];
sprintf(del_sequence,"%.*s", del_size+1, this->_find->chrom_seq()+del_start_pos);
// ALT allele: only the first character of the REF allele
char *alt_char = new char[2];
sprintf(alt_char,"%.*s", 1, del_sequence);
//cout << del_sequence << endl;
//cout << alt_char << endl;
// here position is 0-based
this->_find->writeVcfVariant(this->_find->breakpoint_id(),
this->_find->chrom_name(),
del_start_pos, del_sequence, alt_char, repeat_size, STR_DEL_TYPE);
delete [](del_sequence);
delete [] (alt_char);
// Update the finder counters: one more breakpoint, counted as fuzzy or clean deletion
this->_find->breakpoint_id_iterate();
if(repeat_size != 0)
this->_find->fuzzy_deletion_iterate();
else
this->_find->clean_deletion_iterate();
return true;
}
/*
Return the size of the longest repeat, at most max_repeat() characters long,
such that the end of `begin` is equal to the start of `end`; 0 when there is none.

Candidate sizes i are tried from max_repeat() down to 1. For each i the
comparison grows one character at a time (j = 1..i) and stops at the first
mismatch; the repeat is found when j reaches i.

Example with max_repeat = 5:
begin = AAAAATTCGG, end = TTCGGCCCCC : the shared part is TTCGG, the result is 5

NOTE(review): begin.length() - i is unsigned, so this assumes that begin is at
least max_repeat() characters long - confirm with the callers.
*/
template
unsigned int FindDeletion::fuzzy_site(std::string begin, std::string end)
{
for(unsigned int i = this->_find->max_repeat(); i != 0; i--)
for(unsigned int j = 1; begin.substr(begin.length() - i, j) == end.substr(0, j); j++)
if(i == j)
return j;
return 0;
}
#endif /* _TOOL_FindDeletion_HPP_ */
MindTheGap-2.3.0/src/FindBreakpoints.hpp 0000644 0001750 0001750 00000076545 14230013106 017413 0 ustar nilesh nilesh /*****************************************************************************
* MindTheGap: Integrated detection and assembly of insertion variants
* A tool from the GATB (Genome Assembly Tool Box)
* Copyright (C) 2014 INRIA
* Authors: C.Lemaitre, G.Rizk, P.Marijon
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*****************************************************************************/
/**
* \file FindBreakpoints.hpp
* \date 09/04/2015
* \author pmarijon
* \brief FindBreakpoint definition class
*/
#ifndef _TOOL_FindBreakpoints_HPP_
#define _TOOL_FindBreakpoints_HPP_
/********************************************************************************/
#include
#include
#include
#include "CircularBuffer.hpp"
/********************************************************************************/
template
class IFindObserver;
/**
* \brief An observable functor for find gaps in reference genome
*
* This class associated with IFindObserver inherit, to find gaps in reference genome
*/
template
class FindBreakpoints
{
public :
typedef typename gatb::core::kmer::impl::Kmer Kmer;
typedef typename Kmer::ModelCanonical KmerModel;
typedef typename Kmer::Type KmerType;
typedef typename Kmer::Count KmerCount;
typedef typename Kmer::KmerCanonical KmerCanonical;
typedef typename KmerModel::Iterator KmerIterator;
/** Variables for the heterozyguous mode */
// structure to store information about a kmer : kmer that will be treated as kmer_begin of a breakpoint
typedef struct info_type
{
KmerType kmer;
int nb_in;
int nb_out;
bool is_repeated; // is the k-1 suffix of this kmer is repeated in the reference genome
} info_type;
public :
/** Constructor
* \param[in] find : A pointeur one Finder instance
*/
FindBreakpoints(Finder * find);
/** Destructor. */
virtual ~FindBreakpoints();
//Functor
/** overloading operator ()
* Read reference genome, and find gaps
*/
void operator()();
// Observable
/** Notify gap observer
* \param[in] If kmer is in graph in_graph is true else is false
*/
void notify(Node node, bool is_valid);
/** Add observer call after a gap detection
*/
void addGapObserver(IFindObserver* new_obs);
/** Add observer call after a gap detection
*/
void addKmerObserver(IFindObserver* new_obs);
/** writes a given breakpoint in the output file
*/
void writeBreakpoint(int bkt_id, string& chrom_name, uint64_t position, string& kmer_begin, string& kmer_end, int repeat_size, string type,bool repeat_in_genome_kmer_begin =false, bool repeat_in_genome_kmer_end = false);
/** writes a given variant in the output vcf file
*/
void writeVcfVariant(int bkt_id, string& chrom_name, uint64_t position, char* ref_char, char* alt_char, int repeat_size, string type);
void writeIndel(int bkt_id, string &chrom_name, uint64_t position, string ref_char, string alt_char, int repeat_size, string type);
/* Getters */
/** Return the current breakpoint identifier (set to 1 by the constructor, advanced by breakpoint_id_iterate())
*/
uint64_t breakpoint_id();
/** Return the 0-based position, on the current sequence, of the kmer being processed
*/
uint64_t position();
/** Return the data buffer of the reference sequence being processed
*/
char* chrom_seq();
/** Return the short comment (name) of the sequence being processed
*/
string& chrom_name();
/** Return the kmer model
*/
KmerModel& model();
/** Return the kmer size used for gap search
*/
size_t kmer_size();
/** Return the max repeat size at breakpoint
*/
int max_repeat();
/** Return the minimal number of kmers required to validate a SNP
*/
int snp_min_val();
/** Return the threshold value of the branching filter
*/
int branching_threshold();
/** The last solid kmer before the gap
*/
KmerCanonical& kmer_begin();
/** The first solid kmer after the gap
*/
KmerCanonical& kmer_end();
/** Size of the current solid stretch
*/
uint64_t solid_stretch_size();
/** Size of the current gap stretch
*/
uint64_t gap_stretch_size();
/** True when MindTheGap runs with the homo-only flag
*/
bool homo_only();
/** Mask to easily get the k-1 prefix/suffix of a kmer (in KmerType unit)
* NOTE(review): no definition of this method is visible in this part of the file - confirm it exists.
*/
KmerType kminus1_mask();
/** Information on the kmer currently being processed (filled by store_kmer_info)
*/
info_type& current_info();
/** If >0, the previous hetero site was too close; used to avoid reporting very close hetero sites
*/
int recent_hetero();
/** Repeat flags of the flanking kmers.
* kmer_end_is_repeated: the canonical k-1 prefix of the current kmer is in the reference repeat bloom.
* kmer_begin_is_repeated: value of current_info().is_repeated recorded when kmer_begin was last set.
*/
bool kmer_end_is_repeated();
bool kmer_begin_is_repeated();
/** Get the info_type stored at the given index of the circular history buffer.
* Note: the unsigned char index limits kmerSize to < 256.
*/
info_type& het_kmer_history(unsigned char index);
/** Current begin / end indexes in the circular history buffer
*/
unsigned char het_kmer_begin_index();
unsigned char het_kmer_end_index();
/** Queries forwarded to the finder's graph (membership, in-degree, out-degree) and to the reference repeat bloom
*/
bool graph_contains(Node& kmer_node);
int node_in_branch(Node& kmer_node);
int node_out_branch(Node& kmer_node);
bool ref_bloom_contains(KmerType kmer);
/* Incrementers: each one post-increments a counter and returns the value it had before the increment */
/** Increment the breakpoint_id counter
*/
uint64_t breakpoint_id_iterate();
/** Increment the finder's homo fuzzy counter
*/
int homo_fuzzy_iterate();
/** Increment the finder's homo clean counter
*/
int homo_clean_iterate();
/** Increment the finder's hetero fuzzy counter
*/
int hetero_fuzzy_iterate();
/** Increment the finder's hetero clean counter
*/
int hetero_clean_iterate();
/** Increment the finder's fuzzy deletion counter
*/
int fuzzy_deletion_iterate();
/** Increment the finder's clean deletion counter
*/
int clean_deletion_iterate();
/** Increment the finder's solo SNP counter
*/
int solo_snp_iterate();
/** Increment the finder's multi SNP counter
*/
int multi_snp_iterate();
/** Increment the finder's backup counter
*/
int backup_iterate();
/** Increment the finder's homo clean indel counter
*/
int homo_clean_indel_iterate();
/** Increment the finder's homo fuzzy indel counter
*/
int homo_fuzzy_indel_iterate();
/** Increment the finder's hetero indel counter
*/
int hetero_indel_iterate();
/* Setter */
/** Set the value of recent_hetero
*/
void recent_hetero(int value);
private :
/** Builds the bloom filter of repeated (k-1)-mers of the reference genome */
IBloom* fillRefBloom();
/** Fills m_current_info, the history buffer slot and m_kmer_end_is_repeated for the current kmer */
void store_kmer_info(Node node);
private :
/* Observer lists: gap observers are called at the end of a gap, kmer observers on every notified kmer */
std::vector* > gap_obs;
std::vector* > kmer_obs;
/* Members used to find breakpoints */
/* Members used to write breakpoints */
uint64_t m_breakpoint_id;
uint64_t m_position;
char* m_chrom_sequence;
string m_chrom_name;
/* Kmer related objects: model, previously notified kmer, iterator over the current sequence */
KmerModel m_model;
KmerCanonical m_previous_kmer;
KmerIterator m_it_kmer;
/* Kmer related objects: kmers flanking the current gap */
KmerCanonical m_kmer_begin;
KmerCanonical m_kmer_end;
/* Gap type detection */
uint64_t m_solid_stretch_size;
uint64_t m_gap_stretch_size;
/* Finder access */
Finder* finder;
/* Hetero mode */
info_type m_het_kmer_history[256];
unsigned char m_het_kmer_end_index; // index in history, must remain an unsigned char = same limit as the history array
unsigned char m_het_kmer_begin_index;
info_type m_current_info;
int m_recent_hetero;
bool m_kmer_end_is_repeated;
bool m_kmer_begin_is_repeated;
// CircularBuffer m_het_kmer_history_CB;
// typedef typename CircularBuffer::itCB iterCB;
// iterCB* m_het_kmer_end_index_CB;
// iterCB* m_het_kmer_begin_index_CB;
/** Bloom of the repeated kmers of the reference genome
*/
IBloom* m_ref_bloom;
// Please do not add any other friends
friend bool FindMultiSNP::update();
friend bool FindMultiSNPrev::update();
/** Handle on the progress information. */
gatb::core::tools::dp::IteratorListener* _progress;
// Replaces the progress listener through the SP_SETATTR macro.
void setProgress (gatb::core::tools::dp::IteratorListener* progress) { SP_SETATTR(progress); }
};
/** Builds a breakpoint finder bound to the given Finder tool.
 *  Initialises every scalar state member (so that the public getters never read an
 *  indeterminate value before operator() runs) and builds the reference repeat bloom.
 *  \param find the Finder tool providing the kmer size, graph, reference bank and options
 */
template
FindBreakpoints::FindBreakpoints(Finder * find) : gap_obs(), m_model(find->_kmerSize), m_it_kmer(m_model), _progress (0)
{
    this->m_breakpoint_id = 1;
    this->m_position = 0;
    this->m_chrom_sequence = NULL;
    this->m_chrom_name = "";

    // kmer_begin / kmer_end must be initialised: observers test kmer_begin().isValid() in update()
    this->m_kmer_begin = KmerCanonical();
    this->m_kmer_end = KmerCanonical();

    /*Homozygote usage*/
    this->m_solid_stretch_size = 0;
    this->m_gap_stretch_size = 0;

    this->finder = find;

    /*Heterozygote usage*/
    // Same starting values as the per-sequence reset done in operator().
    this->m_het_kmer_end_index = find->_kmerSize + 1;
    this->m_het_kmer_begin_index = 1;
    this->m_recent_hetero = 0;
    this->m_kmer_end_is_repeated = false;
    this->m_kmer_begin_is_repeated = false;

    // The reference repeat bloom is always filled (not only in hetero-insertion mode).
    this->m_ref_bloom = this->fillRefBloom();
    this->m_ref_bloom->use();
}
/** Releases the reference held on every registered observer, on the reference
 *  repeat bloom and on the progress listener.
 */
template
FindBreakpoints::~FindBreakpoints()
{
    // References were taken with use() in addKmerObserver / addGapObserver.
    for (size_t idx = 0; idx < kmer_obs.size(); ++idx)
    {
        kmer_obs[idx]->forget();
    }
    for (size_t idx = 0; idx < gap_obs.size(); ++idx)
    {
        gap_obs[idx]->forget();
    }

    // The bloom is always built by the constructor, so it is always released here.
    m_ref_bloom->forget();

    setProgress (0);
}
/** Scans every sequence of the reference bank kmer by kmer and calls notify() on each valid kmer.
 *  When the finder's BED file name is empty all kmers are scanned; otherwise only kmers located
 *  inside the BED intervals of the current sequence are notified.
 *  Progress is reported through _progress in batches of more than 1000 kmers.
 */
template
void FindBreakpoints::operator()()
{
// We create an iterator over this bank
Iterator* it_seq = this->finder->_refBank->iterator();
LOCAL(it_seq);
u_int64_t totalsize = this->finder->_refBank->estimateSequencesSize();
u_int64_t nbkmersdone = 0;
//printf("bank size %lli \n",totalsize);
setProgress (new ProgressSynchro (
finder->createIteratorListener (totalsize, "Finding breakpoints"), // note: createIteratorListener belongs to the Finder tool
System::thread().newSynchronizer())
);
_progress->init ();
// We loop over sequences
for (it_seq->first(); !it_seq->isDone(); it_seq->next())
{
this->m_kmer_begin = KmerCanonical();
this->m_kmer_end = KmerCanonical();
//DEBUG
//cout<<"sequence "<< (*it_seq)->getCommentShort() << endl;
//Reinitialize stretch_size for each sequence
this->m_solid_stretch_size = 0;
this->m_gap_stretch_size = 0;
// for hetero mode: clear the history buffer and reset its two indexes
memset(this->m_het_kmer_history, 0, sizeof(info_type)*256);
//m_het_kmer_history_CB.clear();
this->m_het_kmer_end_index = this->finder->_kmerSize +1;
this->m_het_kmer_begin_index = 1;
//m_het_kmer_end_index_CB->set(this->finder->_kmerSize +1);
//m_het_kmer_end_index_CB->set(1);
this->m_recent_hetero = 0;
// We set the data from which we want to extract kmers.
m_it_kmer.setData ((*it_seq)->getData());
this->m_chrom_sequence = (*it_seq)->getDataBuffer();
this->m_chrom_name = (*it_seq)->getCommentShort();
this->m_position = 0;
// No BED file: scan the whole sequence
if (this->finder->_bed_file_name=="")
{
// We iterate the kmers; position and both history indexes advance together with the kmer iterator.
for (m_it_kmer.first(); !m_it_kmer.isDone(); m_it_kmer.next(), m_position++, m_het_kmer_begin_index++, m_het_kmer_end_index++
) //,m_het_kmer_begin_index_CB++, m_het_kmer_end_index++
{
if(!(*m_it_kmer).isValid())
{
// An invalid kmer breaks any current stretch: forget the stretch sizes and flanking kmers
this->m_solid_stretch_size = 0;
this->m_gap_stretch_size = 0;
this->m_kmer_begin = KmerCanonical();
this->m_kmer_end = KmerCanonical();
//DEBUG
//cout<<"n";
}
else
{
//we need to convert the kmer in a node to query the graph.
Node node(Node::Value(m_it_kmer->value()), m_it_kmer->strand());// strand is necessary for hetero mode (in/out degree depends on the strand
uint64_t save_position = m_position; // m_position can be modified by observer (multisnp rev)
//we notify all observer
this->notify(node, (*m_it_kmer).isValid());
m_position = save_position;
//save actual kmer for potential False Positive
m_previous_kmer = *m_it_kmer;
//if(!graph_contains(node) & (*m_it_kmer).isValid()) {cout << m_position << endl;}
nbkmersdone++;
if (nbkmersdone > 1000) { _progress->inc (nbkmersdone); nbkmersdone = 0; }
}
}
//DEBUG
// NOTE(review): the next line looks truncated by text extraction. The end of the whole-sequence
// branch, the start of the BED branch and the declarations of `line` and `reader` (both used
// below) appear to be missing - restore them from the upstream file.
//cout<finder->_bed_file_name);
std::vector> interval_vector;
//std::vector>::iterator interval_it;
// Read the BED file and keep the intervals of the current sequence
while(getline(reader,line))
{
// skip empty lines and lines starting with '#' or '@'
if ((line.length()==0) ||(line.at(0)=='#') ||(line.at(0)=='@') ) continue;
string token;
stringstream iss;
vector < string > v;
std::tuple interval;
iss << line;
//cout << line << endl;
// split the line on tabs
while(getline(iss,token,'\t'))
{
v.push_back(token);
}
// NOTE(review): v[0], v[1] and v[2] are read without checking that the line has three fields.
if(v[0]==m_chrom_name){ // we are on the current chromosome
// NOTE(review): std::stoi yields an int although the value is stored in a uint64_t; it throws on
// non-numeric text and on values that do not fit in an int.
uint64_t bed_begin = std::stoi(v[1]);
uint64_t bed_end = std::stoi(v[2]);
// keep only intervals longer than the kmer size
// NOTE(review): unsigned subtraction - an interval with end < begin wraps around and passes this test.
if ((bed_end-bed_begin) > this->finder->_kmerSize){
interval=std::make_pair(std::stoi(v[1]),std::stoi(v[2]));
interval_vector.push_back( tuple(interval));
}
}
iss.clear();
}
if (!interval_vector.empty()){
uint64_t start_pos=get<0>(interval_vector.front());
uint64_t end_pos=get<1>(interval_vector.front());
//iterate over the kmers of the chromosome
for (m_it_kmer.first(); !m_it_kmer.isDone(); m_it_kmer.next(), m_position++, m_het_kmer_begin_index++, m_het_kmer_end_index++) //,m_het_kmer_begin_index_CB++, m_het_kmer_end_index++
{
if (m_position >= end_pos)
{
//move to the next interval (at most one interval is consumed per kmer); stop when none is left
interval_vector.erase(interval_vector.begin());
if(interval_vector.empty()){
break;
}
start_pos=get<0>(interval_vector.front());
end_pos=get<1>(interval_vector.front());
}
if(!(*m_it_kmer).isValid())
{
//Re-initialize stretch_size
this->m_solid_stretch_size = 0;
this->m_gap_stretch_size = 0;
this->m_kmer_begin = KmerCanonical();
this->m_kmer_end = KmerCanonical();
}
// NOTE(review): start_pos-1 wraps around when start_pos is 0, so this reset is never taken for an interval starting at 0.
if(m_position==start_pos-1) //for each beginning of bed region
{
//Re-initialize stretch_size for each bed region
this->m_solid_stretch_size = 0;
this->m_gap_stretch_size = 0;
this->m_kmer_begin = KmerCanonical();
this->m_kmer_end = KmerCanonical();
//Re-initialize het_kmer_history for each bed region
memset(this->m_het_kmer_history, 0, sizeof(info_type)*256);
}
if(((*m_it_kmer).isValid()) && (m_position>=start_pos)) //inside the current bed interval
{
//we need to convert the kmer in a node to query the graph.
Node node(Node::Value(m_it_kmer->value()), m_it_kmer->strand());// strand is necessary for hetero mode (in/out degree depends on the strand
uint64_t save_position = m_position; // m_position can be modified by observer (multisnp rev)
//we notify all observer
this->notify(node, (*m_it_kmer).isValid());
m_position = save_position;
//save actual kmer for potential False Positive
m_previous_kmer = *m_it_kmer;
//if(!graph_contains(node) & (*m_it_kmer).isValid()) {cout << m_position << endl;}
nbkmersdone++;
if (nbkmersdone > 1000) { _progress->inc (nbkmersdone); nbkmersdone = 0; }
}
}
}
}
}
_progress->finish ();
}
/** Processes one kmer of the reference: records its information, runs the kmer
 *  observers, then updates the solid / gap stretch state. Gap observers are run
 *  when a gap is closed by a second consecutive solid kmer.
 *  \param node     graph node built from the current kmer
 *  \param is_valid validity of the current kmer; nothing is updated after the kmer observers when false
 */
template
void FindBreakpoints::notify(Node node, bool is_valid)
{
    const bool in_graph = this->graph_contains(node);
    this->store_kmer_info(node);

    // Kmer observers are called on every notified kmer.
    for (size_t idx = 0; idx < kmer_obs.size(); ++idx)
    {
        kmer_obs[idx]->update();
    }

    if (!is_valid)
    {
        return;
    }

    if (in_graph)
    {
        // Solid kmer: the solid stretch grows.
        ++m_solid_stretch_size;

        if (m_solid_stretch_size > 1 && m_gap_stretch_size > 0)
        {
            // End of a gap: gap observers are tried in order until one reports success.
            for (size_t idx = 0; idx < gap_obs.size(); ++idx)
            {
                if (gap_obs[idx]->update())
                {
                    break;
                }
            }
            // The gap size is reset only once the gap end is confirmed by a second solid kmer
            // (a single isolated solid kmer is likely a false positive).
            m_gap_stretch_size = 0;
        }

        if (m_solid_stretch_size == 1)
        {
            // kmer_end is the first indexed kmer after a gap.
            m_kmer_end = *m_it_kmer;
        }
    }
    else
    {
        // Non-indexed kmer: the gap grows and the solid stretch is reset.
        if (m_solid_stretch_size == 1)
        {
            // The previous kmer was an isolated solid kmer: count it as part of the gap.
            m_gap_stretch_size = m_gap_stretch_size + m_solid_stretch_size;
        }

        if (m_solid_stretch_size > 1 && m_previous_kmer.isValid())
        {
            // Beginning of a non-indexed zone.
            m_kmer_begin = m_previous_kmer;
            m_kmer_begin_is_repeated = m_current_info.is_repeated;
        }

        ++m_gap_stretch_size;
        m_solid_stretch_size = 0;
    }
}
/** Registers an observer called when the end of a gap is detected.
 *  A reference is taken on the observer; it is released in the destructor.
 */
template
void FindBreakpoints::addGapObserver(IFindObserver* new_obs)
{
    new_obs->use();
    gap_obs.push_back(new_obs);
}
/** Registers an observer called on every notified kmer.
 *  A reference is taken on the observer; it is released in the destructor.
 */
template
void FindBreakpoints::addKmerObserver(IFindObserver* new_obs)
{
    new_obs->use();
    kmer_obs.push_back(new_obs);
}
/** Writes one breakpoint to the finder's breakpoint file as two FASTA-like records:
 *  the left kmer, then the right kmer.
 *  \param bkt_id      breakpoint identifier
 *  \param chrom_name  name of the sequence
 *  \param position    0-based position; written 1-based
 *  \param kmer_begin  left kmer sequence
 *  \param kmer_end    right kmer sequence
 *  \param repeat_size value of the "fuzzy" field
 *  \param type        breakpoint type label
 *  \param repeat_in_genome_kmer_begin when true the left record is tagged "REPEATED"
 *  \param repeat_in_genome_kmer_end   when true the right record is tagged "REPEATED"
 */
template
void FindBreakpoints::writeBreakpoint(int bkt_id, string& chrom_name, uint64_t position, string& kmer_begin, string& kmer_end, int repeat_size, string type, bool repeat_in_genome_kmer_begin, bool repeat_in_genome_kmer_end ){
    // Switch to 1-based. The explicit cast makes the argument type match the %llu conversion
    // (uint64_t is not guaranteed to be a long long type).
    const unsigned long long position_1based = static_cast<unsigned long long>(position + 1);

    fprintf(this->finder->_breakpoint_file,">bkpt%i_%s_pos_%llu_fuzzy_%i_%s %s left_kmer\n%s\n>bkpt%i_%s_pos_%llu_fuzzy_%i_%s %s right_kmer\n%s\n",
            bkt_id,
            chrom_name.c_str(),
            position_1based,
            repeat_size,
            type.c_str(),
            repeat_in_genome_kmer_begin ? "REPEATED" : "",
            kmer_begin.c_str(),
            bkt_id,
            chrom_name.c_str(),
            position_1based,
            repeat_size,
            type.c_str(),
            repeat_in_genome_kmer_end ? "REPEATED" : "",
            kmer_end.c_str()
            );
}
/** Writes one variant record to the finder's VCF file (filter PASS, genotype 1/1).
 *  \param bkt_id      breakpoint identifier, written as "bkpt<id>"
 *  \param chrom_name  name of the sequence
 *  \param position    0-based position; VCF is 1-based, so position+1 is written
 *  \param ref_char    reference allele (NUL-terminated)
 *  \param alt_char    alternative allele (NUL-terminated)
 *  \param repeat_size value of the FUZZY field
 *  \param type        variant type label; for STR_DEL_TYPE the LEN field is strlen(ref_char)-1, otherwise 1
 */
template
void FindBreakpoints::writeVcfVariant(int bkt_id, string& chrom_name, uint64_t position, char* ref_char, char* alt_char, int repeat_size, string type){
    int variant_size = 1;
    if (strcmp(type.c_str(), STR_DEL_TYPE) == 0){
        variant_size = strlen(ref_char) - 1;
    }

    // The explicit cast makes the argument type match the %llu conversion
    // (uint64_t is not guaranteed to be a long long type).
    const unsigned long long position_1based = static_cast<unsigned long long>(position + 1);

    fprintf(this->finder->_vcf_file,"%s\t%llu\tbkpt%i\t%s\t%s\t.\tPASS\tTYPE=%s;LEN=%i;FUZZY=%i\tGT\t1/1\n",
            chrom_name.c_str(),
            position_1based,
            bkt_id,
            ref_char,
            alt_char,
            type.c_str(),
            variant_size,
            repeat_size
            );
}
/** Writes one insertion record (TYPE=INS, filter PASS) to the finder's VCF file.
 *  \param bkt_id      breakpoint identifier, written as "bkpt<id>"
 *  \param chrom_name  name of the sequence
 *  \param position    0-based position; VCF is 1-based, so position+1 is written
 *  \param ref_string  reference allele
 *  \param alt_string  alternative allele; the LEN field is its length minus 1
 *  \param repeat_size value of the FUZZY field
 *  \param type        "HOM" gives genotype 1/1, "HET" gives 0/1, anything else gives ./.
 */
template
void FindBreakpoints::writeIndel(int bkt_id, string &chrom_name, uint64_t position, string ref_string, string alt_string, int repeat_size, string type)
{
    int variant_size = alt_string.length() - 1;

    string GT = "./.";
    if (type == "HOM")
    {
        GT = "1/1";
    }
    if (type == "HET")
    {
        GT = "0/1";
    }

    // The explicit cast makes the argument type match the %llu conversion
    // (uint64_t is not guaranteed to be a long long type).
    const unsigned long long position_1based = static_cast<unsigned long long>(position + 1);

    fprintf(this->finder->_vcf_file, "%s\t%llu\tbkpt%i\t%s\t%s\t.\tPASS\tTYPE=INS;LEN=%i;FUZZY=%i\tGT\t%s\n",
            chrom_name.c_str(),
            position_1based,
            bkt_id,
            ref_string.c_str(),
            alt_string.c_str(),
            variant_size,
            repeat_size,
            GT.c_str());
}
/*Getter*/
/** In-degree of the given node in the finder's graph. */
template
int FindBreakpoints::node_in_branch(Node& kmer_node)
{
    const int in_degree = finder->_graph.indegree(kmer_node);
    return in_degree;
}
/** Out-degree of the given node in the finder's graph. */
template
int FindBreakpoints::node_out_branch(Node& kmer_node)
{
    const int out_degree = finder->_graph.outdegree(kmer_node);
    return out_degree;
}
/** Current breakpoint identifier. */
template
uint64_t FindBreakpoints::breakpoint_id()
{
    return m_breakpoint_id;
}
/** Position of the kmer being processed on the current sequence. */
template
uint64_t FindBreakpoints::position()
{
    return m_position;
}
/** Data buffer of the sequence being processed (NULL before the first sequence is loaded). */
template
char * FindBreakpoints::chrom_seq()
{
    return m_chrom_sequence;
}
/** Short comment (name) of the sequence being processed. */
template
string& FindBreakpoints::chrom_name()
{
    return m_chrom_name;
}
/** Kmer model built from the finder's kmer size. */
template
typename FindBreakpoints::KmerModel& FindBreakpoints::model()
{
    return m_model;
}
/** Kmer size configured in the finder. */
template
size_t FindBreakpoints::kmer_size()
{
    return finder->_kmerSize;
}
/** Max repeat size configured in the finder. */
template
int FindBreakpoints::max_repeat()
{
    return finder->_max_repeat;
}
/** SNP minimal validation value configured in the finder. */
template
int FindBreakpoints::snp_min_val()
{
    return finder->_snp_min_val;
}
/** Branching filter threshold configured in the finder. */
template
int FindBreakpoints::branching_threshold()
{
    return finder->_branching_threshold;
}
/*Kmer related object*/
/** Last solid kmer before the current gap. */
template
typename FindBreakpoints::KmerCanonical& FindBreakpoints::kmer_begin()
{
    return m_kmer_begin;
}
/** First solid kmer after the current gap. */
template
typename FindBreakpoints::KmerCanonical& FindBreakpoints::kmer_end()
{
    return m_kmer_end;
}
/** Size of the current solid stretch. */
template
uint64_t FindBreakpoints::solid_stretch_size()
{
    // m_solid_stretch_size is an integer member, not a callable: return its value.
    return this->m_solid_stretch_size;
}
/** Size of the current gap stretch. */
template
uint64_t FindBreakpoints::gap_stretch_size()
{
    return m_gap_stretch_size;
}
/** Value of the finder's homo-only flag. */
template
bool FindBreakpoints::homo_only()
{
    return finder->_homo_only;
}
/** Information recorded by store_kmer_info for the kmer being processed. */
template
typename FindBreakpoints::info_type& FindBreakpoints::current_info()
{
    return m_current_info;
}
/** Current value of the recent-hetero counter. */
template
int FindBreakpoints::recent_hetero()
{
    return m_recent_hetero;
}
/** Repeat flag computed by store_kmer_info on the k-1 prefix of the current kmer. */
template
bool FindBreakpoints::kmer_end_is_repeated()
{
    return m_kmer_end_is_repeated;
}
/** Repeat flag recorded by notify when kmer_begin was last set. */
template
bool FindBreakpoints::kmer_begin_is_repeated()
{
    return m_kmer_begin_is_repeated;
}
/** Entry of the 256-slot history buffer at the given index. */
template
typename FindBreakpoints::info_type& FindBreakpoints::het_kmer_history(unsigned char index)
{
    // An unsigned char index can never leave the 256-entry array.
    return m_het_kmer_history[index];
}
/** Current begin index in the history buffer. */
template
unsigned char FindBreakpoints::het_kmer_begin_index()
{
    return m_het_kmer_begin_index;
}
/** Current end index in the history buffer. */
template
unsigned char FindBreakpoints::het_kmer_end_index()
{
    return m_het_kmer_end_index;
}
/** True when the finder's graph contains the given node.
 *  Stricter variants (additionally requiring in-degree or out-degree >= 1, or both)
 *  were tried in the past and are not used.
 */
template
bool FindBreakpoints::graph_contains(Node& kmer_node)
{
    const bool is_indexed = finder->_graph.contains(kmer_node);
    return is_indexed;
}
/** True when the reference repeat bloom reports the given kmer as present. */
template
bool FindBreakpoints::ref_bloom_contains(KmerType kmer)
{
    const bool is_present = m_ref_bloom->contains(kmer);
    return is_present;
}
/*Iterater*/
/** Advances the breakpoint identifier and returns the value it had before. */
template
uint64_t FindBreakpoints::breakpoint_id_iterate()
{
    const uint64_t previous = m_breakpoint_id;
    m_breakpoint_id = previous + 1;
    return previous;
}
/** Increments the finder's homo fuzzy counter and returns its previous value. */
template
int FindBreakpoints::homo_fuzzy_iterate()
{
    const int previous = finder->_nb_homo_fuzzy;
    finder->_nb_homo_fuzzy = previous + 1;
    return previous;
}
/** Increments the finder's homo clean counter and returns its previous value. */
template
int FindBreakpoints::homo_clean_iterate()
{
    const int previous = finder->_nb_homo_clean;
    finder->_nb_homo_clean = previous + 1;
    return previous;
}
/** Increments the finder's hetero fuzzy counter and returns its previous value. */
template
int FindBreakpoints::hetero_fuzzy_iterate()
{
    const int previous = finder->_nb_hetero_fuzzy;
    finder->_nb_hetero_fuzzy = previous + 1;
    return previous;
}
/** Increments the finder's hetero clean counter and returns its previous value. */
template
int FindBreakpoints::hetero_clean_iterate()
{
    const int previous = finder->_nb_hetero_clean;
    finder->_nb_hetero_clean = previous + 1;
    return previous;
}
/** Increments the finder's fuzzy deletion counter and returns its previous value. */
template
int FindBreakpoints::fuzzy_deletion_iterate()
{
    const int previous = finder->_nb_fuzzy_deletion;
    finder->_nb_fuzzy_deletion = previous + 1;
    return previous;
}
/** Increments the finder's clean deletion counter and returns its previous value. */
template
int FindBreakpoints::clean_deletion_iterate()
{
    const int previous = finder->_nb_clean_deletion;
    finder->_nb_clean_deletion = previous + 1;
    return previous;
}
/** Increments the finder's solo SNP counter and returns its previous value. */
template
int FindBreakpoints::solo_snp_iterate()
{
    const int previous = finder->_nb_solo_snp;
    finder->_nb_solo_snp = previous + 1;
    return previous;
}
/** Increments the finder's multi SNP counter and returns its previous value. */
template
int FindBreakpoints::multi_snp_iterate()
{
    const int previous = finder->_nb_multi_snp;
    finder->_nb_multi_snp = previous + 1;
    return previous;
}
/** Increments the finder's backup counter and returns its previous value. */
template
int FindBreakpoints::backup_iterate()
{
    const int previous = finder->_nb_backup;
    finder->_nb_backup = previous + 1;
    return previous;
}
/** Increments the finder's homo clean indel counter and returns its previous value. */
template
int FindBreakpoints::homo_clean_indel_iterate()
{
    const int previous = finder->_nb_homo_clean_indel;
    finder->_nb_homo_clean_indel = previous + 1;
    return previous;
}
/** Increments the finder's homo fuzzy indel counter and returns its previous value. */
template
int FindBreakpoints::homo_fuzzy_indel_iterate()
{
    const int previous = finder->_nb_homo_fuzzy_indel;
    finder->_nb_homo_fuzzy_indel = previous + 1;
    return previous;
}
/** Increments the finder's hetero indel counter and returns its previous value. */
template
int FindBreakpoints::hetero_indel_iterate()
{
    const int previous = finder->_nb_hetero_indel;
    finder->_nb_hetero_indel = previous + 1;
    return previous;
}
/*Setter*/
/** Overwrites the recent-hetero counter with the given value. */
template
void FindBreakpoints::recent_hetero(int value)
{
    m_recent_hetero = value;
}
//todo later replace this by mphf+ abundance per kmer
/** Builds the bloom filter of the repeated (k-1)-mers of the reference genome.
 *  The (k-1)-mers of the reference bank are counted with a minimal abundance of
 *  _het_max_occ+1, the counts going through a temporary file that is removed before returning.
 *  \return the bloom filter; the constructor takes a reference on it with use()
 */
template
IBloom::KmerType>* FindBreakpoints::fillRefBloom(){
//Bloom of the repeated (k-1)mers of the reference genome
IBloom* ref_bloom = 0;
//solid kmers must be stored in a file; its name is derived from the output prefix
string tempFileName = this->finder->getInput()->getStr(STR_URI_OUTPUT)+"_trashme.h5";
// Parameters for SortingCountAlgorithm // all defaults, except abundance min, kmer size and output
IProperties* props = SortingCountAlgorithm<>::getDefaultProperties();
props->setInt (STR_KMER_ABUNDANCE_MIN, this->finder->_het_max_occ+1);
props->setInt (STR_KMER_SIZE, this->finder->_kmerSize-1);
props->setStr (STR_URI_OUTPUT, tempFileName);
//Remark : could re-use MAX_DISK or others from Finder options ? not necessary here, small counting in theory
//props->setStr (STR_MAX_DISK, this->finder->getInput()->getStr(STR_MAX_DISK));
// NOTE(review): props is not released in this function - confirm who owns it.
/** We create a DSK (kmer counting) instance and execute it. */
SortingCountAlgorithm sortingCount (this->finder->_refBank,props);
sortingCount.getInput()->add (0, STR_VERBOSE, 0);//do not show progress bar
sortingCount.execute();
// OLD WAY : Partition & solidCollection = storage->root().getGroup("dsk").getPartition ("solid");
Partition & solidCollection = * sortingCount.getSolidCounts();
/** We get the number of solid kmers. */
u_int64_t nb_solid = solidCollection.getNbItems();
/** parameters of the Bloom filter: 2 * 12 bits per solid kmer, with a floor of 1000 bits when nothing is solid */
float NBITS_PER_KMER = 12;
u_int64_t estimatedBloomSize = (u_int64_t) ((double)nb_solid * NBITS_PER_KMER * 2); //TODO *3 ?
if (estimatedBloomSize ==0 )
{
estimatedBloomSize = 1000;
}
// number of hash functions: floor(0.7 * 12) = 8
size_t nbHash = (int)floorf (0.7*NBITS_PER_KMER);
//iterator of KmerCount
Iterator* itKmers = this->finder->createIterator(
solidCollection.iterator(),
nb_solid
);
LOCAL (itKmers);
// building the bloom
BloomBuilder builder (estimatedBloomSize, nbHash, this->finder->_kmerSize-1, BLOOM_CACHE, this->finder->getDispatcher()->getExecutionUnitsNumber(), this->finder->_het_max_occ+1);
ref_bloom = builder.build (itKmers);
//cout << typeid(*ref_bloom).name() << endl; // to verify the type of bloom
// the temporary counting file is no longer needed once the bloom is built
System::file().remove(tempFileName);
return ref_bloom;
}
/** Records the information of the kmer currently pointed to by the kmer iterator:
 *  its forward value, its in/out degrees in the graph (0 when absent), whether its
 *  k-1 suffix is repeated in the reference (stored in m_current_info and in the
 *  history buffer) and whether its k-1 prefix is repeated (m_kmer_end_is_repeated).
 *  \param node graph node built from the current kmer
 */
template
void FindBreakpoints::store_kmer_info(Node node)
{
    const size_t sub_kmer_size = this->finder->_kmerSize - 1;

    // Mask keeping the 2*(k-1) low bits of a kmer.
    KmerType unit; unit.setVal(1);
    KmerType kminus1_mask = (unit << (sub_kmer_size * 2)) - unit;

    this->m_current_info.kmer = this->m_it_kmer->forward();

    if (!this->finder->_graph.contains(node))
    {
        this->m_current_info.nb_in = 0;
        this->m_current_info.nb_out = 0;
    }
    else
    {
        this->m_current_info.nb_in = this->finder->_graph.indegree (node);
        this->m_current_info.nb_out = this->finder->_graph.outdegree (node);
    }

    // k-1 suffix (the current kmer is a putative kmer_begin); the bloom is queried
    // with the canonical form, i.e. the smaller of the suffix and its reverse complement.
    KmerType suffix_fwd = this->m_it_kmer->forward() & kminus1_mask;
    KmerType suffix_rc = revcomp(suffix_fwd, sub_kmer_size);
    this->m_current_info.is_repeated = this->m_ref_bloom->contains(min(suffix_fwd, suffix_rc));

    // The history slot is written once is_repeated is known.
    this->m_het_kmer_history[m_het_kmer_end_index] = m_current_info;

    // k-1 prefix: shift out the last nucleotide (2 bits), then apply the mask.
    KmerType prefix_fwd = (this->m_it_kmer->forward() >> 2) & kminus1_mask;
    KmerType prefix_rc = revcomp(prefix_fwd, sub_kmer_size);
    this->m_kmer_end_is_repeated = this->m_ref_bloom->contains(min(prefix_fwd, prefix_rc));
}
#endif /* _TOOL_FindBreakpoints_HPP_ */
MindTheGap-2.3.0/src/GraphAnalysis.hpp 0000644 0001750 0001750 00000005366 14230013106 017067 0 ustar nilesh nilesh /*****************************************************************************
* MindTheGap: Integrated detection and assembly of insertion variants
* A tool from the GATB (Genome Assembly Tool Box)
* Copyright (C) 2014 INRIA
* Authors: C.Lemaitre, G.Rizk, R. Chikhi
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see .
*****************************************************************************/
#include
//USE_NEW_CXX variable defined in CMakeList.txt of gatb-core : depending on the compil version unordered_map is not in the same location...
#include
#define NS_TR1_PREFIX std
#include
#include
#include
#include
#include
using namespace std;
// path type
typedef vector unlabeled_path;
// Declares the state and the path-enumeration entry points used to analyse a graph
// loaded from a file; only declarations are visible here.
// NOTE(review): the template arguments of the containers below look stripped
// (e.g. "set>", "unordered_map node_sequences") - restore them from the upstream header.
class GraphAnalysis {
public:
static const int max_breadth = 20; //changed from 10 to 20
string prefix;
FILE *graph_file;
int nb_nodes, nb_edges;
string node_identifier(int node);
int revcomp_node(int node);
// per-node data, stored in hash maps
NS_TR1_PREFIX::unordered_map node_sequences;
NS_TR1_PREFIX::unordered_map > out_edges;
NS_TR1_PREFIX::unordered_map > in_edges;
size_t _sizeKmer;
GraphAnalysis(string graph_file_name,size_t kmerSize);
// path enumeration: each entry point has a public form and a form carrying the current path and a call counter
set> find_all_paths(set terminal_nodes_with_endpos, bool &success);
set> find_all_paths(int start_node, set terminal_nodes_with_endpos, unlabeled_path current_path, int &nb_calls, bool &success);
set> find_all_paths_rev(set< info_node_t > terminal_nodes_with_endpos);
set> find_all_paths_rev(int start_node, set< info_node_t > terminal_nodes_with_endpos, unlabeled_path current_path, int &nb_calls, bool &success, int &terminal_node, bkpt_t &target_id);
static int debug; // 0: no debug, 1: node id debug, 2: full sequence debug; useful to see the sequences of the traversed paths
std::vector paths_to_sequences(set paths, set< info_node_t > terminal_nodes_with_endpos);
};
MindTheGap-2.3.0/src/nwAlign/ 0000755 0001750 0001750 00000000000 14230013106 015176 5 ustar nilesh nilesh MindTheGap-2.3.0/src/nwAlign/nwalign.cpp 0000644 0001750 0001750 00000002351 14230013106 017342 0 ustar nilesh nilesh /*********************************************************************
Minimalist utility to use perform fast Needleman_Wunsch alignment outside of MindTheGap
Usage : nwalign < infile
Where infile is a two lines file with the two sequences to compare
Outputs identity score in stdout
*********************************************************************/
#include
#include
#include
using namespace std;
// Reads two sequences from standard input (one per line), aligns them with
// needleman_wunsch and prints the resulting score on standard output.
// Returns 0 on success and EXIT_FAILURE when an Exception is caught.
int main (int argc, char* argv[])
{
// We use a try/catch block since GATB functions may throw exceptions
try
{
int nbLine = 0;
string seq1;
string seq2;
float score;
// first line -> seq1, second line -> seq2; reading stops at a third line
for (std::string line; std::getline(std::cin, line);) {
nbLine += 1;
if (nbLine == 1){
seq1 = line;
} else if (nbLine == 2){
seq2 = line;
} else{
cout << "Only two lines expected" << endl;
break;
}
}
// NOTE(review): the score is computed and printed even when fewer or more than two lines were read.
score = needleman_wunsch(seq1,seq2, NULL, NULL, NULL);
cout << score << endl;
return 0;
}
catch (Exception& e)
{
std::cout << "EXCEPTION: " << e.getMessage() << std::endl;
return EXIT_FAILURE;
}
} MindTheGap-2.3.0/src/Finder.hpp 0000644 0001750 0001750 00000010002 14230013106 015510 0 ustar nilesh nilesh /*****************************************************************************
* MindTheGap: Integrated detection and assembly of insertion variants
* A tool from the GATB (Genome Assembly Tool Box)
* Copyright (C) 2014 INRIA
* Authors: C.Lemaitre, G.Rizk
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see .
*****************************************************************************/
#ifndef _TOOL_Finder_HPP_
#define _TOOL_Finder_HPP_
/********************************************************************************/
#include
using namespace std;
/********************************************************************************/
// Option-style strings (all start with '-'); presumably the command-line option names of the tool.
static const char* STR_URI_REF = "-ref";
static const char* STR_MAX_REPEAT = "-max-rep";
static const char* STR_HET_MAX_OCC = "-het-max-occ";
static const char* STR_SNP_MIN_VAL = "-snp-min-val";
static const char* STR_BRANCHING_FILTER = "-branching-filter";
static const char* STR_HOMO_ONLY = "-homo-only";
static const char* STR_INSERT_ONLY = "-insert-only";
static const char* STR_SNP_ONLY = "-snp-only";
static const char* STR_DELETION_ONLY = "-deletion-only";
static const char* STR_HETERO_ONLY = "-hete-only";
static const char* STR_NO_BACKUP = "-no-backup";
static const char* STR_WITH_BACKUP = "-backup";
static const char* STR_NO_SNP = "-no-snp";
static const char* STR_NO_INSERT = "-no-insert";
static const char* STR_NO_DELETION = "-no-deletion";
static const char* STR_NO_HETERO = "-no-hetero";
static const char* STR_BED = "-bed";
// Type labels passed as the `type` argument of the breakpoint / VCF writers.
static const char* STR_HOM_TYPE = "HOM";
static const char* STR_HET_TYPE = "HET";
static const char* STR_SNP_TYPE = "SNP";
static const char* STR_MSNP_TYPE = "MSNP";
static const char* STR_DEL_TYPE = "DEL";
static const char* STR_BKP_TYPE = "BACKUP";
/** Tool holding the options, the graph, the input/output files and the result
 *  counters used by FindBreakpoints to scan the reference genome.
 */
class Finder : public Tool
{
public:
// Constructor
Finder ();
~Finder ();
void FinderHelp();
const char* _mtg_version;
// kmer size; FindBreakpoints builds its kmer model from it
size_t _kmerSize;
// graph queried by FindBreakpoints for node membership and in/out degrees
Graph _graph;
//parameters
int _max_repeat;
int _het_max_occ; // _het_max_occ+1 is the minimal abundance of a repeated (k-1)-mer of the reference
int _snp_min_val;
int _branching_threshold;
int _nbCores;
bool _homo_only;
bool _homo_insert;
bool _hete_insert;
bool _snp;
bool _backup;
bool _deletion;
bool _small_homo;
bool _small_hetero;
//input/output files
IBank* _refBank; // reference sequences scanned by FindBreakpoints
string _breakpoint_file_name;
FILE * _breakpoint_file; // written by FindBreakpoints::writeBreakpoint
string _vcf_file_name;
FILE * _vcf_file; // written by FindBreakpoints::writeVcfVariant and writeIndel
string _bed_file_name; // empty: the whole reference is scanned; otherwise only the BED intervals
//results statistics (each counter is incremented through the matching FindBreakpoints *_iterate method)
int _nb_homo_clean;
int _nb_homo_fuzzy;
int _nb_hetero_clean;
int _nb_hetero_fuzzy;
int _nb_fuzzy_deletion;
int _nb_clean_deletion;
int _nb_solo_snp;
int _nb_multi_snp;
int _nb_backup;
int _nb_homo_clean_indel;
int _nb_homo_fuzzy_indel;
int _nb_hetero_indel;
// Actual job done by the tool is here
void execute ();
private:
/** fills getInfo() with the parameter information
*/
void resumeParameters();
/** fills getInfo() with the result information
* \param seconds running time
*/
void resumeResults(double seconds);
/** writes the header of the vcf file
*/
void writeVcfHeader();
/** Create and use FindBreakpoints class to find gaps in the reference genome
*/
template
struct runFindBreakpoints { void operator () (Finder* object); };
};
/********************************************************************************/
#endif /* _TOOL_Finder_HPP_ */
MindTheGap-2.3.0/src/FindBackup.hpp 0000644 0001750 0001750 00000004545 14230013106 016326 0 ustar nilesh nilesh /*****************************************************************************
* MindTheGap: Integrated detection and assembly of insertion variants
* A tool from the GATB (Genome Assembly Tool Box)
* Copyright (C) 2014 INRIA
* Authors: C.Lemaitre, G.Rizk, P.Marijon
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see .
*****************************************************************************/
#ifndef _TOOL_FindBackup_HPP_
#define _TOOL_FindBackup_HPP_
/*******************************************************************************/
#include
#include
/** Gap observer reporting a "backup" breakpoint for a gap longer than half the
 *  kmer size whose two flanking kmers are valid.
 */
template
class FindBackup : public IFindObserver
{
public :
/** \copydoc IFindObserver::IFindObserver
*/
FindBackup(FindBreakpoints * find);
/** \copydoc IFindObserver::update
*/
bool update();
};
/** Forwards the FindBreakpoints instance to the IFindObserver base; no other state is kept. */
template
FindBackup::FindBackup(FindBreakpoints * find)
    : IFindObserver(find)
{
}
/** Reports the current gap as a backup breakpoint when both flanking kmers are valid
 *  and the gap is longer than half the kmer size.
 *  \return true when a breakpoint was written, false otherwise
 */
template
bool FindBackup::update()
{
    // Both flanking kmers are required.
    if (!this->_find->kmer_begin().isValid() || !this->_find->kmer_end().isValid())
    {
        return false;
    }

    // Gaps of at most half a kmer are not reported here.
    if (this->_find->gap_stretch_size() <= (this->_find->kmer_size() / 2))
    {
        return false;
    }

    string left_kmer_str = this->_find->model().toString(this->_find->kmer_begin().forward());
    string right_kmer_str = this->_find->model().toString(this->_find->kmer_end().forward());
    string backup_chrom_name = this->_find->chrom_name() + "_backup";

    this->_find->writeBreakpoint(this->_find->breakpoint_id(),
                                 backup_chrom_name,
                                 this->_find->position() - 1,
                                 left_kmer_str,
                                 right_kmer_str,
                                 0,
                                 STR_BKP_TYPE);

    // One more breakpoint and one more backup in the statistics.
    this->_find->breakpoint_id_iterate();
    this->_find->backup_iterate();
    return true;
}
#endif /* _TOOL_FindBackup_HPP_ */
MindTheGap-2.3.0/src/FindSmallInsertion.hpp 0000644 0001750 0001750 00000017454 14230013106 020067 0 ustar nilesh nilesh /*****************************************************************************
* MindTheGap: Integrated detection and assembly of insertion variants
* A tool from the GATB (Genome Assembly Tool Box)
* Copyright (C) 2022 INRIA
* Authors: C. Lemaitre, G. Rizk, P. Marijon, W. Delage
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see .
*****************************************************************************/
#ifndef FINDSMALLINSERTION_HPP_
#define FINDSMALLINSERTION_HPP_
//**********************************
#include
#include
/** Gap observer looking for small clean insertions; see update(). */
template
class FindSmallCleanInsertion : public IFindObserver
{
public :
// kmer types used by update() to iterate over the kmers of a candidate sequence
typedef typename gatb::core::kmer::impl::Kmer Kmer;
typedef typename Kmer::ModelCanonical KmerModel;
typedef typename KmerModel::Iterator KmerIterator;
public:
/** \copydoc IFindObserver::IFindObserver
*/
FindSmallCleanInsertion(FindBreakpoints * find);
/** \copydoc IFindObserver::update
*/
bool update();
};
/** Forwards \a find to the IFindObserver base-class constructor; no other state is set up. */
template
FindSmallCleanInsertion::FindSmallCleanInsertion(FindBreakpoints * find)
    : IFindObserver(find)
{
}
/**
 * Tries to explain a gap of exactly kmer_size - 1 by a 1 or 2 bp insertion.
 *
 * Each candidate insertion is placed between the two flanking k-mers and the
 * k-mers of the resulting sequence are looked up with contains(). The first
 * candidate that reaches kmer_size consecutive hits is written as an indel.
 *
 * \return true when an indel was written, false otherwise.
 */
template
bool FindSmallCleanInsertion::update()
{
    // Both flanking k-mers must be valid.
    if (!this->_find->kmer_begin().isValid() || !this->_find->kmer_end().isValid())
    {
        return false;
    }

    // Check size of gap: only a gap of exactly k-1 is handled here.
    if (this->_find->gap_stretch_size() != (this->_find->kmer_size()-1))
    {
        return false;
    }

    // Flanking k-mer sequences; the reference allele is the last base of the left one.
    string left_kmer_str = this->_find->model().toString(this->_find->kmer_begin().forward());
    string right_kmer_str = this->_find->model().toString(this->_find->kmer_end().forward());
    string ref = left_kmer_str.substr(left_kmer_str.size()-1,1);

    // All possible insertions of size 1 and 2, tried in this order.
    char candidates[20][6] = {"A","C","G","T","AA","AC","AG","AT","CA","CC","CG","CT","GA","GC","GG","GT","TA","TC","TG","TT"};

    KmerModel local_model(this->_find->kmer_size());
    KmerIterator kmer_it(local_model);
    std::string candidate_seq;
    string alt_allele;
    bool insertion_found = false;

    // Micro guided assembly: stop at the first candidate that is supported.
    for (int idx = 0; idx < 20 && !insertion_found; ++idx)
    {
        candidate_seq = left_kmer_str + candidates[idx] + right_kmer_str;
        Data local_d(const_cast(candidate_seq.c_str()));
        int nb_present = 0;

        // Point the Data object at the candidate sequence before iterating over it.
        local_d.setRef(const_cast(candidate_seq.c_str()), (size_t)candidate_seq.length());
        kmer_it.setData(local_d);

        for (kmer_it.first(); !kmer_it.isDone(); kmer_it.next())
        {
            // The first absent k-mer ends the scan of this candidate.
            if (!this->contains(kmer_it->forward()))
            {
                break;
            }
            ++nb_present;

            if (nb_present == this->_find->kmer_size())
            {
                alt_allele = ref + candidates[idx];
                insertion_found = true;
            }
        }
    }

    if (!insertion_found)
    {
        return false;
    }

    this->_find->writeIndel(this->_find->breakpoint_id(),this->_find->chrom_name(),this->_find->position()-2, ref, alt_allele, 0, STR_HOM_TYPE);
    this->_find->homo_clean_indel_iterate();
    this->_find->breakpoint_id_iterate();
    return true;
}
///*
/**
 * Observer reporting small (1 or 2 bp) insertions whose gap is shorter than
 * kmer_size - 1 by at most max_repeat(), i.e. with a short repeat at the site.
 */
template
class FindSmallFuzzyInsertion : public IFindObserver
{
public:

    typedef typename gatb::core::kmer::impl::Kmer Kmer;
    typedef typename Kmer::ModelCanonical KmerModel;
    typedef typename KmerModel::Iterator KmerIterator;

    /** \copydoc IFindObserver::IFindObserver
     */
    FindSmallFuzzyInsertion(FindBreakpoints * find);

    /** \copydoc IFindObserver::update
     */
    bool update();
};
/** Forwards \a find to the IFindObserver base-class constructor; no other state is set up. */
template
FindSmallFuzzyInsertion::FindSmallFuzzyInsertion(FindBreakpoints * find)
    : IFindObserver(find)
{
}
/**
 * Tries to explain a gap slightly shorter than kmer_size - 1 by a 1 or 2 bp
 * insertion with a repeat of size (kmer_size - 1 - gap) at the site.
 *
 * The right k-mer is re-read from the reference sequence, shifted by the
 * repeat size, and each candidate insertion is checked k-mer by k-mer with
 * contains(). The first supported candidate is written as an indel.
 *
 * \return true when an indel was written, false otherwise.
 */
template
bool FindSmallFuzzyInsertion::update()
{
    // Both flanking k-mers must be valid.
    if (!this->_find->kmer_begin().isValid() || !this->_find->kmer_end().isValid())
    {
        return false;
    }

    // The gap has to be shorter than k-1, but by no more than max_repeat().
    const bool gap_is_fuzzy =
        this->_find->gap_stretch_size() < this->_find->kmer_size() - 1
        && this->_find->gap_stretch_size() >= this->_find->kmer_size() - 1 - this->_find->max_repeat();
    if (!gap_is_fuzzy)
    {
        return false;
    }

    int repeat_size = this->_find->kmer_size() - 1 - this->_find->gap_stretch_size();

    // Left k-mer comes from the model, right k-mer from the reference, shifted by the repeat.
    string left_kmer_str = this->_find->model().toString(this->_find->kmer_begin().forward());
    string right_kmer_str = string(&(this->_find->chrom_seq()[this->_find->position() - 1 + repeat_size]), this->_find->kmer_size());

    // Give up when the left k-mer has no outgoing branch, the end k-mer has no
    // incoming branch, or the shifted reference k-mer cannot be encoded.
    if ((this->nb_out_branch(this->_find->kmer_begin().forward())==0) || (this->nb_in_branch(this->_find->kmer_end().forward())==0) || (!this->_find->model().codeSeed(&(this->_find->chrom_seq()[this->_find->position() - 1 + repeat_size]),Data::ASCII).isValid()))
    {
        return false;
    }

    // Reference allele: the base located repeat_size positions before the end of the left k-mer.
    string ref = left_kmer_str.substr(left_kmer_str.size()-1-repeat_size,1);

    // All possible insertions of size 1 and 2, tried in this order.
    char candidates[20][6] = {"A","C","G","T","AA","AC","AG","AT","CA","CC","CG","CT","GA","GC","GG","GT","TA","TC","TG","TT"};

    KmerModel local_model(this->_find->kmer_size());
    KmerIterator kmer_it(local_model);
    std::string candidate_seq;
    string alt_allele;
    bool insertion_found = false;

    // Stop at the first candidate that is supported.
    for (int idx = 0; idx < 20 && !insertion_found; ++idx)
    {
        candidate_seq = left_kmer_str + candidates[idx] + right_kmer_str;
        Data local_d(const_cast(candidate_seq.c_str()));
        int nb_present = 0;

        // Point the Data object at the candidate sequence before iterating over it.
        local_d.setRef(const_cast(candidate_seq.c_str()), (size_t)candidate_seq.length());
        kmer_it.setData(local_d);

        for (kmer_it.first(); !kmer_it.isDone(); kmer_it.next())
        {
            // The first absent k-mer ends the scan of this candidate.
            if (!this->contains(kmer_it->forward()))
            {
                break;
            }
            ++nb_present;

            if (nb_present == this->_find->kmer_size())
            {
                alt_allele = ref + candidates[idx];
                insertion_found = true;
            }
        }
    }

    if (!insertion_found)
    {
        return false;
    }

    this->_find->writeIndel(this->_find->breakpoint_id(),this->_find->chrom_name(),this->_find->position()- 2, ref, alt_allele, repeat_size, STR_HOM_TYPE);
    this->_find->homo_clean_indel_iterate();
    this->_find->breakpoint_id_iterate();
    return true;
}
#endif // FINDSMALLINSERTION_HPP_
MindTheGap-2.3.0/src/FindHeteroInsertion.hpp 0000644 0001750 0001750 00000017631 14230013106 020242 0 ustar nilesh nilesh /*****************************************************************************
* MindTheGap: Integrated detection and assembly of insertion variants
* A tool from the GATB (Genome Assembly Tool Box)
* Copyright (C) 2014 INRIA
* Authors: C.Lemaitre, G.Rizk, P.Marijon
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*****************************************************************************/
#ifndef _TOOL_FindHetero_HPP_
#define _TOOL_FindHetero_HPP_
/*******************************************************************************/
#include
#include
/**
 * Observer reporting heterozygous insertion sites, either as a small
 * (1 or 2 bp) indel or as a breakpoint for a possibly larger insertion.
 */
template
class FindHeteroInsertion : public IFindObserver
{
public:

    typedef typename gatb::core::kmer::impl::Kmer Kmer;
    typedef typename Kmer::ModelCanonical KmerModel;
    typedef typename KmerModel::Iterator KmerIterator;

    /** \copydoc IFindObserver::IFindObserver
     */
    FindHeteroInsertion(FindBreakpoints * find);

    /** \copydoc IFindObserver::update
     */
    bool update();
};
/** Forwards \a find to the IFindObserver base-class constructor; no other state is set up. */
template
FindHeteroInsertion::FindHeteroInsertion(FindBreakpoints * find)
    : IFindObserver(find)
{
}
/**
 * Looks for a heterozygous insertion site at the current position.
 *
 * Does nothing (returns false) when homo_only() is set. Otherwise, when the
 * current k-mer has exactly two incoming branches, is not repeated and no
 * hetero site was reported recently, the history of previous k-mers is
 * scanned for a matching k-mer with exactly two outgoing branches, for
 * repeat sizes from 0 to max_repeat(). A match is first tested as a 1 or 2 bp
 * insertion (written with writeIndel); failing that it is written as a
 * breakpoint, subject to the branching filter.
 *
 * \return true when an indel or a breakpoint was written, false otherwise.
 */
template
bool FindHeteroInsertion::update()
{
if(!this->_find->homo_only())
{
// branching filter parameters
int branching_threshold = this->_find->branching_threshold(); //max number of branching kmers in the 100 bp window of previous kmers
int max_branching_kmers = branching_threshold;
bool filtering = true;
// a negative threshold disables the filter; with nb_branching left at 0 the later
// "nb_branching <= max_branching_kmers" test then always passes
if (branching_threshold<0){
filtering = false;
max_branching_kmers = 100;
}
int filter_window_size = 100 ; //should not be larger than the size of het_kmer_history = 256
// hetero site detection
// candidate end k-mer: not repeated, exactly two incoming branches, and no hetero site reported recently
if(!this->_find->kmer_end_is_repeated() && this->_find->current_info().nb_in == 2 && !this->_find->recent_hetero())
{
//loop over putative repeat size (0=clean, >0 fuzzy), reports only the smallest repeat size found.
for(int i = 0; i <= this->_find->max_repeat(); i++)
{
bool found_base_one = false;
// candidate begin k-mer taken from the history: exactly two outgoing branches and not repeated
if(this->_find->het_kmer_history(this->_find->het_kmer_begin_index()+i).nb_out == 2 && !this->_find->het_kmer_history(this->_find->het_kmer_begin_index()+i).is_repeated)
{
//hetero breakpoint found
string kmer_begin_str = this->_find->model().toString(this->_find->het_kmer_history(this->_find->het_kmer_begin_index()+i).kmer);
//string kmer_end_str = this->_find->model().toString(this->_find->current_info().kmer);
//modif 15/06/2018 to check !!! (before in case of fuzzy>0, the end and right kmers overlapped, => insertion of wrong size (- fuzzy), missing the repeat + loss of recall if insertion of size < repeat)
// the end k-mer is read from the reference sequence, shifted by the repeat size i
string kmer_end_str = string(&(this->_find->chrom_seq()[this->_find->position() + i]), this->_find->kmer_size());
// reference allele: the base located i positions before the end of the begin k-mer
string ref = kmer_begin_str.substr(kmer_begin_str.size() - 1 - i, 1);
//Tests if this can be a small (1-2 bp) insertion
char nucleo[20][6] = {"A", "C", "G", "T", "AA", "AC", "AG", "AT", "CA", "CC", "CG", "CT", "GA", "GC", "GG", "GT", "TA", "TC", "TG", "TT"};
KmerModel local_m(this->_find->kmer_size());
KmerIterator local_it(local_m);
std::string seq;
string inser_base_one;
// abort the whole update if the shifted reference k-mer cannot be encoded
if (!this->_find->model().codeSeed(&(this->_find->chrom_seq()[this->_find->position() +i]),Data::ASCII).isValid())
{
return false;
}
for (int a = 0; a < 20; a++) // for all possible 1-2 bp insertions, perform a micro-assembly
{
seq = kmer_begin_str + nucleo[a] + kmer_end_str;
Data local_d(const_cast(seq.c_str()));
int sum_valid = 0;
// point the Data object at the candidate sequence before iterating over its k-mers
local_d.setRef(const_cast(seq.c_str()), (size_t)seq.length());
local_it.setData(local_d);
for (local_it.first(); !local_it.isDone(); local_it.next())
{
// count consecutive k-mers accepted by contains(); the first rejected one ends the scan
if (this->contains(local_it->forward()))
{
sum_valid++;
}
else
{
break;
}
// the candidate is accepted as soon as kmer_size consecutive k-mers were found
if (sum_valid == this->_find->kmer_size())
{
inser_base_one = ref + nucleo[a];
found_base_one = true;
}
}
// keep only the first supported candidate
if (found_base_one == true)
break;
}
if (found_base_one)
{
// small insertion: written as an indel, with i as the repeat size
this->_find->writeIndel(this->_find->breakpoint_id(), this->_find->chrom_name(), this->_find->position() - 1, ref, inser_base_one, i, STR_HET_TYPE);
this->_find->hetero_indel_iterate();
this->_find->breakpoint_id_iterate();
return true;
}
else
{
//this may be a large insertion
int nb_branching = 0;
//Applying the branching-filter :
if (filtering){
//counts the number of branching-kmers among the 100 previous ones
int nb_prev = 0;
unsigned char begin_index = this->_find->het_kmer_begin_index()-1;
// NOTE(review): the next line looks truncated in this copy of the file (the end of the
// while condition and the start of an inner if seem to be fused; filter_window_size is
// otherwise unused) - confirm against the upstream source before editing
while ((nb_branching <= max_branching_kmers) && (nb_prev_find->het_kmer_history(begin_index-nb_prev).nb_out >1 || this->_find->het_kmer_history(begin_index-nb_prev).nb_in >1 ){
nb_branching ++;
}
nb_prev++;
}
}
// the breakpoint is written only when the branching count does not exceed the limit
if(nb_branching <= max_branching_kmers){
this->_find->writeBreakpoint(this->_find->breakpoint_id(), this->_find->chrom_name(), this->_find->position() - 1 + i, kmer_begin_str, kmer_end_str, i, STR_HET_TYPE, this->_find->het_kmer_history(this->_find->het_kmer_begin_index() + i).is_repeated, this->_find->kmer_end_is_repeated());
this->_find->breakpoint_id_iterate();
// repeat size 0 is counted as a clean site, any other size as a fuzzy one
if (i == 0)
{
this->_find->hetero_clean_iterate();
}
else
{
this->_find->hetero_fuzzy_iterate();
}
this->_find->recent_hetero(this->_find->max_repeat()); // we found a breakpoint, the next hetero one must be at least _max_repeat apart from this one.
return true; //reports only the smallest repeat size found.
}
else{ // stop the loop over fuzzy size, because the branching context will remain not good for other fuzzy sizes
this->_find->recent_hetero(max(0, this->_find->recent_hetero() - 1)); // when recent_hetero=0 : we are sufficiently far from the previous hetero-site
return false;
}
}
}
}
}
// nothing reported at this position: decrease the recent_hetero counter, never below 0
this->_find->recent_hetero(max(0, this->_find->recent_hetero() - 1)); // when recent_hetero=0 : we are sufficiently far from the previous hetero-site
}
return false;
}
#endif /* _TOOL_FindHetero_HPP_ */
MindTheGap-2.3.0/src/GraphAnalysis.cpp 0000644 0001750 0001750 00000040011 14230013106 017044 0 ustar nilesh nilesh /*****************************************************************************
* MindTheGap: Integrated detection and assembly of insertion variants
* A tool from the GATB (Genome Assembly Tool Box)
* Copyright (C) 2014 INRIA
* Authors: C.Lemaitre, G.Rizk, R. Chikhi
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*****************************************************************************/
#include
#include
#include // for find()
int GraphAnalysis::debug = 0;
/*
* the graph (produced by GraphOutput) is loaded as a directed graph where
* each node and its revcomp are separated. For instance,
* node 0
* becomes:
* node "0f" and node "0r"
* enabling edges between, for instance
* 0 -> 1 [label="fr"]
* becomes:
* 0f -> 1r
*
* internal representation of "0f" is 0, and "0r" is 0+nb_nodes
*
* the following functions perform the conversion
*/
// Formats a node index into a fixed 10-byte buffer as a number followed by a string suffix.
// NOTE(review): the sprintf line below looks truncated in this copy of the file; the end of
// node_identifier and the start of the first find_all_paths overload appear to be fused on
// that line - confirm against the upstream source before editing.
string GraphAnalysis::node_identifier(int node)
{
char node_id[10];
sprintf(node_id,"%d%s",(node>nb_nodes)?(node-nb_nodes):node, (node> GraphAnalysis::find_all_paths(set< info_node_t > terminal_nodes_with_endpos, bool &success)
{
// entry point of the path search: success starts as true and is passed by reference to the recursive overload
success = true;
// the search starts from node 0, with a path that contains only that node
unlabeled_path start_path;
start_path.push_back(0);
// call counter handed to the recursive overload
int nb_calls = 0;
set> paths = find_all_paths(0, terminal_nodes_with_endpos, start_path, nb_calls, success);
//std::cout << "PATHS0 \n" << endl;
return paths;
}
// precondition: terminal_nodes is non-empty
set> GraphAnalysis::find_all_paths(int start_node, set< info_node_t > terminal_nodes_with_endpos, unlabeled_path current_path, int &nb_calls, bool &success)
{
//cout << nb_calls << endl;
set> paths;
// don't explore for too long
if (nb_calls++ > 10000000)
{
// printf("fail, max nb_calls reached \n");
success = false;
return paths;
}
for (set< info_node_t >::iterator it_targets = terminal_nodes_with_endpos.begin() ; it_targets != terminal_nodes_with_endpos.end() ; it_targets++)
{
if (it_targets->node_id == start_node )
{
pair found_path = make_pair(current_path,it_targets->targetId);
paths.insert(found_path);
return paths;
}
}
// if (terminal_nodes.find(start_node) != terminal_nodes.end()) //stops when reaches one of the terminal nodes.
// {
// paths.insert(current_path);
// return paths;
// }
// visit all neighbors
for(set