kissplice-2.1.0/modules/README000644 001751 001751 00000000317 12262534651 017151 0ustar00marchetmarchet000000 000000 To compile: make To run: ./run_modules edge_file node_file k_value [base_name] The output will be one edge_file and one node_file for each biconected component. They are written by default at ./bcc/. kissplice-2.1.0/modules/check_error_removal.cpp000644 001751 001751 00000017342 12262534651 023016 0ustar00marchetmarchet000000 000000 #include #include #include #include #include #include "LabelledCEdge.h" #define MAX 1024 using namespace std; struct NodeSeq { char *seq; int node; NodeSeq(char *initSeq, int initNode) : seq(initSeq), node(initNode) { } bool operator== (const NodeSeq& rhs) const { return (strcmp(seq, rhs.seq) == 0); } bool operator< (const NodeSeq& rhs) const { return (strcmp(seq, rhs.seq) < 0); } }; int count_nb_lines( FILE* file ) { int ch, number_of_lines = 0; while (EOF != (ch=getc(file))) if ('\n' == ch) number_of_lines++; // Set the cursor back to the begining of the file. rewind(file); // Don't care if the last line has a '\n' or not. We over-estimate it. return number_of_lines + 1; } FILE* open_file( char* filename ) { FILE* file = fopen( filename, "r" ); if ( file == NULL ) { fprintf( stderr, "Problem opening %s!\n", filename ); exit( EXIT_FAILURE ); } return file; } static char complement(char b) { switch(b) { case 'A': return 'T'; case 'T': return 'A'; case 'G': return 'C'; case 'C': return 'G'; case 'a': return 't'; case 't': return 'a'; case 'g': return 'c'; case 'c': return 'g'; case 'N': return 'N'; case '*': return '*'; } return '?'; } string reverse_complement(string seq) { string s(seq.begin(),seq.end()); string::iterator pos; for (pos = s.begin(); pos != s.end(); ++pos) { // cout << *pos; } // cout << endl; reverse(s.begin(), s.end()); for(pos=s.begin();pos!=s.end();++pos) *pos=complement(*pos); return s; } void readNodeFile(char* nodes_fname, vector& nodesF, vector& nodesR, int k_val ) { FILE* node_file = open_file(nodes_fname); char* buffer = new char[100 * MAX]; char* seq; nodesF.reserve(count_nb_lines(node_file)); nodesR.reserve(count_nb_lines(node_file)); while ( fgets(buffer, 100 * MAX, node_file) != NULL ) { char* p; if (strlen(buffer) == 100 * MAX) { p = strtok(buffer, "\t\n"); fprintf(stdout, "ERROR: node %s with sequence larger than %d!", p, 100 * MAX); exit(0); } // Node label, should be contiguous p = strtok( buffer, "\t\n" ); int node = atoi(p); // Node seq p = strtok( NULL, "\t\n" ); // We need the prefix of size k seq = new char[k_val + 1]; strncpy( seq, p, k_val); seq[k_val] = '\0'; nodesF.push_back( NodeSeq(seq, node) ); // And the prefix of size k in the reverse complement seq = new char[k_val + 1]; string rev_comp = reverse_complement(string(p)); strncpy( seq, rev_comp.c_str(), k_val); seq[k_val] = '\0'; nodesR.push_back( NodeSeq(seq, node) ); //printf("%d %s %s\n", node, p, rev_comp.c_str()); } delete [] buffer; fclose(node_file); } char *rev_seq = NULL; char *revComp(char *seq) { int size = strlen(seq); if (rev_seq == NULL) rev_seq = new char[size + 1]; for (int i = size - 1; i >= 0; i--) rev_seq[(size - 1) - i] = complement(seq[i]); rev_seq[size] = '\0'; return rev_seq; } int findNode(char *query, vector& nodes) { vector::iterator low, low_r; char *query_r = revComp(query); // The value of "x" in NodeSeq(query, x) doesn't matter low = lower_bound(nodes.begin(), nodes.end(), NodeSeq(query, 0)); low_r = lower_bound(nodes.begin(), nodes.end(), NodeSeq(query_r, 0)); if (low != nodes.end() && *low == NodeSeq(query, 0)) return low->node; if (low_r != nodes.end() && *low_r == NodeSeq(query_r, 0)) return low_r->node; return -1; } void readCounts(char* counts_fname, map& counts) { FILE *count_file = open_file(counts_fname); char* buffer = new char[100 * MAX]; while ( fgets(buffer, 100 * MAX, count_file) != NULL ) { char* p; // Sequence p = strtok( buffer, "\t\n " ); string kmer = p; // Count p = strtok( NULL, "\t\n " ); counts[kmer] = atoi(p); } delete [] buffer; fclose(count_file); } int kcounts(string query, map counts) { if (counts.find(query) != counts.end()) return counts[query]; if (counts.find(reverse_complement(query)) != counts.end()) return counts[reverse_complement(query)]; // fprintf(stderr, "k-mer not found!\n"); return 0; } char ACTG[4] = {'A','C','T','G'}; void readEdgeFile( char* nodes_fname, vector& edges ) { FILE *edge_file = open_file(nodes_fname); char* buffer = new char[100 * MAX]; char* u = new char[MAX]; char* v = new char[MAX]; char* label = new char[MAX]; edges.reserve(count_nb_lines(edge_file)); while ( fgets(buffer, 100 * MAX, edge_file) != NULL ) { char* p; // outgoing node p = strtok( buffer, "\t\n" ); strcpy( u, p ); // incoming node p = strtok( NULL, "\t\n" ); strcpy( v, p ); // edge label p = strtok( NULL, "\t\n" ); strcpy(label, p); edges.push_back( LabelledCEdge( atoi(u), atoi(v), label ) ); } sort( edges.begin(), edges.end() ); delete [] buffer; delete [] u; delete [] v; delete [] label; fclose(edge_file); } LabelledCEdge reverse(LabelledCEdge e) { char *label = new char[3]; label[0] = e.label[1] == 'F' ? 'R' : 'F'; label[1] = e.label[0] == 'F' ? 'R' : 'F'; label[2] = '\0'; return LabelledCEdge(e.getSecond(), e.getFirst(), label); } int findEdge(vector& allEdges, LabelledCEdge e) { vector::iterator low; low = lower_bound(allEdges.begin(), allEdges.end(), e); if (low != allEdges.end()) return low - allEdges.begin(); fprintf(stderr, "inconsistent graph!"); return -1; } int main( int argc, char** argv ) { if ( argc < 6 ) { fprintf( stderr, "Wrong number of arguments!\n" ); fprintf( stderr, "Usage: ./check_error_removal removed.edges nodes_file k_value k_mer_count_file cutoff\n" ); return 0; } int k_value = atoi( argv[3] ); double cutoff = atof( argv[5] ); // Read node file vector nodesF, nodesR; readNodeFile(argv[2], nodesF, nodesR, k_value ); // Read count file map counts; readCounts(argv[4], counts); // printf("\n"); // vector::iterator it; // for (it = nodesF.begin(); it != nodesF.end(); it++) // printf("%d %s %d\n", it->node, it->seq, kcounts(string(it->seq), counts)); // printf("\n"); // for (it = nodesR.begin(); it != nodesR.end(); it++) // printf("%d %s %d\n", it->node, it->seq, kcounts(string(it->seq), counts)); vector allEdges; readEdgeFile( argv[1], allEdges ); vector verified(allEdges.size(), false); for (int i = 0; i < (int)allEdges.size(); i++) if (!verified[i]) { int u = allEdges[i].getFirst(); int v = allEdges[i].getSecond(); string label = allEdges[i].label; string kmer, pref; if (label[1] == 'F') kmer = nodesF[v].seq; else kmer = nodesR[v].seq; pref = kmer.substr(0,k_value -1); int sum = 0; for (int j = 0; j < 4; j++) sum += kcounts(string(pref+ACTG[j]), counts); double ratio = (double)kcounts(kmer, counts) / (double)sum; //fprintf(stderr, "edge %d -> %d (%s) ratio = %lf\n", u, v, label.c_str(), ratio); if (ratio < cutoff) { verified[i] = true; verified[findEdge(allEdges, reverse(allEdges[i]))] = true; } } bool flag = true; for (int i = 0; i < (int)allEdges.size(); i++) if (!verified[i]) { int u = allEdges[i].getFirst(); int v = allEdges[i].getSecond(); string label = allEdges[i].label; fprintf(stderr, "edge %d -> %d (%s) should not be removed\n", u, v, label.c_str()); flag = false; } if (flag) fprintf(stderr, "Removed edges are correct!\n"); return 0; } kissplice-2.1.0/modules/memused000755 001751 001751 00000000444 12262534651 017657 0ustar00marchetmarchet000000 000000 #!/bin/bash "$@" & cd /proc/$! max=0 while [ -f status ] do sleep 0.1 if [ -f status ] then mem=`cat status | grep VmHWM | tr -s [:blank:] | cut -d ' ' -f 2` if [ "$mem" -gt "$max" ] then max=$mem fi fi; done echo "maximal memory used ( kilobyte(s) (K / Kb))" $max kissplice-2.1.0/modules/WeightedDigraph.h000644 001751 001751 00000001550 12262534651 021501 0ustar00marchetmarchet000000 000000 #ifndef WEIGHTED_DIGRAPH_H #define WEIGHTED_DIGRAPH_H #include using namespace std; class WeightedEdge{ public: int node; int cost; bool removed; WeightedEdge(int n, int nCost) : node(n), cost(nCost), removed(false) { } }; class WeightedDigraph{ public: vector > adj_list; vector removed; vector node_cost; WeightedDigraph(int nbNodes) : adj_list(nbNodes), removed(nbNodes, false), node_cost(nbNodes) { } void setArcStatus(bool status, int u, int v) { for (int i = 0; i < (int)adj_list[u].size(); i++) if (adj_list[u][i].node == v) adj_list[u][i].removed = status; } int adjListSz(int node) { int sz = 0; for (int i = 0; i < (int)adj_list[node].size(); i++) sz += (!adj_list[node][i].removed && !removed[adj_list[node][i].node]); return sz; } }; #endif kissplice-2.1.0/modules/CEdge.h000644 001751 001751 00000015663 12262534651 017423 0ustar00marchetmarchet000000 000000 /* *************************************************************************** * * KisSplice * de-novo calling alternative splicing events from RNA-seq data. * * *************************************************************************** * * Copyright INRIA * contributors : Vincent Lacroix * Pierre Peterlongo * Gustavo Sacomoto * Vincent Miele * Alice Julien-Laferriere * David Parsons * * pierre.peterlongo@inria.fr * vincent.lacroix@univ-lyon1.fr * * This software is a computer program whose purpose is to detect alternative * splicing events from RNA-seq data. * * This software is governed by the CeCILL license under French law and * abiding by the rules of distribution of free software. You can use, * modify and/ or redistribute the software under the terms of the CeCILL * license as circulated by CEA, CNRS and INRIA at the following URL * "http://www.cecill.info". * As a counterpart to the access to the source code and rights to copy, * modify and redistribute granted by the license, users are provided only * with a limited warranty and the software's author, the holder of the * economic rights, and the successive licensors have only limited * liability. * In this respect, the user's attention is drawn to the risks associated * with loading, using, modifying and/or developing or reproducing the * software by the user in light of its specific status of free software, * that may mean that it is complicated to manipulate, and that also * therefore means that it is reserved for developers and experienced * professionals having in-depth computer knowledge. Users are therefore * encouraged to load and test the software's suitability as regards their * requirements in conditions enabling the security of their systems and/or * data to be ensured and, more generally, to use and operate it in the * same conditions as regards security. * * The fact that you are presently reading this means that you have had * knowledge of the CeCILL license and that you accept its terms. */ #ifndef EDGE_H #define EDGE_H // =========================================================================== // Include Libraries // =========================================================================== #include #include #include #include // =========================================================================== // Include Project Files // =========================================================================== // =========================================================================== // Class declarations // =========================================================================== // =========================================================================== // Declare Used Namespaces // =========================================================================== using namespace std; //! Edge class /*! * \brief Class meant to be used by CGraph (compact graph) to represent edges. * * It encapsulates a pair of int (u -> v edge), each int (u or v) being a node id */ class CEdge { public : // ======================================================================= // Enums // ======================================================================= // ======================================================================= // Constructors // ======================================================================= CEdge( void ); CEdge( const CEdge& model ); CEdge( int f, int s ); // ======================================================================= // Destructor // ======================================================================= // ======================================================================= // Accessors: getters // ======================================================================= inline int getFirst( void ); inline int getSecond( void ); // ======================================================================= // Accessors: setters // ======================================================================= // ======================================================================= // Operators // ======================================================================= bool operator==( const CEdge& that ) const; bool operator!=( const CEdge& that ) const; bool operator<( const CEdge& that ) const; // ======================================================================= // Public Methods // ======================================================================= CEdge& swap_ends( void ); // ======================================================================= // Public Attributes // ======================================================================= protected : pair _pair; // ======================================================================= // Forbidden Constructors // ======================================================================= // ======================================================================= // Protected Methods // ======================================================================= // ======================================================================= // Protected Attributes // ======================================================================= }; // =========================================================================== // Getters' definitions // =========================================================================== inline int CEdge::getFirst( void ) { return _pair.first; } inline int CEdge::getSecond( void ) { return _pair.second; } // =========================================================================== // Setters' definitions // =========================================================================== // =========================================================================== // Inline Operators' definitions // =========================================================================== // =========================================================================== // Inline functions' definition // =========================================================================== #endif // EDGE_H kissplice-2.1.0/modules/BubbleEnumeration.h000644 001751 001751 00000011612 12262534651 022044 0ustar00marchetmarchet000000 000000 /* *************************************************************************** * * KisSplice * de-novo calling alternative splicing events from RNA-seq data. * * *************************************************************************** * * Copyright INRIA * contributors : Vincent Lacroix * Pierre Peterlongo * Gustavo Sacomoto * Vincent Miele * Alice Julien-Laferriere * David Parsons * * pierre.peterlongo@inria.fr * vincent.lacroix@univ-lyon1.fr * * This software is a computer program whose purpose is to detect alternative * splicing events from RNA-seq data. * * This software is governed by the CeCILL license under French law and * abiding by the rules of distribution of free software. You can use, * modify and/ or redistribute the software under the terms of the CeCILL * license as circulated by CEA, CNRS and INRIA at the following URL * "http://www.cecill.info". * As a counterpart to the access to the source code and rights to copy, * modify and redistribute granted by the license, users are provided only * with a limited warranty and the software's author, the holder of the * economic rights, and the successive licensors have only limited * liability. * In this respect, the user's attention is drawn to the risks associated * with loading, using, modifying and/or developing or reproducing the * software by the user in light of its specific status of free software, * that may mean that it is complicated to manipulate, and that also * therefore means that it is reserved for developers and experienced * professionals having in-depth computer knowledge. Users are therefore * encouraged to load and test the software's suitability as regards their * requirements in conditions enabling the security of their systems and/or * data to be ensured and, more generally, to use and operate it in the * same conditions as regards security. * * The fact that you are presently reading this means that you have had * knowledge of the CeCILL license and that you accept its terms. */ #include #include #include #include #include #include #include #include "NGraph.h" #include "Utils.h" #ifndef MOUTH_ENUMERATION_H #define MOUTH_ENUMERATION_H template bool read_edges_and_nodes_withoptimIO(char* filename_info, char* filename_contents_edge, char* filename_contents_node, char* filename_edge, char* filename_node, int *required_sequence, TEdgeFunctor edgefunctor, TNodeFunctor& nodefunctor) { //////////////////////////// //////////////////////////// //////////////////////////// // IO optimization (start) //////////////////////////// int bcc_size, records_per_file, number_of_files_max, file_index; FILE *info_file = fopen(filename_info, "r"); if (info_file == NULL) { fprintf(stderr, "Problem opening %s!\n", filename_info); exit(0); } FILE *contents_file_edge = fopen(filename_contents_edge, "r"); if (contents_file_edge == NULL) { fprintf(stderr, "Problem opening %s!\n", filename_contents_edge); exit(0); } FILE *contents_file_node = fopen(filename_contents_node, "r"); if (contents_file_node == NULL) { fprintf(stderr, "Problem opening %s!\n", filename_contents_node); exit(0); } // read the info file fscanf(info_file, "%d \n",&bcc_size ); fscanf(info_file, "%d \n",&records_per_file ); // find in which file is the required record number_of_files_max = NUMBEROFFILES; if (bcc_sizenumber_of_files_max) file_index = number_of_files_max; if ( (*required_sequence <=0) || (*required_sequence > bcc_size) ) { fprintf(stderr, "Problem opening sequence %d in edge/node files !\n", *required_sequence); exit(0); } // filenames char total_edge_fname[1024]; sprintf( total_edge_fname, "%s_%d",filename_edge,file_index ); char total_node_fname[1024]; sprintf( total_node_fname, "%s_%d",filename_node,file_index ); //////////////////////////// // IO optimization (end) //////////////////////////// //////////////////////////// //////////////////////////// FILE* edge_file = open_file(total_edge_fname); FILE* node_file = open_file(total_node_fname); bool atleast4nodes = read_node_noncontigous_file_withoptimIO( contents_file_node, node_file, required_sequence, &file_index, nodefunctor); if (atleast4nodes){ read_edge_file_withoptimIO( contents_file_edge, edge_file, required_sequence, &file_index, edgefunctor ); } fclose( contents_file_edge ); fclose( contents_file_node ); fclose( edge_file ); fclose( node_file ); return atleast4nodes; } #endif /* MOUTH_ENUMERATION_H */ kissplice-2.1.0/modules/CycleCompression.cpp000644 001751 001751 00000010662 12271425257 022263 0ustar00marchetmarchet000000 000000 /* *************************************************************************** * * KisSplice * de-novo calling alternative splicing events from RNA-seq data. * * *************************************************************************** * * Copyright INRIA * contributors : Vincent Lacroix * Pierre Peterlongo * Gustavo Sacomoto * Vincent Miele * Alice Julien-Laferriere * David Parsons * * pierre.peterlongo@inria.fr * vincent.lacroix@univ-lyon1.fr * * This software is a computer program whose purpose is to detect alternative * splicing events from RNA-seq data. * * This software is governed by the CeCILL license under French law and * abiding by the rules of distribution of free software. You can use, * modify and/ or redistribute the software under the terms of the CeCILL * license as circulated by CEA, CNRS and INRIA at the following URL * "http://www.cecill.info". * As a counterpart to the access to the source code and rights to copy, * modify and redistribute granted by the license, users are provided only * with a limited warranty and the software's author, the holder of the * economic rights, and the successive licensors have only limited * liability. * In this respect, the user's attention is drawn to the risks associated * with loading, using, modifying and/or developing or reproducing the * software by the user in light of its specific status of free software, * that may mean that it is complicated to manipulate, and that also * therefore means that it is reserved for developers and experienced * professionals having in-depth computer knowledge. Users are therefore * encouraged to load and test the software's suitability as regards their * requirements in conditions enabling the security of their systems and/or * data to be ensured and, more generally, to use and operate it in the * same conditions as regards security. * * The fact that you are presently reading this means that you have had * knowledge of the CeCILL license and that you accept its terms. */ #include #include #include #include #include #include "debug.h" #include "NGraph.h" #include "Utils.h" #include "CycleCompression.h" #define MAX 1024 //MAX2 and MIN2 never used? TODO: Remove? #define MAX2(a,b) ((a) > (b) ? (a) : (b)) #define MIN2(a,b) ((a) < (b) ? (a) : (b)) using namespace std; idx_dir reverse_dir(idx_dir node) { return make_pair(node.first, reverse_dir(node.second)); } bool is_same_node( const idx_dir u, const idx_dir v) { return (v.first == u.first && v.second == u.second); } /*! * \brief Merge sequence information between two nodes and print it in the snp file * with bbc number, cycle and length of upper and lower path. * Returns the merged sequence * \param seq1 sequence of the upper path * \param seq2 sequence of the lower path * \param snp_log_file the file to be printing the two paths * \param bccid id of the bi-connected component being treated */ string merge_sequences( const string seq1, const string seq2 ) { string merged; for (int i = 0 ; i < (int)seq1.size(); i++) merged += (seq1[i] == seq2[i] ? seq1[i] : 'N'); return merged; } void output_sequences( string seq1, string seq2, FILE * snp_log_file, const int bccid, const int cycleNum, const int contextL, const int contextR, const int kValue) { int u_len = (int)seq1.size(); int l_len = (int)seq2.size(); if( contextL != 0 || contextR !=0 ) { seq1 = toLowerContext(seq1, contextL, contextR); seq2 = toLowerContext(seq2, contextL, contextR); } if (u_len- (contextL + contextR) > 2* kValue +1) // multiple SNP { fprintf(snp_log_file, ">bcc_%d|Cycle_%d|Type_0b|upper_path_Length_%d\n", bccid, cycleNum, u_len- (contextL + contextR)); fprintf(snp_log_file, "%s\n", seq1.c_str()); fprintf(snp_log_file, ">bcc_%d|Cycle_%d|Type_0b|lower_path_Length_%d\n", bccid, cycleNum, l_len- (contextL + contextR)); fprintf(snp_log_file, "%s\n", seq2.c_str()); } else // single SNP { fprintf(snp_log_file, ">bcc_%d|Cycle_%d|Type_0a|upper_path_Length_%d\n", bccid, cycleNum, u_len- (contextL + contextR)); fprintf(snp_log_file, "%s\n", seq1.c_str()); fprintf(snp_log_file, ">bcc_%d|Cycle_%d|Type_0a|lower_path_Length_%d\n", bccid, cycleNum, l_len- (contextL + contextR)); fprintf(snp_log_file, "%s\n", seq2.c_str()); } } kissplice-2.1.0/modules/ErrorRemoval.cpp000644 001751 001751 00000017646 12262534651 021431 0ustar00marchetmarchet000000 000000 #include #include #include #include #include "Utils.h" #include "LabelledCEdge.h" #define MAX 1024 using namespace std; struct NodeSeq { char *seq; int node; NodeSeq(char *initSeq, int initNode) : seq(initSeq), node(initNode) { } bool operator== (const NodeSeq& rhs) const { return (strcmp(seq, rhs.seq) == 0); } bool operator< (const NodeSeq& rhs) const { return (strcmp(seq, rhs.seq) < 0); } }; int count_nb_lines( FILE* file ) { int ch, number_of_lines = 0; while (EOF != (ch=getc(file))) if ('\n' == ch) number_of_lines++; // Set the cursor back to the begining of the file. rewind(file); // Don't care if the last line has a '\n' or not. We over-estimate it. return number_of_lines + 1; } void readNodeFile(char* nodes_fname, vector& nodesF, vector& nodesR, int k_val ) { FILE* node_file = open_file(nodes_fname); char* buffer = new char[100 * MAX]; char* seq; nodesF.reserve(count_nb_lines(node_file)); nodesR.reserve(count_nb_lines(node_file)); while ( fgets(buffer, 100 * MAX, node_file) != NULL ) { char* p; if (strlen(buffer) == 100 * MAX) { p = strtok(buffer, "\t\n"); fprintf(stdout, "ERROR: node %s with sequence larger than %d!", p, 100 * MAX); exit(0); } // Node label, should be contiguous p = strtok( buffer, "\t\n" ); int node = atoi(p); // Node seq p = strtok( NULL, "\t\n" ); // We need the prefix of size k seq = new char[k_val + 1]; strncpy( seq, p, k_val); seq[k_val] = '\0'; nodesF.push_back( NodeSeq(seq, node) ); // And the prefix of size k in the reverse complement seq = new char[k_val + 1]; string rev_comp = reverse_complement(string(p)); strncpy( seq, rev_comp.c_str(), k_val); seq[k_val] = '\0'; nodesR.push_back( NodeSeq(seq, node) ); //printf("%d %s %s\n", node, p, rev_comp.c_str()); } sort(nodesF.begin(), nodesF.end()); sort(nodesR.begin(), nodesR.end()); delete [] buffer; fclose(node_file); } void readEdgeFile( char* nodes_fname, vector& edges ) { FILE *edge_file = open_file(nodes_fname); char* buffer = new char[100 * MAX]; char* u = new char[MAX]; char* v = new char[MAX]; char* label = new char[MAX]; edges.reserve(count_nb_lines(edge_file)); while ( fgets(buffer, 100 * MAX, edge_file) != NULL ) { char* p; // outgoing node p = strtok( buffer, "\t\n" ); strcpy( u, p ); // incoming node p = strtok( NULL, "\t\n" ); strcpy( v, p ); // edge label p = strtok( NULL, "\t\n" ); strcpy(label, p); edges.push_back( LabelledCEdge( atoi(u), atoi(v), label ) ); } sort( edges.begin(), edges.end() ); delete [] buffer; delete [] u; delete [] v; delete [] label; fclose(edge_file); } // rev_seq needs to be global! let's make sure we allocate only once char *rev_seq = NULL; char *revComp(char *seq) { int size = strlen(seq); if (rev_seq == NULL) rev_seq = new char[size + 1]; for (int i = size - 1; i >= 0; i--) rev_seq[(size - 1) - i] = complement(seq[i]); rev_seq[size] = '\0'; return rev_seq; } int findNode(char *query, vector& nodes) { vector::iterator low, low_r; char *query_r = revComp(query); // The value of "x" in NodeSeq(query, x) doesn't matter low = lower_bound(nodes.begin(), nodes.end(), NodeSeq(query, 0)); low_r = lower_bound(nodes.begin(), nodes.end(), NodeSeq(query_r, 0)); if (low != nodes.end() && *low == NodeSeq(query, 0)) return low->node; if (low_r != nodes.end() && *low_r == NodeSeq(query_r, 0)) return low_r->node; return -1; } void readCounts(char* counts_fname, vector& nodesF, vector& nodesR, vector& countsF, vector& countsR) { FILE *count_file = open_file(counts_fname); char* buffer = new char[100 * MAX]; while ( fgets(buffer, 100 * MAX, count_file) != NULL ) { char* p; // Sequence p = strtok( buffer, "\t\n " ); int nodeF = findNode(p, nodesF); int nodeR = findNode(p, nodesR); // Count p = strtok( NULL, "\t\n " ); if (nodeF != -1) countsF[nodeF] += atoi(p); if (nodeR != -1) countsR[nodeR] += atoi(p); } delete [] buffer; fclose(count_file); } int findEdge(vector& allEdges, LabelledCEdge e) { vector::iterator low; low = lower_bound(allEdges.begin(), allEdges.end(), e); if (low != allEdges.end()) return low - allEdges.begin(); fprintf(stderr, "inconsistent graph!"); return -1; } // Move this to CEdge LabelledCEdge reverse(LabelledCEdge e) { char *label = new char[3]; label[0] = e.label[1] == 'F' ? 'R' : 'F'; label[1] = e.label[0] == 'F' ? 'R' : 'F'; label[2] = '\0'; return LabelledCEdge(e.getSecond(), e.getFirst(), label); } void errorRemoval(vector& allEdges, int nbNodes, double cutoff, vector& removed, vector& countsF, vector& countsR) { int offset = 0; for ( int src = 0 ; src < nbNodes; src++ ) { // Count the number of edges (they are ordered in all_edges) of node src int size = 0; while ( offset + size < (int) allEdges.size() && allEdges[offset + size].getFirst() == src ) size++; // Compute the sum of coverage for each direction int sum_F = 0, sum_R = 0; for (int k = 0; k < size; k++) if (!removed[offset + k]) { int target = allEdges[offset + k].getSecond(); if (allEdges[offset + k].label[0] == 'F') sum_F += (allEdges[offset + k].label[1] == 'F') ? countsF[target] : countsR[target]; if (allEdges[offset + k].label[0] == 'R') sum_R += (allEdges[offset + k].label[1] == 'F') ? countsF[target] : countsR[target]; } // Remove the edges with relative coverage below cutoff for (int k = 0; k < size; k++) if (!removed[offset + k]) { int target = allEdges[offset + k].getSecond(); int count = (allEdges[offset + k].label[1] == 'F') ? countsF[target] : countsR[target]; double ratio; ratio = (double)count / (double)sum_F; if (allEdges[offset + k].label[0] == 'F' && ratio < cutoff) { removed[offset + k] = true; removed[findEdge(allEdges, reverse(allEdges[offset+k]))] = true; } ratio = (double)count / (double)sum_R; if (allEdges[offset + k].label[0] == 'R' && ratio < cutoff) { removed[offset + k] = true; removed[findEdge(allEdges, reverse(allEdges[offset+k]))] = true; } } offset += size; } } int main( int argc, char** argv ) { string base_name = "graph"; if ( argc < 6 ) { fprintf( stderr, "Wrong number of arguments!\n" ); fprintf( stderr, "Usage: ./error_removal edge_file node_file k_value k_mer_count_file cutoff [base_name]\n" ); return 0; } if ( argc == 7 ) { base_name = argv[6]; } int k_value = atoi( argv[3] ); double cutoff = atof( argv[5] ); // Read edge file vector allEdges; readEdgeFile( argv[1], allEdges ); // Read node file vector nodesF, nodesR; readNodeFile(argv[2], nodesF, nodesR, k_value ); // Read count file vector countsF(nodesF.size(), 0), countsR(nodesR.size(), 0); readCounts(argv[4], nodesF, nodesR, countsF, countsR); vector removed(allEdges.size(), false); errorRemoval(allEdges, (int)nodesF.size(), cutoff, removed, countsF, countsR); int nb_removed = 0; FILE *output = fopen((base_name + ".edges").c_str(), "w"); FILE *removed_out = fopen((base_name + "_removed.edges").c_str(), "w"); for (int i = 0 ; i < (int)allEdges.size(); i++) if (!removed[i]) fprintf(output, "%d\t%d\t%s\n", allEdges[i].getFirst(), allEdges[i].getSecond(), allEdges[i].label); else { fprintf(removed_out, "%d\t%d\t%s\n", allEdges[i].getFirst(), allEdges[i].getSecond(), allEdges[i].label); nb_removed++; } fclose(output); fclose(removed_out); fprintf(stdout, "%d out of %d edges removed\n", nb_removed, (int)allEdges.size()); return 0; } kissplice-2.1.0/modules/NGraph.cpp000644 001751 001751 00000106320 12262534651 020155 0ustar00marchetmarchet000000 000000 /* *************************************************************************** * * KisSplice * de-novo calling alternative splicing events from RNA-seq data. * * *************************************************************************** * * Copyright INRIA * contributors : Vincent Lacroix * Pierre Peterlongo * Gustavo Sacomoto * Vincent Miele * Alice Julien-Laferriere * David Parsons * * pierre.peterlongo@inria.fr * vincent.lacroix@univ-lyon1.fr * * This software is a computer program whose purpose is to detect alternative * splicing events from RNA-seq data. * * This software is governed by the CeCILL license under French law and * abiding by the rules of distribution of free software. You can use, * modify and/ or redistribute the software under the terms of the CeCILL * license as circulated by CEA, CNRS and INRIA at the following URL * "http://www.cecill.info". * As a counterpart to the access to the source code and rights to copy, * modify and redistribute granted by the license, users are provided only * with a limited warranty and the software's author, the holder of the * economic rights, and the successive licensors have only limited * liability. * In this respect, the user's attention is drawn to the risks associated * with loading, using, modifying and/or developing or reproducing the * software by the user in light of its specific status of free software, * that may mean that it is complicated to manipulate, and that also * therefore means that it is reserved for developers and experienced * professionals having in-depth computer knowledge. Users are therefore * encouraged to load and test the software's suitability as regards their * requirements in conditions enabling the security of their systems and/or * data to be ensured and, more generally, to use and operate it in the * same conditions as regards security. * * The fact that you are presently reading this means that you have had * knowledge of the CeCILL license and that you accept its terms. */ // =========================================================================== // Include Libraries // =========================================================================== #include #include #include #include #include #include #include #include #include #include #include // =========================================================================== // Include Project Files // =========================================================================== #include #include #include #include // =========================================================================== // Declare Used Namespaces // =========================================================================== using namespace std; // =========================================================================== // Constant // =========================================================================== #define MAX 1024 //############################################################################ // # // Class NGraph # // # //############################################################################ // =========================================================================== // Constructors // =========================================================================== NGraph::NGraph( int kValue, int outputtedSnps ) { _kValue = kValue; _nbOutput = outputtedSnps; } NGraph::NGraph( const NGraph &model ) { _kValue = model._kValue; _nodes = model._nodes; _nbOutput = model._nbOutput; _nodeToStr = model._nodeToStr; _strToNode = model._strToNode; } /*! TODO : check : Create a NGraph for the BCC passed in edges ? all_edges ? */ NGraph::NGraph( CGraph& cgraph, vector& seqs, vector& all_edges, vector& edges ) { _nbOutput = 0; for ( int i = 0 ; i < (int)edges.size() ; i++ ) { int u = edges[i].getFirst(); int v = edges[i].getSecond(); //~ printf( "contructing %s %s (0x%x, 0x%x)\n", seqs[u], seqs[v], seqs[u], seqs[v] ); //~ getchar(); insert_node( to_str(u), to_str( seqs[u] ) ); insert_node( to_str(v), to_str( seqs[v] ) ); insert_bidirected_edges( all_edges, edges[i] ); edges[i].swap_ends(); insert_bidirected_edges( all_edges, edges[i] ); } _kValue = cgraph.k_value; expand_parallel_edges(); } // =========================================================================== // Destructors // =========================================================================== NGraph::~NGraph(void) { } // =========================================================================== // Public Methods // =========================================================================== void NGraph::insert_empty_node( string u ) { _strToNode[u] = _nodeToStr.size(); // _nodeToStr.size() corresponds to the current number of nodes in the graph _nodeToStr.push_back(u); _nodes.push_back( NNode() ); } // Insert the new edge u->v (label). // If the edge u->v is already present with a different label, concatenate