pax_global_header00006660000000000000000000000064126077214070014520gustar00rootroot0000000000000052 comment=38f80c4faafc88ea9ba4032e9b7cf6f5524e863b snpomatic-1.0/000077500000000000000000000000001260772140700133555ustar00rootroot00000000000000snpomatic-1.0/Makefile000066400000000000000000000022161260772140700150160ustar00rootroot00000000000000# Use g++ CXX=g++ CXXFLAGS=-g -O3 LIBS=-lm # Use Intel compiler (faster!) #CXX=icpc #CXXFLAGS=-g -O2 -ip -align -falign-functions -Wno-deprecated -Isrc -w #LIBS= findknownsnps: src/findknownsnps.o src/global_functions.o src/TChromosome.o src/TChromosomalIndices.o src/TAlignmentOutput.o src/TChromosomeAlign.o $(CXX) $(CXXFLAGS) $(LIBS) $^ -o $@ reassemble: src/reassemble.o src/global_functions.o src/TChromosome.o src/TChromosomalIndices.o src/TAlignmentOutput.o src/TChromosomeAlign.o $(CXX) $(CXXFLAGS) $(LIBS) $^ -o $@ variety: src/variety.o src/global_functions.o src/TChromosome.o src/TChromosomalIndices.o src/TAlignmentOutput.o src/TChromosomeAlign.o $(CXX) $(CXXFLAGS) $(LIBS) $^ -o $@ mapcontigs: src/mapcontigs.o src/global_functions.o src/TChromosome.o src/TChromosomalIndices.o src/TAlignmentOutput.o src/TChromosomeAlign.o $(CXX) $(CXXFLAGS) $(LIBS) $^ -o $@ ungap: src/ungap.o src/global_functions.o src/TChromosome.o src/TChromosomalIndices.o src/TAlignmentOutput.o src/TChromosomeAlign.o $(CXX) $(CXXFLAGS) $(LIBS) $^ -o $@ .cpp.o: $(CXX) $(CXXFLAGS) -c $< -o $@ clean: rm -f *.o rm -f src/*.o rm -f findknownsnps rm -f variety snpomatic-1.0/README.md000066400000000000000000000003411260772140700146320ustar00rootroot00000000000000SNP-o-matic is a fast, stringent short-read mapping software. It supports a multitude of output types and formats, for uses in filtering reads, alignments, sequence-based genotyping calls, assisted reassembly of contigs etc. snpomatic-1.0/doc/000077500000000000000000000000001260772140700141225ustar00rootroot00000000000000snpomatic-1.0/doc/Manual.doc000066400000000000000000001260001260772140700160250ustar00rootroot00000000000000ࡱ> ORN D%bjbj .BD<<#tWWWWW222"""""""$$1'\"22222"WW"2WW"2": , !W=+ ̕0 ""0# R'' !' !|2222222""222#2222'222222222 : SNP-O-MATIC manual SNP-O-MATIC is software to analyze Solexa reads and compare them to a reference. The core application, findknownsnps, looks at perfect matches of single and paired Solexa reads to a given reference sequence. While it will only find perfect matches, it can allow for a list of known SNPs, turning it into a Solexa-based genotyping application. SNP-O-MATIC works by first creating an index on the reference sequence, then looking up reads in that index. Most output files are tab-delimitered. Parameters have to use the --" prefix and must be separated from the parameter value by =; a space will not do. Parameters Genome --genome=FILE [mandatory] The FASTA file containing the reference genome (can also be contigs etc.). This must be present for all operations of SNP-O-MATIC. --noshortcuts By default, kmers that occur very often (for example, in repeat regions) are removed from the index, speeding up read scan considerably. This option causes all kmers to be processed, resulting in a slower but more thorough process. --index=FILENAME The genome index is usually generated on the fly. To save time, such an index can be stored in a file and read in for subsequent use. Points to consider: The index file can get quite large The index file varies with the following parameters: --mspi, --noshortcuts, --index1, --index2, --snps, --gff SNP-O-MATIC will not check if the parameters used to create the index are the same that are used in a subsequent run. If any of these parameters differ, it might segfault or return wrong results without warning. --mspi=NUMBER The maximum number of SNPs per chromosomal index. The default value is 8. When looking for known SNPs, each kmer (default: 26; see --index1 and --index2) is indexed in all possible combinations. If many potential SNPs occur in such a kmer, the number of indices rises exponentially (worst case: 26 Ns => 426 indices), slowing down index generation and using large amounts of memory. Also, matches will get unspecific. Therefore, no kmers with more than NUMBER SNPs are indexed. --index1=NUMBER --index2=NUMBER The length of the internal index keys for genome indexing. By default, index1 is 10 and index2 is 16, resulting in a kmer index length of 26. Neither index must be 0, nor must index2 exceed 16. It is usually not necessary to alter these settings. --mismatch=NUMBER The number of mismatches allowed outside the index (index1+index2) --chromosome=NAME Discards all chromosomes but NAME prior to run, effectively only working on that chromosome. --index_from=NUMBER Starts indexing at this position on all chromosomes. To be used with --chromosome. --index_to=NUMBER Stops indexing at this position on all chromosomes. To be used with --chromosome. Reads --fasta=FASTA_FILE (--fastq2=FASTQ_FILE) --fastq=FASTQ_FILE The file(s) containing the reads. Use only one or the other. Use fastq together with fastq2 for split files. One is mandatory, unless --uniqueness is used. --nono=FILENAME A file with names of reads to ignore. One read name per line; names may be prefixed with > or @. Paired reads Paired reads have several additional options: --pair=NUMBER The length of the first read. Mandatory for paired reads. --fragment=NUMBER The fragment length (including the two read pairs). The default is the entire chromosome. --variance=NUMBER The allowed variance for the fragment length. If not specified, of the fragment length is used. If the fragment length is not specified either, the entire chromosome length is used. --chop=NUMBER If one read but not the other matches, shorten the other by NUMBER bases. --multimatch If a read pair matches in multiple locations, assign it to one of them at random. --singlematch Only performs additional output functions (not bins) for single matches [currently paired reads only] --foum Find One Unique Match will only map a read pair if at least one of the reads is unique within the genome. Known SNPs --snps=SIMPLE_SNP_FILE --gff=GFF_SNP_FILE A file containing a list of known SNPs. The simple SNP format contains one known SNP per line: chromosome_name tab potition_in_chromosome tab reference_allee tab snp_allele_or_iupac_code The GFF format is more complicated; ignore it. Bins --bins=FILE_PREFIX Reads can be written into different bin files, depending on how they match the reference: FILE_PREFIX_no_match.fast[a|q] for reads that do not match the reference genome FILE_PREFIX_single_match.fast[a|q] for reads that match the reference genome exactly once FILE_PREFIX_multi_match.fast[a|q] for reads that match the reference genome more than once FILE_PREFIX_iupac.fast[a|q] for reads that contain IUPAC codes (e.g., N) The file ending will be .fasta or .fastq, depending on the original format of the reads. --binmask=BINARY_MASK Use to create only certain bin files. For each bin type, use a 1 to create it and a 0 to prevent its creation, in the order of the above list. The default is 1111, that is, all bins are created. Misc output --uniqueness=OUTPUT_FILE Output a uniqueness data file for the reference. As it acts on the reference sequence only, no read file has to be supplied. The option activates --noshortcuts. --pileup=OUTPUT_FILE Outputs a detailed pileup view of the reads. The maximum pileup depth is 70, deeper pileups are cut off. Quality scores are shown when using a fastq read file. --snpsonly When using --pileup, only output lines with SNPs. --coverage=FILENAME Outputs how many As, Cs, Gs, and Ts are found at each position. Less detailed than --pileup, but works for unlimited depth. --cigar=OUTPUT_FILE Outputs read alignments in CIGAR format. --gffout=OUTPUT_FILE Outputs read alignments in GFF format. --fragmentplot=OUTPUT_FILE Outputs a plot of fragment size distribution. --snpsinreads=OUTPUT_FILE Outputs a list of found known SNPs (that is, either reference or alternate allele). Each line contains a single found SNP, the name of the read it was found in, and miscellaneous metadata and quality scores. A header line explains the individual columns. --indelplot=OUTPUT_FILE Outputs the fragment length found for each match. For paired reads only. See also --fragment and --variance. --inversions=FILENAME For paired reads, output a list of possible inversions. A large fragment/variance is recommended, as inversions would appear to be far apart. --wobble= OUTPUT_FILE Outputs a list of possible variations (paired reads only) --wobblemax=NUMBER Maximum number of mismatches allowed for wobble-match --sqlite=FILENAME Creates a sqlite text file with alignment data --sam=FILENAME Creates a SAM alignment file --spancontigs=FILENAME Outputs read pairs where "half" reads map uniquely to different contigs Experimental The following options are under development. It is not recommended to use them unless you know what youre doing. --regions=REGION_FILE Region file for finding new SNPs --memory_save=NUMBER Indexes the genome every NUMBER of positions; saves memory and runtime, but may have strange side effects. Examples Example 1 Reference genome in ref.fasta Fastq reads in reads.fastq Reads are 36 bases, fragment size is 200-300 Task : Get all the reads that do not match perfectly within the fragment size Command: findknownsnps --genome=ref.fasta --fastq=reads.fastq --pair=36 --fragment=250 --variance=50 --bins=out --binmask=1000 This will create out_no_match.fastq, containing the non-matching reads. zi j q r | }   ( * 8 a l ! 1 2 $ & , / < > F H P R X Z 23@AEܱܽ蕑h7HhqO"6h:sphRhqO"h7HhR6h7Hh Se6h h]h7Hh]6hPwhlhhXhhXhB*phhXhh7Hh{6h{h9)hh]hvhv6hvhX]hZ(4j q r } ) * 8 ! 2 23A!"2B & FgdqO"gd Segd]gd{gd{$a$gd=gdF9EFRWX\]lmtvw "B9:Lo<Y[efhzȱh7Hh6hh7Hh{6h{h ;h ;6 h ;6h ;hz +hRVhz +6 hr8hkhr8hk6hkhh7Hh Se6hV hqO"hqO"hqO"hqO"H*h0hqO"h SehqO"hqO">*2B9:Lghz|!"/]$a$gd=$a$gdz +$a$gdk$a$gd=0K{"./^l&5678E}hct~hct~hct~6h"h 2^h7Hh 2^6h7Hh 6hhXh7Hhg6hgh hh'hPkAhct~hct~hct~6h# hr8h#hr8h#62{o$ost)*5gh & Fgd<$a$gd=$a$gd=cqrst()*5OUgj~%&6789MNuvٽٽٝ嘭٩ِ嘭hBdh7}F hxv"6hYh7HhY6hh7Hh6hl1hvhzh7Hhz6hPShXhh7HhXh6h{h7Hh ~n6h7Hh{6hgh ~nh<\ h<\hj 8hij~89Nuv> b c y !!!X!$a$gdRV$a$gdp$a$gd=a b c y !!Y!l!!!!!"+"u""" #,#-#B##########$$ $ÿ|tpcpcpcphhOJQJ^JhhkRhkR>*hD`h<h * hr8hr8hRVhr86hr8h7HhL6h7Hh ~n6h ~nhRVhRV6hk.*hRV6hRVh7HhRV6hph"hp6hgh Seh`Rh7Hh6 hxv"6h7Hh{6h7}Fh{&X!Y!l!!!!!!!""+"t"u"""" #,#-#B######gdD`$a$gdr8$a$gd=$a$gd=$a$gdRV###($v$w$$$$$$C%D%^gdlggdlg$a$gd $ & Fa$gd $#$$$'$v$$$$$$$$$ %%C%D%Ϳޡޚ hk.*hlghlghlgOJQJ^J hlghlgCJOJQJ^JaJhlgCJOJQJ^JaJ hlghCJOJQJ^JaJhlgh@hOJQJ^JhhhOJQJ^J,1h. A!"#$% ^ 2 0@P`p2( 0@P`p 0@P`p 0@P`p 0@P`p 0@P`p 0@P`p8XV~_HmH nH sH tH @`@ NormalCJ_HaJmH sH tH Z@Z F9 Heading 1$<@&5CJ KH OJQJ\^JaJ \@\ { Heading 2$<@& 56CJOJQJ\]^JaJV@V { Heading 3$<@&5CJOJQJ\^JaJDA D Default Paragraph FontRiR  Table Normal4 l4a (k (No List PK![Content_Types].xmlj0Eжr(΢Iw},-j4 wP-t#bΙ{UTU^hd}㨫)*1P' ^W0)T9<l#$yi};~@(Hu* Dנz/0ǰ $ X3aZ,D0j~3߶b~i>3\`?/[G\!-Rk.sԻ..a濭?PK!֧6 _rels/.relsj0 }Q%v/C/}(h"O = C?hv=Ʌ%[xp{۵_Pѣ<1H0ORBdJE4b$q_6LR7`0̞O,En7Lib/SeеPK!kytheme/theme/themeManager.xml M @}w7c(EbˮCAǠҟ7՛K Y, e.|,H,lxɴIsQ}#Ր ֵ+!,^$j=GW)E+& 8PK!Ptheme/theme/theme1.xmlYOo6w toc'vuر-MniP@I}úama[إ4:lЯGRX^6؊>$ !)O^rC$y@/yH*񄴽)޵߻UDb`}"qۋJחX^)I`nEp)liV[]1M<OP6r=zgbIguSebORD۫qu gZo~ٺlAplxpT0+[}`jzAV2Fi@qv֬5\|ʜ̭NleXdsjcs7f W+Ն7`g ȘJj|h(KD- dXiJ؇(x$( :;˹! I_TS 1?E??ZBΪmU/?~xY'y5g&΋/ɋ>GMGeD3Vq%'#q$8K)fw9:ĵ x}rxwr:\TZaG*y8IjbRc|XŻǿI u3KGnD1NIBs RuK>V.EL+M2#'fi ~V vl{u8zH *:(W☕ ~JTe\O*tHGHY}KNP*ݾ˦TѼ9/#A7qZ$*c?qUnwN%Oi4 =3ڗP 1Pm \\9Mؓ2aD];Yt\[x]}Wr|]g- eW )6-rCSj id DЇAΜIqbJ#x꺃 6k#ASh&ʌt(Q%p%m&]caSl=X\P1Mh9MVdDAaVB[݈fJíP|8 քAV^f Hn- "d>znNJ ة>b&2vKyϼD:,AGm\nziÙ.uχYC6OMf3or$5NHT[XF64T,ќM0E)`#5XY`פ;%1U٥m;R>QD DcpU'&LE/pm%]8firS4d 7y\`JnίI R3U~7+׸#m qBiDi*L69mY&iHE=(K&N!V.KeLDĕ{D vEꦚdeNƟe(MN9ߜR6&3(a/DUz<{ˊYȳV)9Z[4^n5!J?Q3eBoCM m<.vpIYfZY_p[=al-Y}Nc͙ŋ4vfavl'SA8|*u{-ߟ0%M07%<ҍPK! ѐ'theme/theme/_rels/themeManager.xml.relsM 0wooӺ&݈Э5 6?$Q ,.aic21h:qm@RN;d`o7gK(M&$R(.1r'JЊT8V"AȻHu}|$b{P8g/]QAsم(#L[PK-![Content_Types].xmlPK-!֧6 +_rels/.relsPK-!kytheme/theme/themeManager.xmlPK-!Ptheme/theme/theme1.xmlPK-! ѐ' theme/theme/_rels/themeManager.xml.relsPK] D BE $D% B]hX!#D%8@0(  B S  ?,7DI&*/:RVZ]59+/ j r  P U :D(/EL[bz89<o$:;>fq,4;Ax[d!kr/: FAr H z  0 & S '= F333333333333333iijjgituvBCFiijjgituvBCFj@ME l\"^`OJPJQJ^Jo(^`OJQJ^Jo(hHopp^p`OJQJo(hH@ @ ^@ `OJQJo(hH^`OJQJ^Jo(hHo^`OJQJo(hH^`OJQJo(hH^`OJQJ^Jo(hHoPP^P`OJQJo(hH^`OJPJQJ^Jo(^`OJQJ^Jo(hHopp^p`OJQJo(hH@ @ ^@ `OJQJo(hH^`OJQJ^Jo(hHo^`OJQJo(hH^`OJQJo(hH^`OJQJ^Jo(hHoPP^P`OJQJo(hHj@E l                  YXtj Xh Xv] VPwl"*+kqO"xv"n%aN'9) *k.*z +r8F9PkAMF7}FrJ,`M5eYX] 2^C[^D`E1b Se'rpY`CElgkR<\vDF@++cc++DX@UnknownG* Times New Roman5Symbol3. * Arial?= * Courier New;WingdingsA BCambria Math"qh)2Ʀ&ǻ&R^5^5a24d662QHP ?Z(2!xxSNP-O-MATIC manualWTSImm6  Oh+'0  < H T `lt|SNP-O-MATIC manualWTSI Normal.dotmmm682Microsoft Office Word@0@e`ȕ@ @:̕^՜.+,0 hp|  56 SNP-O-MATIC manual Title  !#$%&'()+,-./0123456789:;<=?@ABCDEGHIJKLMPQTRoot Entry Fi2 ̕SData "1Table*'WordDocument .BSummaryInformation(>DocumentSummaryInformation8FMsoDataStore! ̕=+ ̕HNH4CUUCA==2! ̕=+ ̕Item  PropertiesUCompObj y   F'Microsoft Office Word 97-2003 Document MSWordDocWord.Document.89qsnpomatic-1.0/gpl-3.0.txt000066400000000000000000001045131260772140700152020ustar00rootroot00000000000000 GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. "Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: Copyright (C) This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see . The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read . snpomatic-1.0/src/000077500000000000000000000000001260772140700141445ustar00rootroot00000000000000snpomatic-1.0/src/TAlignmentOutput.cpp000066400000000000000000000054151260772140700201400ustar00rootroot00000000000000#include "snpomatic.h" #define MINOCCUR 1 void TAlignmentOutput::init ( TChromosome *_chr ) { chr = _chr ; align_full.assign ( chr->sequence.length() , false ) ; align_size = MAXLINES * chr->sequence.length() ; align = new char[align_size] ; qalign = new char[align_size] ; for ( uint i = 0 ; i < align_size ; i++ ) align[i] = qalign[i] = ' ' ; } void TAlignmentOutput::add_align ( const string &seq , const string &quality , uint pos , int chromosome ) { if ( align_full[pos] ) return ; int lane , seql = seq.length() ; for ( lane = 0 ; lane < MAXLINES ; lane++ ) { if ( align[twotoone(lane,pos)] + align[twotoone(lane,pos+seql/2-1)] + align[twotoone(lane,pos+seql-1)] == 96 ) break ; } if ( lane >= MAXLINES ) { align_full[pos] = true ; return ; // HARD CUT OFF } for ( int i = 0 ; i < seql ; i++ ) { uint p = twotoone(lane,pos+i) ; if ( p >= align_size ) continue ; align[p] = seq[i] ; qalign[p] = quality[i] ; } } void TAlignmentOutput::show_pileup ( FILE *pileup , bool snps_only ) { int search_snps = 0 , found_snps = 0 , bogus = 0 ; uint base , l , cnt , p , lastns ; uchar common ; char *s2 = new char[MAXLINES+5] ; char *s3 = new char[MAXLINES+5] ; char *t , *u ; uint count[256] ; for ( base = 0 ; base < chr->sequence.length() ; base++ ) { if ( snps_only && !isIUPAC[chr->sequence[base]] ) continue ; if ( isIUPAC[chr->sequence[base]] ) search_snps++ ; count['A']=count['C']=count['G']=count['T']=0 ; cnt = 0 ; common = ' ' ; lastns = 0 ; p = twotoone(0,base) ; t = s2 ; u = s3 ; char *c = align + p ; char *q = qalign + p ; for ( l = 0 ; l < MAXLINES ; l++ , c++ , q++ ) { if ( *c && *c != ' ' ) { lastns = l ; cnt++ ; count[*c]++ ; common = MERGE_IUPAC ( *c , common ) ; if ( *q < MINQUAL + 33 ) *t++ = to_lc[*c] ; else *t++ = *c ; *u++ = *q ; } else { *t++ = ' ' ; *u++ = ' ' ; } } s2[lastns+1] = 0 ; s3[lastns+1] = 0 ; bool confirmed_snp = false ; uint n = 0 ; if ( count['A'] >= MINOCCUR ) n++ ; if ( count['C'] >= MINOCCUR ) n++ ; if ( count['G'] >= MINOCCUR ) n++ ; if ( count['T'] >= MINOCCUR ) n++ ; if ( n > 1 ) { confirmed_snp = true ; found_snps++ ; } /* if ( common != ' ' && isIUPAC[common] ) { confirmed_snp = true ; found_snps++ ; } */ if ( common != ' ' && isIUPAC[common] && !isIUPAC[chr->sequence[base]] ) bogus++ ; if ( snps_only && common == ' ' ) continue ; char mark = confirmed_snp ? '*' : ' ' ; string ref ; if ( !chr->original_sequence.empty() ) { ref += chr->original_sequence[base] ; ref += "\t" ; } ref += chr->sequence[base] ; fprintf ( pileup , "%s\t%s\t%d\t%c%c\t%d\t" , chr->name.c_str() , ref.c_str() , base+1 , common , mark , cnt ) ; fprintf ( pileup , "%s\t%s\n" , s2 , s3 ) ; } } snpomatic-1.0/src/TChromosomalIndices.cpp000066400000000000000000000360261260772140700205650ustar00rootroot00000000000000#include "snpomatic.h" /// Constructor. TChromosomalIndices::TChromosomalIndices ( int _index_length , int _index_length2 ) { index_length = _index_length ; index_length2 = _index_length2 ; index_length_double = index_length + index_length2 ; uint size = 1 ; for ( int a = 0 ; a < index_length ; a++ ) size *= 4 ; position.assign ( size , pwi_vector() ) ; noshortcuts = 0 ; // Take shortcuts by default max_snps_per_index = 8 ; index_from = 0 ; index_to = 0 ; index_steps = 1 ; } /// Indexes reads for reassembly. void TChromosomalIndices::run_reads ( string filename ) { FILE * file = fopen ( filename.c_str() , "r" ) ; char dummy[READ_CHAR_BUFFER] , *c ; ulong count = 0 , a , from , to , l ; ulong filepos = 0 ; for ( a = 0 ; a < position.size() ; a++ ) position[a].reserve ( 95 ) ; // Speedup cheat while ( !feof(file) ) { count++ ; filepos = ftell ( file ) ; if ( count % 1000000 == 0 ) cerr << count << endl ; *dummy = 0 ; fgets ( dummy , READ_CHAR_BUFFER , file ) ; if ( !*dummy ) break ; if ( *dummy != '>' ) break ; // Not a FASTA file, apparently *dummy = 0 ; fgets ( dummy , READ_CHAR_BUFFER , file ) ; if ( !*dummy ) break ; l = 0 ; for ( c = dummy ; *c > 13 ; c++ ) { if ( isIUPAC[*c] ) { // No IUPAC reads, please... *dummy = 0 ; break ; } l++ ; } if ( !*dummy ) continue ; *c = 0 ; from = 3 ; // 0 = no offset to = l - from - index_length_double ; for ( a = from ; a <= to ; a++ ) add_index ( dummy + from , false , filepos , from ) ; REVERSE_COMPLEMENT_CHARP ( dummy ) ; for ( a = from ; a <= to ; a++ ) add_index ( dummy + from , true , filepos , from ) ; } // Sort for ( a = 0 ; a < position.size() ; a++ ) sort ( position[a].begin() , position[a].end() ) ; // Analyze uint min = 5 ; uint max = 100 ; for ( a = 0 ; a < position.size() ; a++ ) { uint b , last = 0 , cnt = 0 ; for ( b = 0 ; b < position[a].size() ; b++ ) { if ( position[a][b].index2 == position[a][last].index2 ) { cnt++ ; } else { if ( cnt >= min && cnt <= max ) reassemble_reads ( file , a , last , b-1 ) ; last = b ; cnt = 1 ; } } if ( cnt >= min && cnt <= max ) reassemble_reads ( file , a , last , b-1 ) ; } } bool TChromosomalIndices::align_read2contigs ( char *read , vector &vs ) { uint a , vsmax = vs.size() ; int l , b , c ; bool ret = false ; for ( l = 0 ; *(read+l) ; l++ ) ; int from = -10 ; int to = 20 ; for ( a = 0 ; a < vsmax ; a++ ) { if ( vs[a] == read ) return ret ; for ( b = from ; b < to ; b++ ) { bool match = true ; for ( c = 0 ; match && c < l ; c++ ) { if ( b + c < 0 ) continue ; if ( b + c >= vs[a].length() ) continue ; if ( *(read+c) != vs[a][b+c] ) match = false ; } if ( match ) { string m ; string n ; for ( c = b ; c < 0 ; c++ ) n += ' ' ; for ( c = 0 ; c < b ; c++ ) m += ' ' ; n += vs[a] ; m += read ; if ( n.length() == vs[a].length() ) continue ; while ( n.length() < m.length() ) n += ' ' ; while ( m.length() < n.length() ) m += ' ' ; for ( c = 0 ; c < n.length() ; c++ ) { if ( n[c] == ' ' ) n[c] = m[c] ; } vs.push_back ( n ) ; break ; // One match will do just fine... } } } if ( !ret ) vs.push_back ( read ) ; // New contig since it doesn't match old ones return ret ; } void TChromosomalIndices::reassemble_reads ( FILE *file , uint i1 , index2type i2a , index2type i2b ) { char dummy[READ_CHAR_BUFFER] , *c ; vector contigs ; uint maxlen = 0 ; for ( index2type i2 = i2a ; i2 <= i2b ; i2++ ) { fseek ( file , position[i1][i2].position , SEEK_SET ) ; fgets ( dummy , READ_CHAR_BUFFER , file ) ; fgets ( dummy , READ_CHAR_BUFFER , file ) ; uint l = 0 ; for ( c = dummy ; *c > 13 ; l++ , c++ ) ; // All possible hits are already IUPAC-filtered, no need to do it again *c = 0 ; if ( l > maxlen ) maxlen = l ; if ( position[i1][i2].reverse_complement ) REVERSE_COMPLEMENT_CHARP ( dummy ) ; align_read2contigs ( dummy , contigs ) ; } for ( uint a = 0 ; a < contigs.size() ; a++ ) { if ( contigs[a].length() <= maxlen ) continue ; cout << contigs[a] << endl ; } } void TChromosomalIndices::add_index ( char *start , bool rc , uint pos , uchar chromosome ) { uint key = calculate_index ( start ) ; TPositionWithIndex pwi ; pwi.position = pos ; pwi.index2 = calculate_index2 ( start + index_length ) ; pwi.reverse_complement = rc ; pwi.chromosome = chromosome ; position[key].push_back ( pwi ) ; } /// Creates index for all regions, mark regions N void TChromosomalIndices::run_regions ( chrvec *_chrs , string filename ) { if ( DEBUGGING ) { fprintf ( stdout , "Indexing regions ... " ) ; fflush(stdout); } chrs = _chrs ; int a , chr ; // for ( a = 0 ; a < position.size() ; a++ ) position[a].reserve ( 100 ) ; // Pre-reserve memory, rule-of-thumb // cout << endl ; // Go through region file FILE * file = fopen ( filename.c_str() , "r" ) ; char dummy[READ_CHAR_BUFFER] ; while ( !feof(file) ) { *dummy = 0 ; fgets ( dummy , READ_CHAR_BUFFER , file ) ; if ( !*dummy ) break ; vector parts ; Tokenize ( dummy , parts , "\t" ) ; if ( parts.size() < 3 ) continue ; for ( chr = 0 ; chr < chrs->size() && (*chrs)[chr].name != parts[0] ; a++ ) ; if ( chr == chrs->size() ) { cout << "Unknown chromosome " << parts[0] << " in region list!\n" ; continue ; } // cout << dummy ; uint from = atoi ( parts[1].c_str() ) - 1 ; uint to = atoi ( parts[2].c_str() ) - 1 ; // Mark region as "N" for ( a = from ; a <= to ; a++ ) (*chrs)[chr].sequence[a] = 'N' ; uint offset = 20 ; string s ; uint pl = (*chrs)[chr].sequence.length() - index_length_double ; string *p = & ( (*chrs)[chr].sequence ) ; for ( a = from - index_length_double - offset ; a <= to ; a++ ) { if ( a < 0 ) continue ; if ( a >= pl ) continue ; s = p->substr ( a , index_length_double ) ; char *sc = (char*) s.c_str() ; add_sequence ( sc , sc , a , chr , false ) ; add_sequence ( sc , sc , a+index_length_double , chr , true ) ; } } // Sort all sub-lists for ( a = 0 ; a < position.size() ; a++ ) { sort ( position[a].begin() , position[a].end() ) ; } if ( DEBUGGING ) { fprintf ( stdout , "done.\n" ) ; fflush(stdout); } } /// Create index for beginning and end of all contigs. void TChromosomalIndices::run_ends ( chrvec *_chrs , int min_length ) { if ( DEBUGGING ) { fprintf ( stdout , "Indexing contigs ... " ) ; fflush(stdout); } chrs = _chrs ; uint a , chr ; // for ( a = 0 ; a < position.size() ; a++ ) position[a].reserve ( 100 ) ; // Pre-reserve memory, rule-of-thumb for ( chr = 0 ; chr < chrs->size() ; chr++ ) { if ( (*chrs)[chr].sequence.length() < min_length ) continue ; string s , *p ; uint pl = (*chrs)[chr].sequence.length() - index_length_double ; char *sc = (char*) (*chrs)[chr].sequence.c_str() ; for ( uint a = 0 ; a < pl ; a++ ) { add_sequence ( sc , sc , a , chr , false ) ; // add_sequence ( sc , sc , a+index_length_double , chr , true ) ; } } /* if ( DEBUGGING ) { fprintf ( stdout , "sorting ... " ) ; fflush(stdout); } // Sort all sub-lists for ( a = 0 ; a < position.size() ; a++ ) { sort ( position[a].begin() , position[a].end() ) ; } */ if ( DEBUGGING ) { fprintf ( stdout , "done.\n" ) ; fflush(stdout); } } string TChromosomalIndices::decompile ( uint index , uint bases ) { string ret ; while ( bases > 0 ) { ret += num2char[(index&3)+1] ; index >>= 2 ; bases-- ; } reverse ( ret.begin() , ret.end() ) ; return ret ; } /// Create index for all chromosomes. void TChromosomalIndices::run ( chrvec *_chrs ) { chrs = _chrs ; if ( !index_file.empty() && fopen ( index_file.c_str() , "r" ) ) { load_index ( index_file ) ; return ; } if ( DEBUGGING ) { if ( index_from == 0 && index_to == 0 ) fprintf ( stdout , "Indexing %d chromosomes ... " , chrs->size() ) ; else fprintf ( stdout , "Indexing %d chromosomes (%d-%d) ... " , chrs->size() , index_from , index_to ) ; fflush(stdout); } uint a , chr ; for ( chr = 0 ; chr < chrs->size() ; chr++ ) { string s , *p ; char *sc = (char*) (*chrs)[chr].sequence.c_str() ; uint pl = (*chrs)[chr].sequence.length() ; if ( index_to > 0 && index_to < pl ) pl = index_to ; sc += index_from ; for ( a = index_from ; a < pl ; a += index_steps ) { int r = 0 ; r += add_sequence ( sc , sc , a , chr , false ) ; r += add_sequence ( sc , sc , a+index_length_double , chr , true ) ; sc += index_steps ; } } if ( DEBUGGING ) { fprintf ( stdout , "sorting ... " ) ; fflush(stdout); } // Nuke huge indices; speedup: 50% if ( !noshortcuts ) { pwi_vector largest ; for ( a = 0 ; a < position.size() ; a++ ) { if ( largest.size() < 20 ) { TPositionWithIndex tmp ; tmp.index2 = position[a].size() ; tmp.position = a ; largest.push_back ( tmp ) ; sort ( largest.begin() , largest.end() ) ; continue ; } if ( largest[0].index2 >= position[a].size() ) continue ; largest[0].index2 = position[a].size() ; largest[0].position = a ; sort ( largest.begin() , largest.end() ) ; } for ( a = 0 ; a < largest.size() ; a++ ) { if ( largest[a].index2 < largest.back().index2 / 4 ) continue ; position[largest[a].position].clear() ; } } // Sort all sub-lists for ( a = 0 ; a < position.size() ; a++ ) { sort ( position[a].begin() , position[a].end() ) ; } if ( DEBUGGING ) { fprintf ( stdout , "done.\n" ) ; fflush(stdout); } if ( !index_file.empty() ) { save_index ( index_file ) ; return ; } } /// Recurses/iterates through sequence with possible IUPAC codes, creates all possible combinations, and adds them to the index list. int TChromosomalIndices::add_sequence ( char *begin , char *cur , int from , int chr , bool rc , int nos ) { if ( nos > max_snps_per_index ) return 0 ; int ret = 0 ; while ( index_length_double > cur - begin ) { if ( isIUPAC[*cur] ) { if ( *cur == 'X' ) return ret ; // Contains X, aborting uchar c = char2IUPAC[*cur] ; uchar oc = *cur ; if ( c & IUPAC_A ) { *cur = 'A' ; ret += add_sequence ( begin , cur + 1 , from , chr , rc , nos+1 ) ; } if ( c & IUPAC_C ) { *cur = 'C' ; ret += add_sequence ( begin , cur + 1 , from , chr , rc , nos+1 ) ; } if ( c & IUPAC_G ) { *cur = 'G' ; ret += add_sequence ( begin , cur + 1 , from , chr , rc , nos+1 ) ; } if ( c & IUPAC_T ) { *cur = 'T' ; ret += add_sequence ( begin , cur + 1 , from , chr , rc , nos+1 ) ; } *cur = oc ; return ret ; } cur++ ; } char *sc = begin ; if ( rc ) { sc = tmp ; memcpy ( tmp , begin , index_length_double ) ; tmp[index_length_double] = 0 ; REVERSE_COMPLEMENT_CHARP ( sc ) ; } uint key = calculate_index ( sc ) ; TPositionWithIndex pwi ; pwi.position = from ; pwi.index2 = calculate_index2 ( sc + index_length ) ; pwi.reverse_complement = rc ; pwi.chromosome = chr ; position[key].push_back ( pwi ) ; return ret + 1 ; } /// sequence_kmer MUST only contain As, Cs, Gs, or Ts uint TChromosomalIndices::calculate_index ( const char *sequence_kmer ) { uint ret = 0 ; for ( int a = 0 ; a < index_length ; a++ ) { ret = ( ret << 2 ) | ( ((uint)char2num[(uint)sequence_kmer[a]])-1 ) ; } return ret ; } /// sequence_kmer MUST only contain As, Cs, Gs, or Ts index2type TChromosomalIndices::calculate_index2 ( const char *sequence_kmer ) { uint ret = 0 ; for ( int a = 0 ; a < index_length2 ; a++ ) { ret = ( ret << 2 ) | ( ((uint)char2num[(uint)sequence_kmer[a]])-1 ) ; } return ret ; } void TChromosomalIndices::save_index ( string filename ) { if ( DEBUGGING ) { fprintf ( stdout , "Saving index file ... " ) ; fflush(stdout); } FILE *file = fopen ( string ( filename.c_str() ).c_str() , "w" ) ; uint a , b ; uint temp = 1 ; // Version fwrite ( &temp , sizeof ( temp ) , 1 , file ) ; temp = position.size() ; // cout << endl << position.size() << endl << temp << endl ; fwrite ( &temp , sizeof ( temp ) , 1 , file ) ; for ( a = 0 ; a < position.size() ; a++ ) { temp = position[a].size() ; fwrite ( &temp , sizeof ( temp ) , 1 , file ) ; for ( b = 0 ; b < position[a].size() ; b++ ) { fwrite ( &position[a][b].index2 , sizeof ( position[a][b].index2 ) , 1 , file ) ; fwrite ( &position[a][b].position , sizeof ( position[a][b].position ) , 1 , file ) ; fwrite ( &position[a][b].chromosome , sizeof ( position[a][b].chromosome ) , 1 , file ) ; fwrite ( &position[a][b].reverse_complement , sizeof ( position[a][b].reverse_complement ) , 1 , file ) ; } } fclose ( file ) ; if ( DEBUGGING ) { fprintf ( stdout , "done.\n" ) ; fflush(stdout); } } void TChromosomalIndices::load_index ( string filename ) { if ( DEBUGGING ) { fprintf ( stdout , "Loading index file ... " ) ; fflush(stdout); } uint temp ; FILE *file = fopen ( string ( filename.c_str() ).c_str() , "r" ) ; if ( !file ) exit ( 1 ) ; fread ( &temp , sizeof ( temp ) , 1 , file ) ; // Version fread ( &temp , sizeof ( temp ) , 1 , file ) ; // position size if ( position.size() != temp ) exit ( 1 ) ; uint a , b ; for ( a = 0 ; a < position.size() ; a++ ) { fread ( &temp , sizeof ( temp ) , 1 , file ) ; // Sub-index size position[a].reserve ( temp ) ; for ( b = 0 ; b < temp ; b++ ) { TPositionWithIndex i ; fread ( &i.index2 , sizeof ( i.index2 ) , 1 , file ) ; fread ( &i.position , sizeof ( i.position ) , 1 , file ) ; fread ( &i.chromosome , sizeof ( i.chromosome ) , 1 , file ) ; fread ( &i.reverse_complement , sizeof ( i.reverse_complement ) , 1 , file ) ; position[a].push_back ( i ) ; // if ( i.chromosome > 100 ) cerr << "!" << i.chromosome << endl ; } } fclose ( file ) ; if ( DEBUGGING ) { fprintf ( stdout , "done.\n" ) ; fflush(stdout); } } void TChromosomalIndices::uniqueness ( string filename ) { uint a , b , c ; for ( a = 0 ; a < chrs->size() ; a++ ) { (*chrs)[a].uniqueness.resize ( (*chrs)[a].sequence.length() , 0 ) ; } for ( a = 0 ; a < position.size() ; a++ ) { for ( b = 0 ; b < position[a].size() ; b++ ) { for ( c = b+1 ; c < position[a].size() && position[a][b].index2 == position[a][c].index2 ; c++ ) ; if ( c == b+1 ) continue ; uint count = c - b ; cout << count << endl ; for ( c = b ; c < position[a].size() && position[a][b].index2 == position[a][c].index2 ; c++ ) { uint p = position[a][c].position ; uint chr = position[a][c].chromosome ; if ( chr > chrs->size() ) { cerr << position[a][c].chromosome << "/" << c << " : " << chr << endl ; exit ( 1 ) ; } uint si = (*chrs)[chr].uniqueness.size() ; if ( si == 0 ) { (*chrs)[chr].uniqueness.resize ( (*chrs)[chr].sequence.length() , 0 ) ; si = (*chrs)[chr].uniqueness.size() ; } for ( uint u = 0 ; u < index_length_double ; u++ ) { if ( u + p >= si ) break ; (*chrs)[chr].uniqueness[u+p] += count ; } } b = c - 1 ; } } for ( a = 0 ; a < chrs->size() ; a++ ) { for ( b = 0 ; b < (*chrs)[a].uniqueness.size() ; b++ ) { if ( (*chrs)[a].uniqueness[b] == 0 ) (*chrs)[a].uniqueness[b] = 1 ; } } FILE *u = fopen ( filename.c_str() , "w" ) ; for ( a = 0 ; a < chrs->size() ; a++ ) { for ( b = 0 ; b < (*chrs)[a].uniqueness.size() ; b++ ) { fprintf ( u , "%s\t%d\t%d\n" , (*chrs)[a].name.c_str() , b + 1 , (*chrs)[a].uniqueness[b] ) ; } } fclose ( u ) ; } snpomatic-1.0/src/TChromosome.cpp000066400000000000000000000110621260772140700171070ustar00rootroot00000000000000#include "snpomatic.h" /// Reads a single chromosome from a FASTA file; superceded by load_all_chromosomes(). void TChromosome::read_chromosome ( string filename , string chromosome_name ) { if ( DEBUGGING ) { fprintf ( stdout , "Reading %s from %s ... " , chromosome_name.c_str() , filename.c_str() ) ; fflush(stdout); } name = chromosome_name ; FILE * file = fopen ( filename.c_str() , "r" ) ; char dummy[READ_CHAR_BUFFER] , *c1 , *c2 ; uchar store = 0 ; while ( !feof(file) ) { *dummy = 0 ; fgets ( dummy , READ_CHAR_BUFFER , file ) ; if ( *dummy == '>' ) { if ( store ) break ; // Read the chromosome for ( c1 = dummy+1 ; *c1 == ' ' ; c1++ ) ; for ( c2 = c1 ; *c2 && *c2 != 10 && *c2 != 13 && *c2 != ' ' && *c2 != '|' ; c2++ ) ; *c2 = 0 ; if ( c1 == chromosome_name ) store = 1 ; } else if ( store ) { for ( c1 = dummy ; *c1 ; c1++ ) { switch ( *c1 ) { case 'a' : *c1 = 'A' ; break ; case 'c' : *c1 = 'C' ; break ; case 'g' : *c1 = 'G' ; break ; case 't' : *c1 = 'T' ; break ; } // if ( *c1 == 'A' || *c1 == 'C' || *c1 == 'G' || *c1 == 'T' ) { sequence += *c1 ; // } else if ( *c1 != 13 && *c1 != 10 ) { // fprintf ( stdout , "Genome data contains incompatible character '%c' - aborting.\n" , *c1 ) ; // exit ( 1 ) ; // } } } } fclose ( file ) ; if ( DEBUGGING ) { fprintf ( stdout , "read %d nucleotides.\n" , sequence.length() ) ; fflush(stdout); } } /// Reads a SNP list in .gff and puts the SNPs as IUPAC bases into the chromosome; superceded by incorporate_all_gff(). void TChromosome::incorporate_known_snps_gff ( string filename ) { if ( DEBUGGING ) { fprintf ( stdout , "Reading SNPS into %s from %s ... " , name.c_str() , filename.c_str() ) ; fflush(stdout); } FILE * file = fopen ( filename.c_str() , "r" ) ; int snpcnt = 0 ; char dummy[READ_CHAR_BUFFER] , *c1 , *c2 ; while ( !feof(file) ) { vector parts , parts2 ; *dummy = 0 ; fgets ( dummy , READ_CHAR_BUFFER , file ) ; Tokenize ( dummy , parts , "\t" ) ; if ( parts.size() < 7 ) continue ; // These is not the data you are looking for if ( parts[0] != name ) continue ; // Wrong chromosome if ( parts[2] != "SNP" ) continue ; // Not a SNP if ( parts[3] != parts[4] ) continue ; // Not a single point if ( parts[5] != "+" ) continue ; // Wrong strand (?) int pos = atoi ( parts[3].c_str() ) ; char merge = sequence[pos-1] ; Tokenize ( parts[6].c_str() , parts2 , "\"\" \"\"" ) ; bool found_reference = false ; for ( int b = 0 ; b < parts2.size() ; b++ ) { int l = parts2[b].length() ; if ( l < 2 ) continue ; if ( parts2[b][l-2] != ':' ) continue ; char c = parts2[b][l-1] ; if ( c == sequence[pos-1] ) found_reference = true ; merge = MERGE_IUPAC(merge,c) ; } if ( !found_reference ) { // Strange; maybe original sequence contains "N" // We should count this! continue ; } sequence[pos-1] = merge ; snpcnt++ ; } fclose ( file ) ; if ( DEBUGGING ) { fprintf ( stdout , "incorporated %d SNPs.\n" , snpcnt ) ; fflush(stdout); } } /// Reads a SNP list in simple format and puts the SNPs as IUPAC bases into the chromosome. void TChromosome::incorporate_known_snps ( string filename ) { if ( DEBUGGING ) { fprintf ( stdout , "Reading from %s ... " , filename.c_str() ) ; fflush(stdout); } FILE * file = fopen ( filename.c_str() , "r" ) ; char dummy[READ_CHAR_BUFFER] , *c1 , *c2 ; int snpcnt = 0 ; while ( !feof(file) ) { vector parts ; *dummy = 0 ; fgets ( dummy , READ_CHAR_BUFFER , file ) ; Tokenize ( dummy , parts , "\t" ) ; if ( parts.size() != 4 ) continue ; if ( name != parts[0] ) continue ; int pos = atoi ( parts[1].c_str() ) ; uchar orig = parts[2][0] ; uchar snp = parts[3][0] ; sequence[pos-1] = MERGE_IUPAC(orig,snp) ; snpcnt++ ; } fclose ( file ) ; if ( DEBUGGING ) { fprintf ( stdout , "incorporated %d SNPs.\n" , snpcnt ) ; fflush(stdout); } } /// Adds a (single) read to the coverage statistics void TChromosome::add_coverage ( const string &seq , uint pos , bool rc ) { if ( coverage.size() == 0 ) { // Set up vector TCoverage dc ; dc.A = dc.C = dc.T = dc.G = dc.rc = 0 ; // Paranoia coverage.resize ( sequence.length() , dc ) ; } for ( uint p = 0 ; p < seq.length() ; p++ ) { if ( p + pos >= sequence.length() ) break ; // End of the line, my friend... if ( rc ) coverage[p+pos].rc++ ; switch ( seq[p] ) { case 'A' : coverage[p+pos].A++ ; break ; case 'C' : coverage[p+pos].C++ ; break ; case 'G' : coverage[p+pos].G++ ; break ; case 'T' : coverage[p+pos].T++ ; break ; default : cerr << seq[p] << endl ; } ; } } snpomatic-1.0/src/TChromosomeAlign.cpp000066400000000000000000002001621260772140700200630ustar00rootroot00000000000000#include "snpomatic.h" string dor ( string s ) { // Do Reverse reverse ( s.begin() , s.end() ) ; return s ; } string doc ( string s ) { // Do Complementary for ( int a = 0 ; a < s.length() ; a++ ) s[a] = base_rc[s[a]] ; return s ; } string dorc ( string s ) { // Do Reverse/Complementary for ( int a = 0 ; a < s.length() ; a++ ) s[a] = base_rc[s[a]] ; reverse ( s.begin() , s.end() ) ; return s ; } bool tpi_full_compare ( const TPositionWithIndex a , const TPositionWithIndex b ) { return ( a.chromosome != b.chromosome ) ? a.chromosome < b.chromosome : a.position < b.position ; } /// Constructor TChromosomeAlign::TChromosomeAlign ( chrvec &_chrs , TChromosomalIndices &_index ) { pileup = binfile_no_match = binfile_single_match = binfile_multi_match = binfile_iupac = cigar = NULL ; wobble = fragmentplot = featuretable = gffout = coverage = snpsinreads = indelplot = inversions = NULL ; sqlite = sam = spancontigs = faceaway = rpa = NULL ; chrs = &_chrs ; index = &_index ; chop = 0 ; wobblemax = 2 ; max_align = (uint) 2 << 30 ; // Just a very large number fasta = false ; use_nono = false ; using_bins = false ; snpsinreads_first = true ; multimatch = false ; singlematch = false ; force_one_unique_match = false ; sqlite_prefix_first = true ; rpa_header_written = false ; single_read_length = 0 ; } void TChromosomeAlign::dump_coverage () { if ( !coverage ) return ; fprintf ( coverage , "Chromosome\tPosition\tReference\tA\tC\tG\tT\tReverse-complement\n" ) ; for ( uint chr = 0 ; chr < chrs->size() ; chr++ ) { if ( (*chrs)[chr].coverage.size() == 0 ) continue ; // Paranoia for ( uint pos = 0 ; pos < (*chrs)[chr].sequence.length() ; pos++ ) { if ( pos >= (*chrs)[chr].coverage.size() ) { fprintf ( coverage , "%s\t%d\t%c\t%d\t%d\t%d\t%d\t%d\n" , (*chrs)[chr].name.c_str() , pos + 1 , (*chrs)[chr].original_sequence.length() > 0 ? (*chrs)[chr].original_sequence[pos] : (*chrs)[chr].sequence[pos] , 0 , 0 , 0 , 0 , 0 ) ; } fprintf ( coverage , "%s\t%d\t%c\t%d\t%d\t%d\t%d\t%d\n" , (*chrs)[chr].name.c_str() , pos + 1 , (*chrs)[chr].original_sequence.length() > 0 ? (*chrs)[chr].original_sequence[pos] : (*chrs)[chr].sequence[pos] , (*chrs)[chr].coverage[pos].A , (*chrs)[chr].coverage[pos].C , (*chrs)[chr].coverage[pos].G , (*chrs)[chr].coverage[pos].T , (*chrs)[chr].coverage[pos].rc ) ; } } fclose ( coverage ) ; coverage = NULL ; // Paranoia } /// Initialization (after constructor) void TChromosomeAlign::init () { int i ; if ( pileup ) for ( i = 0 ; i < chrs->size() ; i++ ) (*chrs)[i].ao.init(&((*chrs)[i])) ; if ( sam ) { fprintf ( sam , "@HD\tVN:1.0\n" ) ; for ( i = 0 ; i < chrs->size() ; i++ ) { fprintf ( sam , "@SQ\tSN:%s\tLN:%d\n" , (*chrs)[i].name.c_str() , (*chrs)[i].sequence.length() ) ; } } } void TChromosomeAlign::add_nono_list ( string filename ) { FILE * file = fopen ( filename.c_str() , "r" ) ; char dummy[READ_CHAR_BUFFER] , *c1 , *c2 ; while ( !feof(file) ) { *dummy = 0 ; fgets ( dummy , READ_CHAR_BUFFER , file ) ; if ( !*dummy ) break ; c1 = dummy ; if ( *c1 == '>' || *c1 == '@' ) c1++ ; while ( *c1 == ' ' ) c1++ ; for ( c2 = c1 ; *c2 > 13 ; c2++ ) ; *c2 = 0 ; nono.push_back ( c1 ) ; } sort ( nono.begin() , nono.end() ) ; use_nono = true ; } bool TChromosomeAlign::is_nono ( const string &s ) { pair ss = equal_range ( nono.begin() , nono.end() , s ) ; return ss.first != ss.second ; } void TChromosomeAlign::write2bin ( const string &name , const string &sequence , const string &quality , int matches ) { string out ; if ( fasta ) { out = ">" + name + "\n" + sequence + "\n" ; } else { out = "@" + name + "\n" + sequence + "\n+\n" + quality + "\n" ; } if ( matches < 0 ) { if ( binfile_iupac ) fprintf ( binfile_iupac , "%s" , out.c_str() ) ; } else if ( matches == 0 ) { if ( binfile_no_match ) fprintf ( binfile_no_match , "%s" , out.c_str() ) ; } else if ( matches == 1 ) { if ( binfile_single_match ) fprintf ( binfile_single_match , "%s" , out.c_str() ) ; } else { if ( binfile_multi_match ) fprintf ( binfile_multi_match , "%s" , out.c_str() ) ; } } uint TChromosomeAlign::align_solexa_paired_read ( const string &name , const string &sequence , string quality , uint read_length_1 , uint fragment_length , uint range ) { string seq1 = sequence.substr ( 0 , read_length_1 ) ; string seq2 = sequence.substr ( read_length_1 , sequence.length() - read_length_1 ) ; while ( quality.length() < sequence.length() ) quality += (char) 33+30 ; string qual1 = quality.substr ( 0 , read_length_1 ) ; string qual2 = quality.substr ( read_length_1 , sequence.length() - read_length_1 ) ; pwi_vector res1 , res2 ; get_align_solexa_read ( seq1.c_str() , res1 ) ; if ( !sam && !wobble && !chop && !sqlite && res1.size() == 0 ) return 0 ; get_align_solexa_read ( seq2.c_str() , res2 ) ; if ( !sam && !wobble && !chop && !sqlite && res2.size() == 0 ) return 0 ; sort ( res1.begin() , res1.end() , tpi_full_compare ) ; sort ( res2.begin() , res2.end() , tpi_full_compare ) ; int ret = 0 , old_fragment , old_variance ; // Spanning contigs if ( ( rpa || spancontigs ) && res1.size() == 1 && res2.size() == 1 && res1[0].chromosome != res2[0].chromosome ) { if ( spancontigs ) { fprintf ( spancontigs , "%s\t%s\t%d\t%d\t%s\t%s\t%s\t%d\t%d\t%s\t%s\n" , last_solexa_name.c_str() , (*chrs)[res1[0].chromosome].name.c_str() , res1[0].position , res1[0].reverse_complement , seq1.c_str() , qual1.c_str() , (*chrs)[res2[0].chromosome].name.c_str() , res2[0].position , res2[0].reverse_complement , seq2.c_str() , qual2.c_str() ) ; } if ( rpa ) rpa_out ( res1[0] , res2[0] , read_length_1 , seq1 , qual1 , seq2 , qual2 ) ; return 0 ; } if ( ( sqlite || sam ) && res1.size() + res2.size() == 1 ) { // Unique single read TPositionWithIndex r = res1.size() ? res1[0] : res2[0] ; string seq = res1.size() ? seq1 : seq2 , s ; if ( sqlite ) { if ( r.reverse_complement ) s = generate_sqlite_paired_command ( r.chromosome , dorc ( seq ) , r.position - seq.length() , "" , 0 , '1' ) ; else s = generate_sqlite_paired_command ( r.chromosome , seq , r.position , "" , 0 , '1' ) ; add2sqlite_cache ( s ) ; } if ( sam ) { string qual = res1.size() ? qual1 : qual2 , s ; int readpart = res1.size() ? 1 : 2 ; if ( r.reverse_complement ) add_sam ( dorc ( seq ) , dor ( qual ) , r.chromosome , r.position - seq.length() , readpart , 0 , 0 , r.reverse_complement , false , false , true ) ; else add_sam ( seq , qual , r.chromosome , r.position , readpart , 0 , 0 , r.reverse_complement , false , false , true ) ; } return 0 ; } if ( sqlite ) { // Clear temporary cache, increase fragment/variance temporarily sqlite_out.clear () ; } sam_wrote = false ; ret += paired_read_combine ( res1 , res2 , fragment_length , range , read_length_1 , seq1 , qual1 , seq2 , qual2 ) ; ret += paired_read_combine ( res2 , res1 , fragment_length , range , read_length_1 , seq2 , qual2 , seq1 , qual1 ) ; if ( sqlite && sqlite_out.size() == 1 ) { add2sqlite_cache ( sqlite_out[0] ) ; return ret ; } // Try all fragment sizes if sqlite and no in-range find if ( ( sam && !sam_wrote ) || ( sqlite && sqlite_out.size() == 0 ) ) { old_fragment = fragment_length ; old_variance = range ; fragment_length = 90000000 ; // Long... range = 90000000 ; // Looooooong... bool done = false ; paired_read_combine ( res1 , res2 , fragment_length , range , read_length_1 , seq1 , qual1 , seq2 , qual2 ) ; paired_read_combine ( res2 , res1 , fragment_length , range , read_length_1 , seq2 , qual2 , seq1 , qual1 ) ; if ( sqlite_out.size() == 1 ) { add2sqlite_cache ( sqlite_out[0] ) ; done = true ; } fragment_length = old_fragment ; // Ah. Just kidding. range = old_variance ; if ( done ) return 0 ; } // Try inversions if ( ret == 0 && ( rpa || inversions || sqlite || ( sam && !sam_wrote ) ) ) { bool fake_long_region = true ; // sqlite ; if ( fake_long_region ) { old_fragment = fragment_length ; old_variance = range ; fragment_length = 90000000 ; // Long... range = 90000000 ; // Looooooong... } paired_read_combine ( res1 , res2 , fragment_length , range , read_length_1 , seq1 , qual1 , seq2 , qual2 , false , true , 1 ) ; paired_read_combine ( res2 , res1 , fragment_length , range , read_length_1 , seq2 , qual2 , seq1 , qual1 , false , true , 2 ) ; if ( fake_long_region ) { fragment_length = old_fragment ; // Ah. Just kidding. range = old_variance ; } if ( sqlite_out.size() == 1 ) { add2sqlite_cache ( sqlite_out[0] ) ; return 0 ; } } if ( ret > 0 ) return ret ; // Chop off read ends and try again (optional!) if ( chop > 0 ) { uint temp_chop = chop ; if ( 1 ) { // Chop off read end string nseq = seq1.substr ( 0 , seq1.length() - chop ) + seq2.substr ( chop , seq2.length() - chop ) ; string nqual = qual1.substr ( 0 , qual1.length() - chop ) + qual2.substr ( chop , qual2.length() - chop ) ; chop = 0 ; ret = align_solexa_paired_read ( name , nseq , nqual , read_length_1 - temp_chop , fragment_length , range ) ; chop = temp_chop ; } if ( ret == 0 ) { // Still no luck, try to chop off the beginning string nseq = seq1.substr ( chop , seq1.length() - chop ) + seq2.substr ( 0 , seq2.length() - chop ) ; string nqual = qual1.substr ( chop , qual1.length() - chop ) + qual2.substr ( 0 , qual2.length() - chop ) ; chop = 0 ; ret = align_solexa_paired_read ( name , nseq , nqual , read_length_1 - temp_chop , fragment_length , range ) ; chop = temp_chop ; } if ( ret == 0 ) { // Still no luck, try to chop off some of both beginning and end uint chop1 = chop / 2 ; uint chop2 = chop - chop1 ; string nseq = seq1.substr ( chop1 , seq1.length() - chop ) + seq2.substr ( chop2 , seq2.length() - chop ) ; string nqual = qual1.substr ( chop1 , qual1.length() - chop ) + qual2.substr ( chop2 , qual2.length() - chop ) ; chop = 0 ; ret = align_solexa_paired_read ( name , nseq , nqual , read_length_1 - temp_chop , fragment_length , range ) ; chop = temp_chop ; } } else if ( wobble ) run_wobble ( seq1 , seq2 , qual1 , qual2 , res1 , res2 , fragment_length ) ; // Don't do in chopped and unchopped state return ret ; } void TChromosomeAlign::get_align_solexa_read ( const char *_seq , pwi_vector &results ) { string rc , rcq ; uint l = strlen ( _seq ) ; // results.reserve ( 50 ) ; for ( uint offset = 0 ; offset < index->index_steps ; offset++ ) { uint i1 = index->calculate_index ( _seq + offset ) ; if ( !index->position[i1].size() ) continue ; // Binary search TPositionWithIndex findme ; findme.index2 = index->calculate_index2 ( _seq + offset + index->index_length ) ; pair pp = equal_range ( index->position[i1].begin() , index->position[i1].end() , findme ) ; for ( TPWIi it = pp.first ; it < pp.second ; it++ ) { int p = it->position ; if ( it->reverse_complement ) { if ( p+offset < l ) continue ; if ( rc.empty() ) { // Creating the reverse-complement rc = string ( _seq ) ; REVERSE_COMPLEMENT_CHARP ( (char*)rc.c_str() ) ; } char *chrseq = (char*) (*chrs)[it->chromosome].sequence.c_str() ; if ( !matchIUPAC ( rc.c_str() , chrseq+p-l+offset ) ) continue ; p += offset ; } else { if ( p + l > (*chrs)[it->chromosome].sequence.length() ) continue ; if ( p < offset ) continue ; char *chrseq = (char*) (*chrs)[it->chromosome].sequence.c_str() ; if ( !matchIUPAC ( _seq , chrseq+p-offset ) ) continue ; p -= offset ; } results.push_back ( *it ) ; if ( offset ) results.back().position = p ; } } } void TChromosomeAlign::run_wobble ( const string &seq1 , const string &seq2 , const string &qual1 , const string &qual2 , const pwi_vector &res1 , const pwi_vector &res2 , uint fragment_length ) { uint cnt , frag , range = fragment_range ; uint read_length_1 = seq1.length() ; uint insertion_length = 10000 ; if ( res1.size() + res2.size() > 10 ) return ; // Too many (number's just a whim) if ( res1.size() * res2.size() == 0 ) { // One read matches perfectly but the other does not if ( res1.size() + res2.size() == 1 ) { // Single match for one side, none for the other int the_pos , the_chr , match_pos ; string out ; bool rc ; if ( res1.size() == 1 ) { rc = res1[0].reverse_complement ; match_pos = res1[0].position ; the_pos = res1[0].position + ( fragment_length - seq1.length() * 2 ) * ( rc ? -1 : 1 ) ; the_chr = res1[0].chromosome ; out = !rc ? dorc ( seq2 ) : seq2 ; } else { rc = res2[0].reverse_complement ; match_pos = res2[0].position ; the_pos = res2[0].position - ( fragment_length - seq1.length() * 2 ) * ( rc ? -1 : 1 ) ; the_chr = res2[0].chromosome ; out = rc ? dorc ( seq1 ) : seq1 ; } fprintf ( wobble , "MIS\t%s\t%d\t%d\t%s\n" , (*chrs)[the_chr].name.c_str() , match_pos , the_pos , out.c_str() ) ; } else if ( res1.size() == 1 || res2.size() == 1 ) { pwi_vector single = res1.size() == 1 ? res1 : res2 ; pwi_vector multi = res1.size() == 1 ? res2 : res1 ; bool good = true ; int avg_pos = 0 ; int avg_cnt = 0 ; for ( int a = 0 ; a < multi.size() ; a++ ) { if ( single[0].chromosome != multi[a].chromosome ) { good = false ; break ; } if ( abs ( ((int)single[0].position) - ((int)multi[a].position) ) > fragment_length * 2 ) { good = false ; break ; } if ( single[0].reverse_complement == multi[a].reverse_complement ) { good = false ; break ; } avg_pos += multi[a].position ; avg_cnt++ ; } if ( good ) { int the_chr = single[0].chromosome ; int match_pos = single[0].position ; int the_pos = avg_pos / avg_cnt ; string out = res1.size() == 1 ? seq2 : seq1 ; out = multi[0].reverse_complement ? dorc ( out ) : out ; fprintf ( wobble , "MUL\t%s\t%d\t%d\t%s\n" , (*chrs)[the_chr].name.c_str() , match_pos , the_pos , out.c_str() ) ; } } if ( sqlite ) sqlite_out.clear() ; range = fragment_range ; // if ( res1.size() + res2.size() == 1 ) { if ( res1.size() > 0 ) wobble4snps ( res1 , seq2 , fragment_length , range , seq1 , false ) ; if ( res2.size() > 0 ) wobble4snps ( res2 , seq1 , fragment_length , range , seq2 , true ) ; // } if ( sqlite_out.size() == 1 ) { add2sqlite_cache ( sqlite_out[0] ) ; } return ; } return ; uint a ; cout << endl << seq1 << "\t" ; for ( a = 0 ; a < res1.size() ; a++ ) cout << res1[a].chromosome << "@" << res1[a].position << ":" << res1[a].reverse_complement << ", " ; cout << endl ; cout << seq2 << "\t" ; for ( a = 0 ; a < res2.size() ; a++ ) cout << res2[a].chromosome << "@" << res2[a].position << ":" << res2[a].reverse_complement << ", " ; cout << endl ; uint b ; for ( a = 0 ; a < res1.size() ; a++ ) { for ( b = 0 ; b < res2.size() ; b++ ) { if ( res1[a].chromosome != res2[b].chromosome ) continue ; TPositionWithIndex pa , pb ; if ( res1[a].position > res2[b].position ) { pa = res2[b] ; pb = res1[a] ; } else { pa = res1[a] ; pb = res2[b] ; } pa.position += (int) ( pa.reverse_complement ? -1 : 1 ) * seq1.length() ; pb.position += (int) ( pb.reverse_complement ? -1 : 1 ) * seq2.length() ; // uint diff = pb.position - pa.position ; fprintf ( wobble , "DEL\t%s\t%d\t%d\t%d\n" , (*chrs)[pa.chromosome].name.c_str() , pa.position , pb.position , pb.position - pa.position ) ; // cout << res1[a].position << ":" << res1[a].reverse_complement << " <=> " ; // cout << res2[b].position << ":" << res2[b].reverse_complement << "\t" << diff << endl ; } } return; /* // Deletions frag = fragment_length / 2 ; range = frag - read_length_1 ; wobble_results.clear() ; cnt = paired_read_combine ( res1 , res2 , frag , range , read_length_1 , seq1 , qual1 , seq2 , qual2 , true ) ; while ( wobble_results.size() ) { TPositionWithIndex p1 , p2 ; p2 = wobble_results.back() ; // Reverse order wobble_results.pop_back() ; p1 = wobble_results.back() ; // than pushing wobble_results.pop_back() ; if ( p1.position > p2.position ) { TPositionWithIndex p3 = p1 ; p1 = p2 ; p2 = p1 ; } fprintf ( wobble , "DEL\t%s\t%d\t%d\t%d\n" , (*chrs)[p1.chromosome].name.c_str() , p1.position , p1.position + fragment_length , p2.position - p1.position ) ; } // Insertions frag = fragment_length + insertion_length / 2 ; range = insertion_length / 2 ; wobble_results.clear() ; cnt = paired_read_combine ( res1 , res2 , frag , range , read_length_1 , seq1 , qual1 , seq2 , qual2 , true ) ; while ( wobble_results.size() ) { TPositionWithIndex p1 , p2 ; p2 = wobble_results.back() ; // Reverse order wobble_results.pop_back() ; p1 = wobble_results.back() ; // than pushing wobble_results.pop_back() ; if ( p1.position > p2.position ) { TPositionWithIndex p3 = p1 ; p1 = p2 ; p2 = p1 ; } fprintf ( wobble , "INS\t%s\t%d\t%d\t%d\n" , (*chrs)[p1.chromosome].name.c_str() , p1.position , p1.position + fragment_length , p2.position - p1.position ) ; } */ } void TChromosomeAlign::wobble4snps ( const pwi_vector &res , const string &seq , uint fragment_size , uint range , const string &anchor_seq , bool rc ) { if ( res.empty() ) return ; string seq_rc = dorc ( seq ) ; int from , to , c ; vector out ; char dummy[READ_CHAR_BUFFER] ; int total_matches = 0 ; // string rc = dorc ( seq ) ; int sl = seq.length() ; for ( uint a = 0 ; a < res.size() ; a++ ) { string *cseq = &(*chrs)[res[a].chromosome].sequence ; if ( !(*chrs)[res[a].chromosome].original_sequence.empty() ) cseq = &(*chrs)[res[a].chromosome].original_sequence ; // if ( !cseq->empty() ) cseq = &(*chrs)[res[a].chromosome].sequence ; if ( res[a].reverse_complement ) from = res[a].position - fragment_size - range ; else from = res[a].position + fragment_size - range ; if ( from < 0 ) from = 0 ; to = from + range * 2 ; if ( to >= cseq->length() ) to = cseq->length() - 1 ; to -= sl ; // Add general variation range // fprintf ( wobble , "VAR\t%s\t%d\t%d\n" , (*chrs)[res[a].chromosome].name.c_str() , (int) ((from+to)/2-seq.length()/2) , (int) ((from+to)/2+seq.length()/2) ) ; // out.push_back ( dummy ) ; for ( int b = from ; b <= to ; b++ ) { vector pos ; vector o , n ; pos.reserve ( wobblemax ) ; o.reserve ( wobblemax ) ; n.reserve ( wobblemax ) ; const char *t = res[a].reverse_complement ? seq.c_str() : seq_rc.c_str() ; int mismatches = 0 ; const char *orig = cseq->c_str() + b ; for ( c = 0 ; c < sl ; c++ , t++ , orig++ ) { if ( char2IUPAC[*t] & char2IUPAC[*orig] ) continue ; if ( ++mismatches > wobblemax ) break ; pos.push_back ( b + c + 1 ) ; o.push_back ( *orig ) ; n.push_back ( *t ) ; } if ( mismatches == 0 ) continue ; // Weird... if ( mismatches > wobblemax ) continue ; // total_matches++ ; // if ( total_matches > 1 ) break ; if ( ++total_matches > 1 ) return ; for ( c = 0 ; c < pos.size() ; c++ ) { sprintf ( dummy , "SNP\t%s\t%d\t%c\t%c\n" , (*chrs)[res[a].chromosome].name.c_str() , pos[c] , o[c] , n[c] ) ; out.push_back ( dummy ) ; } if ( sqlite ) { uint opos = res[a].position ; if ( res[a].reverse_complement ) opos -= anchor_seq.length() ; string s = generate_sqlite_paired_command ( res[a].chromosome , res[a].reverse_complement ? dorc ( anchor_seq ) : anchor_seq , opos , res[a].reverse_complement ? seq : seq_rc , b , 'S' ) ; /* cout << s << endl ; cout << ( res[a].reverse_complement ? "RC" : "NORM" ) << endl ; cout << anchor_seq << endl ; cout << dorc ( anchor_seq ) << endl ; cout << dor ( anchor_seq ) << endl ; cout << doc ( anchor_seq ) << endl ; cout << cseq->substr ( opos , anchor_seq.length() ) << endl ; cout << endl ;*/ sqlite_out.push_back ( s ) ; } } } if ( total_matches > 1 ) return ; for ( c = 0 ; c < out.size() ; c++ ) { fprintf ( wobble , "%s" , out[c].c_str() ) ; } } string TChromosomeAlign::generate_sqlite_paired_command ( int chr , string seq1 , int pos1 , string seq2 , int pos2 , char mode , string add_keys , string add_values ) { string ret ; string *refp = (*chrs)[chr].original_sequence.empty() ? &((*chrs)[chr].sequence) : &((*chrs)[chr].original_sequence) ; string ref1 = refp->substr ( pos1 , seq1.length() ) ; string ref2 = refp->substr ( pos2 , seq2.length() ) ; if ( seq1 == ref1 && seq2 == ref2 && mode == ' ' ) { // No SNPs char tmp[1000] , *chrn = (char*) (*chrs)[chr].name.c_str() ; sprintf ( tmp , "/*P|%s|%9d*/ INSERT INTO %s_perfect_match (read_name,pos1,pos2) VALUES (\"%s\",%d,%d);" , chrn , pos1 , chrn , last_solexa_name.c_str() , pos1+1 , pos2+1 ) ; ret = tmp ; } else { // SNP(s) char tmp[1000] , *chrn = (char*) (*chrs)[chr].name.c_str() ; int a ; string out1 , out2 ; string table = "_snp_match" ; if ( mode == ' ' ) mode = 'S' ; else if ( mode == 'I' ) table = "_inversions" ; else if ( mode == '1' ) table = "_single" ; if ( seq1 != ref1 ) { out1 = seq1 ; for ( a = 0 ; a < out1.length() ; a++ ) out1[a] = seq1[a] != ref1[a] ? seq1[a] : seq1[a] - 'A' + 'a' ; } if ( seq2 != ref2 && mode != '1' ) { out2 = seq2 ; for ( a = 0 ; a < out2.length() ; a++ ) out2[a] = seq2[a] != ref2[a] ? seq2[a] : seq2[a] - 'A' + 'a' ; } if ( mode == '1' ) { sprintf ( tmp , "/*%c|%s|%9d*/ INSERT INTO %s%s (read_name,seq1,pos1) VALUES (\"%s\",\"%s\",%d);" , mode , chrn , pos1 , chrn , table.c_str() , last_solexa_name.c_str() , out1.c_str() , pos1+1 ) ; } else { sprintf ( tmp , "/*%c|%s|%9d*/ INSERT INTO %s%s (read_name,seq1,pos1,seq2,pos2%s) VALUES (\"%s\",\"%s\",%d,\"%s\",%d%s);" , mode , chrn , pos1 , chrn , table.c_str() , add_keys.c_str(), last_solexa_name.c_str() , out1.c_str() , pos1+1 , out2.c_str() , pos2+1 , add_values.c_str() ) ; } ret = tmp ; } return ret ; } int TChromosomeAlign::paired_read_combine ( const pwi_vector &v1 , const pwi_vector &v2 , uint fragment_length , uint range , uint read_length_1 , const string &seq1 , const string &qual1 , const string &seq2 , const string &qual2 , bool wobbling , bool inverting , int order ) { int ret = 0 ; vector cache ; uint a , b , c ; uint seq1_length = seq1.length() ; uint seq2_length = seq2.length() ; if ( force_one_unique_match && v1.size() > 1 && v2.size() > 1 ) return ret ; a = 0 ; b = 0 ; while ( a < v1.size() && b < v2.size() ) { int chr = v1[a].chromosome ; int l1 = v1[a].position + fragment_length - range - read_length_1 ; if ( l1 < v1[a].position + read_length_1 ) l1 = v1[a].position + read_length_1 ; while ( b < v2.size() && ( v2[b].chromosome < chr || ( v2[b].chromosome == chr && v2[b].position < l1 ) ) ) b++ ; if ( b == v2.size() ) break ; int l2 = v1[a].position + fragment_length + range ; for ( c = b ; c < v2.size() && chr == v2[c].chromosome && v2[c].position <= l2 ; c++ ) { if ( v2[c].position < l1 ) continue ; if ( inverting && v1[a].reverse_complement == v2[c].reverse_complement ) { // INVERSION! FROM MARS!!! if ( inversions ) fprintf ( inversions , "%s\t%d\t%d\n" , (*chrs)[v1[a].chromosome].name.c_str() , v1[a].position , v2[c].position ) ; if ( sqlite ) { string s , t ; if ( v1[a].reverse_complement ) { t = ",\"--\"" ; t[2] = '0' + order ; s = generate_sqlite_paired_command ( chr , dorc ( seq1 ) , v1[a].position - seq1_length , dorc ( seq2 ) , v2[c].position - seq2_length , 'I' , ",dir" , t ) ; } else { t = ",\"++\"" ; t[2] = '0' + order ; s = generate_sqlite_paired_command ( chr , seq1 , v1[a].position , seq2 , v2[c].position , 'I' , ",dir" , t ) ; } sqlite_out.push_back ( s ) ; } if ( sam ) { int position_a = v1[a].reverse_complement ? v1[a].position - seq1_length : v1[a].position ; int position_b = v2[c].reverse_complement ? v2[c].position - seq2_length : v2[c].position ; int dist = v2[c].position > v1[a].position ? v2[c].position - v1[a].position : v1[a].position - v2[c].position ; bool unique = true ; // Not true, but too complicated to fix... if ( v1[a].reverse_complement ) { add_sam ( dorc ( seq1 ) , dor ( qual1 ) , chr , position_a , 1 , position_b , dist , true , true , true , unique ) ; add_sam ( dorc ( seq2 ) , dor ( qual2 ) , chr , position_b , 2 , position_a , dist , true , true , true , unique ) ; } else { add_sam ( seq1 , qual1 , chr , position_a , 1 , position_b , dist , false , true , false , unique ) ; add_sam ( seq2 , qual2 , chr , position_b , 2 , position_a , dist , false , true , false , unique ) ; } sam_wrote = true ; } continue ; } if ( v1[a].reverse_complement == v2[c].reverse_complement ) continue ; ret++ ; // Add to output cache prc_cache ca ; ca.a = v1[a] ; ca.b = v2[c] ; cache.push_back ( ca ) ; if ( gffout ) { fprintf ( gffout , "\"%s\"\t\"SNP-O-MATIC\"\t\"misc_feature\"\t%d\t%d\t\"-\"\t\"%c\"\t\"-\"\t\"ID=%s_a;Gap=M%d\"\n" , (*chrs)[v1[a].chromosome].name.c_str() , v1[a].position , v1[a].position + read_length_1 , v1[a].reverse_complement ? '-' : '+' , last_solexa_name.c_str() , read_length_1 ) ; fprintf ( gffout , "\"%s\"\t\"SNP-O-MATIC\"\t\"misc_feature\"\t%d\t%d\t\"-\"\t\"%c\"\t\"-\"\t\"ID=%s_b;Gap=M%d\"\n" , (*chrs)[v2[c].chromosome].name.c_str() , v2[c].position , v2[c].position + read_length_1 , v2[c].reverse_complement ? '-' : '+' , last_solexa_name.c_str() , read_length_1 ) ; } if ( wobbling ) { TPositionWithIndex p1 , p2 ; p1.chromosome = chr ; p2.chromosome = chr ; if ( v1[a].reverse_complement ) p1.position = v1[a].position - seq1_length ; else p1.position = v1[a].position ; if ( v2[c].reverse_complement ) p2.position = v2[c].position - seq2_length ; else p2.position = v2[c].position ; wobble_results.push_back ( p1 ) ; wobble_results.push_back ( p2 ) ; } } a++ ; } if ( multimatch && cache.size() > 1 ) { // Force to map the read in one of its possible locations cache[0] = cache[rand()%cache.size()] ; cache.resize ( 1 ) ; } if ( singlematch && cache.size() > 1 ) return ret ; for ( uint n = 0 ; n < cache.size() ; n++ ) { prc_cache *p = &cache[n] ; int chr = p->a.chromosome ; // Same as p->b.chromosome int position_a = p->a.reverse_complement ? p->a.position - seq1_length : p->a.position ; int position_b = p->b.reverse_complement ? p->b.position - seq2_length : p->b.position ; int dist = p->b.position > p->a.position ? p->b.position - p->a.position : p->a.position - p->b.position ; if ( faceaway ) { if ( p->a.reverse_complement && !p->b.reverse_complement && p->a.position < p->b.position ) { fprintf ( faceaway , "%s\t%s" , last_solexa_name.c_str() , (*chrs)[p->a.chromosome].name.c_str() ) ; fprintf ( faceaway , "\t%d" , position_a ) ; fprintf ( faceaway , "\t%s" , p->a.reverse_complement ? dorc(seq1).c_str() : seq1.c_str() ) ; fprintf ( faceaway , "\t%d" , position_b ) ; fprintf ( faceaway , "\t%s" , p->b.reverse_complement ? dorc(seq2).c_str() : seq2.c_str() ) ; fprintf ( faceaway , "\n" ) ; } } if ( cigar ) { add_cigar_sqlite ( seq1 , last_solexa_name , position_a , p->a.chromosome , p->a.reverse_complement ? '-' : '+' ) ; add_cigar_sqlite ( seq2 , last_solexa_name , position_b , p->b.chromosome , p->b.reverse_complement ? '-' : '+' ) ; } if ( rpa ) { rpa_out ( p->a , p->b , read_length_1 , seq1 , qual1 , seq2 , qual2 ) ; } if ( sqlite ) { string s ; if ( p->a.reverse_complement ) s = generate_sqlite_paired_command ( chr , dorc ( seq1 ) , position_a , seq2 , position_b ) ; else s = generate_sqlite_paired_command ( chr , seq1 , position_a , dorc ( seq2 ) , position_b ) ; sqlite_out.push_back ( s ) ; } if ( fragmentplot ) { while ( fragment_stats.size() <= dist ) fragment_stats.push_back ( 0 ) ; fragment_stats[dist]++ ; } if ( indelplot ) { fprintf ( indelplot , "%s\t%d\t%d\t%d\n" , (*chrs)[p->a.chromosome].name.c_str() , p->a.position , p->b.position , dist ) ; } if ( coverage ) { if ( p->a.reverse_complement ) (*chrs)[chr].add_coverage ( dorc ( seq1 ) , position_a , true ) ; else (*chrs)[chr].add_coverage ( seq1 , position_a , false ) ; if ( p->b.reverse_complement ) (*chrs)[chr].add_coverage ( dorc ( seq2 ) , position_b , true ) ; else (*chrs)[chr].add_coverage ( seq2 , position_b , false ) ; } if ( sam ) { bool unique = cache.size() == 1 ; if ( p->a.reverse_complement ) add_sam ( dorc ( seq1 ) , dor ( qual1 ) , chr , position_a , 1 , position_b , dist , p->a.reverse_complement , true , p->b.reverse_complement , unique ) ; else add_sam ( seq1 , qual1 , chr , position_a , 1 , position_b , dist , p->a.reverse_complement , true , p->b.reverse_complement , unique ) ; if ( p->b.reverse_complement ) add_sam ( dorc ( seq2 ) , dor ( qual2 ) , chr , position_b , 2 , position_a , dist , p->b.reverse_complement , true , p->a.reverse_complement , unique ) ; else add_sam ( seq2 , qual2 , chr , position_b , 2 , position_a , dist , p->b.reverse_complement , true , p->a.reverse_complement , unique ) ; sam_wrote = true ; } if ( snpsinreads ) { if ( p->a.reverse_complement ) add_snpsinreads ( dorc ( seq1 ) , dor ( qual1 ) , chr , position_a , 1 ) ; else add_snpsinreads ( seq1 , qual1 , chr , position_a , 1 ) ; if ( p->b.reverse_complement ) add_snpsinreads ( dorc ( seq2 ) , dor ( qual2 ) , chr , position_b , 2 ) ; else add_snpsinreads ( seq2 , qual2 , chr , position_b , 2 ) ; } if ( pileup ) { if ( p->a.reverse_complement ) (*chrs)[chr].ao.add_align ( dorc ( seq1 ) , dor ( qual1 ) , position_a , chr ) ; else (*chrs)[chr].ao.add_align ( seq1 , qual1 , position_a , chr ) ; if ( p->b.reverse_complement ) (*chrs)[chr].ao.add_align ( dorc ( seq2 ) , dor ( qual2 ) , position_b , chr ) ; else (*chrs)[chr].ao.add_align ( seq2 , qual2 , position_b , chr ) ; } if ( featuretable ) { fprintf ( featuretable , "gene\t%d-%d\n\t/product=\"%s\"\n\t/chromosome=\"%s\"\n" , p->a.position , p->a.position + read_length_1 , last_solexa_name.c_str() , (*chrs)[p->a.chromosome].name.c_str() ) ; fprintf ( featuretable , "gene\t%d-%d\n\t/product=\"%s\"\n\t/chromosome=\"%s\"\n" , p->b.position , p->b.position + read_length_1 , last_solexa_name.c_str() , (*chrs)[p->b.chromosome].name.c_str() ) ; } } return ret ; } void TChromosomeAlign::rpa_out ( const TPositionWithIndex &a , const TPositionWithIndex &b , uint read_length_1 , const string &seq1 , const string &qual1 , const string &seq2 , const string &qual2 ) { if ( !rpa_header_written ) { fprintf ( rpa , "#Readname\tr1_seq\tr1_qual\tr1_chr\tr1_pos\tr1_rc\tr2_seq\tr2_qual\tr2_chr\tr2_pos\tr2_rc\n" ) ; rpa_header_written = true ; } fprintf ( rpa , "%s\t%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%d\t%d\n" , last_solexa_name.c_str() , seq1.c_str() , qual1.c_str() , (*chrs)[a.chromosome].name.c_str() , a.position , a.reverse_complement ? 1 : 0 , seq2.c_str() , qual2.c_str() , (*chrs)[b.chromosome].name.c_str() , b.position , b.reverse_complement ? 1 : 0 ) ; } void TChromosomeAlign::add_sam ( const string &seq , const string &quality , int chr , int pos , int readpart , int matepos , int ins_size , bool rc , bool as_pair , bool mate_rc , bool unique_match ) { char cigar[100] ; sprintf ( cigar , "%dM" , (int) seq.length() ) ; unsigned int flag = 0 ; flag |= 0x0001 ; // Paired in sequencing if ( as_pair ) flag |= 0x0002 ; // Mapped as pair else flag |= 0x0008 ; // Mate unmapped if ( rc ) flag |= 0x0010 ; // On reverse strand if ( as_pair ) { if ( mate_rc ) flag |= 0x0020 ; // Mate RC if ( 1 == readpart ) flag |= 0x0040 ; // First read in a pair if ( 2 == readpart ) flag |= 0x0080 ; // Second read in a pair } if ( !unique_match ) flag |= 0x0100 ; // Not uniquely mapped char mismatches[1000] ; sprintf ( mismatches , "\tNM:i:%d" , count_snpsinreads ( seq , chr , pos ) ) ; string tags ; // FILL ME // tags += "\tRG:Z:" + rgid ; // tags += "\tPU:Z:" + rgid ; tags += "\tPG:Z:SNP-o-matic" ; tags += mismatches ; if ( pos > matepos ) ins_size = -ins_size ; fprintf ( sam , "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s%s\n" , last_solexa_name.c_str() , // Read name flag , // flag (*chrs)[chr].name.c_str() , // Chromosome name pos + 1 , // Alignment start position 255 , // PHRED alignment score dummy cigar , // CIGAR "=" , // Mate matepos , // Mate position ins_size , // ISIZE (actually, fragment size) seq.c_str() , // Sequence quality.c_str() , // Quality tags.c_str() // Tag ) ; } int TChromosomeAlign::count_snpsinreads ( const string &seq , int chr , int pos ) { uint sl = (*chrs)[chr].original_sequence.length() ; if ( sl == 0 ) return 0 ; // No SNPs here int ret = 0 ; int seql = seq.length() , n , cnt ; const char *os = (*chrs)[chr].original_sequence.c_str() ; const char *ss = (*chrs)[chr].sequence.c_str() ; for ( int p = 0 ; p < seql && p + pos < sl ; p++ ) { if ( !isIUPAC[ss[p+pos]] || os[p+pos] == 'X' ) continue ; if ( seq[p] != os[p+pos] ) ret++ ; } return ret ; } void TChromosomeAlign::add_snpsinreads ( const string &seq , const string &quality , int chr , int pos , int readpart ) { uint sl = (*chrs)[chr].original_sequence.length() ; if ( sl == 0 ) return ; // No SNPs here if ( snpsinreads_first ) { // Output header snpsinreads_first = false ; fprintf ( snpsinreads , "#read_name\tread_part\tsnp_name\tchromosome\tposition_on_chromosome\tposition_on_read\tobserved_base\treference_base\tbase_quality\tmin_total\tavg_total\tmin_5\tavg_5\n" ) ; } int min_total = -1 , avg_total , min_5 , avg_5 ; int seql = seq.length() , n , cnt ; const char *os = (*chrs)[chr].original_sequence.c_str() ; const char *ss = (*chrs)[chr].sequence.c_str() ; for ( int p = 0 ; p < seql && p + pos < sl ; p++ ) { // if ( os[p+pos] == seq[p] || os[p+pos] == 'X' ) continue ; if ( !isIUPAC[ss[p+pos]] || os[p+pos] == 'X' ) continue ; // Preparing total quality scores if ( min_total == -1 ) { min_total = 1000 ; // Some high number avg_total = 0 ; for ( n = 0 ; n < seql ; n++ ) { if ( quality[n] < min_total ) min_total = quality[n] ; avg_total += quality[n] ; } min_total -= 33 ; avg_total = avg_total / seql - 33 ; } // Prepare surrounding quality scores min_5 = 1000 ; // Some high number avg_5 = 0 ; cnt = 0 ; for ( n = ( p >= 5 ) ? p - 5 : 0 ; n < p + 5 && n < seql ; n++ ) { if ( n == p ) continue ; // Do not use actual SNP base for quality cnt++ ; if ( quality[n] < min_5 ) min_5 = quality[n] ; avg_5 += quality[n] ; } if ( cnt > 0 ) { min_5 -= 33 ; avg_5 = avg_5 / cnt - 33 ; } else { avg_5 = 0 ; cerr << "TChromosomeAlign::add_snpsinreads : cnt == 0\n" ; } fprintf ( snpsinreads , "%s\t%d\t%s:%d\t%s\t%d\t%d\t%c\t%c\t%d\t%d\t%d\t%d\t%d\n" , last_solexa_name.c_str() , readpart , (*chrs)[chr].name.c_str() , pos+p+1 , (*chrs)[chr].name.c_str() , pos+p+1 , p+1 , seq[p] , os[p+pos] , (unsigned char) quality[p] - 33 , min_total , avg_total , min_5 , avg_5 ) ; } } void TChromosomeAlign::add2sqlite_cache ( string s ) { if ( sqlite_cache_name.empty() ) { char *c = new char[10000] ; strcpy ( c , "sqlite_cache_XXXXXXXX" ) ; close ( mkstemp ( c ) ) ; //tmpnam ( c ) ; sqlite_cache_name = c ; sqlite_cache = fopen ( c , "w" ) ; delete c ; } fprintf ( sqlite_cache , "%s\n" , s.c_str() ) ; } void TChromosomeAlign::finish_sqlite ( int fragment , int variance , int read_length ) { if ( !sqlite ) return ; int a , b ; fprintf ( sqlite , "BEGIN EXCLUSIVE TRANSACTION;\n" ) ; // Meta-data fprintf ( sqlite , "CREATE TABLE meta ( key VARCHAR[64] , value VARCHAR[256] ) ;\n" ) ; fprintf ( sqlite , "INSERT INTO meta ( key , value ) VALUES ( \"dbversion\" , \"3\" ) ;\n" ) ; fprintf ( sqlite , "INSERT INTO meta ( key , value ) VALUES ( \"%s\" , \"%d\" ) ;\n" , "fragment" , fragment ) ; fprintf ( sqlite , "INSERT INTO meta ( key , value ) VALUES ( \"%s\" , \"%d\" ) ;\n" , "variance" , variance ) ; fprintf ( sqlite , "INSERT INTO meta ( key , value ) VALUES ( \"%s\" , \"%d\" ) ;\n" , "read_length" , read_length ) ; fprintf ( sqlite , "INSERT INTO meta ( key , value ) VALUES ( \"%s\" , \"%s\" ) ;\n" , "name_prefix" , sqlite_prefix.c_str() ) ; // Create tables fprintf ( sqlite , "CREATE TABLE chromosomes ( name VARCHAR[256] , size INTEGER ) ;\n" ) ; for ( a = 0 ; a < chrs->size() ; a++ ) { fprintf ( sqlite , "CREATE TABLE %s_perfect_match ( read_name VARCHAR[32], pos1 INTEGER, pos2 INTEGER ) ;\n" , (*chrs)[a].name.c_str() ) ; fprintf ( sqlite , "CREATE TABLE %s_snp_match ( read_name VARCHAR[32], pos1 INTEGER, seq1 VARCHAR[64] , pos2 INTEGER , seq2 VARCHAR[64] ) ;\n" , (*chrs)[a].name.c_str() ) ; fprintf ( sqlite , "CREATE TABLE %s_inversions ( read_name VARCHAR[32], pos1 INTEGER, seq1 VARCHAR[64] , pos2 INTEGER , seq2 VARCHAR[64] , dir VARCHAR[4] ) ;\n" , (*chrs)[a].name.c_str() ) ; fprintf ( sqlite , "CREATE TABLE %s_single ( read_name VARCHAR[32], pos1 INTEGER, seq1 VARCHAR[64] ) ;\n" , (*chrs)[a].name.c_str() ) ; fprintf ( sqlite , "INSERT INTO chromosomes ( name , size ) VALUES ( \"%s\" , \"%d\") ;\n" , (*chrs)[a].name.c_str() , (*chrs)[a].sequence.length() ) ; } // Write contents // if ( DEBUGGING ) { fprintf ( stdout , "Sorting sqlite output ... " ) ; fflush(stdout); } // sort ( sqlite_cache.begin() , sqlite_cache.end() ) ; // if ( DEBUGGING ) { fprintf ( stdout , "writing ... " ) ; fflush(stdout); } fclose ( sqlite_cache ) ; sqlite_cache = fopen ( sqlite_cache_name.c_str() , "r" ) ; char dummy[READ_CHAR_BUFFER] , *c1 , *c2 ; string pref = "\"" + sqlite_prefix ; // for ( a = 0 ; a < sqlite_cache.size() ; a++ ) { // string s = sqlite_cache[a] ; while ( !feof ( sqlite_cache ) ) { fgets ( dummy , READ_CHAR_BUFFER , sqlite_cache ) ; for ( c1 = dummy ; *c1 > 13 ; c1++ ) ; *c1 = 0 ; string s ( dummy ) ; if ( s.empty() ) continue ; // Removing initial SQL comment to save disk space; we could leave it in, and sqlite would ignore it b = s.find ( "*/" ) ; if ( b > 0 && b < s.length() ) { b += 2 ; while ( s[b] == ' ' ) b++ ; s = s.substr ( b ) ; } // Remove common read name prefix int position = s.find ( pref ) ; // find first space while ( position != string::npos ) { s.replace( position, pref.length(), "\"" ); position = s.find( pref, position + 1 ); } fprintf ( sqlite , "%s\n" , s.c_str() ) ; } // Create indices fprintf ( sqlite , "COMMIT;\n" ) ; fprintf ( sqlite , "BEGIN EXCLUSIVE TRANSACTION;\n" ) ; for ( a = 0 ; a < chrs->size() ; a++ ) { char *c = (char*) (*chrs)[a].name.c_str() ; fprintf ( sqlite , "CREATE INDEX %s_perfect_index ON %s_perfect_match ( pos1 , pos2 ) ;\n" , c , c ) ; fprintf ( sqlite , "CREATE INDEX %s_snp_index ON %s_snp_match ( pos1 , pos2 ) ;\n" , c , c ) ; fprintf ( sqlite , "CREATE INDEX %s_inv_index ON %s_inversions ( pos1 , pos2 ) ;\n" , c , c ) ; fprintf ( sqlite , "CREATE INDEX %s_sin_index ON %s_single ( pos1 ) ;\n" , c , c ) ; } // Close up fprintf ( sqlite , "COMMIT;\n" ) ; fclose ( sqlite_cache ) ; remove ( sqlite_cache_name.c_str() ) ; if ( DEBUGGING ) { fprintf ( stdout , "done.\n" ) ; fflush(stdout); } } void TChromosomeAlign::align_solexa_paired ( string filename , string filename2 , uint read_length_1 , uint fragment_length , uint range ) { if ( DEBUGGING ) { fprintf ( stdout , "Reading solexa pair data from %s ... " , filename.c_str() ) ; fflush(stdout); } FILE * file = fopen ( filename.c_str() , "r" ) ; if ( !file ) { cerr << "Could not open FASTQ file \"" << filename << "\" - aborting." << endl ; exit ( 1 ) ; } FILE *file2 = NULL ; if ( !filename2.empty() ) { file2 = fopen ( filename2.c_str() , "r" ) ; if ( !file2 ) { cerr << "Could not open FASTQ file \"" << filename2 << "\" - aborting." << endl ; exit ( 1 ) ; } } if ( sam ) { for ( int a = 0 ; a < filename.length() ; a++ ) { if ( filename[a] == '/' ) rgid.clear() ; else rgid += filename[a] ; } char predicted_insert_size[100] ; sprintf ( predicted_insert_size , "\tPI:%d" , fragment_length - read_length_1 * 2 ) ; /* string rg ( "@RG" ) ; rg += "\tID:" + rgid ; rg += "\tPU:" + rgid ; rg += predicted_insert_size ; rg += "\tPL:SOLEXA" ; fprintf ( sam , "%s\n" , rg.c_str() ) ; */ string pg ( "@PG\tID:SNP-o-matic" ) ; fprintf ( sam , "%s\n" , pg.c_str() ) ; } char dummy[READ_CHAR_BUFFER] , *c1 , *c2 ; string lastseq , lastqual ; fasta = true ; bool last_was_quality = false , lastone = false , eof = false , skip = false ; uint readcount = 0 , matched_reads = 0 , total_matches = 0 , skipped = 0 ; if ( range == 0 ) range = fragment_length / 4 ; fragment_range = range ; int linecounter = 0 ; while ( !eof || lastone ) { // if ( readcount > 10000 ) break ; // TESTING if ( lastone ) { strcpy ( dummy , "@dummy" ) ; if ( fasta ) *dummy = '>' ; } else { *dummy = 0 ; fgets ( dummy , READ_CHAR_BUFFER , file ) ; if ( file2 ) { int len = 0 ; for ( c1 = dummy ; *c1 > 13 ; c1++ , len++ ) ; if ( linecounter == 1 ) read_length_1 = len ; fgets ( c1 , READ_CHAR_BUFFER , file2 ) ; } } // Remove EOL for ( c1 = dummy ; *c1 > 13 ; c1++ ) ; *c1 = 0 ; if ( linecounter == 0 && ( ( fasta && *dummy == '>' ) || *dummy == '@' ) ) { if ( *dummy == '@' ) fasta = false ; if ( !skip ) { if ( sqlite && !last_solexa_name.empty() ) { if ( sqlite_prefix_first ) { sqlite_prefix_first = false ; sqlite_prefix = last_solexa_name ; } else { int a ; for ( a = 1 ; a < sqlite_prefix.length() && a < last_solexa_name.length() && last_solexa_name[a] == sqlite_prefix[a] ; a++ ) ; if ( sqlite_prefix.length() != a ) sqlite_prefix = sqlite_prefix.substr ( 0 , a ) ; } } for ( c1 = (char*) lastseq.c_str() ; *c1 && !isIUPAC[*c1] ; c1++ ) ; if ( *c1 ) { // Contains IUPAC if ( using_bins ) write2bin ( last_solexa_name , lastseq , lastqual , -1 ) ; } else if ( !lastseq.empty() ) { uint matches = align_solexa_paired_read ( last_solexa_name , lastseq , lastqual , read_length_1 , fragment_length , range ) ; if ( matches > 0 ) matched_reads++ ; total_matches += matches ; if ( using_bins ) write2bin ( last_solexa_name , lastseq , lastqual , (int) matches ) ; } } if ( lastone ) break ; // Clear data c1 = dummy ; if ( *c1 == '>' || *c1 == '@' ) c1++ ; while ( *c1 == ' ' ) c1++ ; last_solexa_name = c1 ; readcount++ ; last_was_quality = false ; lastseq.clear() ; lastqual.clear() ; if ( use_nono ) { skip = is_nono ( last_solexa_name ) ; if ( skip ) skipped++ ; } } else if ( *dummy == '+' ) { last_was_quality = true ; } else if ( last_was_quality ) { lastqual += dummy ; } else { lastseq += dummy ; } linecounter++ ; linecounter %= 4 ; eof = feof ( file ) ; if ( eof ) lastone = true ; // Fake last read } if ( fragmentplot ) { for ( uint a = 0 ; a < fragment_stats.size() ; a++ ) { fprintf ( fragmentplot , "%d\t%d\n" , a , fragment_stats[a] ) ; } } if ( DEBUGGING ) { fprintf ( stdout , "scanned %d solexa reads, %d (%2.2f%%) matched , %d total matches, %d skipped.\n" , readcount , matched_reads , (float) matched_reads * 100 / readcount , total_matches , skipped ) ; fflush(stdout); } } void TChromosomeAlign::align_contigs ( chrvec &contigs , TChromosomalIndices &contig_index , uint maxerr , string output_filename ) { uint a , b , c ; vector vp ; vector vi ; for ( a = 0 ; a < contig_index.position.size() ; a++ ) { for ( b = 0 ; b < contig_index.position[a].size() ; b++ ) { uint chr = contig_index.position[a][b].chromosome ; while ( vp.size() <= chr ) vp.push_back ( pwi_vector () ) ; while ( vi.size() <= chr ) vi.push_back ( vuint () ) ; vp[chr].push_back ( contig_index.position[a][b] ) ; vi[chr].push_back ( a ) ; } } uint mismatch = 0 ; vector out ; char c2[50000] ; for ( a = 0 ; a < vp.size() ; a++ ) { string refseq = contigs[a].sequence ; for ( b = 0 ; b < vp[a].size() ; b++ ) { vector matches ; uint i1 = vi[a][b] ; pair pp = equal_range ( index->position[i1].begin() , index->position[i1].end() , vp[a][b] ) ; for ( TPWIi it = pp.first ; it < pp.second ; it++ ) { uint chr = it->chromosome ; string s ; uint from , len = contigs[a].sequence.length() ; if ( vp[a][b].reverse_complement ) { if ( it->reverse_complement ) { from = it->position - index->index_length_double ; from -= vp[a][b].position - index->index_length_double ; } else { from = it->position + index->index_length_double - contigs[a].sequence.length() ; from += vp[a][b].position - index->index_length_double ; } } else { if ( it->reverse_complement ) { from = it->position - contigs[a].sequence.length() ; from += vp[a][b].position ; } else { from = it->position ; from -= vp[a][b].position ; } } bool rc = false ; s = (*chrs)[chr].sequence.substr ( from , len ) ; if ( s.length() != refseq.length() ) continue ; // Sanity check if ( it->reverse_complement ) { REVERSE_COMPLEMENT_CHARP ( (char*)s.c_str() ) ; rc = !rc ; } if ( vp[a][b].reverse_complement ) { REVERSE_COMPLEMENT_CHARP ( (char*)s.c_str() ) ; rc = !rc ; } if ( s == refseq ) continue ; // No need to bother with a perfect match - or is there? Where did it come from? // Check for errors uint errcnt = 0 ; for ( c = 0 ; errcnt <= maxerr && c < s.length() ; c++ ) { if ( s[c] != refseq[c] ) { errcnt++ ; } else { // Lower case for all matching chars s[c] = s[c] - 'A' + 'a' ; } } if ( errcnt > maxerr ) continue ; sprintf ( c2 , "%s\t%d\t%d\t%s\t%c" , s.c_str() , chr , from , refseq.c_str() , rc ? 'X' : ' ' ) ; string s2 ( c2 ) ; for ( c = 0 ; c < matches.size() && matches[c] != s2 ; c++ ) ; if ( c < matches.size() ) continue ; // We have that one already matches.push_back ( s2 ) ; } int lastpos , lastchr = -1 ; for ( c = 0 ; c < matches.size() ; c++ ) { if ( c > 0 && matches[c-1] == matches[c] ) continue ; int d ; vector vs ; Tokenize ( matches[c] , vs , "\t" ) ; int _chr = atoi ( vs[1].c_str() ) ; int _pos = atoi ( vs[2].c_str() ) ; if ( lastchr == _chr && lastpos == _pos ) continue ; lastchr = _chr ; lastpos = _pos ; bool rc = vs[4] == "X" ; int thepos ; for ( d = 0 ; d < vs[0].length() ; d++ ) { if ( vs[0][d] < 'A' || vs[0][d] > 'Z' ) continue ; // Perfect match if ( rc ) thepos = _pos + ( vs[0].length() - d ) ; else thepos = _pos + 1 + d ; char the_c = rc ? base_rc[vs[0][d]] : vs[0][d] ; if ( (*chrs)[_chr].sequence[thepos-1] != the_c ) mismatch++ ; if ( the_c == vs[3][d] ) continue ; // Pa-ra-no-ia sprintf ( c2 , "%s\t%d\t%c\t%c\n" , (*chrs)[_chr].name.c_str() , thepos , the_c , vs[3][d] ) ; out.push_back ( c2 ) ; } } } } sort ( out.begin() , out.end() ) ; FILE * file = fopen ( output_filename.c_str() , "w" ) ; for ( a = 0 ; a < out.size() ; a++ ) { if ( a > 0 && out[a] == out[a-1] ) continue ; fwrite ( out[a].c_str() , 1 , out[a].length() , file ) ; } fclose ( file ) ; // if ( DEBUGGING ) cerr << mismatch << " mismatches\n" ; } /// Gets Solexa reads from a FASTQ file and matches them against the sequence. void TChromosomeAlign::align_solexa_fastq ( string filename ) { if ( DEBUGGING ) { fprintf ( stdout , "Reading solexa data from %s ... " , filename.c_str() ) ; fflush(stdout); } FILE * file = fopen ( filename.c_str() , "r" ) ; if ( !file ) { cerr << "Could not open FASTQ file \"" << filename << "\" - aborting." << endl ; exit ( 1 ) ; } char dummy[READ_CHAR_BUFFER] , *c1 , *c2 ; int solexacount = 0 , total_matches = 0 , matched_reads = 0 ; string seq , q ; char *c ; bool invalid ; fasta = false ; if ( !pileup ) max_align = 2 ; // If we don't do the pileup but just bin, 2 matches are all we need (=> multiple_bin) // Run through file while ( !feof(file) ) { invalid = false ; *dummy = 0 ; fgets ( dummy , READ_CHAR_BUFFER , file ) ; if ( !*dummy ) break ; // Read name if ( *dummy != '@' ) { cerr << "Expected '@'\n" << dummy ; exit ( 1 ) ; } for ( c = dummy ; *c > 13 ; c++ ) ; *c = 0 ; last_solexa_name = dummy + 1 ; // Sequence fgets ( dummy , READ_CHAR_BUFFER , file ) ; for ( c = dummy ; !isIUPAC[*c] ; c++ ) ; if ( *c == 10 || *c == 13 ) *c = 0 ; if ( *c ) { invalid = true ; while ( *c > 13 ) c++ ; *c = 0 ; } seq = dummy ; fgets ( dummy , READ_CHAR_BUFFER , file ) ; if ( *dummy != '+' ) { cerr << "Expected '+'\n" ; exit ( 1 ) ; } for ( c = dummy ; *c > 13 ; c++ ) ; *c = 0 ; string qname = dummy + 1 ; if ( qname.empty() ) qname = last_solexa_name ; if ( last_solexa_name != qname ) { cerr << "Name mismatch\n" << last_solexa_name << " : " << qname ; exit ( 1 ) ; } fgets ( dummy , READ_CHAR_BUFFER , file ) ; for ( c = dummy ; *c > 13 ; c++ ) ; *c = 0 ; q = dummy ; if ( !single_read_length ) single_read_length = seq.length() ; int m = invalid ? -1 : align_solexa_read ( (char*) seq.c_str() , q ) ; if ( m > 0 ) matched_reads++ ; if ( m > 0 ) total_matches += m ; if ( using_bins ) { // Write to bins if ( last_solexa_name.empty() ) {} // Paranoia else write2bin ( last_solexa_name , seq , q , m ) ; /* else if ( invalid ) fprintf ( binfile_iupac , "@%s\n%s\n+%s\n%s\n" , last_solexa_name.c_str() , seq.c_str() , last_solexa_name.c_str() , q.c_str() ) ; else if ( m == 0 ) fprintf ( binfile_no_match , "@%s\n%s\n+%s\n%s\n" , last_solexa_name.c_str() , seq.c_str() , last_solexa_name.c_str() , q.c_str() ) ; else if ( m == 1 ) fprintf ( binfile_single_match , "@%s\n%s\n+%s\n%s\n" , last_solexa_name.c_str() , seq.c_str() , last_solexa_name.c_str() , q.c_str() ) ; else fprintf ( binfile_multi_match , "@%s\n%s\n+%s\n%s\n" , last_solexa_name.c_str() , seq.c_str() , last_solexa_name.c_str() , q.c_str() ) ;*/ last_solexa_name.clear() ; } if ( !invalid ) solexacount++ ; } fclose ( file ) ; if ( DEBUGGING ) { fprintf ( stdout , "scanned %d solexa reads, %d (%2.2f%%) matched , %d total matches.\n" , solexacount , matched_reads , (float) matched_reads * 100 / solexacount , total_matches ) ; fflush(stdout); } } /// Gets Solexa reads from a FASTA file and matches them against the sequence. Should not be used anymore. void TChromosomeAlign::align_solexa_fasta ( string filename ) { if ( DEBUGGING ) { fprintf ( stdout , "Reading solexa data from %s ... " , filename.c_str() ) ; fflush(stdout); } FILE * file = fopen ( filename.c_str() , "r" ) ; if ( !file ) { cerr << "Could not open FASTA file \"" << filename << "\" - aborting." << endl ; exit ( 1 ) ; } char dummy[READ_CHAR_BUFFER] , *c ; int solexacount = 0 , total_matches = 0 , matched_reads = 0 ; bool invalid ; fasta = true ; // Artificial quality strings vector q ; for ( int a = 0 ; a < 100 ; a++ ) q.push_back ( string ( a , (char) 30+33 ) ) ; if ( !pileup ) max_align = 2 ; // If we don't do the pileup but just bin, 2 matches are all we need (=> multiple_bin) // Run through file while ( !feof(file) ) { *dummy = 0 ; fgets ( dummy , READ_CHAR_BUFFER , file ) ; if ( !*dummy ) break ; if ( *dummy == '>' ) { if ( using_bins || cigar ) { // Writing to binfiles, storing solexa name for ( c = dummy ; *c > 13 ; c++ ) ; *c = 0 ; last_solexa_name = dummy + 1 ; } continue ; } invalid = false ; for ( c = dummy ; !isIUPAC[*c] ; c++ ) ; if ( *c == 10 || *c == 13 ) *c = 0 ; if ( *c ) { invalid = true ; while ( *c > 13 ) c++ ; *c = 0 ; } if ( !invalid && !single_read_length ) single_read_length = strlen ( dummy ) ; int m = invalid ? -1 : align_solexa_read ( dummy , q[strlen(dummy)] ) ; if ( m > 0 ) matched_reads++ ; if ( m > 0 ) total_matches += m ; if ( using_bins ) { // Write to bins if ( last_solexa_name.empty() ) {} // Paranoia else write2bin ( last_solexa_name , string ( dummy ) , "" , m ) ; /* else if ( invalid ) fprintf ( binfile_iupac , ">%s\n%s\n" , last_solexa_name.c_str() , dummy ) ; else if ( m == 0 ) fprintf ( binfile_no_match , ">%s\n%s\n" , last_solexa_name.c_str() , dummy ) ; else if ( m == 1 ) fprintf ( binfile_single_match , ">%s\n%s\n" , last_solexa_name.c_str() , dummy ) ; else fprintf ( binfile_multi_match , ">%s\n%s\n" , last_solexa_name.c_str() , dummy ) ;*/ last_solexa_name.clear() ; } solexacount++ ; } fclose ( file ) ; if ( DEBUGGING ) { fprintf ( stdout , "scanned %d solexa reads, %d (%2.2f%%) matched , %d total matches.\n" , solexacount , matched_reads , (float) matched_reads * 100 / solexacount , total_matches ) ; fflush(stdout); } } int TChromosomeAlign::align_solexa_read ( char *_seq , const string &_quality , bool inv_compl ) { uint ret = 0 ; uint i1 = index->calculate_index ( _seq ) ; if ( !index->position[i1].size() ) return 0 ; string rc , rcq ; uint l = strlen ( _seq ) ; // Binary search TPositionWithIndex findme ; findme.index2 = index->calculate_index2 ( _seq + index->index_length ) ; pair pp = equal_range ( index->position[i1].begin() , index->position[i1].end() , findme ) ; if ( sqlite ) sqlite_out.clear () ; for ( TPWIi it = pp.first ; it < pp.second ; it++ ) { uint chr = it->chromosome ; char *chrseq = (char*) (*chrs)[chr].sequence.c_str() ; int p = it->position ; if ( it->reverse_complement ) { if ( p < l ) continue ; if ( rc.empty() ) { // Creating the reverse-complement rc = string ( _seq ) ; REVERSE_COMPLEMENT_CHARP ( (char*)rc.c_str() ) ; } if ( !matchIUPAC ( rc.substr(0,l-index->index_length_double).c_str() , chrseq+p-l ) ) continue ; if ( cigar || sqlite ) add_cigar_sqlite ( rc , last_solexa_name , p-l , chr , '-' ) ; if ( coverage ) (*chrs)[chr].add_coverage ( rc , p-l , true ) ; if ( pileup ) { if ( rcq.empty() ) { rcq = _quality ; reverse ( rcq.begin() , rcq.end() ) ; } (*chrs)[chr].ao.add_align ( rc , rcq , p-l , chr ) ; } if ( snpsinreads ) add_snpsinreads ( rc , rcq , chr , p-l , 1 ) ; if ( sqlite ) sqlite_out.push_back ( generate_sqlite_paired_command ( chr , rc , p-l , "" , 0 , '1' ) ) ; } else { uint chrl = (*chrs)[chr].sequence.length() ; if ( p + l > chrl ) continue ; if ( !matchIUPAC ( _seq+index->index_length_double , chrseq+p+index->index_length_double ) ) continue ; if ( pileup ) (*chrs)[chr].ao.add_align ( _seq , _quality , p , chr ) ; if ( cigar || sqlite ) add_cigar_sqlite ( _seq , last_solexa_name , p , chr , '+' ) ; if ( coverage ) (*chrs)[chr].add_coverage ( _seq , p , false ) ; if ( snpsinreads ) add_snpsinreads ( _seq , _quality , chr , p , 1 ) ; if ( sqlite ) sqlite_out.push_back ( generate_sqlite_paired_command ( chr , _seq , p , "" , 0 , '1' ) ) ; } ret++ ; if ( ret >= max_align && !sqlite ) return ret ; // Shortcut for binning-only } if ( wobble && ret == 0 ) wobble_single_read ( _seq , _quality ) ; if ( sqlite && sqlite_out.size() == 1 ) add2sqlite_cache ( sqlite_out[0] ) ; return ret ; } string TChromosomeAlign::matchIUPAC_tolerant ( const char *c1 , const char *c2 , int tolerance ) { string ret ; while ( *c1 && *c2 ) { if ( char2IUPAC[*c1++] & char2IUPAC[*c2++] ) { ret += *(c2-1) ; continue ; } if ( tolerance-- == 0 ) return "" ; ret += *(c2-1) ; } return ret ; } void TChromosomeAlign::wobble_single_read ( char *seq , const string &_quality ) { uint i1 = index->calculate_index ( seq ) ; uint l = strlen ( seq ) ; string rc , rcq , query , realseq ; int start , dir ; // Binary search TPositionWithIndex findme ; findme.index2 = index->calculate_index2 ( seq + index->index_length ) ; pair pp = equal_range ( index->position[i1].begin() , index->position[i1].end() , findme ) ; for ( TPWIi it = pp.first ; it < pp.second ; it++ ) { uint chr = it->chromosome ; char *chrseq = (char*) (*chrs)[chr].sequence.c_str() ; int p = it->position ; string found ; if ( it->reverse_complement ) { if ( p < l ) continue ; if ( rc.empty() ) { // Creating the reverse-complement rc = string ( seq ) ; REVERSE_COMPLEMENT_CHARP ( (char*)rc.c_str() ) ; } query = rc.substr(0,l-index->index_length_double) ; start = p-l ; found = matchIUPAC_tolerant ( query.c_str() , chrseq+start , wobblemax ) ; if ( found.empty() ) continue ; realseq = dorc(seq).substr(0,l-index->index_length_double) ; } else { uint chrl = (*chrs)[chr].sequence.length() ; if ( p + l > chrl ) continue ; query = seq+index->index_length_double ; start = p+index->index_length_double ; found = matchIUPAC_tolerant ( query.c_str() , chrseq+start , wobblemax ) ; if ( found.empty() ) continue ; realseq = query ; } dir = 1 ; for ( int a = 0 ; a < query.length() && a < found.length() ; a++ ) { char c = (*chrs)[chr].original_sequence.empty() ? (*chrs)[chr].sequence[start + a * dir] : (*chrs)[chr].original_sequence[start + a * dir] ; // cout << start + a * dir + 1 << "\t" << c << "\t" << query[a] << "\t" << found[a] << endl ; if ( char2IUPAC[c] & char2IUPAC[query[a]] ) continue ; // if ( c == found[a] ) continue ; fprintf ( wobble , "SNP\t%s\t%d\t%c\t%c\n" , (*chrs)[chr].name.c_str() , start + a * dir + 1 , c , query[a] ) ; } // cout << query << " O" << endl << found << " N" << endl << endl ; } } void TChromosomeAlign::add_cigar_sqlite ( const string &sequence , const string &name , uint position , uint chromosome , char orientation ) { char cig[READ_CHAR_BUFFER] ; string cigar_text ; int a ; bool iu = false ; for ( a = 0 ; a < sequence.length() && !iu ; a++ ) { if ( isIUPAC[(*chrs)[chromosome].sequence[a+position]] ) iu = true ; } if ( iu ) { string s ; for ( a = 0 ; a < sequence.length() ; a++ ) { if ( (*chrs)[chromosome].sequence[a+position] != sequence[a] ) s += "D" ; else s += "M" ; } // Compressing CIGAR char last = ' ' ; uint cnt = 0 ; for ( a = 0 ; a < s.length() ; a++ ) { if ( s[a] != last ) { if ( last != ' ' ) { if ( !cigar_text.empty() ) cigar_text += " " ; sprintf ( cig , "%c %d" , last , cnt ) ; cigar_text += cig ; if ( last == 'D' ) { sprintf ( cig , " I %d" , cnt ) ; cigar_text += cig ; } } last = s[a] ; cnt = 1 ; } else cnt++ ; } sprintf ( cig , " %c %d" , last , cnt ) ; cigar_text += cig ; if ( last == 'D' ) { sprintf ( cig , " I %d" , cnt ) ; cigar_text += cig ; } // cout << cigar_text << endl ; } else { sprintf ( cig , "M %d" , (int)sequence.length() ) ; cigar_text = cig ; } if ( cigar ) { fprintf ( cigar , "%s %d %d %c %s %d %d + %d %s\n" , name.c_str() , // Read name 1 , // Read begin sequence.length() , // Read end orientation , // Read orientation (*chrs)[chromosome].name.c_str(), // Chromosome name position + 1 , // Begin on chromosome position + sequence.length() , // End on chromosome 99 , // Fake Smith-Waterman score cigar_text.c_str() // CIGAR commands ) ; } } void TChromosomeAlign::show_pileup ( bool snps_only ) { if ( !pileup ) return ; for ( int chromosome = 0 ; chromosome < chrs->size() ; chromosome++ ) (*chrs)[chromosome].ao.show_pileup ( pileup , snps_only ) ; // cerr << "SNPS searched : " << search_snps << ", found : " << found_snps << " (" << (found_snps*100/search_snps) << "%)" << endl ; // if ( bogus ) cerr << "Bogus : " << bogus << endl ; } void TChromosomeAlign::align_solexa_fastq_variety ( string filename ) { if ( DEBUGGING ) { fprintf ( stdout , "Reading solexa data from %s ... " , filename.c_str() ) ; fflush(stdout); } FILE * file = fopen ( filename.c_str() , "r" ) ; if ( !file ) { cerr << "Could not open FASTQ file \"" << filename << "\" - aborting." << endl ; exit ( 1 ) ; } char dummy[READ_CHAR_BUFFER] , *c1 , *c2 ; int solexacount = 0 , total_matches = 0 , matched_reads = 0 ; string seq , q ; char *c ; bool invalid ; fasta = false ; if ( !pileup ) max_align = 2 ; // If we don't do the pileup but just bin, 2 matches are all we need (=> multiple_bin) // Run through file while ( !feof(file) ) { invalid = false ; *dummy = 0 ; fgets ( dummy , READ_CHAR_BUFFER , file ) ; if ( !*dummy ) break ; // Read name if ( *dummy != '@' ) { cerr << "Expected '@'\n" << dummy ; exit ( 1 ) ; } for ( c = dummy ; *c > 13 ; c++ ) ; *c = 0 ; last_solexa_name = dummy + 1 ; // Sequence fgets ( dummy , READ_CHAR_BUFFER , file ) ; for ( c = dummy ; !isIUPAC[*c] ; c++ ) ; if ( *c == 10 || *c == 13 ) *c = 0 ; if ( *c ) { invalid = true ; while ( *c > 13 ) c++ ; *c = 0 ; } seq = dummy ; fgets ( dummy , READ_CHAR_BUFFER , file ) ; if ( *dummy != '+' ) { cerr << "Expected '+'\n" ; exit ( 1 ) ; } for ( c = dummy ; *c > 13 ; c++ ) ; *c = 0 ; string qname = dummy + 1 ; if ( qname.empty() ) qname = last_solexa_name ; if ( last_solexa_name != qname ) { cerr << "Name mismatch\n" << last_solexa_name << " : " << qname ; exit ( 1 ) ; } fgets ( dummy , READ_CHAR_BUFFER , file ) ; for ( c = dummy ; *c > 13 ; *c++ ) ; *c = 0 ; q = dummy ; try_matching_read ( seq , q ) ; int m = invalid ? -1 : try_matching_read ( seq , q ) ; if ( m > 0 ) matched_reads++ ; if ( m > 0 ) total_matches += m ; if ( using_bins ) { // Write to bins if ( last_solexa_name.empty() ) {} // Paranoia else write2bin ( last_solexa_name , seq , q , m ) ; last_solexa_name.clear() ; } if ( !invalid ) solexacount++ ; } fclose ( file ) ; if ( DEBUGGING ) { fprintf ( stdout , "scanned %d solexa reads, %d (%2.2f%%) matched , %d total matches.\n" , solexacount , matched_reads , (float) matched_reads * 100 / solexacount , total_matches ) ; fflush(stdout); } } int TChromosomeAlign::try_matching_read ( const string & seq , string quality ) { vector indices ; uint idd = index->index_length_double ; // Beginning indices.push_back ( index->calculate_index ( seq.c_str() ) ) ; indices.push_back ( index->calculate_index2 ( seq.c_str() + index->index_length ) ) ; // Middle // indices.push_back ( index->calculate_index ( seq.c_str() + seq.length() / 2 + idd / 2 ) ) ; // indices.push_back ( index->calculate_index2 ( seq.c_str() + seq.length() / 2 + idd / 2 + index->index_length ) ) ; // End indices.push_back ( index->calculate_index ( seq.c_str() + seq.length() - 1 - idd ) ) ; indices.push_back ( index->calculate_index2 ( seq.c_str() + seq.length() - 1 - idd + index->index_length ) ) ; uint m , n ; pwi_vector out ; for ( n = 0 ; n < indices.size() ; n += 2 ) { uint i1 = indices[n] ; TPositionWithIndex findme ; findme.index2 = indices[n+1] ; pair pp = equal_range ( index->position[i1].begin() , index->position[i1].end() , findme ) ; for ( TPWIi it = pp.first ; it < pp.second ; it++ ) { out.push_back ( *it ) ; } } if ( out.size() < 2 ) return 0 ; sort ( out.begin() , out.end() , tpi_full_compare ) ; int cnt = 0 , start = -1 ; for ( n = 0 ; n < out.size() ; n++ ) { bool found = false ; for ( m = n + 1 ; m < out.size() ; m++ ) { if ( out[m].chromosome != out[n].chromosome ) break ; if ( out[m].reverse_complement != out[n].reverse_complement ) break ; int diff = out[m].position - out[n].position ; if ( diff < 0 ) diff = -diff ; if ( diff > seq.length() ) continue ; found = true ; start = out[m].position < out[n].position ? m : n ; break ; } if ( !found ) continue ; cnt++ ; // cout << out[n].chromosome << "\t" << out[n].position << "\t" << out[n].reverse_complement << endl ; } // cout << endl ; if ( cnt == 1 ) { string ref = get_ref_substr ( out[start] , seq.length() ) ; string s = out[start].reverse_complement ? dorc ( seq ) : seq ; // ???? for ( int a = 0 ; a < s.length() ; a++ ) if ( s[a] != ref[a] ) s[a] = s[a] - 'A' + 'a' ; cout << out[start].chromosome << "\t" << out[start].position << "\t" << ref << "\t" << s << "\t" << out.size() << "\t" << cnt << "\t" << out[start].reverse_complement << endl ; } return cnt ; } string TChromosomeAlign::get_ref_substr ( const TPositionWithIndex &p , int length ) { int start = p.position ; if ( p.reverse_complement ) start -= index->index_length_double + 1 ; if ( start < 0 ) return "" ; string ret = (*chrs)[p.chromosome].sequence.substr ( start , length ) ; // if ( p.reverse_complement ) return doc ( ret ) ; return ret ; } snpomatic-1.0/src/findknownsnps.cpp000066400000000000000000000446151260772140700175630ustar00rootroot00000000000000/*! \mainpage SNP-O-MATIC * * \section authors Authors * * - Magnus Manske (mm6@sanger.ac.uk) * * \section intro_sec Introduction * * SNP-O-MATIC can align Solexa reads against a reference sequence. * Known SNPs can be added to the reference sequence as IUPAC bases. * Only perfectly matching Solexa reads, with respect to the IUPAC bases, will be aligned. * Reads can be binned into those that do not match the reference, those that match uniquely, and those that match two or more times. * * \section install_sec Installation * * SNP-O-MATIC comes with a Makefile. The Intel compiler is used by default for performance reasons, but the g++ variant is in the Makefile as well in the form of comments. */ #include "snpomatic.h" #include using namespace std ; // __________________________________________________________________________________________________________________________________________________________________ MAIN /// Shows possible parameters, and an (optional) error message. void die_with_description ( const char *msg = NULL ) { if ( msg ) cerr << msg << endl ; cerr << "Parameters : \n" ; cerr << "--genome=GENOME_FILE FASTA file with chromosomes (mandatory)\n" ; cerr << "--fasta=FASTA_FILE FASTA file with Solexa reads (mandatory, except when --fastq or --index is used)\n" ; cerr << "--fastq=FASTQ_FILE FASTQ file with Solexa reads (mandatory, except when --fasta or --index is used)\n" ; cerr << "--fastq2=FASTQ_FILE2 FASTQ file with Solexa reads (optional; read mate)\n" ; cerr << "--nono=FILENAME File with list of read names (!) to ignore (optional)\n" ; cerr << "--regions=REGION_FILE Region file for finding new SNPs (optional) [DEPRECATED]\n" ; cerr << "--snps=SNP_FILE Simple SNP file (optional)\n" ; cerr << "--gff=GFF_FILE GFF file with SNPs (optional)\n" ; cerr << "--uniqueness=FILE Output a uniqueness data file for the reference; no Solexa reads needed; implies --noshortcuts (optional)\n" ; cerr << "--pileup=FILE Outputs complete pileup into that file (optional)\n" ; cerr << "--cigar=FILE Outputs alignment in CIGAR format (optional)\n" ; cerr << "--gffout=FILE Outputs reads in GFF format (optional)\n" ; cerr << "--coverage=FILENAME Outputs (high depth) coverage data (optional)\n" ; cerr << "--wobble=FILENAME Outputs a list of possible variations (optional; paired reads only) [UNDER CONSTRUCTION]\n" ; cerr << "--fragmentplot=FILENAME Outputs a plot of fragment size distribution to a file (optional)\n" ; cerr << "--snpsinreads=FILENAME Outputs a list of reads containing known SNPs to a file (optional)\n" ; cerr << "--indelplot=FILENAME Outputs indel data to a file (optional)\n" ; cerr << "--inversions=FILENAME For paired reads, writes read matches indicating inversions into a file (optional)\n" ; cerr << "--faceaway=FILENAME For paired reads, writes read matches that \"face away\" from each other into a file (optional)\n" ; cerr << "--sqlite=FILENAME Creates a sqlite text file with alignment data [EXPERIMENTAL] (optional)\n" ; cerr << "--sam=FILENAME Creates a SAM alignment file (optional)\n" ; cerr << "--spancontigs=FILENAME Outputs read pairs where \"half\" reads map uniquely to different contigs (optional)\n" ; cerr << "--bins=FILE_PREFIX Outputs no-match, single-match and multi-match Solexa reads into prefixed files (optional)\n" ; cerr << "--binmask=MASK Mask of 1s and 0s to turn off individual bins. Order: No match, single match, multi-match, IUPAC.\n" ; cout << " Example: 0100 creates only single-match bin. (optional; default:1111)\n" ; cerr << "--pair=NUMBER For paired reads, the length of the first part of the read (mandatory for paired reads)\n" ; cerr << "--fragment=NUMBER For paired reads, the average fragment length (mandatory for paired reads)\n" ; cerr << "--variance=NUMBER For paired reads, the variance of the fragment length to either side (optional; default: 1/4 of fragment size)\n" ; cerr << "--wobblemax=NUMBER Maximum number of mismatches for wobble (optional; default 2; see --wobble)\n" ; cerr << "--mspi=NUMBER Maximum number of SNPs per chromosomal index (optional; default:8)\n" ; cerr << "--index=FILENAME Index filename (index will be created if it does not exist; optional)\n" ; cerr << "--noshortcuts Will process all chrososomal regions, even those with lots'o'repeats (optional; no value)\n" ; cerr << "--snpsonly Only lists found SNPs in the pileup (optional; no value)\n" ; cerr << "--chromosome=NAME Discards all chromosomes but NAME prior to run (optional)\n" ; cerr << "--index_from=NUMBER Starts indexing at this position on all chromosomes (optional)\n" ; cerr << "--index_to=NUMBER Stops indexing at this position on all chromosomes (optional)\n" ; cerr << "--chop=NUMBER For paired reads, if one but not the other matches, shorten the other by NUMBER bases (optional)\n" ; cerr << "--index1=NUMBER Length of internal index 1 (optional; default:10)\n" ; cerr << "--index2=NUMBER Length of internal index 2 (optional; default:16)\n" ; cerr << "--memory_save=NUMBER Indexes the genome every NUMBER of positions; saves memory and runtime, but can have strange side effects (optional)\n" ; cerr << "--multimatch Puts a multiple-matching read to a random position (optional) [currently paired reads only]\n" ; cerr << "--singlematch Only performs additional output functions for single matches (optional) [currently paired reads only]\n" ; cerr << "--foum For paired reads, at least one read has to match uniquely in the genome (force one unique match) (optional)\n" ; cerr << "--mismatch The number of mismatches allowed outside the index (index1+index2) (optional)\n" ; cerr << "--rpa=FILENAME Writes all read pair alignments into a file (optional)\n" ; // cerr << "--featuretable=OUTPUT_FILE\tOutputs a DDBJ/EMBL/GenBank feature table for matched reads (optional; paired reads only for now)\n" ; exit ( 0 ) ; } /// Well, guess what. int main ( int argc , char **argv ) { string genome_file , gff_file , simple_snp_file , fastq_file , fastq2_file , fasta_file , pileup_file , bin_prefix , regions_file ; string cigar_file , wobble_file , nono , fragmentplot , featuretable , gffout , coverage_file , uniqueness , snpsinreads ; string index_file , indelplot_file , inversions_file , chromosome , sqlite , sam_file , spancontigs , face_away , rpa ; string binmask ( "1111" ) ; uint wobblemax = 2 ; uint mspi = 8 ; uint index_from = 0 ; uint index_to = 0 ; uint pair_length = 0 ; uint fragment_length = 0 ; uint index_length_1 = 10 ; uint index_length_2 = 16 ; int noshortcuts = false ; int snpsonly = false ; int chop = 0 ; uint variance = 0 ; int multimatch = false ; int singlematch = false ; int force_one_unique_match = false ; int memory_save = 1 ; // Parameters static struct option long_options[] = { { "genome" , required_argument , 0 , 'g' } , { "fasta" , optional_argument , 0 , 'a' } , { "fastq" , optional_argument , 0 , 'q' } , { "fastq2" , optional_argument , 0 , 'Q' } , { "snps" , optional_argument , 0 , 's' } , { "coverage" , optional_argument , 0 , 'e' } , { "sam" , optional_argument , 0 , 'S' } , { "gff" , optional_argument , 0 , 'f' } , { "rpa" , optional_argument , 0 , 'R' } , { "faceaway" , optional_argument , 0 , 'F' } , { "pileup" , optional_argument , 0 , 'p' } , { "bins" , optional_argument , 0 , 'b' } , { "binmask" , optional_argument , 0 , 'z' } , { "index" , optional_argument , 0 , 'x' } , { "nono" , optional_argument , 0 , 'n' } , { "uniqueness" , optional_argument , 0 , '0' } , { "wobble" , optional_argument , 0 , 'w' } , { "pair" , optional_argument , 0 , 'i' } , { "fragment" , optional_argument , 0 , 't' } , { "featuretable" , optional_argument , 0 , 'u' } , { "gffout" , optional_argument , 0 , 'y' } , { "variance" , optional_argument , 0 , 'v' } , { "mspi" , optional_argument , 0 , 'm' } , { "cigar" , optional_argument , 0 , 'c' } , { "sqlite" , optional_argument , 0 , 'P' } , { "fragmentplot" , optional_argument , 0 , 'o' } , { "snpsinreads" , optional_argument , 0 , 'd' } , { "regions" , optional_argument , 0 , 'r' } , { "index1" , optional_argument , 0 , '1' } , { "index2" , optional_argument , 0 , '2' } , { "chop" , optional_argument , 0 , '3' } , { "indelplot" , optional_argument , 0 , '4' } , { "wobblemax" , optional_argument , 0 , '5' } , { "inversions" , optional_argument , 0 , '6' } , { "chromosome" , optional_argument , 0 , '7' } , { "index_from" , optional_argument , 0 , '8' } , { "index_to" , optional_argument , 0 , '9' } , { "mismatch" , optional_argument , 0 , 'M' } , { "spancontigs" , optional_argument , 0 , 'X' } , { "memory_save" , optional_argument , 0 , 'Y' } , { "multimatch" , optional_argument , &multimatch , true } , { "singlematch" , optional_argument , &singlematch , true } , { "noshortcuts" , optional_argument , &noshortcuts , true } , { "foum" , optional_argument , &force_one_unique_match , true } , { "snpsonly" , optional_argument , &snpsonly , true } , { 0 , 0 , 0 , 0 } } ; int c ; int option_index = 0; while ( -1 != ( c = getopt_long (argc, argv, "g:a::q::s::f::p::b::r::w::n::",long_options, NULL) ) ) { switch ( c ) { case 0 : break ; case 'g' : genome_file = optarg ; break ; case 'a' : fasta_file = optarg ; break ; case 'q' : fastq_file = optarg ; break ; case 'S' : sam_file = optarg ; break ; case 'Q' : fastq2_file = optarg ; break ; case 's' : simple_snp_file = optarg ; break ; case 'f' : gff_file = optarg ; break ; case 'e' : coverage_file = optarg ; break ; case 'p' : pileup_file = optarg ; break ; case 'b' : bin_prefix = optarg ; break ; case 'z' : binmask = optarg ; break ; case 'F' : face_away = optarg ; break ; case '0' : uniqueness = optarg ; break ; case 'w' : wobble_file = optarg ; break ; case 'n' : nono = optarg ; break ; case 'x' : index_file = optarg ; break ; case 'i' : pair_length = atoi ( optarg ) ; break ; case 'o' : fragmentplot = optarg ; break ; case 'u' : featuretable = optarg ; break ; case 'y' : gffout = optarg ; break ; case 'X' : spancontigs = optarg ; break ; case 'd' : snpsinreads = optarg ; break ; case 'v' : variance = atoi ( optarg ) ; break ; case 'Y' : memory_save = atoi ( optarg ) ; break ; case 't' : fragment_length = atoi ( optarg ) ; break ; case 'm' : mspi = atoi ( optarg ) ; break ; case '1' : index_length_1 = atoi ( optarg ) ; break ; case '2' : index_length_2 = atoi ( optarg ) ; break ; case '3' : chop = atoi ( optarg ) ; break ; case '4' : indelplot_file = optarg ; break ; case '5' : wobblemax = atoi ( optarg ) ; break ; case '6' : inversions_file = optarg ; break ; case '7' : chromosome = optarg ; break ; case '8' : index_from = atoi ( optarg ) ; break ; case '9' : index_to = atoi ( optarg ) ; break ; case 'M' : globally_allowed_mismatches = atoi ( optarg ) ; break ; case 'c' : cigar_file = optarg ; break ; case 'r' : regions_file = optarg ; break ; case 'P' : sqlite = optarg ; break ; case 'R' : rpa = optarg ; break ; default : die_with_description("Unknown option") ; } } binmask += "0000" ; // Ignore bins if shorter string was supplied // Sanity checks if ( genome_file.empty() ) die_with_description("No genome file given!") ; // Need this if ( index_file.empty() ) { if ( fastq_file.empty() && fasta_file.empty() && uniqueness.empty() ) die_with_description("No Solexa file given!") ; // Need one if ( !fastq_file.empty() && !fasta_file.empty() ) die_with_description("Two Solexa files given!") ; // Can't have both } if ( !regions_file.empty() || !uniqueness.empty() ) noshortcuts = true ; // Incorporating known SNPs and indexing chromosomes init_iupac () ; TChromosomalIndices index ( index_length_1 , index_length_2 ) ; index.noshortcuts = noshortcuts ; index.max_snps_per_index = mspi ; index.index_file = index_file ; index.index_from = index_from ; index.index_to = index_to ; vector chrs ; load_all_chromosomes ( genome_file , chrs ) ; if ( !gff_file.empty() ) incorporate_all_gff ( gff_file , chrs ) ; if ( !simple_snp_file.empty() ) incorporate_simple_snps ( simple_snp_file , chrs ) ; if ( !chromosome.empty() ) { for ( int a = 0 ; a < chrs.size() ; a++ ) { if ( chrs[a].name == chromosome ) continue ; chrs.erase ( chrs.begin() + a ) ; a-- ; } if ( chrs.size() == 0 ) { cout << "No chromosome with that name (" << chromosome << ") in file -- aborting.\n" ; exit ( 1 ) ; } cout << "Keeping " << chrs.size() << " chromosomes" << endl ; } index.index_steps = memory_save ; if ( !regions_file.empty() ) index.run_regions ( &chrs , regions_file ) ; else index.run ( &chrs ) ; if ( !uniqueness.empty() ) { index.uniqueness ( uniqueness ) ; if ( fastq_file.empty() && fasta_file.empty() ) return 0 ; // My work here is done } if ( !fastq_file.empty() && !fastq2_file.empty() && pair_length == 0 ) { // Auto-detect read length ifstream in ( fastq_file.c_str() ) ; char tmp[10000] ; in.getline ( tmp , 10000 ) ; in.getline ( tmp , 10000 ) ; in.close() ; pair_length = strlen ( tmp ) ; cout << "Setting read length to " << pair_length << endl ; } // Now look at the Solexa reads and do the thing TChromosomeAlign ca ( chrs , index ) ; ca.wobblemax = wobblemax ; ca.multimatch = multimatch ; ca.singlematch = singlematch ; ca.force_one_unique_match = force_one_unique_match ; if ( !pileup_file.empty() ) ca.pileup = fopen ( pileup_file.c_str() , "w" ) ; if ( !cigar_file.empty() ) ca.cigar = fopen ( cigar_file.c_str() , "w" ) ; if ( !fragmentplot.empty() ) ca.fragmentplot = fopen ( fragmentplot.c_str() , "w" ) ; if ( !indelplot_file.empty() ) ca.indelplot = fopen ( indelplot_file.c_str() , "w" ) ; if ( !gffout.empty() ) ca.gffout = fopen ( gffout.c_str() , "w" ) ; if ( !snpsinreads.empty() ) ca.snpsinreads = fopen ( snpsinreads.c_str() , "w" ) ; if ( !face_away.empty() ) ca.faceaway = fopen ( face_away.c_str() , "w" ) ; if ( !coverage_file.empty() ) ca.coverage = fopen ( coverage_file.c_str() , "w" ) ; if ( !inversions_file.empty() ) ca.inversions = fopen ( inversions_file.c_str() , "w" ) ; if ( !sqlite.empty() ) ca.sqlite = fopen ( sqlite.c_str() , "w" ) ; if ( !sam_file.empty() ) ca.sam = fopen ( sam_file.c_str() , "w" ) ; if ( !spancontigs.empty() ) ca.spancontigs = fopen ( spancontigs.c_str() , "w" ) ; if ( !rpa.empty() ) ca.rpa = fopen ( rpa.c_str() , "w" ) ; if ( !featuretable.empty() ) { ca.featuretable = fopen ( featuretable.c_str() , "w" ) ; fprintf ( ca.featuretable , "Key\tLocation/Qualifiers\n" ) ; } if ( !bin_prefix.empty() ) { string ext = ".fasta" ; if ( !fastq_file.empty() ) ext = ".fastq" ; cout << "Using binmask " << binmask << endl ; if ( binmask[0] == '1' ) ca.binfile_no_match = fopen ( string ( bin_prefix + "_no_match" + ext ).c_str() , "w" ) ; if ( binmask[1] == '1' ) ca.binfile_single_match = fopen ( string ( bin_prefix + "_single_match" + ext ).c_str() , "w" ) ; if ( binmask[2] == '1' ) ca.binfile_multi_match = fopen ( string ( bin_prefix + "_multi_match" + ext ).c_str() , "w" ) ; if ( binmask[3] == '1' ) ca.binfile_iupac = fopen ( string ( bin_prefix + "_iupac" + ext ).c_str() , "w" ) ; ca.using_bins = true ; } ca.chop = chop ; ca.init () ; if ( !nono.empty() ) ca.add_nono_list ( nono ) ; if ( pair_length == 0 ) { if ( !wobble_file.empty() ) ca.wobble = fopen ( string ( wobble_file ).c_str() , "w" ) ; if ( !fasta_file.empty() ) ca.align_solexa_fasta ( fasta_file ) ; if ( !fastq_file.empty() ) ca.align_solexa_fastq ( fastq_file ) ; } else { if ( fragment_length == 0 && variance == 0 ) { // If no fragment size is given, do entire chromosome for ( uint a = 0 ; a < chrs.size() ; a++ ) { if ( fragment_length < chrs[a].sequence.length() ) fragment_length = chrs[a].sequence.length() ; } fragment_length = fragment_length / 2 ; variance = fragment_length ; } if ( !wobble_file.empty() ) ca.wobble = fopen ( string ( wobble_file ).c_str() , "w" ) ; if ( !fasta_file.empty() ) ca.align_solexa_paired ( fasta_file , fastq2_file , pair_length , fragment_length , variance ) ; if ( !fastq_file.empty() ) ca.align_solexa_paired ( fastq_file , fastq2_file , pair_length , fragment_length , variance ) ; } ca.show_pileup ( !regions_file.empty() || snpsonly ) ; ca.dump_coverage () ; ca.finish_sqlite ( fragment_length , variance , pair_length != 0 ? pair_length : ca.single_read_length ) ; return 0 ; } // Example runs: // rm test ; make clean ; make ; /usr/bin/time ./findknownsnps --genome=/nfs/malaria/data2/3D7/Pf.3D7.v2.1.chromosomes.dna.fa --gff=/nfs/malaria/data2/3D7/known_snps/PfalciparumCombinedSNPs.gff --fasta=/nfs/malaria/data2/3D7/all.solexa.fasta --noshortcuts --bins=test // make clean ; make ; time ./findknownsnps --genome=/nfs/malaria/data2/human_sequence/Homo_sapiens.NCBI36.48.dna.chromosome.22.fa --fasta=/nfs/malaria/data2/3D7/all.solexa.fasta --noshortcuts --bins=human // make clean ; make ; /usr/bin/time ./findknownsnps --genome=/nfs/malaria/data2/3D7/Pf.3D7.v2.1.chromosomes.dna.fa --fasta=/nfs/malaria/data2/3D7/all.solexa.fasta --bins=q2 // make clean ; make ; /usr/bin/time ./findknownsnps --genome=/nfs/malaria/data2/3D7/Pf.3D7.v2.1.chromosomes.dna.fa --fasta=/nfs/malaria/data2/3D7/all.solexa.fasta --gff=/nfs/malaria/data2/3D7/known_snps/PfalciparumCombinedSNPs.gff --cigar=test.cigar // make clean ; make ; /usr/bin/time ./findknownsnps --genome=/nfs/malaria/data2/3D7/Pf.3D7.v2.1.chromosomes.dna.fa --fasta=q1_no_match.fasta --snps=new_snps_60_uniq --bins=q3 // make clean ; make ; /usr/bin/time ./findknownsnps --genome=/nfs/faculty/mm6/3D7_pm.fa --fastq=$MAL_SOLEXA_HOME/run368/368_s_3.fastq --pair=35 --fragment=300 --bins=q3 // make clean ; make ; /usr/bin/time ./findknownsnps --genome=../3D7_pm.fa --snps=MERGED_CAPILLARY_MAQ.tab.subset --fastq=$MAL_SOLEXA_DATA/986_s_8/986_s_8.fastq --pair=37 --fragment=200 --variance=200 --sqlite=test.sqlite.text /* // Test params, obsolete genome_file = "/nfs/malaria/data2/3D7/Pf.3D7.v2.1.chromosomes.dna.fa" ; gff_file = "/nfs/malaria/data2/3D7/known_snps/PfalciparumCombinedSNPs.gff" ; fasta_file = "/nfs/malaria/data2/3D7/all.solexa.fasta" ; //simple_snp_file = "/nfs/malaria/data2/3D7/known_snps/MAL1.snps" ; //fastq_file = "/nfs/malaria/data2/3D7/fastq/PFCLIN_slx.fastq" ; */ snpomatic-1.0/src/global_functions.cpp000066400000000000000000000231101260772140700201750ustar00rootroot00000000000000#include "snpomatic.h" // __________________________________________________________________________________________________________________________________________________________________ GLOBAL VARIABLES uchar char2num[256] , num2char[256] , to_lc[256] , to_uc[256] , char2IUPAC[256] , IUPAC2char[256] , isIUPAC[256] , base_rc[256] , IUPAC_variance[256] ; int globally_allowed_mismatches = 0 ; // __________________________________________________________________________________________________________________________________________________________________ GLOBAL FUNCTIONS /// Breaks up a string in a vector of strings at given sepatator void Tokenize(const string& str, vector& tokens, const string& delimiters) { // Skip delimiters at beginning. string::size_type lastPos = str.find_first_not_of(delimiters, 0); // Find first "non-delimiter". string::size_type pos = str.find_first_of(delimiters, lastPos); while (string::npos != pos || string::npos != lastPos) { // Found a token, add it to the vector. tokens.push_back(str.substr(lastPos, pos - lastPos)); // Skip delimiters. Note the "not_of" lastPos = str.find_first_not_of(delimiters, pos); // Find next "non-delimiter" pos = str.find_first_of(delimiters, lastPos); } } /// Initializes IUPAC lookup tables void init_iupac () { int a ; for ( a = 0 ; a < 256 ; a++ ) char2IUPAC[a] = 0 ; char2IUPAC['A'] = IUPAC_A ; char2IUPAC['C'] = IUPAC_C ; char2IUPAC['G'] = IUPAC_G ; char2IUPAC['T'] = IUPAC_T ; // char2IUPAC['U'] = IUPAC_T ; // U <=> T char2IUPAC['R'] = IUPAC_A|IUPAC_G ; // R Purine (A or G) char2IUPAC['Y'] = IUPAC_C|IUPAC_T ; // Y Pyrimidine (C, T) char2IUPAC['M'] = IUPAC_C|IUPAC_A ; // M C or A char2IUPAC['K'] = IUPAC_T|IUPAC_G ; // K T, or G char2IUPAC['W'] = IUPAC_T|IUPAC_A ; // W T, or A char2IUPAC['S'] = IUPAC_C|IUPAC_G ; // S C or G char2IUPAC['B'] = IUPAC_C|IUPAC_T|IUPAC_G ; // B C, T, or G (not A) char2IUPAC['D'] = IUPAC_T|IUPAC_A|IUPAC_G ; // D A, T, or G (not C) char2IUPAC['H'] = IUPAC_T|IUPAC_A|IUPAC_C ; // H A, T, or C (not G) char2IUPAC['V'] = IUPAC_A|IUPAC_C|IUPAC_G ; // V A, C, or G (not T) char2IUPAC['N'] = IUPAC_A|IUPAC_C|IUPAC_G|IUPAC_T ; // N Any base (A, C, G, T) for ( a = 0 ; a < 256 ; a++ ) IUPAC2char[char2IUPAC[a]] = a ; char2IUPAC['X'] = IUPAC_A|IUPAC_C|IUPAC_G|IUPAC_T ; // X Any base (A, C, G, T) for ( a = 0 ; a < 256 ; a++ ) isIUPAC[a] = 1 ; isIUPAC['A'] = isIUPAC['C'] = isIUPAC['G'] = isIUPAC['T'] = 0 ; isIUPAC[IUPAC_A] = isIUPAC[IUPAC_C] = isIUPAC[IUPAC_G] = isIUPAC[IUPAC_T] = 0 ; // 1,2,4,8 - isIUPAC is safe for double use... for ( a = 0 ; a < 256 ; a++ ) base_rc[a] = ' ' ; base_rc['A'] = 'T' ; base_rc['T'] = 'A' ; base_rc['C'] = 'G' ; base_rc['G'] = 'C' ; base_rc['N'] = 'N' ; base_rc['X'] = 'X' ; for ( a = 0 ; a < 256 ; a++ ) IUPAC_variance[a] = 0 ; for ( a = 0 ; a < 256 ; a++ ) { if ( ( char2IUPAC[a] & IUPAC_A ) > 0 ) IUPAC_variance[a]++ ; if ( ( char2IUPAC[a] & IUPAC_C ) > 0 ) IUPAC_variance[a]++ ; if ( ( char2IUPAC[a] & IUPAC_G ) > 0 ) IUPAC_variance[a]++ ; if ( ( char2IUPAC[a] & IUPAC_T ) > 0 ) IUPAC_variance[a]++ ; } for ( a = 0 ; a < 256 ; a++ ) to_lc[a] = a ; for ( a = 'A' ; a <= 'Z' ; a++ ) to_lc[a] = a - 'A' + 'a' ; for ( a = 0 ; a < 256 ; a++ ) to_uc[a] = a ; for ( a = 'a' ; a <= 'z' ; a++ ) to_uc[a] = a - 'a' + 'A' ; for ( a = 0 ; a < 256 ; a++ ) char2num[a] = 0 ; char2num[(uchar)'A'] = 1 ; char2num[(uchar)'C'] = 2 ; char2num[(uchar)'G'] = 3 ; char2num[(uchar)'T'] = 4 ; char2num[(uchar)'N'] = 5 ; char2num[(uchar)'X'] = 5 ; for ( a = 0 ; a < 256 ; a++ ) num2char[char2num[a]] = a ; } /// IUPAC-tolerant basematch for two char strings bool matchIUPAC ( const char *c1 , const char *c2 ) { int left = globally_allowed_mismatches ; while ( *c1 && *c2 ) { if ( char2IUPAC[*c1++] & char2IUPAC[*c2++] ) continue ; if ( --left >= 0 ) continue ; return false ; } return true ; } /// Loads an entire FASTA file into a vector of TChromosome. void load_all_chromosomes ( string filename , vector &chrs , bool keep_original_snps ) { if ( DEBUGGING ) { fprintf ( stdout , "Reading all chromosomes from %s ... " , filename.c_str() ) ; fflush(stdout); } string *seq = NULL ; FILE * file = fopen ( filename.c_str() , "r" ) ; if ( NULL == file ) { cerr << "file could not be accessed - aborting.\n" ; exit ( 1 ) ; } char dummy[READ_CHAR_BUFFER] , *c1 , *c2 ; while ( !feof(file) ) { *dummy = 0 ; fgets ( dummy , READ_CHAR_BUFFER , file ) ; if ( *dummy == '>' ) { for ( c1 = dummy+1 ; *c1 == ' ' ; c1++ ) ; for ( c2 = c1 ; *c2 && *c2 != 10 && *c2 != 13 && *c2 != ' ' && *c2 != '|' ; c2++ ) ; *c2 = 0 ; chrs.push_back ( TChromosome() ) ; chrs[chrs.size()-1].name = c1 ; seq = &( chrs[chrs.size()-1].sequence ) ; } else { for ( c1 = dummy ; *c1 ; c1++ ) { switch ( *c1 ) { case 'a' : *c1 = 'A' ; break ; case 'c' : *c1 = 'C' ; break ; case 'g' : *c1 = 'G' ; break ; case 't' : *c1 = 'T' ; break ; case 'A' : break ; case 'C' : break ; case 'G' : break ; case 'T' : break ; case 10 : *c1 = 0 ; break ; // Line end case 13 : *c1 = 0 ; break ; // Line end default : if ( !keep_original_snps ) *c1 = 'X' ; // Setting everything else to 'X' } // if ( !*c1 ) break ; } *seq += dummy ; } } fclose ( file ) ; if ( DEBUGGING ) { fprintf ( stdout , "read %d chromosomes.\n" , chrs.size() ) ; fflush(stdout); } } /// Includes SNPs from a .gff file into chromosomes. void incorporate_all_gff ( string filename , vector &chrs ) { if ( DEBUGGING ) { fprintf ( stdout , "Reading SNPS from %s ... ", filename.c_str() ) ; fflush(stdout); } // Backup for ( uint a = 0 ; a < chrs.size() ; a++ ) { if ( chrs[a].original_sequence.empty() ) chrs[a].original_sequence = chrs[a].sequence ; } FILE * file = fopen ( filename.c_str() , "r" ) ; int snpcnt = 0 ; char dummy[READ_CHAR_BUFFER] , *c1 , *c2 ; while ( !feof(file) ) { vector parts , parts2 ; *dummy = 0 ; fgets ( dummy , READ_CHAR_BUFFER , file ) ; Tokenize ( dummy , parts , "\t" ) ; if ( parts.size() < 7 ) continue ; // These is not the data you are looking for int i ; for ( i = 0 ; i < chrs.size() && chrs[i].name != parts[0] ; i++ ) ; if ( i >= chrs.size() ) continue ; // Chromosome unknown, ignore this one if ( parts[2] != "SNP" ) continue ; // Not a SNP if ( parts[3] != parts[4] ) continue ; // Not a single point if ( parts[5] != "+" ) continue ; // Wrong strand (?) int pos = atoi ( parts[3].c_str() ) ; char merge = chrs[i].sequence[pos-1] ; Tokenize ( parts[6].c_str() , parts2 , "\"\" \"\"" ) ; bool found_reference = false ; for ( int b = 0 ; b < parts2.size() ; b++ ) { int l = parts2[b].length() ; if ( l < 2 ) continue ; if ( parts2[b][l-2] != ':' ) continue ; char c = parts2[b][l-1] ; if ( c == chrs[i].sequence[pos-1] ) found_reference = true ; merge = MERGE_IUPAC(merge,c) ; } if ( !found_reference ) { // Strange; maybe original sequence contains "N" // We should count this! continue ; } chrs[i].sequence[pos-1] = merge ; snpcnt++ ; } fclose ( file ) ; if ( DEBUGGING ) { fprintf ( stdout , "incorporated %d SNPs.\n" , snpcnt ) ; fflush(stdout); } } /// Include SNPs from a simple SNP list into chromosome. void incorporate_simple_snps ( string filename , vector &chrs ) { if ( DEBUGGING ) { fprintf ( stdout , "Reading SNPS from %s ... ", filename.c_str() ) ; fflush(stdout); } // Backup for ( uint a = 0 ; a < chrs.size() ; a++ ) { if ( chrs[a].original_sequence.empty() ) chrs[a].original_sequence = chrs[a].sequence ; } FILE * file = fopen ( filename.c_str() , "r" ) ; int snpcnt = 0 , already_snp = 0 ; char dummy[READ_CHAR_BUFFER] , *c1 , *c2 ; uint chr_unknown = 0 ; uint reference_mismatch = 0 ; while ( !feof(file) ) { bool found_reference = false ; vector parts ; *dummy = 0 ; fgets ( dummy , READ_CHAR_BUFFER , file ) ; if ( !*dummy ) break ; // cout << dummy << endl ; Tokenize ( dummy , parts , "\t" ) ; if ( parts.size() < 4 ) { parts.clear() ; Tokenize ( dummy , parts , " " ) ; if ( parts.size() < 4 ) continue ; // These is not the data you are looking for } int i ; for ( i = 0 ; i < chrs.size() && chrs[i].name != parts[0] ; i++ ) ; if ( i >= chrs.size() ) { chr_unknown++ ; continue ; } // Chromosome unknown, ignore this one int pos = atoi ( parts[1].c_str() ) ; if ( pos < 1 || pos >= chrs[i].sequence.length() ) continue ; // Out-of-range char c = parts[2][0] ; char d = chrs[i].original_sequence[pos-1] ; if ( d == c ) found_reference = true ; else if ( d != 'A' && d != 'C' && d != 'G' && d != 'T' ) { found_reference = true ; already_snp++ ; } else { reference_mismatch++ ; //cerr << "Called as " << c << ", is really " << d << endl ; } c = parts[3][0] ; char merge = MERGE_IUPAC ( chrs[i].sequence[pos-1] , c ) ; chrs[i].sequence[pos-1] = merge ; snpcnt++ ; } fclose ( file ) ; if ( DEBUGGING ) { fprintf ( stdout , "incorporated %d SNPs (%d unknown chromosomes, %d reference mismatches, %d already marked as SNPs in genome).\n" , snpcnt , chr_unknown , reference_mismatch , already_snp ) ; fflush(stdout); } } // _________________________________________________________________________________________________________________________________________________________________ OPERATORS /// Compares two indices; important for finding second-level indices only, so chromosome is not important. bool operator < ( const TPositionWithIndex &a , const TPositionWithIndex &b ) { return a.index2 < b.index2 ; } snpomatic-1.0/src/mapcontigs.cpp000066400000000000000000000056761260772140700170320ustar00rootroot00000000000000#include "snpomatic.h" /// Shows possible parameters, and an (optional) error message. void die_with_description ( const char *msg = NULL ) { if ( msg ) cerr << msg << endl ; cerr << "Parameters : \n" ; cerr << "--genome=GENOME_FILE\tFASTA file with chromosomes (mandatory)\n" ; cerr << "--fasta=FASTA_FILE\tFASTA file with Solexa reads (mandatory, except when --fastq is used)\n" ; cerr << "--output=SNP_FILE\tWhere the output goes (mandatory)\n" ; cerr << "--mincontigsize=NUMBER\tMinimal size of contigs (optional)\n" ; exit ( 0 ) ; } void analyze_contigs ( vector &contigs ) { vector sizes ; int a ; for ( a = 0 ; a < contigs.size() ; a++ ) { int l = contigs[a].sequence.length() ; if ( l > 99 ) l = 100 ; while ( sizes.size() <= l ) sizes.push_back ( 0 ) ; sizes[l]++ ; } for ( a = 0 ; a < sizes.size() ; a++ ) { if ( sizes[a] == 0 ) continue ; cout << a << "\t" << sizes[a] << endl ; } exit ( 0 ) ; } int main ( int argc , char **argv ) { string genome_file , fasta_file , output_file ; uint min_contig_size = 0 ; uint index_length_1 = 8 ; uint index_length_2 = 16 ; // Parameters static struct option long_options[] = { { "fasta" , optional_argument , 0 , 'a' } , { "genome" , optional_argument , 0 , 'g' } , { "output" , optional_argument , 0 , 'o' } , { "mincontigsize" , optional_argument , 0 , 'm' } , { 0 , 0 , 0 , 0 } } ; int c ; int option_index = 0; while ( -1 != ( c = getopt_long (argc, argv, "g:a::o::s::f::p::b::m::",long_options, NULL) ) ) { switch ( c ) { case 0 : break ; case 'a' : fasta_file = optarg ; break ; case 'g' : genome_file = optarg ; break ; case 'o' : output_file = optarg ; break ; case 'm' : min_contig_size = atoi ( optarg ) ; break ; default : die_with_description("Unknown option") ; } } if ( genome_file.empty() ) die_with_description("No genome file given!") ; // Need one if ( fasta_file.empty() ) die_with_description("No Solexa file given!") ; // Need one if ( output_file.empty() ) die_with_description("No output file given!") ; // Need one init_iupac () ; vector contigs ; load_all_chromosomes ( fasta_file , contigs ) ; TChromosomalIndices contig_index ( index_length_1 , index_length_2 ) ; contig_index.noshortcuts = true ; contig_index.run_ends ( &contigs , min_contig_size ) ; // analyze_contigs ( contigs ) ; vector chrs ; load_all_chromosomes ( genome_file , chrs ) ; TChromosomalIndices genome_index ( index_length_1 , index_length_2 ) ; genome_index.noshortcuts = true ; genome_index.run ( &chrs ) ; TChromosomeAlign ca ( chrs , genome_index ) ; ca.align_contigs ( contigs , contig_index , 3 , output_file ) ; } /* ./velveth temp1 21 -shortPaired ~/som++/q1_no_match.fasta ./velvetg temp1 rm mapcontigs ; make mapcontigs ; time ./mapcontigs --genome=/nfs/malaria/data2/3D7/Pf.3D7.v2.1.chromosomes.dna.fa --fasta=../velvet/velvet_0.5.05/temp1/contigs.fa --out=new_snps_60 --mincontigsize=60 */snpomatic-1.0/src/reassemble.cpp000066400000000000000000000712001260772140700167720ustar00rootroot00000000000000/*! \mainpage SNP-O-MATIC * * \section authors Authors * * - Magnus Manske (mm6@sanger.ac.uk) * * \section intro_sec Introduction * * SNP-O-MATIC can align Solexa reads against a reference sequence. * Known SNPs can be added to the reference sequence as IUPAC bases. * Only perfectly matching Solexa reads, with respect to the IUPAC bases, will be aligned. * Reads can be binned into those that do not match the reference, those that match uniquely, and those that match two or more times. * * \section install_sec Installation * * SNP-O-MATIC comes with a Makefile. The Intel compiler is used by default for performance reasons, but the g++ variant is in the Makefile as well in the form of comments. */ #include "snpomatic.h" #include #include #include #define MAX_NEW 1 TChromosomalIndices *idx ; class TRead ; class TPart { public : void do_index ( uint id , bool rc = false ) ; string seq ; } ; class TRead { public : TRead () {} ; TRead ( const string &seq ) ; TRead ( const string &seq1 , const string &seq2 ) ; inline void reset ( uint id ) ; bool used ; vector related ; TPart p1 , p2 ; } ; TRead::TRead ( const string &seq ) { int l = seq.length() / 2 ; p1.seq = seq.substr ( 0 , l ) ; p2.seq = seq.substr ( l , l ) ; } TRead::TRead ( const string &seq1 , const string &seq2 ) { p1.seq = seq1 ; p2.seq = seq2 ; } void TRead::reset ( uint id ) { p1.do_index ( id , false ) ; p2.do_index ( id , true ) ; used = false ; related.clear() ; } void TPart::do_index ( uint id , bool rc ) { for ( int a = 0 ; a + idx->index_length_double < seq.length() ; a++ ) { uint i1 = idx->calculate_index ( (char*) seq.c_str() + a ) ; TPositionWithIndex p ; p.index2 = idx->calculate_index2 ( (char*) seq.c_str() + a + idx->index_length ) ; p.position = a ; p.reverse_complement = rc ; p.chromosome = id ; idx->position[i1].push_back ( p ) ; break ; } } string pad ( int a ) { string ret ; while ( ret.length() < a ) ret += " " ; return ret ; } /* int remove_full_sequences ( vector &vr , vector &vs ) { int min = 30 ; int cnt = 0 ; for ( int a = 0 ; a < vr.size() ; a++ ) { if ( vr[a].p1.seq.length() < min ) continue ; string m = vr[a].p1.seq.substr ( vr[a].p1.seq.length() - min , min ) ; int n = vr[a].p2.seq.find ( m , 0 ) ; if ( n == std::string::npos ) continue ; m = vr[a].p1.seq + vr[a].p2.seq.substr ( n ) ; vs.push_back ( m ) ; cnt++ ; vr[a].p1.seq = "" ; vr[a].p2.seq = "" ; } return cnt ; } void create_contigs ( vector &vr , vector &vn ) { for ( uint r1 = 0 ; r1 < vr.size() ; r1++ ) { if ( vr[r1].used ) continue ; for ( uint r2x = 0 ; r2x < vr[r1].related.size() ; r2x++ ) { uint r2 = vr[r1].related[r2x] ; if ( vr[r1].p1.seq == vr[r2].p1.seq ) continue ; if ( vr[r1].p2.seq == vr[r2].p2.seq ) continue ; string t1 = combine_strings ( vr[r1].p1.seq , vr[r2].p1.seq ) ; if ( t1.empty() ) continue ; string t2 = combine_strings ( vr[r1].p2.seq , vr[r2].p2.seq ) ; if ( t2.empty() ) continue ; vr[r1].used = true ; vr[r2].used = true ; vn.push_back ( TRead ( t1 , t2 ) ) ; // cout << t1 << "\t" << t2 << endl ; break ; } } } void join_single_sequences ( vector &vs ) { vector vs2 ; vs.swap ( vs2 ) ; do { for ( int a = 0 ; a < vs.size() ; a++ ) { if ( vs[a].empty() ) continue ; vs2.push_back ( vs[a] ) ; } vs.swap ( vs2 ) ; vs2.clear() ; for ( int a = 0 ; a < vs.size() ; a++ ) { if ( vs[a].length() < 100 ) continue ; for ( int b = a+1 ; b < vs.size() ; b++ ) { if ( vs[b].empty() ) continue ; string m = combine_strings ( vs[a] , vs[b] , 100 ) ; if ( m.empty() ) continue ; vs[a].clear() ; vs[b].clear() ; vs2.push_back ( m ) ; } } } while ( vs2.size() > 0 ) ; } void interlink_reads ( vector &vr ) { uint p1s = idx->position.size() ; for ( uint i1 = 0 ; i1 < p1s ; i1++ ) { uint last = 0 ; uint p2s = idx->position[i1].size() ; for ( uint a = 1 ; a < p2s ; a++ ) { if ( idx->position[i1][last].index2 == idx->position[i1][a].index2 ) { uint n2 = idx->position[i1][a].chromosome ; for ( uint b = last ; b < a ; b++ ) { uint n1 = idx->position[i1][b].chromosome ; vr[n2].related.push_back ( n1 ) ; vr[n1].related.push_back ( n2 ) ; } } else { last = a ; } } } uint cnt = 0 ; for ( uint a = 0 ; a < vr.size() ; a++ ) { sort ( vr[a].related.begin() , vr[a].related.end() ) ; vector vi ; vi.reserve ( vr[a].related.size() ) ; for ( uint b = 0 ; b < vr[a].related.size() ; b++ ) { if ( b > 0 && vr[a].related[b-1] == vr[a].related[b] ) continue ; vi.push_back ( vr[a].related[b] ) ; } cnt += vi.size() ; vr[a].related.swap ( vi ) ; } cerr << cnt << " crosslinks found\n" ; } void align_perfect ( vector &reads , int offset ) { int a , b ; if ( DEBUGGING ) { fprintf ( stdout , "Indexing ... " ) ; fflush(stdout); } for ( a = 0 ; a < reads.size() ; a++ ) reads[a].reset ( a ) ; if ( DEBUGGING ) { fprintf ( stdout , "done.\n" ) ; fflush(stdout); } if ( DEBUGGING ) { fprintf ( stdout , "Scanning with offset %d ... " , offset ) ; fflush(stdout); } for ( uint i1 = 0 ; i1 < idx->position.size() ; i1++ ) { uint i1s = idx->position[i1].size() ; for ( uint a = 0 ; a < i1s ; a++ ) { if ( idx->position[i1][a].position > offset ) continue ; uint i2 = idx->position[i1][a].index2 ; for ( uint b = a+1 ; b < i1s && idx->position[i1][b].index2 == i2 ; b++ ) { if ( idx->position[i1][b].position > offset ) continue ; uint r1 = idx->position[i1][a].chromosome ; uint r2 = idx->position[i1][b].chromosome ; if ( reads[r2].p1.seq != reads[r1].p1.seq && reads[r2].p1.seq != reads[r1].p2.seq && reads[r2].p2.seq != reads[r1].p1.seq && reads[r2].p2.seq != reads[r1].p2.seq ) continue ; } } } if ( DEBUGGING ) { fprintf ( stdout , "done.\n" ) ; fflush(stdout); } } string combine_strings ( const string &s1 , const string &s2 , int max = -1 ) { if ( max == -1 ) max = idx->index_length_double ; string ret ; int s1l = s1.length() ; int s2l = s2.length() ; int maxa = s2l - max ; int a , b ; for ( a = - ( s1l - max ) ; a < maxa ; a++ ) { int maxb = s2l < s1l-a ? s2l : s1l-a ; for ( b = a<0?-a:0 ; b < maxb && s1[a+b] == s2[b] ; b++ ) ; if ( b < maxb ) continue ; if ( !ret.empty() ) return string () ; // Do not allow multiple hits if ( a < 0 ) { ret = s2.substr ( 0 , -a ) + s1 ; if ( s1l-a < s2l ) ret += s2.substr ( s1l-a ) ; } else { ret = s1.substr ( 0 , a ) + s2 ; if ( s2l+a < s1l ) ret += s1.substr ( s2l+a ) ; } } return ret ; } */ string merge_strings ( string s1 , string s2 ) { string ret ; if ( s1.length() > s2.length() ) { string tmp = s1 ; s1 = s2 ; s2 = tmp ; } int sl1 = s1.length() ; int sl2 = s2.length() ; int best = sl1 + sl2 ; int bestpos = 0 ; int a , b ; int min_offset = 5 ; for ( a = -sl1 + min_offset ; a < sl2 - min_offset ; a++ ) { int match = 0 ; int mismatch = 0 ; for ( b = 0 ; b < sl1 ; b++ ) { int p2 = a + b ; if ( p2 < 0 || p2 >= sl2 ) continue ; if ( s1[b] == 'N' || s2[p2] == 'N' ) continue ; if ( s1[b] == s2[p2] ) match++ ; else mismatch++ ; } if ( mismatch > match / 3 ) continue ; int total = match + mismatch ; if ( mismatch < best ) { best = mismatch ; bestpos = a ; } } if ( best > 5 ) return ret ; // Hard cutoff ret = s2 + pad ( sl1 ) ; if ( bestpos < 0 ) { ret = pad ( -bestpos ) + ret ; bestpos = 0 ; } // Merge for ( a = 0 ; a < sl1 ; a++ ) { b = bestpos + a ; if ( s1[a] == ret[b] ) continue ; if ( ret[b] != 'N' && s1[a] != ' ' ) { if ( ret[b] == ' ' ) ret[b] = s1[a] ; else ret[b] = 'N' ; } } // Trim blanks for ( a = 0 ; ret[a] == ' ' ; a++ ) ; if ( a > 0 ) ret = ret.substr ( a , ret.length()-a ) ; for ( a = 0 ; a < ret.length() && ret[a] != ' ' ; a++ ) ; ret = ret.substr ( 0 , a ) ; return ret ; } void read_fastq ( string filename , vector &vr , int min_qual = 0 ) { if ( DEBUGGING ) { fprintf ( stdout , "Reading solexa pair data from %s ... " , filename.c_str() ) ; fflush(stdout); } FILE * file = fopen ( filename.c_str() , "r" ) ; if ( !file ) { cerr << "Could not open FASTQ file \"" << filename << "\" - aborting." << endl ; exit ( 1 ) ; } char dummy[READ_CHAR_BUFFER] , *c1 , *c2 ; string lastseq , lastqual , last_solexa_name ; bool fasta = true ; bool last_was_quality = false , lastone = false , eof = false , skip = false , goodqual = true ; uint readcount = 0 , matched_reads = 0 , total_matches = 0 , skipped = 0 , badqualcnt = 0 ; while ( !eof || lastone ) { if ( lastone ) { strcpy ( dummy , "@dummy" ) ; if ( fasta ) *dummy = '>' ; } else { *dummy = 0 ; fgets ( dummy , READ_CHAR_BUFFER , file ) ; } // Remove EOL for ( c1 = dummy ; *c1 > 13 ; c1++ ) ; *c1 = 0 ; if ( ( fasta && *dummy == '>' ) || *dummy == '@' ) { if ( *dummy == '@' ) fasta = false ; if ( !skip ) { for ( c1 = (char*) lastseq.c_str() ; *c1 && !isIUPAC[*c1] ; c1++ ) ; if ( *c1 ) { // Contains IUPAC } else if ( !goodqual ) { badqualcnt++ ; } else if ( !lastseq.empty() ) { vr.push_back ( TRead ( lastseq ) ) ; } } goodqual = true ; if ( lastone ) break ; // Clear data c1 = dummy ; if ( *c1 == '>' || *c1 == '@' ) c1++ ; while ( *c1 == ' ' ) c1++ ; last_solexa_name = c1 ; readcount++ ; last_was_quality = false ; lastseq.clear() ; lastqual.clear() ; } else if ( *dummy == '+' ) { last_was_quality = true ; } else if ( last_was_quality ) { if ( min_qual > 0 ) { goodqual = true ; for ( c1 = dummy ; goodqual && *c1 > 20 ; c1++ ) { // cout << (int) *c1-33 << ", " ; goodqual = ( *c1-33 ) >= min_qual ; } // cout << endl ; } last_was_quality = false ; // lastqual += dummy ; } else { lastseq += dummy ; } eof = feof ( file ) ; // if ( vr.size() >= 100000 ) eof = 1 ; // TESTING!!!! if ( eof ) lastone = true ; // Fake last read } if ( DEBUGGING ) { fprintf ( stdout , "scanned %d solexa reads, %d (%2.2f%%) matched , %d total matches, %d skipped, %d bad quality.\n" , readcount , matched_reads , (float) matched_reads * 100 / readcount , total_matches , skipped , badqualcnt ) ; fflush(stdout); } } // __________________________________________________________________________________________________________________________________________________________________ MAIN /// Shows possible parameters, and an (optional) error message. void die_with_description ( const char *msg = NULL ) { if ( msg ) cerr << msg << endl ; cerr << "Parameters : \n" ; cerr << "--genome=GENOME_FILE FASTA file with chromosomes (mandatory)\n" ; cerr << "--fasta=FASTA_FILE FASTA file with Solexa reads (mandatory, except when --fastq or --index is used)\n" ; cerr << "--fastq=FASTQ_FILE FASTQ file with Solexa reads (mandatory, except when --fasta or --index is used)\n" ; cerr << "--nono=FILENAME File with list of read names (!) to ignore (optional)\n" ; cerr << "--regions=REGION_FILE Region file for finding new SNPs (optional) [DEPRECATED]\n" ; cerr << "--snps=SNP_FILE Simple SNP file (optional)\n" ; cerr << "--gff=GFF_FILE GFF file with SNPs (optional)\n" ; cerr << "--uniqueness=FILE Output a uniqueness data file for the reference; no Solexa reads needed; implies --noshortcuts (optional)\n" ; cerr << "--pileup=FILE Outputs complete pileup into that file (optional)\n" ; cerr << "--cigar=FILE Outputs alignment in CIGAR format (optional)\n" ; cerr << "--gffout=FILE Outputs reads in GFF format (optional)\n" ; cerr << "--coverage=FILENAME Outputs (high depth) coverage data (optional)\n" ; cerr << "--wobble=FILENAME Outputs a list of possible variations (optional; paired reads only) [UNDER CONSTRUCTION]\n" ; cerr << "--fragmentplot=FILENAME Outputs a plot of fragment size distribution to a file (optional)\n" ; cerr << "--snpsinreads=FILENAME Outputs a list of reads containing known SNPs to a file (optional)\n" ; cerr << "--indelplot=FILENAME Outputs indel data to a file (optional)\n" ; cerr << "--inversions=FILENAME For paired reads, writes read matches indicating inversions into a file (optional)\n" ; cerr << "--bins=FILE_PREFIX Outputs no-match, single-match and multi-match Solexa reads into prefixed files (optional)\n" ; cerr << "--binmask=MASK Mask of 1s and 0s to turn off individual bins. Order: No match, single match, multi-match, IUPAC.\n" ; cout << " Example: 0100 creates only single-match bin. (optional; default:1111)\n" ; cerr << "--pair=NUMBER For paired reads, the length of the first part of the read (mandatory for paired reads)\n" ; cerr << "--fragment=NUMBER For paired reads, the average fragment length (mandatory for paired reads)\n" ; cerr << "--variance=NUMBER For paired reads, the variance of the fragment length to either side (optional; default: 1/4 of fragment size)\n" ; cerr << "--wobblemax=NUMBER Maximum number of mismatches for wobble (optional; default 2; see --wobble)\n" ; cerr << "--mspi=NUMBER Maximum number of SNPs per chromosomal index (optional; default:8)\n" ; cerr << "--index=FILENAME Index filename (index will be created if it does not exist; optional)\n" ; cerr << "--noshortcuts Will process all chrososomal regions, even those with lots'o'repeats (optional; no value)\n" ; cerr << "--snpsonly Only lists found SNPs in the pileup (optional; no value)\n" ; cerr << "--chromosome=NAME Discards all chromosomes but NAME prior to run (optional)\n" ; cerr << "--chop=NUMBER For paired reads, if one but not the other matches, shorten the other by NUMBER bases (optional)\n" ; cerr << "--index1=NUMBER Length of internal index 1 (optional; default:10)\n" ; cerr << "--index2=NUMBER Length of internal index 2 (optional; default:16)\n" ; cerr << "--multimatch Puts a multiple-matching read to a random position (optional) [currently paired reads only]\n" ; cerr << "--foum For paired reads, at least one read has to match uniquely in the genome (force one unique match) (optional)\n" ; // cerr << "--featuretable=OUTPUT_FILE\tOutputs a DDBJ/EMBL/GenBank feature table for matched reads (optional; paired reads only for now)\n" ; exit ( 0 ) ; } class TContig { public : TContig () { remove = false ; } string sequence ; // vector reads ; bool remove ; } ; vector contigs ; map cache ; vector contig_idx ; void initialize_contigs ( vector & vr , string start ) { unsigned int a , b ; for ( a = 0 ; a < vr.size() ; a++ ) { // char *cc = (char*) vr[a].p1.seq.c_str() ; // if ( *(cc) != 'A' || *(cc+1) != 'T' || *(cc+2) != 'G' ) continue ; if ( vr[a].p1.seq.substr(0,3) == start && cache.end() == cache.find ( vr[a].p1.seq ) ) { cache[vr[a].p1.seq] = true ; TContig c ; c.sequence = vr[a].p1.seq ; contigs.push_back ( c ) ; vr[a].used = true ; } if ( vr[a].p2.seq.substr(0,3) == start && cache.end() == cache.find ( vr[a].p2.seq ) ) { cache[vr[a].p2.seq] = true ; TContig c ; c.sequence = vr[a].p2.seq ; contigs.push_back ( c ) ; vr[a].used = true ; } } cout << contigs.size() << " contigs created (from " << vr.size() << " reads).\n" ; } void add_contig ( string seq , int readnum ) { if ( cache.end() == cache.find ( seq ) ) { TContig c ; c.sequence = seq ; contigs.push_back ( c ) ; cache[seq] = 1 ; } else { } } int grow_contigs ( vector & vr , int pos , string start ) { cout << "Growing at position " << pos << " ... " ; int read_length = vr[0].p1.seq.length() ; int ret = 0 ; contig_idx.clear() ; for ( unsigned int a = 0 ; a < contigs.size() ; a++ ) { TPositionWithIndex p ; p.index2 = idx->calculate_index2 ( (char*) contigs[a].sequence.c_str() ) ; p.chromosome = a ; contig_idx.push_back ( p ) ; } sort ( contig_idx.begin() , contig_idx.end() ) ; for ( unsigned int a = 0 ; a < contigs.size() ; a++ ) { if ( contigs[a].remove ) continue ; // Marked for deletion string cseq = contigs[a].sequence ; if ( read_length + pos > cseq.length() + MAX_NEW ) continue ; int len = cseq.length() - pos ; if ( len > read_length ) len = read_length ; string csub = cseq.substr ( pos , len ) ; contigs[a].remove = true ; bool altered = false ; if ( csub.substr ( 0 , 3 ) == start ) { // Elongate with another "contig" csub = cseq.substr ( pos , cseq.length()-pos ) ; unsigned int i2 = idx->calculate_index2 ( (char*) csub.c_str() ) ; unsigned int b ; for ( unsigned int b2 = 0 ; b2 < contig_idx.size() ; b2++ ) { if ( contig_idx[b2].index2 > i2 ) break ; if ( contig_idx[b2].index2 < i2 ) continue ; b = contig_idx[b2].chromosome ; if ( a == b ) continue ; // No self-elongation // if ( contigs[b].remove ) continue ; if ( contigs[b].sequence.length() <= csub.length() ) continue ; // Contig smaller than fragment if ( contigs[b].sequence.substr ( 0 , csub.length() ) == csub ) { string s = cseq.substr ( 0 , pos ) + contigs[b].sequence ; add_contig ( s , -1 ) ; altered = true ; contigs[b].remove = true ; break ; } } } else { // Try to elongate with reads uint i1 = idx->calculate_index ( (char*) csub.c_str() ) ; uint i2 = idx->calculate_index2 ( (char*) csub.c_str() + idx->index_length ) ; for ( unsigned d = 0 ; d < idx->position[i1].size() ; d++ ) { if ( idx->position[i1][d].index2 > i2 ) break ; if ( idx->position[i1][d].index2 < i2 ) continue ; unsigned int b = idx->position[i1][d].chromosome ; if ( vr[b].used ) continue ; string *sp = idx->position[i1][d].reverse_complement ? &(vr[b].p2.seq) : &(vr[b].p1.seq) ; if ( sp->substr ( 0 , len ) != csub ) continue ; string s = cseq.substr ( 0 , pos ) + *sp ; if ( s.length() <= cseq.length() ) continue ; // No new length increase add_contig ( s , b ) ; vr[b].used = true ; altered = true ; } } if ( altered ) ret++ ; else contigs[a].remove = false ; } // Cleanup for ( int a = 0 ; a < contigs.size() ; a++ ) { if ( !contigs[a].remove ) continue ; cache.erase ( contigs[a].sequence ) ; contigs[a] = contigs[contigs.size()-1] ; contigs.pop_back() ; a-- ; } cout << ret << " contigs grew (" << contigs.size() << " contigs total)." << endl ; return ret ; } void assemble_contigs ( vector &vr , vector &chrs , string start ) { initialize_contigs ( vr , start ) ; int a = 1 , growing , growth ; do { growing = grow_contigs ( vr , a++ , start ) ; if ( growing > 0 ) growth = 10 ; else growth-- ; } while ( growth > 0 ) ; cout << "Writing contigs to contigs.fasta\n" ; ofstream fout( "contigs.fasta", ios::out|ios::trunc ); for ( a = 0 ; a < contigs.size() ; a++ ) { if ( contigs[a].sequence.length() == vr[0].p1.seq.length() ) continue ; string s = contigs[a].sequence ; //new_contigs.push_back ( s ) ; // FIXME : Add to chrs fout << ">contig_" << a << "_" << s.length() << endl ; while ( !s.empty() ) { if ( s.length() >= 60 ) { fout << s.substr ( 0 , 60 ) << endl ; s = s.substr ( 60 ) ; } else { fout << s << endl ; break ; } } } fout.close() ; int used = 0 ; for ( a = 0 ; a < vr.size() ; a++ ) { if ( vr[a].used ) used++ ; vr[a].used = false ; // Resetting } cout << used << " of " << vr.size() << " reads (" << 100 * used / vr.size() << "%) used in contigs\n" ; // Cleanup contigs.clear() ; cache.clear() ; contig_idx.clear() ; } void write_chromosomes_to_fasta ( vector &chrs ) { cout << "Writing contigs to chrs.fasta\n" ; ofstream fout( "chrs.fasta", ios::out|ios::trunc ); map seen ; for ( int a = 0 ; a < chrs.size() ; a++ ) { string s = chrs[a].sequence ; if ( seen.end() != seen.find ( s ) ) continue ; // Wrote that one already seen[s] = true ; fout << ">" << chrs[a].name << endl ; while ( !s.empty() ) { if ( s.length() >= 60 ) { fout << s.substr ( 0 , 60 ) << endl ; s = s.substr ( 60 ) ; } else { fout << s << endl ; break ; } } } fout.close() ; } void make_read_indices ( vector &vr ) { // Create indices for ( uint a = 0 ; a < vr.size() ; a++ ) { vr[a].reset ( a ) ; } // Sort indices for ( uint a = 0 ; a < idx->position.size() ; a++ ) { sort ( idx->position[a].begin() , idx->position[a].end() ) ; } } void join_chromosomes ( vector &vr , uint index_length_1 , uint index_length_2 , string genome_file ) { for ( int cnt = 0 ; cnt < 100 ; cnt++ ) { cout << "Chromosome joining, round " << cnt << endl ; vector chrs ; TChromosomalIndices contig_index ( index_length_1 , index_length_2 ) ; // contig_index.noshortcuts = noshortcuts ; idx = &contig_index ; load_all_chromosomes ( genome_file , chrs , true ) ; contig_index.run ( &chrs ) ; TChromosomeAlign ca ( chrs , contig_index ) ; ca.init () ; for ( uint a = 0 ; a < vr.size() ; a++ ) { pwi_vector res1 , res2 ; ca.get_align_solexa_read ( vr[a].p1.seq.c_str() , res1 ) ; if ( res1.size() != 1 ) continue ; ca.get_align_solexa_read ( vr[a].p2.seq.c_str() , res2 ) ; if ( res2.size() != 1 ) continue ; if ( res1[0].chromosome == res2[0].chromosome ) continue ; if ( res1[0].reverse_complement == res2[0].reverse_complement ) continue ; // Abusing uniqueness vector to store connections chrs[res1[0].chromosome].uniqueness.push_back ( res2[0].chromosome ) ; chrs[res2[0].chromosome].uniqueness.push_back ( res1[0].chromosome ) ; } for ( uint a = 0 ; a < chrs.size() ; a++ ) { sort ( chrs[a].uniqueness.begin() , chrs[a].uniqueness.end() ) ; chrs[a].uniqueness.push_back ( chrs.size()+1 ) ; // Dummy, will be ignored uint last = chrs.size()+1 ; int cnt = -1 ; for ( uint b = 0 ; b < chrs[a].uniqueness.size() ; b++ ) { if ( chrs[a].uniqueness[b] == last ) { cnt++ ; } else { if ( cnt >= 0 ) { if ( cnt > 5 ) { string joined = merge_strings ( chrs[a].sequence , chrs[last].sequence ) ; if ( !joined.empty() ) { chrs[a].sequence = joined ; chrs[last].sequence = joined ; } // cout << a << " => " << last << " (" << cnt << "x)\n" ; // cout << chrs[a].sequence << endl << chrs[last].sequence << endl ; // cout << merge_strings ( chrs[a].sequence , chrs[last].sequence ) << endl ; } } cnt = 1 ; last = chrs[a].uniqueness[b] ; } } } write_chromosomes_to_fasta ( chrs ) ; } } /// Well, guess what. int main ( int argc , char **argv ) { string genome_file , gff_file , simple_snp_file , fastq_file , fasta_file , pileup_file , bin_prefix , regions_file ; string cigar_file , wobble_file , nono , fragmentplot , featuretable , gffout , coverage_file , uniqueness , snpsinreads ; string index_file , indelplot_file , inversions_file , chromosome ; string binmask ( "1111" ) ; uint wobblemax = 2 ; uint mspi = 8 ; uint pair_length = 0 ; uint fragment_length = 0 ; uint index_length_1 = 10 ; uint index_length_2 = 16 ; int noshortcuts = false ; int snpsonly = false ; int chop = 0 ; uint variance = 0 ; int multimatch = false ; int force_one_unique_match = false ; // Parameters static struct option long_options[] = { { "genome" , required_argument , 0 , 'g' } , { "fasta" , optional_argument , 0 , 'a' } , { "fastq" , optional_argument , 0 , 'q' } , { "snps" , optional_argument , 0 , 's' } , { "coverage" , optional_argument , 0 , 'e' } , { "gff" , optional_argument , 0 , 'f' } , { "pileup" , optional_argument , 0 , 'p' } , { "bins" , optional_argument , 0 , 'b' } , { "binmask" , optional_argument , 0 , 'z' } , { "index" , optional_argument , 0 , 'x' } , { "nono" , optional_argument , 0 , 'n' } , { "uniqueness" , optional_argument , 0 , '0' } , { "wobble" , optional_argument , 0 , 'w' } , { "pair" , optional_argument , 0 , 'i' } , { "fragment" , optional_argument , 0 , 't' } , { "featuretable" , optional_argument , 0 , 'u' } , { "gffout" , optional_argument , 0 , 'y' } , { "variance" , optional_argument , 0 , 'v' } , { "mspi" , optional_argument , 0 , 'm' } , { "cigar" , optional_argument , 0 , 'c' } , { "fragmentplot" , optional_argument , 0 , 'o' } , { "snpsinreads" , optional_argument , 0 , 'd' } , { "regions" , optional_argument , 0 , 'r' } , { "index1" , optional_argument , 0 , '1' } , { "index2" , optional_argument , 0 , '2' } , { "chop" , optional_argument , 0 , '3' } , { "indelplot" , optional_argument , 0 , '4' } , { "wobblemax" , optional_argument , 0 , '5' } , { "inversions" , optional_argument , 0 , '6' } , { "chromosome" , optional_argument , 0 , '7' } , { "multimatch" , optional_argument , &multimatch , true } , { "noshortcuts" , optional_argument , &noshortcuts , true } , { "foum" , optional_argument , &force_one_unique_match , true } , { "snpsonly" , optional_argument , &snpsonly , true } , { 0 , 0 , 0 , 0 } } ; int c ; int option_index = 0; while ( -1 != ( c = getopt_long (argc, argv, "g:a::q::s::f::p::b::r::w::n::",long_options, NULL) ) ) { switch ( c ) { case 0 : break ; case 'g' : genome_file = optarg ; break ; case 'a' : fasta_file = optarg ; break ; case 'q' : fastq_file = optarg ; break ; case 's' : simple_snp_file = optarg ; break ; case 'f' : gff_file = optarg ; break ; case 'e' : coverage_file = optarg ; break ; case 'p' : pileup_file = optarg ; break ; case 'b' : bin_prefix = optarg ; break ; case 'z' : binmask = optarg ; break ; case '0' : uniqueness = optarg ; break ; case 'w' : wobble_file = optarg ; break ; case 'n' : nono = optarg ; break ; case 'x' : index_file = optarg ; break ; case 'i' : pair_length = atoi ( optarg ) ; break ; case 'o' : fragmentplot = optarg ; break ; case 'u' : featuretable = optarg ; break ; case 'y' : gffout = optarg ; break ; case 'd' : snpsinreads = optarg ; break ; case 'v' : variance = atoi ( optarg ) ; break ; case 't' : fragment_length = atoi ( optarg ) ; break ; case 'm' : mspi = atoi ( optarg ) ; break ; case '1' : index_length_1 = atoi ( optarg ) ; break ; case '2' : index_length_2 = atoi ( optarg ) ; break ; case '3' : chop = atoi ( optarg ) ; break ; case '4' : indelplot_file = optarg ; break ; case '5' : wobblemax = atoi ( optarg ) ; break ; case '6' : inversions_file = optarg ; break ; case '7' : chromosome = optarg ; break ; case 'c' : cigar_file = optarg ; break ; case 'r' : regions_file = optarg ; break ; default : die_with_description("Unknown option") ; } } binmask += "0000" ; // Ignore bins if shorter string was supplied init_iupac () ; TChromosomalIndices main_index ( index_length_1 , index_length_2 ) ; main_index.noshortcuts = noshortcuts ; idx = &main_index ; // FASTA sequence? Just try to join them up and ignore everything smaller than 100 bases. /* if ( !fasta_file.empty() ) { vector chrs ; load_all_chromosomes ( fasta_file , chrs ) ; vector vs ; for ( int a = 0 ; a < chrs.size() ; a++ ) vs.push_back ( chrs[a].sequence ) ; join_single_sequences ( vs ) ; for ( int a = 0 ; a < vs.size() ; a++ ) { if ( vs[a].length() < 100 ) continue ; // Hard cutoff cout << ">CONTIG" << a << "_(" << vs[a].length() << ")" << endl << vs[a] << endl ; } return 0 ; }*/ vector vr ; read_fastq ( fastq_file , vr , 5 ) ; make_read_indices ( vr ) ; vector chrs ; assemble_contigs ( vr , chrs , "ATG" ) ; // join_chromosomes ( vr , index_length_1 , index_length_2 , genome_file ) ; // align_perfect ( vr , 0 ) ; /* vector vs ; vector vr , vn ; read_fastq ( fastq_file , vr ) ; vn.swap ( vr ) ; int iteration = 0 ; do { cerr << "Iteration " << ++iteration << endl ; cerr << vn.size() << " items left" << endl ; vn.swap ( vr ) ; vn.clear() ; vn.reserve ( vr.size() ) ; interlink_reads ( vr ) ; for ( uint a = 0 ; a < idx->position.size() ; a++ ) { idx->position[a].clear() ; } create_contigs ( vr , vn ) ; // for ( int a = 0 ; a < vr.size() ; a++ ) { // if ( vr[a].used ) continue ; // vn.push_back ( vr[a] ) ; // } for ( int a = 0 ; a < vn.size() ; a++ ) { vn[a].reset ( a ) ; } if ( remove_full_sequences ( vn , vs ) == 0 && iteration > 5 ) break ; if ( iteration > 15 ) break ; // Hard cutoff } while ( 1 ) ; // join_single_sequences ( vs ) ; for ( int a = 0 ; a < vs.size() ; a++ ) { if ( vs[a].length() < 100 ) continue ; // Hard cutoff cout << ">CONTIG" << a << "_(" << vs[a].length() << ")" << endl << vs[a] << endl ; }*/ return 0 ; } // rm -f reassemble ; make clean ; make reassemble ; ./reassemble --fastq=/nfs/repository/d0071/SLX_ST_1451_BK1/1547_s_2.fastq --pair=76 --genome=contigs.fasta snpomatic-1.0/src/snpomatic.h000066400000000000000000000241621260772140700163170ustar00rootroot00000000000000#ifndef __SNPOMATIC_H__ #define __SNPOMATIC_H__ #include #include #include #include #include #include #include #include #include #include #include #include using namespace std ; #ifdef __APPLE__ #define ulong unsigned long #endif // __________________________________________________________________________________________________________________________________________________________________ DEFINES // DEBUGGING: set to 1 to get status information over STDOUT #define DEBUGGING 1 // READ_CHAR_BUFFER: Buffer size for reading a FASTA/FASTQ line #define READ_CHAR_BUFFER 1024 // MINQUAL: Make bases in output lowercase if their quality falls below this threshold #define MINQUAL 20 // MAXLINES: Max number of lines in alignment #define MAXLINES 70 // Misc stuff #define IUPAC_A 1 #define IUPAC_C 2 #define IUPAC_G 4 #define IUPAC_T 8 #define IUPAC_ALL 15 // Macros (to save runtime) #define MERGE_IUPAC(_a,_b) IUPAC2char[char2IUPAC[_a]|char2IUPAC[_b]] #define twotoone(__lane,__pos) ((__pos)*MAXLINES+(__lane)) #define REVERSE_COMPLEMENT_CHARP(__start) { \ char *__c , *__d , __e ; \ for ( __c = __start ; *__c ; __c++ ) ; \ for ( __c-- , __d = __start ; __d < __c ; __c-- , __d++ ) { \ __e = base_rc[*__d] ; \ *__d = base_rc[*__c] ; \ *__c = __e ; \ } \ if ( __c == __d ) *__c = base_rc[*__c] ; \ } // __________________________________________________________________________________________________________________________________________________________________ TYPEDEFS class TPositionWithIndex ; class TChromosome ; class TCoverage ; typedef unsigned char uchar ; typedef unsigned int index2type ; typedef vector vuint ; typedef vector pwi_vector ; typedef vector chrvec ; typedef vector ::iterator TPWIi ; typedef vector ::iterator TSVI ; typedef vector TVC ; // __________________________________________________________________________________________________________________________________________________________________ GLOBAL VARIABLES extern uchar char2num[256] , num2char[256] , to_lc[256] , to_uc[256] , char2IUPAC[256] , IUPAC2char[256] , isIUPAC[256] , base_rc[256] , IUPAC_variance[256] ; extern int globally_allowed_mismatches ; // __________________________________________________________________________________________________________________________________________________________________ GLOBAL FUNCTIONS void Tokenize(const string& str, vector& tokens, const string& delimiters = " ") ; void init_iupac () ; bool matchIUPAC ( const char *c1 , const char *c2 ) ; void load_all_chromosomes ( string filename , vector &chrs , bool keep_original_snps = false ) ; void incorporate_all_gff ( string filename , vector &chrs ) ; void incorporate_simple_snps ( string filename , vector &chrs ) ; // __________________________________________________________________________________________________________________________________________________________________ CLASSES class TCoverage { public : TCoverage () { A = C = G = T = rc = 0 ; } uint A , C , G , T , rc ; } ; /// Stores and outputs the pileup. class TAlignmentOutput { public : char *align ; ///< Two-dimensional alignment array for the bases. char *qalign ; ///< Same as *align, but for quality. vector align_full ; ///< Shortcut mechanism to speed up alignment of extreme depths. void init ( TChromosome *_chr ) ; ///< Initialization of align and qualign memory. void show_pileup ( FILE *pileup , bool snps_only ) ; ///< Outputs the alignment to the given file. void add_align ( const string &seq , const string &quality , uint pos , int chromosome ) ; TChromosome *chr ; ///< Pointer back to the chromosome. protected: uint align_size ; } ; /// Stores secondary index, position, chromosome, and reverse/complement information. Heavily used by TChromosomalIndices. class TPositionWithIndex { public : index2type index2 ; ///< The secondary index. uint position ; ///< The position in the chromosome. bool reverse_complement ; ///< Is this reverse/complement? uint chromosome ; ///< Number of chromosome. } ; /// Calculates and stores the indices on the chromosomes. class TChromosomalIndices { public: TChromosomalIndices ( int _index_length , int _index_length2 ) ; void run_reads ( string filename ) ; void run ( chrvec *_chrs ) ; void run_ends ( chrvec *_chrs , int min_length = 0 ) ; void run_regions ( chrvec *_chrs , string filename ) ; uint calculate_index ( const char *sequence_kmer ) ; index2type calculate_index2 ( const char *sequence_kmer ) ; void save_index ( string filename ) ; void load_index ( string filename ) ; void uniqueness ( string filename ) ; string decompile ( uint index , uint bases ) ; vector position ; ///< The complete index. int index_length , index_length2 , index_length_double , noshortcuts ; int max_snps_per_index ; ///< Maximal number of known SNPs per index (1+2); prevent excessive memory usage. chrvec *chrs ; ///< Pointer to vector with all the chromosomes (TChromosome). string index_file ; uint index_from , index_to ; uint index_steps ; private: int add_sequence ( char *begin , char *cur , int from , int chr , bool rc , int nos = 0 ) ; void add_index ( char *start , bool rc , uint pos , uchar chromosome ) ; void reassemble_reads ( FILE *file , uint i1 , index2type i2a , index2type i2b ) ; bool align_read2contigs ( char *read , vector &vs ) ; char tmp[READ_CHAR_BUFFER] ; } ; /// Holds an individual chromosome. class TChromosome { public: string name , sequence , original_sequence ; TAlignmentOutput ao ; TVC coverage ; vector uniqueness ; void read_chromosome ( string filename , string chromosome_name ) ; void incorporate_known_snps ( string filename ) ; void incorporate_known_snps_gff ( string filename ) ; void add_coverage ( const string &seq , uint pos , bool rc ) ; } ; /// Reads Solexa data from FASTA/FASTQ and finds perfect matches in the reference. class TChromosomeAlign { public: TChromosomeAlign ( chrvec &_chrs , TChromosomalIndices &_index ) ; void init () ; void align_solexa_fasta ( string filename ) ; void align_solexa_fastq ( string filename ) ; void align_solexa_paired ( string filename , string filename2 , uint read_length_1 , uint fragment_length , uint range ) ; void align_solexa_fastq_variety ( string filename ) ; void show_pileup ( bool snps_only = false ) ; void align_contigs ( chrvec &contigs , TChromosomalIndices &contig_index , uint maxerr , string output_filename ) ; void add_nono_list ( string filename ) ; bool is_nono ( const string &s ) ; void dump_coverage () ; void finish_sqlite ( int fragment , int variance , int read_length ) ; void get_align_solexa_read ( const char *_seq , pwi_vector &results ) ; FILE *pileup , *binfile_no_match , *binfile_single_match , *binfile_multi_match , *snpsinreads ; FILE *binfile_iupac , *cigar , *wobble , *fragmentplot , *featuretable , *gffout , *coverage ; FILE *indelplot , *inversions , *sqlite , *sam , *spancontigs , *faceaway , *rpa ; bool using_bins , multimatch , singlematch , force_one_unique_match ; uint chop , wobblemax , single_read_length ; protected: int align_solexa_read ( char *_seq , const string &_quality , bool inv_compl = false ) ; void add_cigar_sqlite ( const string &sequence , const string &name , uint position , uint chromosome , char orientation ) ; virtual uint align_solexa_paired_read ( const string &name , const string &sequence , string quality , uint read_length_1 , uint fragment_length , uint range ) ; int paired_read_combine ( const pwi_vector &v1 , const pwi_vector &v2 , uint fragment_length , uint range , uint read_length_1 , const string &seq1 , const string &qual1 , const string &seq2 , const string &qual2 , bool wobbling = false , bool inverting = false , int order = 0 ) ; void write2bin ( const string &name , const string &sequence , const string &quality , int matches ) ; void run_wobble ( const string &seq1 , const string &seq2 , const string &qual1 , const string &qual2 , const pwi_vector &res1 , const pwi_vector &res2 , uint fragment_length ) ; void wobble4snps ( const pwi_vector &res , const string &seq , uint fragment_size , uint range , const string &anchor_seq , bool rc ) ; void add_coverage ( const string &seq , uint pos ) ; int try_matching_read ( const string & seq , string quality ) ; string get_ref_substr ( const TPositionWithIndex &p , int length ) ; void add_snpsinreads ( const string &seq , const string &quality , int chr , int pos , int readpart ) ; int count_snpsinreads ( const string &seq , int chr , int pos ) ; void add_sam ( const string &seq , const string &quality , int chr , int pos , int readpart , int matepos , int ins_size , bool rc , bool as_pair , bool mate_rc , bool unique_match ) ; void wobble_single_read ( char *seq , const string &_quality ) ; string matchIUPAC_tolerant ( const char *c1 , const char *c2 , int tolerance ) ; string generate_sqlite_paired_command ( int chr , string seq1 , int pos1 , string seq2 , int pos2 , char mode = ' ' , string add_keys = "" , string add_values = "" ) ; void add2sqlite_cache ( string s ) ; void rpa_out ( const TPositionWithIndex &a , const TPositionWithIndex &b , uint read_length_1 , const string &seq1 , const string &qual1 , const string &seq2 , const string &qual2 ) ; uint max_align ; string last_solexa_name ; vector sqlite_out ; FILE *sqlite_cache ; string sqlite_cache_name ; chrvec *chrs ; TChromosomalIndices *index ; bool fasta , use_nono , sam_wrote ; pwi_vector wobble_results ; vector nono ; uint fragment_range ; vector fragment_stats ; bool snpsinreads_first ; bool sqlite_prefix_first ; bool rpa_header_written ; string sqlite_prefix ; string rgid ; } ; class prc_cache { public : TPositionWithIndex a , b ; } ; bool operator < ( const TPositionWithIndex &a , const TPositionWithIndex &b ) ; bool tpi_full_compare ( const TPositionWithIndex a , const TPositionWithIndex b ) ; string dor ( string s ) ; string doc ( string s ) ; string dorc ( string s ) ; #endif snpomatic-1.0/src/ungap.cpp000066400000000000000000000233141260772140700157650ustar00rootroot00000000000000#include "snpomatic.h" #include int maxfrag = 0 ; class TLeftovers { public : string seq ; int pos ; // Not used ATM bool used ; vector keys ; vector related ; } ; class TRegionScan : public TChromosomeAlign { public: TRegionScan ( chrvec &_chrs , TChromosomalIndices &_index ) : TChromosomeAlign ( _chrs , _index ) {} ; void assemble_leftovers ( int cnt ) ; vector leftovers ; protected: virtual uint align_solexa_paired_read ( const string &name , const string &sequence , string quality , uint read_length_1 , uint fragment_length , uint range ) ; int get_overlap ( TLeftovers &l1 , TLeftovers &l2 ) ; TLeftovers merge_reads ( TLeftovers &l1 , TLeftovers &l2 , int overlap ) ; } ; bool operator < ( const TLeftovers &a , const TLeftovers &b ) { return a.seq < b.seq ; } TLeftovers TRegionScan::merge_reads ( TLeftovers &l1 , TLeftovers &l2 , int overlap ) { TLeftovers l ; l.seq = l2.seq.substr ( 0 , overlap ) + l1.seq ; for ( int n = 0 ; n + 16 < l.seq.length() ; n++ ) { l.keys.push_back ( index->calculate_index2 ( l.seq.c_str() + n ) ) ; } l.used = false ; return l ; } int TRegionScan::get_overlap ( TLeftovers &l1 , TLeftovers &l2 ) { int a, b ; int o = -1 ; for ( a = 0 ; a < l1.keys.size() ; a++ ) { for ( b = a+1 ; b < l2.keys.size() ; b++ ) { if ( l1.keys[a] != l2.keys[b] ) continue ; int diff = b - a ; if ( o == -1 ) o = diff ; else if ( o != diff ) return -1 ; // More than one overlap } } /* string s1 = l1.seq.substr ( o , l1.seq.length() - o ) ; string s2 = l2.seq ; if ( s1.length() < s2.length() ) s2 = s2.substr ( 0 , s1.length() ) ; else if ( s2.length() < s1.length() ) s1 = s1.substr ( 0 , s2.length() ) ; if ( s1 != s2 ) return -1 ; // No match */ return o ; } void TRegionScan::assemble_leftovers ( int cnt ) { uint a , b ; vector lo ; cout << "Assembling, depth " << cnt << endl ; for ( a = 0 ; a+1 < leftovers.size() ; a++ ) { for ( b = a+1 ; b < leftovers.size() ; b++ ) { int o = get_overlap ( leftovers[a] , leftovers[b] ) ; if ( o < 0 ) continue ; // No overlap, or more than one leftovers[a].used = true ; leftovers[b].used = true ; lo.push_back ( merge_reads ( leftovers[a] , leftovers[b] , o ) ) ; // cout << leftovers[a].seq << " <=> " << leftovers[b].seq << " : " << o << endl ; } } if ( cnt <= 0 ) return ; for ( a = 0 ; a < leftovers.size() ; a++ ) { if ( leftovers[a].used ) continue ; lo.push_back ( leftovers[a] ) ; } // Add unique sequences as new source set sort ( lo.begin() , lo.end() ) ; leftovers.clear() ; for ( a = 0 ; a < lo.size() ; a++ ) { if ( a == 0 || lo[a-1].seq != lo[a].seq ) leftovers.push_back ( lo[a] ) ; } assemble_leftovers ( cnt - 1 ) ; } uint TRegionScan::align_solexa_paired_read ( const string &name , const string &sequence , string quality , uint read_length_1 , uint fragment_length , uint range ) { string seq1 = sequence.substr ( 0 , read_length_1 ) ; string seq2 = sequence.substr ( read_length_1 , sequence.length() - read_length_1 ) ; while ( quality.length() < sequence.length() ) quality += (char) 33+30 ; string qual1 = quality.substr ( 0 , read_length_1 ) ; string qual2 = quality.substr ( read_length_1 , sequence.length() - read_length_1 ) ; pwi_vector res1 , res2 ; get_align_solexa_read ( seq1.c_str() , res1 ) ; get_align_solexa_read ( seq2.c_str() , res2 ) ; if ( res1.size() + res2.size() == 0 ) return 0 ; sort ( res1.begin() , res1.end() , tpi_full_compare ) ; sort ( res2.begin() , res2.end() , tpi_full_compare ) ; int ret = 0 ; ret += paired_read_combine ( res1 , res2 , maxfrag/2 , maxfrag/2 , read_length_1 , seq1 , qual1 , seq2 , qual2 ) ; ret += paired_read_combine ( res2 , res1 , maxfrag/2 , maxfrag/2 , read_length_1 , seq2 , qual2 , seq1 , qual1 ) ; if ( ret == 0 && res1.size() + res2.size() == 1 ) { string seq ; int newpos , chr ; if ( res1.size() ) { chr = res1[0].chromosome ; if ( res1[0].reverse_complement ) { seq = seq2 ; newpos = res1[0].position - fragment_length ; } else { seq = dorc ( seq2 ) ; newpos = (int) res1[0].position + fragment_length ; } } else { chr = res2[0].chromosome ; if ( res2[0].reverse_complement ) { seq = seq1 ; newpos = (int) res2[0].position - fragment_length ; } else { seq = dorc ( seq1 ) ; newpos = res2[0].position + fragment_length ; } } if ( newpos > 0 || newpos + seq.length() < (*chrs)[chr].sequence.length() ) { TLeftovers l ; l.seq = seq ; l.pos = newpos ; l.used = false ; for ( int n = 0 ; n + 16 < seq.length() ; n++ ) { l.keys.push_back ( index->calculate_index2 ( seq.c_str() + n ) ) ; } leftovers.push_back ( l ) ; } } return ret ; } // __________________________________________________________________________________________________________________________________________________________________ MAIN /// Shows possible parameters, and an (optional) error message. void die_with_description ( const char *msg = NULL ) { if ( msg ) cerr << msg << endl ; cerr << "Parameters : \n" ; exit ( 1 ) ; } /// Well, guess what. int main ( int argc , char **argv ) { string genome_file , gff_file , simple_snp_file , fastq_file , chromosome , name , coverage_file ; string bin_prefix , binmask ; uint mspi = 8 ; uint pair_length = 0 ; uint fragment_length = 0 ; uint variance = 0 ; uint index_length_1 = 10 ; uint index_length_2 = 16 ; int noshortcuts = false ; uint from = 0 , to = 0 ; // Parameters static struct option long_options[] = { { "genome" , required_argument , 0 , 'g' } , { "fastq" , optional_argument , 0 , 'q' } , { "snps" , optional_argument , 0 , 's' } , { "gff" , optional_argument , 0 , 'f' } , { "pair" , optional_argument , 0 , 'i' } , { "fragment" , optional_argument , 0 , 't' } , { "variance" , optional_argument , 0 , 'v' } , { "coverage" , optional_argument , 0 , 'e' } , { "mspi" , optional_argument , 0 , 'm' } , { "chr" , optional_argument , 0 , 'c' } , { "from" , optional_argument , 0 , 'r' } , { "to" , optional_argument , 0 , 'o' } , { "name" , optional_argument , 0 , 'n' } , { "bins" , optional_argument , 0 , 'b' } , { "binmask" , optional_argument , 0 , 'z' } , { 0 , 0 , 0 , 0 } } ; int c ; int option_index = 0; while ( -1 != ( c = getopt_long (argc, argv, "g:a::q::s::f::p::b::r::w::n::",long_options, NULL) ) ) { switch ( c ) { case 0 : break ; case 'g' : genome_file = optarg ; break ; case 'q' : fastq_file = optarg ; break ; case 's' : simple_snp_file = optarg ; break ; case 'f' : gff_file = optarg ; break ; case 'c' : chromosome = optarg ; break ; case 'n' : name = optarg ; break ; case 'e' : coverage_file = optarg ; break ; case 'i' : pair_length = atoi ( optarg ) ; break ; case 't' : fragment_length = atoi ( optarg ) ; break ; case 'v' : variance = atoi ( optarg ) ; break ; case 'm' : mspi = atoi ( optarg ) ; break ; case 'r' : from = atoi ( optarg ) ; break ; case 'o' : to = atoi ( optarg ) ; break ; case 'b' : bin_prefix = optarg ; break ; case 'z' : binmask = optarg ; break ; default : die_with_description("Unknown option") ; } } binmask += "0000" ; // Ignore bins if shorter string was supplied if ( genome_file.empty() ) die_with_description("No genome file given!") ; // Need this if ( fastq_file.empty() ) die_with_description("No Solexa file given!") ; // Need one if ( chromosome.empty() ) die_with_description("No chromosome given!") ; // Need one if ( name.empty() ) die_with_description("No name given!") ; // Need one if ( from == 0 ) die_with_description("Needs --from!") ; // Need one if ( to == 0 ) die_with_description("Needs --to!") ; // Need one // Incorporating known SNPs and indexing chromosomes init_iupac () ; TChromosomalIndices index ( index_length_1 , index_length_2 ) ; index.noshortcuts = noshortcuts ; index.max_snps_per_index = mspi ; vector chrs ; load_all_chromosomes ( genome_file , chrs ) ; if ( !gff_file.empty() ) incorporate_all_gff ( gff_file , chrs ) ; if ( !simple_snp_file.empty() ) incorporate_simple_snps ( simple_snp_file , chrs ) ; // Remove all but one chromosome uint a ; for ( a = 0 ; a < chrs.size() ; a++ ) { if ( chrs[a].name == chromosome ) continue ; for ( int b = a ; b+1 < chrs.size() ; b++ ) chrs[b] = chrs[b+1] ; chrs.pop_back() ; a-- ; } chrs[0].sequence = chrs[0].sequence.substr ( from-1 , to-from ) ; maxfrag = chrs[0].sequence.length() ; cout << "Keeping " << chrs[0].sequence.length() << " bases from " << chromosome << endl ; index.run ( &chrs ) ; // Now look at the SOlexa reads and do the thing TRegionScan ca ( chrs , index ) ; if ( !coverage_file.empty() ) ca.coverage = fopen ( coverage_file.c_str() , "w" ) ; ca.init () ; /* if ( !bin_prefix.empty() ) { string ext = ".fasta" ; if ( !fastq_file.empty() ) ext = ".fastq" ; cout << "Using binmask " << binmask << endl ; if ( binmask[0] == '1' ) ca.binfile_no_match = fopen ( string ( bin_prefix + "_no_match" + ext ).c_str() , "w" ) ; if ( binmask[1] == '1' ) ca.binfile_single_match = fopen ( string ( bin_prefix + "_single_match" + ext ).c_str() , "w" ) ; if ( binmask[2] == '1' ) ca.binfile_multi_match = fopen ( string ( bin_prefix + "_multi_match" + ext ).c_str() , "w" ) ; if ( binmask[3] == '1' ) ca.binfile_iupac = fopen ( string ( bin_prefix + "_iupac" + ext ).c_str() , "w" ) ; ca.using_bins = true ; } */ ca.align_solexa_paired ( fastq_file , pair_length , fragment_length , variance ) ; // ca.assemble_leftovers ( 2 ) ; string fasta_output = name + ".fasta" ; fstream filestr; filestr.open ( fasta_output.c_str() , fstream::out ); //| fstream::app); for ( a = 0 ; a < ca.leftovers.size() ; a++ ) { filestr << ">" << a << endl << ca.leftovers[a].seq << endl ; } filestr.close(); ca.dump_coverage() ; // cout << chrs[0].sequence << endl ; } snpomatic-1.0/src/variety.cpp000066400000000000000000000276661260772140700163540ustar00rootroot00000000000000/*! \mainpage SNP-O-MATIC * * \section authors Authors * * - Magnus Manske (mm6@sanger.ac.uk) * * \section intro_sec Introduction * * SNP-O-MATIC can align Solexa reads against a reference sequence. * Known SNPs can be added to the reference sequence as IUPAC bases. * Only perfectly matching Solexa reads, with respect to the IUPAC bases, will be aligned. * Reads can be binned into those that do not match the reference, those that match uniquely, and those that match two or more times. * * \section install_sec Installation * * SNP-O-MATIC comes with a Makefile. The Intel compiler is used by default for performance reasons, but the g++ variant is in the Makefile as well in the form of comments. */ #include "snpomatic.h" // __________________________________________________________________________________________________________________________________________________________________ MAIN /// Shows possible parameters, and an (optional) error message. void die_with_description ( const char *msg = NULL ) { if ( msg ) cerr << msg << endl ; cerr << "Parameters : \n" ; cerr << "--genome=GENOME_FILE\tFASTA file with chromosomes (mandatory)\n" ; cerr << "--fasta=FASTA_FILE\tFASTA file with Solexa reads (mandatory, except when --fastq or --index is used)\n" ; cerr << "--fastq=FASTQ_FILE\tFASTQ file with Solexa reads (mandatory, except when --fasta or --index is used)\n" ; cerr << "--nono=FILENAME\tFile with list of read names (!) to ignore (optional)\n" ; cerr << "--regions=REGION_FILE\tRegion file for finding new SNPs (optional)\n" ; cerr << "--snps=SNP_FILE\t\tSimple SNP file (optional)\n" ; cerr << "--gff=GFF_FILE\t\tGFF file with SNPs (optional)\n" ; cerr << "--uniqueness=OUTPUT_FILE\tOutput a uniqueness data file for the reference; no Solexa reads needed; implies --noshortcuts (optional)\n" ; cerr << "--pileup=OUTPUT_FILE\tOutputs complete pileup into that file (optional)\n" ; cerr << "--cigar=OUTPUT_FILE\tOutputs alignment in CIGAR format (optional)\n" ; cerr << "--gffout=OUTPUT_FILE\tOutputs reads in GFF format (optional)\n" ; // cerr << "--featuretable=OUTPUT_FILE\tOutputs a DDBJ/EMBL/GenBank feature table for matched reads (optional; paired reads only for now)\n" ; cerr << "--bins=FILE_PREFIX\tOutputs no-match, single-match and multi-match Solexa reads into prefixed files (optional)\n" ; cerr << "--binmask=MASK\tMask of 1s and 0s to turn off individual bins. Order: No match, single match, multi-match, IUPAC. Example: 0100 creates only single-match bin. (optional; default:1111)\n" ; cerr << "--pair=NUMBER\t\tFor paired reads, the length of the first part of the read (mandatory for paired reads)\n" ; cerr << "--fragment=NUMBER\tFor paired reads, the average fragment length (mandatory for paired reads)\n" ; cerr << "--variance=NUMBER\tFor paired reads, the variance of the fragment length to either side (optional; default: 1/4 of fragment size)\n" ; cerr << "--wobble=FILENAME\tOutputs a list of possible variations (optional; paired reads only) [UNDER CONSTRUCTION]\n" ; cerr << "--coverage=FILENAME\tOutputs (high depth) coverage data (optional)\n" ; cerr << "--index=FILENAME\tIndex filename (index will be created if it does not exist; optional)\n" ; cerr << "--mspi=NUMBER\t\tMaximum number of SNPs per chromosomal index (optional; default:8)\n" ; cerr << "--noshortcuts\t\tWill process all chrososomal regions, even those with lots'o'repeats (optional; no value)\n" ; cerr << "--snpsonly\t\tOnly lists found SNPs in the pileup (optional; no value)\n" ; cerr << "--fragmentplot=FILENAME\tWrites a plot of fragment size distribution to a file (optional)\n" ; exit ( 0 ) ; } /// Well, guess what. int main ( int argc , char **argv ) { string genome_file , fastq_file ; uint index_length_1 = 4 ; uint index_length_2 = 10 ; int noshortcuts = true ; uint mspi = 8 ; static struct option long_options[] = { { "genome" , required_argument , 0 , 'g' } , { "fastq" , optional_argument , 0 , 'q' } , { 0 , 0 , 0 , 0 } } ; int c ; int option_index = 0; while ( -1 != ( c = getopt_long (argc, argv, "g:a::q::s::f::p::b::r::w::n::",long_options, NULL) ) ) { switch ( c ) { case 0 : break ; case 'g' : genome_file = optarg ; break ; case 'q' : fastq_file = optarg ; break ; } } init_iupac () ; TChromosomalIndices index ( index_length_1 , index_length_2 ) ; index.noshortcuts = noshortcuts ; index.max_snps_per_index = mspi ; vector chrs ; load_all_chromosomes ( genome_file , chrs ) ; index.run ( &chrs ) ; TChromosomeAlign ca ( chrs , index ) ; ca.align_solexa_fastq_variety ( fastq_file ) ; /* string genome_file , gff_file , simple_snp_file , fastq_file , fasta_file , pileup_file , bin_prefix , regions_file ; string cigar_file , wobble_file , nono , fragmentplot , featuretable , gffout , coverage_file , uniqueness ; string index_file ; string binmask ( "1111" ) ; uint mspi = 8 ; uint pair_length = 0 ; uint fragment_length = 0 ; uint index_length_1 = 10 ; uint index_length_2 = 16 ; int noshortcuts = false ; int snpsonly = false ; uint variance = 0 ; // Parameters static struct option long_options[] = { { "genome" , required_argument , 0 , 'g' } , { "fasta" , optional_argument , 0 , 'a' } , { "fastq" , optional_argument , 0 , 'q' } , { "snps" , optional_argument , 0 , 's' } , { "coverage" , optional_argument , 0 , 'e' } , { "gff" , optional_argument , 0 , 'f' } , { "pileup" , optional_argument , 0 , 'p' } , { "bins" , optional_argument , 0 , 'b' } , { "binmask" , optional_argument , 0 , 'z' } , { "index" , optional_argument , 0 , 'x' } , { "nono" , optional_argument , 0 , 'n' } , { "uniqueness" , optional_argument , 0 , '0' } , { "wobble" , optional_argument , 0 , 'w' } , { "pair" , optional_argument , 0 , 'i' } , { "fragment" , optional_argument , 0 , 't' } , { "featuretable" , optional_argument , 0 , 'u' } , { "gffout" , optional_argument , 0 , 'y' } , { "variance" , optional_argument , 0 , 'v' } , { "mspi" , optional_argument , 0 , 'm' } , { "cigar" , optional_argument , 0 , 'c' } , { "fragmentplot" , optional_argument , 0 , 'o' } , { "regions" , optional_argument , 0 , 'r' } , { "noshortcuts" , optional_argument , &noshortcuts , true } , { "snpsonly" , optional_argument , &snpsonly , true } , { 0 , 0 , 0 , 0 } } ; int c ; int option_index = 0; while ( -1 != ( c = getopt_long (argc, argv, "g:a::q::s::f::p::b::r::w::n::",long_options, NULL) ) ) { switch ( c ) { case 0 : break ; case 'g' : genome_file = optarg ; break ; case 'a' : fasta_file = optarg ; break ; case 'q' : fastq_file = optarg ; break ; case 's' : simple_snp_file = optarg ; break ; case 'f' : gff_file = optarg ; break ; case 'e' : coverage_file = optarg ; break ; case 'p' : pileup_file = optarg ; break ; case 'b' : bin_prefix = optarg ; break ; case 'z' : binmask = optarg ; break ; case '0' : uniqueness = optarg ; break ; case 'w' : wobble_file = optarg ; break ; case 'n' : nono = optarg ; break ; case 'x' : index_file = optarg ; break ; case 'i' : pair_length = atoi ( optarg ) ; break ; case 'o' : fragmentplot = optarg ; break ; case 'u' : featuretable = optarg ; break ; case 'y' : gffout = optarg ; break ; case 'v' : variance = atoi ( optarg ) ; break ; case 't' : fragment_length = atoi ( optarg ) ; break ; case 'm' : mspi = atoi ( optarg ) ; break ; case 'c' : cigar_file = optarg ; break ; case 'r' : regions_file = optarg ; break ; default : die_with_description("Unknown option") ; } } binmask += "0000" ; // Ignore bins if shorter string was supplied // Sanity checks if ( genome_file.empty() ) die_with_description("No genome file given!") ; // Need this if ( index_file.empty() ) { if ( fastq_file.empty() && fasta_file.empty() && uniqueness.empty() ) die_with_description("No Solexa file given!") ; // Need one if ( !fastq_file.empty() && !fasta_file.empty() ) die_with_description("Two Solexa files given!") ; // Can't have both } if ( !regions_file.empty() || !uniqueness.empty() ) noshortcuts = true ; // Incorporating known SNPs and indexing chromosomes init_iupac () ; TChromosomalIndices index ( index_length_1 , index_length_2 ) ; index.noshortcuts = noshortcuts ; index.max_snps_per_index = mspi ; index.index_file = index_file ; vector chrs ; load_all_chromosomes ( genome_file , chrs ) ; if ( !gff_file.empty() ) incorporate_all_gff ( gff_file , chrs ) ; if ( !simple_snp_file.empty() ) incorporate_simple_snps ( simple_snp_file , chrs ) ; if ( !regions_file.empty() ) index.run_regions ( &chrs , regions_file ) ; else index.run ( &chrs ) ; if ( !uniqueness.empty() ) { index.uniqueness ( uniqueness ) ; if ( fastq_file.empty() && fasta_file.empty() ) return 0 ; // My work here is done } // Now look at the SOlexa reads and do the thing TChromosomeAlign ca ( chrs , index ) ; if ( !pileup_file.empty() ) ca.pileup = fopen ( pileup_file.c_str() , "w" ) ; if ( !cigar_file.empty() ) ca.cigar = fopen ( cigar_file.c_str() , "w" ) ; if ( !fragmentplot.empty() ) ca.fragmentplot = fopen ( fragmentplot.c_str() , "w" ) ; if ( !gffout.empty() ) ca.gffout = fopen ( gffout.c_str() , "w" ) ; if ( !coverage_file.empty() ) ca.coverage = fopen ( coverage_file.c_str() , "w" ) ; if ( !featuretable.empty() ) { ca.featuretable = fopen ( featuretable.c_str() , "w" ) ; fprintf ( ca.featuretable , "Key\tLocation/Qualifiers\n" ) ; } if ( !bin_prefix.empty() ) { string ext = ".fasta" ; if ( !fastq_file.empty() ) ext = ".fastq" ; cout << "Using binmask " << binmask << endl ; if ( binmask[0] == '1' ) ca.binfile_no_match = fopen ( string ( bin_prefix + "_no_match" + ext ).c_str() , "w" ) ; if ( binmask[1] == '1' ) ca.binfile_single_match = fopen ( string ( bin_prefix + "_single_match" + ext ).c_str() , "w" ) ; if ( binmask[2] == '1' ) ca.binfile_multi_match = fopen ( string ( bin_prefix + "_multi_match" + ext ).c_str() , "w" ) ; if ( binmask[3] == '1' ) ca.binfile_iupac = fopen ( string ( bin_prefix + "_iupac" + ext ).c_str() , "w" ) ; ca.using_bins = true ; } ca.init () ; if ( !nono.empty() ) ca.add_nono_list ( nono ) ; if ( pair_length == 0 ) { if ( !fasta_file.empty() ) ca.align_solexa_fasta ( fasta_file ) ; if ( !fastq_file.empty() ) ca.align_solexa_fastq ( fastq_file ) ; } else { ca.wobble = fopen ( string ( wobble_file ).c_str() , "w" ) ; if ( !fasta_file.empty() ) ca.align_solexa_paired ( fasta_file , pair_length , fragment_length , variance ) ; if ( !fastq_file.empty() ) ca.align_solexa_paired ( fastq_file , pair_length , fragment_length , variance ) ; } ca.show_pileup ( !regions_file.empty() || snpsonly ) ; ca.dump_coverage () ; */ return 0 ; } // Example runs: // rm test ; make clean ; make ; /usr/bin/time ./findknownsnps --genome=/nfs/malaria/data2/3D7/Pf.3D7.v2.1.chromosomes.dna.fa --gff=/nfs/malaria/data2/3D7/known_snps/PfalciparumCombinedSNPs.gff --fasta=/nfs/malaria/data2/3D7/all.solexa.fasta --noshortcuts --bins=test // make clean ; make ; time ./findknownsnps --genome=/nfs/malaria/data2/human_sequence/Homo_sapiens.NCBI36.48.dna.chromosome.22.fa --fasta=/nfs/malaria/data2/3D7/all.solexa.fasta --noshortcuts --bins=human // make clean ; make ; /usr/bin/time ./findknownsnps --genome=/nfs/malaria/data2/3D7/Pf.3D7.v2.1.chromosomes.dna.fa --fasta=/nfs/malaria/data2/3D7/all.solexa.fasta --bins=q2 // make clean ; make ; /usr/bin/time ./findknownsnps --genome=/nfs/malaria/data2/3D7/Pf.3D7.v2.1.chromosomes.dna.fa --fasta=/nfs/malaria/data2/3D7/all.solexa.fasta --gff=/nfs/malaria/data2/3D7/known_snps/PfalciparumCombinedSNPs.gff --cigar=test.cigar // make clean ; make ; /usr/bin/time ./findknownsnps --genome=/nfs/malaria/data2/3D7/Pf.3D7.v2.1.chromosomes.dna.fa --fasta=q1_no_match.fasta --snps=new_snps_60_uniq --bins=q3 // make clean ; make ; /usr/bin/time ./findknownsnps --genome=/nfs/faculty/mm6/3D7_pm.fa --fastq=$MAL_SOLEXA_HOME/run368/368_s_3.fastq --pair=35 --fragment=300 --bins=q3 /* TEST: make clean ; make variety ; ./variety --genome=tools/MAL3.925000-927000.fasta --fastq=tools/585_s_5.bin2_no_match.fastq > v.out */