NxTrim-0.4.3/000077500000000000000000000000001323731670200127435ustar00rootroot00000000000000NxTrim-0.4.3/.gitignore000077500000000000000000000000171323731670200147340ustar00rootroot00000000000000# python *.pyc NxTrim-0.4.3/Changelog000077500000000000000000000043531323731670200145650ustar00rootroot000000000000002018.02.09 *fixed a bug where partial (but perfect match) adapters could be missed when they were right at the end of a read *added some rudimentary unit tests 2018.02.07 *improved a warning message *fixed a command line parsing bug where `-a` would not trigger (long form --aggressive still worked) 2017.10.22 *removed the mergeReads binary (see seqtk for equivalent functionality) *deprecated joinReads functionality 2017.05.27 *will now discard read pairs if one read has length < minoverlap (previously was minlength) 2017.05.26 *added a check to make sure R1/R2 files have same number of lines 2017.04.18 *added --aggressive mode for more thorough adapter search *improved speed of hamming distance calculations 2017.04.17 *added smith-waterman routine for adapter detection - still needs improvement *added a build script for E.coli MG1655 "make ecmg" 2017.04.08 *improved detection of reads where two copies of the Nextera junction adapter are present 2017.03.28 *better error reporting for malformed reads 2016.04.10 *added --stdout-mp and --stdout-un functionality *changed the --norc flag to --rf 2016.03.27 *fixed a bug that occurs when comment field is empty issue #17 2016.02.27 *fixed issue #15 *removed BOOST dependencies 2015.08.13 *makefile/versioning tweaks *added some more examples 2015.05.26 *minor bugfix concerning --justmp and external adapters 2015.05.22 *fixed a problem with header passing when reads were not in Illumina format (eg. 
ENA) *added --ignorePF flag 2015.05.21 *added --stdout flag *program output now prints to stderr rather than stdout 2014.10.17 *added some code to trim _external_ adapters from unknown libraries (should be very rare) *added git hash to the version number 2014.10.15 *v0.2.0 results for paper resubmission *a number of tweaks/fixes to adapter trimming logic to improve accuracy *added a --justmp flag so only mp libraries are produced 2014.08.29 *fixed a bug in --preserve-mp *tweaks to --joinreads (now performing better on 2x251 libraries) 2014.08.28 *fixed a bug in --joinreads 2014.08.13 *renamed example files to match documentation 2014.08.06 *added handling for fastqs with varying read lengths *better I/O checking *better handling of reads with large numbers of Ns 2014.08.05 *initial commit NxTrim-0.4.3/LICENSE.txt000077500000000000000000000024341323731670200145740ustar00rootroot00000000000000Copyright (c) 2014, Illumina, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. NxTrim-0.4.3/Makefile000077500000000000000000000015611323731670200144110ustar00rootroot00000000000000CXX ?= g++ CC ?= gcc CXXFLAGS ?= -O2 LFLAGS = -lz all: nxtrim debug: CXXFLAGS += -Wall -g debug: all GIT_HASH := $(shell git describe --abbrev=4 --always ) VERSION = v0.4.3 GIT_VERSION = ifneq "$(wildcard .git)" "" GIT_VERSION = -$(shell git describe --always) endif version.h: echo '#define VERSION "$(VERSION)$(GIT_VERSION)"' > $@ OBJS=matepair.o fastqlib.o utilityfunc.o .cpp.o: $(CXX) $(CXXFLAGS) -c -o $@ $< nxtrim: nxtrim.cpp $(OBJS) version.h $(CXX) $(CXXFLAGS) nxtrim.cpp $(OBJS) $(LFLAGS) -o $@ matepair.o: matepair.cpp matepair.h fastqlib.h fastqlib.o: fastqlib.cpp fastqlib.h utilityfunc.h utilityfunc.o: utilityfunc.cpp utilityfunc.h test: nxtrim bash -e example/run_test.sh ecmg: nxtrim cd test/;bash -e ecmg.sh clean: rm $(OBJS) nxtrim test version.h rm -rf test/output_dir/ rm test/*bam test/*pe.fastq.gz test/*mp.fastq.gz test/*unknown.fastq.gz NxTrim-0.4.3/README.md000077500000000000000000000115721323731670200142330ustar00rootroot00000000000000nxtrim: Software to remove Nextera Mate Pair junction adapters and categorise reads according to the orientation implied by the adapter location. This software is not commercially supported. Copyright (c) 2018, Illumina, Inc. All rights reserved. This software is provided under the terms and conditions of the BSD 2-Clause License You should have received a copy of the BSD 2-Clause License along with this program. 
If not, see https://github.com/sequencing/licenses/. Some detailed assembly results for Nextera Mate-Pair data are available [here](https://github.com/sequencing/NxTrim/wiki/Bacterial-assemblies-using-Nextera-Mate-pairs) ### Installation ``` git clone https://github.com/sequencing/NxTrim.git cd NxTrim make ./nxtrim ``` ### Usage Trimming and assembly with [Velvet](https://www.ebi.ac.uk/~zerbino/velvet/): ``` nxtrim -1 sample_R1.fastq.gz -2 sample_R2.fastq.gz -O sample velveth output_dir 55 -short -fastq.gz sample.se.fastq.gz -shortPaired2 -fastq.gz sample.pe.fastq.gz -shortPaired3 -fastq.gz sample.mp.fastq.gz -shortPaired4 -fastq.gz sample.unknown.fastq.gz velvetg output_dir -exp_cov auto -cov_cutoff auto -shortMatePaired4 yes ``` the above approach corresponds to the results in the NxTrim publication. Trimming and assembly with [SPAdes](http://bioinf.spbau.ru/spades): ``` nxtrim -1 sample_R1.fastq.gz -2 sample_R2.fastq.gz --stdout | gzip -1 -c > sample.allmp.fastq.gz spades.py -t 4 --hqmp1-12 sample.allmp.fastq.gz -o output_dir ``` We concatenate the unknown/mp libraries in this example for SPAdes. SPAdes versions>3.1.0 seems to perform better without our virtual single/pe libraries. **Note:** We achieved good results using the above commands on the bacterial samples analysed in the NxTrim paper. These had modest coverage (<50X). If you have very high coverage samples, it might be preferable to not use the "unknown" library at all or just treat it as a single-ended library, this will remove the risk of PE contaminants causing problems. Piping trimmed reads directly to an aligner: ``` nxtrim --stdout -1 EcMG1_ATGTCA_L001_R1_001.fastq.gz -2 EcMG1_ATGTCA_L001_R2_001.fastq.gz | bwa mem EcMG.fna -p - > out.sam or nxtrim --stdout-mp -1 EcMG1_ATGTCA_L001_R1_001.fastq.gz -2 EcMG1_ATGTCA_L001_R2_001.fastq.gz | bwa mem EcMG.fna -p - > out.sam ``` The first command pipes both unknown/MP reads to stdout, this is useful if you have a high quality reference to align to. 
The second only prints *known* MP reads, which is useful for scaffolding purposes. ### Output: The default behaviour expects raw fastq files from a Nextera Mate-Pair library kit in Reverse-Forward orientation. Based on the location of the Nextera junction adapter (if detected), nxtrim produces four different "virtual libraries": * mp: read pairs that are large insert-size mate-pairs, both mates will be reverse-complemented by nxtrim (from RF to FR) unless --rf commandline option is used * pe: read pairs that are short insert-size paired-end reads due to the junction adapter occurring early in a read * se: single reads (reads having no R1 or R2 counterpart) * unknown: a library of read-pairs that are mostly large-insert mate-pair, but possibly contain a small proportion of paired end contaminants ### Options: The trimmer will reverse-complement the reads such that the resulting libraries will be in Forward-Reverse (FR) orientation, if you wish to keep your reads as Reverse-Forward then use `--rf` flag. If you wish to generate pure mate-pair libraries (say for scaffolding), you can use the `--justmp` flag. This will only generate the unknown and mp libraries. Reads with a junction adapter occurring < minlength bp before the start will be completely N masked. If you wish to preserve mate-pair libraries whenever possible, the `--preservemp` flag may be useful. This will always keep the mate-pair library *unless* a read generated would be < minlength, in which case it will generate a PE. You can trade specificity/sensitivity of junction adapter detection with the `--similarity` flag (1 - proportion of bp differences allowed for match) and the --minoverlap flag (minimum #bp considered on the ends of reads to match with the Nextera junction adapter). The defaults were well suited to bacteria in our testing. You can turn on a more aggressive search for junction adapters with `--aggressive`. 
[Some notes on how we detect adapters are here](https://github.com/sequencing/NxTrim/blob/master/docs/adapter_matching.md). ### Example data: https://basespace.illumina.com/s/TXv32Ve6wTl9 Free registration required. ### References: http://res.illumina.com/documents/products/technotes/technote_nextera_matepair_data_processing.pdf http://res.illumina.com/documents/products/appnotes/appnote-nextera-mate-pair-bacteria.pdf [O’Connell, Jared, et al. "NxTrim: optimized trimming of Illumina mate pair reads." Bioinformatics 31.12 (2015): 2035-2037.](http://bioinformatics.oxfordjournals.org/content/31/12/2035.abstract) NxTrim-0.4.3/docs/000077500000000000000000000000001323731670200136735ustar00rootroot00000000000000NxTrim-0.4.3/docs/adapter_matching.md000066400000000000000000000030471323731670200175130ustar00rootroot00000000000000## Adapter matching: ### Default behaviour: We implement a very simple approach. Each read is searched for the junction adapter (`CTGTCTCTTATACACATCT`) and its reverse complement (`AGATGTGTATAAGAGACAG`) based on Hamming distance, the entire junction being `CTGTCTCTTATACACATCT`+`AGATGTGTATAAGAGACAG`. If a match to either side of the junction is found then the entire 38bp juncton is clipped. This means the algorithm can tolerate an indel error in one of the sides of the junction, but not both. eg. this will be caught: ``` CTGTCTCT-TA-TACACATCTAGATGTGTATAAGAGACAG ``` but not this: ``` CTGTCTCT-TA-TACACATCTAGATGT-GTATAAGAGACAG ``` ### Aggressive mode Turning on `--aggressive` will seek for adapters more aggressively. Rather than just check for the left/right side of the adapters with Hamming distance, it shreds the 38bp junction adapter into 19-mers and then checks if any of these are present in the reads. If any 19-mer match within the specified similarity is found, it will clip the read appropriately. This approach finds a very modest increase in adapters, but doesn't have a practical impact on assembly quality in my hands. Your mileage may vary. 
Ideally we would use a seed-and-extend approach as implemented in tools such as [Trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic) or semi-global alignment as used in [cutadapt](https://cutadapt.readthedocs.io/en/stable/). The rate of missed adapters with `--aggressive` on was around 1 per 8000 read-pairs in one test on 2x151bp E. coli K-12 MG1655 run (missed adapters found with blast). NxTrim-0.4.3/example/000077500000000000000000000000001323731670200143765ustar00rootroot00000000000000NxTrim-0.4.3/example/ENA_R1.fastq.gz000077500000000000000000000003261323731670200170260ustar00rootroot00000000000000^Ur1.fastqLj0 }W e\&Oc90Iq.h}߯k0!GZ}О\AY,l`9("y˅ Rp9oz N`^yyO(E /M7MM)gHIk}DrW[܈w+1D =]zNxTrim-0.4.3/example/ENA_R2.fastq.gz000077500000000000000000000003021323731670200170210ustar00rootroot00000000000000^Ur2.fastqELA0 ܑ6 iˆҁDzu }^S{9]Gtmq;}BڮwЀ"3*S̨f2U/?}XOFP ͱAZ\j!")˒ a)y/, \nLEx5ZUVi\@ENxTrim-0.4.3/example/EXTERNAL_R1.fastq.gz000066400000000000000000000004351323731670200176430ustar00rootroot00000000000000,ATEXTERNAL_R1.fastq]PAn0 Xu hhrN[FkЕ$Yhw)TeW"}:AO/Rj2EeeF3U 3h Pp]1B$ =|T[ZfzM!*ׁF)Sۓ . 
NyJxi (zQ \Q:Ȧ.*ZцkRԋ[2nz"?IX'iÜ~q=/XZގP<~°uq,g-NxTrim-0.4.3/example/EXTERNAL_R2.fastq.gz000066400000000000000000000004441323731670200176440ustar00rootroot00000000000000u,ATEXTERNAL_R2.fastqEOAr wn7ӒtҞ[*M",oǖ>d)v }_ )NDLܻu&"ōI3[`(,QQ;{ 2P@æ,vq-.ck\IYWM ;||Dc`~;O Ȁ^h9iG6X~'[GZ^Er՛rfFsVQjy2K,URYr."^\{I)b;>3Lݧ/NxTrim-0.4.3/example/FailPF_R1.fastq.gz000066400000000000000000000002511323731670200175160ustar00rootroot00000000000000^ULA0 HZ2ie'n*ql'&tpSSu]Ԋn61D DŽy9f[K YnEh'_dlh?1]jQU 5`e&8S- okNxTrim-0.4.3/example/FailPF_R2.fastq.gz000066400000000000000000000002031323731670200175140ustar00rootroot00000000000000^UO1 0 @ Ě*$S[q㒽8IB=JPQbHZ țyzDy=T4q%G3#?l`Qm|lŕ:V+gWB@/+ ?kNxTrim-0.4.3/example/HFYJ5AFXX.1_5kb_R1.missed.fastq.txt000077500000000000000000000171051323731670200223240ustar00rootroot00000000000000@NB501598:62:HFYJ5AFXX:1:11101:5308:4469 1:N:0:GCCAAT GCCCCAAATGCCCATTCTGCCCCAAATGCCCCAACTGCACACACTTATACAAATCTAGAAGTGTATAACAAACAGGCATTATATCTCCAGATAGCAAGAGAACACCCCTGAACGCCAGTCACGCCAAGAGCTCGTAAGCCCTCATCTGCGGGAAA + AAAAAEEEEEEEE6AEEEEEEE//E/EEA/㗟}O{8y~>7_~VN ΉsD8'1ȉ̹<1Z93JshB>?̀te31]n uT$295IOD;}j4&>Ʌ'S'W(t||;W\2ԓ _l9gOL$ !I.e9 ?0IE2oJUF1zCnq@@ BfbSJZPs>'`0%U2h~, '#/ u0JK< ' ʓR$k0&Lv8slb]ɀ 4mBsk^P+!d+eP|AQ~e)L P2 'V!xŮUi& dƊA4DCt~ C=+ Wpb{aRM&F]֤WnLxkԕaJk3VWb"060vwU[h(D 0' uEgD5n u. :1 @ALe wr9>&i傜DzHZ+I}')ȕB`9``*8S Nzs+„SjJߋe977z29%F58ʕ84qR|HQ,@?uW2/xI@Ko~y j(HRKj(z%uE}|I \CXN,[*+g ts";-+Zq :^K**UR$SkXjTB . 
6Y>T@C7ZY1 KZ7:]!J0/nv)GW."s:\JHhXuԅ (zLRr9LLjk4HAҢ\eAbMT DJFxKP#Cۨ4 oU' PLC!:%QRB!sHph5`'Ѥt'?Bv.PVgj7R7xo-# Iⷅ/h8ZYă1 )N|:Z/f2tU>A QW:UJ0mDGNÏ_(~P1}; u˟ܾ-f9;hF+ ^ vl`RU[sc\l\Ui73Bu ,w’GI}%V)aݭ;kqG]Z5Nv"Ƽ3/"+Z3~;^c ^D`2ÁdK@q[R6wjA?J+,Gɰ%}H:c7+ݝ j%4lXJ4j^_fy [\$* .4%lf~G '*tO蟐n؟T,I(݌e('$vou{`~a;CýrI[.oK- -^R(dJt>L;CvXBe Q;w[ؘ{&y#ذRo~iPS][r-;+zMZ򈨫<}W[\ k(`ޅۑ}_m+ҳ cn)@ylxȲxC`Xوn>?qeGeb]lm+BYUT6[ A?}g{⽲ƒhU4}uє7z//LNxTrim-0.4.3/example/MP_R2.fastq.gz000077500000000000000000000057141323731670200167460ustar00rootroot00000000000000TMP_R2.fastqݜr\ DS"8%seMK.4&LRL7?G1˧~:77_{oo|?]~_C]Hm?&V*sz⥠gR:ynʉ݉|;'fB'[1g ˝9 ܑY7zTo ,sNl&`IbT12ͷsrFt+F/^  B BTY0!UY3(uaZ lk V}y+2O( hOU8J-],񫤾@`DEV;!(qk\* ) Qup y R79X[]+Pb-CT )V)S| PE0$SP+A}KX]aT S՜RFRө ֻ'2Ԃm:'Mce Nps}L͖ r^"jUʲh? Yo"ipQhx* ހ>fuNHOK(qiY.1" :mq' tM $B( )TUտ,Pnp|.ebUR_jA fF+b!RI0"nL"Qݲ7*$6eeCT1\CXN,[P$u,abP׷:j=s SkA@|SaDwFŲ@ PR]Q<"زF!No ht/ DQؠTg WQ!6kw[I}%CΤ oU:u46,RPgVBC)55$O4ZM  iQ.(meqqXb_2 k#^[I}%(B2m!*M5+:x0f{#ũ 2_ZG+UCis+Mt!h̵z)3 F0EAt*fm3׸6?JZfjGMSn& MVwbA O90=44:˔4-}L- % QS/4o u Y `s11b‰ȇ.$Uh[zo$ڽ@cb 0t7 mXKc5Hs$ʊ yOK27:-{dޅ T4ޥQK9R5S2]˹NR_ 2hȳCPlh>pŴyEScv3LyF'1ԉhjwMJT&'2JyTGnAn%R 'nbLJG?'p5bVjKGS _){l)KO־siV8a(CiFLL->q2:i R6wJ:сcp쏢rokN^Hj=o ߾b>h w F1ETBMI҃D~@Sx?j/y2iK0@׺7Aڠklf`樒.LgؙjdhֶYUZOd8nF'pڸ2 ҏ!45<hPrKV'\\nEXL0LBCNefEVgQl\ ǙN@&I6,b"!D=8I}%VZ`&cn7􎹑L ܤۥ0zNJv`/,rdy+@/ȝ:Pl gB( ӝ<)IV&V֦^SRG87JТ  [80u_H֥ $ ֘E|sƤibL:Nh:z`YGK? h_P^ƭV9jF`ƠX-IفMy2[@bedl/ >>kBڲcRR^x>cAW{g-LNxTrim-0.4.3/example/PE_R1.fastq.gz000077500000000000000000000056711323731670200167370ustar00rootroot00000000000000شTPE_R1.fastqr\U y U6f"@lJ,\ĉS8ęszju__C}P/ʉġ+:Mg W9xAc7M8=Q E>@GgdžEZspg0=H%1'+yuƚI UĿc:Ouͪ5טY#C]JUAy+zwTsx$9uVE<1o )CsNhԜGZO@b~Sp+MeivVV"Kb?ND V XYY:1%P uOc%x (-h#NP/@IZ\ _@QɳSCs!fD^B<#b= u P˓}m #Z|6 iE#^ژ`F) u0B*yE-:W*oUT X'Z~5 .B˔y扦[w!O F: 4 C>!~:3!.4$7쬡#&}ĥ1I \E9It[>9d;ԫVqeUNڙ-GiDJ|؆[PQ L80r@)n}}COtdT@Id`8I9iQ%RW0d8` )ؑO:Uԅx/NQKzfPww"r,k4%,[_ըjYlF&uVxnGx=Uś] G&vݎ D&іl9_m SN 5bOF/HTּRexP0>3k!֐9f`pP@{6.kN 5[. 
TͰ[|r)b~!~G ⋼f7\ JeEKi- -D MyHg➣_Fa{碂Ŕڬى.,$ =I1 tTZ 0m`:*$:w0)p2/'!9 L 0wåM23;s6C?[i c! fΘධMɳTXTH0f~dYꚛmL1ʲtV5Hfn뮍}MaT.o&*C0'I?O3vf(nvV^=?K@hld/L\BaBXV=*C' A۵vے`q=p19Sfv.WlO5%")U9杶%)CRsTCoGsD%L}\n<|1u-|ne()|bi|xaF`\E)V=2v^޹Iǩ½FD פuY\I@Ђ[H'z]`܉pe^g=$2N]s571Ƅ4dc:qh_xEwBz D pEL1}1vBz@ЬFOZKߔuS~A? ʉͶQ/;_FŤW/҆uij˧&ňmx gnf.7 o(n㇑5ڎGzvYFK|K>Q<,.}[eV#YoC7MNxTrim-0.4.3/example/PE_R2.fastq.gz000077500000000000000000000057001323731670200167310ustar00rootroot00000000000000شTPE_R2.fastqr]E y UvpWbWzgᨿup89>hZnM~o?p{ط?~x{||p{O߼=nY;32Qy]ʜ;uym]7U| eׇЮ㺪uw}w~m/aozy^o^>y ~bqit>]'0'Y`UzhL W*r":Њ,?POd֕HO)'2LDuܩ|2;xy2u^5#gR1YҢiQ i͵RV s'B^dX_7]74OI*B)♡V[9ZN级<(6O& +@ *t[-Z _,D{|ubi=/HɢN*iTZLoUPU TGZ}p["R; {DB<#z DXE.IG6Pa#ڐ"ĥn:RBAwNR^qc9!!Vb \%zɉi͜b jkYP#,hzڀw.2|̬'JQ# (OF&z`#)ց~0MV=K}f_u'HH;h)F6Xpem=UJK9K)+"%̧6*KHnUI98+$qЧ,6`cZ/'gfu*j$7AAKa;ЪZ `Iij(Yid"ʘRTHx%,[QD >sKٌLȢ)xډ|i~UGմN$д%26X3=DGԖP j.^VZ#lP*aEL4CGPAٞJAӴ&(T #[x= ],tZ d{!ŗM -aUdm S ִ#2/dY=`~,!ޫ۟)n#^ 5pqkMuᕟI\p߲y)䴜DjUo1F+If(O <"ֶl#E!c"4Ÿ%$]J[ZD6mfp!љXٴ0CE#o"!SXF M$=pv3̎kTk@ݣT?1V @ŗˣ1AocN#h{ T m عBh9Ʈ|Z+S23aؽR +2cc}3}u381פ C^/Vc˚ M= <,&._-6),B P;U.ziw5[PҴ(Lȥ, C]YJwJZ3趏 '\-Xiۆ {rr SRZjh!Ė " C_JksMA8 G cA1\$3c3a[Ϡ82ԌM+[/* ljI]xRS:۷e=TmC?,Ӻ@0j¹KoTr5B/E00*(,zcpi3^֘FYm[F>u8Jk`kV̙цXp[8|lcxOa*=5+|$[Yd )gӇ<`;zʩ;X8#7`>;Kr&˷:\b'G/IӸ3(Sc!N>4Cxv :ÇqGʣ|{ԝG{l t@S5cG$g~+eF3/'w,i0QagjZDaH^͔ ]1՞HOy=D+P\I[ϒ)!TMۑޥKq՜ ~)1 g!gqUٺ¼伜:[t1J |OrK^V(C=][-OM*JӉR8 gI^s,to̙gy;Z(P LQ bn"f'̓k]RъbNNZ3b덡PIDV,/ фGk0MKAgB\ Wʷ1]v~ UMjSH,@i"W'Sd+O@6D8b X V-PM1zgeK(h!Jza#}./X<38?;AK8OW6ϝCMUʥqDr狟 DreAxb.bH^p%t4U)XY>,<Q>oV@Ը2R=Q2fvTNq5b^YoQԋ%"{10+'Z\t9@"Dr,SW'7m=t`r{$3V.O]Cȉ&֑ɖK<]ihMA,3OP 6lV CvM 7Ӯ@ `ŕG:0TP\A$!'H<ƎA9ƵcC d8iM2a#raH%B"is“})0c6"i֭CXkl1ϕ͐Ψ+ދPJˢrf5!힒Zf e #1bjS=\|;H/A,(3>\@jL8tCA꣪<,š6P#{Ҥ|hJVN!dvz~E7bBb~J^SqvyO:;5dkUh^>\&^ZJ ڈwwL# 0OG0 Z_nvB,&ǡ'LNxTrim-0.4.3/example/UNKNOWN_R2.fastq.gz000077500000000000000000000034641323731670200175710ustar00rootroot00000000000000!TUNKNOWN_R2.fastqnG 
z܃{O}O|g8>,E$hwW[)j_oo~_?noooo_޿{w[U/+cՊ<^ΪʌqrcqeU}04{DOr2z*ޥ8&o}LoܧCuPǕ1>y_ɗaC}`Vl@q*(HҠcq)hq WKc=| o32xP_Yy&k(FdJxظÚ :ԴBKZ3^ٌi26n7W*Y(75e>V Ih7q)E1| E;laWWZh)x"P3,k#彜V/UN^n;ex;g%ͬqq.[b$#n7WM"D m7g]/JBtr^/tykxgro43q~R#نaP爇ԧaF=oFkJ[1P66̈́\R=LX/v=49662;#LJ=Bɔ"uPwD*=v^wC5koEbg EQ UoH 9J,-t0'@C1ɢM1jJkT{Φh?dOJS1ϹhC@t& Yn/륮5D~5-B 1po>@]s,IKP_fn|%xM㲉;1[E$paN;8Jo7Po%{t>-ٻ2ɵf*9j|8dbl>{gi:h7uy PPoȹ޸&fב<[%+OVL>[[I 1\u9%̆?B֓.LNxTrim-0.4.3/example/double_adapter_r1.fq000066400000000000000000000005541323731670200203060ustar00rootroot00000000000000@M903:194:000000000-A5FBD:1:1101:15346:24501 1:N:0:ATGTCA ATAGCTCATGCTGCTGATGAAGACCTGTCTCTTATACACATCTAGATGTGTATAAGAGACAGAGATGTGTATAAGAGACAGTATCCGTAGCGTGCGTGACGGGGATATTGGCGCGGTATTTGGCATTGGTTTTCCGCCATTTCTCGGTGGA + BBBBBFFFFFFFCGGGGGGFGGHHHHHHHHHHHHHHHHHHHHHGHGHH5GHHHHHHHHHHHHHHHHHBFGFHHHHHHHGHHBFBGFEFGGECEFGGFDGGGGGGGCHHHGHHCGGGEEFFHHGHHHHHH3GHEHHDGGGGEGHGHGG/EGG NxTrim-0.4.3/example/double_adapter_r2.fq000066400000000000000000000005541323731670200203070ustar00rootroot00000000000000@M903:194:000000000-A5FBD:1:1101:15346:24501 2:N:0:ATGTCA GCGTAAGTATAGTCAATATGCGTTTACATTTTGACCAACACTCCGCCATTCAGCGCGGATTCATATAGCTTTGACCTTCTTATTGCAGGTCAGTTGCAGTTGTTTTCCAAAAACTTTCCCCACGCGCGCCCATCTCGACCAAACGCTCGCA + AA?>1AFDFFFF3BGGGGFGGG1EAGHHGFFHFFDFHHGGGGFGGGAEABDG2FEGCAEEGHHFHHHHHBHFBGHBHFBFHHHGFHGCGFGCG@FGHHGEGHHHHHFGHHGHFGH1F1GGHGE?ECGCGGCFHHHBHHG/AFFGCC?ACC@ NxTrim-0.4.3/example/example11_R1.fastq.gz000066400000000000000000000003331323731670200202130ustar00rootroot00000000000000uUexample11_R1.fastqUNA0 \'jjUDh"ebkvN-soΔhBdB'k2 &lbf"&*PM7 4 Ҋ=m:0P*Y{Us;mly(!Qʁpb5FTK2\~k: ZGRSh='%]<fntNxTrim-0.4.3/example/example11_R2.fastq.gz000066400000000000000000000003401323731670200202120ustar00rootroot00000000000000Uexample11_R2.fastqMMA0 \RvH  iq6qtϴm/}1GSZSSks SjX`PTNUDv A5;[3ZK"ASs:\kg(_"Fٍ̭ P[+μlpDOC._srNxTrim-0.4.3/example/iss34.r1.fq.gz000066400000000000000000000016041323731670200166340ustar00rootroot00000000000000PXiss34.r1.fqv ~@%,sXd!t_΀,9{Xg 7 O/ߖa\kS"?18e\/cx 
C()ԓ$_*QGcRdDd+U}zUN_OrιPɒ{aEb`=P%r֢g 4 now w`:'w}l{![o*٭/yX4M9JJO`{'~ZD#{J̉&SRRshGRLk,z˾ËvW{]F-tZ וi<:lOp)aZNŪɱPU'D_@a6j t@$sD2}VmiiU1=Z;˕k4)Ezt0lm)Nй?뼾8q[PH5E4jDs`*[d/\l@̩ËTy;PǸekQm״ 1LsrTEO 8NpMe 3]RnuGv.oS5 z8ir(KLb&42a]؅P+x2-EcfQ7\YuU=:?bv- dnl~tΠggPam3Q=PeI˴l|vS}p@&AQC">.]J_'kp184a5A~ta1eDf96bu NxTrim-0.4.3/example/issue12_R1.fastq.gz000066400000000000000000000007611323731670200177160ustar00rootroot00000000000000JUissue12_R1.fastqr0 f7 (%꓁ngr`@$bctЧ95=^SH!4!!ϩˏ~&Ń)+ʹ{e;ʾY\JdO[PMPRp%RMlO2qt9!(AXRўXAۥ%m T]*P{&o7Y*ȶ7⋲]FarlOWln̷s sZ$KKia[b/t|KBt5;|ΞV &ŧcظaa ,Tla(-ZЋd5Y&9kTmWikl[@8M3 KWOi71E郎\zTYxC,Q(yu'KaKzS ܲJNxTrim-0.4.3/example/issue12_R2.fastq.gz000066400000000000000000000006561323731670200177220ustar00rootroot00000000000000\Uissue12_R2.fastqRN@ +8"l,Sm*hpڍcؖg珶!خz!І؅00vS?cI"'bIL_ϟ?>˯ۗO_y 3"x|z?UïkXW|z:_yyvk{~>x}_׿o;~O~~_?_~q~ODW\+O ^_x}b0;Oe&65Y[9&y>OSS|98wA%fxPII'FsO'bWN"y =ubjL1A9XWrA,0&:POBN{xQO29ztwOD`jGrQH"МI+C}jtu5&lBL zz",I)'V1v͋+Uh+Uinst6Ms20'uMv=^T0:Z'7| n=2lVWtOI'0CVXɻ-;&<)H0$ n̛EYE}CK@7\3' S&;k{Hؔ[.ڙh?MdM; t LqmNfLTPLqf^7oQ{ijZOɤaY{tion 0bZb&G9 pЃ6u88Դskc(UJ?-7u]ohzg$5Zp$O5p,9W"~^e &ǝ 5(q j ]֍WgWj ߙoߪye8L-ty. `̨2<#3!:%E A g gn,^9b:i̇zcL4`Ӹ!a'7}k$ߑJ+i-l UoCԛ,9|Aե) \kC- Atm$-E bA1at9) ?Q[I}eiۘ4oQ׹=]ٛSI}eJ͐Q (CoO^Ǵtv#EQ#(MhS?r4/7O^Z@Wd.MZYN9BqKsZ@)Ż0 u鱈>|/52t MXkL0̓W[r?6Qagԡ71@01Cg{[%^  $N^rV:E}R1rs4B>p0sV p\tJ.ʵJfno -G@ Jtb?M;C*[?PQ(fґ-V < c彂eفscc5F`КhG {<݅IZ0f}Vn[˪SgZ`ş{63 j|ur4!U//Tb2} th!ei4"DL;mE[ԁtE&bzMCi;~-[oMdag;5VDZh.ZAvk?$$7|k-E64i:S5 T31X=I1;.>,W_앂)0HJe^BCs3bUWW W-3.TU6"ȅs-!7M'O(M-}T=B_EP 3*cm D>i̔^N , G3%pC6@ouhb٭؄ $݃`fI:F\6I0Noj5bvqjȈH\I܆Brf*WZ݂ܳ}LxJj$BMCT7}.z-JʊO^Cz&Hqr*~ ~vgVS2G8p?]d }Jv:lPI4D .p!#Nk@S'$RUrNfxpj~W[' 4~gV cAvgs[ʰ(&m.ZuJK\iZ0KsŘsrrmiwM e(5YGnl3ԓ&鷶@ˇ;ФG|ݥ0. rզ. 
\YZW^o`Ҏ|vI(SMk8g"6)nps]"bl`U+|'"Zg5cS3X|JlJ^,v^HU=jFZ؅R]Ϯ2s5˕)/j(۲Ϭ"DƹmGoLSNс~]=;ffi)hSa$U#g+o2{SFMμ v|օV%.R܅M F Vdj;kuԂuƺXJ0b0(Pn9}z /gYmšT8zBWH%{Tx[ҵk ֖;l2G{VfuR}Uk-ϑ>\ ύn)nC(dq !f7+κg(ѥS ~C .VĈ,X3}[G:tG" i[2@[{i.zߒ<\hp`{z<ioIA9ٚn51Vhňy8 dZAZBKNPE4xz$Dh#2M ]A$+YP~FeMѻ_=r!yZKi\KؕKҊP5)lP XQ.v$|kQҹƮC}1(:œY:7m1D4j^;4(yu;xqEO 8"JV-Jej!Vb뮑Aㄘ"[wnyur5fL,+I)Dn<]4խWW @,QD SMC-]%X2Z#ij]1H0OڕIsq') Fi e$2~Lh}ns/51ԥf)Œ&vЏVAꅔ!= %^z )*! 1GcUN+DV Z :Ӑ$A~@ 27,L _|t=@5=w 71?6ZՂ@)0KUS Z,"e᎝KE6] ӯNYZi,Zj1NYPMgOn ±0~+7@]c{w V04tC=򃎖YP` c?Q1 N/|jw@Jx;iܟb]uw4RuYHΏ4IU\6&V4EAycu䳻^oѮlBp C7JOepzNײ{tf@/aб%sۛv,̬1;3Geqh@a8CkW:J]*e@ޟvgvǼqCh)[wV^<);qT[/?K܆†D!Vo ԭ]kmK_^̝x5ڭvaHhw6^REr B&p[#rg5ތ` s"&/5>JsD ʪE٭gʥne.f;W~C^t3(x_ibE<x dd^'W`KV)$#+g΄5҅7hQESpk?p98r'}eVᲠ^g<7sqc&8t hie:q,0z Dt0Ә Bri_j_tՁRY ϝ+qm&:ꔡ?`o3z5W/2^lA|W.|ZiS Ԇ;Kng>]7Z#xpp%Zs@(Ҟ8Qob!V\_ǖ' R~eYZA$vj0)O-f:;Mur,m>#Y܉WLK}c둏&tu:+&!9-Sv*Ia-5l@ϠlI(]zcDͮ)P.v\Fd8R|7N,KmGCz1`Zi4* p) 4%eɗNr|xc B!$Fb=B*μ@1FڽC?j I2;z,ѵ\;ï4#.=pcRrX fkBsg-\x1-py/8.{SZ?SX<ۤ#fō0e Gt݊ zc<4 ]WHk/ uڪ`Pn L0k-ѣ ֮rA,1 N &m/0JHʮ<c)7V m*P;..pUԵ:)7 D(2<|r*mSbUkbIaR]k7R/Z]T+ku p(ErEz46Ӑ~ M`3 0Ƒ[XaL{-4]X[-.Qjiej1)L7%nB{joZCR\W+`XȷL8H0R*o'$ ݺg `[X#V#8 DhOR&,b7goI,!$QX%gk_sFCNsѻh0LRkL8mtNf26w; pU1҃՜`&DZ]qT00LJV'w@2>[΢*hE򐦛YY].VQp~эف$]-jZae|Nh#eoӒ .Ԑ 0獡ݍnt;KgRs4a|sGvCG|s+!zgVƽkC*UH$S hJ.v '-M5 wn– Vջ)"ʿpõ) ؟vSC'ȳ.'wW^CC,`}1ԋKcw r7 [t]b)fҠn͖ CD~WفYbqhJ5k6!W/Pxe"ƶſ^zwT7[d^t:UMD!j:{ n r L>y2ԩ䟻AΫx.W|\[R˻FnIGŊz5 =!, ѻg}3. 
OR^.AYyĉ10vQ֌_% EÝ#9ޗf=r)Z&w0BL] R]ځùǍ4Ob3#SoE,׳msgm5WMQtx)%=E,[b&G 5{AOݳ3뜔E8w.ȍf|Uqe>pڴ0u )N/`Bi;Q-ۤ|܉>;0`)!mj d ؞vge^Ad{Y uW2խ apLȮ9dV P=.vl%+8ŕTADbh XjT]#:fܝY 7Nbx۱ fFnW%hpD]k̆¹ٓ.dOk10VPʡW!\l˩[aǟ')+z e, +M֍aX cҲHpY6_tK.|dz ;SÖXt*Ly5:*5"^!Ի:"(mq8qMre=R> Mn4=P)4!x ?e\w.+HoN Vfi(4v|b⬤6Fsk۟g:k4Пqgl)ILwlhI5~>i0#vS7)\cB9 i!Qw:`Eor {3$ C&=s4Оvh }JE4RQmn  ʵ[I[# y U'$+w S4ieJH[呹sfcN%O郁t;0QY ZkJ{h@DfG rU+ CyC8> < EkZZWwNJڛnNK&Eԫ'1x\!]Ë$f'S7`^8B!IjYTm43Q>H~uyƵx7=E*pK2л*P.Ccˠ_4hF=h %5FƉ<@£߮ZF-)_/z?yx$Ȇ0ND6 3k֒!߯ S^wTV)q%LxA~WLBtkfqk*ŒdOu uACiѲc|&zpwFHM5+NQZ Na=b# Yx6zpAW;AVW6]pv 4vn NxTrim-0.4.3/example/sample_R2.fastq.gz000077500000000000000000000162451323731670200177140ustar00rootroot00000000000000GwT˒\WnE;!0l'-΃pRbDYY__>)>_˧_˯?oO?)+OwugtGVEfTT]ٮ~=Ϩףu>>~S~}b|}Vkσ~zS^WO~}YWOïz~wAO'œo_o^?8u|NpN@j|]W^Qktx=xz~'Kq9Ͽ|.n;/|\sn _?{ls؜L4'AH''5?;P#'w'sT$OMF'4H{j4_{jjqƗI}e91:?ۤ'9pbST^{89 }=~ .yԫB}jrvbFwWSO\S6kh[ k7y7dZ.ϸRAB\:zsTI3/e?=-I7&S;fB91Ħpjʔ::Z9&JATn2lV. t sՔ 斠pDbEtNqכS7/EHN4id&s87͍Ntcb5S&;k{Hm\4絓|4s6<-A;4iS` )̒sh]r_D8 ͗%ʌ;ܟKsw&mqFôIqZ9HW4~Zny.$x*D:23\6H^MXLS7n^Lpl3MǴn 5@S4|&@4'-8>4~k23 2[&vJW9-l '&iw<+3eQ1/4 % 7#-^Nvc' _xkN783 瓩G][TZR_grѓ6ޢ"L33p A*5y"sYn˴x;̃58ɍN0EQC5 wP4sv-:UK;,GI}e)>aӓu[scCykU7G 0j"%e)ӰK(6 kE,5AZ R}dr-lo&Z:B| =t*zsٍE}F~Fo:O^j8+n`@.Թ d]*2cP"SkR) {s |;Ո_zUPڃJXk 6x6s^9Iq;o ՊL?IZ+Tzr\A'P*\XYugQOӺ=fCIB;Á3(t3M]iej1ۤ y-}'!3)}BpQayR@")SB4I}eE55kKsw9K:XfJZQ" T/I(bQ!6 `>%r0@VWTH ~dWpfTEz)̹.3H֥HX|Řmy丛>dRnj3 u@H#Raw桖?Ii&̐)I'M`93SP5ʈCb k:X,&pEa|CUJįhp6?N+C͝?|^!.Rv*@$_MI<kaZ&iBOgrc15JWl4532(fS*CN!h Jʇ@P0 KK.5F8dUAK8i):b)ZJ]Ђf`K8.S"5=PU=?!gS' E:M\ H@Vt,N-3P_6"+sW˴dq%|R:ؚ즬Vw@5# UIlnb|#LmaˆIcJWZcjQCz!$Δ ذPQĎ;Cۃ_72Z&^cʄ3p*늶(^vSV`K /i&GHL;PB1zVV`8Zx54CЫ,oV&AX$ F#y;'sM`7Ca^MyCa}.E,CK`t2>Y,l>ڠ,' 7|}SH1 u>$"VAdYq?,դ3鲃v?DZPkwf>c,ȮsK?̤}v2m+mVsxi>-F"jqTn"+DeZ,˭,êohP#v7Ln|NnN"]͊QpQm›>v]IsŘsHrZr.5Y =,{]PזRtsIe̦WbkJ冸^!܆vHU=j V+9ۑn#)x58 > FJJWkD *-ag&)@zG?,2](.MKA{JȨ͠))Iv? 
PgW%QLp"l gWZ 6b3W-,3 rz\M-PQ}o njJvޙWhK K1x9K2{s8nb օ,(CC ,@rn*Y%Z-A۽j\D:BӻnGª5pƂ22@)馸- WӚTzav)uJ8Q#J 6NlP=#ʙPcMpaѮELvEn @Ӷ&J5Z#7t41#MFvY/ںLmX3%[ho^_jX.=e^Ʋ̟8 dZAZ%pDJUU&L~Bq=\Hn E!\%y)bF4t٢mT,xܥ&N[k.Ԯ:PyzF'k͘YyX0R:`+kIsT)-!R2]{3BߴT:$v :h(.a%&eIA1R-ݻ&T2M wP.9uo :R: aS*= ր`#"rZ0RwjБ՜.EF\jdz$ θaV wucPϛB~ѲM(mߠV轊 a] Iv tIO,?^Z ~7[ 1H5xƢZnU-Vѻ?47ZI>k9(@Apa&u]ufݣ3"x XގX-ɝlJ r άxD::s$ DBp|c?@ݎ鑹fZWJ )T[Kk]0&1S tJt+,I7K@J$?DY=)"P"rb}œ]zK-_bP8T:4o}N]=nZkKW(7ݫav fr lGŊx: @ȼ3y gxaWrj"MBq}\:.6\9Ɲ}ND6j<댧&_:+yDTX_Y߃4fGAh\:-=+gÙ95҅7Mt}28Ќg}@Y\);QDj&ox\Jn \Sz1*q-ԡԉ zeVߗpŀ׏.z D30Ә Bf:wDS.wK:܆dz0d3ۆX7LU<[/-A} dqsIJb \> tvɕev6N9=}Ezvtp(]2 O N{HN8N%I8,ƝM2 %hAߡ~_:S]/xhbGR]( a5,:3Z.WCz,q\0%GL:"_=ϥR1VXFۗJaTMϝjS5>[Qs hƣ)K>桙C(-$C"R.1;?-. rIq2UR8L.R7Jw"t,;?-ԭhu~ L]r(4jlubj_HogBPn)wpwO (XZIG 34X4.運z1Mh5Ik kf"ݡRjZ3֌0{]Aa^Ĵ4 GNRu"S/z0o5ӽCB9 gT -+(]$})5@ [ ~oD;j-;Z[wVw7e5/JeYaЬre2P}2b c\\PwMPPώRQlB_9&s_U?XFY,~YX* ;P>хQVp~ HCji.)l@,U>w 3Rqn˜ e`ә?yj/VJ2 nH6PaS} @il 4n˱|@wߡAv0͚M)6œx]D^}Dƹwߡ T MXUTUl@ ke]\ڳ5j -tOvO m+!Վ2@c mn4F(@XS+vckwXTi{²Λ3wh#YeЮ >ꔌ bN Y޶LK8kSx3 [;JKdPPÃ8Z\y X10 !GV~]fvh6-;?F <[k摁LDebo:0wsRFXcug([;C~ҐN H4&n(r'1 ԫ5`JԞ$e*T7:i;z'9U@s%攇\Xbݨh\oZUO'O(Ÿs0z`]R.R%cC*k N)#\h5g٪ޛ.W*oÛCnt,Q8vPi nVtph:Ng`;?-Աz=7?I+bGp>&K 1kc%\OtC0=;P+ NxTrim-0.4.3/fastqlib.cpp000077500000000000000000000132131323731670200152570ustar00rootroot00000000000000#include "fastqlib.h" fqread::fqread(string header,string dna,string line3,string qual) { set( header, dna, line3, qual) ; } int fqread::set(string header,string dna,string line3,string qual) { description=true; h=header; s=dna; l3=line3; q=qual; l=s.size(); istringstream iss; iss.str(header); string tmp; iss >> tmp; iss >> tmp; if(!iss) { filtered=false; description=false; } else { if(tmp.substr(1,3)==":Y:") filtered=true; else filtered=false; } if(dna.size()!=qual.size()) { cerr << header << endl; cerr << dna << endl; cerr << line3 << endl; cerr << qual << endl; die("base length was not equal to qual length in read"); } return(0); } fqread::fqread() { 
l=0; } int fqread::clear() { l=0; h=""; s.clear(); q.clear(); l3=""; return(0); } fqread::fqread(int L) { l=L; h=""; s.resize(L); q.resize(L); l3=""; } readPair::readPair() { } readPair::readPair(fqread read1,fqread read2) { r1=read1; r2=read2; filtered = read1.filtered || read2.filtered; } int readPair::rc() { r1 = r1.rc(); r2 = r2.rc(); return(0); } fqread fqread::rc() { fqread ret(l); ret.h=h; ret.l3=l3; for(int i=0;ia); int count = 0; for(int i=a;i=0&&b<=l); string new_s = s; string new_q = q; for(int i=a;i0) { cout << "@"<0) { if(_stdout) { read.print(); } else { bool write_ok=true; write_ok = write_ok && gzwrite(fp,"@",1)!=0; write_ok = write_ok && gzwrite(fp,(char *)read.h.c_str(),read.h.size())!=0; write_ok = write_ok && gzwrite(fp,"\n",1)!=0; write_ok = write_ok && gzwrite(fp,(char *)read.s.c_str(),read.s.size())!=0; write_ok = write_ok && gzwrite(fp,"\n",1)!=0; write_ok = write_ok && gzwrite(fp,(char *)read.l3.c_str(),read.l3.size())!=0; write_ok = write_ok && gzwrite(fp,"\n",1)!=0; write_ok = write_ok && gzwrite(fp,(char *)read.q.c_str(),read.q.size())!=0; write_ok = write_ok && gzwrite(fp,"\n",1)!=0; if(!write_ok) { die("problem writing output"); } } return(1); } else return(0); } int fastqWriter::write(readPair & p) { if(p.r1.l>0 && p.r2.l>0) { write(p.r1); write(p.r2); return(1); } else return(0); } int fastqReader::next(fqread & r) { if(kseq_read(seq)<0) return(0); if(seq->comment.s==NULL) r.set((string)seq->name.s,(string)seq->seq.s,"+",(string)seq->qual.s); else r.set((string)seq->name.s+" "+(string)seq->comment.s,(string)seq->seq.s,"+",(string)seq->qual.s); if(!warned&&!r.description&&fp) { cerr << "WARNING: no description found in read header. Assuming read passed passed chastity/purity filters." 
<< endl; warned=true; } return(1); } pairReader::pairReader(string fname1,string fname2) { f1 = new fastqReader(fname1); f2 = new fastqReader(fname2); } void readPair::print() { r1.print(); r2.print(); } int pairReader::next(readPair & p) { bool ret1 = f1->next(p.r1); bool ret2 = f2->next(p.r2); if( (!ret1 && ret2) || (ret1 && !ret2) ) { die("R1/R2 files are out of sync. Check your input."); } if(ret1&&ret2) { p.filtered = p.r1.filtered || p.r2.filtered; } return(ret1&&ret2); } pairWriter::pairWriter() { } pairWriter::pairWriter(string fname) { open(fname); } int pairWriter::open(string fname) { outfile.open(fname); separate=false; return(0); } pairWriter::pairWriter(string fname1,string fname2) { open(fname1,fname2); } int pairWriter::open(string fname1,string fname2) { outfile1.open(fname1); outfile2.open(fname2); separate=true; return(0); } int pairWriter::write(readPair & p) { if(p.r1.l>0 && p.r2.l>0) { if(separate) { outfile1.write(p.r1); outfile2.write(p.r2); } else { outfile.write(p); } return(1); } else return(0); } NxTrim-0.4.3/fastqlib.h000077500000000000000000000035301323731670200147250ustar00rootroot00000000000000#pragma once #include "utilityfunc.h" #include #include #include "kseq.h" KSEQ_INIT(gzFile, gzread) using namespace std; class fqread { public: fqread(); fqread(int L); fqread(string header,string dna,string line3,string qual); int set(string header,string dna,string line3,string qual); int l; bool filtered,description;//if filtered is true, it failed QC string h,s,l3,q; int notN(); int notN(int a,int b);//tells us how many non-missing bases are in this window. 
fqread mask(int a,int b);//N masks the region a,b fqread mask(); fqread window(int a,int b); fqread window(int a); fqread rc(); void print(); int clear(); }; class readPair { public: readPair(); readPair(fqread read1,fqread read2); int rc(); void print(); fqread r1,r2; int l; bool filtered; }; class fastqReader { public: fastqReader(string fname); ~fastqReader(); int next(fqread & r); private: bool warned; gzFile fp; kseq_t *seq; }; class fastqWriter { public: fastqWriter(); ~fastqWriter(); fastqWriter(string fname); int open(string fname); int write(fqread & read); int write(readPair & read); private: gzFile fp; bool _stdout;//true if writing to stdout }; class pairReader { public: pairReader(string fname1,string fname2); int next(readPair & r); int print(); bool getPair(readPair & p); private: fastqReader *f1,*f2; }; class pairWriter { public: pairWriter(); //interleaved pairWriter(string fname); int open(string fname); //separate files pairWriter(string fname1,string fname2); int open(string fname1,string fname2); int write(readPair & read); bool separate; private: fastqWriter outfile;//interleaved fastqWriter outfile1,outfile2;//separate files }; NxTrim-0.4.3/googletest.make000066400000000000000000000053471323731670200157670ustar00rootroot00000000000000# A sample Makefile for building Google Test and using it in user # tests. Please tweak it to suit your environment and project. You # may want to move it to your project's root directory. # # SYNOPSIS: # # make [all] - makes everything. # make TARGET - makes the given target. # make clean - removes all files generated by make. # Please tweak the following variable definitions as needed by your # project, except GTEST_HEADERS, which you can use in your own targets # but shouldn't modify. # Points to the root of Google Test, relative to where this file is. # Remember to tweak this if you move this file. GTEST_DIR = ./googletest/ # Where to find user code. 
USER_DIR = ./googletest/samples/

# Flags passed to the preprocessor.
# Set Google Test's header directory as a system directory, such that
# the compiler doesn't generate warnings in Google Test headers.
CPPFLAGS += -isystem $(GTEST_DIR)/include

# Flags passed to the C++ compiler.
# Optimised variant, kept for reference:
#CXXFLAGS += -std=c++11 -O2 -pthread -lz -lm -mpopcnt
# Debug variant (active):
CXXFLAGS += -std=c++11 -g -Wall -pthread -lz -lm -mpopcnt

# All tests produced by this Makefile.  Remember to add new tests you
# created to the list.  Every name listed here must have a matching
# rule in this file, because both `all` and `clean` expand $(TESTS).
TESTS = test_nxtrim

# All Google Test headers.  Usually you shouldn't change this
# definition.
GTEST_HEADERS = $(GTEST_DIR)/include/gtest/*.h \
                $(GTEST_DIR)/include/gtest/internal/*.h

# House-keeping build targets.
# `all` and `clean` name actions, not files, so mark them phony; a stray
# file called "all" or "clean" must not stop them from running.
.PHONY : all clean

all : $(TESTS)

clean :
	rm -f $(TESTS) gtest.a gtest_main.a *.o

# Builds gtest.a and gtest_main.a.

# Usually you shouldn't tweak such internal variables, indicated by a
# trailing _.
GTEST_SRCS_ = $(GTEST_DIR)/src/*.cc $(GTEST_DIR)/src/*.h $(GTEST_HEADERS)

# For simplicity and to avoid depending on Google Test's
# implementation details, the dependencies specified below are
# conservative and not optimized.  This is fine as Google Test
# compiles fast and for ordinary users its source rarely changes.
gtest-all.o : $(GTEST_SRCS_) $(CXX) $(CPPFLAGS) -I$(GTEST_DIR) $(CXXFLAGS) -c \ $(GTEST_DIR)/src/gtest-all.cc gtest_main.o : $(GTEST_SRCS_) $(CXX) $(CPPFLAGS) -I$(GTEST_DIR) $(CXXFLAGS) -c \ $(GTEST_DIR)/src/gtest_main.cc gtest.a : gtest-all.o $(AR) $(ARFLAGS) $@ $^ gtest_main.a : gtest-all.o gtest_main.o $(AR) $(ARFLAGS) $@ $^ ## akt build stuff starts here IFLAGS = -I./ OBJS=matepair.o fastqlib.o utilityfunc.o .cpp.o: $(CXX) $(CXXFLAGS) $(IFLAGS) -c -o $@ $< .c.o: $(CC) $(CXXFLAGS) -c -o $@ $< matepair.o: matepair.cpp matepair.h fastqlib.h fastqlib.o: fastqlib.cpp fastqlib.h utilityfunc.h utilityfunc.o: utilityfunc.cpp utilityfunc.h nxtrim: nxtrim.cpp $(OBJS) version.h $(CXX) $(CXXFLAGS) nxtrim.cpp $(OBJS) $(LFLAGS) -o $@ test_nxtrim : test_nxtrim.cpp gtest_main.a $(OBJS) $(CXX) $(IFLAGS) $^ -o $@ $(CPPFLAGS) $(CXXFLAGS) NxTrim-0.4.3/kseq.h000066400000000000000000000175361323731670200140730ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008 Genome Research Ltd (GRL). Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* Contact: Heng Li */ /* Last Modified: 12APR2009 */ #ifndef AC_KSEQ_H #define AC_KSEQ_H #include #include #include #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r #define KS_SEP_TAB 1 // isspace() && !' ' #define KS_SEP_MAX 1 #define __KS_TYPE(type_t) \ typedef struct __kstream_t { \ char *buf; \ int begin, end, is_eof; \ type_t f; \ } kstream_t; #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) #define __KS_BASIC(type_t, __bufsize) \ static inline kstream_t *ks_init(type_t f) \ { \ kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ ks->f = f; \ ks->buf = (char*)malloc(__bufsize); \ return ks; \ } \ static inline void ks_destroy(kstream_t *ks) \ { \ if (ks) { \ free(ks->buf); \ free(ks); \ } \ } #define __KS_GETC(__read, __bufsize) \ static inline int ks_getc(kstream_t *ks) \ { \ if (ks->is_eof && ks->begin >= ks->end) return -1; \ if (ks->begin >= ks->end) { \ ks->begin = 0; \ ks->end = __read(ks->f, ks->buf, __bufsize); \ if (ks->end < __bufsize) ks->is_eof = 1; \ if (ks->end == 0) return -1; \ } \ return (int)ks->buf[ks->begin++]; \ } #ifndef KSTRING_T #define KSTRING_T kstring_t typedef struct __kstring_t { size_t l, m; char *s; } kstring_t; #endif #ifndef kroundup32 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif #define __KS_GETUNTIL(__read, __bufsize) \ static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ { \ if (dret) *dret = 0; \ str->l = 0; \ if (ks->begin >= ks->end && ks->is_eof) return -1; \ for (;;) { \ int i; \ if (ks->begin >= ks->end) { \ if (!ks->is_eof) { \ ks->begin = 0; \ ks->end = __read(ks->f, 
ks->buf, __bufsize); \ if (ks->end < __bufsize) ks->is_eof = 1; \ if (ks->end == 0) break; \ } else break; \ } \ if (delimiter > KS_SEP_MAX) { \ for (i = ks->begin; i < ks->end; ++i) \ if (ks->buf[i] == delimiter) break; \ } else if (delimiter == KS_SEP_SPACE) { \ for (i = ks->begin; i < ks->end; ++i) \ if (isspace(ks->buf[i])) break; \ } else if (delimiter == KS_SEP_TAB) { \ for (i = ks->begin; i < ks->end; ++i) \ if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ } else i = 0; /* never come to here! */ \ if (str->m - str->l < i - ks->begin + 1) { \ str->m = str->l + (i - ks->begin) + 1; \ kroundup32(str->m); \ str->s = (char*)realloc(str->s, str->m); \ } \ memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ str->l = str->l + (i - ks->begin); \ ks->begin = i + 1; \ if (i < ks->end) { \ if (dret) *dret = ks->buf[i]; \ break; \ } \ } \ if (str->l == 0) { \ str->m = 1; \ str->s = (char*)calloc(1, 1); \ } \ str->s[str->l] = '\0'; \ return str->l; \ } #define KSTREAM_INIT(type_t, __read, __bufsize) \ __KS_TYPE(type_t) \ __KS_BASIC(type_t, __bufsize) \ __KS_GETC(__read, __bufsize) \ __KS_GETUNTIL(__read, __bufsize) #define __KSEQ_BASIC(type_t) \ static inline kseq_t *kseq_init(type_t fd) \ { \ kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ s->f = ks_init(fd); \ return s; \ } \ static inline void kseq_rewind(kseq_t *ks) \ { \ ks->last_char = 0; \ ks->f->is_eof = ks->f->begin = ks->f->end = 0; \ } \ static inline void kseq_destroy(kseq_t *ks) \ { \ if (!ks) return; \ free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ ks_destroy(ks->f); \ free(ks); \ } /* Return value: >=0 length of the sequence (normal) -1 end-of-file -2 truncated quality string */ #define __KSEQ_READ \ static int kseq_read(kseq_t *seq) \ { \ int c; \ kstream_t *ks = seq->f; \ if (seq->last_char == 0) { /* then jump to the next header line */ \ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ if (c == -1) return -1; /* end of file */ \ seq->last_char = 
c; \ } /* the first header char has been read */ \ seq->comment.l = seq->seq.l = seq->qual.l = 0; \ if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \ if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ if (isgraph(c)) { /* printable non-space character */ \ if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \ seq->seq.m = seq->seq.l + 2; \ kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \ seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ } \ seq->seq.s[seq->seq.l++] = (char)c; \ } \ } \ if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ if (c != '+') return seq->seq.l; /* FASTA */ \ if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \ seq->qual.m = seq->seq.m; \ seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ } \ while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ if (c == -1) return -2; /* we should not stop here */ \ while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \ if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \ seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \ seq->last_char = 0; /* we have not come to the next header line */ \ if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \ return seq->seq.l; \ } #define __KSEQ_TYPE(type_t) \ typedef struct { \ kstring_t name, comment, seq, qual; \ int last_char; \ kstream_t *f; \ } kseq_t; #define KSEQ_INIT(type_t, __read) \ KSTREAM_INIT(type_t, __read, 4096) \ __KSEQ_TYPE(type_t) \ __KSEQ_BASIC(type_t) \ __KSEQ_READ #endif NxTrim-0.4.3/matepair.cpp000077500000000000000000000411721323731670200152610ustar00rootroot00000000000000#include "matepair.h" //nextera mp adapters string adapter1 = "CTGTCTCTTATACACATCT"; string adapter2 = "AGATGTGTATAAGAGACAG"; string adapterj = adapter1+adapter2; 
//EXTERNAL adapters. this are used to clip very short dna fragments where R1 goes into R2 // string r1_external_adapter = "GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC"; // string r2_external_adapter = "ACACTCTTTCCCTACACGACGCTCTTCCGATC"; string r1_external_adapter = "GATCGGAAGAGCACACGTCTGAACTCCAGTCAC"; string r2_external_adapter = "GATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"; #define DEBUG 0 #define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c))) //only handles substitution errors in adapter (faster) int hamming_match(string & target,string & query,int minoverlap,float similarity) { int L1=target.size(); int L2=query.size(); if(L2>=L1) { return(L1); } assert((int)L1>=minoverlap); //check for full query matches. int maxdist = ceil ( (1.-similarity) * L2); int mini=L1,mind=L2; for(int i=0;i<=(L1-L2);i++) { int d=0,j=0; while(j=minoverlap;i--) { maxdist = ceil ( (1.-similarity) * i); //check the front of target int j=L2-i,d=0; while(j=0 && j1<(int)s1.size() && j2>=0 && j2<(int)s2.size() ) { d+=(s1[j1]!=s2[j2]); if(d>maxd) { d=s1.length(); break; } } } return(d); } int overlap(string & s1,string & s2,int minoverlap,float similarity) { int mini=0,mind,minL; if(s1.size()1) cerr << "mind = "<0) cerr << "w = "<2) { cerr << "Resolving overhang"<2) { cerr << r2.s <r1.notN(0,a) && !preserve_mp) { pe.r1=tmp1; pe.r2=r2; if(a>=minlen) se = r1.window(0,a); } else { if(tmp1.l>=minlen) se = tmp1; mp.r1=r1.window(0,a); mp.r2=r2; } return(0); } matePair::matePair() { _aggressive=false; //build seeds; seedsize=adapter1.length(); nseed=0; for(int i=0;i0) if(a1>0||a2>0) cerr << "trimUnknown: " << a1 << " " << b1 << " " << a2 << " " << b2 << endl; if(a1>0) unknown.r1 = unknown.r1.window(0,a1); if(a2>0) unknown.r2 = unknown.r2.window(0,a2); return(0); } //gets rid of the rare case where external adapters are present (isize < 2*L) bool matePair::trimExternal(readPair& rp) { bool found = false; int a,b; unsigned int tmp = rp.r1.s.find(r1_external_adapter);//PERFECT MATCH? 
if(tmp>=rp.r1.s.size()) //PARTIAL MATCH? a = hamming_match(rp.r1.s,r1_external_adapter,minoverlap,similarity); else a = (int)tmp; tmp = rp.r2.s.find(r2_external_adapter);//PERFECT MATCH? if(tmp>=rp.r1.s.size()) //PARTIAL MATCH? b = hamming_match(rp.r2.s,r2_external_adapter,minoverlap,similarity); else b = (int)tmp; if(DEBUG>1) { if((a>0 && a0 && b0 && a0 && b PAIRED END FRAG if(!(a>0 && a0 && b1) cerr <<"OVERLAP FOUND "<< a <<" " << b<0 && a0 && b=sim. returns s1.size() if no alignment found unsigned int matePair::ham_align(string & s1,string & s2) { if(s1.size()=minoverlap); int maxd = ceil ( (1.-similarity) * L2); int mind=maxd,mini=L1; int d; for(int i=0;i<(L1-L2);i++) { d = hamming(s1,s2,L1-i-L2,0,L2,maxd); if(dmaxd)//hit wasnt good enough mini=L1; return(mini); } //checks the right end of a read for partial adapter hit int checkRight(string & s1,string & adapter,int offset,int minoverlap,float similarity) { assert(offset <= (s1.size()-minoverlap)); int a=s1.size(); int mind = s1.size(); for(int i=offset;i<(s1.size()-minoverlap);i++) { int compare_len = (s1.size() - i); int maxdist = ceil(compare_len * (1. 
- similarity)); int d = hamming(s1,adapter,i,0,compare_len,maxdist); if(d1) { cerr << "read L1 = "< finds adapter on r1 string overhang = rc2.s.substr(0,rc2.l-b2); a1 = ham_align(readpair.r1.s,overhang); b1 = a1+adapterj.size(); } if(a2==L2&&b1<(L1-minoverlap)) {//vice-versa string overhang = rc1.s.substr(0,rc1.l-b1); a2 = ham_align(readpair.r2.s,overhang); b2 = a2+adapterj.size(); } if(DEBUG>1) { cerr << "adapter locations (second pass): "<1) { cerr << "adapter locations (third pass): "< overlap implies PE if(!trimExternal(readpair)) { unknown=readPair(readpair.r1,readpair.r2); trimUnknown(); } if(DEBUG>1) { cerr << "CASE A"<=(L1-minlen); bool R2_has_adapter_at_end = a2=(L2-minlen); if(a11) cerr << "CASE B"<1) cerr << "CASE C"<=(L1-minoverlap) && a2=minlen && (L2-b2)>=minlen) { if(_justmp) { mp=readPair(readpair.r1.window(0,a1),readpair.r2.mask()) ; } else { pe.r1 = readpair.r1.window(0,a1); pe.r2 = readpair.r2.window(b2,b2+a1); } } if(DEBUG>1) cerr << "CASE D"<=(L2-minoverlap) && a1=minlen && (L1-b1)>=minlen) { if(_justmp) { mp=readPair(readpair.r1.mask(),readpair.r2.window(0,a2)); } else { pe.r1 = readpair.r1.window(b1,b1+a2); pe.r2 = readpair.r2.window(0,a2); } } if(DEBUG>1) cerr << "CASE E"<1) cerr << "CASE F"<minlen && b1<=b2) se = readpair.r1.window(b1); if((L2-b2)>minlen && b21) cerr << "CASE G"<1) cerr << "CASE H"<0) cerr << "Wrote: n_mp="< seeds; int nseed,seedsize; bool _aggressive; void setAggressive(bool a) {_aggressive=a;}; }; //handles the output for nxtrim (which reads go to which file etc) class nxtrimWriter { public: nxtrimWriter(); nxtrimWriter(string prefix,bool jmp,bool separate_read_files=false); int open(string prefix,bool jmp,bool separate_read_files); int open(); bool setMP(bool val) {_write_mp=val;return(_write_mp);} bool setUN(bool val) {_write_un=val;return(_write_un);} int write(matePair & m); int weird,n_mp,n_unk,n_se,n_pe;//counts for each virtual library bool _justmp,_write_mp,_write_se,_write_pe,_write_un; bool print_to_stdout; 
pairWriter mp_out; pairWriter pe_out; pairWriter unknown_out; fastqWriter se_out; }; NxTrim-0.4.3/nxtrim.cpp000077500000000000000000000200711323731670200147730ustar00rootroot00000000000000#include "version.h" #include "matepair.h" #include "fastqlib.h" #include using namespace std; string percent(int num,int den) { char buffer[100]; sprintf(buffer,"%d / %d\t( %.2f%% )\t",num,den,100. * float(num)/float(den)); return(buffer); } void usage() { cerr << "\nProgram:\tnxtrim" << endl; cerr << "Version:\t" << VERSION <= 0) { switch (c) { case '1': r1 = optarg; break; case '2': r2 = optarg; break; case 'O': prefix = optarg; break; case 's': similarity = atof(optarg); break; case 'v': minoverlap = atoi(optarg); break; case 'l': minlen = atoi(optarg); break; case 'a': aggressive = true; break; case STDOUT: write_stdout=true; break; case STDOUT_MP: write_stdout_mp=true; break; case STDOUT_UN: write_stdout_un=true; break; case JUSTMP:justmp=true; break; case JOINREADS:joinreads=true; break; case NORC:rc=false; break; case PMP:preserve_mp=true; break; case IGNOREPF:ignorePF=true; break; case SEPARATE:separate=true; break; case 'w':hamming=false; break; default: die("Unrecognised argument"); } } if(!(r1==NULL&&r2==NULL) && !(r1!=NULL&&r2!=NULL)) die("both --r1 and --r2 must be speicified"); if(write_stdout && !prefix.empty()) die("--stdout and -O are incompatible"); if(!write_stdout && !write_stdout_mp && !write_stdout_un && prefix.empty() ) die("one of --stdout / --stdout-mp / --stdout-un / -O must be specified"); if(preserve_mp&&justmp) die("the --preserve_mp and --justmp flags are incompatible!"); if( (write_stdout+write_stdout_mp+write_stdout_un)>1) die("only one of --stdout / --stdout-mp / --stdout-un may be specified!"); if(minlen<=0) { die("--minlength must be >0"); } if(minoverlap<=0) { die("--minoverlap must be >0"); } if(write_stdout||write_stdout_mp||write_stdout_un) { cerr << "Writing to stdout"< pos; matePair m; m.setAggressive(aggressive); int 
num_reads_with_multiple_adapters=0,npass=0,nread=0; bool trim_warn=true; nxtrimWriter out; if(write_stdout||write_stdout_un||write_stdout_mp) { out.open(); if(write_stdout_un) out.setMP(false); if(write_stdout_mp) out.setUN(false); } else out.open(prefix,justmp,separate); int se_only = 0; while(infile.next(p)) { if(p.r1.l!=p.r2.l && trim_warn) { cerr << "WARNING: reads with differing lengths. Has this data already been processed with nxtrim?"<0; } if(rc) { m.mp.rc(); m.unknown.rc(); } else { m.pe.rc(); } out.write(m); npass++; } nread++; if(nread%10000==0) cerr << "READ PAIR "<thr and read.pos < (sam.lengths[read.tid] - thr) and read.pnext>thr and read.pnext< (sam.lengths[read.tid] - thr) not_too_small = (max(read.pos+L,read.pnext+L) - min(read.pos,read.pnext))> (L) not_identical = read.pos!=read.pnext if not read.is_unmapped and not read.mate_is_unmapped and not_identical: if not_on_end and not_too_small: if read.pos < read.pnext: if (read.flag & 0x10): k1 = 'R' else: k1 = 'F' if (read.flag & 0x20): k2='R' else: k2='F' o=k1+k2 d[k1+k2] += 1 else: if (read.flag & 0x10): k1 = 'R' else: k1 = 'F' if (read.flag & 0x20): k2='R' else: k2='F' o=k2+k1 if o=='RF': out_rf.write((read)) if o=='FR': out_fr.write((read)) for k in ['FR','RF','FF','RR']: print k,d[k] NxTrim-0.4.3/test/000077500000000000000000000000001323731670200137225ustar00rootroot00000000000000NxTrim-0.4.3/test/alignment_summary.py000066400000000000000000000050401323731670200200260ustar00rootroot00000000000000import sys,pysam,collections if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description='routine to ntrim adapter sequences from Nextera mate pair data') parser.add_argument('bam', metavar='bam', type=str, help='bam') parser.add_argument('-output', metavar='output', type=str, help='output prefix',default=None) args = parser.parse_args() sam = pysam.Samfile(args.bam, "rb" ) if(args.output!=None): out_fr = pysam.Samfile(args.bam[:-3]+"fr.bam", "wb", header = sam.header) out_rf = 
pysam.Samfile(args.bam[:-3]+"rf.bam", "wb", header = sam.header) L = 151 d = collections.defaultdict(int) isizes = collections.defaultdict(list) thr=20000 #how much to clip off contig ends? for read in sam: not_on_end = read.pos>thr and read.pos < (sam.lengths[read.tid] - thr) and read.pnext>thr and read.pnext< (sam.lengths[read.tid] - thr) insert_size = max(read.pos+L,read.pnext+L) - min(read.pos,read.pnext) not_too_small = insert_size > L not_identical = read.pos!=read.pnext if not read.is_unmapped and not read.mate_is_unmapped and not_identical: if not_on_end and not_too_small: if read.pos < read.pnext: if (read.flag & 0x10): k1 = 'R' else: k1 = 'F' if (read.flag & 0x20): k2='R' else: k2='F' o=k1+k2 d[k1+k2] += 1 isizes[k1+k2].append(insert_size) # sys.stderr.write("%s %d\n"%(k1+k2,insert_size)) else: if (read.flag & 0x10): k1 = 'R' else: k1 = 'F' if (read.flag & 0x20): k2='R' else: k2='F' o=k2+k1 if(args.output!=None): if o=='RF': out_rf.write((read)) if o=='FR': out_fr.write((read)) print "Orientation","Frequency".rjust(10," "),"Median insert size".rjust(20," ") for k in ['FR','RF','FF','RR']: x=isizes[k] x.sort() print k.ljust(9," "),("%d"%d[k]).rjust(12," "),("%d"%x[len(x)/2]).rjust(20," ") print "\nRF/(FR+RF) = %f"%(d['RF']/float(d['RF']+d['FR'])) NxTrim-0.4.3/test/ecmg.sh000066400000000000000000000014121323731670200151670ustar00rootroot00000000000000#get data r1=EcMG1_ATGTCA_L001_R1_001.fastq.gz r2=EcMG1_ATGTCA_L001_R2_001.fastq.gz ref=EcMG.fna if [ ! -f $ref ] then curl -O https://s3-eu-west-1.amazonaws.com/nxtrim-examples/bacteria/${ref} fi if [ ! -f $r1 ] then curl -O https://s3-eu-west-1.amazonaws.com/nxtrim-examples/bacteria/${r1} fi if [ ! 
-f $r2 ] then curl -O https://s3-eu-west-1.amazonaws.com/nxtrim-examples/bacteria/${r2} fi ##assemble with velvet time ../nxtrim -1 $r1 -2 $r2 -O EcMG --aggressive velveth output_dir 61 -short -fastq.gz EcMG.se.fastq.gz -shortPaired2 -fastq.gz EcMG.pe.fastq.gz -shortPaired3 -fastq.gz EcMG.mp.fastq.gz -shortPaired4 -fastq.gz EcMG.unknown.fastq.gz velvetg output_dir -exp_cov auto -cov_cutoff auto -shortMatePaired4 yes python n50.py output_dir/contigs.fa 500 NxTrim-0.4.3/test/n50.py000077500000000000000000000023621323731670200147040ustar00rootroot00000000000000import sys def N50(dna): lens = [len(val) for val in dna] lens.sort() thr = sum(lens)/2 n50 = 0 for i,l in enumerate(lens): n50+=l if n50>thr: return l def read_assembly(fname,minlen): contigs = [] scaffolds = [] count=0 s = [] def parse(s): tmp = "".join(s) # print len(tmp) scaffolds.append(tmp) for c in [val for val in tmp.split("N") if len(val)>0]: contigs.append(c) infile = open(fname) line = infile.next() for linenum,line in enumerate(infile): if line[0]==">": if linenum>0: parse(s) s = [] hdr = line else: count += len(line) s.append(line.strip()) parse(s) return [val for val in contigs if len(val)>=minlen],[val for val in scaffolds if len(val)>=minlen] if __name__ == "__main__": try: MINLEN = int(sys.argv[2]) except: MINLEN=0 contigs,scaffs = read_assembly(sys.argv[1],MINLEN) contig_lengths = [len(val) for val in contigs] scaff_lengths = [len(val) for val in scaffs] print "Assembly length\t%d"%sum(contig_lengths) print "#contigs\t%d"%len(contigs) print "Contig N50\t%d"%N50(contigs) NxTrim-0.4.3/test/test.sh000066400000000000000000000042011323731670200152320ustar00rootroot00000000000000#get data r1=EcMG1_ATGTCA_L001_R1_001.fastq.gz r2=EcMG1_ATGTCA_L001_R2_001.fastq.gz ref=EcMG.fna if [ ! -f $ref ] then curl -O https://s3-eu-west-1.amazonaws.com/nxtrim-examples/bacteria/${ref} fi if [ ! -f $r1 ] then curl -O https://s3-eu-west-1.amazonaws.com/nxtrim-examples/bacteria/${r1} fi if [ ! 
-f $r2 ] then curl -O https://s3-eu-west-1.amazonaws.com/nxtrim-examples/bacteria/${r2} fi bwa index $ref #test stdout and alignment out=EcMG.bam ../nxtrim --stdout -1 $r1 -2 $r2 | bwa mem -p $ref - | samtools view - -b -o $out out=EcMG.rf.bam ../nxtrim --rf --stdout -1 $r1 -2 $r2 | bwa mem -p $ref - | samtools view -b -o $out out=EcMG.mp.bam ../nxtrim --stdout-mp -1 $r1 -2 $r2 | bwa mem -p $ref - | samtools view -b -o $out out=EcMG.un.bam ../nxtrim --stdout-un -1 $r1 -2 $r2 | bwa mem -p $ref - | samtools view -b -o $out ##assemble with velvet ../nxtrim -1 $r1 -2 $r2 -O EcMG velveth output_dir 61 -short -fastq.gz EcMG.se.fastq.gz -shortPaired2 -fastq.gz EcMG.pe.fastq.gz -shortPaired3 -fastq.gz EcMG.mp.fastq.gz -shortPaired4 -fastq.gz EcMG.unknown.fastq.gz velvetg output_dir -exp_cov auto -cov_cutoff auto -shortMatePaired4 yes ##do some alignments ../nxtrim --aggressive -1 EcMG1_ATGTCA_L001_R1_001.fastq.gz -2 EcMG1_ATGTCA_L001_R2_001.fastq.gz --stdout-mp | bwa mem EcMG.fna -p - | gzip -1 > mp.agg.bam ../nxtrim -1 EcMG1_ATGTCA_L001_R1_001.fastq.gz -2 EcMG1_ATGTCA_L001_R2_001.fastq.gz --stdout-mp | bwa mem EcMG.fna -p - | gzip -1 > mp.ham.bam ../nxtrim --aggressive -1 EcMG1_ATGTCA_L001_R1_001.fastq.gz -2 EcMG1_ATGTCA_L001_R2_001.fastq.gz --stdout-un | bwa mem EcMG.fna -p - | gzip -1 > un.agg.bam ../nxtrim -1 EcMG1_ATGTCA_L001_R1_001.fastq.gz -2 EcMG1_ATGTCA_L001_R2_001.fastq.gz --stdout-un | bwa mem EcMG.fna -p - | gzip -1 > un.ham.bam for i in mp.ham.bam mp.agg.bam un.ham.bam un.agg.bam;do echo $i; /usr/bin/python alignment_summary.py $i;done ## try the sw routine time ../nxtrim -1 $r1 -2 $r2 -O EcMG --aggressive for i in mp unknown pe ; do bwa mem $ref -p EcMG.${i}.fastq.gz | gzip -1 > ${i%fastq.gz}.bam done for i in mp.bam unknown.bam pe.bam;do echo $i; /usr/bin/python alignment_summary.py $i;done NxTrim-0.4.3/test_nxtrim.cpp000066400000000000000000000055701323731670200160360ustar00rootroot00000000000000#include #include "gtest/gtest.h" #include "matepair.h" 
TEST(HammingMatch,PerfectMatches) { string adapter1 = "CTGTCTCTTATACACATCT"; string adapter2 = "AGATGTGTATAAGAGACAG"; int min_overlap=12; float similarity=0.85; string target = "AGATGTGTATAAGAGACAGGCGATTTCCTTACATTGACGTTTTTATTACTCACTGTCCTGTTCCTGTTATCACATTATCTGCTGAACAATTACTGATAGGTTAAAGAGAACCAGGCCTGGGCATTGGCGATGCCGCCAGTAACCCGA"; string query = "AGATGTGTATAAGAGACAG"; ASSERT_EQ(hamming_match(target,query,min_overlap,similarity),0); target = "GCGATTTCCTTACATTGACGTTTTTATTACTCACTGTCCTGTTCCTGTTATCACATTATCTGCTGAACAATTACTGATAGGTTAAAGAGAACCAGGCCTGGGCATTGGCGATGCCGCCAGTAACCCGACGCCAGATGTGTATAAGAGACAG"; query = "AGATGTGTATAAGAGACAG"; ASSERT_EQ(hamming_match(target,query,min_overlap,similarity),target.size()-query.size()); string s1 = "GCGATTTCCTTACATTGACGTTTTTATTACTCACTGTCCTGTTCC"; string s2 = "TGTTATCACATTATCTGCTGAACAATTACTGATAGGTTAAAGAGAACCAGG"; query = adapter1+adapter2; target = s1 + query + s2; ASSERT_EQ(hamming_match(target,query,min_overlap,similarity),s1.size()); } TEST(HammingMatch,NoisyMatches) { string adapter1 = "CTGTCTCTTATACACATCT"; string adapter2 = "AGATGTGTATAAGAGACAG"; int min_overlap=12; float similarity=0.85; string target = "AGGTGTGCATAAGAGACAGGCGATTTCCTTACATTGACGTTTTTATTACTCACTGTCCTGTTCCTGTTATCACATTATCTGCTGAACAATTACTGATAGGTTAAAGAGAACCAGGCCTGGGCATTGGCGATGCCGCCAGTAACCCGA"; string query = "AGATGTGTATAAGAGACAG"; ASSERT_EQ(hamming_match(target,query,min_overlap,similarity),0); target = "GCGATTTCCTTACATTGACGTTTTTATTACTCACTGTCCTGTTCCTGTTATCACATTATCTGCTGAACAATTACTGATAGGTTAAAGAGAACCAGGCCTGGGCATTGGCGATGCCGCCAGTAACCCGACGCCAGATGTGTATATCAGACAG"; query = "AGATGTGTATAAGAGACAG"; ASSERT_EQ(hamming_match(target,query,min_overlap,similarity),target.size()-query.size()); string s1 = "GCGATTTCCTTACATTGACGTTTTTATTACTCACTGTCCTGTTCC"; string s2 = "TGTTATCACATTATCTGCTGAACAATTACTGATAGGTTAAAGAGAACCAGG"; query = "CTGTCTCTTATAGTCATCTAGATGTGTATAAGACACAG"; target = s1 + query + s2; ASSERT_EQ(hamming_match(target,query,min_overlap,similarity),s1.size()); } TEST(HammingMatch,PartialMatches) 
{ string adapter1 = "CTGTCTCTTATACACATCT"; string adapter2 = "AGATGTGTATAAGAGACAG"; int min_overlap=12; float similarity=0.85; string target =adapter2.substr(4) + "GATTTCCTTACATTGACGTTTTTATTACTCACTGTCCTGTTCCTGTTATCACATTATCTGCTGAACAATTACTGATAGGTTAAAGAGAACCAGGCCTGGGCATTGGCGATGCCGCCAGTAACCCGA"; string query = adapter2; ASSERT_EQ(hamming_match(target,query,min_overlap,similarity),-4); target = "GCGATTTCCTTACATTGACGTTTTTATTACTCACTGTCCTGTTCCTGTTATCACATTATCTGCTGAACAATTACTGATAGGTTAAAGAGAACCAGGCCTGGGCATTGGCGATGCCGCCAGTAACCCGACG" + adapter1.substr(0,15); query = adapter1; ASSERT_EQ(hamming_match(target,query,min_overlap,similarity),target.size()-15); } NxTrim-0.4.3/utilityfunc.cpp000077500000000000000000000010451323731670200160310ustar00rootroot00000000000000#include "utilityfunc.h" using namespace std; int argmax(vector & x) { int maxind=0; for(int i=1;i<(int)x.size();i++) if(x[i]>x[maxind]) maxind=i; return(maxind); } int which_max(int *x,int n) { int maxidx = 0; int maxval = x[maxidx]; for(int i=0;imaxval) { maxidx = i; maxval = x[i]; } } return maxidx; } bool fileexists(string fname){ ifstream ifile(fname.c_str()); return ifile.good(); } void die(const string & err) { cerr << "ERROR: "<< err << endl; exit(1); } NxTrim-0.4.3/utilityfunc.h000077500000000000000000000026341323731670200155030ustar00rootroot00000000000000#pragma once #include #include #include //#include #include #include #include #include #include #include using namespace std; int which_max(int *x,int n); //unordered_set sampleIndex(int k,int n); template bool comparator ( const pair& l, const pair& r) { return l.first < r.first; } template vector argsort(vector *list) { vector > tosort(list->size()); for(int i=0;isize();i++) { tosort[i] = make_pair( (*list)[i] ,i ); } sort(tosort.begin(),tosort.end());//,comparator); vector ret(tosort.size()); for(int i=0;i T **newMatrix(int nrow,int ncol) { T **ret = new T*[nrow]; for(int i=0;i void delMatrix(T **mat,int nrow,int ncol) { for(int i=0;i void printMatrix(T 
**H,int nrow,int ncol) { for(int i=0;i & x);