pax_global_header00006660000000000000000000000064141243463420014515gustar00rootroot0000000000000052 comment=94872ac58cbbd2aeeb70f7ea14ad8cb67a9e1652 umis-1.0.8/000077500000000000000000000000001412434634200125005ustar00rootroot00000000000000umis-1.0.8/.gitignore000066400000000000000000000000761412434634200144730ustar00rootroot00000000000000tests/results build/ .eggs/ dist/ umis.egg-info/ umis/utils.c umis-1.0.8/HISTORY.md000066400000000000000000000073521412434634200141720ustar00rootroot00000000000000## 1.0.8 - Add support for dual UMI indexes. Thanks @lbeltrame! ## 1.0.7 - Ensure headers are not written when writing out a Series, to make us compatible with pandas > 0.24. - Fix for deprecated .ix call, .loc is the new replacement. Thanks to @naumenko-sa. ## 1.0.6 - Fix for the python3 fix. ## 1.0.5 - Fix for cb_filter with python3. ## 1.0.4 - Enable cb_histogram to be used on samples without UMIs. - Enable filtering of cells during `demultiplex_cells`. - Fix incorrect pandas.read_csv call with header=-1. ## 1.0.3 - Python 3 support ## 1.0.2 - Add `demultiplex_cells` subcommand to break a transformed FASTQ file into separate FASTQ files by cell. - Future proofing for changes to panda's `to_csv` function. ## 1.0.1 - Add support for click 7.0. ## 1.0.0 - Fix for min-length filtering with paired samples. Previously required only one read to be longer, fix requires both. - Fix tests for fastqtagcount to use indexed BAM files. - Support gzipped cellular barcode files. - Support 10x V2 barcoding scheme. Thanks to @tomasgomes for the fix. - Re-enable streaming for cellular barcode filtering. - Add `--umi_matrix` option to fasttagcount. This outputs a non-umi-deduped matrix of counts, useful for QC. - Support gzipped files for `sb_filter`, `mb_filter` and `add_uid`. ## 0.8.0 - Fix `fasttagcount` off-by-one issue. - Add `version` subcommand. - Fix missing pandas import in `sparse` subcommand. ## 0.7.0 - Fix for kallisto output failing due to defaultdict not being imported. Thanks to @andreas-wilm for the fix. - Added `tagcount` option `--parse_tags` to use BAM tags rather than parsing read names (`UM` for UMI, `CR` for cell barcode) - Added `tagcount` option `--gene_tags` to use BAM tags to get ID of mapping gene (`GX` tag). - Fix tagcount with `--genemap` option not including a column name for the index. - Add `sparse` subcommand to turn a matrix into a sparse matrix. - Add `fasttagcount` subcommand. This assumes the input BAM/SAM file is coordinate sorted. Reduces memory usage by over 100x and runtime by 30-40% for deep samples. - Warn, don't fail if transcripts are missing from the genemap. ## 0.6.0 - Fix skipping first piece of evidence when tagcounting. - Add test for tagcount. - Output full sorted transcript table from tagcount rather than only the observed transcripts. - Add `--sparse` option to output tagcount matrices in MatrixMarket format. - Allow cb_histogram subcommand to take gzipped files. - Allow cb_filter subcommand to take gzipped files. - Add support for triple-cellular barcodes. - Add example for Illumina SureCell (https://www.illumina.com/products/by-type/sequencing-kits/library-prep-kits/surecell-wta-ddseq.html) ## 0.5.0 - Fix automatic format detection in cb_histogram. - Add tests for cb_histogram. - Re-enable streaming bamtagging. Thanks to @chapmanb for the suggestion. - Add subset_bamfile to subset a BAM file to keep alignments with a given set of cellular barcodes. - Speed improvements for reading gzipped FASTQ files. - Memory usage improvements for tagcount. ## 0.4.0 - Fix for handling unicode, thanks to @chapmanb and @sowmyaiyer - Adds support for adding BAM tags to aligned fastqtransformed files. Thanks to @chapmanb. - Adds support for UMI-only fastqtransformation. - Adds support for paired-end target sequences. - Adds support for detecting sample barcodes via the SB tag in the regex. - Adds support for sample-based demultiplexing with error correction. ## 0.3.0 - Now supports transforming 3-file input, as from the Linnarsson lab STRT-Seq data - New kallisto subcommand formats read files for input to kallisto's UMI mode - Fix gzip based fastq reading on Python 3.5 - Including preliminary subcommand for guessing cell cutoff from cb_histogram ## 0.2.2 - Added MANIFEST file which broke pip installation umis-1.0.8/LICENSE.md000066400000000000000000000020751412434634200141100ustar00rootroot00000000000000The MIT License (MIT) Copyright (c) 2016 Valentine Svensson Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. umis-1.0.8/MANIFEST.in000066400000000000000000000000151412434634200142320ustar00rootroot00000000000000include *.md umis-1.0.8/README.md000066400000000000000000000071351412434634200137650ustar00rootroot00000000000000# umis **umis** provides tools for estimating expression in RNA-Seq data which performs sequencing of end tags of transcript, and incorporate molecular tags to correct for amplification bias. There are four steps in this process. 1. Formatting reads 2. Filtering noisy cellular barcodes 3. Pseudo-mapping to cDNAs 4. Counting molecular identifiers ## 1. Formatting reads We want to strip out all non-biological segments of the sequenced reads for the sake of mapping. While also keeping this information for later use. We consider non-biological information such as Cellular Barcode and Molecular Barcode. To later be able to extract the optional CB and the MB these are put in the read header, with the following format. @HWI-ST808:130:H0B8YADXX:1:1101:2088:2222:CELL_GGTCCA:UMI_CCCT AGGAAGATGGAGGAGAGAAGGCGGTGAAAGAGACCTGTAAAAAGCCACCGN + @@@DDBD>=AFCF+1AAADD @M00642:544:000000000-APV11:1:2105:15595:1383 3:N:0:0 AAACGGCG + CCCCCCDB @M00642:544:000000000-APV11:1:2105:14585:1385 3:N:0:0 AAACGGCG + BCBCCCCB @M00642:544:000000000-APV11:1:2105:16466:1386 3:N:0:0 AAACGGCG + 3>AA3>A> @M00642:544:000000000-APV11:1:2105:16616:1395 3:N:0:0 AAACGGCG + AAAA3ADB @M00642:544:000000000-APV11:1:2105:15267:1397 3:N:0:0 AAACGGCG + BABBB?AA @M00642:544:000000000-APV11:1:2105:15060:1398 3:N:0:0 AAACGGCG + AAAA33A2 umis-1.0.8/examples/10XGenomics/read_I7_cell_barcode_read_14_base.fastq000066400000000000000000000015341412434634200256370ustar00rootroot00000000000000@M00642:544:000000000-APV11:1:2105:14645:1375 2:N:0:0 NGAACACTAGAACA + #>1>>ACFBF1BFG @M00642:544:000000000-APV11:1:2105:14815:1376 2:N:0:0 NTTGGTCTTGAGGG + #>1>AAFBFF3@11 @M00642:544:000000000-APV11:1:2105:16555:1377 2:N:0:0 NGGCACGAGACACT + #>3>>AAADAAFAG @M00642:544:000000000-APV11:1:2105:16817:1378 2:N:0:0 NCTAGATGCTTGTT + #>>>11B@@FFFGG @M00642:544:000000000-APV11:1:2105:15595:1383 2:N:0:0 NATAAGTGAGGTCT + #3>AA3@DB@BCGG @M00642:544:000000000-APV11:1:2105:14585:1385 2:N:0:0 NCCATCCTCCTTAT + #>1>>AFFFFFF33 @M00642:544:000000000-APV11:1:2105:16466:1386 2:N:0:0 NGCACGACCCGTTC + #1>>11>>AD1110 @M00642:544:000000000-APV11:1:2105:16616:1395 2:N:0:0 AGACACTGGGTATC + 111>111B11CDCG @M00642:544:000000000-APV11:1:2105:15267:1397 2:N:0:0 ACTACGGCTGGCCA + 1>>1>111>>1>C1 @M00642:544:000000000-APV11:1:2105:15060:1398 2:N:0:0 TCAACACTGTGTTG + ?AB@BBAFB55B44 umis-1.0.8/examples/10XGenomics/transcript_umi_reads.fastq000066400000000000000000000063701412434634200236420ustar00rootroot00000000000000@M00642:544:000000000-APV11:1:2105:14645:1375 1:N:0:0 TCATGTTGGAGATCTCTATTGTAATCTCTATTGGAGATTACAATGATTAAATCAATAAATAACTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTAC + >AAAA3@D111CGGFGGGGFGGFHHFHFHFHHHHEGHFFHHH1FGFHHFEGGGF1DFA1DFDGHFFG11E?EECGG/E@EE///@CCCGCCC-<./0= @M00642:544:000000000-APV11:1:2105:14645:1375 4:N:0:0 NCCGAAGTCA + #>>AAADBFF @M00642:544:000000000-APV11:1:2105:14815:1376 1:N:0:0 GAAAAAGCAGACATTTTTTTTACATGGTCACATTTCGTGCTTCTCGGATTTCTGAGGAAATATTTTGTATTGTATATTACAATGATCACTGGCTGAAA + BBB@BFAFFFFFFGCBGB2EAG35GHB55DFGGFHFGGFAF5GHHGGEGDGBGG5D1B23AD5AGGHHFBGHHF5DEFFDG@BB5FD4FGHHHFG1FB @M00642:544:000000000-APV11:1:2105:14815:1376 4:N:0:0 NAGGTACTAG + #>>3>AFFFF @M00642:544:000000000-APV11:1:2105:16555:1377 1:N:0:0 TTTTATCCAGTATTCTCACTGGCTCCTTTCTCATAATTTGACCTTCTGGCCTAAGCGGTGTCCCCCAGGGCCCCACTCCTGTATTTCTATTTCAGAAC + BBBBAFFFFFFBFFGGGBDFFCHHHGGHHHGBHFFHHHGHHHHHHEGBFFHFHEHF?E22EFGHHGCEGGGFGGGGHHHFHHFHFGHHBGFFHEHFHH @M00642:544:000000000-APV11:1:2105:16555:1377 4:N:0:0 NGGATTGAGC + #>>A11>@11 @M00642:544:000000000-APV11:1:2105:16817:1378 1:N:0:0 GCTGTGCGGCCAAAAGTCCTTATGAGACTGTCTACGACACAGAAGCACGTCAGCAGGGCCTATGGCGGCTCCATGTGTGCCAAGTGTGTCCGTGCCAG + AAA1>FB111A11AF0GF1GGGGF3FDFFF1BB21AEE/EFHHFHHHHGHHAEGE0CAEEGHHF1//E@E?F1B1FFD0@1@1>FDD1FDH?///?>F @M00642:544:000000000-APV11:1:2105:16817:1378 4:N:0:0 NCGCAAGCAA + #>3333@>>F @M00642:544:000000000-APV11:1:2105:15595:1383 1:N:0:0 AGAGCTATGTGTATGGCCCCATCCCTCATACTTTCGATCGTGACTGGGTGGCCATGCAGACCAAGCGAATGCTGGACATGAAGGCCAACCCCATTCAG + AABBADBFFBFDGA6BFGGECBGFHFFCBGBFGFHGE5FEGFHGHFEFFFGGHGHHHEAFGF2EGGGCEEHHHGEHCBGFG5F3CCGFFDGEGHHHFG @M00642:544:000000000-APV11:1:2105:15595:1383 4:N:0:0 NGCACCAGAG + #>>AAAFFFA @M00642:544:000000000-APV11:1:2105:14585:1385 1:N:0:0 CACATGCTGCCCAGCGGCTTCCGCAAGTTCCTGGTCCACAATGTCAAGGAGCTGGAGGTGCTGCTGATGTGCAACAAATCTTACTGTGCTGAGATTGC + AABBBBFF54BAG4GECCCGGB2EEC2FGHHHHFFFCFBGEAGHFHFHFHHAGCCHGEAGBGHGFFHHHHHHFGFHAHHHHHHHHHBFHHHHFHHHHD @M00642:544:000000000-APV11:1:2105:14585:1385 4:N:0:0 NCCGATCATC + #1>1>>@?FB @M00642:544:000000000-APV11:1:2105:16466:1386 1:N:0:0 GGAAAATGATCAGAAAAAGAAAGAAGCCCAAGAGAAAGGTACCTGGGTTCAACTAAAGCGCCAGCCTGCTCCACCCAGAGAAGCACACTTTGTGAGCA + A@AAAFFBDFF31FFB1B1ABFHHFCFH0GEFEHFH1G01DDGHHHEG/FGFFHCGBGF/EEEEFGHHH1EHHHGGEG/>FFHFEGGGBGGGBGHB11 @M00642:544:000000000-APV11:1:2105:16466:1386 4:N:0:0 NTTACGACCC + #1>>A11>>A @M00642:544:000000000-APV11:1:2105:16616:1395 1:N:0:0 AGCCCAGTCAGCCCTCAGAGCTCTTGCTCGCTCTGGGATGAAGATTGGGCGGATTGAGGATGTCACCCCATTCCCTCTGATAGCACTCGAAGAAAAGG + A11AAAAFF1BCB1BGE111FD1DH1GGG??CGCBF0B1FGFFGFH10AGGA??EGBGFH0E1GHGHHGCBE2FDGGHHFFGHBGHEHF/0F/GH00A @M00642:544:000000000-APV11:1:2105:16616:1395 4:N:0:0 CGGTCGATAA + AA@AA?@A?A @M00642:544:000000000-APV11:1:2105:15267:1397 1:N:0:0 GATAGTACAATGACAAGAATCCTTCAAGATTCATTAGGTGGCAACTGTAGAACCACTATTGTACTTTGCTGCTCTCCATCATCATACAATGAGTCTGA + AAA1A3B3BFFBBD1BFGG1FA3FF1131ABA3EGHGHD1A0C1FGGGHHGEDEFFF0GHHFB2DFEBGHGHGHHHHBFHFGFGG2DEEF111DDGF1 @M00642:544:000000000-APV11:1:2105:15267:1397 4:N:0:0 AGCTGGACCA + AAA33BCFF? @M00642:544:000000000-APV11:1:2105:15060:1398 1:N:0:0 AACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA + AA1>A1>AADDDG0/E//EEE@EGE?EECGECCGGGCCG-@=@-@ @M00642:544:000000000-APV11:1:2105:15060:1398 4:N:0:0 GCTGGTAACT + ABBB?CDFFF umis-1.0.8/examples/10XGenomics/transform.json000066400000000000000000000002751412434634200212650ustar00rootroot00000000000000{ "read1": "(?P@.*) .*\\n(?P.*)\\n\\+(.*)\\n(?P.*)\\n", "read2": "(@.*)\\n(?P.*)\\n\\+(.*)\\n(.*)\\n", "read3": "(@.*)\\n(?P.*)\\n\\+(.*)\\n(.*)\\n" } umis-1.0.8/examples/10XGenomics_v2/000077500000000000000000000000001412434634200167625ustar00rootroot00000000000000umis-1.0.8/examples/10XGenomics_v2/test_7_I1.fastq000066400000000000000000000030041412434634200215550ustar00rootroot00000000000000@ST-K00126:314:HFYL2BBXX:7:1101:1631:1226 1:N:0:GTAATTGC GTAATTGC + AAAFFJFJ @ST-K00126:314:HFYL2BBXX:7:1101:1834:1226 1:N:0:GTAATTGC GTAATTGC + AAAFFJ@.*) .*\\n(?P.{16})(?P.{10})(.*)\\n\\+(.*)\\n(.*)\\n", "read2": "(@.*) .*\\n(?P.*)\\n\\+(.*)\\n(?P.*)\\n", "read3": "(@.*)\\n(?P.*)\\n\\+(.*)\\n(.*)\\n" } umis-1.0.8/examples/BATseq/000077500000000000000000000000001412434634200154355ustar00rootroot00000000000000umis-1.0.8/examples/BATseq/SRR1558183_1.fastq000066400000000000000000000011041412434634200201360ustar00rootroot00000000000000@SRR1558183.43605 43605/1 CTGCTATGTTTTTT + FFFFFFGFGGGGGG @SRR1558183.43606 43606/1 CACGTTTATTTTTT + FFFCFFGGGGGGGG @SRR1558183.43607 43607/1 GACAGCTGTTTTTT + FFFFFBGGGGGGGG @SRR1558183.43608 43608/1 TCCTTTGCTTTTTT + FFFFFFGGGGGGGG @SRR1558183.43609 43609/1 TGCATATATTTTTT + FFFFFFGGGGGGFE @SRR1558183.43610 43610/1 CCCCCGCCTTTTTT + FFFFBBGGGGGGGG @SRR1558183.43611 43611/1 ATGGGTGATTTTTT + FFFFFFEGGGGGGG @SRR1558183.43612 43612/1 TGGATCAGTTTTTT + FFCFFFGGGGGGGG @SRR1558183.43613 43613/1 CCCCTTTATTTTTT + FFFBFFGFGGGGGG @SRR1558183.43614 43614/1 TCTTAGTCTTTTTT + FFFFFFGGFGGGGG umis-1.0.8/examples/BATseq/SRR1558183_2.fastq000066400000000000000000000130621412434634200201450ustar00rootroot00000000000000@SRR1558183.1 1/2 CAAGACTGTTGTCAACACGGATGTGTTCAGGGACCCAGCTTTGAAGCGCAAGGCCAGGCGGGAGGCCCAGGTCCAGTTTGTGGAGCGATACCAGACCCGGAAGAACACATGGTTTTTCCAGAAGCTTCGCTTTTTGGTATATTTTTTATTTGGTCCTCCAAAAAAAAAAAAAAAAAAAAAAAACAACCCTTGTTCCCCCGCGCCCCCCCCCTCCTATTTTTTTTTTCTCGGTGTTCTCCTCTTTTTTCCTCCCTTTTTCTTTTTTTTCTTTTT + F@DF1FGGGG1GG33G11000A0BD2EHDGEGFGFECEGHHHEGBFF/A/A//EG////EC///>AF//>/1>11BBBF10B//?00000/1<01?FFF0F########################################################################################################################################################## @SRR1558183.2 2/2 ATCCGAGTACGCTTGTCCAGAAAACGTAATGAGGATGAGGATTCCCCAAACAAGCTCTACACGCTGGTAACTTACGTGCCTGTTACCACATTCAAAAATCTACAGACGGTCAACGTGGATGAGAACTAACCTGCTCGTGTCAAATAAAGTTGCAGAACTGCCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAGAACCCGAACGGCCCCCAAAACAAAAAAAAAAAACAACCGGGGTCCCCCCCCTCTACTAAAAAAAAAAAAACAACAA + FDFFBGGGGGGGGGGHHHGHFGHHGGGHHGHHHHGHHHHEFGHHHHHHGHHHHGHHHHHHHHFGGGGHHHHHHGHGHHGHHHHHHHHHHHHHHGFHHHHHHHHHFHHFEGGGHHGFAFFHHHHFGHHGGHFHHHBHGGHHHHHHHGEFFFHHHHHGFGGFFHB1BEECCCFGGGGGGGGGFFFADFF###################################################################################### @SRR1558183.3 3/2 GGGATCCTTGGCCACCTTGCTTCATGATTTGATACATTTGTTGTATTCAAAAACTTGAACTGTAGGATGCCATTAAGAGTCTGTTTATATTATTGGAATATTTGTATTACAATTGTTAATAAAGGCTGGTTTAAAAACCTAAAAAAAAAAAAAAAAAAAAAAACACCCACCCAAAAAAACCACAACGGCCCCCAAAAACAATATAAAAAACCAAAACGTACCCCTTTTTTTTTAAAAAAAAAATATTCAAACTCTTATCTTATAAATTAAAAT + @@1AFFGGGGBFGGGHHHHHHHHBEG3GDGHGGFHHHHFFFGHHFEGFHBDHHGHHBEGGFFGGHFFFCGGFHHFDE1GDGHGFEHHHHHFFFHBGFDHFFHHHGHHHHHHHGHHHHHHHHHHBGGFHCHGHFFFFHGGHHHHFHGGGGGGGGGCGGGGGGG@############################################################################################################## @SRR1558183.4 4/2 AGGAGTCTGAAGATGACATGGGCTTTGGTCTTTTTGACTAAACTGCTTTTGTTAAGTTAGCCAATAAAGAGCTGAACCTGTAAAAAAAAAAAAAAAAAAAACAAAAGGACAAGGTACCCGCGGGGGCCCCAACACAATCATGAGATACCTCGCGGGGCCCCCACAACAAAAAAAAAAAAAAAAAAAATAAAAAAACTACGCATATTACCCAAAAAACCAACAAAACACGCAGAACACAAAAAAAAACAGAAGAACAAACGAAACATAAAGCAG + FFFFFGGGGGGGFGGHHGHHHGHHHCGFFGHHHHGGFEFHGFEEFFGHHGGHHH5DHH5FGHH2CFHFHFGBFHFHHGHHGHHHHHGGGGGGGGGGGG@C>//?F00/?#################################################################################################################################################################### @SRR1558183.5 5/2 CTCCCTCACAATTTCCATCCCAGACCCCCATAATAACAGGAGGGGCCTAGGGAGCCCTCCCTACTCTCTTGAATACCATCAATAAAGTTCGCTTCACCCAAAAAAAAAAAAAAGAAAAAAAAAAAAAAAAAAATATAACAAAAGGGATCCGAAGCGCGCCCCCACCCCCAAAATAAAACCCTAAATAGCACCCCCCTTATAAAACAAAAAAAAAAAATAAAAAAAACATAAAAATAAAAAATTACAAAAAAAAAATTAAACAAAAAACCAAAA + FFFFFGGGGGGGGCEHHHHHHHHHGGHGGGHHHHHHHHHGHGGGGGGGHHHEGGGGGHHGGHHHHHHHHHFDHHHGHHHHFHHHGGBGGHFCG1?FFGHFHFHGGGCGCFG?C/FGFHFGGGGGGGGGGGA############################################################################################################################################## @SRR1558183.6 6/2 CACACTTTGTGAGGACTAATGGAAAAGAGCCTGAGCTGTTGGAGCCCATTCCATACGAATTCATGGCCTAATGTACACAAAGAAATAAAATACCAGCACCAGGAAAAAAAAAAAAAAAAGGGGGGGGAAGGGGAGCGGCGGGACCCGCCCAACCCCCAGAACAAACCACGGGGCGCCCCCCCCCCACTCAAAAAAAAAAAACCAACACAACAAAAAAACCAAAGCGAAACCAAAAAGCTAACAAAACAAAGCAGACACGCTAAAAGCGCCAGA + F5DDFGGGGFGFGGGHFH4FGHF4FHGFHFHHFHHHGHHGFHHHHGGGFHHHHHGHHFCGHHHHHHHHHHFBFGHHHHHGHHHHHHHHHHHHHHHGHHHHHHGGHFHHGGGCGGGGGG?########################################################################################################################################################## @SRR1558183.7 7/2 ATTGAACCCATTCATCATAAGTCTTAACTCGTTAGAGATAATGTACCCATGGAGACTAGCAAAATAGTATGTAGATGTGATCTCAATTGTAAATAGAAAAATTTAATTCAATAAACTCTGTATCAGCCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCAAAAAAAAAACCCACACAGCCCACACACACAAAAAAAAAAAAAAAAAGAAATCCCTAAAAAAAAAAAAAAAAAAAAAAAAAAAATAAAAAAACAAAAAAAAA + FFFFFGGGGGGGGGGHHHHHHHFHHGHHHHGHGHGGGHGHHFHHGHHHHHHHGHHGHHHHHHHHHHGHHHHHHHHHHFBGHHHHHHHHHHGHHHHGHGGHHHEGHHHHHHHHHHHHHHHGFHHHHHHHHHHHHGFGGGGGGGG@->@DCDDGGGGGGGFFFFFFF############################################################################################################ @SRR1558183.8 8/2 ACAGATGTCACTATCACCAACAAGTTTATCAGCCCCAACTCCCTCAACAGTACTGTCATAATAGTCACCATTACCATCCTTGTTCACTTTATTTTAGTTACTCAGTTGCTGTCATCATTAATACTAATTGTGCCTTCAAGATGTCAAAAAAAAAAAAAAAAACAAATCCAACAGTAAACCCGAGCGGCCCCCAAACCAAAAATATAAACATCCGGGGGCCCCCCATTTTTAAAAAAAAAAAAAAAAAAAAATAAAACAAAAAAACCAACAAAC + FFFFFGGGGGGGGGGHHHHHGGHHGHHGHHHHHHGGGGGHHHHHHHHHHHHHHHHFHHHHHHHHHHHHHHHHHHHHHHHGHHHHHHHHGHHHHHHHHHHHHHHHHHHHHHHHHHHHGGHHHHHHHHHHHHGHHHHHHHHHHHHHHHHHHHGGGGGGGGGGGG############################################################################################################### @SRR1558183.9 9/2 CTGACTGCTCTCCCAGAGGTCCTGAGTTCAATTCCCAACAACCACATGGTGACTCACAACCATCTGCAATGGGATCGGATGCCCTCTTCTGGTGTGTCTGAAGCTACAGTGTACTCATGTACATAAAATAAATAAATCTTTTAAAGAAAATAATAACAAAAAAAAAAAAAAAAAAAAAAAAAGGGGGAAAGGAAACCGGGGGGAAACGAAAAAAAAAAAGAAAACAACGGAGGGAACGACACAAAATCAAAAAAAAAAAAAAAAAAAAAAACA + BDDFFFGGFCG4EFGHHGGCFGHHHHFHGHFFHGFHFGAFGHGGGHHHHFHHFHFHHGHGCGFGHFHHHHHHHHHGGD1AEHHHGHGHHHHH3B?BFFGHFHGFFBGHHGFGDHFHHGHFFHHHHHDGDFFHHHHGHHHHHHHHGHHHHEHHHFHH1FHHHHGGGGGGGGGGGGGGGG-9B@########################################################################################### @SRR1558183.10 10/2 GGCCTCAGTTCCTGGCCCCAGAAACGAGATCCTGACCACATGAACAATTTGGGCTCTTTTGGGAGAATAAAAGACTTATATATTGAAAAAACAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAGGGACCCCGACGGGCCCCCAATCCAAAAATAAAAATCCCGGTGGCCCCCCCCTTATTAAAAAAAAAAAAAAAAAATAAACAAAAAAAACAAAAAAACAAAACAAAAAAACAACCCGCCCCCAGCACAAACAGCAAAAAACAAAAAATCAA + FFFFFGGGGGGGGGGHHHGGGHFHHGEGGGHHHHHHHHHHHHFHHHHHHHGHGHHHHHHHHGGHGGHHHHHHGGHHHHHHHHHHHHHHHHGGHHHHGGGGGGGGGGGGGGGC################################################################################################################################################################# umis-1.0.8/examples/BATseq/transform.json000066400000000000000000000002031412434634200203360ustar00rootroot00000000000000{ "read1": "(@.*)\\n(?P.{8})(.*)\\n\\+\\n(.*)\\n", "read2": "(?P@.*) .*\\n(?P.*)\\n\\+\\n(?P.*)\\n" } umis-1.0.8/examples/CEL-Seq/000077500000000000000000000000001412434634200154475ustar00rootroot00000000000000umis-1.0.8/examples/CEL-Seq/SRP036633_1.fastq000066400000000000000000000032451412434634200200640ustar00rootroot00000000000000@SRR1161549.1 HWI-D00148:56:C24EVACXX:5:1101:1396:2277:ACAGTG/1 ATGTGTCAGAGGTTTTTTTTTTTTTTTTTTTTTTCAAAAAAAAAAAAAAAA + BCCFFFFFHHHHHJIJJJJJJJJJJJFD82;@(8@DDDB&59BDD @SRR1161549.2 HWI-D00148:56:C24EVACXX:5:1101:1375:2292:ACAGTG/1 CATCAATCTAGTGTATTTTTCAACAAATACTTTTGTTTTTCTGAGGTTCAC + CCCFFFFFHHHFHI,((3@39 @SRR1161549.4 HWI-D00148:56:C24EVACXX:5:1101:1394:2379:ACAGTG/1 TGATGCGCGTTTTTTTTTTTTTTTTTTTAAGTAAACGGCTTTTATTGTCAG + CCCFFFFFHHHHHJJJJJJJJHFDDDD&2(++8A88?<-7BDD>CCCCDC9 @SRR1161549.5 HWI-D00148:56:C24EVACXX:5:1101:1488:2428:ACAGTG/1 GACACCGCGCAGTTTTTTTGTTTTGTTTTGTTTTAAATTGAAGGTCCCTTA + @@@DD=?@D@F@7.=DFFFFEFB @SRR1161549.6 HWI-D00148:56:C24EVACXX:5:1101:1607:2275:ACAGTG/1 CGTGTGAGACGCTTTTTTTTTTTTTTTTTTTTTTTTTTTAAATTTTTTAAA + 1=:DD3:2ADDACEEIIIIIIIIIIDD@???::::50&&(+(+38>>0((( @SRR1161549.7 HWI-D00148:56:C24EVACXX:5:1101:1708:2317:ACAGTG/1 GACACCGCCGGATTTTTTTTTTTTTTTTTTTTTTGGTTAAGGATTTCTTTT + CCCFFFFFHHHHHJHJJGIJJJJFDDDDDBDDDB-&+8(8(838@((3>C3 @SRR1161549.8 HWI-D00148:56:C24EVACXX:5:1101:1659:2428:ACAGTG/1 TGATGCGCGGATTTTTTCATCATATTTTGAAAAAAAAAAAAAAAAAAAAAA + CCCFFFFFHHHHHJJJJ:00?FHIGIJJ9BF@897;EEDDDDDDDDDDDDD @SRR1161549.9 HWI-D00148:56:C24EVACXX:5:1101:1705:2446:ACAGTG/1 GACACGAGGCAGTTTTTTTTTTTTTTTTTTTTTTTTTCAAAAAATATTTGT + CCCFFFFFHHHHHJJJJJJJJJJJHFDDDDDDDDDB&(((3@B&((+(3(9 @SRR1161549.10 HWI-D00148:56:C24EVACXX:5:1101:1661:2455:ACAGTG/1 ACGACGAATTTGTTTTTTTTTTTTTTTTTTTTTGGTTTTTTTTTTTTTTTT + CCCFFFFFHHHHHJJJJJJJJJJJHFDDDBBDD-0(2<@8:8B0:BD5@DB umis-1.0.8/examples/CEL-Seq/SRP036633_1.fastq.gz000066400000000000000000000011231412434634200204740ustar00rootroot00000000000000~USRP036633_1.fastqݎ0Bhx g![?Iǰ'bO3g``"//oJY(yO-4ci\Jm Cxc';y aؕO:'}uE(^d!KDNfEʏI@Ғ/Fq?[0}IB_UQUo_wfmHC\03">=q%EOݺh"dƒT7Ud4Fs$4`t3C0#&$J5Y"tz[RqR\-vFWavk= 6L廩΅óe^ې*1fܯ $eW ino%i4IHJL5%lH!i䫧Xb(::XOSNTe`NkmHձ|8 !<^r)m{:J޶ֆ8x4%#x'ĦddQZ*@pL9E Q@p ;umis-1.0.8/examples/CEL-Seq/SRP036633_2.fastq000066400000000000000000000032451412434634200200650ustar00rootroot00000000000000@SRR1161549.1 HWI-D00148:56:C24EVACXX:5:1101:1396:2277:ACAGTG/2 TGTCAGAGGTTNNNNTTTTTTTTTTTTTTTTTTTTTNNNTTTAAAAAAAAA + <<<@@@@@@@@####43=@@@????????=<;<::7###+++(+8:<75:: @SRR1161549.2 HWI-D00148:56:C24EVACXX:5:1101:1375:2292:ACAGTG/2 TCAGGAAAACAAGGGCAAAAGATTGATACGCTCTAAAGAAAAATCAGAGTC + CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJIJJJJJJJGIJJJGG @SRR1161549.3 HWI-D00148:56:C24EVACXX:5:1101:1451:2322:ACAGTG/2 TCCTTCAGTTTGCTAGTCATTGCTTTACTTACTGCCCCCAGACCTTTCCTT + CCCFFFFFHHHHHJJJIJJJJJJJJJJJJJJJJJJJJJJJJGIIJJJJJJJ @SRR1161549.4 HWI-D00148:56:C24EVACXX:5:1101:1394:2379:ACAGTG/2 TGCATCTGCAAAGAGGCTTCCGACAAGTGCAGCTGCTGTGCCTGAAGGGGG + CCCFFFFFHHHHHJJJJJJJJJGIGJJDGHJJJJJJJJIIJJJJJJIJJJJ @SRR1161549.5 HWI-D00148:56:C24EVACXX:5:1101:1488:2428:ACAGTG/2 GGAGGGCTGGCTGCCCTCCCCTTTCCTTTGCTCTTGACCACTCATGGAAGC + ?=:=+=DD:?DD;:E<3294243@@<))###---0-..6)9>> @SRR1161549.7 HWI-D00148:56:C24EVACXX:5:1101:1708:2317:ACAGTG/2 AGTCGCGGTTGGTGGGTAACAACCGAGCCAAGATGTTGCGGAATCTGCTGG + CCCFFFFFHHHHHIJJGIIJJJJJHIJJJJJJJJJJJJJJHHFFFEEEEEE @SRR1161549.8 HWI-D00148:56:C24EVACXX:5:1101:1659:2428:ACAGTG/2 CAGTCCGACGATCGTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT + CCCFFFFFHHHHHJFHIJJJJJJJJJJFDDDDDDDDDDDDDDDDDDDDDDD @SRR1161549.9 HWI-D00148:56:C24EVACXX:5:1101:1705:2446:ACAGTG/2 GTTTATGGTTCTTTTTAACCTTTTCTGGAGGGTTGGGGGGTTATTTTTGTT + CBBFFFFFDHHHHJJJJJJJJJJJJJJJHHIJEGHJJJJDBDDDDEEDDBC @SRR1161549.10 HWI-D00148:56:C24EVACXX:5:1101:1661:2455:ACAGTG/2 CAAGTCAGGGCTGGATGAATACAAATGGTTAATTAAGAGCTTGTGTGAGGG + CCCFFFFFHHHHHJIIJJJJJJJJJIJJJJIIJJJJJJIJJJJHIGHHIJJ umis-1.0.8/examples/CEL-Seq/SRP036633_2.fastq.gz000066400000000000000000000011541412434634200205010ustar00rootroot00000000000000~USRP036633_2.fastqݒ0}%! Lw+SLk$9i||LU6c$B:˵kδ"r-eQh,-7N-@XUMVhci K5Κkh.p=mTjSdZo\'HY)N~Y)pX. T_ Isg견 4X*ZrA!hE g  ~A:^)o[ R+$-yxX: FE@SdQ4.niiYjdy'HxHNM^7uoRilӽi{$=m]w4_%͓I -wSПJB;>d)QDh O"iɺ-)Qn1MiN*m[z'iAZ jYə(8P".rfnÀziiV=tlLdr@^'g4[J|SJ9ySI?v0i i1qK+c{߹HERܨk,%<&|20_ ,zWOF`l04umis-1.0.8/examples/CEL-Seq/SRP036633_expected.fastq000066400000000000000000000026271412434634200215300ustar00rootroot00000000000000@SRR1161549.1:CELL_ATGTGTCA:UMI_GAGG TGTCAGAGGTTNNNNTTTTTTTTTTTTTTTTTTTTTNNNTTTAAAAAAAAA + <<<@@@@@@@@####43=@@@????????=<;<::7###+++(+8:<75:: @SRR1161549.2:CELL_CATCAATC:UMI_TAGT TCAGGAAAACAAGGGCAAAAGATTGATACGCTCTAAAGAAAAATCAGAGTC + CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJIJJJJJJJGIJJJGG @SRR1161549.3:CELL_TCACACGC:UMI_ACCG TCCTTCAGTTTGCTAGTCATTGCTTTACTTACTGCCCCCAGACCTTTCCTT + CCCFFFFFHHHHHJJJIJJJJJJJJJJJJJJJJJJJJJJJJGIIJJJJJJJ @SRR1161549.4:CELL_TGATGCGC:UMI_GTTT TGCATCTGCAAAGAGGCTTCCGACAAGTGCAGCTGCTGTGCCTGAAGGGGG + CCCFFFFFHHHHHJJJJJJJJJGIGJJDGHJJJJJJJJIIJJJJJJIJJJJ @SRR1161549.5:CELL_GACACCGC:UMI_GCAG GGAGGGCTGGCTGCCCTCCCCTTTCCTTTGCTCTTGACCACTCATGGAAGC + ?=:=+=DD:?DD;:E<3294243@@<))###---0-..6)9>> @SRR1161549.7:CELL_GACACCGC:UMI_CGGA AGTCGCGGTTGGTGGGTAACAACCGAGCCAAGATGTTGCGGAATCTGCTGG + CCCFFFFFHHHHHIJJGIIJJJJJHIJJJJJJJJJJJJJJHHFFFEEEEEE @SRR1161549.8:CELL_TGATGCGC:UMI_GGAT CAGTCCGACGATCGTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT + CCCFFFFFHHHHHJFHIJJJJJJJJJJFDDDDDDDDDDDDDDDDDDDDDDD @SRR1161549.9:CELL_GACACGAG:UMI_GCAG GTTTATGGTTCTTTTTAACCTTTTCTGGAGGGTTGGGGGGTTATTTTTGTT + CBBFFFFFDHHHHJJJJJJJJJJJJJJJHHIJEGHJJJJDBDDDDEEDDBC @SRR1161549.10:CELL_ACGACGAA:UMI_TTTG CAAGTCAGGGCTGGATGAATACAAATGGTTAATTAAGAGCTTGTGTGAGGG + CCCFFFFFHHHHHJIIJJJJJJJJJIJJJJIIJJJJJJIJJJJHIGHHIJJ umis-1.0.8/examples/CEL-Seq/SRP036633_trimmed.fastq000066400000000000000000000026271412434634200213700ustar00rootroot00000000000000@SRR1161549.1:CELL_ATGTGTCA:UMI_GAGG TGTCAGAGGTTNNNNTTTTTTTTTTTTTTTTTTTTTNNNTTTAAAAAAAAA + <<<@@@@@@@@####43=@@@????????=<;<::7###+++(+8:<75:: @SRR1161549.2:CELL_CATCAATC:UMI_TAGT TCAGGAAAACAAGGGCAAAAGATTGATACGCTCTAAAGAAAAATCAGAGTC + CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJIJJJJJJJGIJJJGG @SRR1161549.3:CELL_TCACACGC:UMI_ACCG TCCTTCAGTTTGCTAGTCATTGCTTTACTTACTGCCCCCAGACCTTTCCTT + CCCFFFFFHHHHHJJJIJJJJJJJJJJJJJJJJJJJJJJJJGIIJJJJJJJ @SRR1161549.4:CELL_TGATGCGC:UMI_GTTT TGCATCTGCAAAGAGGCTTCCGACAAGTGCAGCTGCTGTGCCTGAAGGGGG + CCCFFFFFHHHHHJJJJJJJJJGIGJJDGHJJJJJJJJIIJJJJJJIJJJJ @SRR1161549.5:CELL_GACACCGC:UMI_GCAG GGAGGGCTGGCTGCCCTCCCCTTTCCTTTGCTCTTGACCACTCATGGAAGC + ?=:=+=DD:?DD;:E<3294243@@<))###---0-..6)9>> @SRR1161549.7:CELL_GACACCGC:UMI_CGGA AGTCGCGGTTGGTGGGTAACAACCGAGCCAAGATGTTGCGGAATCTGCTGG + CCCFFFFFHHHHHIJJGIIJJJJJHIJJJJJJJJJJJJJJHHFFFEEEEEE @SRR1161549.8:CELL_TGATGCGC:UMI_GGAT CAGTCCGACGATCGTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT + CCCFFFFFHHHHHJFHIJJJJJJJJJJFDDDDDDDDDDDDDDDDDDDDDDD @SRR1161549.9:CELL_GACACGAG:UMI_GCAG GTTTATGGTTCTTTTTAACCTTTTCTGGAGGGTTGGGGGGTTATTTTTGTT + CBBFFFFFDHHHHJJJJJJJJJJJJJJJHHIJEGHJJJJDBDDDDEEDDBC @SRR1161549.10:CELL_ACGACGAA:UMI_TTTG CAAGTCAGGGCTGGATGAATACAAATGGTTAATTAAGAGCTTGTGTGAGGG + CCCFFFFFHHHHHJIIJJJJJJJJJIJJJJIIJJJJJJIJJJJHIGHHIJJ umis-1.0.8/examples/CEL-Seq/SRP048838_1.fastq000066400000000000000000000044721412434634200201010ustar00rootroot00000000000000@SRR1610598.1 HISEQ_HU01:115:H8AC4ADXX:1:1101:1170:2105 length=51 TGATGCGCGCGGTTTTTTTTTTTTTTTTTTTTTTTTGAGGGGGGGGACGGG +SRR1610598.1 HISEQ_HU01:115:H8AC4ADXX:1:1101:1170:2105 length=51 CCCFFFFFHHHHHJJJJJJJJJHFDDDDDDDDDDDD&(+8+69B58050:5 @SRR1610598.2 HISEQ_HU01:115:H8AC4ADXX:1:1101:1190:2126 length=51 GCTCATCGTGGATTTTTTTTTTTCTTTATATTTTTAGCATCTTCCATAGCC +SRR1610598.2 HISEQ_HU01:115:H8AC4ADXX:1:1101:1190:2126 length=51 CCCFFFFFHHHHHJJJJJJJJJJDCGHGFEIJJJJJGIIGGHGHHEHHHEF @SRR1610598.3 HISEQ_HU01:115:H8AC4ADXX:1:1101:1123:2136 length=51 GCTCATCGGAACTTTTTTTTTTTTTTCTTTATATTTTTAGCATCTTCCATN +SRR1610598.3 HISEQ_HU01:115:H8AC4ADXX:1:1101:1123:2136 length=51 CCCFFFFFHGHHHJJJJJJJJJJJJF?AHB;7)?DFECA(;3;-5->@>35 @SRR1610598.4 HISEQ_HU01:115:H8AC4ADXX:1:1101:1159:2165 length=51 CTATGTCGGGTTTTTTTTTTTTAATTTTCACATATAAGTTGGATTTTAATT +SRR1610598.4 HISEQ_HU01:115:H8AC4ADXX:1:1101:1159:2165 length=51 CCCFFEFFHHHHHJJJJJJJJFGEFGHA7.=7.7.?CEEDFFBEDEEDCCC @SRR1610598.5 HISEQ_HU01:115:H8AC4ADXX:1:1101:1164:2185 length=51 GACACCGCTTCTTTTTTTTTTTTTTTTTCCCCCCCCGTTACTTTTTCCCTT +SRR1610598.5 HISEQ_HU01:115:H8AC4ADXX:1:1101:1164:2185 length=51 @@@DDD@@@@CFHIIIIIIIIIIIHEB/:@CC70&&))(+(38@@38(((( @SRR1610598.6 HISEQ_HU01:115:H8AC4ADXX:1:1101:1170:2201 length=51 GCTCATCGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT +SRR1610598.6 HISEQ_HU01:115:H8AC4ADXX:1:1101:1170:2201 length=51 @@@FFFDFHHHAHHJJJJJJHFDDDDDDDDDDDDDDDB05933:@5&0500 @SRR1610598.7 HISEQ_HU01:115:H8AC4ADXX:1:1101:1200:2208 length=51 TGCATATCATAGTTTTTTTTTTTTTTTTTTTTTTTTTCGAACCCCCTACAC +SRR1610598.7 HISEQ_HU01:115:H8AC4ADXX:1:1101:1200:2208 length=51 CCCFFFFFHHHHHIIJJJJJJJJJJJHFDDDDDDDB3>&++8@5BD&(+(+ @SRR1610598.8 HISEQ_HU01:115:H8AC4ADXX:1:1101:1140:2233 length=51 GAATCGATAGAATTTTTTTTTTTTTTTTTTTTTTTTGGGGGGGTTTTGGTT +SRR1610598.8 HISEQ_HU01:115:H8AC4ADXX:1:1101:1140:2233 length=51 CCCFFFFFHHHHHHJJJJJJJJJJJHFDDDDDDD@&&6@?@>??????????????????????????????????? @SRR1610598.2 HISEQ_HU01:115:H8AC4ADXX:1:1101:1190:2126 length=51 NNTNNCATGGCTATGGAAGATGCTAAAAATATAAAGAAAAAAAAAAATCCN +SRR1610598.2 HISEQ_HU01:115:H8AC4ADXX:1:1101:1190:2126 length=51 ##0##22@???????@@@??@??@@@@@@@@@@?????????????????< @SRR1610598.3 HISEQ_HU01:115:H8AC4ADXX:1:1101:1123:2136 length=51 TACCATGGCTATGGAAGATGCTAAAAATATAAAGAAAAAAAAAAAAAAGTC +SRR1610598.3 HISEQ_HU01:115:H8AC4ADXX:1:1101:1123:2136 length=51 @@CFFFFFHHHHHJGGGFGIGHIJJJEGAHHIG?@?<@@?@??@@???@???@@?@?????????????=<<<:: @SRR1610598.10 HISEQ_HU01:115:H8AC4ADXX:1:1101:1470:2151 length=51 ATTCAAGTAGCACAACTATATATTGCCGCTACCCCAATCCCTCCTTCCAAC +SRR1610598.10 HISEQ_HU01:115:H8AC4ADXX:1:1101:1470:2151 length=51 CCCFFFFFHHGHGJJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJJJJJ umis-1.0.8/examples/CEL-Seq/test.bash000066400000000000000000000003221412434634200172620ustar00rootroot00000000000000python umicount.py fastqtrim \ --fastq1 examples/CEL-Seq/SRP036633_1.fastq \ --fastq2 examples/CEL-Seq/SRP036633_2.fastq \ --cbs 1 --cbe 8 --mbs 9 --mbe 12 \ --outfastq examples/CEL-Seq/SRP036633_trimmed.fastq umis-1.0.8/examples/CEL-Seq/transform.json000066400000000000000000000002271412434634200203560ustar00rootroot00000000000000{ "read1": "(?P@.*) .*\\n(?P.{8})(?P.{4})(.*)\\n\\+(.*)\\n(.*)\\n", "read2": "(@.*)\\n(?P.*)\\n\\+(.*)\\n(?P.*)\\n" } umis-1.0.8/examples/CEL-Seq2/000077500000000000000000000000001412434634200155315ustar00rootroot00000000000000umis-1.0.8/examples/CEL-Seq2/SRR3195918_5.fastq000066400000000000000000000025571412434634200202600ustar00rootroot00000000000000@SRR3195918.1 SBS123:272:C5BW1ACXX:6:1101:1409:2141:UMI:GGCGT:/1 GATTATGTCAAAAGACATAATCGATTCACAAAAAA + BBBFFFFFFFFFFIFIFFIIIIIFIIIIIIIIIBB @SRR3195918.2 SBS123:272:C5BW1ACXX:6:1101:1394:2178:UMI:CGACG:/1 ATCGATTATGTCAAAAGACATAATCGATTCACAAA + BBBFFFFFFFFFFIIIIIIIIIIIIIIIIIIIFFF @SRR3195918.3 SBS123:272:C5BW1ACXX:6:1101:1807:2108:UMI:CCCGC:/1 GGTTTTGGTCTTCCTTGTTCTCTCTCTGCACATGG + BB@.*) .*UMI:(?P.{5,6}):.*\\n(?P.*)\\n\\+\\n(?P.*)\\n", "read2": null } umis-1.0.8/examples/CEL-seq2_I7sample_Indexed/000077500000000000000000000000001412434634200207725ustar00rootroot00000000000000umis-1.0.8/examples/CEL-seq2_I7sample_Indexed/CEL-seq2.R1.fastq000066400000000000000000000015771412434634200236000ustar00rootroot00000000000000@NS500123:456:HYGKFBFZY:1:11101:1640:1028 1:N:0:AGTCAA TGTATCATAGGCTTT + AAA6A#AEAEEEAEE @NS500123:456:HYGKFBFZY:1:11101:20576:1030 1:N:0:AGTCAA GAAAAAAAAAAAAAA + AAAAA#EEEEEEEEE @NS500123:456:HYGKFBFZY:1:11101:4066:1032 1:N:0:AGTCAA CTGATCTTCACGTTT + AAAAA#EEEEEEAEE @NS500123:456:HYGKFBFZY:1:11101:11491:1034 1:N:0:AGTCAA TGTAACGGTCTATTT + AAAAA#EEEEEEEEE @NS500123:456:HYGKFBFZY:1:11101:2250:1036 1:N:0:AGTCAA GTCCTGATCTGGTTT + AAAAA#EEEEEEAAE @NS500123:456:HYGKFBFZY:1:11101:24841:1038 1:N:0:AGTCAA AGGCAGTGACAGTTT + /AAAA#EEEEEEEAE @NS500123:456:HYGKFBFZY:1:11101:10794:1046 1:N:0:AGTCAA TCTCTTATAGGCTTT + AAAAA#EEEEEEEEE @NS500123:456:HYGKFBFZY:1:11101:1155:1047 1:N:0:AGTCAA ACGTTGATAGGCTTT + AAAAAEEEEEEEEEE @NS500123:456:HYGKFBFZY:1:11101:3326:1049 1:N:0:AGTCAA AAAGTGCAAGTGTTT + /A/AAEEEAEEEAEA @NS500123:456:HYGKFBFZY:1:11101:11730:1050 1:N:0:AGTCAA TTAAAAAAAAAAAAA + /AAAAEEEEEEEEEE umis-1.0.8/examples/CEL-seq2_I7sample_Indexed/CEL-seq2.R2.fastq000066400000000000000000000037131412434634200235730ustar00rootroot00000000000000@NS500123:456:HYGKFBFZY:1:11101:1640:1028 2:N:0:AGTCAA CACTTTATGTTTTAATATCCTAGGCATCTGCTGTAATAATATTTTAGAAANTGTTTGGAATTTAAGAAAT + #AAA################################################################## @NS500123:456:HYGKFBFZY:1:11101:20576:1030 2:N:0:AGTCAA CACTTTATGTTTTAATATCCTAGGCATCTGCTGTAATAATATCGTAGAAANTGTTTGGAATTTAAGAAAT + #AAAA################################################################# @NS500123:456:HYGKFBFZY:1:11101:4066:1032 2:N:0:AGTCAA CACTTTATGTTTTAATATCCTAGGCATCTGCTGTAATAATATTTTAGAAANTGTTTGGAATTTAAGAAAT + AAAAA#################################################A############### @NS500123:456:HYGKFBFZY:1:11101:11491:1034 2:N:0:AGTCAA CACTTTATGTTTTAATCGCCTAGGCATCTGCTGTAATAATATTTTAGAAANTGTTTGGAATTTAAGAAAT + AAAAA###E#####E##E##############################<#####/###########/### @NS500123:456:HYGKFBFZY:1:11101:2250:1036 2:N:0:AGTCAA CACTTTATGTTTTAATATCCTAGGCATCTGCTCGAATAATATTTTAGAAANTGTTTGGAATTTAAGAAAT + AAAAA#<#/###A#E##E##############################EA####6E#######/##/### @NS500123:456:HYGKFBFZY:1:11101:24841:1038 2:N:0:AGTCAA CACTTTATGTTTTAATATCCTAGGCATCTGCTGTAATAATATTTTAGAAANTGTTTGGAATTTAAGAAAT + AAAAAEE#EE##6#/##E#################E#####E######AE####EE#######/##/### @NS500123:456:HYGKFBFZY:1:11101:10794:1046 2:N:0:AGTCAA CACTTTATGTTTCAATATCCTAGGCATCTGCTGTAATAATATTTTAGAAANTGTTTGGAATTTAAGAAAT + A/AAAEEE6EEEAEEEE/EE/EEEEEEEEAEEEA6AEAEA6A6E66A/AA#//EEEEE/@.*) .:*.:*.:(?P.*).*\\n(?P.{6})(?P.{6})(.*)\\n\\+(.*)\\n(.*)\\n", "read2": "(?P@.*) .*\\n(?P.*)\\n\\+\\n(?P.*)\\n" } umis-1.0.8/examples/DropSeq/000077500000000000000000000000001412434634200156735ustar00rootroot00000000000000umis-1.0.8/examples/DropSeq/SRR1873278_1.fastq000066400000000000000000000032661412434634200204140ustar00rootroot00000000000000@SRR1873278.1 NS500531:16:H2L75BGXX:1:11101:5725:1038 length=20 TAGTTNGCGCGGGNGAGTAC +SRR1873278.1 NS500531:16:H2L75BGXX:1:11101:5725:1038 length=20 AA7AA#FFFFFFF#FFFFFF @SRR1873278.2 NS500531:16:H2L75BGXX:1:11101:11397:1039 length=20 CAGGGNTAGCACTNCAGGGA +SRR1873278.2 NS500531:16:H2L75BGXX:1:11101:11397:1039 length=20 AAAAA#FAFFAFF#FFFFFF @SRR1873278.3 NS500531:16:H2L75BGXX:1:11101:11354:1039 length=20 GTAATNGCATTTCNATTGGT +SRR1873278.3 NS500531:16:H2L75BGXX:1:11101:11354:1039 length=20 AA<[^\\s]+).*\\n(?P.{12})(?P.{8})(.*)\\n\\+(.*)\\n(.*)\\n", "read2": "(@.*)\\n(?P.*)\\n\\+(.*)\\n(?P.*)\\n" } umis-1.0.8/examples/Klein-inDrop/000077500000000000000000000000001412434634200166115ustar00rootroot00000000000000umis-1.0.8/examples/Klein-inDrop/klein-v3_R1.fq000066400000000000000000005307711412434634200211500ustar00rootroot00000000000000@NS500233:572:H25VKBGX2:1:11101:16195:1041 1:N:0:1 GCTTTNCATGTTGTTTTGAAGGTTCCCACNGTNANCNTTCTTGTTNACNGNNNNNTTNNNN + /AAAA#EEEEEEEEE[^\\s]+).*\\n(?P.*)\\n\\+(.*)\\n(?P.*)\\n", "read2": "(.*)\\n(?P.*)\\n(.*)\\n(.*)\\n", "read3": "(.*)\\n(?P.*)\\n(.*)\\n(.*)\\n", "read4": "(.*)\\n(?P.{8})(?P.{6})\\n(.*)\\n(.*)\\n" }umis-1.0.8/examples/MARS-Seq/000077500000000000000000000000001412434634200156065ustar00rootroot00000000000000umis-1.0.8/examples/MARS-Seq/SRP035326.fastq000066400000000000000000000032451412434634200200010ustar00rootroot00000000000000@HWI-ST808:130:H0B8YADXX:1:1101:1914:2223 1::NNNNNN:GGTCCA:AAAA TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGTTATTCTGGCGGN + @@@BDDDDFF:DF@6;BA?B=@6B########################### @HWI-ST808:130:H0B8YADXX:1:1101:2088:2222 1::NNNNNN:GGTCCA:CCCT AGGAAGATGGAGGAGAGAAGGCGGTGAAAGAGACCTGTAAAAAGCCACCGN + @@@DDBD>=AFCF+?BBDDB05307B##################### @HWI-ST808:130:H0B8YADXX:1:1101:5684:2136 1::NNNNNN:GGTCCA:GAAC GCTTAGATGGCAGGTTCAGCGGAAAGTGATCTGCATCCCCAAAAGCATCAN + ?@@BDBDB:A?F?:GEGEHGHEG?DDCGEEHIIG# @HWI-ST808:130:H0B8YADXX:1:1101:5935:2225 1::NNNNNN:GGTCCA:AGTG TCAGTTGGGGCCTTAACTTTGGTGATCAAGGATACATTCGGATGGCAAGAN + @@@DDDEBDDFHDGIIIIIIJJFGIIJIIJGGCCGFHGIIICHIEHGIIG# @HWI-ST808:130:H0B8YADXX:1:1101:6036:2098 1::NNNNNN:GGTCCA:CATT TNATAATTTGAGAGGCCTTTGCTTCAAAACGAGAAGTAATATCAGTATCGN + @#1=BDDDBFHFHIIIIIIIIIIIIIIIIIIGIGGIGHIIIIIGIDFGIG# @HWI-ST808:130:H0B8YADXX:1:1101:6367:2163 1::NNNNNN:GGTCCA:GGTA TAATGACTTGTTGGGTAGCTATTAAGGTACTAGAATTGATAAATGTGTACN + @@@?:B?ADA= @HWI-ST808:140:H0L10ADXX:1:1101:4597:2:NNNNNN:CAACCA:AAAA TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGTTCTTTCTGTCGTAT + @@FG>G@>CD=CCF @HWI-ST808:140:H0L10ADXX:1:1101:13033:2:NNNNNN:CAACCA:AAAA GAGAGTTTTTTTTTTTTTTTTTTGGTGAGGGGCAGAAAACAGGGATGAATGTAATCCT + 11:442=BFHHHDE?FHIGAEA#################################### @HWI-ST808:140:H0L10ADXX:1:1101:1718:2:NNNNNN:CTACCA:CCTT CGTCGAACCTTTCTGGCCTGGCTTGTTTGCCAAGGCTCTGGCCAATGTCAACATTGGG + ?<8BAADDAFDBFEHGBBHHIIHHFIJIGGEGGJGFCEG0?DCHBB9CB4BBFEC;@= @HWI-ST808:140:H0L10ADXX:1:1101:11558:2:NNNNNN:CTACCA:TTGT TGCCACCCCCCCTCAAACCCCACCCCCTTTCAGGTTCCTTGCTCAGCCAAGCTTGTCA + @@@DDFFFHHHHGIIJ3?GGHJJIGIGHIIGFHE>@GBEGHFHFEEECB;?C>;@@>; @HWI-ST808:140:H0L10ADXX:1:1101:18714:2:NNNNNN:CAACCA:AAAA TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTGTGTTGTCAA + ???DDDDDDDDDDDDD@9?####################################### @HWI-ST808:140:H0L10ADXX:1:1101:5802:2:NNNNNN:CTACCA:AAAA GTGACCGCAACGTGGCCAGTGTGTGTCTGCAGATCGGGTACCCAACTGTTGCCTCGGT + 8?@4=+=@@@D; @HWI-ST808:140:H0L10ADXX:1:1101:6626:2:NNNNNN:GTACCA:AAAA AGGCGATAGTCGCCATTCACAGATTTGCTCGGCAATCAGTACTGGTAGGCGTTAGACC + 1:?DF1ADFFHHHGIGHGHGIJIGGHIIGHGIIIBDFHGIGGHBAFHGCDA@E=AFCF+?BBDDB05307B##################### @HWI-ST808:130:H0B8YADXX:1:1101:5684:2136 1::NNNNNN:CELL_GGTCCA:UMI_GAAC GCTTAGATGGCAGGTTCAGCGGAAAGTGATCTGCATCCCCAAAAGCATCAN + ?@@BDBDB:A?F?:GEGEHGHEG?DDCGEEHIIG# @HWI-ST808:130:H0B8YADXX:1:1101:5935:2225 1::NNNNNN:CELL_GGTCCA:UMI_AGTG TCAGTTGGGGCCTTAACTTTGGTGATCAAGGATACATTCGGATGGCAAGAN + @@@DDDEBDDFHDGIIIIIIJJFGIIJIIJGGCCGFHGIIICHIEHGIIG# @HWI-ST808:130:H0B8YADXX:1:1101:6036:2098 1::NNNNNN:CELL_GGTCCA:UMI_CATT TNATAATTTGAGAGGCCTTTGCTTCAAAACGAGAAGTAATATCAGTATCGN + @#1=BDDDBFHFHIIIIIIIIIIIIIIIIIIGIGGIGHIIIIIGIDFGIG# @HWI-ST808:130:H0B8YADXX:1:1101:6367:2163 1::NNNNNN:CELL_GGTCCA:UMI_GGTA TAATGACTTGTTGGGTAGCTATTAAGGTACTAGAATTGATAAATGTGTACN + @@@?:B?A@.*).*:(?P.{6}):(?P.{4,8})\\n(?P.*)\\n\\+\\n(?P.*)\\n", "read2": null } umis-1.0.8/examples/MARS-Seq/transform_SRP063520.json000066400000000000000000000002021412434634200217120ustar00rootroot00000000000000{ "read1": "(?P@.*) (.*)-(.{4})-(?P.{6})-(?P.{4})/1\\n(?P.*)\\n\\+\\n(?P.*)\\n", "read2": null } umis-1.0.8/examples/SCRB-Seq/000077500000000000000000000000001412434634200155755ustar00rootroot00000000000000umis-1.0.8/examples/SCRB-Seq/scrbseq_R1.fastq000066400000000000000000000206061412434634200206450ustar00rootroot00000000000000@NS500422:158:HF57FBGXX:1:11101:19442:1039 1:N:0:0 GGACCNCGAATACTGT + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:17359:1039 1:N:0:0 TTAAGNGCACTAGAAT + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:3450:1039 1:N:0:0 AATAANGCGTCTTTC + AAAAA#EEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:19362:1039 1:N:0:0 GCCTGNTAAGTACGTG + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:20958:1040 1:N:0:0 GCTCCNGGTCCCCGG + /AAAA#EEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:15773:1040 1:N:0:0 CGGCTNATGTGGTTA + AAAAA#EEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:22339:1040 1:N:0:0 CGCGTNCGACTGTA + AAAAA#EEEEEEEE @NS500422:158:HF57FBGXX:1:11101:3719:1040 1:N:0:0 GGTGCNGCAGCAGGC + AAAAA#EEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:4222:1040 1:N:0:0 GTCGGNAGTATATCCG + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:10828:1040 1:N:0:0 GAGGGNAGTTTCTAA + AAAAA#EEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:24642:1040 1:N:0:0 GAGCCNTAATAGCTG + AAAAA#EEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:8383:1040 1:N:0:0 CTCGGNGAGGGGATTC + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:21086:1040 1:N:0:0 CAAAANGGCGCATCGG + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:1678:1040 1:N:0:0 GACGGNCGCTGGCCTG + A6A6A#EEEEEEAEEA @NS500422:158:HF57FBGXX:1:11101:8761:1040 1:N:0:0 CGTGCNAAGAGGCTA + AAAAA#EEEEEEEAE @NS500422:158:HF57FBGXX:1:11101:18610:1040 1:N:0:0 TGACGNCCATAAGGCT + AA6AA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:10551:1040 1:N:0:0 GCTGCNACTGGGATGG + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:21626:1040 1:N:0:0 AACATNTATTGGTCAT + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:5764:1041 1:N:0:0 GCCACNAAGTGTTTA + AAAAA#EEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:8046:1041 1:N:0:0 GGCGGNGCCAGCACT + AAAAA#EEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:11617:1041 1:N:0:0 TTTGTNTTGTAGT + AAAAA#EEEEEEE @NS500422:158:HF57FBGXX:1:11101:6397:1041 1:N:0:0 TAGAANGTTTATTTAT + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:11685:1041 1:N:0:0 TATTTNTTTGTAAGCC + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:6276:1041 1:N:0:0 TAAGTNAAAGCACCTT + AAAAA#AEEEEAEEEE @NS500422:158:HF57FBGXX:1:11101:2003:1041 1:N:0:0 GGGCGNATGGACAGGT + AA/AA#AAE/EE/EE6 @NS500422:158:HF57FBGXX:1:11101:9807:1041 1:N:0:0 TCCCGNTGGGCTTTTG + AAAAA#EEEE/A/EEE @NS500422:158:HF57FBGXX:1:11101:10497:1041 1:N:0:0 GCGCGNGCCATTCTTG + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:10906:1042 1:N:0:0 CCGGGNGGTTATATAC + AA/A/#EE/EAEAEEE @NS500422:158:HF57FBGXX:1:11101:16378:1042 1:N:0:0 TATCTNATTGTGTTA + AAAAA#EEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:24935:1042 1:N:0:0 TTTTANGTTGGGAGGT + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:15517:1042 1:N:0:0 ATCTANATACATCCAC + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:18824:1042 1:N:0:0 TAGATNGGGCAGCATG + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:26766:1042 1:N:0:0 GGCCANATGCGGACTT + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:11450:1042 1:N:0:0 AATTTNACTCGTCGCT + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:2596:1042 1:N:0:0 CGTCGNTGGATAATA + AAAAA#EEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:7778:1042 1:N:0:0 CGCACNTATCGCATCT + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:1074:1042 1:N:0:0 CTGCGNGGGGACATCT + AAAAA#EEEEAEEEEE @NS500422:158:HF57FBGXX:1:11101:8154:1042 1:N:0:0 AGTAANTGTTTCTATC + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:7359:1042 1:N:0:0 ATGAANATTCGTAACT + AAAAA#EAEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:15368:1043 1:N:0:0 GGTCCNAGCGTCTTTT + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:2550:1043 1:N:0:0 CTCCGNGCACTTTTAT + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:26642:1043 1:N:0:0 ATACTNTAACCATA + AAAAA#EEEEEEEE @NS500422:158:HF57FBGXX:1:11101:26099:1043 1:N:0:0 TCTAANCGATGTCGGT + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:20355:1043 1:N:0:0 ATATCNTTAAGTGAA + AAAAA#EEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:5055:1043 1:N:0:0 AATACNGATTACGGTG + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:10699:1043 1:N:0:0 AATATNGCGTGAA + AAAAA#EEEEEEE @NS500422:158:HF57FBGXX:1:11101:4401:1043 1:N:0:0 CAAAANCCCATAATCT + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:18451:1043 1:N:0:0 GGGCTNCGTTTTTTTG + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:19768:1043 1:N:0:0 CAAAANTGAGGTGCTG + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:25438:1043 1:N:0:0 GTCGCNCGTAAGGTCC + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:8868:1044 1:N:0:0 TTACANAGGAGTTTA + AAAAA#EEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:15290:1044 1:N:0:0 GGCTCNTTGGTATTC + AAAAA#EEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:11164:1044 1:N:0:0 CGTGGNTCGGGGGCCG + AAA6A#/EEA/AEEEE @NS500422:158:HF57FBGXX:1:11101:19315:1044 1:N:0:0 AATTTNTCAATAGGT + AAAAA#EEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:10414:1044 1:N:0:0 TAACANCTATAGCCTG + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:24477:1044 1:N:0:0 GGCCANAGTTATGTCT + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:20469:1044 1:N:0:0 GGAGGNACGGCAAATT + AA6AA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:3630:1044 1:N:0:0 TTGGANGGACAAGTGC + AAAAA#EEEEAEEEEE @NS500422:158:HF57FBGXX:1:11101:18327:1044 1:N:0:0 CACGCNTATTTTATAC + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:5145:1044 1:N:0:0 CCTTCNTCTTTTTGCT + AAAAA#EEEEEEEEEA @NS500422:158:HF57FBGXX:1:11101:2419:1044 1:N:0:0 AGGCCNTATTACA + AAAAA#EEEEEEE @NS500422:158:HF57FBGXX:1:11101:13059:1044 1:N:0:0 CCACANCATCGATTC + AAAAA#EEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:20619:1044 1:N:0:0 CGCCTNATTGATGTTG + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:18192:1045 1:N:0:0 TATAGNCGGTGGTTCG + AAAAA#EEEEAEEEEE @NS500422:158:HF57FBGXX:1:11101:19695:1045 1:N:0:0 TTACTNGATTCTATCG + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:9205:1045 1:N:0:0 GAGCGNGATTGTTAAT + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:5844:1045 1:N:0:0 CGTGCNAATAGAAACT + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:22259:1045 1:N:0:0 GACGGNCGCTTCAATC + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:10791:1045 1:N:0:0 GGGATNCGTTTCGCTT + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:2096:1045 1:N:0:0 ATCGANACAATGTTGT + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:23532:1045 1:N:0:0 CCCCCNAATGCTC + AAAAA#EEEEEEE @NS500422:158:HF57FBGXX:1:11101:12839:1045 1:N:0:0 CTCCGNGGTGTAAGGC + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:4296:1045 1:N:0:0 GATATNCAGGGAAGGG + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:8419:1045 1:N:0:0 CGCGGNGTTTCTTGGC + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:6865:1045 1:N:0:0 ATGAANTGATCGGT + AAAAA#EEEEEEEE @NS500422:158:HF57FBGXX:1:11101:19793:1045 1:N:0:0 ATTTANTGTGTGTAAT + AAAAA#EEEEEEEEE/ @NS500422:158:HF57FBGXX:1:11101:16745:1045 1:N:0:0 GGGGTNTCTATGCACT + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:21291:1046 1:N:0:0 ATTCANGTCTACTCT + AAAAA#EEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:14757:1046 1:N:0:0 CATAANAACGCTTTTC + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:8832:1046 1:N:0:0 CGTCGNGTTGTTTTA + AAAAA#EEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:14468:1046 1:N:0:0 AGTTANATCCTGACAT + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:5886:1046 1:N:0:0 CCCGANTCCGGGTGGG + AAAA/#EEEE/EEE/A @NS500422:158:HF57FBGXX:1:11101:16812:1046 1:N:0:0 GTCGGNGTTCCGCTGT + AAAAA#EEEEEEEEE/ @NS500422:158:HF57FBGXX:1:11101:11083:1046 1:N:0:0 GGAGGNAAGCGCAATG + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:6973:1046 1:N:0:0 TGATANTGGAATAATT + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:3588:1046 1:N:0:0 CGTGCNAGATCCGTGG + AAAAA#EEAEEEEEEE @NS500422:158:HF57FBGXX:1:11101:8739:1046 1:N:0:0 ATTTTNATTGAGATAT + AAAAA#EAEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:26209:1046 1:N:0:0 TGTAANTACTTCTACT + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:17976:1046 1:N:0:0 TAATANACCGATTAC + AAAAA#EEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:22721:1047 1:N:0:0 CGGCCNCAAGTACTCG + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:15317:1047 1:N:0:0 ATCTANGTGATGGCTT + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:1781:1047 1:N:0:0 ACGCGNATTATCCTGT + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:4354:1047 1:N:0:0 GCCTGNGTGCTTGTAT + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:16171:1047 1:N:0:0 TTACANTATAGGTTA + AAAAA#EEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:22183:1047 1:N:0:0 CTAGTNTGGTGGGGGC + AAAAA#EEEE/EEEEE @NS500422:158:HF57FBGXX:1:11101:2517:1047 1:N:0:0 TATTTNTCAGCAACT + AAAAA#EEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:24068:1047 1:N:0:0 AGCCGNTTTTGAGGGT + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:24214:1047 1:N:0:0 GAGGGNGGCTCAGGTC + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:25750:1047 1:N:0:0 AATAANTCGGTTTCGC + AAAAA#EEEEEEEEEE @NS500422:158:HF57FBGXX:1:11101:25340:1048 1:N:0:0 GCCGGNTTTACTTAT + AAAAA#EEEEEEEEE umis-1.0.8/examples/SCRB-Seq/scrbseq_R2.fastq000066400000000000000000000435261412434634200206540ustar00rootroot00000000000000@NS500422:158:HF57FBGXX:1:11101:19442:1039 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:17359:1039 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:3450:1039 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:19362:1039 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:20958:1040 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:15773:1040 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:22339:1040 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:3719:1040 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:4222:1040 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:10828:1040 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:24642:1040 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:8383:1040 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:21086:1040 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:1678:1040 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:8761:1040 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:18610:1040 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:10551:1040 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:21626:1040 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:5764:1041 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:8046:1041 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:11617:1041 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:6397:1041 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:11685:1041 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:6276:1041 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:2003:1041 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:9807:1041 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:10497:1041 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:10906:1042 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:16378:1042 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:24935:1042 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:15517:1042 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:18824:1042 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:26766:1042 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:11450:1042 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:2596:1042 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:7778:1042 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:1074:1042 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:8154:1042 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:7359:1042 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:15368:1043 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:2550:1043 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:26642:1043 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:26099:1043 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:20355:1043 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:5055:1043 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:10699:1043 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:4401:1043 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:18451:1043 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:19768:1043 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:25438:1043 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:8868:1044 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:15290:1044 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:11164:1044 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:19315:1044 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:10414:1044 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:24477:1044 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:20469:1044 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:3630:1044 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:18327:1044 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:5145:1044 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:2419:1044 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:13059:1044 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:20619:1044 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:18192:1045 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:19695:1045 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:9205:1045 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:5844:1045 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:22259:1045 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:10791:1045 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:2096:1045 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:23532:1045 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:12839:1045 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:4296:1045 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:8419:1045 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:6865:1045 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:19793:1045 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:16745:1045 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:21291:1046 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:14757:1046 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:8832:1046 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:14468:1046 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:5886:1046 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:16812:1046 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:11083:1046 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:6973:1046 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:3588:1046 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:8739:1046 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:26209:1046 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:17976:1046 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:22721:1047 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:15317:1047 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:1781:1047 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:4354:1047 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:16171:1047 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:22183:1047 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:2517:1047 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:24068:1047 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:24214:1047 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:25750:1047 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:25340:1048 2:N:0:0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ umis-1.0.8/examples/SCRB-Seq/transform.json000066400000000000000000000002171412434634200205030ustar00rootroot00000000000000{ "read1": "(@.*)\\n(?P.{6})(?P.{10})\\n\\+(.*)\\n(.*)\\n", "read2": "(?P@.*) .*\\n(?P.*)\\n\\+(.*)\\n(?P.*)\\n" }umis-1.0.8/examples/STRT-Seq/000077500000000000000000000000001412434634200156405ustar00rootroot00000000000000umis-1.0.8/examples/STRT-Seq/SRP022764_ESCell_1_ATTAGAC_single.fastq000066400000000000000000000030401412434634200241220ustar00rootroot00000000000000@SRR1043198.1 Run0197_AD24YTACXX_L2_T1101_C16/1 GTCTGGGGGTGGATGCAGGGGTGGTGGTCTGTCTCTTATACACATCTGACGC + ^^^cccccc[ccacdhcdbccW^cFabZ`dbbddhhdhhccdcccccc`c__ @SRR1043198.2 Run0197_AD24YTACXX_L2_T1101_C65/1 CTACCGGGAAGCCCCCTGACCCTCCGGGCGAAGCCAGGAGTCCGCGAAGCCG + bbbeeeeegggggiiiiiiihiiiiiiiiihiiggggeeeeddccccccccc @SRR1043198.3 Run0197_AD24YTACXX_L2_T1101_C91/1 GGTAAGCTGTCTCTTATACACATCTGACGCATTAGACGTCGTATGCCGTCTT + abbeeccegggggiiiiiiiiiiiiihiiihihihhiighhffghieghiii @SRR1043198.4 Run0197_AD24YTACXX_L2_T1101_C226/1 TGTTTGGGGAAGTAAAGCGGGGAGAGAGAGTAGGAGCAAAGAGAAAATATGG + abaeeeeeggggghiiiiiiiihiegghhhbfhiihggggeeeeeddddddc @SRR1043198.5 Run0197_AD24YTACXX_L2_T1101_C234/1 TGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTAGGCTTTCCGGTGTTGGGTA + bbbeeeeegggggiiiigeccccccccaaccOGJGGGJJRGQEHEOGQWLQG @SRR1043198.6 Run0197_AD24YTACXX_L2_T1101_C271/1 TTACCGGGACTTCTGCTGTCTCTTATACACATCTGACGCATTAGACGTCGTA + ___ceceegcggedbfege`gghfhh_gfhhhhhhhegghhhfbfbfced`` @SRR1043198.7 Run0197_AD24YTACXX_L2_T1101_C286/1 CGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT + bbbeeeeegggggiiiigeccccccU_ccccccccccccccccccccccccc @SRR1043198.8 Run0197_AD24YTACXX_L2_T1101_C332/1 TGTAAGGGGATCGGGATGCTACTAGTTTACTGCGTCACGCCTGTCTCTTATA + _bbeeeeegggggihiiiiiiifghiiiihhihhiiiihihgfgihiefhhh @SRR1043198.9 Run0197_AD24YTACXX_L2_T1101_C332/1 CAAAGGGGAAGAACGAGAATGACGCCATTCCTGCCAAGAAGGCTCAGAGTAA + __beeeeefeecchicghhiiiffegddghiihiifhhehhhichdghf\c_ @SRR1043198.10 Run0197_AD24YTACXX_L2_T1101_C395/1 GCGTTGGGTTAACAGAGAAGTTATAGGTGGATTATTTATAGTGTGATTATTG + _bbeeeeeggggghihiiiidgfhihh^egfhifiiiiiiieghihhiiigh umis-1.0.8/examples/STRT-Seq/SRP022764_transform.json000066400000000000000000000001651412434634200217610ustar00rootroot00000000000000{ "read1": "(?P[^\\s]+).*\\n(?P.{5})(?P.*)\\n\\+(.*)\\n(.{5})(?P.*)\\n", "read2": null }umis-1.0.8/examples/STRT-Seq/SRP045452_1772058148_A01.fastq000066400000000000000000000030251412434634200216030ustar00rootroot00000000000000SRR1544693.1 Run0199_AC237YACXX_L2_T1101_C/1 GATGAAGGGGGGAATTCTCTTGCTTCAACAATAACGTCTCTTTCAGAAGGCA + _@P\aceegge_ccccbcccccbbbbcccccccccccacb_`bbbcccccaa @SRR1544693.2 Run0199_AC237YACXX_L2_T1101_C15/1 GAACAGGGGGGAATTCATCTCCTCTAACTTTGGAGAGGTAGGAATGGGAGTA + b@P`ceeeggcccccccddccccccccccdcccccccc]bcccccccccc^b @SRR1544693.3 Run0199_AC237YACXX_L2_T1101_C36/1 CCACAAGGGATTTATGTATATTTTGAAGGTATGAGACCCACAAGCACAATAG + bbbeeeeeggggghiiffiiiiiiifgghfghhhiiiiidfhiiihihgiii @SRR1544693.4 Run0199_AC237YACXX_L2_T1101_C38/1 AGCTGCAGGACAGGAGAGAAGCCATACAAGTGTGAAATATGTGGGAAGACCT + _bbeeecdgfggghhhhhhgafghhhhhhhfgfgfgfgfhfagfhfhhhhhh @SRR1544693.5 Run0199_AC237YACXX_L2_T1101_C43/1 TGTAATGGGAGGCACTGGACGACCAGTTGTAGGCAGGCACGTGTCCTGCTAG + ___cceeeggefgdhhihfhhhiihighhhghha]ca`gfheeV\_V\b_Zb @SRR1544693.6 Run0199_AC237YACXX_L2_T1101_C46/1 GTTTTAGGGTGTCTTTATCCTTTGGAGACTGGGAGACTCCCCCAAAAGCCCT + a__eeeeeggggghiiiiiiiiiiiiiihiiiihihhiihiiiifhhihhih @SRR1544693.7 Run0199_AC237YACXX_L2_T1101_C61/1 GGATTAGGGGGGCATCATCTGTCCTTATAGCTCATTAGGAAGAGAAACAGTG + _b_eeeeegggccccccccccdccccccccccccccdcccccccccccccbb @SRR1544693.8 Run0199_AC237YACXX_L2_T1101_C81/1 TCTAAAGGGTGCTAGGTAAAGAGAAGGCAAGGACTCCATGCAACGTTCACAC + abbeeeeegegggihidfhihihiiiiiihiiiiiiiiiiiiiihfhihiih @SRR1544693.9 Run0199_AC237YACXX_L2_T1101_C81/1 GGCAGAGGGGGAATTCATCTCCTCTAACTTTGGAGAGGTAGGAATGGGAGTA + bbbeeeeegggggiiiiiiiiiiiiiiiiiiiihihhicfhi\\aefhii_d @SRR1544693.10 Run0199_AC237YACXX_L2_T1101_C84/1 GTTTACGGGGGGTTCTTGTTCTACTCTGCTCCATCCTCTTTTTCTTCCTCCC + ___eeeeeggfcU_accG]bbcR]`bbcccccbbccccccccc]bbW_bcccumis-1.0.8/examples/STRT-Seq/SRP045452_transform.json000066400000000000000000000002131412434634200217520ustar00rootroot00000000000000{ "read1": "(?P[^\\s]+).*\\n(?P.{6})(?PG{3,3})(?P.*)\\n\\+(.*)\\n(.{6})(.{3,3})(?P.*)\\n", "read2": null } umis-1.0.8/examples/STRT-Seq/dual_index_transform.json000066400000000000000000000003031412434634200227360ustar00rootroot00000000000000{ "read1": "(?P[^\\s]+).*\\n(?P.{3})(?P.{5})(?PG{3,3})(?P.*)\\n\\+(.*)\\n(.{3})(.{5})(.{3,3})(?P.*)\\n", "read2": "(@.*)\\n(?P.*)\\n\\+(.*)\\n(.*)\\n" } umis-1.0.8/examples/STRT-Seq/dualindex_example_1.fastq000066400000000000000000000026521412434634200226150ustar00rootroot00000000000000@Run0331_BC7Y46ACXX_L1_R1_T1101_C27083 CTCTTATACACATCTGACGCCGATGTTCGTATGCCGTCTTCTGCTTGAAAA + ^__ccdeccgegebgdaeda]ff^_PYbece^ecegV_e^bcebggfGHW\ @Run0331_BC7Y46ACXX_L1_R1_T1101_C27084 CTCTGTCTCTTATACACATCTGACGCTGACCATCGTATGCCGTCTTCTGCT + bbbeeedeeggggiighiiiihhhehhideghgiighhifhhfhihhhfff @Run0331_BC7Y46ACXX_L1_R1_T1101_C27085 AGTAGGCAAGTGTGTTGTGTTACTGTGTCAATAAACTGATTTAAAGCTGTC + a_aeeeeefgcebeghf`ceghifhggghhiiiiiiiighhiihiiiiiii @Run0331_BC7Y46ACXX_L1_R1_T1101_C27086 CGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT + aa^cceeeggcgfihefeecccccaccccccaccccccccccccccccccc @Run0331_BC7Y46ACXX_L1_R1_T1101_C27087 CTAGACCAGGGGCCTCTTCCTTTCTCCGCCATCGTGGTGCTGTCTCTTATA + bbbeeeeeggggfihiihiiiiiiiiiihifiiicgfaeghighhiiiiii @Run0331_BC7Y46ACXX_L1_R1_T1101_C27088 CGCTGTCTCTTATACACATCTGACGCTTAGGCTCGTATGCCGTCTTCTGCT + bbbeeeeegfggghhfhhghghhhiigeghhihifhgdgiihdgfgghdh` @Run0331_BC7Y46ACXX_L1_R1_T1101_C27089 CACTATAGACCACCGCCCCGAAGGGGACGAAAAATGGTTTTTAGAGAACGA + bbbeeeeefggggiiiiiiidhiiiihhighi`ghhi\dggggeeeeeccb @Run0331_BC7Y46ACXX_L1_R1_T1101_C27090 AGCCTGTCGCATTGCATTCATCAAACGCTGAATAGCAAAGCCTCTACGCGA + ___eeeeegfgggiiiiiifghhdghhiZffhiiiiiifgdhiiiefiiii @Run0331_BC7Y46ACXX_L1_R1_T1101_C27091 TTTCTAGAGGGGTGGAAGATGTTACAACTGTCTCTTATACACATCTGACGC + _bbeeeeegggg[beggfcbf^dgghfhegcegghfffiifgb_dghhhdf @Run0331_BC7Y46ACXX_L1_R1_T1101_C27092 CTACAATGGGGGACAACTCGGTGGTGGCCACTGCGCAGACCAGACTTCGCT + bbbeeeeegggggiiiihiiigghcfhhiiiiiiiifiiiiiiiiggggee umis-1.0.8/examples/STRT-Seq/dualindex_example_2.fastq000066400000000000000000000011161412434634200226100ustar00rootroot00000000000000@Run0331_BC7Y46ACXX_L1_R2_T1101_C27083 CTCTCAGG + OGGGGHHM @Run0331_BC7Y46ACXX_L1_R2_T1101_C27084 TGACCATC + \aaccdee @Run0331_BC7Y46ACXX_L1_R2_T1101_C27085 GATCAGTC + _aaeeece @Run0331_BC7Y46ACXX_L1_R2_T1101_C27086 ATGGCATG + VGGQGHHG @Run0331_BC7Y46ACXX_L1_R2_T1101_C27087 CTTGTATC + abbeeeee @Run0331_BC7Y46ACXX_L1_R2_T1101_C27088 TTAGGCTC + ^^_`cccd @Run0331_BC7Y46ACXX_L1_R2_T1101_C27089 GGCTCGAT + GGLGQLYH @Run0331_BC7Y46ACXX_L1_R2_T1101_C27090 CGCGCATA + YJJJQOHJ @Run0331_BC7Y46ACXX_L1_R2_T1101_C27091 ATCACGTC + aa_cccdc @Run0331_BC7Y46ACXX_L1_R2_T1101_C27092 ACAGTGTC + a__ececd umis-1.0.8/examples/STRT-Seq/three_file_1.fastq000066400000000000000000000024501412434634200212270ustar00rootroot00000000000000@Run0358_BC8M5VACXX_L5_R1_T1101_C15 CTGGGAAAAGTTACTGTAGCCGACGTTTTGGCGGCGCAACCAAAGA + bbbeeeccaeggeghgffbhgghhfeghhhhhhdfggeccd@@@KG @Run0358_BC8M5VACXX_L5_R1_T1101_C16 AATAAAGGGGCATACTGCAAGACTTGTAGGCCCATGAATTACCCGC + b_beeeecgggggiiiiiidgfgghhgihhhiifigcefhhhhhhi @Run0358_BC8M5VACXX_L5_R1_T1101_C17 CACCAAGGGGCTTTTCTGCTGGGTGCCGGCTGCCTGCTGTCGAGAT + ___ecceegfggcghhhhghhff[bffhiiiiiiiiibgaeffg_Z @Run0358_BC8M5VACXX_L5_R1_T1101_C20 GACGAGGGTAAGGTGGCCGACGAAAGGGAGGACATGCACCTAGTTG + ___ecceebeecgbeghhedghgffhhidfhfffhhiiiafff`gg @Run0358_BC8M5VACXX_L5_R1_T1101_C21 ATATGAGGGGGCTCTCTGCTCCTCCCTGTTCCAGAGACGGCCGAAT + @PYaccdce``gcgfeae_`ghhfhifhgdf[aecf[cfg_N_@@L @Run0358_BC8M5VACXX_L5_R1_T1101_C22 AATCTGTCTCTTATACACATCTGACGCGACACAAATCGTATGCCGT + ___c^accccc`aef`df^YbhdbgehhdeUUXafdeeeddeheaF @Run0358_BC8M5VACXX_L5_R1_T1101_C23 GCTCGGGGGAGGTAGTTAATGTGATTCATGAAGAAAGGGGAAATGG + ___cccccg[egYafefacgcXcfghhhhebcfdfhghhh_abggd @Run0358_BC8M5VACXX_L5_R1_T1101_C24 CTGCAAGCTGTCTCTTATACACATCTGACGCTTCCCCCGTCGTATG + abbececeegcccefghe`h^eedghihhiiiihX^fgfgfd_Z\e @Run0358_BC8M5VACXX_L5_R1_T1101_C33 CCCACAGGGCAAGAAAATGCCAGAAAATTCTTGGGAAAAAAAAAAA + ___cccccee@R`efecehhh[dbfeehehhhhhh@OYc@@@@@@@ @Run0358_BC8M5VACXX_L5_R1_T1101_C35 TCTCAAGGGGGCTGGAGAGATGGCTCAGCGGTTAAGAGCGCAAACT + ^\^cccccgggcgadWbZd_[[e^fffgfhhfg[a_[[H_S@@@GL umis-1.0.8/examples/STRT-Seq/three_file_2.fastq000066400000000000000000000010601412434634200212240ustar00rootroot00000000000000@Run0358_BC8M5VACXX_L5_R2_T1101_C15 AAAAAAAA + @@@@@@@@ @Run0358_BC8M5VACXX_L5_R2_T1101_C16 CTTAACAT + [[[@@Q@Q @Run0358_BC8M5VACXX_L5_R2_T1101_C17 TGAAAAAC + ZZZ@@Q@S @Run0358_BC8M5VACXX_L5_R2_T1101_C20 GTGGCAAC + a^_eec@Q @Run0358_BC8M5VACXX_L5_R2_T1101_C21 AAGGAGAT + a__cec@Q @Run0358_BC8M5VACXX_L5_R2_T1101_C22 GACACAAA + a\acecee @Run0358_BC8M5VACXX_L5_R2_T1101_C23 TTAGCGGT + ___cccec @Run0358_BC8M5VACXX_L5_R2_T1101_C24 TTCACCCG + a\acccee @Run0358_BC8M5VACXX_L5_R2_T1101_C33 AAAAAAAA + @@@@@@@@ @Run0358_BC8M5VACXX_L5_R2_T1101_C35 AAAAAAAA + @@@@@@@@ umis-1.0.8/examples/STRT-Seq/three_file_3.fastq000066400000000000000000000007641412434634200212370ustar00rootroot00000000000000@Run0358_BC8M5VACXX_L5_R3_T1101_C15 AAAAA + @@@@@ @Run0358_BC8M5VACXX_L5_R3_T1101_C16 AGAGG + @G@GL @Run0358_BC8M5VACXX_L5_R3_T1101_C17 ATAAG + @G@GQ @Run0358_BC8M5VACXX_L5_R3_T1101_C20 AGACA + @G@GG @Run0358_BC8M5VACXX_L5_R3_T1101_C21 AAAGG + @O@GG @Run0358_BC8M5VACXX_L5_R3_T1101_C22 AGGAG + PYJSJ @Run0358_BC8M5VACXX_L5_R3_T1101_C23 GGCAA + JJPSJ @Run0358_BC8M5VACXX_L5_R3_T1101_C24 AGGGA + PP\Y` @Run0358_BC8M5VACXX_L5_R3_T1101_C33 AAAAA + @@@@@ @Run0358_BC8M5VACXX_L5_R3_T1101_C35 AAAAA + @@@@@ umis-1.0.8/examples/STRT-Seq/three_file_transform.json000066400000000000000000000002771412434634200227420ustar00rootroot00000000000000{ "read1": "(?P[^\\s]+).*\\n(?P.*)\\n\\+(.*)\\n(?P.*)\\n", "read2": "(@.*)\\n(?P.*)\\n\\+(.*)\\n(.*)\\n", "read3": "(@.*)\\n(?P.*)\\n\\+(.*)\\n(.*)\\n" } umis-1.0.8/examples/SpatialTranscriptomics/000077500000000000000000000000001412434634200210205ustar00rootroot00000000000000umis-1.0.8/examples/SpatialTranscriptomics/Sample_R1.fastq000066400000000000000000000022571412434634200236510ustar00rootroot00000000000000@NS500688:260:H7V7CBGX2:3:11401:10360:1039 1:N:0:ATCACG TTACCTACCGCTCAGTGATGCCACTTCGTT + AAAAAEAAEEEEE/EEEAEEEEAEEEE6E< @NS500688:260:H7V7CBGX2:3:11401:24187:1039 1:N:0:ATCACG CCACTGTTACGGCATAGTACAATGGTAGTT + AAAAAEEEEEEEEEEEEEEEAAEEEEE/EE @NS500688:260:H7V7CBGX2:3:11401:17338:1039 1:N:0:ATCACG GCATTAACCTGATAACATAGCCCCCTAGTT + A/AAA/AEE/A/EAEE/EEEE/A/EEE//6 @NS500688:260:H7V7CBGX2:3:11401:25477:1039 1:N:0:ATCACG GACACGTCGGTCCTCCTTTGCCAGGCAGTT + A/AAAEEEEE/EEEEEEEEAEE/EEEEEEA @NS500688:260:H7V7CBGX2:3:11401:17598:1039 1:N:0:ATCACG TTGGCTCCGTGTCTAGCGTGCTAGTCGGGT + AAAAAEEEEEEEEE/EEEEEEEEEEEEEE6 @NS500688:260:H7V7CBGX2:3:11401:17911:1040 1:N:0:ATCACG CTGAAGTGTTCCGTTTCCTCACAGAGGGTT + AAA6AEEEEEEEEEAEEEEE/EAEAEEEEE @NS500688:260:H7V7CBGX2:3:11401:2903:1040 1:N:0:ATCACG CTATTTCAGCTACTTAGGTGTTTCCTAGTT + AAAAAEEAEEE/EEE/EEEEEEEAEEEA6E @NS500688:260:H7V7CBGX2:3:11401:11186:1040 1:N:0:ATCACG AGACCAGCTCCTATTTAAAGGGTCAGGGTT + AAAAAEEEEEEEEEEEAEEEEEEEEEE/EE @NS500688:260:H7V7CBGX2:3:11401:12693:1041 1:N:0:ATCACG TTGACCTACTCGTGTTATTGGTTGTGAGTT + AAAAAEEEEEEEEEEEEEEEEAEEEEE/E6 @NS500688:260:H7V7CBGX2:3:11401:22221:1041 1:N:0:ATCACG GTAACTTGATGTATACCCAGGCTGTTCGTT + AAAAAEEEAEEEEEAEEEAEEEEEEEEAEE umis-1.0.8/examples/SpatialTranscriptomics/Sample_R2.fastq000066400000000000000000000032431412434634200236460ustar00rootroot00000000000000@NS500688:260:H7V7CBGX2:3:11401:10360:1039 2:N:0:ATCACG GGGGGGCAGGGGGCGGGGCCGCCCCGGTTTCTTCAAGTGCATATCTTTCATTTAT + AAAAA/EAE/EEEEEEAAEEAE/EEEEEEAEEEEEE/E/EEEEEEEE//E@.*) .*\\n(?P.{18})(?P.{9})(.*)\\n\\+(.*)\\n(.*)\\n", "read2": "(@.*)\\n(?P.*)\\n\\+(.*)\\n(?P.*)\\n" } umis-1.0.8/examples/SureCell/000077500000000000000000000000001412434634200160345ustar00rootroot00000000000000umis-1.0.8/examples/SureCell/K562_R1.fastq000066400000000000000000000051531412434634200200710ustar00rootroot00000000000000@NB551071:153:HW3FWBGXY:1:11101:8622:1050:ACATANATAGCCATCGCATTGCAGTAAATACCTCTGAGCTGAATGAATTACGACCCTCCTGACTTTTT 1:N:0:TAAGGCGA ACATANATAGCCATCGCATTGCAGTAAATACCTCTGAGCTGAATGAATTACGACCCTCCTGACTTTTT + AAAAA#AEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE/EEEEEEEEEEEEEEA @NB551071:153:HW3FWBGXY:1:11101:10159:1051:ACCTTNTAGCCATCGCATTGCTCATCATAACTCTGAGCTGAACCGATGACGCGAGTAACGACTTTTTT 1:N:0:TAAGGCGA ACCTTNTAGCCATCGCATTGCTCATCATAACTCTGAGCTGAACCGATGACGCGAGTAACGACTTTTTT + /AAAA#EEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEAEEAEEEEEEEEE/ @NB551071:153:HW3FWBGXY:1:11101:15732:1052:GCAGANGCCTAGCCATCGCATTGCGGATTGTACCTCTGAGCTGAACCGTAAACGCTCTGACTGACTTT 1:N:0:TAAGGCGA GCAGANGCCTAGCCATCGCATTGCGGATTGTACCTCTGAGCTGAACCGTAAACGCTCTGACTGACTTT + AAAAA#EEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEE/EAEEEEEAAAEEEEEEE/ @NB551071:153:HW3FWBGXY:1:11101:24308:1053:TGCGTNAGTGTAGCGCGCGTGCAGCCCCGGACATCTAAGGGCATCACAGACCTGTTATTGCTCAATCT 1:N:0:TAAGGCGA TGCGTNAGTGTAGCGCGCGTGCAGCCCCGGACATCTAAGGGCATCACAGACCTGTTATTGCTCAATCT + AAAAA#EEEEEEEEEEEEEEEEEEEEAEEE/AE/EEEAEEEE/EE/AEE6EEEEEEEEEEEEAEEEE/ @NB551071:153:HW3FWBGXY:1:11101:22250:1053:AGGTTNTAGCCATCGCATTGCCGGTCCTACCTCTGAGCTGAAATAGCGACGCCTTCCCCGACTTTTTT 1:N:0:TAAGGCGA AGGTTNTAGCCATCGCATTGCCGGTCCTACCTCTGAGCTGAAATAGCGACGCCTTCCCCGACTTTTTT + AAAAA#EEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEA/ @NB551071:153:HW3FWBGXY:1:11101:7908:1053:ACACCNCTAGCCATCGCATTGCATACTTTACCTCTGAGCTGAATTAAGAACGTCATTTTTGACTTTTT 1:N:0:TAAGGAGA ACACCNCTAGCCATCGCATTGCATACTTTACCTCTGAGCTGAATTAAGAACGTCATTTTTGACTTTTT + 6AAAA#A/EEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEEEEEAA @NB551071:153:HW3FWBGXY:1:11101:9384:1053:AGACCNTAGCCATCGCATTGCCATAGATACCTCTGAGCTGAAGGTGCTACGGCCACTCCGACTTTTTT 1:N:0:TAAGGCGA AGACCNTAGCCATCGCATTGCCATAGATACCTCTGAGCTGAAGGTGCTACGGCCACTCCGACTTTTTT + AAAAA#EEEEEEEEAAEEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEEEAEEEEEEEEEEE/EEEEA @NB551071:153:HW3FWBGXY:1:11101:18696:1053:AGATGNTAGCCATCGCATTGCACCCAATACCTCTGAGCTGAAGCTCCCACGAGATTAGTGACTTTTTT 1:N:0:TAAGGCGA AGATGNTAGCCATCGCATTGCACCCAATACCTCTGAGCTGAAGCTCCCACGAGATTAGTGACTTTTTT + AAAAA#EEEEEEEEEEEEEEEAEEEEEEEEEEAEEEEEEEEEEEAEEEEEEEAEEEEEEEEEEEEEE/ @NB551071:153:HW3FWBGXY:1:11101:19870:1054:GCAGANCTTTAGCCATCGCATTGCAACGTGTACCTCTGAGCTGAACTCAATACGCAGATCCCGACCTT 1:N:0:TAAGGCGA GCAGANCTTTAGCCATCGCATTGCAACGTGTACCTCTGAGCTGAACTCAATACGCAGATCCCGACCTT + AAAAA#EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEEEAEEEEEEAEEEEE/EEE/ @NB551071:153:HW3FWBGXY:1:11101:22362:1054:GCAGCNGAGTAGCCATCGCATTGCCTGACTTTTTTTTTTTTTTTTTTTTTTTAAAAAAACAAAGAGAC 1:N:0:TAAGGCGA GCAGCNGAGTAGCCATCGCATTGCCTGACTTTTTTTTTTTTTTTTTTTTTTTAAAAAAACAAAGAGAC + AA/AA#EEEEAAEEE.{6})TAGCCATCGCATTGC(?P.{6})TACCTCTGAGCTGAA(?P.{6})ACG(?P.{8})GAC(.*)\\n\\+(.*)\\n(.*)\\n", "read2": "(?P@.*) .*\\n(?P.*)\\n\\+(.*)\\n(?P.*)\\n" } umis-1.0.8/examples/bamtag/000077500000000000000000000000001412434634200155515ustar00rootroot00000000000000umis-1.0.8/examples/bamtag/bamtag-xstag.bam000066400000000000000000000004561412434634200206160ustar00rootroot00000000000000BCgsrea``pp 23 J/JK,Ir r vq 1C3c3sSJ?+ccK.F~Q Ʀ!( D4eBCu @#pRawTʻXg zFjse sk|.ҴT%!۬v"! pyNԶel ;f ݺfb~ /H\D 2,&0hR?BCumis-1.0.8/examples/bamtag/bamtag.bam000066400000000000000000000004571412434634200174730ustar00rootroot00000000000000BCgsrea``pp 23 J/JK,Ir r vq 1C3c3sSJ?+ccK.F~Q Ʀ!( D4eBCu @#pRawTʻzQ%hZV=Զ:x6M絩 @]犡;2bNhX$'&˞YYO,6pvpz86{b[tANYGBCumis-1.0.8/examples/bamtag/bamtag.sam000066400000000000000000000004231412434634200175050ustar00rootroot00000000000000@HD VN:0.1 SO:coordinate @SQ SN:ENSDART00000163675.1 LN:339 NB500929:118:HHL3MBGXY:2:23110:12155:17144:CELL_GATAACCATC-GTTACCGC:UMI_ACAATC:SAMPLE_GGGTTT 0 ENSDART00000163675.1 269 255 36M * 0 0 ATGGCATAAAGCTGAAAGAAATAAAGGAGGAACACG AAAAAEAEEEEEEEEEEEEEEEEEEEEEEEE/EEEE NH:i:1 umis-1.0.8/examples/inDrop/000077500000000000000000000000001412434634200155515ustar00rootroot00000000000000umis-1.0.8/examples/inDrop/README.md000066400000000000000000000002151412434634200170260ustar00rootroot00000000000000Read structure (Klein et al) Read 1 [Barcode 1 (10bp)][W1 (22bp)][Barcode 2 (8bp)][UMI (6bp)][TTT..] Read 2 [Tag] umis-1.0.8/examples/inDrop/SRR1784317_1.fastq000066400000000000000000000032451412434634200202620ustar00rootroot00000000000000@SRR1784317.1 D6FSQ5P1:452:H9UEFADXX:1:1101:1178:2141/1 GTCNCTGCGTTGAGGCTTGCGTTTATGGTACGCTGGACTTTGTAGGATACCCTCC + CCC#4BDFHHHDFEFACECCFGHJIJHCGIFGFHIJJIIIIIHIIJGGIIJJHI( @SRR1784317.2 D6FSQ5P1:452:H9UEFADXX:1:1101:1246:2161/1 GATGGGATTCGAGTGATTGCTTGTGACGCCTTAGTGGACCACGCCTTTTTTTTTT + @@@FFFFFHGHHFGEGHGECHICGGJF2@CGHHJFIIGIJJJGIJJIJJJJJHFD @SRR1784317.3 D6FSQ5P1:452:H9UEFADXX:1:1101:1210:2166/1 AGCTTTCCAGAGTGATTGCTTGTGACGCCTTGTGCTCAGTGCTTTTTTTTTTTTT + =??DDFFFHHGHHFGIICCFIFFHIJB@CGHIGHGJCDHHIIJJJJJIJJIHDDD @SRR1784317.4 D6FSQ5P1:452:H9UEFADXX:1:1101:1150:2168/1 GAATTCCTTGGAGTGATTGCTTGTGACGCCTTAATGTTTGCCCGGATTTTTTTTT + CCCFFDFFHHDFHFGEGFAACHAFFHC<@?FHFEFIJJJJJJIGIHJHIIJJJHF @SRR1784317.5 D6FSQ5P1:452:H9UEFADXX:1:1101:1190:2190/1 GACAAGAGGGGAGTGATTGCTTGTGACGCCTTGTTACCGCCCCAGTCAAACTCCC + @CCFFFFFHHHHHIJGHGFAFHCGHGC)?DGHHIIJJIJJJJJIJHIJHHHHHHD @SRR1784317.6 D6FSQ5P1:452:H9UEFADXX:1:1101:1228:2203/1 AAGCGAAGTGAGTGATTGCTTGTGACGCCTTCTTCGCACTGTGCCTTTTTTTTTT + @@CFFDFFHHHFHIIIJFCFHEFIJJECEHHHJIIIIJJJJHIIJJJJJJJHFDD @SRR1784317.7 D6FSQ5P1:452:H9UEFADXX:1:1101:1162:2214/1 TACTTGTGGAGTGATTGCTTGTGACGCCTTCAATTAGTATATAGTTTTTTTTTTT + ?=?DFFDDHHHHHJIJGGIJIGGJJJFHHIJG?GIFGDFGHIJJJIJJJJJJHFD @SRR1784317.8 D6FSQ5P1:452:H9UEFADXX:1:1101:1181:2218/1 TCGAAGCTGAGTGATTGCTTGTGACGCCTTGGTCAGCAGCGAACTTTTTTTTTTT + B?BDDFFFHHHHHIJJGCIHGGGHIJGHHHGHGIIJIJJJJJJIJJJIIIHFDDD @SRR1784317.9 D6FSQ5P1:452:H9UEFADXX:1:1101:1211:2232/1 GACCTACTAGGAGTGATTGCTTGTGACGCCTTGTTACGATAAGAAATTTTTTTTT + CCCFFFFFHHHHHHHIIJGIIIFHJJFAGGHIJJJJJJJJJJJJJJJJJJJJJHF @SRR1784317.10 D6FSQ5P1:452:H9UEFADXX:1:1101:1340:2129/1 AGCNACCTGGAGTGATTGCTTGTGACGCCTTATCCAAAGTATGAGTTTTTTTTTT + @@@#4ADDHHFDAEGGGAFEHFHIIIIIIEHDGGIIIIIHC umis-1.0.8/examples/inDrop/SRR1784317_2.fastq000066400000000000000000000036251412434634200202650ustar00rootroot00000000000000@SRR1784317.1 D6FSQ5P1:452:H9UEFADXX:1:1101:1178:2141/2 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################### @SRR1784317.2 D6FSQ5P1:452:H9UEFADXX:1:1101:1246:2161/2 NAAAAGAAACNNNNNNNCNNCCTCGGCGCCGCCACNNCNNCCGCCCCCCNCCNCNCNNNNNNNNNNN + #0((2((264#######2##2(-(-(13(-='''-##(##-((735'9;#,,#+#+########### @SRR1784317.3 D6FSQ5P1:452:H9UEFADXX:1:1101:1210:2166/2 NTTAAATGAAAAAANAAAAAAAAAAAGCCCGACGCNNAGNCCGCCACCAGAANACCTNNNNNNNNNN + #0;=;;??><>>?:#2:;>?;;6=3'-.(.--','##((#((,(,''(((+3#((+(########## @SRR1784317.4 D6FSQ5P1:452:H9UEFADXX:1:1101:1150:2168/2 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + #####################################0############################# @SRR1784317.5 D6FSQ5P1:452:H9UEFADXX:1:1101:1190:2190/2 NCCAGGTGGGGAGTTTGACTGGGNCGGTAACAAGGNNTNNCNAGCAATCANTNCCCTCTTGTCNGAN + #079=[^\\s]+).*\\n(?P.{40})(?P.{6})(.*)\\n\\+(.*)\\n(.*)\\n", "read2": "(@.*)\\n(?P.*)\\n\\+(.*)\\n(?P.*)\\n" } umis-1.0.8/examples/paired-with-umi-read/000077500000000000000000000000001412434634200202345ustar00rootroot00000000000000umis-1.0.8/examples/paired-with-umi-read/fq_1.fq000066400000000000000000000004111412434634200214060ustar00rootroot00000000000000@D00443:224:H27VCADXY:1:1105:1483:2182 1:N:0:GCTCATTA+TATAGCCT CACACACTGCAGGAGGCAGTAGCATGGGCCTCAGAGTTTGGGTGAACACAGCTTTTAGAGGGGAGAGATGCTCCTATCCCAGGCTCAGGTCCAGGGGCC + @@@DDDDD>DD>DDHIIIEC@HHH>EHIGF7DBABD:?BGAE)??FCADHCAFH>G>AACCC<@;CAA>ACCCBB9ACC94@CCC35>@> umis-1.0.8/examples/paired-with-umi-read/fq_2.fq000066400000000000000000000004071412434634200214140ustar00rootroot00000000000000@D00443:224:H27VCADXY:1:1105:1483:2182 3:N:0:GCTCATTA+TATAGCCT NCCCGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + #0;1=############################################################################################# umis-1.0.8/examples/paired-with-umi-read/transform.json000066400000000000000000000003121412434634200231360ustar00rootroot00000000000000{ "read1": "(?P@.*)\\n(?P.*)\\n\\+(.*)\\n(?P.*)\\n", "read2": "(?P@.*)\\n(?P.*)\\n\\+(.*)\\n(?P.*)\\n", "read3": "(@.*)\\n(?P.*)\\n\\+(.*)\\n(.*)\\n" } umis-1.0.8/examples/paired-with-umi-read/umi.fq000066400000000000000000000001171412434634200213550ustar00rootroot00000000000000@D00443:224:H27VCADXY:1:1105:1483:2182 2:N:0:GCTCATTA+TATAGCCT AGCTCC + FHDDBH umis-1.0.8/examples/tagcount/000077500000000000000000000000001412434634200161425ustar00rootroot00000000000000umis-1.0.8/examples/tagcount/cb-histogram.txt000066400000000000000000000000541412434634200212610ustar00rootroot00000000000000GATAACCATC-GTTACCGC 2 GATAACCATA-GTTACCGC 1 umis-1.0.8/examples/tagcount/cb-histogram.txt.gz000066400000000000000000000001011412434634200216710ustar00rootroot00000000000000fXcb-histogram.txtsw qttvv qu ܝ9aQC.7,umis-1.0.8/examples/tagcount/gene-map.tsv000066400000000000000000000000751412434634200203730ustar00rootroot00000000000000ENSDART00000163675.1 gene1 ENSDART00000163675.2 gene1 umis-1.0.8/examples/tagcount/tagcount-example.csv000066400000000000000000000001371412434634200221350ustar00rootroot00000000000000gene,GATAACCATA-GTTACCGC,GATAACCATC-GTTACCGC ENSDART00000163675.1,1,2 ENSDART00000163675.2,0,0 umis-1.0.8/examples/tagcount/tagcount.bam000066400000000000000000000005331412434634200204500ustar00rootroot00000000000000BCvsreg``pp 23 J/JK,Ir r vq 1C3c3sSJ?+ccK4b ck+ԗ/F/>NXw;VBCumis-1.0.8/examples/tagcount/tagcount.bam.bai000066400000000000000000000001501412434634200211750ustar00rootroot00000000000000BAIIw?Jw?wumis-1.0.8/examples/tagcount/tagcount.sam000066400000000000000000000016731412434634200204770ustar00rootroot00000000000000@HD VN:0.1 SO:coordinate @SQ SN:ENSDART00000163675.1 LN:339 @SQ SN:ENSDART00000163675.2 LN:400 NB500929:118:HHL3MBGXY:2:23110:12155:17144:CELL_GATAACCATC-GTTACCGC:UMI_ACAATC:SAMPLE_GGGTTT 0 ENSDART00000163675.1 269 255 36M * 0 0 ATGGCATAAAGCTGAAAGAAATAAAGGAGGAACACG AAAAAEAEEEEEEEEEEEEEEEEEEEEEEEE/EEEE NH:i:1 NB500929:118:HHL3MBGXY:2:23110:12155:17145:CELL_GATAACCATA-GTTACCGC:UMI_ACAATA:SAMPLE_GGGTTT 0 ENSDART00000163675.1 269 255 36M * 0 0 ATGGCATAAAGCTGAAAGAAATAAAGGAGGAACACG AAAAAEAEEEEEEEEEEEEEEEEEEEEEEEE/EEEE NH:i:1 NB500929:118:HHL3MBGXY:2:23110:12255:17144:CELL_GATAACCATC-GTTACCGC:UMI_ACAAAA:SAMPLE_GGGTTT 0 ENSDART00000163675.1 269 255 36M * 0 0 ATGGCATAAAGCTGAAAGAAATAAAGGAGGAACACG AAAAAEAEEEEEEEEEEEEEEEEEEEEEEEE/EEEE NH:i:1 NB500929:118:HHL3MBGXY:2:23110:12255:17146:CELL_GATAACCATC-GTTACCGC:UMI_ACAAAA:SAMPLE_GGGTTT 0 ENSDART00000163675.1 269 255 36M * 0 0 ATGGCATAAAGCTGAAAGAAATAAAGGAGGAACACG AAAAAEAEEEEEEEEEEEEEEEEEEEEEEEE/EEEE NH:i:1 umis-1.0.8/setup.cfg000066400000000000000000000001011412434634200143110ustar00rootroot00000000000000[bdist_wheel] universal=1 [metadata] description-file = README.mdumis-1.0.8/setup.py000066400000000000000000000015351412434634200142160ustar00rootroot00000000000000import os from setuptools import setup, find_packages, Extension ext = Extension('utils', ['umis/utils.pyx']) def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() setup( name='umis', version='1.0.8', description='Package for estimating UMI counts in Transcript Tag Counting data.', packages=find_packages(), install_requires=['click', 'pysam>=0.8.3', 'pandas', 'regex', 'scipy', 'toolz'], ext_modules=[ext], setup_requires=['cython'], entry_points = { 'console_scripts': ['umis=umis.umis:umis'] }, url='https://github.com/vals/umis', author='Valentine Svensson', author_email='valentine@nxn.se', long_description=read('README.md'), package_data = { '': ['examples/*/*.json'] } ) umis-1.0.8/test.sh000066400000000000000000000132301412434634200140120ustar00rootroot00000000000000rm -r tests/results mkdir -p tests/results umis fastqtransform \ examples/MARS-Seq/transform_SRP035326.json \ examples/MARS-Seq/SRP035326.fastq \ > tests/results/test01.fq umis fastqtransform \ examples/MARS-Seq/transform_SRP035326.json \ examples/MARS-Seq/SRP035326_5.fastq \ > tests/results/test02.fq umis fastqtransform \ examples/CEL-Seq/transform.json \ examples/CEL-Seq/SRP036633_1.fastq \ examples/CEL-Seq/SRP036633_2.fastq \ > tests/results/test03.fq umis fastqtransform \ examples/DropSeq/transform.json \ examples/DropSeq/SRR1873278_1.fastq \ examples/DropSeq/SRR1873278_2.fastq \ > tests/results/test04.fq umis fastqtransform \ examples/inDrop/transform.json \ examples/inDrop/SRR1784317_1.fastq \ examples/inDrop/SRR1784317_2.fastq \ > tests/results/test05.fq umis fastqtransform \ --demuxed_cb ATTAGAC \ examples/STRT-Seq/SRP022764_transform.json \ examples/STRT-Seq/SRP022764_ESCell_1_ATTAGAC_single.fastq \ > tests/results/test06.fq umis fastqtransform \ --demuxed_cb A01 \ examples/STRT-Seq/SRP045452_transform.json \ examples/STRT-Seq/SRP045452_1772058148_A01.fastq \ > tests/results/test07.fq umis fastqtransform \ --demuxed_cb CACTGT \ examples/BATseq/transform.json \ examples/BATseq/SRR1558183_1.fastq \ examples/BATseq/SRR1558183_2.fastq \ > tests/results/test08.fq umis fastqtransform \ examples/CEL-Seq/transform.json \ examples/CEL-Seq/SRP036633_1.fastq.gz \ examples/CEL-Seq/SRP036633_2.fastq.gz \ > tests/results/test09.fq umis fastqtransform \ examples/CEL-Seq/transform.json \ examples/CEL-Seq/SRP048838_1.fastq \ examples/CEL-Seq/SRP048838_2.fastq \ > tests/results/test10.fq umis fastqtransform \ examples/STRT-Seq/dual_index_transform.json \ examples/STRT-Seq/dualindex_example_1.fastq \ examples/STRT-Seq/dualindex_example_2.fastq \ > tests/results/test11.fq umis fastqtransform \ --keep_fastq_tags \ --fastq1out tests/results/test12_1.fq \ --fastq2out tests/results/test12_2.fq \ examples/paired-with-umi-read/transform.json \ examples/paired-with-umi-read/fq_1.fq \ examples/paired-with-umi-read/fq_2.fq \ examples/paired-with-umi-read/umi.fq umis fastqtransform \ examples/SCRB-Seq/transform.json \ examples/SCRB-Seq/scrbseq_R1.fastq \ examples/SCRB-Seq/scrbseq_R2.fastq \ > tests/results/test13.fq umis fastqtransform \ --separate_cb \ examples/Klein-inDrop/transform.json \ examples/Klein-inDrop/klein-v3_R1.fq \ examples/Klein-inDrop/klein-v3_R2.fq \ examples/Klein-inDrop/klein-v3_R3.fq \ examples/Klein-inDrop/klein-v3_R4.fq \ > tests/results/test14.fq umis demultiplex_samples --nedit 1 \ --barcodes examples/Klein-inDrop/sample-index.txt \ --out_dir tests/results \ examples/Klein-inDrop/test14.fq umis fastqtransform \ examples/10XGenomics_v2/transform.json \ examples/10XGenomics_v2/test_7_R1.fastq \ examples/10XGenomics_v2/test_7_R2.fastq \ examples/10XGenomics_v2/test_7_I1.fastq \ > tests/results/test15.fq umis bamtag \ examples/bamtag/bamtag.sam \ > tests/results/test_bamtag.sam # test streaming bamtag umis bamtag - < \ examples/bamtag/bamtag.bam \ > tests/results/test_streaming_bamtag.sam # test conflicting tag/qname annotations umis bamtag - < examples/bamtag/bamtag-xstag.bam > tests/results/test-bamtag-xstag.sam umis cb_histogram \ examples/Klein-inDrop/test14.fq \ | sort -k2,2rn > tests/results/test15-cb-histogram.txt umis cb_histogram \ --umi_histogram tests/results/test15-mb-histogram.txt \ examples/Klein-inDrop/test14.fq \ | sort -k2,2rn > tests/results/test15-cb-histogram.txt sort -k3,3rn -o tests/results/test15-mb-histogram.txt tests/results/test15-mb-histogram.txt umis umi_histogram \ examples/Klein-inDrop/test14.fq \ | sort -k2,2rn > tests/results/test16-umi-histogram.txt umis tagcount \ examples/tagcount/tagcount.sam \ tests/results/test17-tagcount.txt umis tagcount \ --sparse \ examples/tagcount/tagcount.sam \ tests/results/test18-tagcount-matrixmarket.txt umis tagcount \ --cb_cutoff 1 \ --cb_histogram examples/tagcount/cb-histogram.txt \ examples/tagcount/tagcount.sam \ tests/results/test19-tagcount-cbhistogram.txt umis tagcount \ --cb_cutoff 1 \ --cb_histogram examples/tagcount/cb-histogram.txt.gz \ examples/tagcount/tagcount.sam \ tests/results/test20-tagcount-cbhistogram.txt umis fastqtransform \ --separate_cb \ examples/SureCell/transform.json \ examples/SureCell/K562_R1.fastq \ examples/SureCell/K562_R2.fastq \ > tests/results/test21.fq umis cb_filter \ --nedit 1 \ --bc1 examples/SureCell/barcodes.txt \ --bc2 examples/SureCell/barcodes.txt \ --bc3 examples/SureCell/barcodes.txt \ tests/results/test21.fq \ > tests/results/test21-filtered.fq umis fasttagcount \ --cb_cutoff 1 \ --cb_histogram examples/tagcount/cb-histogram.txt.gz \ --umi_matrix tests/results/test22-fasttagcount-umi-matrix.txt \ examples/tagcount/tagcount.bam \ tests/results/test22-fasttagcount-cbhistogram.txt umis tagcount \ --genemap examples/tagcount/gene-map.tsv \ --cb_cutoff 1 \ --cb_histogram examples/tagcount/cb-histogram.txt.gz \ examples/tagcount/tagcount.sam \ tests/results/test23-tagcount-cbhistogram-genemap.txt umis fasttagcount \ --genemap examples/tagcount/gene-map.tsv \ --cb_cutoff 1 \ --cb_histogram examples/tagcount/cb-histogram.txt.gz \ examples/tagcount/tagcount.bam \ tests/results/test24-fasttagcount-cbhistogram-genemap.txt umis demultiplex_cells \ --out_dir tests/results \ examples/Klein-inDrop/test_cell_demultiplex.fq umis sparse examples/tagcount/tagcount-example.csv tests/results/test25.mtx # only display diff output if there are differences if [[ $(diff -rq tests/results tests/correct) ]]; then diff -rq tests/results tests/correct exit 1 else echo "Tests passed." fi umis-1.0.8/tests/000077500000000000000000000000001412434634200136425ustar00rootroot00000000000000umis-1.0.8/tests/correct/000077500000000000000000000000001412434634200153035ustar00rootroot00000000000000umis-1.0.8/tests/correct/AAAGCCAC.fq000066400000000000000000000027731412434634200167470ustar00rootroot00000000000000@NS500233:572:H25VKBGX2:1:11101:24838:1041:CELL_AAAAACAT-AAACANNN:UMI_NNNNNN:SAMPLE_AAAGCCAC TCTTCCTTAATGTTATTTTCTATATAAATNACNGACAATTCAAACAATNTNNNNTACNNNN + 6AAA/A=AFCF+?BBDDB05307B##################### @HWI-ST808:130:H0B8YADXX:1:1101:5684:2136:CELL_GGTCCA:UMI_GAAC GCTTAGATGGCAGGTTCAGCGGAAAGTGATCTGCATCCCCAAAAGCATCAN + ?@@BDBDB:A?F?:GEGEHGHEG?DDCGEEHIIG# @HWI-ST808:130:H0B8YADXX:1:1101:5935:2225:CELL_GGTCCA:UMI_AGTG TCAGTTGGGGCCTTAACTTTGGTGATCAAGGATACATTCGGATGGCAAGAN + @@@DDDEBDDFHDGIIIIIIJJFGIIJIIJGGCCGFHGIIICHIEHGIIG# @HWI-ST808:130:H0B8YADXX:1:1101:6036:2098:CELL_GGTCCA:UMI_CATT TNATAATTTGAGAGGCCTTTGCTTCAAAACGAGAAGTAATATCAGTATCGN + @#1=BDDDBFHFHIIIIIIIIIIIIIIIIIIGIGGIGHIIIIIGIDFGIG# @HWI-ST808:130:H0B8YADXX:1:1101:6367:2163:CELL_GGTCCA:UMI_GGTA TAATGACTTGTTGGGTAGCTATTAAGGTACTAGAATTGATAAATGTGTACN + @@@?:B?ADA= @HWI-ST808:140:H0L10ADXX:1:1101:4597:2:NNNNNN:CELL_CAACCA:UMI_AAAA TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGTTCTTTCTGTCGTAT + @@FG>G@>CD=CCF @HWI-ST808:140:H0L10ADXX:1:1101:13033:2:NNNNNN:CELL_CAACCA:UMI_AAAA GAGAGTTTTTTTTTTTTTTTTTTGGTGAGGGGCAGAAAACAGGGATGAATGTAATCCT + 11:442=BFHHHDE?FHIGAEA#################################### @HWI-ST808:140:H0L10ADXX:1:1101:1718:2:NNNNNN:CELL_CTACCA:UMI_CCTT CGTCGAACCTTTCTGGCCTGGCTTGTTTGCCAAGGCTCTGGCCAATGTCAACATTGGG + ?<8BAADDAFDBFEHGBBHHIIHHFIJIGGEGGJGFCEG0?DCHBB9CB4BBFEC;@= @HWI-ST808:140:H0L10ADXX:1:1101:11558:2:NNNNNN:CELL_CTACCA:UMI_TTGT TGCCACCCCCCCTCAAACCCCACCCCCTTTCAGGTTCCTTGCTCAGCCAAGCTTGTCA + @@@DDFFFHHHHGIIJ3?GGHJJIGIGHIIGFHE>@GBEGHFHFEEECB;?C>;@@>; @HWI-ST808:140:H0L10ADXX:1:1101:18714:2:NNNNNN:CELL_CAACCA:UMI_AAAA TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTGTGTTGTCAA + ???DDDDDDDDDDDDD@9?####################################### @HWI-ST808:140:H0L10ADXX:1:1101:5802:2:NNNNNN:CELL_CTACCA:UMI_AAAA GTGACCGCAACGTGGCCAGTGTGTGTCTGCAGATCGGGTACCCAACTGTTGCCTCGGT + 8?@4=+=@@@D; @HWI-ST808:140:H0L10ADXX:1:1101:6626:2:NNNNNN:CELL_GTACCA:UMI_AAAA AGGCGATAGTCGCCATTCACAGATTTGCTCGGCAATCAGTACTGGTAGGCGTTAGACC + 1:?DF1ADFFHHHGIGHGHGIJIGGHIIGHGIIIBDFHGIGGHBAFHGCDA@E94243@@<))###---0-..6)9>> @SRR1161549.7:CELL_GACACCGC:UMI_CGGA AGTCGCGGTTGGTGGGTAACAACCGAGCCAAGATGTTGCGGAATCTGCTGG + CCCFFFFFHHHHHIJJGIIJJJJJHIJJJJJJJJJJJJJJHHFFFEEEEEE @SRR1161549.8:CELL_TGATGCGC:UMI_GGAT CAGTCCGACGATCGTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT + CCCFFFFFHHHHHJFHIJJJJJJJJJJFDDDDDDDDDDDDDDDDDDDDDDD @SRR1161549.9:CELL_GACACGAG:UMI_GCAG GTTTATGGTTCTTTTTAACCTTTTCTGGAGGGTTGGGGGGTTATTTTTGTT + CBBFFFFFDHHHHJJJJJJJJJJJJJJJHHIJEGHJJJJDBDDDDEEDDBC @SRR1161549.10:CELL_ACGACGAA:UMI_TTTG CAAGTCAGGGCTGGATGAATACAAATGGTTAATTAAGAGCTTGTGTGAGGG + CCCFFFFFHHHHHJIIJJJJJJJJJIJJJJIIJJJJJJIJJJJHIGHHIJJ umis-1.0.8/tests/correct/test04.fq000066400000000000000000000032331412434634200167570ustar00rootroot00000000000000@SRR1873278.1:CELL_TAGTTNGCGCGG:UMI_GNGAGTAC AAACAAACGACTCAGACGGCCTCACGAGAATCTAGACGAACTAATTAGAGAACACCAGCG + ))<>>?:#2:;>?;;6=3'-.(.--','##((#((,(,''(((+3#((+(########## @SRR1784317.4:CELL_GAATTCCTTGGAGTGATTGCTTGTGACGCCTTAATGTTTG:UMI_CCCGGA NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + #####################################0############################# @SRR1784317.5:CELL_GACAAGAGGGGAGTGATTGCTTGTGACGCCTTGTTACCGC:UMI_CCCAGT NCCAGGTGGGGAGTTTGACTGGGNCGGTAACAAGGNNTNNCNAGCAATCANTNCCCTCTTGTCNGAN + #079=AF//>/1>11BBBF10B//?00000/1<01?FFF0F########################################################################################################################################################## @SRR1558183.2:CELL_CACTGT:UMI_CACGTTTA ATCCGAGTACGCTTGTCCAGAAAACGTAATGAGGATGAGGATTCCCCAAACAAGCTCTACACGCTGGTAACTTACGTGCCTGTTACCACATTCAAAAATCTACAGACGGTCAACGTGGATGAGAACTAACCTGCTCGTGTCAAATAAAGTTGCAGAACTGCCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAGAACCCGAACGGCCCCCAAAACAAAAAAAAAAAACAACCGGGGTCCCCCCCCTCTACTAAAAAAAAAAAAACAACAA + FDFFBGGGGGGGGGGHHHGHFGHHGGGHHGHHHHGHHHHEFGHHHHHHGHHHHGHHHHHHHHFGGGGHHHHHHGHGHHGHHHHHHHHHHHHHHGFHHHHHHHHHFHHFEGGGHHGFAFFHHHHFGHHGGHFHHHBHGGHHHHHHHGEFFFHHHHHGFGGFFHB1BEECCCFGGGGGGGGGFFFADFF###################################################################################### @SRR1558183.3:CELL_CACTGT:UMI_GACAGCTG GGGATCCTTGGCCACCTTGCTTCATGATTTGATACATTTGTTGTATTCAAAAACTTGAACTGTAGGATGCCATTAAGAGTCTGTTTATATTATTGGAATATTTGTATTACAATTGTTAATAAAGGCTGGTTTAAAAACCTAAAAAAAAAAAAAAAAAAAAAAACACCCACCCAAAAAAACCACAACGGCCCCCAAAAACAATATAAAAAACCAAAACGTACCCCTTTTTTTTTAAAAAAAAAATATTCAAACTCTTATCTTATAAATTAAAAT + @@1AFFGGGGBFGGGHHHHHHHHBEG3GDGHGGFHHHHFFFGHHFEGFHBDHHGHHBEGGFFGGHFFFCGGFHHFDE1GDGHGFEHHHHHFFFHBGFDHFFHHHGHHHHHHHGHHHHHHHHHHBGGFHCHGHFFFFHGGHHHHFHGGGGGGGGGCGGGGGGG@############################################################################################################## @SRR1558183.4:CELL_CACTGT:UMI_TCCTTTGC AGGAGTCTGAAGATGACATGGGCTTTGGTCTTTTTGACTAAACTGCTTTTGTTAAGTTAGCCAATAAAGAGCTGAACCTGTAAAAAAAAAAAAAAAAAAAACAAAAGGACAAGGTACCCGCGGGGGCCCCAACACAATCATGAGATACCTCGCGGGGCCCCCACAACAAAAAAAAAAAAAAAAAAAATAAAAAAACTACGCATATTACCCAAAAAACCAACAAAACACGCAGAACACAAAAAAAAACAGAAGAACAAACGAAACATAAAGCAG + FFFFFGGGGGGGFGGHHGHHHGHHHCGFFGHHHHGGFEFHGFEEFFGHHGGHHH5DHH5FGHH2CFHFHFGBFHFHHGHHGHHHHHGGGGGGGGGGGG@C>//?F00/?#################################################################################################################################################################### @SRR1558183.5:CELL_CACTGT:UMI_TGCATATA CTCCCTCACAATTTCCATCCCAGACCCCCATAATAACAGGAGGGGCCTAGGGAGCCCTCCCTACTCTCTTGAATACCATCAATAAAGTTCGCTTCACCCAAAAAAAAAAAAAAGAAAAAAAAAAAAAAAAAAATATAACAAAAGGGATCCGAAGCGCGCCCCCACCCCCAAAATAAAACCCTAAATAGCACCCCCCTTATAAAACAAAAAAAAAAAATAAAAAAAACATAAAAATAAAAAATTACAAAAAAAAAATTAAACAAAAAACCAAAA + FFFFFGGGGGGGGCEHHHHHHHHHGGHGGGHHHHHHHHHGHGGGGGGGHHHEGGGGGHHGGHHHHHHHHHFDHHHGHHHHFHHHGGBGGHFCG1?FFGHFHFHGGGCGCFG?C/FGFHFGGGGGGGGGGGA############################################################################################################################################## @SRR1558183.6:CELL_CACTGT:UMI_CCCCCGCC CACACTTTGTGAGGACTAATGGAAAAGAGCCTGAGCTGTTGGAGCCCATTCCATACGAATTCATGGCCTAATGTACACAAAGAAATAAAATACCAGCACCAGGAAAAAAAAAAAAAAAAGGGGGGGGAAGGGGAGCGGCGGGACCCGCCCAACCCCCAGAACAAACCACGGGGCGCCCCCCCCCCACTCAAAAAAAAAAAACCAACACAACAAAAAAACCAAAGCGAAACCAAAAAGCTAACAAAACAAAGCAGACACGCTAAAAGCGCCAGA + F5DDFGGGGFGFGGGHFH4FGHF4FHGFHFHHFHHHGHHGFHHHHGGGFHHHHHGHHFCGHHHHHHHHHHFBFGHHHHHGHHHHHHHHHHHHHHHGHHHHHHGGHFHHGGGCGGGGGG?########################################################################################################################################################## @SRR1558183.7:CELL_CACTGT:UMI_ATGGGTGA ATTGAACCCATTCATCATAAGTCTTAACTCGTTAGAGATAATGTACCCATGGAGACTAGCAAAATAGTATGTAGATGTGATCTCAATTGTAAATAGAAAAATTTAATTCAATAAACTCTGTATCAGCCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCAAAAAAAAAACCCACACAGCCCACACACACAAAAAAAAAAAAAAAAAGAAATCCCTAAAAAAAAAAAAAAAAAAAAAAAAAAAATAAAAAAACAAAAAAAAA + FFFFFGGGGGGGGGGHHHHHHHFHHGHHHHGHGHGGGHGHHFHHGHHHHHHHGHHGHHHHHHHHHHGHHHHHHHHHHFBGHHHHHHHHHHGHHHHGHGGHHHEGHHHHHHHHHHHHHHHGFHHHHHHHHHHHHGFGGGGGGGG@->@DCDDGGGGGGGFFFFFFF############################################################################################################ @SRR1558183.8:CELL_CACTGT:UMI_TGGATCAG ACAGATGTCACTATCACCAACAAGTTTATCAGCCCCAACTCCCTCAACAGTACTGTCATAATAGTCACCATTACCATCCTTGTTCACTTTATTTTAGTTACTCAGTTGCTGTCATCATTAATACTAATTGTGCCTTCAAGATGTCAAAAAAAAAAAAAAAAACAAATCCAACAGTAAACCCGAGCGGCCCCCAAACCAAAAATATAAACATCCGGGGGCCCCCCATTTTTAAAAAAAAAAAAAAAAAAAAATAAAACAAAAAAACCAACAAAC + FFFFFGGGGGGGGGGHHHHHGGHHGHHGHHHHHHGGGGGHHHHHHHHHHHHHHHHFHHHHHHHHHHHHHHHHHHHHHHHGHHHHHHHHGHHHHHHHHHHHHHHHHHHHHHHHHHHHGGHHHHHHHHHHHHGHHHHHHHHHHHHHHHHHHHGGGGGGGGGGGG############################################################################################################### @SRR1558183.9:CELL_CACTGT:UMI_CCCCTTTA CTGACTGCTCTCCCAGAGGTCCTGAGTTCAATTCCCAACAACCACATGGTGACTCACAACCATCTGCAATGGGATCGGATGCCCTCTTCTGGTGTGTCTGAAGCTACAGTGTACTCATGTACATAAAATAAATAAATCTTTTAAAGAAAATAATAACAAAAAAAAAAAAAAAAAAAAAAAAAGGGGGAAAGGAAACCGGGGGGAAACGAAAAAAAAAAAGAAAACAACGGAGGGAACGACACAAAATCAAAAAAAAAAAAAAAAAAAAAAACA + BDDFFFGGFCG4EFGHHGGCFGHHHHFHGHFFHGFHFGAFGHGGGHHHHFHHFHFHHGHGCGFGHFHHHHHHHHHGGD1AEHHHGHGHHHHH3B?BFFGHFHGFFBGHHGFGDHFHHGHFFHHHHHDGDFFHHHHGHHHHHHHHGHHHHEHHHFHH1FHHHHGGGGGGGGGGGGGGGG-9B@########################################################################################### @SRR1558183.10:CELL_CACTGT:UMI_TCTTAGTC GGCCTCAGTTCCTGGCCCCAGAAACGAGATCCTGACCACATGAACAATTTGGGCTCTTTTGGGAGAATAAAAGACTTATATATTGAAAAAACAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAGGGACCCCGACGGGCCCCCAATCCAAAAATAAAAATCCCGGTGGCCCCCCCCTTATTAAAAAAAAAAAAAAAAAATAAACAAAAAAAACAAAAAAACAAAACAAAAAAACAACCCGCCCCCAGCACAAACAGCAAAAAACAAAAAATCAA + FFFFFGGGGGGGGGGHHHGGGHFHHGEGGGHHHHHHHHHHHHFHHHHHHHGHGHHHHHHHHGGHGGHHHHHHGGHHHHHHHHHHHHHHHHGGHHHHGGGGGGGGGGGGGGGC################################################################################################################################################################# umis-1.0.8/tests/correct/test09.fq000066400000000000000000000026271412434634200167720ustar00rootroot00000000000000@SRR1161549.1:CELL_ATGTGTCA:UMI_GAGG TGTCAGAGGTTNNNNTTTTTTTTTTTTTTTTTTTTTNNNTTTAAAAAAAAA + <<<@@@@@@@@####43=@@@????????=<;<::7###+++(+8:<75:: @SRR1161549.2:CELL_CATCAATC:UMI_TAGT TCAGGAAAACAAGGGCAAAAGATTGATACGCTCTAAAGAAAAATCAGAGTC + CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJIJJJJJJJGIJJJGG @SRR1161549.3:CELL_TCACACGC:UMI_ACCG TCCTTCAGTTTGCTAGTCATTGCTTTACTTACTGCCCCCAGACCTTTCCTT + CCCFFFFFHHHHHJJJIJJJJJJJJJJJJJJJJJJJJJJJJGIIJJJJJJJ @SRR1161549.4:CELL_TGATGCGC:UMI_GTTT TGCATCTGCAAAGAGGCTTCCGACAAGTGCAGCTGCTGTGCCTGAAGGGGG + CCCFFFFFHHHHHJJJJJJJJJGIGJJDGHJJJJJJJJIIJJJJJJIJJJJ @SRR1161549.5:CELL_GACACCGC:UMI_GCAG GGAGGGCTGGCTGCCCTCCCCTTTCCTTTGCTCTTGACCACTCATGGAAGC + ?=:=+=DD:?DD;:E<3294243@@<))###---0-..6)9>> @SRR1161549.7:CELL_GACACCGC:UMI_CGGA AGTCGCGGTTGGTGGGTAACAACCGAGCCAAGATGTTGCGGAATCTGCTGG + CCCFFFFFHHHHHIJJGIIJJJJJHIJJJJJJJJJJJJJJHHFFFEEEEEE @SRR1161549.8:CELL_TGATGCGC:UMI_GGAT CAGTCCGACGATCGTCTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT + CCCFFFFFHHHHHJFHIJJJJJJJJJJFDDDDDDDDDDDDDDDDDDDDDDD @SRR1161549.9:CELL_GACACGAG:UMI_GCAG GTTTATGGTTCTTTTTAACCTTTTCTGGAGGGTTGGGGGGTTATTTTTGTT + CBBFFFFFDHHHHJJJJJJJJJJJJJJJHHIJEGHJJJJDBDDDDEEDDBC @SRR1161549.10:CELL_ACGACGAA:UMI_TTTG CAAGTCAGGGCTGGATGAATACAAATGGTTAATTAAGAGCTTGTGTGAGGG + CCCFFFFFHHHHHJIIJJJJJJJJJIJJJJIIJJJJJJIJJJJHIGHHIJJ umis-1.0.8/tests/correct/test10.fq000066400000000000000000000026271412434634200167620ustar00rootroot00000000000000@SRR1610598.1:CELL_TGATGCGC:UMI_GCGG NNNNNAGGTGTAGCCAATGAAATGGGAAGAAATGGGCTACATTTTCTTATN + #####42=<@?>@?@>??????????????????????????????????? @SRR1610598.2:CELL_GCTCATCG:UMI_TGGA NNTNNCATGGCTATGGAAGATGCTAAAAATATAAAGAAAAAAAAAAATCCN + ##0##22@???????@@@??@??@@@@@@@@@@?????????????????< @SRR1610598.3:CELL_GCTCATCG:UMI_GAAC TACCATGGCTATGGAAGATGCTAAAAATATAAAGAAAAAAAAAAAAAAGTC + @@CFFFFFHHHHHJGGGFGIGHIJJJEGAHHIG?@?<@@?@??@@???@???@@?@?????????????=<<<:: @SRR1610598.10:CELL_GCTCATCG:UMI_ACCC ATTCAAGTAGCACAACTATATATTGCCGCTACCCCAATCCCTCCTTCCAAC + CCCFFFFFHHGHGJJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJJJJJ umis-1.0.8/tests/correct/test11.fq000066400000000000000000000007021412434634200167530ustar00rootroot00000000000000@Run0331_BC7Y46ACXX_L1_R1_T1101_C27087:CELL_CTACTTGTATC:UMI_GACCA GCCTCTTCCTTTCTCCGCCATCGTGGTGCTGTCTCTTATA + gfihiihiiiiiiiiiihifiiicgfaeghighhiiiiii @Run0331_BC7Y46ACXX_L1_R1_T1101_C27091:CELL_TTTATCACGTC:UMI_CTAGA GTGGAAGATGTTACAACTGTCTCTTATACACATCTGACGC + g[beggfcbf^dgghfhegcegghfffiifgb_dghhhdf @Run0331_BC7Y46ACXX_L1_R1_T1101_C27092:CELL_CTAACAGTGTC:UMI_CAATG GACAACTCGGTGGTGGCCACTGCGCAGACCAGACTTCGCT + ggiiiihiiigghcfhhiiiiiiiifiiiiiiiiggggee umis-1.0.8/tests/correct/test12_1.fq000066400000000000000000000004241412434634200171750ustar00rootroot00000000000000@D00443:224:H27VCADXY:1:1105:1483:2182:UMI_AGCTCC 1:N:0:GCTCATTA+TATAGCCT CACACACTGCAGGAGGCAGTAGCATGGGCCTCAGAGTTTGGGTGAACACAGCTTTTAGAGGGGAGAGATGCTCCTATCCCAGGCTCAGGTCCAGGGGCC + @@@DDDDD>DD>DDHIIIEC@HHH>EHIGF7DBABD:?BGAE)??FCADHCAFH>G>AACCC<@;CAA>ACCCBB9ACC94@CCC35>@> umis-1.0.8/tests/correct/test12_2.fq000066400000000000000000000004221412434634200171740ustar00rootroot00000000000000@D00443:224:H27VCADXY:1:1105:1483:2182:UMI_AGCTCC 3:N:0:GCTCATTA+TATAGCCT NCCCGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + #0;1=############################################################################################# umis-1.0.8/tests/correct/test13.fq000066400000000000000000000337531412434634200167710ustar00rootroot00000000000000@NS500422:158:HF57FBGXX:1:11101:19442:1039:CELL_GGACCN:UMI_CGAATACTGT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:17359:1039:CELL_TTAAGN:UMI_GCACTAGAAT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:19362:1039:CELL_GCCTGN:UMI_TAAGTACGTG NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:4222:1040:CELL_GTCGGN:UMI_AGTATATCCG NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:8383:1040:CELL_CTCGGN:UMI_GAGGGGATTC NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:21086:1040:CELL_CAAAAN:UMI_GGCGCATCGG NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:1678:1040:CELL_GACGGN:UMI_CGCTGGCCTG NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:18610:1040:CELL_TGACGN:UMI_CCATAAGGCT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:10551:1040:CELL_GCTGCN:UMI_ACTGGGATGG NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:21626:1040:CELL_AACATN:UMI_TATTGGTCAT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:6397:1041:CELL_TAGAAN:UMI_GTTTATTTAT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:11685:1041:CELL_TATTTN:UMI_TTTGTAAGCC NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:6276:1041:CELL_TAAGTN:UMI_AAAGCACCTT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:2003:1041:CELL_GGGCGN:UMI_ATGGACAGGT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:9807:1041:CELL_TCCCGN:UMI_TGGGCTTTTG NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:10497:1041:CELL_GCGCGN:UMI_GCCATTCTTG NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:10906:1042:CELL_CCGGGN:UMI_GGTTATATAC NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:24935:1042:CELL_TTTTAN:UMI_GTTGGGAGGT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:15517:1042:CELL_ATCTAN:UMI_ATACATCCAC NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:18824:1042:CELL_TAGATN:UMI_GGGCAGCATG NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:26766:1042:CELL_GGCCAN:UMI_ATGCGGACTT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:11450:1042:CELL_AATTTN:UMI_ACTCGTCGCT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:7778:1042:CELL_CGCACN:UMI_TATCGCATCT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:1074:1042:CELL_CTGCGN:UMI_GGGGACATCT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:8154:1042:CELL_AGTAAN:UMI_TGTTTCTATC NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:7359:1042:CELL_ATGAAN:UMI_ATTCGTAACT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:15368:1043:CELL_GGTCCN:UMI_AGCGTCTTTT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:2550:1043:CELL_CTCCGN:UMI_GCACTTTTAT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:26099:1043:CELL_TCTAAN:UMI_CGATGTCGGT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:5055:1043:CELL_AATACN:UMI_GATTACGGTG NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:4401:1043:CELL_CAAAAN:UMI_CCCATAATCT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:18451:1043:CELL_GGGCTN:UMI_CGTTTTTTTG NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:19768:1043:CELL_CAAAAN:UMI_TGAGGTGCTG NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:25438:1043:CELL_GTCGCN:UMI_CGTAAGGTCC NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:11164:1044:CELL_CGTGGN:UMI_TCGGGGGCCG NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:10414:1044:CELL_TAACAN:UMI_CTATAGCCTG NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:24477:1044:CELL_GGCCAN:UMI_AGTTATGTCT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:20469:1044:CELL_GGAGGN:UMI_ACGGCAAATT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:3630:1044:CELL_TTGGAN:UMI_GGACAAGTGC NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:18327:1044:CELL_CACGCN:UMI_TATTTTATAC NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:5145:1044:CELL_CCTTCN:UMI_TCTTTTTGCT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:20619:1044:CELL_CGCCTN:UMI_ATTGATGTTG NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:18192:1045:CELL_TATAGN:UMI_CGGTGGTTCG NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:19695:1045:CELL_TTACTN:UMI_GATTCTATCG NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:9205:1045:CELL_GAGCGN:UMI_GATTGTTAAT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:5844:1045:CELL_CGTGCN:UMI_AATAGAAACT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:22259:1045:CELL_GACGGN:UMI_CGCTTCAATC NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:10791:1045:CELL_GGGATN:UMI_CGTTTCGCTT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:2096:1045:CELL_ATCGAN:UMI_ACAATGTTGT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:12839:1045:CELL_CTCCGN:UMI_GGTGTAAGGC NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:4296:1045:CELL_GATATN:UMI_CAGGGAAGGG NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:8419:1045:CELL_CGCGGN:UMI_GTTTCTTGGC NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:19793:1045:CELL_ATTTAN:UMI_TGTGTGTAAT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:16745:1045:CELL_GGGGTN:UMI_TCTATGCACT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:14757:1046:CELL_CATAAN:UMI_AACGCTTTTC NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:14468:1046:CELL_AGTTAN:UMI_ATCCTGACAT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:5886:1046:CELL_CCCGAN:UMI_TCCGGGTGGG NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:16812:1046:CELL_GTCGGN:UMI_GTTCCGCTGT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:11083:1046:CELL_GGAGGN:UMI_AAGCGCAATG NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:6973:1046:CELL_TGATAN:UMI_TGGAATAATT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:3588:1046:CELL_CGTGCN:UMI_AGATCCGTGG NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:8739:1046:CELL_ATTTTN:UMI_ATTGAGATAT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:26209:1046:CELL_TGTAAN:UMI_TACTTCTACT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:22721:1047:CELL_CGGCCN:UMI_CAAGTACTCG NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:15317:1047:CELL_ATCTAN:UMI_GTGATGGCTT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:1781:1047:CELL_ACGCGN:UMI_ATTATCCTGT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:4354:1047:CELL_GCCTGN:UMI_GTGCTTGTAT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:22183:1047:CELL_CTAGTN:UMI_TGGTGGGGGC NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:24068:1047:CELL_AGCCGN:UMI_TTTTGAGGGT NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:24214:1047:CELL_GAGGGN:UMI_GGCTCAGGTC NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ @NS500422:158:HF57FBGXX:1:11101:25750:1047:CELL_AATAAN:UMI_TCGGTTTCGC NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN + ################################################################ umis-1.0.8/tests/correct/test14.fq000066400000000000000000006530111412434634200167650ustar00rootroot00000000000000@NS500233:572:H25VKBGX2:1:11101:16195:1041:CELL_AGGGGGGG-ATATNNNN:UMI_NNNNNN:SAMPLE_ATCGCCGG GCTTTNCATGTTGTTTTGAAGGTTCCCACNGTNANCNTTCTTGTTNACNGNNNNNTTNNNN + /AAAA#EEEEEEEEE.*)\\n(.*)\\n\\+\\n(.*)\\n') kept = [] for read in chunk: match = parser_re.search(read).groupdict() sample = match['SB'] if sample not in barcodes: continue kept.append(read) return kept def correcting_sample_filter2(chunk, barcodehash): parser_re = re.compile('(.*):CELL_(.*):UMI_(.*):SAMPLE_(?P.*)\\n(.*)\\n\\+\\n(.*)\\n') kept = [] for read in chunk: match = parser_re.search(read).groupdict() sample = match['SB'] barcodecorrected = barcodehash[sample] if not barcodecorrected: continue correctbc = barcodecorrected if correctbc == match['SB']: kept.append(read) else: read = read.replace("SAMPLE_" + match['SB'], "SAMPLE_" + correctbc) kept.append(read) return kept def exact_sample_filter(read, barcodes): parser_re = re.compile('(.*):CELL_(.*):UMI_(.*):SAMPLE_(?P.*)\\n(.*)\\n\\+\\n(.*)\\n') match = parser_re.search(read).groupdict() sample = match['SB'] if sample not in barcodes: return None return read def umi_filter(chunk): parser_re = re.compile('(.*):CELL_(.*):UMI_(?P.*):SAMPLE_(.*)\\n(.*)\\n\\+\\n(.*)\\n') kept = [] for read in chunk: match = parser_re.search(read).groupdict() MB = match['MB'] if not acgt_match(MB): continue else: kept.append(read) return kept def append_uids(chunk): parser_re = re.compile('(.*):CELL_(?P.*):UMI_(?P.*):SAMPLE_(?P.*)\\n(.*)\\n\\+\\n(.*)\\n') kept = [] for read in chunk: match = parser_re.search(read).groupdict() CB = match['CB'] MB = match['MB'] SB = match['SB'] sample = "SAMPLE_"+ match['SB'] idx = read.find(sample)+len(sample) read = read[:idx]+":UID_" + SB + CB + MB+ read[idx:] kept.append(read) return kept def correcting_sample_filter(read, barcodehash): parser_re = re.compile('(.*):CELL_(.*):UMI_(.*):SAMPLE_(?P.*)\\n(.*)\\n\\+\\n(.*)\\n') match = parser_re.search(read).groupdict() sample = match['SB'] barcodecorrected = barcodehash[sample] if not barcodecorrected: return None correctbc = barcodecorrected if correctbc == match['SB']: return(read) else: read = read.replace("SAMPLE_" + match['SB'], "SAMPLE_" + correctbc) return(read) class MutationHash(object): def __init__(self, strings, nedit): self.hash = mutationhash(strings, nedit) def __getitem__(self, barcode): result = self.hash[barcode] if len(result) != 1: return None else: return list(result)[0] def mutationhash(strings, nedit): """ produce a hash with each key a nedit distance substitution for a set of strings. values of the hash is the set of strings the substitution could have come from """ maxlen = max([len(string) for string in strings]) indexes = generate_idx(maxlen, nedit) muthash = defaultdict(set) for string in strings: muthash[string].update([string]) for x in substitution_set(string, indexes): muthash[x].update([string]) return muthash def substitution_set(string, indexes): """ for a string, return a set of all possible substitutions """ strlen = len(string) return {mutate_string(string, x) for x in indexes if valid_substitution(strlen, x)} def valid_substitution(strlen, index): """ skip performing substitutions that are outside the bounds of the string """ values = index[0] return all([strlen > i for i in values]) def generate_idx(maxlen, nedit): """ generate all possible nedit edits of a string. each item has the form ((index1, index2), 'A', 'G') for nedit=2 index1 will be replaced by 'A', index2 by 'G' this covers all edits < nedit as well since some of the specified substitutions will not change the base """ ALPHABET = ["A", "C", "G", "T", "N"] indexlists = [] ALPHABETS = [ALPHABET for x in range(nedit)] return list(itertools.product(itertools.combinations(range(maxlen), nedit), *ALPHABETS)) def acgt_match(string): """ returns True if sting consist of only "A "C" "G" "T" """ search = re.compile(r'[^ACGT]').search return not bool(search(string)) def mutate_string(string, tomutate): strlist = list(string) for i, idx in enumerate(tomutate[0]): strlist[idx] = tomutate[i+1] return "".join(strlist) umis-1.0.8/umis/umis.py000066400000000000000000001433431412434634200150140ustar00rootroot00000000000000#!/usr/bin/env python from __future__ import print_function import os import itertools import collections import regex as re import json import gzip import sys import logging import time import multiprocessing import tempfile from io import BufferedReader, TextIOWrapper from functools import partial import toolz as tz from .barcodes import (exact_barcode_filter, correcting_barcode_filter, exact_sample_filter, correcting_sample_filter, exact_sample_filter2, correcting_sample_filter2, umi_filter, append_uids, MutationHash) import numpy as np import scipy.io, scipy.sparse import click VERSION = "1.0.7" logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) BarcodeInfo = collections.namedtuple("BarcodeInfo", ["bamtag", "readprefix"]) BARCODEINFO = {"sample": BarcodeInfo(bamtag="XS", readprefix="SAMPLE"), "cellular": BarcodeInfo(bamtag="XC", readprefix="CELL"), "molecular": BarcodeInfo(bamtag="RX", readprefix="UMI")} def open_gzipsafe(f): if is_python3(): return gzip.open(f, mode="rt") if f.endswith(".gz") else open(f) else: return gzip.open(f) if f.endswith(".gz") else open(f) def safe_makedir(dname): """Make a directory if it doesn't exist, handling concurrent race conditions. """ if not dname: return dname num_tries = 0 max_tries = 5 while not os.path.exists(dname): # we could get an error here if multiple processes are creating # the directory at the same time. Grr, concurrency. try: os.makedirs(dname) except OSError: if num_tries > max_tries: raise num_tries += 1 time.sleep(2) return dname def stream_fastq(file_handler): ''' Generator which gives all four lines if a fastq read as one string ''' next_element = '' for i, line in enumerate(file_handler): next_element += line if i % 4 == 3: yield next_element next_element = '' def read_fastq(filename): """ return a stream of FASTQ entries, handling gzipped and empty files """ if not filename: return itertools.cycle((None,)) if filename == "-": filename_fh = sys.stdin elif filename.endswith('gz'): if is_python3(): filename_fh = gzip.open(filename, mode='rt') else: filename_fh = BufferedReader(gzip.open(filename, mode='rt')) else: filename_fh = open(filename) return stream_fastq(filename_fh) def read_cbhistogram(filename): if not filename: return None if filename.endswith('gz'): # filename_fh = BufferedReader(gzip.open(filename, mode='rt')) filename_fh = gzip.open(filename, mode='rt') else: filename_fh = open(filename) return filename_fh def write_fastq(filename): """ return a handle for FASTQ writing, handling gzipped files """ if filename: if filename.endswith('gz'): filename_fh = gzip.open(filename, mode='wt') else: filename_fh = open(filename, mode='w') else: filename_fh = None return filename_fh def stream_bamfile(sam, transcript=None): sam_file = open_bamfile(sam) if transcript: track = sam_file.fetch(transcript) else: track = sam_file.fetch(until_eof=True) return track def open_bamfile(sam): from pysam import AlignmentFile sam_mode = 'r' if sam.endswith(".sam") else 'rb' return AlignmentFile(sam, mode=sam_mode) def detect_alignment_annotations(queryalignment, tags=False): """ detects the annotations present in a SAM file, inspecting either the tags or the query names and returns a set of annotations present """ annotations = set() for k, v in BARCODEINFO.items(): if tags: if queryalignment.has_tag(v.bamtag): annotations.add(k) else: if v.readprefix in queryalignment.qname: annotations.add(k) return annotations def detect_fastq_annotations(fastq_file): """ detects annotations preesent in a FASTQ file by examining the first read """ annotations = set() queryread = tz.first(read_fastq(fastq_file)) for k, v in BARCODEINFO.items(): if v.readprefix in queryread: annotations.add(k) return annotations def construct_transformed_regex(annotations): """ construct a regex that matches possible fields in a transformed file annotations is a set of which keys in BARCODEINFO are present in the file """ re_string = '.*' if "cellular" in annotations: re_string += ":CELL_(?P.*)" if "molecular" in annotations: re_string += ":UMI_(?P\w*)" if "sample" in annotations: re_string += ":SAMPLE_(?P\w*)" if re_string == ".*": logger.error("No annotation present on this file, aborting.") sys.exit(1) return re_string @click.command() @click.argument('transform', required=True) @click.argument('fastq1', required=True) @click.argument('fastq2', default=None, required=False) @click.argument('fastq3', default=None, required=False) @click.argument('fastq4', default=None, required=False) @click.option('--keep_fastq_tags', default=False, is_flag=True) @click.option('--separate_cb', is_flag=True, help="Keep dual index barcodes separate.") @click.option('--demuxed_cb', default=None) @click.option('--cores', default=1) @click.option('--fastq1out', default=None) @click.option('--fastq2out', default=None) @click.option('--min_length', default=1, help="Minimum length of read to keep.") def fastqtransform(transform, fastq1, fastq2, fastq3, fastq4, keep_fastq_tags, separate_cb, demuxed_cb, cores, fastq1out, fastq2out, min_length): ''' Transform input reads to the tagcounts compatible read layout using regular expressions as defined in a transform file. Outputs new format to stdout. ''' transform = json.load(open(transform)) options = _infer_transform_options(transform) read_template = '{name}' logger.info("Transforming %s." % fastq1) if options.dual_index and options.CB: logger.info("Detected dual cellular indexes.") if separate_cb: read_template += ':CELL_{CB1}-{CB2}' else: read_template += ':CELL_{CB}' elif options.triple_index: logger.info("Detected triple cellular indexes.") if separate_cb: read_template += ':CELL_{CB1}-{CB2}-{CB3}' else: read_template += ':CELL_{CB}' elif options.CB or demuxed_cb: logger.info("Detected cellular barcodes.") read_template += ':CELL_{CB}' if options.MB and options.dual_index: logger.info("Detected dual UMI.") read_template += ':UMI_{MB1}-{MB2}' elif options.MB: logger.info("Detected UMI.") read_template += ":UMI_{MB}" if options.SB: logger.info("Detected sample.") read_template += ':SAMPLE_{SB}' read_template += "{readnum}" if keep_fastq_tags: read_template += ' {fastqtag}' read_template += '\n{seq}\n+\n{qual}\n' paired = fastq1out and fastq2out read1_regex = re.compile(transform['read1']) read2_regex = re.compile(transform['read2']) if fastq2 else None read3_regex = re.compile(transform['read3']) if fastq3 else None read4_regex = re.compile(transform['read4']) if fastq4 else None fastq_file1 = read_fastq(fastq1) fastq_file2 = read_fastq(fastq2) fastq_file3 = read_fastq(fastq3) fastq_file4 = read_fastq(fastq4) transform = partial(transformer, read1_regex=read1_regex, read2_regex=read2_regex, read3_regex=read3_regex, read4_regex=read4_regex, paired=paired) fastq1out_fh = write_fastq(fastq1out) fastq2out_fh = write_fastq(fastq2out) p = multiprocessing.Pool(cores) try : zzip = itertools.izip except AttributeError: zzip = zip chunks = tz.partition_all(10000, zzip(fastq_file1, fastq_file2, fastq_file3, fastq_file4)) bigchunks = tz.partition_all(cores, chunks) for bigchunk in bigchunks: for chunk in p.map(transform, list(bigchunk)): if paired: for read1_dict, read2_dict in tz.partition(2, chunk): if options.dual_index and options.CB: if not separate_cb: read1_dict['CB'] = read1_dict['CB1'] + read1_dict['CB2'] read2_dict['CB'] = read2_dict['CB1'] + read2_dict['CB2'] if demuxed_cb: read1_dict['CB'] = demuxed_cb read2_dict['CB'] = demuxed_cb if options.dual_index and options.MB: read1_dict['MB'] = read1_dict['MB1'] + read2_dict['MB2'] # Deal with spaces in read names if keep_fastq_tags: name, tag = read1_dict['name'].split(' ') read1_dict['name'] = name read1_dict['fastqtag'] = tag name, tag = read2_dict['name'].split(' ') read2_dict['name'] = name read2_dict['fastqtag'] = tag else: read1_dict['name'] = read1_dict['name'].partition(' ')[0] read2_dict['name'] = read2_dict['name'].partition(' ')[0] read1_dict = _extract_readnum(read1_dict) read2_dict = _extract_readnum(read2_dict) tooshort = (len(read1_dict['seq']) < min_length or len(read2_dict['seq']) < min_length) if not tooshort: fastq1out_fh.write(read_template.format(**read1_dict)) fastq2out_fh.write(read_template.format(**read2_dict)) else: for read1_dict in chunk: if options.dual_index and options.CB: if not separate_cb: read1_dict['CB'] = read1_dict['CB1'] + read1_dict['CB2'] if demuxed_cb: read1_dict['CB'] = demuxed_cb if options.dual_index and options.MB: read1_dict['MB'] = read1_dict['MB1'] + read1_dict['MB2'] # Deal with spaces in read names if keep_fastq_tags: name, tag = read1_dict['name'].split(' ') read1_dict['name'] = name read1_dict['fastqtag'] = tag else: read1_dict['name'] = read1_dict['name'].partition(' ')[0] read1_dict = _extract_readnum(read1_dict) if len(read1_dict['seq']) >= min_length: if fastq1out_fh: fastq1out_fh.write(read_template.format(**read1_dict)) else: sys.stdout.write(read_template.format(**read1_dict)) def _is_umi_only(options): return options.MB and not options.CB def _infer_transform_options(transform): """ figure out what transform options should be by examining the provided regexes for keywords """ TransformOptions = collections.namedtuple("TransformOptions", ['CB', 'dual_index', 'triple_index', 'MB', 'SB']) CB = False SB = False MB = False dual_index = False triple_index = False for rx in transform.values(): if not rx: continue if "CB1" in rx: if "CB3" in rx: triple_index = True else: dual_index = True if "MB1" in rx: dual_index = True if "SB" in rx: SB = True if "CB" in rx: CB = True if "MB" in rx: MB = True return TransformOptions(CB=CB, dual_index=dual_index, triple_index=triple_index, MB=MB, SB=SB) def _extract_readnum(read_dict): """Extract read numbers from old-style fastqs. Handles read 1 and 2 specifications where naming is readname/1 readname/2 """ pat = re.compile(r"(?P/\d+)$") parts = pat.split(read_dict["name"]) if len(parts) == 3: name, readnum, endofline = parts read_dict["name"] = name read_dict["readnum"] = readnum else: read_dict["readnum"] = "" return read_dict def transformer(chunk, read1_regex, read2_regex, read3_regex, read4_regex, paired=False): # Parse the reads with the regexes update_keys = ("MB", "CB", "CB1", "CB2", "SP", "MB1", "MB2") reads = [] for read1, read2, read3, read4 in chunk: read1_match = read1_regex.search(read1) if not read1_match: continue read1_dict = read1_match.groupdict() if read2_regex: read2_match = read2_regex.search(read2) if not read2_match: continue read2_dict = read2_match.groupdict() else: read2_dict = dict() if read3_regex: read3_match = read3_regex.search(read3) if not read3_match: continue read3_dict = read3_match.groupdict() else: read3_dict = dict() if read4_regex: read4_match = read4_regex.search(read4) if not read4_match: continue read4_dict = read4_match.groupdict() else: read4_dict = dict() if paired: read1_dict.update({k: v for k, v in read2_dict.items() if k not in read1_dict}) read1_dict.update({k: v for k, v in read3_dict.items() if k not in read1_dict}) read1_dict.update({k: v for k, v in read4_dict.items() if k not in read1_dict}) read2_dict.update({k: v for k, v in read1_dict.items() if k not in read2_dict}) read2_dict.update({k: v for k, v in read3_dict.items() if k not in read2_dict}) read2_dict.update({k: v for k, v in read4_dict.items() if k not in read2_dict}) else: read1_dict.update(read2_dict) read1_dict.update(read3_dict) read1_dict.update(read4_dict) # Output the restrutured read reads.append(read1_dict) if paired: reads.append(read2_dict) return reads @click.command() @click.argument('sam') @click.argument('out') @click.option('--genemap', required=False, default=None, help=('A TSV file mapping transcript ids to gene ids. If ' 'provided expression will be summarised to gene level ' '(recommended).')) @click.option('--output_evidence_table', default=None) @click.option('--positional', default=False, is_flag=True) @click.option('--minevidence', required=False, default=1.0, type=float) @click.option('--cb_histogram', default=None, help=('A TSV file with CBs and a count. If the counts are ' 'are the number of reads at a CB, the cb_cutoff option ' 'can be used to filter out CBs to be counted.')) @click.option('--cb_cutoff', default=None, help=("Number of counts to filter cellular barcodes. Set to " "'auto' to calculate a cutoff automatically.")) @click.option('--no_scale_evidence', default=False, is_flag=True) @click.option('--subsample', required=False, default=None, type=int) @click.option('--sparse', is_flag=True, default=False, help="Ouput counts in MatrixMarket format.") @click.option('--parse_tags', required=False, is_flag=True, help=('Parse BAM tags in stead of read name. In this mode ' 'the optional tags UM and CR will be used for UMI and ' 'cell barcode, respetively.')) @click.option('--gene_tags', required=False, is_flag=True, help=('Use the optional TX and GX tags in the BAM file to ' 'read gene mapping information in stead of the mapping ' 'target nane. Useful if e.g. reads have been mapped to ' 'genome in stead of transcriptome.')) def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence, cb_histogram, cb_cutoff, no_scale_evidence, subsample, sparse, parse_tags, gene_tags): ''' Count up evidence for tagged molecules ''' from pysam import AlignmentFile from io import StringIO import pandas as pd from utils import weigh_evidence logger.info('Reading optional files') gene_map = None if genemap: with open(genemap) as fh: try: gene_map = dict(p.strip().split() for p in fh) except ValueError: logger.error('Incorrectly formatted gene_map, need to be tsv.') sys.exit() if positional: tuple_template = '{0},{1},{2},{3}' else: tuple_template = '{0},{1},{3}' if not cb_cutoff: cb_cutoff = 0 if cb_histogram and cb_cutoff == "auto": cb_cutoff = guess_depth_cutoff(cb_histogram) cb_cutoff = int(cb_cutoff) cb_hist = None filter_cb = False if cb_histogram: cb_hist = pd.read_csv(cb_histogram, index_col=0, header=None, squeeze=True, sep="\t") total_num_cbs = cb_hist.shape[0] cb_hist = cb_hist[cb_hist > cb_cutoff] logger.info('Keeping {} out of {} cellular barcodes.'.format(cb_hist.shape[0], total_num_cbs)) filter_cb = True parser_re = re.compile('.*:CELL_(?P.*):UMI_(?P.*)') if subsample: logger.info('Creating reservoir of subsampled reads ({} per cell)'.format(subsample)) start_sampling = time.time() reservoir = collections.defaultdict(list) cb_hist_sampled = 0 * cb_hist cb_obs = 0 * cb_hist track = stream_bamfile(sam) current_read = 'none_observed_yet' for i, aln in enumerate(track): if aln.qname == current_read: continue current_read = aln.qname if parse_tags: CB = aln.get_tag('CR') else: match = parser_re.match(aln.qname) CB = match.group('CB') if CB not in cb_hist.index: continue cb_obs[CB] += 1 if len(reservoir[CB]) < subsample: reservoir[CB].append(i) cb_hist_sampled[CB] += 1 else: s = pd.np.random.randint(0, cb_obs[CB]) if s < subsample: reservoir[CB][s] = i index_filter = set(itertools.chain.from_iterable(reservoir.values())) sam_file.close() sampling_time = time.time() - start_sampling logger.info('Sampling done - {:.3}s'.format(sampling_time)) evidence = collections.defaultdict(int) logger.info('Tallying evidence') start_tally = time.time() sam_mode = 'r' if sam.endswith(".sam") else 'rb' sam_file = AlignmentFile(sam, mode=sam_mode) targets = [x["SN"] for x in sam_file.header["SQ"]] track = sam_file.fetch(until_eof=True) count = 0 unmapped = 0 kept = 0 nomatchcb = 0 current_read = 'none_observed_yet' count_this_read = True missing_transcripts = set() for i, aln in enumerate(track): if count and not count % 1000000: logger.info("Processed %d alignments, kept %d." % (count, kept)) logger.info("%d were filtered for being unmapped." % unmapped) if filter_cb: logger.info("%d were filtered for not matching known barcodes." % nomatchcb) count += 1 if aln.is_unmapped: unmapped += 1 continue if gene_tags and not aln.has_tag('GX'): unmapped += 1 continue if aln.qname != current_read: current_read = aln.qname if subsample and i not in index_filter: count_this_read = False continue else: count_this_read = True else: if not count_this_read: continue if parse_tags: CB = aln.get_tag('CR') else: match = parser_re.match(aln.qname) CB = match.group('CB') if filter_cb: if CB not in cb_hist.index: nomatchcb += 1 continue if parse_tags: MB = aln.get_tag('UM') else: MB = match.group('MB') if gene_tags: target_name = aln.get_tag('GX').split(',')[0] else: txid = sam_file.getrname(aln.reference_id) if gene_map: if txid in gene_map: target_name = gene_map[txid] else: missing_transcripts.add(txid) target_name = txid else: target_name = txid e_tuple = tuple_template.format(CB, target_name, aln.pos, MB) # Scale evidence by number of hits if no_scale_evidence: evidence[e_tuple] += 1.0 else: evidence[e_tuple] += weigh_evidence(aln.tags) kept += 1 tally_time = time.time() - start_tally if missing_transcripts: logger.warn('The following transcripts were missing gene_ids, so we added them as the transcript ids: %s' % str(missing_transcripts)) logger.info('Tally done - {:.3}s, {:,} alns/min'.format(tally_time, int(60. * count / tally_time))) logger.info('Collapsing evidence') logger.info('Writing evidence') with tempfile.NamedTemporaryFile('w+t') as out_handle: for key in evidence: line = '{},{}\n'.format(key, evidence[key]) out_handle.write(line) out_handle.flush() out_handle.seek(0) evidence_table = pd.read_csv(out_handle, header=None) del evidence evidence_query = 'evidence >= %f' % minevidence if positional: evidence_table.columns=['cell', 'gene', 'umi', 'pos', 'evidence'] collapsed = evidence_table.query(evidence_query).groupby(['cell', 'gene'])['umi', 'pos'].size() else: evidence_table.columns=['cell', 'gene', 'umi', 'evidence'] collapsed = evidence_table.query(evidence_query).groupby(['cell', 'gene'])['umi'].size() expanded = collapsed.unstack().T if gene_map: # This Series is just for sorting the index genes = pd.Series(index=set(gene_map.values())) genes = genes.sort_index() # Now genes is assigned to a DataFrame genes = expanded.loc[genes.index] elif gene_tags: expanded.sort_index() genes = expanded else: # make data frame have a complete accounting of transcripts targets = pd.Series(index=set(targets)) targets = targets.sort_index() expanded = expanded.reindex(targets.index.values, fill_value=0) genes = expanded genes.fillna(0, inplace=True) genes = genes.astype(int) genes.index.name = "gene" logger.info('Output results') if subsample: cb_hist_sampled.to_csv('ss_{}_'.format(subsample) + os.path.basename(cb_histogram), sep='\t') if output_evidence_table: import shutil buf.seek(0) with open(output_evidence_table, 'w') as etab_fh: shutil.copyfileobj(buf, etab_fh) if sparse: pd.Series(genes.index).to_csv(out + ".rownames", index=False, header=False) pd.Series(genes.columns.values).to_csv(out + ".colnames", index=False, header=False) with open(out, "w+b") as out_handle: scipy.io.mmwrite(out_handle, scipy.sparse.csr_matrix(genes)) else: genes.to_csv(out) @click.command() @click.argument('sam') @click.argument('out') @click.option('--genemap', required=False, default=None, help=('A TSV file mapping transcript ids to gene ids. If ' 'provided expression will be summarised to gene level ' '(recommended).')) @click.option('--positional', default=False, is_flag=True) @click.option('--minevidence', required=False, default=1.0, type=float) @click.option('--cb_histogram', default=None, help=('A TSV file with CBs and a count. If the counts are ' 'are the number of reads at a CB, the cb_cutoff option ' 'can be used to filter out CBs to be counted.')) @click.option('--cb_cutoff', default=None, help=("Number of counts to filter cellular barcodes. Set to " "'auto' to calculate a cutoff automatically.")) @click.option('--subsample', required=False, default=None, type=int) @click.option('--parse_tags', required=False, is_flag=True, help=('Parse BAM tags in stead of read name. In this mode ' 'the optional tags UM and CR will be used for UMI and ' 'cell barcode, respetively.')) @click.option('--gene_tags', required=False, is_flag=True, help=('Use the optional TX and GX tags in the BAM file to ' 'read gene mapping information in stead of the mapping ' 'target nane. Useful if e.g. reads have been mapped to ' 'genome in stead of transcriptome.')) @click.option('--umi_matrix', required=False, help=('Save a sparse matrix of counts without UMI deduping to this file.')) def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram, cb_cutoff, subsample, parse_tags, gene_tags, umi_matrix): ''' Count up evidence for tagged molecules, this implementation assumes the alignment file is coordinate sorted ''' from pysam import AlignmentFile from io import StringIO import pandas as pd from utils import weigh_evidence if sam.endswith(".sam"): logger.error("To use the fasttagcount subcommand, the alignment file must be a " "coordinate sorted, indexed BAM file.") sys.exit(1) logger.info('Reading optional files') gene_map = None if genemap: with open(genemap) as fh: try: gene_map = dict(p.strip().split() for p in fh) except ValueError: logger.error('Incorrectly formatted gene_map, need to be tsv.') sys.exit() if positional: tuple_template = '{0},{1},{2},{3}' else: tuple_template = '{0},{1},{3}' if not cb_cutoff: cb_cutoff = 0 if cb_histogram and cb_cutoff == "auto": cb_cutoff = guess_depth_cutoff(cb_histogram) cb_cutoff = int(cb_cutoff) cb_hist = None filter_cb = False if cb_histogram: cb_hist = pd.read_csv(cb_histogram, index_col=0, header=None, squeeze=True, sep="\t") total_num_cbs = cb_hist.shape[0] cb_hist = cb_hist[cb_hist > cb_cutoff] logger.info('Keeping {} out of {} cellular barcodes.'.format(cb_hist.shape[0], total_num_cbs)) filter_cb = True parser_re = re.compile('.*:CELL_(?P.*):UMI_(?P.*)') if subsample: logger.info('Creating reservoir of subsampled reads ({} per cell)'.format(subsample)) start_sampling = time.time() reservoir = collections.defaultdict(list) cb_hist_sampled = 0 * cb_hist cb_obs = 0 * cb_hist track = stream_bamfile(sam) current_read = 'none_observed_yet' for i, aln in enumerate(track): if aln.qname == current_read: continue current_read = aln.qname if parse_tags: CB = aln.get_tag('CR') else: match = parser_re.match(aln.qname) CB = match.group('CB') if CB not in cb_hist.index: continue cb_obs[CB] += 1 if len(reservoir[CB]) < subsample: reservoir[CB].append(i) cb_hist_sampled[CB] += 1 else: s = pd.np.random.randint(0, cb_obs[CB]) if s < subsample: reservoir[CB][s] = i index_filter = set(itertools.chain.from_iterable(reservoir.values())) sam_file.close() sampling_time = time.time() - start_sampling logger.info('Sampling done - {:.3}s'.format(sampling_time)) evidence = collections.defaultdict(lambda: collections.defaultdict(float)) bare_evidence = collections.defaultdict(float) logger.info('Tallying evidence') start_tally = time.time() sam_mode = 'r' if sam.endswith(".sam") else 'rb' sam_file = AlignmentFile(sam, mode=sam_mode) transcript_map = collections.defaultdict(set) sam_transcripts = [x["SN"] for x in sam_file.header["SQ"]] if gene_map: for transcript, gene in gene_map.items(): if transcript in sam_transcripts: transcript_map[gene].add(transcript) else: for transcript in sam_transcripts: transcript_map[transcript].add(transcript) missing_transcripts = set() alignments_processed = 0 unmapped = 0 kept = 0 nomatchcb = 0 current_read = 'none_observed_yet' current_transcript = None count_this_read = True transcripts_processed = 0 genes_processed = 0 cells = list(cb_hist.index) targets_seen = set() if umi_matrix: bare_evidence_handle = open(umi_matrix, "w") bare_evidence_handle.write(",".join(["gene"] + cells) + "\n") with open(out, "w") as out_handle: out_handle.write(",".join(["gene"] + cells) + "\n") for gene, transcripts in transcript_map.items(): for transcript in transcripts: for aln in sam_file.fetch(transcript): alignments_processed += 1 if aln.is_unmapped: unmapped += 1 continue if gene_tags and not aln.has_tag('GX'): unmapped += 1 continue if aln.qname != current_read: current_read = aln.qname if subsample and i not in index_filter: count_this_read = False continue else: count_this_read = True else: if not count_this_read: continue if parse_tags: CB = aln.get_tag('CR') else: match = parser_re.match(aln.qname) CB = match.group('CB') if filter_cb: if CB not in cb_hist.index: nomatchcb += 1 continue if parse_tags: MB = aln.get_tag('UM') else: MB = match.group('MB') if gene_tags: target_name = aln.get_tag('GX').split(',')[0] else: txid = sam_file.getrname(aln.reference_id) if gene_map: if txid in gene_map: target_name = gene_map[txid] else: missing_transcripts.add(txid) continue else: target_name = txid targets_seen.add(target_name) # Scale evidence by number of hits evidence[CB][MB] += weigh_evidence(aln.tags) bare_evidence[CB] += weigh_evidence(aln.tags) kept += 1 transcripts_processed += 1 if not transcripts_processed % 1000: logger.info("%d genes processed." % genes_processed) logger.info("%d transcripts processed." % transcripts_processed) logger.info("%d alignments processed." % alignments_processed) earray = [] for cell in cells: umis = [1 for _, v in evidence[cell].items() if v >= minevidence] earray.append(str(sum(umis))) out_handle.write(",".join([gene] + earray) + "\n") earray = [] if umi_matrix: for cell in cells: earray.append(str(int(bare_evidence[cell]))) bare_evidence_handle.write(",".join([gene] + earray) + "\n") evidence = collections.defaultdict(lambda: collections.defaultdict(int)) bare_evidence = collections.defaultdict(int) genes_processed += 1 if umi_matrix: bare_evidence_handle.close() # fill dataframe with missing values, sort and output df = pd.read_csv(out, index_col=0, header=0) targets = pd.Series(index=set(transcript_map.keys())) targets = targets.sort_index() df = df.reindex(targets.index.values, fill_value=0) df = df.sort_index() df.to_csv(out) if umi_matrix: df = pd.read_csv(umi_matrix, index_col=0, header=0) df = df.reindex(targets.index.values, fill_value=0) df = df.sort_index() df.to_csv(umi_matrix) @click.command() @click.argument("csv") @click.argument("sparse") def sparse(csv, sparse): ''' Convert a CSV file to a sparse matrix with rows and column names saved as companion files. ''' import pandas as pd df = pd.read_csv(csv, index_col=0, header=0) pd.Series(df.index).to_csv(sparse + ".rownames", index=False, header=False) pd.Series(df.columns.values).to_csv(sparse + ".colnames", index=False, header=False) with open(sparse, "w+b") as out_handle: scipy.io.mmwrite(out_handle, scipy.sparse.csr_matrix(df)) @click.command() @click.argument('fastq', required=True) @click.option("--umi_histogram", required=False, help=("Output a count of each UMI for each cellular barcode to this " "file.")) def cb_histogram(fastq, umi_histogram): ''' Counts the number of reads for each cellular barcode Expects formatted fastq files. ''' annotations = detect_fastq_annotations(fastq) re_string = construct_transformed_regex(annotations) parser_re = re.compile(re_string) cb_counter = collections.Counter() umi_counter = collections.Counter() for read in read_fastq(fastq): match = parser_re.search(read).groupdict() cb = match['CB'] cb_counter[cb] += 1 if umi_histogram: umi = match['MB'] umi_counter[(cb, umi)] += 1 for bc, count in cb_counter.most_common(): sys.stdout.write('{}\t{}\n'.format(bc, count)) if umi_histogram: with open(umi_histogram, "w") as umi_handle: for cbumi, count in umi_counter.most_common(): umi_handle.write('{}\t{}\t{}\n'.format(cbumi[0], cbumi[1], count)) @click.command() @click.argument('fastq', required=True) def umi_histogram(fastq): ''' Counts the number of reads for each UMI Expects formatted fastq files. ''' annotations = detect_fastq_annotations(fastq) re_string = construct_transformed_regex(annotations) parser_re = re.compile(re_string) counter = collections.Counter() for read in read_fastq(fastq): match = parser_re.search(read).groupdict() counter[match['MB']] += 1 for bc, count in counter.most_common(): sys.stdout.write('{}\t{}\n'.format(bc, count)) def get_cb_depth_set(cb_histogram, cb_cutoff): ''' Returns a set of barcodes with a minimum number of reads ''' cb_keep_set = set() if not cb_histogram: return cb_keep_set with read_cbhistogram(cb_histogram) as fh: cb_map = dict(p.strip().split() for p in fh) cb_keep_set = set([k for k, v in cb_map.items() if int(v) > cb_cutoff]) logger.info('Keeping %d out of %d cellular barcodes.' % (len(cb_keep_set), len(cb_map))) return cb_keep_set def guess_depth_cutoff(cb_histogram): ''' Guesses at an appropriate barcode cutoff ''' with read_cbhistogram(cb_histogram) as fh: cb_vals = [int(p.strip().split()[1]) for p in fh] histo = np.histogram(np.log10(cb_vals), bins=50) vals = histo[0] edges = histo[1] mids = np.array([(edges[i] + edges[i+1])/2 for i in range(edges.size - 1)]) wdensity = vals * (10**mids) / sum(vals * (10**mids)) baseline = np.median(wdensity) wdensity = list(wdensity) # find highest density in upper half of barcode distribution peak = wdensity.index(max(wdensity[len(wdensity)/2:])) cutoff = None for index, dens in reversed(list(enumerate(wdensity[1:peak]))): if dens < 2 * baseline: cutoff = index break if not cutoff: return None else: cutoff = 10**mids[cutoff] logger.info('Setting barcode cutoff to %d' % cutoff) return cutoff @click.command() @click.argument('fastq', required=True) @click.option('--bc1', default=None, required=True) @click.option('--bc2', default=None) @click.option('--bc3', default=None) @click.option('--cores', default=1) @click.option('--nedit', default=0) def cb_filter(fastq, bc1, bc2, bc3, cores, nedit): ''' Filters reads with non-matching barcodes Expects formatted fastq files. ''' with open_gzipsafe(bc1) as bc1_fh: bc1 = set(cb.strip() for cb in bc1_fh) if bc2: with open_gzipsafe(bc2) as bc2_fh: bc2 = set(cb.strip() for cb in bc2_fh) if bc3: with open_gzipsafe(bc3) as bc3_fh: bc3 = set(cb.strip() for cb in bc3_fh) annotations = detect_fastq_annotations(fastq) re_string = construct_transformed_regex(annotations) if nedit == 0: filter_cb = partial(exact_barcode_filter, bc1=bc1, bc2=bc2, bc3=bc3, re_string=re_string) else: bc1hash = MutationHash(bc1, nedit) bc2hash = None bc3hash = None if bc2: bc2hash = MutationHash(bc2, nedit) if bc3: bc3hash = MutationHash(bc3, nedit) filter_cb = partial(correcting_barcode_filter, bc1hash=bc1hash, bc2hash=bc2hash, bc3hash=bc3hash, re_string=re_string) p = multiprocessing.Pool(cores) chunks = tz.partition_all(10000, read_fastq(fastq)) bigchunks = tz.partition_all(cores, chunks) for bigchunk in bigchunks: for chunk in p.map(filter_cb, list(bigchunk)): for read in chunk: sys.stdout.write(read) @click.command() @click.argument('fastq', required=True) @click.option('--bc', type=click.File('r')) @click.option('--cores', default=1) @click.option('--nedit', default=0) def sb_filter(fastq, bc, cores, nedit): ''' Filters reads with non-matching sample barcodes Expects formatted fastq files. ''' barcodes = set(sb.strip() for sb in bc) if nedit == 0: filter_sb = partial(exact_sample_filter2, barcodes=barcodes) else: barcodehash = MutationHash(barcodes, nedit) filter_sb = partial(correcting_sample_filter2, barcodehash=barcodehash) p = multiprocessing.Pool(cores) chunks = tz.partition_all(10000, read_fastq(fastq)) bigchunks = tz.partition_all(cores, chunks) for bigchunk in bigchunks: for chunk in p.map(filter_sb, list(bigchunk)): for read in chunk: sys.stdout.write(read) @click.command() @click.argument('fastq', required=True) @click.option('--cores', default=1) def mb_filter(fastq, cores): ''' Filters umis with non-ACGT bases Expects formatted fastq files. ''' filter_mb = partial(umi_filter) p = multiprocessing.Pool(cores) chunks = tz.partition_all(10000, read_fastq(fastq)) bigchunks = tz.partition_all(cores, chunks) for bigchunk in bigchunks: for chunk in p.map(filter_mb, list(bigchunk)): for read in chunk: sys.stdout.write(read) @click.command() @click.argument('fastq', required=True) @click.option('--cores', default=1) def add_uid(fastq, cores): ''' Adds UID:[samplebc cellbc umi] to readname for umi-tools deduplication Expects formatted fastq files with correct sample and cell barcodes. ''' uids = partial(append_uids) p = multiprocessing.Pool(cores) chunks = tz.partition_all(10000, read_fastq(fastq)) bigchunks = tz.partition_all(cores, chunks) for bigchunk in bigchunks: for chunk in p.map(uids, list(bigchunk)): for read in chunk: sys.stdout.write(read) def write_kallisto_chunk(out_dir, cb, chunk): fq_fn = os.path.join(out_dir, cb + ".fq") umi_fn = os.path.join(out_dir, cb + ".umi") with open(fq_fn, "a") as fq_handle, open(umi_fn, "a") as umi_handle: for read, umi in chunk: fq_handle.write(read) umi_handle.write(umi + "\n") @click.command() @click.argument('fastq', required=True) @click.option('--out_dir', default=".") @click.option('--cb_histogram', default=None) @click.option('--cb_cutoff', default=0) def kallisto(fastq, out_dir, cb_histogram, cb_cutoff): ''' Convert fastqtransformed file to output format compatible with kallisto. ''' parser_re = re.compile('(.*):CELL_(?.*):UMI_(?P.*)\\n(.*)\\n\\+\\n(.*)\\n') if fastq.endswith('gz'): fastq_fh = gzip.GzipFile(fileobj=open(fastq)) elif fastq == "-": fastq_fh = sys.stdin else: fastq_fh = open(fastq) cb_depth_set = get_cb_depth_set(cb_histogram, cb_cutoff) cb_set = set() cb_batch = collections.defaultdict(list) parsed = 0 for read in stream_fastq(fastq_fh): match = parser_re.search(read).groupdict() umi = match['UMI'] cb = match['CB'] if cb_depth_set and cb not in cb_depth_set: continue parsed += 1 cb_set.add(cb) cb_batch[cb].append((read, umi)) # write in batches to avoid opening up file handles repeatedly if not parsed % 10000000: for cb, chunk in cb_batch.items(): write_kallisto_chunk(out_dir, cb, chunk) cb_batch = collections.defaultdict(list) for cb, chunk in cb_batch.items(): write_kallisto_chunk(out_dir, cb, chunk) with open(os.path.join(out_dir, "barcodes.batch"), "w") as out_handle: out_handle.write("#id umi-file file-1\n") batchformat = "{cb} {cb}.umi {cb}.fq\n" for cb in cb_set: out_handle.write(batchformat.format(**locals())) @click.command() @click.argument('sam', required=True) def bamtag(sam): ''' Convert a BAM/SAM with fastqtransformed read names to have UMI and cellular barcode tags ''' from pysam import AlignmentFile start_time = time.time() sam_file = open_bamfile(sam) out_file = AlignmentFile("-", "wh", template=sam_file) track = sam_file.fetch(until_eof=True) # peek at first alignment to determine the annotations if is_python3(): queryalignment = next(track) else: queryalignment = track.next() annotations = detect_alignment_annotations(queryalignment) track = itertools.chain([queryalignment], track) re_string = construct_transformed_regex(annotations) parser_re = re.compile(re_string) for count, aln in enumerate(track, start=1): if count and not count % 1000000: logger.info("Processed %d alignments." % count) match = parser_re.match(aln.qname) tags = aln.tags if "cellular" in annotations: aln.tags += [('XC', match.group('CB'))] if "molecular" in annotations: aln.tags += [('RX', match.group('MB'))] if "sample" in annotations: aln.tags += [('XS', match.group('SB'))] out_file.write(aln) total_time = time.time() - start_time logger.info('BAM tag conversion done - {:.3}s, {:,} alns/min'.format(total_time, int(60. * count / total_time))) logger.info("Processed %d alignments." % count) @click.command() @click.argument('fastq', required=True) @click.option('--out_dir', default=".") @click.option('--nedit', default=0) @click.option('--barcodes', type=click.File('r'), required=False) def demultiplex_samples(fastq, out_dir, nedit, barcodes): ''' Demultiplex a fastqtransformed FASTQ file into a FASTQ file for each sample. ''' annotations = detect_fastq_annotations(fastq) re_string = construct_transformed_regex(annotations) parser_re = re.compile(re_string) if barcodes: barcodes = set(barcode.strip() for barcode in barcodes) else: barcodes = set() if nedit == 0: filter_bc = partial(exact_sample_filter, barcodes=barcodes) else: barcodehash = MutationHash(barcodes, nedit) filter_bc = partial(correcting_sample_filter, barcodehash=barcodehash) sample_set = set() batch = collections.defaultdict(list) parsed = 0 safe_makedir(out_dir) for read in read_fastq(fastq): parsed += 1 read = filter_bc(read) if not read: continue match = parser_re.search(read).groupdict() sample = match['SB'] sample_set.add(sample) batch[sample].append(read) # write in batches to avoid opening up file handles repeatedly if not parsed % 10000000: for sample, reads in batch.items(): out_file = os.path.join(out_dir, sample + ".fq") with open(out_file, "a") as out_handle: for read in reads: fixed = filter_bc(read) if fixed: out_handle.write(fixed) batch = collections.defaultdict(list) for sample, reads in batch.items(): out_file = os.path.join(out_dir, sample + ".fq") with open(out_file, "a") as out_handle: for read in reads: fixed = filter_bc(read) if fixed: out_handle.write(read) def is_python3(): return sys.version_info >= (3, 0) @click.command() @click.argument('fastq', required=True) @click.option('--out_dir', default=".") @click.option('--readnumber', default="") @click.option('--prefix', default="") @click.option('--cb_histogram', default=None) @click.option('--cb_cutoff', default=0) def demultiplex_cells(fastq, out_dir, readnumber, prefix, cb_histogram, cb_cutoff): ''' Demultiplex a fastqtransformed FASTQ file into a FASTQ file for each cell. ''' annotations = detect_fastq_annotations(fastq) re_string = construct_transformed_regex(annotations) parser_re = re.compile(re_string) readstring = "" if not readnumber else "_R{}".format(readnumber) filestring = "{prefix}{sample}{readstring}.fq" cb_set = set() if cb_histogram: cb_set = get_cb_depth_set(cb_histogram, cb_cutoff) sample_set = set() batch = collections.defaultdict(list) parsed = 0 safe_makedir(out_dir) for read in read_fastq(fastq): parsed += 1 match = parser_re.search(read).groupdict() sample = match['CB'] if cb_set and sample not in cb_set: continue sample_set.add(sample) batch[sample].append(read) # write in batches to avoid opening up file handles repeatedly if not parsed % 10000000: for sample, reads in batch.items(): out_file = os.path.join(out_dir, filestring.format(**locals())) with open(out_file, "a") as out_handle: for read in reads: out_handle.write(read) batch = collections.defaultdict(list) for sample, reads in batch.items(): out_file = os.path.join(out_dir, filestring.format(**locals())) with open(out_file, "a") as out_handle: for read in reads: out_handle.write(read) @click.command() @click.argument('SAM', required=True) @click.argument('barcodes', type=click.File('r'), required=True) def subset_bamfile(sam, barcodes): """ Subset a SAM/BAM file, keeping only alignments from given cellular barcodes """ from pysam import AlignmentFile start_time = time.time() sam_file = open_bamfile(sam) out_file = AlignmentFile("-", "wh", template=sam_file) track = sam_file.fetch(until_eof=True) # peek at first alignment to determine the annotations queryalignment = track.next() annotations = detect_alignment_annotations(queryalignment) track = itertools.chain([queryalignment], track) re_string = construct_transformed_regex(annotations) parser_re = re.compile(re_string) barcodes = set(barcode.strip() for barcode in barcodes) for count, aln in enumerate(track, start=1): if count and not count % 1000000: logger.info("Processed %d alignments." % count) match = parser_re.match(aln.qname) tags = aln.tags if "cellular" in annotations: cb = match.group('CB') if cb in barcodes: out_file.write(aln) @click.command() def version(): print(VERSION) @click.group() def umis(): pass umis.add_command(version) umis.add_command(sparse) umis.add_command(fastqtransform) umis.add_command(tagcount) umis.add_command(fasttagcount) umis.add_command(cb_histogram, name="cb_histogram") umis.add_command(umi_histogram, name="umi_histogram") umis.add_command(cb_filter, name="cb_filter") umis.add_command(sb_filter, name="sb_filter") umis.add_command(mb_filter, name="mb_filter") umis.add_command(add_uid, name="add_uid") umis.add_command(kallisto) umis.add_command(bamtag) umis.add_command(demultiplex_samples, name="demultiplex_samples") umis.add_command(demultiplex_cells, name="demultiplex_cells") umis.add_command(subset_bamfile, name="subset_bamfile") umis-1.0.8/umis/utils.pyx000066400000000000000000000002051412434634200153540ustar00rootroot00000000000000 def weigh_evidence(aux_list): for aux_tag in aux_list: if aux_tag[0] == 'NH': return 1. / aux_tag[1]