miniasm-0.3/000077500000000000000000000000001332550340100130055ustar00rootroot00000000000000miniasm-0.3/.gitignore000066400000000000000000000000341332550340100147720ustar00rootroot00000000000000.*.swp *.o *.a Makefile.bak miniasm-0.3/LICENSE.txt000066400000000000000000000020641332550340100146320ustar00rootroot00000000000000The MIT License Copyright (c) 2015 Broad Institute Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. miniasm-0.3/Makefile000066400000000000000000000015431332550340100144500ustar00rootroot00000000000000CC= gcc CFLAGS= -g -Wall -O2 -Wc++-compat CPPFLAGS= INCLUDES= -I. 
OBJS= sys.o sdict.o paf.o asg.o common.o hit.o asm.o PROG= miniasm minidot LIBS= -lm -lz -lpthread .SUFFIXES:.c .o .c.o: $(CC) -c $(CFLAGS) $(CPPFLAGS) $(INCLUDES) $< -o $@ all:$(PROG) miniasm:$(OBJS) main.o $(CC) $(CFLAGS) $^ -o $@ $(LIBS) minidot:paf.o sdict.o dotter.o $(CC) $(CFLAGS) $^ -o $@ $(LIBS) clean: rm -fr gmon.out *.o a.out $(PROG) *~ *.a *.dSYM session* depend: (LC_ALL=C; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) -- *.c) # DO NOT DELETE asg.o: asg.h kvec.h ksort.h asm.o: miniasm.h sdict.h asg.h kvec.h kdq.h kseq.h common.o: miniasm.h sdict.h asg.h dotter.o: paf.h sdict.h kvec.h eps.h ksort.h hit.o: sdict.h paf.h kvec.h sys.h miniasm.h asg.h ksort.h main.o: kvec.h sys.h paf.h sdict.h miniasm.h asg.h paf.o: paf.h kseq.h sdict.o: sdict.h khash.h miniasm-0.3/PAF.md000066400000000000000000000025561332550340100137450ustar00rootroot00000000000000## PAF: a Pairwise mApping Format PAF is a text format describing the approximate mapping positions between two sets of sequences. PAF is TAB-delimited with each line consisting of the following predefined fields: |Col|Type |Description | |--:|:----:|:-----------------------------------------| |1 |string|Query sequence name | |2 |int |Query sequence length | |3 |int |Query start (0-based) | |4 |int |Query end (0-based) | |5 |char |Relative strand: "+" or "-" | |6 |string|Target sequence name | |7 |int |Target sequence length | |8 |int |Target start on original strand (0-based) | |9 |int |Target end on original strand (0-based) | |10 |int |Number of residue matches | |11 |int |Alignment block length | |12 |int |Mapping quality (0-255; 255 for missing) | If PAF is generated from an alignment, column 10 equals the number of sequence matches, and column 11 equals the total number of sequence matches, mismatches, insertions and deletions in the alignment. If alignment is not available, columns 10 and 11 are still required but may be highly inaccurate. 
A PAF file may optionally contain SAM-like typed key-value pairs at the end of each line. miniasm-0.3/README.md000066400000000000000000000125051332550340100142670ustar00rootroot00000000000000## Getting Started ```sh # Download sample PacBio data from the PBcR website wget -O- http://www.cbcb.umd.edu/software/PBcR/data/selfSampleData.tar.gz | tar zxf - ln -s selfSampleData/pacbio_filtered.fastq reads.fq # Install minimap2 and miniasm (requiring gcc and zlib) git clone https://github.com/lh3/minimap2 && (cd minimap2 && make) git clone https://github.com/lh3/miniasm && (cd miniasm && make) # Overlap for PacBio reads (or use "-x map-ont" for nanopore read overlapping) minimap2/minimap2 -x map-pb -t8 reads.fq reads.fq | gzip -1 > reads.paf.gz # Layout miniasm/miniasm -f reads.fq reads.paf.gz > reads.gfa ``` ## Introduction Miniasm is a very fast OLC-based *de novo* assembler for noisy long reads. It takes all-vs-all read self-mappings (typically by [minimap][minimap]) as input and outputs an assembly graph in the [GFA][gfa] format. Different from mainstream assemblers, miniasm does not have a consensus step. It simply concatenates pieces of read sequences to generate the final [unitig][unitig] sequences. Thus the per-base error rate is similar to the raw input reads. So far miniasm is in an early development stage. It has only been tested on a dozen PacBio and Oxford Nanopore (ONT) bacterial data sets. Including the mapping step, it takes about 3 minutes to assemble a bacterial genome. Under the default setting, miniasm assembles 9 out of 12 PacBio datasets and 3 out of 4 ONT datasets into a single contig. The 12 PacBio data sets are [PacBio E. coli sample][PB-151103], [ERS473430][ERS473430], [ERS544009][ERS544009], [ERS554120][ERS554120], [ERS605484][ERS605484], [ERS617393][ERS617393], [ERS646601][ERS646601], [ERS659581][ERS659581], [ERS670327][ERS670327], [ERS685285][ERS685285], [ERS743109][ERS743109] and a [deprecated PacBio E. coli data set][PB-deprecated]. 
ONT data are acquired from the [Loman Lab][loman-ont]. For a *C. elegans* [PacBio data set][ce] (only 40X are used, not the whole dataset), miniasm finishes the assembly, including read overlapping, in ~10 minutes with 16 CPUs. The total assembly size is 105Mb; the N50 is 1.94Mb. In comparison, [HGAP3][hgap] produces a 104Mb assembly with N50 1.61Mb. [This dotter plot][ce-img] gives a global view of the miniasm assembly (on the X axis) and the HGAP3 assembly (on Y). They are broadly comparable. Of course, the HGAP3 consensus sequences are much more accurate. In addition, on the whole data set (assembled in ~30 min), the miniasm N50 is reduced to 1.79Mb. Miniasm still needs improvements. Miniasm confirms that at least for high-coverage bacterial genomes, it is possible to generate long contigs from raw PacBio or ONT reads without error correction. It also shows that [minimap][minimap] can be used as a read overlapper, even though it is probably not as sensitive as the more sophisticated overlappers such as [MHAP][mhap] and [DALIGNER][daligner]. Coupled with long-read error correctors and consensus tools, miniasm may also be useful to produce high-quality assemblies. ## Algorithm Overview 1. Crude read selection. For each read, find the longest contiguous region covered by three good mappings. Get an approximate estimate of read coverage. 2. Fine read selection. Use the coverage information to find the good regions again but with more stringent thresholds. Discard contained reads. 3. Generate a [string graph][sg]. Prune tips, drop weak overlaps and collapse short bubbles. These procedures are similar to those implemented in short-read assemblers. 4. Merge unambiguous overlaps to produce unitig sequences. ## Limitations 1. Consensus base quality is similar to input reads (may be fixed with a consensus tool). 2. Only tested on a dozen high-coverage PacBio/ONT data sets (more testing needed). 3. 
Prone to collapse repeats or segmental duplications longer than input reads (hard to fix without error correction). [unitig]: http://wgs-assembler.sourceforge.net/wiki/index.php/Celera_Assembler_Terminology [minimap]: https://github.com/lh3/minimap [paf]: https://github.com/lh3/miniasm/blob/master/PAF.md [gfa]: https://github.com/pmelsted/GFA-spec/blob/master/GFA-spec.md [ERS473430]: http://www.ebi.ac.uk/ena/data/view/ERS473430 [ERS544009]: http://www.ebi.ac.uk/ena/data/view/ERS544009 [ERS554120]: http://www.ebi.ac.uk/ena/data/view/ERS554120 [ERS605484]: http://www.ebi.ac.uk/ena/data/view/ERS605484 [ERS617393]: http://www.ebi.ac.uk/ena/data/view/ERS617393 [ERS646601]: http://www.ebi.ac.uk/ena/data/view/ERS646601 [ERS659581]: http://www.ebi.ac.uk/ena/data/view/ERS659581 [ERS670327]: http://www.ebi.ac.uk/ena/data/view/ERS670327 [ERS685285]: http://www.ebi.ac.uk/ena/data/view/ERS685285 [ERS743109]: http://www.ebi.ac.uk/ena/data/view/ERS743109 [PB-151103]: https://github.com/PacificBiosciences/DevNet/wiki/E.-coli-Bacterial-Assembly [PB-deprecated]: https://github.com/PacificBiosciences/DevNet/wiki/E.-coli-20kb-Size-Selected-Library-with-P6-C4/ce0533c1d2a957488594f0b29da61ffa3e4627e8 [ce]: https://github.com/PacificBiosciences/DevNet/wiki/C.-elegans-data-set [mhap]: https://github.com/marbl/MHAP [daligner]: https://github.com/thegenemyers/DALIGNER [sg]: http://bioinformatics.oxfordjournals.org/content/21/suppl_2/ii79.abstract [loman-ont]: http://lab.loman.net/2015/09/24/first-sqk-map-006-experiment/ [hgap]: https://github.com/PacificBiosciences/Bioinformatics-Training/wiki/HGAP [ce-img]: http://lh3lh3.users.sourceforge.net/download/ce-miniasm.png miniasm-0.3/asg.c000066400000000000000000000270571332550340100137360ustar00rootroot00000000000000#include #include #include #include "asg.h" #include "kvec.h" #include "ksort.h" #define asg_arc_key(a) ((a).ul) KRADIX_SORT_INIT(asg, asg_arc_t, asg_arc_key, 8) asg_t *asg_init(void) { return (asg_t*)calloc(1, sizeof(asg_t)); } 
/* Free the graph: its seq, idx and arc arrays, then g itself; a null g is ignored. */ void asg_destroy(asg_t *g) { if (g == 0) return; free(g->seq); free(g->idx); free(g->arc); free(g); } /* Sort g->arc[0..n_arc) in place with radix_sort_asg (presumably generated by the KRADIX_SORT_INIT(asg, ...) line earlier in asg.c, whose key macro reads asg_arc_t::ul). */ void asg_arc_sort(asg_t *g) { radix_sort_asg(g->arc, g->arc + g->n_arc); } /* Build a lookup table over the arc array a[0..n): for each maximal run of consecutive arcs sharing the same ul>>32, idx[ul>>32] = (start of run)<<32 | (length of run). The table has max_seq*2 zero-initialised 8-byte entries, so keys without arcs stay 0. Runs are only meaningful when a[] is already ordered by ul. The caller owns the returned array. */ uint64_t *asg_arc_index_core(size_t max_seq, size_t n, const asg_arc_t *a) { size_t i, last; uint64_t *idx; idx = (uint64_t*)calloc(max_seq * 2, 8); for (i = 1, last = 0; i <= n; ++i) if (i == n || a[i-1].ul>>32 != a[i].ul>>32) idx[a[i-1].ul>>32] = (uint64_t)last<<32 | (i - last), last = i; return idx; } /* Replace g->idx with a freshly built index over the current arcs. */ void asg_arc_index(asg_t *g) { if (g->idx) free(g->idx); g->idx = asg_arc_index_core(g->n_seq, g->n_arc, g->arc); } /* Record len and the del flag for sequence sid, growing g->seq (new capacity sid+1 passed through kv_roundup32) and raising n_seq when sid lies past the current end. Slots skipped between the old n_seq and sid are not initialised here. */ void asg_seq_set(asg_t *g, int sid, int len, int del) { if (sid >= g->m_seq) { g->m_seq = sid + 1; kv_roundup32(g->m_seq); g->seq = (asg_seq_t*)realloc(g->seq, g->m_seq * sizeof(asg_seq_t)); } if (sid >= g->n_seq) g->n_seq = sid + 1; g->seq[sid].len = len; g->seq[sid].del = !!del; } /* Compacts g->arc in place, keeping only arcs that are not flagged del and whose two endpoint sequences are not deleted; when anything was dropped the index is freed and reset to 0. */ // hard remove arcs marked as "del" void asg_arc_rm(asg_t *g) { uint32_t e, n; for (e = n = 0; e < g->n_arc; ++e) { uint32_t u = g->arc[e].ul>>32, v = g->arc[e].v; if (!g->arc[e].del && !g->seq[u>>1].del && !g->seq[v>>1].del) g->arc[n++] = g->arc[e]; } if (n < g->n_arc) { // arc index is out of sync if (g->idx) free(g->idx); g->idx = 0; } g->n_arc = n; } /* Bring the graph back to a consistent state: purge deleted arcs, sort the arcs if is_srt is not set, and rebuild the index if it is missing. */ void asg_cleanup(asg_t *g) { asg_arc_rm(g); if (!g->is_srt) { asg_arc_sort(g); g->is_srt = 1; } if (g->idx == 0) asg_arc_index(g); } /* For every vertex with at least two arcs, compute thres = av[0].ol * drop_ratio (+.499, truncated) and flag the trailing run of arcs (never av[0]) whose ol is below it. If any arc was flagged, asg_cleanup() and asg_symm() are run. Returns the number of arcs flagged. */ // delete short arcs int asg_arc_del_short(asg_t *g, float drop_ratio) { uint32_t v, n_vtx = g->n_seq * 2, n_short = 0; for (v = 0; v < n_vtx; ++v) { asg_arc_t *av = asg_arc_a(g, v); uint32_t i, thres, nv = asg_arc_n(g, v); if (nv < 2) continue; thres = (uint32_t)(av[0].ol * drop_ratio + .499); for (i = nv - 1; i >= 1 && av[i].ol < thres; --i); for (i = i + 1; i < nv; ++i) av[i].del = 1, ++n_short; } if (n_short) { asg_cleanup(g); asg_symm(g); } fprintf(stderr, "[M::%s] removed %d short overlaps\n", __func__, n_short); return n_short; } // delete multi-arcs int asg_arc_del_multi(asg_t *g) { uint32_t *cnt, n_vtx = 
g->n_seq * 2, n_multi = 0, v; cnt = (uint32_t*)calloc(n_vtx, 4); for (v = 0; v < n_vtx; ++v) { asg_arc_t *av = asg_arc_a(g, v); int32_t i, nv = asg_arc_n(g, v); if (nv < 2) continue; /* two backward passes: first count the arcs per target, then flag every arc whose target count is still non-zero after the decrement; only the lowest-indexed arc to each target survives and cnt[] is back to all zeros afterwards */ for (i = nv - 1; i >= 0; --i) ++cnt[av[i].v]; for (i = nv - 1; i >= 0; --i) if (--cnt[av[i].v] != 0) av[i].del = 1, ++n_multi; } free(cnt); if (n_multi) asg_cleanup(g); fprintf(stderr, "[M::%s] removed %d multi-arcs\n", __func__, n_multi); return n_multi; } /* For each arc u->v, scans the arcs of v^1 for one whose target is u^1 and flags u->v as deleted if none is found; runs asg_cleanup() when something was flagged. Returns the number of arcs flagged. */ // remove asymmetric arcs: u->v is present, but v'->u' not int asg_arc_del_asymm(asg_t *g) { uint32_t e, n_asymm = 0; for (e = 0; e < g->n_arc; ++e) { uint32_t v = g->arc[e].v^1, u = g->arc[e].ul>>32^1; uint32_t i, nv = asg_arc_n(g, v); asg_arc_t *av = asg_arc_a(g, v); for (i = 0; i < nv; ++i) if (av[i].v == u) break; if (i == nv) g->arc[e].del = 1, ++n_asymm; } if (n_asymm) asg_cleanup(g); fprintf(stderr, "[M::%s] removed %d asymmetric arcs\n", __func__, n_asymm); return n_asymm; } /* Remove multi-arcs, then asymmetric arcs, and set the is_symm flag. */ void asg_symm(asg_t *g) { asg_arc_del_multi(g); asg_arc_del_asymm(g); g->is_symm = 1; } /* mark[] states used below: 0 = not a target of v, 1 = target of v, 2 = target of v that is also reached through another target within L = (length of v's last arc) + fuzz, and therefore flagged. All arcs of a vertex whose sequence is deleted are flagged outright. */ // transitive reduction; see Myers, 2005 int asg_arc_del_trans(asg_t *g, int fuzz) { uint8_t *mark; uint32_t v, n_vtx = g->n_seq * 2, n_reduced = 0; mark = (uint8_t*)calloc(n_vtx, 1); for (v = 0; v < n_vtx; ++v) { uint32_t L, i, nv = asg_arc_n(g, v); asg_arc_t *av = asg_arc_a(g, v); if (nv == 0) continue; // no hits if (g->seq[v>>1].del) { for (i = 0; i < nv; ++i) av[i].del = 1, ++n_reduced; continue; } for (i = 0; i < nv; ++i) mark[av[i].v] = 1; L = asg_arc_len(av[nv-1]) + fuzz; for (i = 0; i < nv; ++i) { uint32_t w = av[i].v; uint32_t j, nw = asg_arc_n(g, w); asg_arc_t *aw = asg_arc_a(g, w); if (mark[av[i].v] != 1) continue; for (j = 0; j < nw && asg_arc_len(aw[j]) + asg_arc_len(av[i]) <= L; ++j) if (mark[aw[j].v]) mark[aw[j].v] = 2; } #if 0 for (i = 0; i < nv; ++i) { uint32_t w = av[i].v; uint32_t j, nw = asg_arc_n(g, w); asg_arc_t *aw = asg_arc_a(g, w); for (j = 0; j < nw && (j == 0 || asg_arc_len(aw[j]) < fuzz); ++j) if (mark[aw[j].v]) mark[aw[j].v] = 2; } #endif 
/* flag the targets left in state 2 and reset mark[] for the next vertex */ for (i = 0; i < nv; ++i) { if (mark[av[i].v] == 2) av[i].del = 1, ++n_reduced; mark[av[i].v] = 0; } } free(mark); fprintf(stderr, "[M::%s] transitively reduced %d arcs\n", __func__, n_reduced); if (n_reduced) { asg_cleanup(g); asg_symm(g); } return n_reduced; } /********************************** * Filter short potential unitigs * **********************************/ /* result codes of asg_is_utg_end() and asg_extend() */ #define ASG_ET_MERGEABLE 0 #define ASG_ET_TIP 1 #define ASG_ET_MULTI_OUT 2 #define ASG_ET_MULTI_NEI 3 /* Classify vertex v from the live (non-del) arcs of v^1: none -> ASG_ET_TIP; more than one -> ASG_ET_MULTI_OUT; exactly one, whose target's complement w does not have exactly one live arc -> ASG_ET_MULTI_NEI; otherwise ASG_ET_MERGEABLE. When exactly one live arc exists and lw is non-null, *lw receives (low 32 bits of that arc's ul)<<32 | (its target vertex). */ static inline int asg_is_utg_end(const asg_t *g, uint32_t v, uint64_t *lw) { uint32_t w, nv, nw, nw0, nv0 = asg_arc_n(g, v^1); int i, i0 = -1; asg_arc_t *aw, *av = asg_arc_a(g, v^1); for (i = nv = 0; i < nv0; ++i) if (!av[i].del) i0 = i, ++nv; if (nv == 0) return ASG_ET_TIP; // tip if (nv > 1) return ASG_ET_MULTI_OUT; // multiple outgoing arcs if (lw) *lw = av[i0].ul<<32 | av[i0].v; w = av[i0].v ^ 1; nw0 = asg_arc_n(g, w); aw = asg_arc_a(g, w); for (i = nw = 0; i < nw0; ++i) if (!aw[i].del) ++nw; if (nw != 1) return ASG_ET_MULTI_NEI; return ASG_ET_MERGEABLE; } /* Walk from v for at most max_ext steps while asg_is_utg_end(g, v^1, ...) reports ASG_ET_MERGEABLE. a is reset and receives v first, then the lw value of every step taken. Returns the code that stopped the walk, which is ASG_ET_MERGEABLE when the step budget ran out. */ int asg_extend(const asg_t *g, uint32_t v, int max_ext, asg64_v *a) { int ret; uint64_t lw; a->n = 0; kv_push(uint64_t, *a, v); do { ret = asg_is_utg_end(g, v^1, &lw); if (ret != 0) break; kv_push(uint64_t, *a, lw); v = (uint32_t)lw; } while (--max_ext > 0); return ret; } /* For every live vertex classified ASG_ET_TIP, extend up to max_ext steps; unless the walk is still mergeable at that point, delete every sequence collected on it. Runs asg_cleanup() if anything was cut and returns the number of tips cut. */ int asg_cut_tip(asg_t *g, int max_ext) { asg64_v a = {0,0,0}; uint32_t n_vtx = g->n_seq * 2, v, i, cnt = 0; for (v = 0; v < n_vtx; ++v) { if (g->seq[v>>1].del) continue; if (asg_is_utg_end(g, v, 0) != ASG_ET_TIP) continue; // not a tip if (asg_extend(g, v, max_ext, &a) == ASG_ET_MERGEABLE) continue; // not a short unitig for (i = 0; i < a.n; ++i) asg_seq_del(g, (uint32_t)a.a[i]>>1); ++cnt; } free(a.a); if (cnt > 0) asg_cleanup(g); fprintf(stderr, "[M::%s] cut %d tips\n", __func__, cnt); return cnt; } /* Same scheme as asg_cut_tip(), but for walks that both start and stop with ASG_ET_MULTI_NEI; every sequence on such a walk is deleted. Returns the number of walks removed. */ int asg_cut_internal(asg_t *g, int max_ext) { asg64_v a = {0,0,0}; uint32_t n_vtx = g->n_seq * 2, v, i, cnt = 0; for (v = 0; v < n_vtx; ++v) { if (g->seq[v>>1].del) continue; if 
(asg_is_utg_end(g, v, 0) != ASG_ET_MULTI_NEI) continue; if (asg_extend(g, v, max_ext, &a) != ASG_ET_MULTI_NEI) continue; for (i = 0; i < a.n; ++i) asg_seq_del(g, (uint32_t)a.a[i]>>1); ++cnt; } free(a.a); if (cnt > 0) asg_cleanup(g); fprintf(stderr, "[M::%s] cut %d internal sequences\n", __func__, cnt); return cnt; } /* For a live vertex v classified ASG_ET_MULTI_NEI whose extension stops with ASG_ET_MULTI_OUT, let x be the complement of the last vertex collected and w the complement of the last live target of v^1. With ov and ox the ol values of the live arcs w->v and w->x (0 when absent), the arc w->x and its complement x^1->w^1 are flagged deleted when ov > ox. Runs asg_cleanup() if anything was cut and returns the number of cuts. */ int asg_cut_biloop(asg_t *g, int max_ext) { asg64_v a = {0,0,0}; uint32_t n_vtx = g->n_seq * 2, v, i, cnt = 0; for (v = 0; v < n_vtx; ++v) { uint32_t nv, nw, w = UINT32_MAX, x, ov = 0, ox = 0; asg_arc_t *av, *aw; if (g->seq[v>>1].del) continue; if (asg_is_utg_end(g, v, 0) != ASG_ET_MULTI_NEI) continue; if (asg_extend(g, v, max_ext, &a) != ASG_ET_MULTI_OUT) continue; x = (uint32_t)a.a[a.n - 1] ^ 1; nv = asg_arc_n(g, v ^ 1), av = asg_arc_a(g, v ^ 1); for (i = 0; i < nv; ++i) if (!av[i].del) w = av[i].v ^ 1; assert(w != UINT32_MAX); nw = asg_arc_n(g, w), aw = asg_arc_a(g, w); for (i = 0; i < nw; ++i) { // we are looking for: v->...->x', w->v and w->x if (aw[i].del) continue; if (aw[i].v == x) ox = aw[i].ol; if (aw[i].v == v) ov = aw[i].ol; } if (ov == 0 && ox == 0) continue; if (ov > ox) { asg_arc_del(g, w, x, 1); asg_arc_del(g, x^1, w^1, 1); ++cnt; } } free(a.a); if (cnt > 0) asg_cleanup(g); fprintf(stderr, "[M::%s] cut %d small bi-loops\n", __func__, cnt); return cnt; } /****************** * Bubble popping * ******************/ /* per-vertex scratch state for one bubble search; asg_pop_bubble() allocates one entry per vertex */ typedef struct { uint32_t p; // the optimal parent vertex uint32_t d; // the shortest distance from the initial vertex uint32_t c; // max count of reads uint32_t r:31, s:1; // r: the number of remaining incoming arc; s: state } binfo_t; /* work buffers shared by all bubble searches of one asg_pop_bubble() call */ typedef struct { binfo_t *a; kvec_t(uint32_t) S; // set of vertices without parents kvec_t(uint32_t) T; // set of tips kvec_t(uint32_t) b; // visited vertices kvec_t(uint32_t) e; // visited edges/arcs } buf_t; // count the number of outgoing arcs, excluding reduced arcs static inline int count_out(const asg_t *g, uint32_t v) { uint32_t i, n, nv = asg_arc_n(g, v); const asg_arc_t *av = asg_arc_a(g, 
v); for (i = n = 0; i < nv; ++i) if (!av[i].del) ++n; return n; } /* Step 1 flags every visited sequence (b->b) and every visited arc (b->e), together with its complement arc, as deleted. Step 2 walks the parent links b->a[v].p from the single remaining vertex b->S.a[0] back to v0, clearing the del flag on each vertex and on both arcs of each step. */ // in a resolved bubble, mark unused vertices and arcs as "reduced" static void asg_bub_backtrack(asg_t *g, uint32_t v0, buf_t *b) { uint32_t i, v; assert(b->S.n == 1); for (i = 0; i < b->b.n; ++i) g->seq[b->b.a[i]>>1].del = 1; for (i = 0; i < b->e.n; ++i) { asg_arc_t *a = &g->arc[b->e.a[i]]; a->del = 1; asg_arc_del(g, a->v^1, a->ul>>32^1, 1); } v = b->S.a[0]; do { uint32_t u = b->a[v].p; // u->v g->seq[v>>1].del = 0; asg_arc_del(g, u, v, 0); asg_arc_del(g, v^1, u^1, 0); v = u; } while (v != v0); } // pop bubbles from vertex v0; the graph MUST BE symmetric: if u->v present, v'->u' must be present as well static uint64_t asg_bub_pop1(asg_t *g, uint32_t v0, int max_dist, buf_t *b) { uint32_t i, n_pending = 0; uint64_t n_pop = 0; if (g->seq[v0>>1].del) return 0; // already deleted if ((uint32_t)g->idx[v0] < 2) return 0; // no bubbles b->S.n = b->T.n = b->b.n = b->e.n = 0; b->a[v0].c = b->a[v0].d = 0; kv_push(uint32_t, b->S, v0); do { uint32_t v = kv_pop(b->S), d = b->a[v].d, c = b->a[v].c; uint32_t nv = asg_arc_n(g, v); asg_arc_t *av = asg_arc_a(g, v); assert(nv > 0); for (i = 0; i < nv; ++i) { // loop through v's neighbors uint32_t w = av[i].v, l = (uint32_t)av[i].ul; // v->w with length l binfo_t *t = &b->a[w]; if (w == v0) goto pop_reset; if (av[i].del) continue; /* remember the arc by its absolute position in g->arc */ kv_push(uint32_t, b->e, (g->idx[v]>>32) + i); if (d + l > max_dist) break; // too far if (t->s == 0) { // this vertex has never been visited kv_push(uint32_t, b->b, w); // save it for revert t->p = v, t->s = 1, t->d = d + l; t->r = count_out(g, w^1); ++n_pending; } else { // visited before if (c + 1 > t->c || (c + 1 == t->c && d + l > t->d)) t->p = v; if (c + 1 > t->c) t->c = c + 1; if (d + l < t->d) t->d = d + l; // update dist } assert(t->r > 0); if (--(t->r) == 0) { uint32_t x = asg_arc_n(g, w); if (x) kv_push(uint32_t, b->S, w); else kv_push(uint32_t, b->T, w); // a tip --n_pending; } } /* leaving the arc loop early (too far) or running out of vertices to expand abandons this search */ if (i < nv || b->S.n == 0) goto pop_reset; } while 
(b->S.n > 1 || n_pending); asg_bub_backtrack(g, v0, b); /* low 32 bits: one bubble popped; high 32 bits: number of tips collected in b->T */ n_pop = 1 | (uint64_t)b->T.n<<32; pop_reset: for (i = 0; i < b->b.n; ++i) { // clear the states of visited vertices binfo_t *t = &b->a[b->b.a[i]]; t->s = t->c = t->d = 0; } return n_pop; } /* Calls asg_bub_pop1() from every live vertex that still has more than one live arc, symmetrizing the graph first if is_symm is not set, and runs asg_cleanup() if anything was popped. The tallies are summed in a uint64_t (bubbles in the low 32 bits, tips in the high 32 bits) and that value is converted to int on return. */ // pop bubbles int asg_pop_bubble(asg_t *g, int max_dist) { uint32_t v, n_vtx = g->n_seq * 2; uint64_t n_pop = 0; buf_t b; if (!g->is_symm) asg_symm(g); memset(&b, 0, sizeof(buf_t)); b.a = (binfo_t*)calloc(n_vtx, sizeof(binfo_t)); for (v = 0; v < n_vtx; ++v) { uint32_t i, n_arc = 0, nv = asg_arc_n(g, v); asg_arc_t *av = asg_arc_a(g, v); if (nv < 2 || g->seq[v>>1].del) continue; for (i = 0; i < nv; ++i) // asg_bub_pop1() may delete some edges/arcs if (!av[i].del) ++n_arc; if (n_arc > 1) n_pop += asg_bub_pop1(g, v, max_dist, &b); } free(b.a); free(b.S.a); free(b.T.a); free(b.b.a); free(b.e.a); if (n_pop) asg_cleanup(g); fprintf(stderr, "[M::%s] popped %d bubbles and trimmed %d tips\n", __func__, (uint32_t)n_pop, (uint32_t)(n_pop>>32)); return n_pop; } miniasm-0.3/asg.h000066400000000000000000000035331332550340100137340ustar00rootroot00000000000000#ifndef ASG_H #define ASG_H #include #include /* One arc. ul: source vertex in the high 32 bits, arc length in the low 32 bits (see asg_arc_len); v: target vertex; ol: overlap length; del: deletion flag. A vertex id is (sequence id)<<1 | strand bit, so v^1 is the complement of v. */ typedef struct { uint64_t ul; uint32_t v; uint32_t ol:31, del:1; } asg_arc_t; /* One sequence: its length and a deletion flag. */ typedef struct { uint32_t len:31, del:1; } asg_seq_t; /* The graph. arc/n_arc/m_arc and seq/n_seq/m_seq are growable arrays with their used and allocated sizes; idx[v] packs (offset of v's first arc in arc[])<<32 | (number of arcs of v), as decoded by asg_arc_a and asg_arc_n; is_srt and is_symm are set by asg_cleanup() and asg_symm(). */ typedef struct { uint32_t m_arc, n_arc:31, is_srt:1; asg_arc_t *arc; uint32_t m_seq, n_seq:31, is_symm:1; asg_seq_t *seq; uint64_t *idx; } asg_t; typedef struct { size_t n, m; uint64_t *a; } asg64_v; /* accessors: arc length, number of arcs of vertex v, pointer to the first arc of vertex v (the last two require a built index) */ #define asg_arc_len(arc) ((uint32_t)(arc).ul) #define asg_arc_n(g, v) ((uint32_t)(g)->idx[(v)]) #define asg_arc_a(g, v) (&(g)->arc[(g)->idx[(v)]>>32]) asg_t *asg_init(void); void asg_destroy(asg_t *g); void asg_seq_set(asg_t *g, int sid, int len, int del); void asg_symm(asg_t *g); void asg_cleanup(asg_t *g); int asg_arc_del_short(asg_t *g, float drop_ratio); int asg_arc_del_trans(asg_t *g, int fuzz); int asg_cut_tip(asg_t *g, int max_ext); int asg_cut_internal(asg_t *g, int max_ext); int asg_cut_biloop(asg_t *g, 
int max_ext); int asg_pop_bubble(asg_t *g, int max_dist); /* Grows g->arc when full (capacity doubles, starting at 16) and returns a pointer to the new last slot, whose contents the caller must fill in. */ // append an arc static inline asg_arc_t *asg_arc_pushp(asg_t *g) { if (g->n_arc == g->m_arc) { g->m_arc = g->m_arc? g->m_arc<<1 : 16; g->arc = (asg_arc_t*)realloc(g->arc, g->m_arc * sizeof(asg_arc_t)); } return &g->arc[g->n_arc++]; } /* Applies to every arc of v whose target is w; del is normalised to 0 or 1. Needs a valid index. */ // set asg_arc_t::del for v->w static inline void asg_arc_del(asg_t *g, uint32_t v, uint32_t w, int del) { uint32_t i, nv = asg_arc_n(g, v); asg_arc_t *av = asg_arc_a(g, v); for (i = 0; i < nv; ++i) if (av[i].v == w) av[i].del = !!del; } /* "Associated arcs" are the arcs leaving both vertices of s (s<<1 and s<<1|1) and, for each such arc v->w, the complement arc w^1->v^1. */ // set asg_arc_t::del and asg_seq_t::del to 1 for sequence s and all its associated arcs static inline void asg_seq_del(asg_t *g, uint32_t s) { uint32_t k; g->seq[s].del = 1; for (k = 0; k < 2; ++k) { uint32_t i, v = s<<1 | k; uint32_t nv = asg_arc_n(g, v); asg_arc_t *av = asg_arc_a(g, v); for (i = 0; i < nv; ++i) { av[i].del = 1; asg_arc_del(g, av[i].v^1, v^1, 1); } } } #endif miniasm-0.3/asm.c000066400000000000000000000212261332550340100137340ustar00rootroot00000000000000#include #include #include #include #include #include "miniasm.h" #include "kvec.h" /* Build the graph from n_hits hits: one sequence per entry of d (length sub[i].e - sub[i].s and deletion flag sub[i].del || d->seq[i].del when sub is given, otherwise taken from d), and one arc per hit for which ma_hit2arc() returns >= 0 and whose query and target differ. A self hit never becomes an arc; a reverse self hit with identical query and target coordinates marks the read deleted, and so does a hit reported as MA_HT_QCONT. Ends with asg_cleanup(); the caller owns the returned graph. */ asg_t *ma_sg_gen(const ma_opt_t *opt, const sdict_t *d, const ma_sub_t *sub, size_t n_hits, const ma_hit_t *hit) { size_t i; asg_t *g; g = asg_init(); for (i = 0; i < d->n_seq; ++i) { if (sub) asg_seq_set(g, i, sub[i].e - sub[i].s, (sub[i].del || d->seq[i].del)); else asg_seq_set(g, i, d->seq[i].len, d->seq[i].del); } for (i = 0; i < n_hits; ++i) { int r; asg_arc_t t, *p; const ma_hit_t *h = &hit[i]; uint32_t qn = h->qns>>32; int ql = sub? sub[qn].e - sub[qn].s : d->seq[qn].len; int tl = sub? sub[h->tn].e - sub[h->tn].s : d->seq[h->tn].len; r = ma_hit2arc(h, ql, tl, opt->max_hang, opt->int_frac, opt->min_ovlp, &t); if (r >= 0) { if (qn == h->tn) { // self match if ((uint32_t)h->qns == h->ts && h->qe == h->te && h->rev) // PacBio-specific artifact (TODO: is this right when we skip target containment above?) 
g->seq[qn].del = 1; continue; } p = asg_arc_pushp(g); *p = t; } else if (r == MA_HT_QCONT) g->seq[qn].del = 1; } asg_cleanup(g); fprintf(stderr, "[M::%s] read %d arcs\n", __func__, g->n_arc); return g; } /* Write every arc of g to fp as one "L" line: source name and strand, target name and strand, ol, and the arc length as the L1:i tag. When sub is given, each name is followed by the 1-based interval s+1..e of that read. */ void ma_sg_print(const asg_t *g, const sdict_t *d, const ma_sub_t *sub, FILE *fp) { uint32_t i; for (i = 0; i < g->n_arc; ++i) { const asg_arc_t *p = &g->arc[i]; if (sub) { const ma_sub_t *sq = &sub[p->ul>>33], *st = &sub[p->v>>1]; fprintf(fp, "L\t%s:%d-%d\t%c\t%s:%d-%d\t%c\t%d:\tL1:i:%d\n", d->seq[p->ul>>33].name, sq->s + 1, sq->e, "+-"[p->ul>>32&1], d->seq[p->v>>1].name, st->s + 1, st->e, "+-"[p->v&1], p->ol, (uint32_t)p->ul); } else { fprintf(fp, "L\t%s\t%c\t%s\t%c\t%d:\tL1:i:%d\n", d->seq[p->ul>>33].name, "+-"[p->ul>>32&1], d->seq[p->v>>1].name, "+-"[p->v&1], p->ol, (uint32_t)p->ul); } } } /********************* * Unitig generation * *********************/ #include "kdq.h" KDQ_INIT(uint64_t) /* Free a unitig graph: the a and s buffers of every unitig, the unitig array, the embedded asg graph, then ug itself; a null ug is ignored. */ void ma_ug_destroy(ma_ug_t *ug) { uint32_t i; if (ug == 0) return; for (i = 0; i < ug->u.n; ++i) { free(ug->u.a[i].a); free(ug->u.a[i].s); } free(ug->u.a); asg_destroy(ug->g); free(ug); } /* Write the unitig graph to fp in three passes: an "S" line per unitig (sequence or "*" when p->s is null) followed by one "a" line per member read with its running offset; an "L" line per arc between unitigs; an "x" summary line per unitig. Unitigs are named utgNNNNNNl (linear) or utgNNNNNNc (circular), numbered from 1. */ void ma_ug_print(const ma_ug_t *ug, const sdict_t *d, const ma_sub_t *sub, FILE *fp) { uint32_t i, j, l; char name[32]; for (i = 0; i < ug->u.n; ++i) { // the Segment lines in GFA ma_utg_t *p = &ug->u.a[i]; sprintf(name, "utg%.6d%c", i + 1, "lc"[p->circ]); fprintf(fp, "S\t%s\t%s\tLN:i:%d\n", name, p->s? 
p->s : "*", p->len); /* each p->a[j] packs vertex<<32 | length; l accumulates the lengths to give the offset of read j within the unitig */ for (j = l = 0; j < p->n; l += (uint32_t)p->a[j++]) { uint32_t x = p->a[j]>>33; if (sub) fprintf(fp, "a\t%s\t%d\t%s:%d-%d\t%c\t%d\n", name, l, d->seq[x].name, sub[x].s + 1, sub[x].e, "+-"[p->a[j]>>32&1], (uint32_t)p->a[j]); else fprintf(fp, "a\t%s\t%d\t%s\t%c\t%d\n", name, l, d->seq[x].name, "+-"[p->a[j]>>32&1], (uint32_t)p->a[j]); } } for (i = 0; i < ug->g->n_arc; ++i) { // the Link lines in GFA uint32_t u = ug->g->arc[i].ul>>32, v = ug->g->arc[i].v; fprintf(fp, "L\tutg%.6d%c\t%c\tutg%.6d%c\t%c\t%dM\tSD:i:%d\n", (u>>1)+1, "lc"[ug->u.a[u>>1].circ], "+-"[u&1], (v>>1)+1, "lc"[ug->u.a[v>>1].circ], "+-"[v&1], ug->g->arc[i].ol, asg_arc_len(ug->g->arc[i])); } for (i = 0; i < ug->u.n; ++i) { // summary of unitigs uint32_t cnt[2]; ma_utg_t *u = &ug->u.a[i]; /* start == UINT32_MAX is how ma_ug_gen() records a circular unitig */ if (u->start == UINT32_MAX) { fprintf(fp, "x\tutg%.6dc\t%d\t%d\n", i + 1, u->len, u->n); } else { for (j = 0; j < 2; ++j) cnt[j] = asg_arc_n(ug->g, i<<1|j); if (sub) fprintf(fp, "x\tutg%.6dl\t%d\t%d\t%d\t%d\t%s:%d-%d\t%c\t%s:%d-%d\t%c\n", i + 1, u->len, u->n, cnt[1], cnt[0], d->seq[u->start>>1].name, sub[u->start>>1].s + 1, sub[u->start>>1].e, "+-"[u->start&1], d->seq[u->end>>1].name, sub[u->end>>1].s + 1, sub[u->end>>1].e, "+-"[u->end&1]); else fprintf(fp, "x\tutg%.6dl\t%d\t%d\t%d\t%d\t%s\t%c\t%s\t%c\n", i + 1, u->len, u->n, cnt[1], cnt[0], d->seq[u->start>>1].name, "+-"[u->start&1], d->seq[u->end>>1].name, "+-"[u->end&1]); } } } /* number of arcs of vertex v, and its first arc, read from the index of g */ #define arc_cnt(g, v) ((uint32_t)(g)->idx[(v)]) #define arc_first(g, v) ((g)->arc[(g)->idx[(v)]>>32]) /* Build the unitig graph of g. From each live, unmarked vertex that has arcs, the walk goes forward, and for a linear unitig also backward, across steps w->x where w has exactly one arc and x^1 has exactly one arc. Each unitig stores its members as vertex<<32 | length. Arcs between unitig ends are added afterwards. The caller owns the result (see ma_ug_destroy). */ ma_ug_t *ma_ug_gen(asg_t *g) { int32_t *mark; uint32_t i, v, n_vtx = g->n_seq * 2; kdq_t(uint64_t) *q; ma_ug_t *ug; ug = (ma_ug_t*)calloc(1, sizeof(ma_ug_t)); ug->g = asg_init(); mark = (int32_t*)calloc(n_vtx, 4); q = kdq_init(uint64_t); for (v = 0; v < n_vtx; ++v) { uint32_t w, x, l, start, end, len; ma_utg_t *p; if (g->seq[v>>1].del || arc_cnt(g, v) == 0 || mark[v]) continue; mark[v] = 1; q->count = 0, start = v, end = v^1, len = 0; // forward w = v; while (1) { if 
(arc_cnt(g, w) != 1) break; x = arc_first(g, w).v; // w->x if (arc_cnt(g, x^1) != 1) break; mark[x] = mark[w^1] = 1; l = asg_arc_len(arc_first(g, w)); kdq_push(uint64_t, q, (uint64_t)w<<32 | l); end = x^1, len += l; w = x; if (x == v) break; } /* the walk came back to v only if end^1 == start with a non-empty queue; otherwise the last vertex is appended with the full length of its sequence */ if (start != (end^1) || kdq_size(q) == 0) { // linear unitig l = g->seq[end>>1].len; kdq_push(uint64_t, q, (uint64_t)(end^1)<<32 | l); len += l; } else { // circular unitig start = end = UINT32_MAX; goto add_unitig; // then it is not necessary to do the backward } // backward x = v; while (1) { // similar to forward but not the same if (arc_cnt(g, x^1) != 1) break; w = arc_first(g, x^1).v ^ 1; // w->x if (arc_cnt(g, w) != 1) break; mark[x] = mark[w^1] = 1; l = asg_arc_len(arc_first(g, w)); kdq_unshift(uint64_t, q, (uint64_t)w<<32 | l); start = w, len += l; x = w; } add_unitig: if (start != UINT32_MAX) mark[start] = mark[end] = 1; kv_pushp(ma_utg_t, ug->u, &p); p->s = 0, p->start = start, p->end = end, p->len = len, p->n = kdq_size(q), p->circ = (start == UINT32_MAX); p->m = p->n; kv_roundup32(p->m); p->a = (uint64_t*)malloc(8 * p->m); for (i = 0; i < kdq_size(q); ++i) p->a[i] = kdq_at(q, i); } kdq_destroy(uint64_t, q); // add arcs between unitigs; reusing mark for a different purpose /* from here on mark[v] is unitig_id<<1 | 0 for a unitig's start vertex, unitig_id<<1 | 1 for its end vertex, and -1 for every other vertex; circular unitigs get no entry */ for (v = 0; v < n_vtx; ++v) mark[v] = -1; for (i = 0; i < ug->u.n; ++i) { if (ug->u.a[i].circ) continue; mark[ug->u.a[i].start] = i<<1 | 0; mark[ug->u.a[i].end] = i<<1 | 1; } for (i = 0; i < g->n_arc; ++i) { asg_arc_t *p = &g->arc[i]; if (p->del) continue; if (mark[p->ul>>32^1] >= 0 && mark[p->v] >= 0) { asg_arc_t *q; uint32_t u = mark[p->ul>>32^1]^1; /* arc length between unitigs = length of the source unitig minus the overlap, forced to 1 when that is negative */ int l = ug->u.a[u>>1].len - p->ol; if (l < 0) l = 1; q = asg_arc_pushp(ug->g); q->ol = p->ol, q->del = 0; q->ul = (uint64_t)u<<32 | l; q->v = mark[p->v]; } } for (i = 0; i < ug->u.n; ++i) asg_seq_set(ug->g, i, ug->u.a[i].len, 0); asg_cleanup(ug->g); free(mark); return ug; } /******************* * Unitig sequence * *******************/ #include #include "kseq.h" KSEQ_INIT(gzFile, gzread) typedef 
struct { uint32_t utg:31, ori:1, start, len; } utg_intv_t; static char comp_tab[] = { // complement base 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 'T', 'V', 'G', 'H', 'E', 'F', 'C', 'D', 'I', 'J', 'M', 'L', 'K', 'N', 'O', 'P', 'Q', 'Y', 'S', 'A', 'A', 'B', 'W', 'X', 'R', 'Z', 91, 92, 93, 94, 95, 64, 't', 'v', 'g', 'h', 'e', 'f', 'c', 'd', 'i', 'j', 'm', 'l', 'k', 'n', 'o', 'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127 }; // generate unitig sequences int ma_ug_seq(ma_ug_t *g, const sdict_t *d, const ma_sub_t *sub, const char *fn) { gzFile fp; kseq_t *ks; utg_intv_t *tmp; uint32_t i, j; fp = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); if (fp == 0) return -1; ks = kseq_init(fp); tmp = (utg_intv_t*)calloc(d->n_seq, sizeof(utg_intv_t)); for (i = 0; i < g->u.n; ++i) { ma_utg_t *u = &g->u.a[i]; uint32_t l = 0; u->s = (char*)calloc(1, u->len + 1); memset(u->s, 'N', u->len); for (j = 0; j < u->n; ++j) { utg_intv_t *t = &tmp[u->a[j]>>33]; assert(t->len == 0); t->utg = i, t->ori = u->a[j]>>32&1; t->start = l, t->len = (uint32_t)u->a[j]; l += t->len; } } while (kseq_read(ks) >= 0) { int32_t id; utg_intv_t *t; ma_utg_t *u; id = sd_get(d, ks->name.s); if (id < 0 || tmp[id].len == 0) continue; t = &tmp[id]; u = &g->u.a[t->utg]; if (sub) { assert(sub[id].e - sub[id].s <= ks->seq.l); memmove(ks->seq.s, ks->seq.s + sub[id].s, sub[id].e - sub[id].s); ks->seq.l = sub[id].e - sub[id].s; } if (!t->ori) { // forward strand for (i = 0; i < t->len; ++i) u->s[t->start + i] = ks->seq.s[i]; } else { for (i = 0; i < t->len; ++i) { int c = (uint8_t)ks->seq.s[ks->seq.l - 1 - i]; u->s[t->start + i] = c >= 128? 
'N' : comp_tab[c]; } } } free(tmp); kseq_destroy(ks); gzclose(fp); return 0; } miniasm-0.3/common.c000066400000000000000000000006531332550340100144450ustar00rootroot00000000000000#include "miniasm.h" int ma_verbose = 3; void ma_opt_init(ma_opt_t *opt) { opt->min_span = 2000; opt->min_match = 100; opt->min_dp = 3; opt->min_iden = .05; opt->max_hang = 1000; opt->min_ovlp = opt->min_span; opt->int_frac = .8; opt->gap_fuzz = 1000; opt->n_rounds = 2; opt->bub_dist = 50000; opt->max_ext = 4; opt->min_ovlp_drop_ratio = .5; opt->max_ovlp_drop_ratio = .7; opt->final_ovlp_drop_ratio = .8; } miniasm-0.3/dotter.c000066400000000000000000000133521332550340100144560ustar00rootroot00000000000000#include #include #include #include #include #include "paf.h" #include "sdict.h" #include "kvec.h" #include "eps.h" typedef struct { uint32_t qn, qs, qe; uint32_t tn, ts, te; uint32_t ml; } dt_hit_t; typedef struct { const char *name; double tot; uint64_t w; uint32_t i; } srtaux_t; static inline int mixed_numcompare(const char *_a, const char *_b) { const unsigned char *a = (const unsigned char*)_a, *b = (const unsigned char*)_b; const unsigned char *pa = a, *pb = b; while (*pa && *pb) { if (isdigit(*pa) && isdigit(*pb)) { while (*pa == '0') ++pa; while (*pb == '0') ++pb; while (isdigit(*pa) && isdigit(*pb) && *pa == *pb) ++pa, ++pb; if (isdigit(*pa) && isdigit(*pb)) { int i = 0; while (isdigit(pa[i]) && isdigit(pb[i])) ++i; return isdigit(pa[i])? 1 : isdigit(pb[i])? -1 : (int)*pa - (int)*pb; } else if (isdigit(*pa)) return 1; else if (isdigit(*pb)) return -1; else if (pa - a != pb - b) return pa - a < pb - b? 1 : -1; } else { if (*pa != *pb) return (int)*pa - (int)*pb; ++pa; ++pb; } } return *pa? 1 : *pb? 
-1 : 0; } #include "ksort.h" #define srtx_lt(a, b) (mixed_numcompare((a).name, (b).name) < 0) KSORT_INIT(dtx, srtaux_t, srtx_lt) #define srty_lt(a, b) ((a).tot < (b).tot) KSORT_INIT(dty, srtaux_t, srty_lt) int main(int argc, char *argv[]) { int min_span = 1000, min_match = 100, width = 600, height, diagonal = 1; int color[2] = { 0xFF0000, 0x0080FF }, font_size = 11, no_label = 0; float min_iden = .1; paf_file_t *f; sdict_t *d[2]; paf_rec_t r; int32_t c, i, j; uint64_t *acclen[2], totlen[2]; srtaux_t *a[2]; kvec_t(dt_hit_t) h = {0,0,0}; double sx, sy; while ((c = getopt(argc, argv, "m:i:s:w:f:Ld")) >= 0) { if (c == 'm') min_match = atoi(optarg); else if (c == 'i') min_iden = atof(optarg); else if (c == 's') min_span = atoi(optarg); else if (c == 'w') width = atoi(optarg); else if (c == 'f') font_size = atoi(optarg); else if (c == 'L') no_label = 1; else if (c == 'd') diagonal = 0; } if (argc == optind) { fprintf(stderr, "Usage: minidot [options] \n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " -m INT min match length [%d]\n", min_match); fprintf(stderr, " -i FLOAT min identity [%.2f]\n", min_iden); fprintf(stderr, " -s INT min span [%d]\n", min_span); fprintf(stderr, " -w INT image width [%d]\n", width); fprintf(stderr, " -f INT font size [%d]\n", font_size); fprintf(stderr, " -L don't print labels\n"); fprintf(stderr, " -D don't try to put hits onto the diagonal\n"); return 1; } d[0] = sd_init(); d[1] = sd_init(); f = paf_open(argv[optind]); if (!f) { fprintf(stderr, "[E::%s] could not open PAF file %s\n", __func__, argv[optind]); return 1; } while (paf_read(f, &r) >= 0) { dt_hit_t *s; if (r.qe - r.qs < min_span || r.te - r.ts < min_span || r.ml < min_match) continue; if (r.ml < r.bl * min_iden) continue; kv_pushp(dt_hit_t, h, &s); s->qn = sd_put(d[1], r.qn, r.ql), s->qs = r.qs, s->qe = r.qe; s->tn = sd_put(d[0], r.tn, r.tl); s->ts = r.rev? r.te : r.ts, s->te = r.rev? 
r.ts : r.te; s->ml = r.ml; } paf_close(f); for (i = 0; i < 2; ++i) { // 0 for target; 1 for query uint32_t n = d[i]->n_seq; uint64_t l = 0; a[i] = (srtaux_t*)calloc(n + 1, sizeof(srtaux_t)); if (i == 0 || !diagonal) { for (j = 0; j < n; ++j) a[i][j].name = d[i]->seq[j].name, a[i][j].i = j; ks_introsort_dtx(n, a[i]); } else { srtaux_t *b = a[i]; for (j = 0; j < n; ++j) b[j].name = d[i]->seq[j].name, b[j].tot = b[j].w = 0, b[j].i = j; for (j = 0; j < h.n; ++j) { uint64_t w, coor; dt_hit_t *p = &h.a[j]; srtaux_t *q = &b[p->qn]; coor = acclen[0][p->tn] + (p->ts + p->te) / 2; w = (uint64_t)(.01 * p->ml * p->ml + .499); q->tot += (double)coor * w; q->w += w; } for (j = 0; j < n; ++j) b[j].tot /= b[j].w; ks_introsort_dty(n, b); } acclen[i] = (uint64_t*)calloc(n, 8); for (j = 0; j < n; ++j) acclen[i][a[i][j].i] = l, l += d[i]->seq[a[i][j].i].len; totlen[i] = l; } height = (int)((double)width / totlen[0] * totlen[1] + .499); sx = (double)width / totlen[0]; sy = (double)height / totlen[1]; eps_header(stdout, width, height, .2); eps_font(stdout, "Helvetica-Narrow", font_size); eps_gray(stdout, .8); if (!no_label) { // write x labels for (i = 0; i < d[0]->n_seq; ++i) eps_Mstr(stdout, (acclen[0][a[0][i].i] + .5 * d[0]->seq[a[0][i].i].len) * sx, font_size*.5, a[0][i].name); eps_stroke(stdout); fprintf(stdout, "gsave %g 0 translate 90 rotate\n", font_size*1.25); // write y labels for (i = 0; i < d[1]->n_seq; ++i) eps_Mstr(stdout, (acclen[1][a[1][i].i] + .5 * d[1]->seq[a[1][i].i].len) * sx, 0, a[1][i].name); fprintf(stdout, "grestore\n"); eps_stroke(stdout); } // write grid lines eps_linewidth(stdout, .1); for (i = 0; i < d[1]->n_seq; ++i) eps_linex(stdout, 1, width, i == 0? 1 : acclen[1][a[1][i].i] * sy); eps_linex(stdout, 1, width, totlen[1] * sy); for (i = 0; i < d[0]->n_seq; ++i) eps_liney(stdout, 1, height, i == 0? 
1 : acclen[0][a[0][i].i] * sx); eps_liney(stdout, 1, height, totlen[0] * sx); eps_stroke(stdout); // write hits eps_linewidth(stdout, .1); for (j = 0; j < 2; ++j) { eps_color(stdout, color[j]); for (i = 0; i < h.n; ++i) { dt_hit_t *p = &h.a[i]; double x0, y0, x1, y1; uint64_t xo = acclen[0][p->tn], yo = acclen[1][p->qn]; if (j == 0 && p->ts > p->te) continue; if (j == 1 && p->ts < p->te) continue; x0 = (p->ts + xo) * sx, y0 = (p->qs + yo) * sy; x1 = (p->te + xo) * sx, y1 = (p->qe + yo) * sy; eps_line(stdout, x0, y0, x1, y1); } eps_stroke(stdout); } eps_bottom(stdout); for (i = 0; i < 2; ++i) { free(acclen[i]); free(a[i]); sd_destroy(d[i]); } free(h.a); return 0; } miniasm-0.3/eps.h000066400000000000000000000051311332550340100137450ustar00rootroot00000000000000#ifndef EPS_H_ #define EPS_H_ #include #define EPS FILE #define EPSPTR FILE * #define eps_open(s) fopen((s),"w+") #define eps_close(fp) fclose(fp) #define eps_header(fp,x,y,linewidth) { \ fprintf(fp,"%%!PS-Adobe-3.0 EPSF-3.0\n"); \ fprintf(fp,"%%%%BoundingBox:"); \ fprintf(fp," 1 1 %g %g\n\n",(float)(x),(float)(y)); \ fprintf(fp,"/C { dup 255 and 255 div exch dup -8 bitshift 255 and 255 div 3 1 roll -16 bitshift 255 and 255 div 3 1 roll setrgbcolor } bind def\n"); \ fprintf(fp,"/L { 4 2 roll moveto lineto } bind def\n"); \ fprintf(fp,"/LX { dup 4 -1 roll exch moveto lineto } bind def\n"); \ fprintf(fp,"/LY { dup 4 -1 roll moveto exch lineto } bind def\n"); \ fprintf(fp,"/LS { 3 1 roll moveto show } bind def\n"); \ fprintf(fp,"/MS { dup stringwidth pop 2 div 4 -1 roll exch sub 3 -1 roll moveto show } bind def\n"); \ fprintf(fp,"/RS { dup stringwidth pop 4 -1 roll exch sub 3 -1 roll moveto show } bind def\n"); \ fprintf(fp,"/B { 4 copy 3 1 roll exch 6 2 roll 8 -2 roll moveto lineto lineto lineto closepath } bind def\n");\ fprintf(fp,"%g setlinewidth\n\n",linewidth);\ } #define eps_font(fp,f,s) do { \ fprintf(fp,"/FS %d def\n",s); \ fprintf(fp,"/FS4 FS 4 div def\n"); \ fprintf(fp,"/%s findfont FS scalefont 
setfont\n\n",f); \ } while (0) #define eps_bottom(fp) fprintf(fp,"stroke showpage\n") #define eps_color(fp,col) fprintf(fp,"stroke %d C\n",col) #define eps_gray(fp,gray) fprintf(fp, "%g setgray\n",(float)gray) #define eps_linewidth(fp, lw) fprintf(fp, "%g setlinewidth\n", (float)(lw)) #define eps_line(fp,x1,y1,x2,y2) fprintf(fp,"%g %g %g %g L\n",(float)(x1),(float)(y1),(float)(x2),(float)(y2)) #define eps_linex(fp,x1,x2,y) fprintf(fp,"%g %g %g LX\n",(float)(x1),(float)(x2),(float)(y)) #define eps_liney(fp,y1,y2,x) fprintf(fp,"%g %g %g LY\n",(float)(y1),(float)(y2),(float)(x)) #define eps_Lstr(fp,x,y,s) fprintf(fp,"%g %g (%s) LS\n",(float)(x),(float)(y),s) #define eps_Mstr(fp,x,y,s) fprintf(fp,"%g %g (%s) MS\n",(float)(x),(float)(y),s) #define eps_Rstr(fp,x,y,s) fprintf(fp,"%g %g (%s) RS\n",(float)(x),(float)(y),s) #define eps_Lstr4(fp,x,y,s) fprintf(fp,"%g %g FS4 add (%s) LS\n",(float)(x),(float)(y),s) #define eps_Rstr4(fp,x,y,s) fprintf(fp,"%g %g FS4 add (%s) RS\n",(float)(x),(float)(y),s) #define eps_Lstr4s(fp,x,y,s) fprintf(fp,"%g %g FS4 sub (%s) LS\n",(float)(x),(float)(y),s) #define eps_Rstr4s(fp,x,y,s) fprintf(fp,"%g %g FS4 sub (%s) RS\n",(float)(x),(float)(y),s) #define eps_box(fp,x1,y1,x2,y2) fprintf(fp,"%g %g %g %g B\n",(float)(x1),(float)(y1),(float)(x2),(float)(y2)) #define eps_fill(fp) fprintf(fp,"fill\n") #define eps_stroke(fp) fprintf(fp,"stroke\n") #endif miniasm-0.3/hit.c000066400000000000000000000201001332550340100137260ustar00rootroot00000000000000#include #include #include #include #include "sdict.h" #include "paf.h" #include "kvec.h" #include "sys.h" #include "miniasm.h" #include "ksort.h" #define ma_hit_key(a) ((a).qns) KRADIX_SORT_INIT(hit, ma_hit_t, ma_hit_key, 8) KSORT_INIT_GENERIC(uint32_t) typedef kvec_t(uint32_t) uint32_v; void ma_hit_sort(size_t n, ma_hit_t *a) { radix_sort_hit(a, a + n); } void ma_hit_mark_unused(sdict_t *d, size_t n, const ma_hit_t *a) { size_t i; for (i = 0; i < d->n_seq; ++i) d->seq[i].aux = 0; for (i = 0; i < n; 
++i) d->seq[a[i].qns>>32].aux = d->seq[a[i].tn].aux = 1; for (i = 0; i < d->n_seq; ++i) { sd_seq_t *s = &d->seq[i]; if (!s->aux) s->del = 1; else s->aux = 0; } } sdict_t *ma_hit_no_cont(const char *fn, int min_span, int min_match, int max_hang, float int_frac) { paf_file_t *fp; paf_rec_t r; sdict_t *d; fp = paf_open(fn); if (!fp) { fprintf(stderr, "[E::%s] could not open PAF file %s\n", __func__, fn); exit(1); } d = sd_init(); while (paf_read(fp, &r) >= 0) { int l5, l3; if (r.qe - r.qs < min_span || r.te - r.ts < min_span || r.ml < min_match) continue; l5 = r.rev? r.tl - r.te : r.ts; l3 = r.rev? r.ts : r.tl - r.te; if (r.ql>>1 > r.tl) { if (l5 > max_hang>>2 || l3 > max_hang>>2 || r.te - r.ts < r.tl * int_frac) continue; // internal match if ((int)r.qs - l5 > max_hang<<1 && (int)(r.ql - r.qe) - l3 > max_hang<<1) sd_put(d, r.tn, r.tl); } else if (r.ql < r.tl>>1) { if (r.qs > max_hang>>2 || r.ql - r.qe > max_hang>>2 || r.qe - r.qs < r.ql * int_frac) continue; // internal if (l5 - (int)r.qs > max_hang<<1 && l3 - (int)(r.ql - r.qe) > max_hang<<1) sd_put(d, r.qn, r.ql); } } paf_close(fp); if (ma_verbose >= 3) fprintf(stderr, "[M::%s::%s] dropped %d contained reads\n", __func__, sys_timestamp(), d->n_seq); return d; } ma_hit_t *ma_hit_read(const char *fn, int min_span, int min_match, sdict_t *d, size_t *n, int bi_dir, const sdict_t *excl) { paf_file_t *fp; paf_rec_t r; ma_hit_v h = {0,0,0}; size_t i, tot = 0, tot_len = 0; fp = paf_open(fn); if (!fp) { fprintf(stderr, "[E::%s] could not open PAF file %s\n", __func__, fn); exit(1); } while (paf_read(fp, &r) >= 0) { ma_hit_t *p; ++tot; if (r.qe - r.qs < min_span || r.te - r.ts < min_span || r.ml < min_match) continue; if (excl && (sd_get(excl, r.qn) >= 0 || sd_get(excl, r.tn) >= 0)) continue; kv_pushp(ma_hit_t, h, &p); p->qns = (uint64_t)sd_put(d, r.qn, r.ql)<<32 | r.qs; p->qe = r.qe; p->tn = sd_put(d, r.tn, r.tl); p->ts = r.ts, p->te = r.te, p->rev = r.rev, p->ml = r.ml, p->bl = r.bl; if (bi_dir && p->qns>>32 != p->tn) { 
kv_pushp(ma_hit_t, h, &p); p->qns = (uint64_t)sd_put(d, r.tn, r.tl)<<32 | r.ts; p->qe = r.te; p->tn = sd_put(d, r.qn, r.ql); p->ts = r.qs, p->te = r.qe, p->rev = r.rev, p->ml = r.ml, p->bl = r.bl; } } paf_close(fp); for (i = 0; i < d->n_seq; ++i) tot_len += d->seq[i].len; if (ma_verbose >= 3) fprintf(stderr, "[M::%s::%s] read %ld hits; stored %ld hits and %d sequences (%ld bp)\n", __func__, sys_timestamp(), tot, h.n, d->n_seq, tot_len); ma_hit_sort(h.n, h.a); *n = h.n; return h.a; } ma_sub_t *ma_hit_sub(int min_dp, float min_iden, int end_clip, size_t n, const ma_hit_t *a, size_t n_sub) { size_t i, j, last, n_remained = 0; kvec_t(uint32_t) b = {0,0,0}; ma_sub_t *sub = 0; sub = (ma_sub_t*)calloc(n_sub, sizeof(ma_sub_t)); for (i = 1, last = 0; i <= n; ++i) { if (i == n || a[i].qns>>32 != a[i-1].qns>>32) { // we come to a new query sequence size_t start = 0; int dp, qid = a[i-1].qns>>32; ma_sub_t max, max2; kv_resize(uint32_t, b, i - last); b.n = 0; for (j = last; j < i; ++j) { // collect all starts and ends uint32_t qs, qe; if (a[j].tn == qid || a[j].ml < a[j].bl * min_iden) continue; // skip self match qs = (uint32_t)a[j].qns + end_clip, qe = a[j].qe - end_clip; if (qe > qs) { kv_push(uint32_t, b, qs<<1); kv_push(uint32_t, b, qe<<1|1); } } ks_introsort_uint32_t(b.n, b.a); max.s = max.e = max.del = max2.s = max2.e = max2.del = 0; for (j = 0, dp = 0; j < b.n; ++j) { int old_dp = dp; if (b.a[j]&1) --dp; else ++dp; if (old_dp < min_dp && dp >= min_dp) { start = b.a[j]>>1; } else if (old_dp >= min_dp && dp < min_dp) { int len = (b.a[j]>>1) - start; if (len > max.e - max.s) max2 = max, max.s = start, max.e = b.a[j]>>1; else if (len > max2.e - max2.s) max2.s = start, max2.e = b.a[j]>>1; } } if (max.e - max.s > 0) { assert(qid < n_sub); sub[qid].s = max.s - end_clip; sub[qid].e = max.e + end_clip; sub[qid].del = 0; ++n_remained; } else sub[qid].del = 1; last = i; } } free(b.a); if (ma_verbose >= 3) fprintf(stderr, "[M::%s::%s] %ld query sequences remain after sub\n", 
/*
 * Clip every hit in a[] to the regions in reg[] and compact the array.
 *
 * reg is indexed by sequence id; a hit is dropped when the region of its query
 * or of its target is flagged del. Otherwise the part of the hit that falls
 * outside the target region is projected onto the query (and vice versa, taking
 * the strand into account), the result is clamped to both regions, and the
 * coordinates are rewritten relative to the region starts. Hits whose clipped
 * query and target spans are both at least min_span are kept, in order, at the
 * front of a[].
 *
 * Returns the number of hits kept.
 */
size_t ma_hit_cut(const ma_sub_t *reg, int min_span, size_t n, ma_hit_t *a)
{
	size_t i, m;
	for (i = m = 0; i < n; ++i) {
		ma_hit_t *p = &a[i];
		/* qns packs the query id in the high 32 bits and the query start in the low 32 bits */
		const ma_sub_t *rq = &reg[p->qns>>32], *rt = &reg[p->tn];
		int qs, qe, ts, te;
		if (rq->del || rt->del) continue;
		if (p->rev) { /* on the reverse strand the target end pairs with the query start */
			qs = p->te < rt->e? (uint32_t)p->qns : (uint32_t)p->qns + (p->te - rt->e);
			qe = p->ts > rt->s? p->qe : p->qe - (rt->s - p->ts);
			ts = p->qe < rq->e? p->ts : p->ts + (p->qe - rq->e);
			te = (uint32_t)p->qns > rq->s? p->te : p->te - (rq->s - (uint32_t)p->qns);
		} else {
			qs = p->ts > rt->s? (uint32_t)p->qns : (uint32_t)p->qns + (rt->s - p->ts);
			qe = p->te < rt->e? p->qe : p->qe - (p->te - rt->e);
			ts = (uint32_t)p->qns > rq->s? p->ts : p->ts + (rq->s - (uint32_t)p->qns);
			te = p->qe < rq->e? p->te : p->te - (p->qe - rq->e);
		}
		/* clamp to the region and shift so that the region start becomes coordinate 0 */
		qs = (qs > rq->s? qs : rq->s) - rq->s;
		qe = (qe < rq->e? qe : rq->e) - rq->s;
		ts = (ts > rt->s? ts : rt->s) - rt->s;
		te = (te < rt->e? te : rt->e) - rt->s;
		if (qe - qs >= min_span && te - ts >= min_span) {
			p->qns = p->qns>>32<<32 | qs, p->qe = qe, p->ts = ts, p->te = te;
			a[m++] = *p;
		}
	}
	/* m is a size_t; cast it to match the %ld conversion */
	if (ma_verbose >= 3) fprintf(stderr, "[M::%s::%s] %ld hits remain after cut\n", __func__, sys_timestamp(), (long)m);
	return m;
}
sq->e - sq->s : st->e - st->s; } for (i = 1; i <= m; ++i) if (i == m || a[i].qns>>32 != a[i-1].qns>>32) tot_len += sub[a[i-1].qns>>32].e - sub[a[i-1].qns>>32].s; *cov = (double)tot_dp / tot_len; if (ma_verbose >= 3) fprintf(stderr, "[M::%s::%s] %ld hits remain after filtering; crude coverage after filtering: %.2f\n", __func__, sys_timestamp(), m, *cov); return m; } void ma_sub_merge(size_t n_sub, ma_sub_t *a, const ma_sub_t *b) { size_t i; for (i = 0; i < n_sub; ++i) a[i].e = a[i].s + b[i].e, a[i].s += b[i].s; } size_t ma_hit_contained(const ma_opt_t *opt, sdict_t *d, ma_sub_t *sub, size_t n, ma_hit_t *a) { int32_t *map, r; size_t i, m, old_n_seq = d->n_seq; asg_arc_t t; for (i = m = 0; i < n; ++i) { ma_hit_t *h = &a[i]; ma_sub_t *sq = &sub[h->qns>>32], *st = &sub[h->tn]; r = ma_hit2arc(h, sq->e - sq->s, st->e - st->s, opt->max_hang, opt->int_frac, opt->min_ovlp, &t); if (r == MA_HT_QCONT) sq->del = 1; else if (r == MA_HT_TCONT) st->del = 1; } for (i = 0; i < d->n_seq; ++i) if (sub[i].del) d->seq[i].del = 1; ma_hit_mark_unused(d, n, a); map = sd_squeeze(d); for (i = 0; i < old_n_seq; ++i) if (map[i] >= 0) sub[map[i]] = sub[i]; for (i = m = 0; i < n; ++i) { ma_hit_t *h = &a[i]; int32_t qn = map[h->qns>>32], tn = map[h->tn]; if (qn >= 0 && tn >= 0) { a[i].qns = (uint64_t)qn<<32 | (uint32_t)a[i].qns; a[i].tn = tn; a[m++] = a[i]; } } free(map); if (ma_verbose >= 3) fprintf(stderr, "[M::%s::%s] %d sequences and %ld hits remain after containment removal\n", __func__, sys_timestamp(), d->n_seq, m); return m; } miniasm-0.3/kdq.h000066400000000000000000000102601332550340100137340ustar00rootroot00000000000000#ifndef __AC_KDQ_H #define __AC_KDQ_H #include #include #define __KDQ_TYPE(type) \ typedef struct { \ size_t front:58, bits:6, count, mask; \ type *a; \ } kdq_##type##_t; #define kdq_t(type) kdq_##type##_t #define kdq_size(q) ((q)->count) #define kdq_first(q) ((q)->a[(q)->front]) #define kdq_last(q) ((q)->a[((q)->front + (q)->count - 1) & (q)->mask]) #define kdq_at(q, i) 
/*
 * __KDQ_IMPL(type, SCOPE): define the functions of a double-ended queue of
 * `type` stored in a circular buffer whose capacity is 1<<bits.
 *
 *   kdq_init_<type>()        allocate an empty queue with capacity 4
 *   kdq_destroy_<type>(q)    free the queue; q may be NULL
 *   kdq_resize_<type>(q, b)  change the capacity to 1<<b (raised if b is too
 *                            small for the current content); returns the new bits
 *   kdq_push[p]_<type>       append at the back, growing when full
 *   kdq_unshift[p]_<type>    prepend at the front, growing when full
 *   kdq_pop_<type>           remove from the back; NULL when empty
 *   kdq_shift_<type>         remove from the front; NULL when empty
 *
 * The shift expressions below had been lost from this copy ("1ULL<bits" etc.);
 * they are restored here.
 * NOTE(review): in kdq_resize the final shrinking realloc is guarded by
 * `new_bits < q->bits` after q->bits has already been set to new_bits, so it
 * never runs -- confirm against upstream before relying on shrinking.
 */
#define __KDQ_IMPL(type, SCOPE) \
	SCOPE kdq_##type##_t *kdq_init_##type() \
	{ \
		kdq_##type##_t *q; \
		q = (kdq_##type##_t*)calloc(1, sizeof(kdq_##type##_t)); \
		q->bits = 2, q->mask = (1ULL<<q->bits) - 1; \
		q->a = (type*)malloc((1<<q->bits) * sizeof(type)); \
		return q; \
	} \
	SCOPE void kdq_destroy_##type(kdq_##type##_t *q) \
	{ \
		if (q == 0) return; \
		free(q->a); free(q); \
	} \
	SCOPE int kdq_resize_##type(kdq_##type##_t *q, int new_bits) \
	{ \
		size_t new_size = 1ULL<<new_bits, old_size = 1ULL<<q->bits; \
		if (new_size < q->count) { /* not big enough */ \
			int i; \
			for (i = 0; i < 64; ++i) \
				if (1ULL<<i > q->count) break; \
			new_bits = i, new_size = 1ULL<<new_bits; \
		} \
		if (new_bits == q->bits) return q->bits; /* unchanged */ \
		if (new_bits > q->bits) q->a = (type*)realloc(q->a, (1ULL<<new_bits) * sizeof(type)); \
		if (q->front + q->count <= old_size) { /* unwrapped */ \
			if (q->front + q->count > new_size) /* only happens for shrinking */ \
				memmove(q->a, q->a + new_size, (q->front + q->count - new_size) * sizeof(type)); \
		} else { /* wrapped */ \
			memmove(q->a + (new_size - (old_size - q->front)), q->a + q->front, (old_size - q->front) * sizeof(type)); \
			q->front = new_size - (old_size - q->front); \
		} \
		q->bits = new_bits, q->mask = (1ULL<<q->bits) - 1; \
		if (new_bits < q->bits) q->a = (type*)realloc(q->a, (1ULL<<new_bits) * sizeof(type)); \
		return q->bits; \
	} \
	SCOPE type *kdq_pushp_##type(kdq_##type##_t *q) \
	{ \
		if (q->count == 1ULL<<q->bits) kdq_resize_##type(q, q->bits + 1); \
		return &q->a[((q->count++) + q->front) & (q)->mask]; \
	} \
	SCOPE void kdq_push_##type(kdq_##type##_t *q, type v) \
	{ \
		if (q->count == 1ULL<<q->bits) kdq_resize_##type(q, q->bits + 1); \
		q->a[((q->count++) + q->front) & (q)->mask] = v; \
	} \
	SCOPE type *kdq_unshiftp_##type(kdq_##type##_t *q) \
	{ \
		if (q->count == 1ULL<<q->bits) kdq_resize_##type(q, q->bits + 1); \
		++q->count; \
		q->front = q->front? q->front - 1 : (1ULL<<q->bits) - 1; \
		return &q->a[q->front]; \
	} \
	SCOPE void kdq_unshift_##type(kdq_##type##_t *q, type v) \
	{ \
		type *p; \
		p = kdq_unshiftp_##type(q); \
		*p = v; \
	} \
	SCOPE type *kdq_pop_##type(kdq_##type##_t *q) \
	{ \
		return q->count? &q->a[((--q->count) + q->front) & q->mask] : 0; \
	} \
	SCOPE type *kdq_shift_##type(kdq_##type##_t *q) \
	{ \
		type *d = 0; \
		if (q->count == 0) return 0; \
		d = &q->a[q->front++]; \
		q->front &= q->mask; \
		--q->count; \
		return d; \
	}
person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* An example: #include "khash.h" KHASH_MAP_INIT_INT(32, char) int main() { int ret, is_missing; khiter_t k; khash_t(32) *h = kh_init(32); k = kh_put(32, h, 5, &ret); kh_value(h, k) = 10; k = kh_get(32, h, 10); is_missing = (k == kh_end(h)); k = kh_get(32, h, 5); kh_del(32, h, k); for (k = kh_begin(h); k != kh_end(h); ++k) if (kh_exist(h, k)) kh_value(h, k) = 1; kh_destroy(32, h); return 0; } */ /* 2013-05-02 (0.2.8): * Use quadratic probing. When the capacity is power of 2, stepping function i*(i+1)/2 guarantees to traverse each bucket. It is better than double hashing on cache performance and is more robust than linear probing. In theory, double hashing should be more robust than quadratic probing. However, my implementation is probably not for large hash tables, because the second hash function is closely tied to the first hash function, which reduce the effectiveness of double hashing. 
Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php 2011-12-29 (0.2.7): * Minor code clean up; no actual effect. 2011-09-16 (0.2.6): * The capacity is a power of 2. This seems to dramatically improve the speed for simple keys. Thank Zilong Tan for the suggestion. Reference: - http://code.google.com/p/ulib/ - http://nothings.org/computer/judy/ * Allow to optionally use linear probing which usually has better performance for random input. Double hashing is still the default as it is more robust to certain non-random input. * Added Wang's integer hash function (not used by default). This hash function is more robust to certain non-random input. 2011-02-14 (0.2.5): * Allow to declare global functions. 2009-09-26 (0.2.4): * Improve portability 2008-09-19 (0.2.3): * Corrected the example * Improved interfaces 2008-09-11 (0.2.2): * Improved speed a little in kh_put() 2008-09-10 (0.2.1): * Added kh_clear() * Fixed a compiling error 2008-09-02 (0.2.0): * Changed to token concatenation which increases flexibility. 2008-08-31 (0.1.2): * Fixed a bug in kh_get(), which has not been tested previously. 2008-08-31 (0.1.1): * Added destructor */ #ifndef __AC_KHASH_H #define __AC_KHASH_H /*! @header Generic hash table library. 
*/ #define AC_VERSION_KHASH_H "0.2.8" #include #include #include /* compiler specific configuration */ #if UINT_MAX == 0xffffffffu typedef unsigned int khint32_t; #elif ULONG_MAX == 0xffffffffu typedef unsigned long khint32_t; #endif #if ULONG_MAX == ULLONG_MAX typedef unsigned long khint64_t; #else typedef unsigned long long khint64_t; #endif #ifndef kh_inline #ifdef _MSC_VER #define kh_inline __inline #else #define kh_inline inline #endif #endif /* kh_inline */ #ifndef klib_unused #if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) #define klib_unused __attribute__ ((__unused__)) #else #define klib_unused #endif #endif /* klib_unused */ typedef khint32_t khint_t; typedef khint_t khiter_t; #define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) #define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) #define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) #define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) #define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) #define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) #define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) #define __ac_fsize(m) ((m) < 16? 
1 : (m)>>4) #ifndef kroundup32 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif #ifndef kcalloc #define kcalloc(N,Z) calloc(N,Z) #endif #ifndef kmalloc #define kmalloc(Z) malloc(Z) #endif #ifndef krealloc #define krealloc(P,Z) realloc(P,Z) #endif #ifndef kfree #define kfree(P) free(P) #endif static const double __ac_HASH_UPPER = 0.77; #define __KHASH_TYPE(name, khkey_t, khval_t) \ typedef struct kh_##name##_s { \ khint_t n_buckets, size, n_occupied, upper_bound; \ khint32_t *flags; \ khkey_t *keys; \ khval_t *vals; \ } kh_##name##_t; #define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ extern kh_##name##_t *kh_init_##name(void); \ extern void kh_destroy_##name(kh_##name##_t *h); \ extern void kh_clear_##name(kh_##name##_t *h); \ extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ extern void kh_del_##name(kh_##name##_t *h, khint_t x); #define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ SCOPE kh_##name##_t *kh_init_##name(void) { \ return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \ } \ SCOPE void kh_destroy_##name(kh_##name##_t *h) \ { \ if (h) { \ kfree((void *)h->keys); kfree(h->flags); \ kfree((void *)h->vals); \ kfree(h); \ } \ } \ SCOPE void kh_clear_##name(kh_##name##_t *h) \ { \ if (h && h->flags) { \ memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ h->size = h->n_occupied = 0; \ } \ } \ SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ { \ if (h->n_buckets) { \ khint_t k, i, last, mask, step = 0; \ mask = h->n_buckets - 1; \ k = __hash_func(key); i = k & mask; \ last = i; \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ i = (i + (++step)) & mask; \ if (i == last) return h->n_buckets; \ } \ return 
__ac_iseither(h->flags, i)? h->n_buckets : i; \ } else return 0; \ } \ SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ khint32_t *new_flags = 0; \ khint_t j = 1; \ { \ kroundup32(new_n_buckets); \ if (new_n_buckets < 4) new_n_buckets = 4; \ if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ else { /* hash table size to be changed (shrink or expand); rehash */ \ new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ if (!new_flags) return -1; \ memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ if (h->n_buckets < new_n_buckets) { /* expand */ \ khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ if (!new_keys) { kfree(new_flags); return -1; } \ h->keys = new_keys; \ if (kh_is_map) { \ khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ if (!new_vals) { kfree(new_flags); return -1; } \ h->vals = new_vals; \ } \ } /* otherwise shrink */ \ } \ } \ if (j) { /* rehashing is needed */ \ for (j = 0; j != h->n_buckets; ++j) { \ if (__ac_iseither(h->flags, j) == 0) { \ khkey_t key = h->keys[j]; \ khval_t val; \ khint_t new_mask; \ new_mask = new_n_buckets - 1; \ if (kh_is_map) val = h->vals[j]; \ __ac_set_isdel_true(h->flags, j); \ while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ khint_t k, i, step = 0; \ k = __hash_func(key); \ i = k & new_mask; \ while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \ __ac_set_isempty_false(new_flags, i); \ if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ __ac_set_isdel_true(h->flags, i); /* mark it as 
deleted in the old hash table */ \ } else { /* write the element and jump out of the loop */ \ h->keys[i] = key; \ if (kh_is_map) h->vals[i] = val; \ break; \ } \ } \ } \ } \ if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ } \ kfree(h->flags); /* free the working space */ \ h->flags = new_flags; \ h->n_buckets = new_n_buckets; \ h->n_occupied = h->size; \ h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ } \ return 0; \ } \ SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ { \ khint_t x; \ if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ if (h->n_buckets > (h->size<<1)) { \ if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ *ret = -1; return h->n_buckets; \ } \ } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ *ret = -1; return h->n_buckets; \ } \ } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ { \ khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \ x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ else { \ last = i; \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ if (__ac_isdel(h->flags, i)) site = i; \ i = (i + (++step)) & mask; \ if (i == last) { x = site; break; } \ } \ if (x == h->n_buckets) { \ if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ else x = i; \ } \ } \ } \ if (__ac_isempty(h->flags, x)) { /* not present at all */ \ h->keys[x] = key; \ __ac_set_isboth_false(h->flags, x); \ ++h->size; ++h->n_occupied; \ *ret = 1; \ } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ h->keys[x] = key; \ __ac_set_isboth_false(h->flags, x); \ ++h->size; \ 
*ret = 2; \ } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ return x; \ } \ SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ { \ if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ __ac_set_isdel_true(h->flags, x); \ --h->size; \ } \ } #define KHASH_DECLARE(name, khkey_t, khval_t) \ __KHASH_TYPE(name, khkey_t, khval_t) \ __KHASH_PROTOTYPES(name, khkey_t, khval_t) #define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ __KHASH_TYPE(name, khkey_t, khval_t) \ __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) #define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ KHASH_INIT2(name, static kh_inline klib_unused, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) /* --- BEGIN OF HASH FUNCTIONS --- */ /*! @function @abstract Integer hash function @param key The integer [khint32_t] @return The hash value [khint_t] */ #define kh_int_hash_func(key) (khint32_t)(key) /*! @function @abstract Integer comparison function */ #define kh_int_hash_equal(a, b) ((a) == (b)) /*! @function @abstract 64-bit integer hash function @param key The integer [khint64_t] @return The hash value [khint_t] */ #define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) /*! @function @abstract 64-bit integer comparison function */ #define kh_int64_hash_equal(a, b) ((a) == (b)) /*! @function @abstract const char* hash function @param s Pointer to a null terminated string @return The hash value */ static kh_inline khint_t __ac_X31_hash_string(const char *s) { khint_t h = (khint_t)*s; if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; return h; } /*! @function @abstract Another interface to const char* hash function @param key Pointer to a null terminated string [const char*] @return The hash value [khint_t] */ #define kh_str_hash_func(key) __ac_X31_hash_string(key) /*! 
@function @abstract Const char* comparison function */ #define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) static kh_inline khint_t __ac_Wang_hash(khint_t key) { key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16); return key; } #define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)key) /* --- END OF HASH FUNCTIONS --- */ /* Other convenient macros... */ /*! @abstract Type of the hash table. @param name Name of the hash table [symbol] */ #define khash_t(name) kh_##name##_t /*! @function @abstract Initiate a hash table. @param name Name of the hash table [symbol] @return Pointer to the hash table [khash_t(name)*] */ #define kh_init(name) kh_init_##name() /*! @function @abstract Destroy a hash table. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] */ #define kh_destroy(name, h) kh_destroy_##name(h) /*! @function @abstract Reset a hash table without deallocating memory. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] */ #define kh_clear(name, h) kh_clear_##name(h) /*! @function @abstract Resize a hash table. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] @param s New size [khint_t] */ #define kh_resize(name, h, s) kh_resize_##name(h, s) /*! @function @abstract Insert a key to the hash table. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] @param k Key [type of keys] @param r Extra return code: -1 if the operation failed; 0 if the key is present in the hash table; 1 if the bucket is empty (never used); 2 if the element in the bucket has been deleted [int*] @return Iterator to the inserted element [khint_t] */ #define kh_put(name, h, k, r) kh_put_##name(h, k, r) /*! @function @abstract Retrieve a key from the hash table. 
@param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  k     Key [type of keys]
  @return       Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
 */
#define kh_get(name, h, k) kh_get_##name(h, k)

/*! @function
  @abstract     Remove an element from the hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  k     Iterator to the element to be deleted [khint_t]
  @discussion   Takes an iterator (e.g. the value returned by kh_get()),
                not a key.
 */
#define kh_del(name, h, k) kh_del_##name(h, k)

/*! @function
  @abstract     Test whether a bucket contains data.
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  x     Iterator to the bucket [khint_t]
  @return       1 if containing data; 0 otherwise [int]
 */
#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))

/*! @function
  @abstract     Get key given an iterator
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  x     Iterator to the bucket [khint_t]
  @return       Key [type of keys]
  @discussion   Plain array access into the key array; x is not checked.
 */
#define kh_key(h, x) ((h)->keys[x])

/*! @function
  @abstract     Get value given an iterator
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  x     Iterator to the bucket [khint_t]
  @return       Value [type of values]
  @discussion   For hash sets, calling this results in segfault.
 */
#define kh_val(h, x) ((h)->vals[x])

/*! @function
  @abstract     Alias of kh_val()
 */
#define kh_value(h, x) ((h)->vals[x])

/*! @function
  @abstract     Get the start iterator
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       The start iterator [khint_t]
  @discussion   Always 0; the argument is not evaluated.
 */
#define kh_begin(h) (khint_t)(0)

/*! @function
  @abstract     Get the end iterator
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       The end iterator [khint_t]
  @discussion   Equal to the number of buckets, i.e. one past the last
                valid bucket index.
 */
#define kh_end(h) ((h)->n_buckets)

/*! @function
  @abstract     Get the number of elements in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       Number of elements in the hash table [khint_t]
 */
#define kh_size(h) ((h)->size)

/*!
@function
  @abstract     Get the number of buckets in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       Number of buckets in the hash table [khint_t]
 */
#define kh_n_buckets(h) ((h)->n_buckets)

/*! @function
  @abstract     Iterate over the entries in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  kvar  Variable to which key will be assigned
  @param  vvar  Variable to which value will be assigned
  @param  code  Block of code to execute
  @discussion   Visits every bucket index from kh_begin(h) to kh_end(h) and
                skips those for which kh_exist() is false. The loop counter
                is a local named __i.
 */
#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \
	for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
		if (!kh_exist(h,__i)) continue; \
		(kvar) = kh_key(h,__i); \
		(vvar) = kh_val(h,__i); \
		code; \
	} }

/*! @function
  @abstract     Iterate over the values in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  vvar  Variable to which value will be assigned
  @param  code  Block of code to execute
  @discussion   Same traversal as kh_foreach(), without assigning the key.
 */
#define kh_foreach_value(h, vvar, code) { khint_t __i; \
	for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
		if (!kh_exist(h,__i)) continue; \
		(vvar) = kh_val(h,__i); \
		code; \
	} }

/* More convenient interfaces */

/*! @function
  @abstract     Instantiate a hash set containing integer keys
  @param  name  Name of the hash table [symbol]
 */
#define KHASH_SET_INIT_INT(name) \
	KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)

/*! @function
  @abstract     Instantiate a hash map containing integer keys
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
 */
#define KHASH_MAP_INIT_INT(name, khval_t) \
	KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)

/*! @function
  @abstract     Instantiate a hash set containing 64-bit integer keys
  @param  name  Name of the hash table [symbol]
 */
#define KHASH_SET_INIT_INT64(name) \
	KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)

/*!
@function
  @abstract     Instantiate a hash map containing 64-bit integer keys
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
 */
#define KHASH_MAP_INIT_INT64(name, khval_t) \
	KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)

/* Key type used by the string-keyed tables below. */
typedef const char *kh_cstr_t;
/*! @function
  @abstract     Instantiate a hash set containing const char* keys
  @param  name  Name of the hash table [symbol]
 */
#define KHASH_SET_INIT_STR(name) \
	KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)

/*! @function
  @abstract     Instantiate a hash map containing const char* keys
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
 */
#define KHASH_MAP_INIT_STR(name, khval_t) \
	KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)

#endif /* __AC_KHASH_H */
miniasm-0.3/kseq.h000066400000000000000000000214221332550340100141220ustar00rootroot00000000000000/* The MIT License

   Copyright (c) 2008, 2009, 2011 Attractive Chaos

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* Last Modified: 05MAR2012 */ #ifndef AC_KSEQ_H #define AC_KSEQ_H #include #include #include #ifndef klib_unused #if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) #define klib_unused __attribute__ ((__unused__)) #else #define klib_unused #endif #endif /* klib_unused */ #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r #define KS_SEP_TAB 1 // isspace() && !' ' #define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) #define KS_SEP_MAX 2 #define __KS_TYPE(type_t) \ typedef struct __kstream_t { \ int begin, end; \ int is_eof:2, bufsize:30; \ type_t f; \ unsigned char *buf; \ } kstream_t; #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) #define __KS_BASIC(SCOPE, type_t, __bufsize) \ SCOPE kstream_t *ks_init(type_t f) \ { \ kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ ks->f = f; ks->bufsize = __bufsize; \ ks->buf = (unsigned char*)malloc(__bufsize); \ return ks; \ } \ SCOPE void ks_destroy(kstream_t *ks) \ { \ if (!ks) return; \ free(ks->buf); \ free(ks); \ } #define __KS_INLINED(__read) \ static inline klib_unused int ks_getc(kstream_t *ks) \ { \ if (ks->is_eof && ks->begin >= ks->end) return -1; \ if (ks->begin >= ks->end) { \ ks->begin = 0; \ ks->end = __read(ks->f, ks->buf, ks->bufsize); \ if (ks->end < ks->bufsize) ks->is_eof = 1; \ if (ks->end == 0) return -1; \ } \ return (int)ks->buf[ks->begin++]; \ } \ static inline klib_unused int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ { return ks_getuntil2(ks, delimiter, str, dret, 0); } #ifndef KSTRING_T #define KSTRING_T kstring_t typedef struct __kstring_t { size_t l, m; char *s; } 
kstring_t; #endif #ifndef kroundup32 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif #define __KS_GETUNTIL(SCOPE, __read) \ SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ { \ if (dret) *dret = 0; \ str->l = append? str->l : 0; \ if (ks->begin >= ks->end && ks->is_eof) return -1; \ for (;;) { \ int i; \ if (ks->begin >= ks->end) { \ if (!ks->is_eof) { \ ks->begin = 0; \ ks->end = __read(ks->f, ks->buf, ks->bufsize); \ if (ks->end < ks->bufsize) ks->is_eof = 1; \ if (ks->end == 0) break; \ } else break; \ } \ if (delimiter == KS_SEP_LINE) { \ for (i = ks->begin; i < ks->end; ++i) \ if (ks->buf[i] == '\n') break; \ } else if (delimiter > KS_SEP_MAX) { \ for (i = ks->begin; i < ks->end; ++i) \ if (ks->buf[i] == delimiter) break; \ } else if (delimiter == KS_SEP_SPACE) { \ for (i = ks->begin; i < ks->end; ++i) \ if (isspace(ks->buf[i])) break; \ } else if (delimiter == KS_SEP_TAB) { \ for (i = ks->begin; i < ks->end; ++i) \ if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ } else i = 0; /* never come to here! 
*/ \ if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ str->m = str->l + (i - ks->begin) + 1; \ kroundup32(str->m); \ str->s = (char*)realloc(str->s, str->m); \ } \ memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ str->l = str->l + (i - ks->begin); \ ks->begin = i + 1; \ if (i < ks->end) { \ if (dret) *dret = ks->buf[i]; \ break; \ } \ } \ if (str->s == 0) { \ str->m = 1; \ str->s = (char*)calloc(1, 1); \ } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ str->s[str->l] = '\0'; \ return str->l; \ } #define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \ __KS_TYPE(type_t) \ __KS_BASIC(SCOPE, type_t, __bufsize) \ __KS_GETUNTIL(SCOPE, __read) \ __KS_INLINED(__read) #define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize) #define KSTREAM_DECLARE(type_t, __read) \ __KS_TYPE(type_t) \ extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \ extern kstream_t *ks_init(type_t f); \ extern void ks_destroy(kstream_t *ks); \ __KS_INLINED(__read) /****************** * FASTA/Q parser * ******************/ #define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) #define __KSEQ_BASIC(SCOPE, type_t) \ SCOPE kseq_t *kseq_init(type_t fd) \ { \ kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ s->f = ks_init(fd); \ return s; \ } \ SCOPE void kseq_destroy(kseq_t *ks) \ { \ if (!ks) return; \ free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ ks_destroy(ks->f); \ free(ks); \ } /* Return value: >=0 length of the sequence (normal) -1 end-of-file -2 truncated quality string */ #define __KSEQ_READ(SCOPE) \ SCOPE int kseq_read(kseq_t *seq) \ { \ int c; \ kstream_t *ks = seq->f; \ if (seq->last_char == 0) { /* then jump to the next header line */ \ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ if (c == -1) return -1; /* end of file */ \ seq->last_char = c; \ } /* else: the first 
header char has been read in the previous call */ \ seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ seq->seq.m = 256; \ seq->seq.s = (char*)malloc(seq->seq.m); \ } \ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ if (c == '\n') continue; /* skip empty lines */ \ seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ } \ if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ seq->seq.m = seq->seq.l + 2; \ kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ } \ seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ if (c != '+') return seq->seq.l; /* FASTA */ \ if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ seq->qual.m = seq->seq.m; \ seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ } \ while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ if (c == -1) return -2; /* error: no quality string */ \ while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ seq->last_char = 0; /* we have not come to the next header line */ \ if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ return seq->seq.l; \ } #define __KSEQ_TYPE(type_t) \ typedef struct { \ kstring_t name, comment, seq, qual; \ int last_char; \ kstream_t *f; \ } kseq_t; #define KSEQ_INIT2(SCOPE, type_t, __read) \ KSTREAM_INIT2(SCOPE, type_t, __read, 
16384) \ __KSEQ_TYPE(type_t) \ __KSEQ_BASIC(SCOPE, type_t) \ __KSEQ_READ(SCOPE) #define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) #define KSEQ_DECLARE(type_t) \ __KS_TYPE(type_t) \ __KSEQ_TYPE(type_t) \ extern kseq_t *kseq_init(type_t fd); \ void kseq_destroy(kseq_t *ks); \ int kseq_read(kseq_t *seq); #endif miniasm-0.3/ksort.h000066400000000000000000000145471332550340100143330ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008, 2011 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ // This is a simplified version of ksort.h #ifndef AC_KSORT_H #define AC_KSORT_H #include #include typedef struct { void *left, *right; int depth; } ks_isort_stack_t; #define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; } #define KSORT_INIT(name, type_t, __sort_lt) \ static inline void __ks_insertsort_##name(type_t *s, type_t *t) \ { \ type_t *i, *j, swap_tmp; \ for (i = s + 1; i < t; ++i) \ for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \ swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \ } \ } \ void ks_combsort_##name(size_t n, type_t a[]) \ { \ const double shrink_factor = 1.2473309501039786540366528676643; \ int do_swap; \ size_t gap = n; \ type_t tmp, *i, *j; \ do { \ if (gap > 2) { \ gap = (size_t)(gap / shrink_factor); \ if (gap == 9 || gap == 10) gap = 11; \ } \ do_swap = 0; \ for (i = a; i < a + n - gap; ++i) { \ j = i + gap; \ if (__sort_lt(*j, *i)) { \ tmp = *i; *i = *j; *j = tmp; \ do_swap = 1; \ } \ } \ } while (do_swap || gap > 2); \ if (gap != 1) __ks_insertsort_##name(a, a + n); \ } \ void ks_introsort_##name(size_t n, type_t a[]) \ { \ int d; \ ks_isort_stack_t *top, *stack; \ type_t rp, swap_tmp; \ type_t *s, *t, *i, *j, *k; \ \ if (n < 1) return; \ else if (n == 2) { \ if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \ return; \ } \ for (d = 2; 1ul<>1) + 1; \ if (__sort_lt(*k, *i)) { \ if (__sort_lt(*k, *j)) k = j; \ } else k = __sort_lt(*j, *i)? i : j; \ rp = *k; \ if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \ for (;;) { \ do ++i; while (__sort_lt(*i, rp)); \ do --j; while (i <= j && __sort_lt(rp, *j)); \ if (j <= i) break; \ swap_tmp = *i; *i = *j; *j = swap_tmp; \ } \ swap_tmp = *i; *i = *t; *t = swap_tmp; \ if (i-s > t-i) { \ if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \ s = t-i > 16? i+1 : t; \ } else { \ if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \ t = i-s > 16? 
i-1 : s; \ } \ } else { \ if (top == stack) { \ free(stack); \ __ks_insertsort_##name(a, a+n); \ return; \ } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \ } \ } \ } #define ks_lt_generic(a, b) ((a) < (b)) #define ks_lt_str(a, b) (strcmp((a), (b)) < 0) typedef const char *ksstr_t; #define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic) #define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) #define RS_MIN_SIZE 64 #define KRADIX_SORT_INIT(name, rstype_t, rskey, sizeof_key) \ typedef struct { \ rstype_t *b, *e; \ } rsbucket_##name##_t; \ void rs_insertsort_##name(rstype_t *beg, rstype_t *end) \ { \ rstype_t *i; \ for (i = beg + 1; i < end; ++i) \ if (rskey(*i) < rskey(*(i - 1))) { \ rstype_t *j, tmp = *i; \ for (j = i; j > beg && rskey(tmp) < rskey(*(j-1)); --j) \ *j = *(j - 1); \ *j = tmp; \ } \ } \ void rs_sort_##name(rstype_t *beg, rstype_t *end, int n_bits, int s) \ { \ rstype_t *i; \ int size = 1<b = k->e = beg; \ for (i = beg; i != end; ++i) ++b[rskey(*i)>>s&m].e; \ for (k = b + 1; k != be; ++k) \ k->e += (k-1)->e - beg, k->b = (k-1)->e; \ for (k = b; k != be;) { \ if (k->b != k->e) { \ rsbucket_##name##_t *l; \ if ((l = b + (rskey(*k->b)>>s&m)) != k) { \ rstype_t tmp = *k->b, swap; \ do { \ swap = tmp; tmp = *l->b; *l->b++ = swap; \ l = b + (rskey(tmp)>>s&m); \ } while (l != k); \ *k->b++ = tmp; \ } else ++k->b; \ } else ++k; \ } \ for (b->b = beg, k = b + 1; k != be; ++k) k->b = (k-1)->e; \ if (s) { \ s = s > n_bits? 
s - n_bits : 0; \ for (k = b; k != be; ++k) \ if (k->e - k->b > RS_MIN_SIZE) rs_sort_##name(k->b, k->e, n_bits, s); \ else if (k->e - k->b > 1) rs_insertsort_##name(k->b, k->e); \ } \ } \ void radix_sort_##name(rstype_t *beg, rstype_t *end) \ { \ if (end - beg <= RS_MIN_SIZE) rs_insertsort_##name(beg, end); \ else rs_sort_##name(beg, end, 8, sizeof_key * 8 - 8); \ } #endif miniasm-0.3/kvec.h000066400000000000000000000063311332550340100141110ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008, by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* An example: #include "kvec.h" int main() { kvec_t(int) array; kv_init(array); kv_push(int, array, 10); // append kv_a(int, array, 20) = 5; // dynamic kv_A(array, 20) = 4; // static kv_destroy(array); return 0; } */ /* 2008-09-22 (0.1.0): * The initial version. 
*/ #ifndef AC_KVEC_H #define AC_KVEC_H #include #define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #define kvec_t(type) struct { size_t n, m; type *a; } #define kv_init(v) ((v).n = (v).m = 0, (v).a = 0) #define kv_destroy(v) free((v).a) #define kv_A(v, i) ((v).a[(i)]) #define kv_pop(v) ((v).a[--(v).n]) #define kv_size(v) ((v).n) #define kv_max(v) ((v).m) #define kv_resize(type, v, s) do { \ if ((v).m < (s)) { \ (v).m = (s); \ kv_roundup32((v).m); \ (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ } \ } while (0) #define kv_copy(type, v1, v0) do { \ if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \ (v1).n = (v0).n; \ memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \ } while (0) \ #define kv_push(type, v, x) do { \ if ((v).n == (v).m) { \ (v).m = (v).m? (v).m<<1 : 2; \ (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ } \ (v).a[(v).n++] = (x); \ } while (0) #define kv_pushp(type, v, p) do { \ if ((v).n == (v).m) { \ (v).m = (v).m? (v).m<<1 : 2; \ (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ } \ *(p) = &(v).a[(v).n++]; \ } while (0) #define kv_a(type, v, i) ((v).m <= (size_t)(i)? \ ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \ (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ : (v).n <= (size_t)(i)? 
(v).n = (i) \ : 0), (v).a[(i)] #define kv_reverse(type, v, start) do { \ if ((v).m > 0 && (v).n > (start)) { \ size_t __i, __end = (v).n - (start); \ type *__a = (v).a + (start); \ for (__i = 0; __i < __end>>1; ++__i) { \ type __t = __a[__end - 1 - __i]; \ __a[__end - 1 - __i] = __a[__i]; __a[__i] = __t; \ } \ } \ } while (0) #endif miniasm-0.3/main.c000066400000000000000000000202471332550340100141020ustar00rootroot00000000000000#include #include #include #include #include "kvec.h" #include "sys.h" #include "paf.h" #include "sdict.h" #include "miniasm.h" #define MA_VERSION "0.3-r179" static void print_subs(const sdict_t *d, const ma_sub_t *sub) { uint32_t i; for (i = 0; i < d->n_seq; ++i) if (!d->seq[i].del && sub[i].s != sub[i].e) printf("%s\t%d\t%d\n", d->seq[i].name, sub[i].s, sub[i].e); } static void print_hits(size_t n_hits, const ma_hit_t *hit, const sdict_t *d, const ma_sub_t *sub) { size_t i; for (i = 0; i < n_hits; ++i) { const ma_hit_t *p = &hit[i]; const ma_sub_t *rq = &sub[p->qns>>32], *rt = &sub[p->tn]; printf("%s:%d-%d\t%d\t%d\t%d\t%c\t%s:%d-%d\t%d\t%d\t%d\t%d\t%d\t255\n", d->seq[p->qns>>32].name, rq->s + 1, rq->e, rq->e - rq->s, (uint32_t)p->qns, p->qe, "+-"[p->rev], d->seq[p->tn].name, rt->s + 1, rt->e, rt->e - rt->s, p->ts, p->te, p->ml, p->bl); } } int main(int argc, char *argv[]) { ma_opt_t opt; int i, c, stage = 100, no_first = 0, no_second = 0, bi_dir = 1, o_set = 0, no_cont = 0; sdict_t *d, *excl = 0; ma_sub_t *sub = 0; ma_hit_t *hit; size_t n_hits; float cov = 40.0; char *fn_reads = 0, *outfmt = "ug"; ma_opt_init(&opt); while ((c = getopt(argc, argv, "n:m:s:c:S:i:d:g:o:h:I:r:f:e:p:12VBRbF:")) >= 0) { if (c == 'm') opt.min_match = atoi(optarg); else if (c == 'i') opt.min_iden = atof(optarg); else if (c == 's') opt.min_span = atoi(optarg); else if (c == 'c') opt.min_dp = atoi(optarg); else if (c == 'o') opt.min_ovlp = atoi(optarg), o_set = 1; else if (c == 'S') stage = atoi(optarg); else if (c == 'd') opt.bub_dist = atoi(optarg); else if (c == 
'g') opt.gap_fuzz = atoi(optarg); else if (c == 'h') opt.max_hang = atoi(optarg); else if (c == 'I') opt.int_frac = atof(optarg); else if (c == 'e') opt.max_ext = atoi(optarg); else if (c == 'f') fn_reads = optarg; else if (c == 'p') outfmt = optarg; else if (c == '1') no_first = 1; else if (c == '2') no_second = 1; else if (c == 'n') opt.n_rounds = atoi(optarg) - 1; else if (c == 'B') bi_dir = 1; else if (c == 'b') bi_dir = 0; else if (c == 'R') no_cont = 1; else if (c == 'F') opt.final_ovlp_drop_ratio = atof(optarg); else if (c == 'V') { printf("%s\n", MA_VERSION); return 0; } else if (c == 'r') { char *s; opt.max_ovlp_drop_ratio = strtod(optarg, &s); if (*s == ',') opt.min_ovlp_drop_ratio = strtod(s + 1, &s); } } if (o_set == 0) opt.min_ovlp = opt.min_span; if (argc == optind) { fprintf(stderr, "Usage: miniasm [options] \n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " Pre-selection:\n"); fprintf(stderr, " -R prefilter clearly contained reads (2-pass required)\n"); fprintf(stderr, " -m INT min match length [%d]\n", opt.min_match); fprintf(stderr, " -i FLOAT min identity [%.2g]\n", opt.min_iden); fprintf(stderr, " -s INT min span [%d]\n", opt.min_span); fprintf(stderr, " -c INT min coverage [%d]\n", opt.min_dp); fprintf(stderr, " Overlap:\n"); fprintf(stderr, " -o INT min overlap [same as -s]\n"); fprintf(stderr, " -h INT max over hang length [%d]\n", opt.max_hang); fprintf(stderr, " -I FLOAT min end-to-end match ratio [%.2g]\n", opt.int_frac); fprintf(stderr, " Layout:\n"); fprintf(stderr, " -g INT max gap differences between reads for trans-reduction [%d]\n", opt.gap_fuzz); fprintf(stderr, " -d INT max distance for bubble popping [%d]\n", opt.bub_dist); fprintf(stderr, " -e INT small unitig threshold [%d]\n", opt.max_ext); fprintf(stderr, " -f FILE read sequences []\n"); fprintf(stderr, " -n INT rounds of short overlap removal [%d]\n", opt.n_rounds + 1); fprintf(stderr, " -r FLOAT[,FLOAT]\n"); fprintf(stderr, " max and min overlap drop ratio 
[%.2g,%.2g]\n", opt.max_ovlp_drop_ratio, opt.min_ovlp_drop_ratio); fprintf(stderr, " -F FLOAT aggressive overlap drop ratio in the end [%.2g]\n", opt.final_ovlp_drop_ratio); fprintf(stderr, " Miscellaneous:\n"); fprintf(stderr, " -p STR output information: bed, paf, sg or ug [%s]\n", outfmt); // fprintf(stderr, " -B only one direction of an arc is present in input PAF\n"); // deprecated; for backward compatibility fprintf(stderr, " -b both directions of an arc are present in input\n"); fprintf(stderr, " -1 skip 1-pass read selection\n"); fprintf(stderr, " -2 skip 2-pass read selection\n"); fprintf(stderr, " -V print version number\n"); fprintf(stderr, "\nSee miniasm.1 for detailed description of the command-line options.\n"); return 1; } sys_init(); d = sd_init(); if (no_cont) { fprintf(stderr, "[M::%s] ===> Step 0: removing contained reads <===\n", __func__); excl = ma_hit_no_cont(argv[optind], opt.min_span, opt.min_match, opt.max_hang, opt.int_frac); } fprintf(stderr, "[M::%s] ===> Step 1: reading read mappings <===\n", __func__); hit = ma_hit_read(argv[optind], opt.min_span, opt.min_match, d, &n_hits, bi_dir, excl); if (!no_first) { fprintf(stderr, "[M::%s] ===> Step 2: 1-pass (crude) read selection <===\n", __func__); if (stage >= 2) { sub = ma_hit_sub(opt.min_dp, opt.min_iden, 0, n_hits, hit, d->n_seq); n_hits = ma_hit_cut(sub, opt.min_span, n_hits, hit); } if (stage >= 3) n_hits = ma_hit_flt(sub, opt.max_hang * 1.5, opt.min_ovlp * .5, n_hits, hit, &cov); } if (!no_second) { fprintf(stderr, "[M::%s] ===> Step 3: 2-pass (fine) read selection <===\n", __func__); if (stage >= 4) { ma_sub_t *sub2; sub2 = ma_hit_sub(opt.min_dp, opt.min_iden, opt.min_span/2, n_hits, hit, d->n_seq); n_hits = ma_hit_cut(sub2, opt.min_span, n_hits, hit); if (!no_first) { ma_sub_merge(d->n_seq, sub, sub2); free(sub2); } else { sub = sub2; } } if (stage >= 5) n_hits = ma_hit_contained(&opt, d, sub, n_hits, hit); } hit = (ma_hit_t*)realloc(hit, n_hits * sizeof(ma_hit_t)); if 
(strcmp(outfmt, "bed") == 0) { print_subs(d, sub); } else if (strcmp(outfmt, "paf") == 0) { print_hits(n_hits, hit, d, sub); } else if (strcmp(outfmt, "ug") == 0 || strcmp(outfmt, "sg") == 0) { asg_t *sg = 0; ma_ug_t *ug = 0; fprintf(stderr, "[M::%s] ===> Step 4: graph cleaning <===\n", __func__); sg = ma_sg_gen(&opt, d, sub, n_hits, hit); if (stage >= 6) { fprintf(stderr, "[M::%s] ===> Step 4.1: transitive reduction <===\n", __func__); asg_arc_del_trans(sg, opt.gap_fuzz); } if (stage >= 7) { fprintf(stderr, "[M::%s] ===> Step 4.2: initial tip cutting and bubble popping <===\n", __func__); asg_cut_tip(sg, opt.max_ext); asg_pop_bubble(sg, opt.bub_dist); } if (stage >= 9) { fprintf(stderr, "[M::%s] ===> Step 4.3: cutting short overlaps (%d rounds in total) <===\n", __func__, opt.n_rounds + 1); for (i = 0; i <= opt.n_rounds; ++i) { float r = opt.min_ovlp_drop_ratio + (opt.max_ovlp_drop_ratio - opt.min_ovlp_drop_ratio) / opt.n_rounds * i; if (asg_arc_del_short(sg, r) != 0) { asg_cut_tip(sg, opt.max_ext); asg_pop_bubble(sg, opt.bub_dist); } } } if (stage >= 10) { fprintf(stderr, "[M::%s] ===> Step 4.4: removing short internal sequences and bi-loops <===\n", __func__); asg_cut_internal(sg, 1); asg_cut_biloop(sg, opt.max_ext); asg_cut_tip(sg, opt.max_ext); asg_pop_bubble(sg, opt.bub_dist); } if (stage >= 11) { fprintf(stderr, "[M::%s] ===> Step 4.5: aggressively cutting short overlaps <===\n", __func__); if (asg_arc_del_short(sg, opt.final_ovlp_drop_ratio) != 0) { asg_cut_tip(sg, opt.max_ext); asg_pop_bubble(sg, opt.bub_dist); } } if (strcmp(outfmt, "ug") == 0) { fprintf(stderr, "[M::%s] ===> Step 5: generating unitigs <===\n", __func__); ug = ma_ug_gen(sg); if (fn_reads) ma_ug_seq(ug, d, sub, fn_reads); ma_ug_print(ug, d, sub, stdout); } else ma_sg_print(sg, d, sub, stdout); asg_destroy(sg); ma_ug_destroy(ug); } free(sub); free(hit); sd_destroy(d); if (excl) sd_destroy(excl); fprintf(stderr, "[M::%s] Version: %s\n", __func__, MA_VERSION); fprintf(stderr, "[M::%s] CMD:", 
__func__); for (i = 0; i < argc; ++i) fprintf(stderr, " %s", argv[i]); fprintf(stderr, "\n[M::%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, sys_realtime(), sys_cputime()); return 0; } miniasm-0.3/miniasm.1000066400000000000000000000142541332550340100145320ustar00rootroot00000000000000.TH miniasm 1 "23 July 2018" "miniasm-0.3 (r179)" "Bioinformatics tools" .SH NAME .PP miniasm - de novo assembler for long read sequences .SH SYNOPSIS .PP miniasm .RB [ -b12VR ] .RB [ -m .IR minMatch ] .RB [ -i .IR minIden ] .RB [ -s .IR minSpan ] .RB [ -c .IR minCov ] .RB [ -o .IR minOvlp ] .RB [ -h .IR maxHang ] .RB [ -I .IR intThres ] .RB [ -g .IR maxGapDiff ] .RB [ -d .IR maxBubDist ] .RB [ -e .IR minUtgSize ] .RB [ -f .IR readFile ] .RB [ -n .IR nRounds ] .RB [ -r .IR dropRatio ] .RB [ -F .IR finalDropRatio ] .RB [ -p .IR outputInfo ] .I mapping.paf > .I output.gfa .SH DESCRIPTION .PP Miniasm is a very fast OLC-based de novo assembler for noisy long reads. It takes all-vs-all read self-mappings in the PAF format as input and outputs an assembly graph in the GFA format. Different from mainstream assemblers, miniasm does not have a consensus step. It simply concatenates pieces of read sequences to generate the final unitig sequences. Thus the per-base error rate is similar to the raw input reads. .SH OPTIONS .SS Preselection options .TP 10 .BI -R Pre-filter clearly contained short reads. In this mode, .I mapping.paf is read twice. The first pass identifies contained reads without loading hits to RAM; the second pass skips contained reads and load the rest into RAM. Due to the 2-pass behavior, the peak RAM is greatly reduced, but .I mapping.paf has to be a normal file, not a stream. When this option is in use, it is recommended to reduce .B -c to 2, as there are fewer reads after pre-filtering. Applying .BI -Rc 2 sometimes improves assembly. .TP .BI -m \ INT Drop mappings having less than .I INT matching bases (col10 in PAF) [100]. 
This option has the same role as .B -L of minimap. .TP .BI -s \ INT Drop mappings shorter than .IR INT -bp [1000]. This option also affects the second round of read filtering and minimal overlap length. .TP .BI -i \ FLOAT During read filtering, ignore mappings with col10/col11 below .I FLOAT [0.05]. Ignored mappings are still used for read overlaps. .TP .BI -c \ INT Minimal coverage by other reads [3]. In the first round of filtering, miniasm finds the longest region covered by .I INT or more reads. In the second round, it in addition requires each remaining base to be covered by .I INT bases at least .IR minSpan /2 from the ends of other reads. .SS Overlapping options .TP 10 .BI -o \ INT Minimal overlap length [same as .IR minSpan ] .TP .BI -h \ INT Maximum overhang length [1000]. An overhang is an unmapped region that should be mapped given a true overlap or true containment. If the overhang is too long, the mapping is considered an internal match and will be ignored. .TP .BI -I \ FLOAT Minimal ratio of mapping length to mapping+overhang length for a mapping considered a containment or an overlap [0.8]. This option has a similar role to .BR -h , except that it controls the ratio, not length. .SS Graph layout options .TP 10 .BI -g \ INT Maximal gap differences between two reads in a mapping [1000]. This parameter is only used for transitive reduction. .TP .BI -d \ INT Maximal probing distance for bubble popping [50000]. Bubbles longer than .I INT will not be popped. .TP .BI -e \ INT A unitig is considered small if it is composed of less than .I INT reads [4]. Miniasm may try to remove small unitigs at various steps. .TP .BI -f \ FILE Read sequence file in FASTA or FASTQ format for generating unitig sequences [null]. If this option is absent, miniasm produces a GFA output without sequences. .TP .BI -r \ FLOAT1,[FLOAT2] Max and min overlap drop ratio [0.7,0.5]. 
Let overlap(v->w) be the overlap length of edge v->w and maxovlp(v)=max_w{overlap(v->w)} be the length of the largest overlap. Miniasm drops overlap v->w if overlap(v->w)/maxovlp(v) is below a threshold controlled by this option. Miniasm applies .I nRounds rounds of short overlap removal with an increasing threshold between .I FLOAT1 and .IR FLOAT2 . .TP .BI -n \ INT Rounds of short overlap removal [3]. .TP .BI -F \ FLOAT Overlap drop ratio threshold after short unitig removal [0.8] .SS Miscellaneous options .TP 10 .B -b Indicate that in the input, the same mapping is likely to be given twice .TP .B -1 Skip the first round of pre-assembly read selection .TP .B -2 Skip the second round of pre-assembly read selection .TP .BI -p \ STR Output information and format [ug]. Possible .I STR values include - .BR bed : post-filtered read regions in the BED format; .BR paf : mappings between post-filtered reads; .BR sg : read overlap graph in the GFA format; .BR ug : unitig graph in the GFA format. .TP .B -V Print version number to stdout .SH INPUT FORMAT .PP Miniasm reads mapping positions in the Pairwise mApping Format (PAF), which is a TAB-delimited text format with each line consisting of at least 12 fields as described in the following table: .TS center box; cb | cb | cb r | c | l . Col Type Description _ 1 string Query sequence name 2 int Query sequence length 3 int Query start coordinate (0-based) 4 int Query end coordinate (0-based) 5 char `+' if query and target on the same strand; `-' if opposite 6 string Target sequence name 7 int Target sequence length 8 int Target start coordinate on the original strand 9 int Target end coordinate on the original strand 10 int Number of matching bases in the mapping 11 int Number of bases, including gaps, in the mapping 12 int Mapping quality (0-255 with 255 for missing) .TE .PP Please see minimap(1) for the detailed description of each field.
.SH OUTPUT FORMAT .PP Miniasm outputs the assembly in the Graphical Fragment Assembly format (GFA). It is a line based TAB-delimited format, with the leading letter indicates the type of the line. The following table gives the line types used by miniasm: .TS center box; cb | cb | cb c | l | l . Line Comment Fixed fields _ H Header N/A S Segment segName segSeq L Overlap segName1 segOri1 segName2 segOri2 ovlpCIGAR a Golden path utgName utgStart readName:rStart-rEnd readOri incLen .TE .PP An `a' line indicates that the unitig subsequence in .RI [ utgStart , utgStart + incLen ) is taken from read .I readName in region .RI [ rStart -1, rStart -1+ incLen ). It is not a standard GFA line. An `x' line gives a brief summary of each unitig, which can be inferred from `S' and `a' lines. .SH SEE ALSO .PP minimap(1) miniasm-0.3/miniasm.h000066400000000000000000000066501332550340100146220ustar00rootroot00000000000000#ifndef MINIASM_H #define MINIASM_H #include #include #include #include "sdict.h" #include "asg.h" extern int ma_verbose; typedef struct { int min_span; int min_match; int min_dp; float min_iden; int max_hang; int min_ovlp; float int_frac; int gap_fuzz; int n_rounds; int bub_dist; int max_ext; float min_ovlp_drop_ratio, max_ovlp_drop_ratio, final_ovlp_drop_ratio; } ma_opt_t; typedef struct { uint64_t qns; uint32_t qe, tn, ts, te; uint32_t ml:31, rev:1; uint32_t bl:31, del:1; } ma_hit_t; typedef struct { size_t n, m; ma_hit_t *a; } ma_hit_v; typedef struct { uint32_t s:31, del:1, e; } ma_sub_t; typedef struct { uint32_t len:31, circ:1; // len: length of the unitig; circ: circular if non-zero uint32_t start, end; // start: starting vertex in the string graph; end: ending vertex uint32_t m, n; // number of reads uint64_t *a; // list of reads char *s; // unitig sequence is not null } ma_utg_t; typedef struct { size_t n, m; ma_utg_t *a; } ma_utg_v; typedef struct { ma_utg_v u; asg_t *g; } ma_ug_t; #ifdef __cplusplus extern "C" { #endif void ma_opt_init(ma_opt_t *opt); 
/*
 * Classify one hit and, when it is a proper overlap, fill in the string-graph arc *p.
 *
 * h         hit to classify; the low 32 bits of h->qns are read as the query start and
 *           the high 32 bits are folded into the arc's source vertex
 * ql, tl    query and target lengths
 * max_hang  largest tolerated overhang on either end
 * int_frac  minimal ratio of the aligned query span to span+overhangs
 * min_ovlp  minimal overlap length (span plus overhangs) on both query and target
 * p         output arc; written only when a non-negative value is returned
 *
 * Returns MA_HT_INT, MA_HT_QCONT, MA_HT_TCONT or MA_HT_SHORT_OVLP (all negative) when no
 * arc is produced; otherwise returns l, the value stored in the low 32 bits of p->ul.
 */
static inline int ma_hit2arc(const ma_hit_t *h, int ql, int tl, int max_hang, float int_frac, int min_ovlp, asg_arc_t *p)
{
	int32_t tl5, tl3, ext5, ext3, qs = (int32_t)h->qns;
	uint32_t u, v, l; // u: query end; v: target end; l: length from u to v
	if (h->rev) tl5 = tl - h->te, tl3 = h->ts; // tl5: 5'-end overhang (on the query strand); tl3: similar
	else tl5 = h->ts, tl3 = tl - h->te;
	// ext5/ext3: the shorter of the query-side and target-side unaligned lengths at each end
	ext5 = qs < tl5? qs : tl5;
	ext3 = ql - h->qe < tl3? ql - h->qe : tl3;
	// an overhang longer than max_hang, or an aligned span below int_frac of span+overhangs, is reported as MA_HT_INT
	if (ext5 > max_hang || ext3 > max_hang || h->qe - qs < (h->qe - qs + ext5 + ext3) * int_frac)
		return MA_HT_INT;
	if (qs <= tl5 && ql - h->qe <= tl3) return MA_HT_QCONT; // query contained
	else if (qs >= tl5 && ql - h->qe >= tl3) return MA_HT_TCONT; // target contained
	else if (qs > tl5) u = 0, v = !!h->rev, l = qs - tl5; // query sticks out further at the 5'-end
	else u = 1, v = !h->rev, l = (ql - h->qe) - tl3; // otherwise the query sticks out further at the 3'-end
	if (h->qe - qs + ext5 + ext3 < min_ovlp || h->te - h->ts + ext5 + ext3 < min_ovlp) return MA_HT_SHORT_OVLP; // short overlap
	// bit 0 of u and v holds the end/orientation chosen above; the upper bits come from qns>>32 and tn
	u |= h->qns>>32<<1, v |= h->tn<<1;
	// p->ul packs the source vertex (high 32 bits) with l (low 32 bits); p->ol is the rest of the query, ql - l
	p->ul = (uint64_t)u<<32 | l, p->v = v, p->ol = ql - l, p->del = 0;
	return l;
}
($name[$id0], $name[$id1]) : ($id0, $id1); if ($strand eq '+') { print join("\t", $n0, $len[$id0], $ab, $ae, '+', $n1, $len[$id1], $bb, $be, $ml, $bl, 255), "\n"; } else { my $l = $len[$id1]; print join("\t", $n0, $len[$id0], $ab, $ae, '-', $n1, $l, $l - $be, $l - $bb, $ml, $bl, 255), "\n"; } } } miniasm-0.3/misc/demo-ecoli-pacbio.sh000077500000000000000000000015551332550340100175550ustar00rootroot00000000000000# # Dependencies: awk, wget, git, gcc and zlib # # Download sample PacBio from the PBcR website wget -O- http://www.cbcb.umd.edu/software/PBcR/data/selfSampleData.tar.gz | tar zxf - ln -s selfSampleData/pacbio_filtered.fastq reads.fq # Install minimap and miniasm (requiring gcc and zlib) git clone https://github.com/lh3/minimap && (cd minimap && make) git clone https://github.com/lh3/miniasm && (cd miniasm && make) # Overlap minimap/minimap -Sw5 -L100 -m0 -t8 reads.fq reads.fq | gzip -1 > reads.paf.gz # Layout miniasm/miniasm -f reads.fq reads.paf.gz > utg.gfa # Convert to FASTA awk '/^S/{print ">"$2"\n"$3}' utg.gfa > utg.fa # Download E. 
#!/usr/bin/env perl
#
# mhap2paf.pl: convert MHAP overlap lines (read from files or stdin) to PAF.
#
# Options:
#   -2       also print each mapping with query and target swapped
#   -f FILE  name list; the first whitespace-separated token on line N replaces
#            the 1-based numeric id N found in input columns 1 and 2
#            (FILE may be gzip'd if it ends in .gz, or '-' for stdin)
#   -l INT   skip overlaps whose block length is below INT
#
# NOTE(review): the column meanings are taken from how the fields are used
# below (ids, ratio, count, strand/start/end/length per read); confirm them
# against the MHAP output documentation.

use strict;
use warnings;
use Getopt::Std;

my %opts = ();
getopts("2f:l:", \%opts);
die("Usage: mhap2paf.pl [-2] [-f name_list] [-l min_len] <in.mhap>\n") if (@ARGV == 0 && -t STDIN);

my $is_dbl = defined($opts{2});
my $min_blen = defined($opts{l})? $opts{l} : 0;

# Load the optional id => name table.
my @a = ();
if (defined $opts{f}) {
	my $fn = $opts{f};
	my $fh;
	if ($fn =~ /\.gz$/) {
		# list-form pipe open: the file name never passes through a shell
		open($fh, '-|', 'gzip', '-dc', $fn) || die("ERROR: failed to run 'gzip -dc $fn': $!\n");
	} elsif ($fn eq '-') {
		open($fh, '<&', \*STDIN) || die("ERROR: failed to read the name list from stdin: $!\n");
	} else {
		open($fh, '<', $fn) || die("ERROR: failed to open '$fn': $!\n");
	}
	while (<$fh>) {
		chomp;
		my @t = split;
		push(@a, $t[0]);
	}
	# for a pipe, close() also reports a non-zero exit status from gzip
	close($fh) || die("ERROR: failed to read '$fn' to the end\n");
}

while (<>) {
	chomp;
	my @t = split;
	# block length: the longer of the two aligned intervals
	my $bl = $t[6] - $t[5] > $t[10] - $t[9]? $t[6] - $t[5] : $t[10] - $t[9];
	next if $bl < $min_blen;
	my $r = $t[2];
	# column 3 is used as a fraction when <= 1 and as a percentage otherwise
	my $ml = int($bl * ($r <= 1.? $r : .01 * $r) + .499);
	my $cm = "cm:i:" . int($t[3] + .499);
	my $rev = $t[4] == $t[8]? '+' : '-';
	if (@a) { # translate 1-based numeric ids to names
		$t[0] = $a[$t[0]-1];
		$t[1] = $a[$t[1]-1];
	}
	print(join("\t", @t[0,7,5,6], $rev, @t[1,11,9,10], $ml, $bl, 255, $cm), "\n");
	print(join("\t", @t[1,11,9,10], $rev, @t[0,7,5,6], $ml, $bl, 255, $cm), "\n") if ($is_dbl);
}
if (getopt.place < 0) ++getopt.ind; return '?'; } if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument getopt.arg = null; if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1; } else { // need an argument if (getopt.place >= 0 && getopt.place < args[getopt.ind].length) getopt.arg = args[getopt.ind].substr(getopt.place); else if (args.length <= ++getopt.ind) { // no arg getopt.place = -1; if (ostr.length > 0 && ostr.charAt(0) == ':') return ':'; return '?'; } else getopt.arg = args[getopt.ind]; // white space getopt.place = -1; ++getopt.ind; } return optopt; } var c, min_len = 2000, min_mapq = 10; while ((c = getopt(arguments, "l:q:")) != null) if (c == 'l') min_len = parseInt(getopt.arg); else if (c == 'q') min_mapq = parseInt(getopt.arg); if (arguments.length - getopt.ind < 2) { warn("Usage: k8 ov-sen.js [options] "); warn("Options:"); warn(" -l INT min overlap length [2000]"); warn(" -q INT min mapping quality [10]"); exit(1); } var file, buf = new Bytes(); var h = {}; file = new File(arguments[getopt.ind]); var a = []; while (file.readline(buf) >= 0) { var t = buf.toString().split("\t"); if (parseInt(t[11]) < min_mapq) continue; if (parseInt(t[10]) < min_len) continue; var st = parseInt(t[7]), en = parseInt(t[8]); var n_shift = 0; if (a.length > 0) { for (var i = 0; i < a.length; ++i) { if (t[5] != a[i][1]) { ++n_shift; } else { var min_en = a[i][3] < en? a[i][3] : en; if (min_en - st >= min_len) break; ++n_shift; } } } if (n_shift > 0) { for (var i = 0; i < n_shift; ++i) a.shift(); } if (a.length > 0) { for (var i = 0; i < a.length; ++i) { if (t[5] != a[i][1]) continue; var min_en = a[i][3] < en? 
#!/usr/bin/env perl
#
# paf2mhap.pl: convert PAF mappings to MHAP-style lines.
#
# The first argument is a FASTA file (gzip'd if it ends in .gz, or '-' for
# stdin). Each distinct sequence name is given a 1-based id in order of first
# appearance. The remaining arguments (or stdin) are PAF; self matches are
# dropped.
#
# Options:
#   -p  print the match ratio (col10/col11) as a percentage instead of a fraction

use strict;
use warnings;
use Getopt::Std;

my %opts = ();
getopts("p", \%opts);
my $is_100 = defined($opts{p});
die("Usage: paf2mhap.pl [-p] <in.fa> <in.paf>\n") if (@ARGV == 0);

warn("Parsing FASTA to create the name<=>id table...\n");
my %hash;
my $fn = shift(@ARGV);
my $fh;
if ($fn =~ /\.gz$/) {
	# decompress the file that was actually given; list form avoids the shell
	open($fh, '-|', 'gzip', '-dc', $fn) || die("ERROR: failed to run 'gzip -dc $fn': $!\n");
} elsif ($fn eq '-') {
	open($fh, '<&', \*STDIN) || die("ERROR: failed to read FASTA from stdin: $!\n");
} else {
	open($fh, '<', $fn) || die("ERROR: failed to open '$fn': $!\n");
}
my $cnt = 0;
while (<$fh>) {
	if (/^>(\S+)/) {
		$hash{$1} = ++$cnt unless defined($hash{$1});
	}
}
# for a pipe, close() also reports a non-zero exit status from gzip
close($fh) || die("ERROR: failed to read '$fn' to the end\n");

warn("Converting PAF to MHAP format...\n");
while (<>) {
	chomp;
	my @t = split;
	next if ($t[0] eq $t[5]); # NB: ignore self matches
	my $cm = /cm:i:(\d+)/? $1 : 0; # value of the optional cm:i: tag, 0 if absent
	my $r = $t[9] / $t[10];
	$r = sprintf("%.4f", $is_100? 100. * $r : $r);
	for my $name ($t[0], $t[5]) {
		die("ERROR: sequence '$name' at PAF line $. is absent from '$fn'\n") unless defined($hash{$name});
	}
	print(join(" ", $hash{$t[0]}, $hash{$t[5]}, $r, $cm, 0, @t[2,3,1], $t[4] eq '+'? 0 : 1, @t[7,8,6]), "\n");
}
/**
 * Merge hits of one query that continue each other on the same target and strand.
 *
 * a        array of PAF records (arrays of columns); paftop() has already converted
 *          columns 1-3 and 6-11 to integers. Modified in place: a hit merged into an
 *          earlier one is removed and the array is compacted.
 * max_gap  two hits are joined when their query gap and target gap differ by less than
 *          this, or when they overlap on both query and target
 *
 * Column indices used: 2/3 query start/end, 4 strand, 5 target name, 6 target length,
 * 7/8 target start/end, 9 matches, 10 block length, 11 mapping quality.
 */
function pafmerge(a, max_gap)
{
	for (var i = 1; i < a.length; ++i) {
		var ai = a[i];
		for (var j = 0; j < i; ++j) {
			var aj = a[j];
			if (aj[4] != ai[4] || aj[5] != ai[5]) continue; // diff strand or chr
			// element 0 of each pair is the hit with the smaller query start, element 1 the other
			var ts = [ai[7], aj[7]], te = [ai[8], aj[8]];
			var qs = [ai[2], aj[2]], qe = [ai[3], aj[3]];
			if (qs[0] > qs[1]) {
				qs = [aj[2], ai[2]], qe = [aj[3], ai[3]];
				ts = [aj[7], ai[7]], te = [aj[8], ai[8]];
				if (ai[4] == '-') { // on '-', measure target coordinates from the other end of the target
					ts = [aj[6] - aj[8], ai[6] - ai[8]];
					te = [aj[6] - aj[7], ai[6] - ai[7]];
				}
			} else {
				if (ai[4] == '-') {
					ts = [ai[6] - ai[8], aj[6] - aj[8]];
					te = [ai[6] - ai[7], aj[6] - aj[7]];
				}
			}
			if (qe[0] > qe[1]) continue; // contained
			if (ts[0] > ts[1]) continue; // target order disagrees with query order
			var qg = qs[1] - qe[0], tg = ts[1] - te[0]; // gaps between the two hits; negative means overlap
			if ((qg < 0 && tg < 0) || Math.abs(tg - qg) < max_gap) {
				//print("Merged: ["+ai[2]+","+ai[3]+") <=> ["+aj[2]+","+aj[3]+") "+ai[4]+" ["+ai[7]+","+ai[8]+") <=> ["+aj[7]+","+aj[8]+")");
				aj[2] = qs[0], aj[3] = qe[1];
				if (aj[4] == '+') {
					aj[7] = ts[0], aj[8] = te[1];
				} else { // convert back to coordinates on the original strand
					aj[7] = aj[6] - te[1], aj[8] = aj[6] - ts[0];
				}
				aj[9] += ai[9], aj[10] += ai[10];
				aj[11] = aj[11] > ai[11]? aj[11] : ai[11]; // keep the larger mapping quality
				a[i] = []; // mark the absorbed hit for removal
				break;
			}
		}
	}
	// drop the emptied entries
	var k = 0;
	for (var i = 0; i < a.length; ++i)
		if (a[i].length != 0) a[k++] = a[i];
	a.length = k;
}
// Main script: read SAM from a file (or from the default File() when no argument is
// given) and print one PAF line per retained alignment, followed by mm/io/in/do/dn tags.
// Option -p skips records with flag bit 0x100 set.
var c, pri_only = false;
while ((c = getopt(arguments, "p")) != null)
	if (c == 'p') pri_only = true;

var file = arguments.length == getopt.ind? new File() : new File(arguments[getopt.ind]);
var buf = new Bytes();
var re = /(\d+)([MIDSHNX=])/g; // one CIGAR operation: length + operator

var len = {}, lineno = 0; // len: sequence name => length, collected from @SQ header lines
while (file.readline(buf) >= 0) {
	var m, n_cigar = 0, line = buf.toString();
	++lineno;
	if (line.charAt(0) == '@') { // header line: only @SQ SN/LN are used
		if (/^@SQ/.test(line)) {
			var name = (m = /\tSN:(\S+)/.exec(line)) != null? m[1] : null;
			var l = (m = /\tLN:(\d+)/.exec(line)) != null? parseInt(m[1]) : null;
			if (name != null && l != null) len[name] = l;
		}
		continue;
	}
	var t = line.split("\t");
	var flag = parseInt(t[1]);
	if (t[9] != '*' && t[10] != '*' && t[9].length != t[10].length) throw Error("ERROR at line " + lineno + ": inconsistent SEQ and QUAL lengths - " + t[9].length + " != " + t[10].length);
	if (t[2] == '*' || (flag&4)) continue; // no reference name, or flag bit 4 set
	if (pri_only && (flag&0x100)) continue;
	var tlen = len[t[2]];
	if (tlen == null) throw Error("ERROR at line " + lineno + ": can't find the length of contig " + t[2]);
	var nn = (m = /\tnn:i:(\d+)/.exec(line)) != null? parseInt(m[1]) : 0;
	var NM = (m = /\tNM:i:(\d+)/.exec(line)) != null? parseInt(m[1]) : null;
	var have_NM = NM == null? false : true;
	NM += nn; // the nn:i: value is added to NM
	// clip: [before first M, after]; I, D: [number of gap opens, total gap length]
	var clip = [0, 0], I = [0, 0], D = [0, 0], M = 0, N = 0, ql = 0, tl = 0, mm = 0, ext_cigar = false;
	while ((m = re.exec(t[5])) != null) { // runs until no match, which resets re.lastIndex for the next line
		var l = parseInt(m[1]);
		if (m[2] == 'M') M += l, ql += l, tl += l, ext_cigar = false;
		else if (m[2] == 'I') ++I[0], I[1] += l, ql += l;
		else if (m[2] == 'D') ++D[0], D[1] += l, tl += l;
		else if (m[2] == 'N') N += l, tl += l;
		else if (m[2] == 'S') clip[M == 0? 0 : 1] = l, ql += l;
		else if (m[2] == 'H') clip[M == 0? 0 : 1] = l; // hard clips are not counted in ql
		else if (m[2] == '=') M += l, ql += l, tl += l, ext_cigar = true;
		else if (m[2] == 'X') M += l, ql += l, tl += l, mm += l, ext_cigar = true;
		++n_cigar;
	}
	if (n_cigar > 65535)
		warn("WARNING at line " + lineno + ": " + n_cigar + " CIGAR operations");
	if (tl + parseInt(t[3]) - 1 > tlen) {
		warn("WARNING at line " + lineno + ": alignment end position larger than ref length; skipped");
		continue;
	}
	if (t[9] != '*' && t[9].length != ql) {
		warn("WARNING at line " + lineno + ": SEQ length inconsistent with CIGAR (" + t[9].length + " != " + ql + "); skipped");
		continue;
	}
	// without an NM tag, or when the last M/=/X operation was =/X, derive NM from the CIGAR itself
	if (!have_NM || ext_cigar) NM = I[1] + D[1] + mm;
	if (NM < I[1] + D[1] + mm) {
		warn("WARNING at line " + lineno + ": NM is less than the total number of gaps (" + NM + " < " + (I[1]+D[1]+mm) + ")");
		NM = I[1] + D[1] + mm;
	}
	var extra = ["mm:i:"+(NM-I[1]-D[1]), "io:i:"+I[0], "in:i:"+I[1], "do:i:"+D[0], "dn:i:"+D[1]];
	var match = M - (NM - I[1] - D[1]); // aligned columns minus mismatches
	var blen = M + I[1] + D[1];
	var qlen = M + I[1] + clip[0] + clip[1];
	var qs, qe;
	// with flag bit 16 set the clips are swapped and the strand is printed as '-'
	if (flag&16) qs = clip[1], qe = qlen - clip[0];
	else qs = clip[0], qe = qlen - clip[1];
	var ts = parseInt(t[3]) - 1, te = ts + M + D[1] + N; // t[3] is 1-based; PAF start is 0-based
	var a = [t[0], qlen, qs, qe, flag&16? '-' : '+', t[2], tlen, ts, te, match, blen, t[4]];
	print(a.join("\t"), extra.join("\t"));
}

buf.destroy();
file.close();
'+' : '-', @t[5,7..9,12], $bl, 255), "\n"; } miniasm-0.3/paf.c000066400000000000000000000031671332550340100137260ustar00rootroot00000000000000#include #include #include #include "paf.h" #include "kseq.h" KSTREAM_INIT(gzFile, gzread, 0x10000) paf_file_t *paf_open(const char *fn) { kstream_t *ks; gzFile fp; paf_file_t *pf; fp = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); if (fp == 0) return 0; ks = ks_init(fp); pf = (paf_file_t*)calloc(1, sizeof(paf_file_t)); pf->fp = ks; return pf; } int paf_close(paf_file_t *pf) { kstream_t *ks; if (pf == 0) return 0; free(pf->buf.s); ks = (kstream_t*)pf->fp; gzclose(ks->f); ks_destroy(ks); free(pf); return 0; } int paf_parse(int l, char *s, paf_rec_t *pr) // s must be NULL terminated { // on return: <0 for failure; 0 for success; >0 for filtered char *q, *r; int i, t; for (i = t = 0, q = s; i <= l; ++i) { if (i < l && s[i] != '\t') continue; s[i] = 0; if (t == 0) pr->qn = q; else if (t == 1) pr->ql = strtol(q, &r, 10); else if (t == 2) pr->qs = strtol(q, &r, 10); else if (t == 3) pr->qe = strtol(q, &r, 10); else if (t == 4) pr->rev = (*q == '-'); else if (t == 5) pr->tn = q; else if (t == 6) pr->tl = strtol(q, &r, 10); else if (t == 7) pr->ts = strtol(q, &r, 10); else if (t == 8) pr->te = strtol(q, &r, 10); else if (t == 9) pr->ml = strtol(q, &r, 10); else if (t == 10) pr->bl = strtol(q, &r, 10); ++t, q = i < l? 
&s[i+1] : 0; } if (t < 10) return -1; return 0; } int paf_read(paf_file_t *pf, paf_rec_t *r) { int ret, dret; file_read_more: ret = ks_getuntil((kstream_t*)pf->fp, KS_SEP_LINE, &pf->buf, &dret); if (ret < 0) return ret; ret = paf_parse(pf->buf.l, pf->buf.s, r); if (ret < 0) goto file_read_more; return ret; } miniasm-0.3/paf.h000066400000000000000000000011471332550340100137270ustar00rootroot00000000000000#ifndef PAF_PAF_H #define PAF_PAF_H #include #include #ifndef KSTRING_T #define KSTRING_T kstring_t typedef struct __kstring_t { size_t l, m; char *s; } kstring_t; #endif typedef struct { void *fp; kstring_t buf; } paf_file_t; typedef struct { const char *qn, *tn; // these point to the input string; NOT allocated uint32_t ql, qs, qe, tl, ts, te; uint32_t ml:31, rev:1, bl; } paf_rec_t; #ifdef __cplusplus extern "C" { #endif paf_file_t *paf_open(const char *fn); int paf_close(paf_file_t *pf); int paf_read(paf_file_t *pf, paf_rec_t *r); #ifdef __cplusplus } #endif #endif miniasm-0.3/sdict.c000066400000000000000000000032531332550340100142620ustar00rootroot00000000000000#include #include "sdict.h" #include "khash.h" KHASH_MAP_INIT_STR(str, uint32_t) typedef khash_t(str) shash_t; sdict_t *sd_init(void) { sdict_t *d; d = (sdict_t*)calloc(1, sizeof(sdict_t)); d->h = kh_init(str); return d; } void sd_destroy(sdict_t *d) { uint32_t i; if (d == 0) return; if (d->h) kh_destroy(str, (shash_t*)d->h); for (i = 0; i < d->n_seq; ++i) free(d->seq[i].name); free(d->seq); free(d); } int32_t sd_put(sdict_t *d, const char *name, uint32_t len) { shash_t *h = (shash_t*)d->h; khint_t k; int absent; k = kh_put(str, h, name, &absent); if (absent) { sd_seq_t *s; if (d->n_seq == d->m_seq) { d->m_seq = d->m_seq? 
#include <sys/resource.h>
#include <sys/time.h>
#include <stdio.h>

// Wall-clock reference captured by sys_init(); sys_realtime() is reported relative to it.
static double realtime0;

// Absolute wall-clock time in seconds, independent of realtime0; 0 if the clock cannot be read.
static double sys_walltime(void)
{
	struct timeval tp;
	// POSIX leaves the behavior unspecified for a non-null timezone argument
	if (gettimeofday(&tp, NULL) != 0) return 0.0;
	return tp.tv_sec + tp.tv_usec * 1e-6;
}

/* User + system CPU time of this process in seconds; 0 if getrusage() fails. */
double sys_cputime(void)
{
	struct rusage r;
	if (getrusage(RUSAGE_SELF, &r) != 0) return 0.0;
	return r.ru_utime.tv_sec + r.ru_stime.tv_sec + 1e-6 * (r.ru_utime.tv_usec + r.ru_stime.tv_usec);
}

/* Wall-clock seconds since sys_init(); before sys_init() the absolute time is returned. */
double sys_realtime(void)
{
	return sys_walltime() - realtime0;
}

/* On Linux, raise the soft RLIMIT_AS limit to the hard limit; best effort, no-op elsewhere. */
void sys_liftrlimit(void)
{
#ifdef __linux__
	struct rlimit r;
	if (getrlimit(RLIMIT_AS, &r) != 0) return; // never hand an uninitialized struct to setrlimit()
	r.rlim_cur = r.rlim_max;
	setrlimit(RLIMIT_AS, &r); // failure only means the old limit stays in force
#endif
}

/* Lift the address-space limit and (re)start the wall clock. Safe to call more than once. */
void sys_init(void)
{
	sys_liftrlimit();
	// use the absolute clock: sys_realtime() is already relative to the previous
	// epoch, so storing it here would corrupt the reference on a second call
	realtime0 = sys_walltime();
}

/*
 * Format "<real time>*<CPU time / real time>" into a static buffer and return it.
 * The buffer is overwritten by the next call.
 */
const char *sys_timestamp(void)
{
	static char buf[256];
	double rt, ct;
	rt = sys_realtime();
	ct = sys_cputime();
	snprintf(buf, sizeof(buf), "%.3f*%.2f", rt, rt > 0.0? ct / rt : 0.0); // avoid 0/0 right after sys_init()
	return buf;
}