pax_global_header00006660000000000000000000000064126311674200014514gustar00rootroot0000000000000052 comment=bbefd212adbf6e6d0a680f2d7d725f4eec3c9cd6 minimap-0.2/000077500000000000000000000000001263116742000130075ustar00rootroot00000000000000minimap-0.2/.gitignore000066400000000000000000000000171263116742000147750ustar00rootroot00000000000000.*.swp *.o *.a minimap-0.2/LICENSE.txt000066400000000000000000000020641263116742000146340ustar00rootroot00000000000000The MIT License Copyright (c) 2015 Broad Institute Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. minimap-0.2/Makefile000066400000000000000000000020761263116742000144540ustar00rootroot00000000000000CC= gcc CFLAGS= -g -Wall -O2 -Wc++-compat -Wno-unused-function CPPFLAGS= INCLUDES= -I. 
OBJS= kthread.o misc.o bseq.o sketch.o sdust.o index.o map.o PROG= minimap PROG_EXTRA= sdust minimap-lite LIBS= -lm -lz -lpthread .SUFFIXES:.c .o .c.o: $(CC) -c $(CFLAGS) $(CPPFLAGS) $(INCLUDES) $< -o $@ all:$(PROG) extra:all $(PROG_EXTRA) minimap:main.o libminimap.a $(CC) $(CFLAGS) $< -o $@ -L. -lminimap $(LIBS) minimap-lite:example.o libminimap.a $(CC) $(CFLAGS) $< -o $@ -L. -lminimap $(LIBS) libminimap.a:$(OBJS) $(AR) -csru $@ $(OBJS) sdust:sdust.c kdq.h kvec.h kseq.h sdust.h $(CC) -D_SDUST_MAIN $(CFLAGS) $< -o $@ -lz clean: rm -fr gmon.out *.o a.out $(PROG) $(PROG_EXTRA) *~ *.a *.dSYM session* depend: (LC_ALL=C; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) -- *.c) # DO NOT DELETE bseq.o: bseq.h kseq.h example.o: minimap.h bseq.h kseq.h index.o: minimap.h bseq.h kvec.h khash.h main.o: minimap.h bseq.h map.o: bseq.h kvec.h minimap.h sdust.h ksort.h misc.o: minimap.h bseq.h ksort.h sdust.o: kdq.h kvec.h sdust.h sketch.o: kvec.h minimap.h bseq.h minimap-0.2/README.md000066400000000000000000000110441263116742000142660ustar00rootroot00000000000000## Introduction Minimap is an *experimental* tool to efficiently find multiple approximate mapping positions between two sets of long sequences, such as between reads and reference genomes, between genomes and between long noisy reads. By default, it is tuned to have high sensitivity to 2kb matches around 20% divergence but with low specificity. Minimap does not generate alignments as of now and because of this, it is usually tens of times faster than mainstream *aligners*. With four CPU cores, minimap can map 1.6Gbp PacBio reads to human in 2.5 minutes, 1Gbp PacBio E. coli reads to pre-indexed 9.6Gbp bacterial genomes in 3 minutes, to pre-indexed >100Gbp nt database in ~1 hour (of which ~20 minutes are spent on loading index from the network filesystem; peak RAM: 10GB), map 2800 bacteria to themselves in 1 hour, and map 1Gbp E. coli reads against themselves in a couple of minutes. 
Minimap does not replace mainstream aligners, but it can be useful when you want to quickly identify long approximate matches at moderate divergence among a huge collection of sequences. For this task, it is much faster than most existing tools. ## Usage * Map two sets of long sequences: ```sh minimap target.fa.gz query.fa.gz > out.mini ``` The output is TAB-delimited with each line consisting of query name, length, 0-based start, end, strand, target name, length, start, end, the number of matching bases, the number of co-linear minimizers in the match and the fraction of matching bases. * All-vs-all PacBio read self-mapping for [miniasm][miniasm]: ```sh minimap -Sw5 -L100 -m0 reads.fa reads.fa | gzip -1 > reads.paf.gz ``` * Prebuild index and then map: ```sh minimap -d target.mmi target.fa.gz minimap -l target.mmi query.fa.gz > out.mini ``` Minimap indexing is very fast (1 minute for human genome; 50 minutes for >100Gbp nt database retrieved on 2015-09-30), but for huge repeatedly used databases, prebuilding index is still preferred. * Map sequences against themselve without diagnal matches: ```sh minimap -S sequences.fa sequences.fa > self-match.mini ``` The output may still contain overlapping matches in repetitive regions. ## Algorithm Overview 1. Indexing. Collect all [(*w*,*k*)-minimizers][mini] in a batch (**-I**=4 billion bp) of target sequences and store them in a hash table. Mark top **-f**=0.1% of most frequent minimizers as repeats. Minimap uses [invertible hash function][invhash] to avoid taking ploy-A as minimizers. 2. For each query, collect all (*w*,*k*)-minimizers and look up the hash table for matches (*qi*,*ti*,*si*), where *qi* is the query position, *ti* the target position and *si* indicates whether the minimizer match is on the same strand. 3. For matches on the same strand, sort by {*qi*-*ti*} and then cluster matches within a **-r**=500bp window. Minimap merges two windows if **-m**=50% of minimizer matches overlap. 
For matches on different strands, sort {*qi*+*ti*} and apply a similar clustering procedure. This is inspired by the [Hough transformation][hough]. 4. For each cluster, sort (*qi*,*ti*) by *qi* and solve a [longest increasing sequence problem][lis] for *ti*. This finds the longest co-linear matching chain. Break the chain whenever there is a gap longer than **-g**=10000. 5. Output the start and end of the chain if it contains **-c**=4 or more minimizer matches and the matching length is no less than **-L**=40. 6. Go to 1 and rewind to the first record of query if there are more target sequences; otherwise stop. To increase sensitivity, we may decrease **-w** to index more minimizers; we may also decrease **-k**, though this may greatly impact performance for mammalian genomes. Also note that by default, if the total length of target sequences is less than 4Gbp (1G=1 billion; controlled by **-I**), minimap creates one index and stream all the query sequences in one go. The multiple hits of a query sequence is adjacent to each other in the output. If the total length is greater than 4Gbp, minimap needs to read query sequences multiple times. The multiple hits of a query may not be adjacent. [mini]: http://bioinformatics.oxfordjournals.org/content/20/18/3363.abstract [lis]: https://en.wikipedia.org/wiki/Longest_increasing_subsequence [hough]: https://en.wikipedia.org/wiki/Hough_transform [invhash]: https://gist.github.com/lh3/974ced188be2f90422cc [miniasm]: https://github.com/lh3/miniasm minimap-0.2/bseq.c000066400000000000000000000023131263116742000141040ustar00rootroot00000000000000#include #include #include #include #include #include "bseq.h" #include "kseq.h" KSEQ_INIT(gzFile, gzread) extern unsigned char seq_nt4_table[256]; struct bseq_file_s { int is_eof; gzFile fp; kseq_t *ks; }; bseq_file_t *bseq_open(const char *fn) { bseq_file_t *fp; gzFile f; f = fn && strcmp(fn, "-")? 
gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); if (f == 0) return 0; fp = (bseq_file_t*)calloc(1, sizeof(bseq_file_t)); fp->fp = f; fp->ks = kseq_init(fp->fp); return fp; } void bseq_close(bseq_file_t *fp) { kseq_destroy(fp->ks); gzclose(fp->fp); free(fp); } bseq1_t *bseq_read(bseq_file_t *fp, int chunk_size, int *n_) { int size = 0, m, n; bseq1_t *seqs; kseq_t *ks = fp->ks; m = n = 0; seqs = 0; while (kseq_read(ks) >= 0) { bseq1_t *s; assert(ks->seq.l <= INT32_MAX); if (n >= m) { m = m? m<<1 : 256; seqs = (bseq1_t*)realloc(seqs, m * sizeof(bseq1_t)); } s = &seqs[n]; s->name = strdup(ks->name.s); s->seq = strdup(ks->seq.s); s->l_seq = ks->seq.l; size += seqs[n++].l_seq; if (size >= chunk_size) break; } if (n == 0) fp->is_eof = 1; *n_ = n; return seqs; } int bseq_eof(bseq_file_t *fp) { return fp->is_eof; } minimap-0.2/bseq.h000066400000000000000000000005471263116742000141200ustar00rootroot00000000000000#ifndef MM_BSEQ_H #define MM_BSEQ_H #include struct bseq_file_s; typedef struct bseq_file_s bseq_file_t; typedef struct { int l_seq, rid; char *name, *seq; } bseq1_t; bseq_file_t *bseq_open(const char *fn); void bseq_close(bseq_file_t *fp); bseq1_t *bseq_read(bseq_file_t *fp, int chunk_size, int *n_); int bseq_eof(bseq_file_t *fp); #endif minimap-0.2/example.c000066400000000000000000000027601263116742000146130ustar00rootroot00000000000000// To compile: // gcc -g -O2 example.c libminimap.a -lz #include #include #include #include #include "minimap.h" #include "kseq.h" KSEQ_INIT(gzFile, gzread) int main(int argc, char *argv[]) { if (argc < 3) { fprintf(stderr, "Usage: minimap-lite \n"); return 1; } // open query file for reading; you may use your favorite FASTA/Q parser gzFile f = gzopen(argv[2], "r"); assert(f); kseq_t *ks = kseq_init(f); // create index for target; we are creating one index for all target sequence int n_threads = 4, w = 10, k = 15; mm_idx_t *mi = mm_idx_build(argv[1], w, k, n_threads); assert(mi); // mapping mm_mapopt_t opt; mm_mapopt_init(&opt); // 
initialize mapping parameters mm_tbuf_t *tbuf = mm_tbuf_init(); // thread buffer; for multi-threading, allocate one tbuf for each thread while (kseq_read(ks) >= 0) { // each kseq_read() call reads one query sequence const mm_reg1_t *reg; int j, n_reg; // get all hits for the query reg = mm_map(mi, ks->seq.l, ks->seq.s, &n_reg, tbuf, &opt, 0); // traverse hits and print them out for (j = 0; j < n_reg; ++j) { const mm_reg1_t *r = ®[j]; printf("%s\t%d\t%d\t%d\t%c\t", ks->name.s, ks->seq.l, r->qs, r->qe, "+-"[r->rev]); printf("%s\t%d\t%d\t%d\t%d\t%d\n", mi->name[r->rid], mi->len[r->rid], r->rs, r->re, r->len, r->cnt); } } mm_tbuf_destroy(tbuf); // deallocate index and close the query file mm_idx_destroy(mi); kseq_destroy(ks); gzclose(f); return 0; } minimap-0.2/index.c000066400000000000000000000212711263116742000142650ustar00rootroot00000000000000#include #include #include #include "minimap.h" #include "kvec.h" #include "khash.h" #define idx_hash(a) ((a)>>1) #define idx_eq(a, b) ((a)>>1 == (b)>>1) KHASH_INIT(idx, uint64_t, uint64_t, 1, idx_hash, idx_eq) typedef khash_t(idx) idxhash_t; void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n); mm_idx_t *mm_idx_init(int w, int k, int b) { mm_idx_t *mi; if (k*2 < b) b = k * 2; if (w < 1) w = 1; mi = (mm_idx_t*)calloc(1, sizeof(mm_idx_t)); mi->w = w, mi->k = k, mi->b = b; mi->max_occ = UINT32_MAX; mi->B = (mm_idx_bucket_t*)calloc(1<b; ++i) { free(mi->B[i].p); free(mi->B[i].a.a); kh_destroy(idx, (idxhash_t*)mi->B[i].h); } free(mi->B); if (mi->name) for (i = 0; i < mi->n; ++i) free(mi->name[i]); free(mi->len); free(mi->name); free(mi); } const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n) { int mask = (1<b) - 1; khint_t k; mm_idx_bucket_t *b = &mi->B[minier&mask]; idxhash_t *h = (idxhash_t*)b->h; *n = 0; if (h == 0) return 0; k = kh_get(idx, h, minier>>mi->b<<1); if (k == kh_end(h)) return 0; if (kh_key(h, k)&1) { *n = 1; return &kh_val(h, k); } else { *n = (uint32_t)kh_val(h, k); 
return &b->p[kh_val(h, k)>>32]; } } uint32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f) { int i; size_t n = 0; uint32_t thres; khint_t *a, k; if (f <= 0.) return UINT32_MAX; for (i = 0; i < 1<b; ++i) if (mi->B[i].h) n += kh_size((idxhash_t*)mi->B[i].h); a = (uint32_t*)malloc(n * 4); for (i = n = 0; i < 1<b; ++i) { idxhash_t *h = (idxhash_t*)mi->B[i].h; if (h == 0) continue; for (k = 0; k < kh_end(h); ++k) { if (!kh_exist(h, k)) continue; a[n++] = kh_key(h, k)&1? 1 : (uint32_t)kh_val(h, k); } } thres = ks_ksmall_uint32_t(n, a, (uint32_t)((1. - f) * n)) + 1; free(a); return thres; } void mm_idx_set_max_occ(mm_idx_t *mi, float f) { mi->freq_thres = f; mi->max_occ = mm_idx_cal_max_occ(mi, f); } /********************************* * Sort and generate hash tables * *********************************/ static void worker_post(void *g, long i, int tid) { int j, start_a, start_p, n, n_keys; idxhash_t *h; mm_idx_t *mi = (mm_idx_t*)g; mm_idx_bucket_t *b = &mi->B[i]; if (b->a.n == 0) return; // sort by minimizer radix_sort_128x(b->a.a, b->a.a + b->a.n); // count and preallocate for (j = 1, n = 1, n_keys = 0, b->n = 0; j <= b->a.n; ++j) { if (j == b->a.n || b->a.a[j].x != b->a.a[j-1].x) { ++n_keys; if (n > 1) b->n += n; n = 1; } else ++n; } h = kh_init(idx); kh_resize(idx, h, n_keys); b->p = (uint64_t*)calloc(b->n, 8); // create the hash table for (j = 1, n = 1, start_a = start_p = 0; j <= b->a.n; ++j) { if (j == b->a.n || b->a.a[j].x != b->a.a[j-1].x) { khint_t itr; int absent; mm128_t *p = &b->a.a[j-1]; itr = kh_put(idx, h, p->x>>mi->b<<1, &absent); assert(absent && j - start_a == n); if (n == 1) { kh_key(h, itr) |= 1; kh_val(h, itr) = p->y; } else { int k; for (k = 0; k < n; ++k) b->p[start_p + k] = b->a.a[start_a + k].y; kh_val(h, itr) = (uint64_t)start_p<<32 | n; start_p += n; } start_a = j, n = 1; } else ++n; } b->h = h; assert(b->n == start_p); // deallocate and clear b->a free(b->a.a); b->a.n = b->a.m = 0, b->a.a = 0; } static void mm_idx_post(mm_idx_t *mi, int 
n_threads) { kt_for(n_threads, worker_post, mi, 1<b); } /****************** * Generate index * ******************/ #include #include #include "bseq.h" void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps); typedef struct { int tbatch_size, n_processed, keep_name; bseq_file_t *fp; uint64_t ibatch_size, n_read; mm_idx_t *mi; } pipeline_t; typedef struct { int n_seq; bseq1_t *seq; mm128_v a; } step_t; static void mm_idx_add(mm_idx_t *mi, int n, const mm128_t *a) { int i, mask = (1<b) - 1; for (i = 0; i < n; ++i) { mm128_v *p = &mi->B[a[i].x&mask].a; kv_push(mm128_t, *p, a[i]); } } static void *worker_pipeline(void *shared, int step, void *in) { int i; pipeline_t *p = (pipeline_t*)shared; if (step == 0) { // step 0: read sequences step_t *s; if (p->n_read > p->ibatch_size) return 0; s = (step_t*)calloc(1, sizeof(step_t)); s->seq = bseq_read(p->fp, p->tbatch_size, &s->n_seq); if (s->seq) { uint32_t old_m = p->mi->n, m, n; assert((uint64_t)p->n_processed + s->n_seq <= INT32_MAX); m = n = p->mi->n + s->n_seq; kroundup32(m); kroundup32(old_m); if (old_m != m) { if (p->keep_name) p->mi->name = (char**)realloc(p->mi->name, m * sizeof(char*)); p->mi->len = (int*)realloc(p->mi->len, m * sizeof(int)); } for (i = 0; i < s->n_seq; ++i) { if (p->keep_name) { assert(strlen(s->seq[i].name) <= 254); p->mi->name[p->mi->n] = strdup(s->seq[i].name); } p->mi->len[p->mi->n++] = s->seq[i].l_seq; s->seq[i].rid = p->n_processed++; p->n_read += s->seq[i].l_seq; } return s; } else free(s); } else if (step == 1) { // step 1: compute sketch step_t *s = (step_t*)in; for (i = 0; i < s->n_seq; ++i) { bseq1_t *t = &s->seq[i]; mm_sketch(t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, &s->a); free(t->seq); free(t->name); } free(s->seq); s->seq = 0; return s; } else if (step == 2) { // dispatch sketch to buckets step_t *s = (step_t*)in; mm_idx_add(p->mi, s->a.n, s->a.a); free(s->a.a); free(s); } return 0; } mm_idx_t *mm_idx_gen(bseq_file_t *fp, int w, int k, 
int b, int tbatch_size, int n_threads, uint64_t ibatch_size, int keep_name) { pipeline_t pl; memset(&pl, 0, sizeof(pipeline_t)); pl.tbatch_size = tbatch_size; pl.keep_name = keep_name; pl.ibatch_size = ibatch_size; pl.fp = fp; if (pl.fp == 0) return 0; pl.mi = mm_idx_init(w, k, b); kt_pipeline(n_threads < 3? n_threads : 3, worker_pipeline, &pl, 3); if (mm_verbose >= 3) fprintf(stderr, "[M::%s::%.3f*%.2f] collected minimizers\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0)); mm_idx_post(pl.mi, n_threads); if (mm_verbose >= 3) fprintf(stderr, "[M::%s::%.3f*%.2f] sorted minimizers\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0)); return pl.mi; } mm_idx_t *mm_idx_build(const char *fn, int w, int k, int n_threads) // a simpler interface { bseq_file_t *fp; mm_idx_t *mi; fp = bseq_open(fn); if (fp == 0) return 0; mi = mm_idx_gen(fp, w, k, MM_IDX_DEF_B, 1<<18, n_threads, UINT64_MAX, 1); mm_idx_set_max_occ(mi, 0.001); bseq_close(fp); return mi; } /************* * index I/O * *************/ #define MM_IDX_MAGIC "MMI\1" void mm_idx_dump(FILE *fp, const mm_idx_t *mi) { uint32_t x[6]; int i; x[0] = mi->w, x[1] = mi->k, x[2] = mi->b, x[3] = mi->n, x[4] = mi->name? 1 : 0, x[5] = mi->max_occ; fwrite(MM_IDX_MAGIC, 1, 4, fp); fwrite(x, 4, 6, fp); fwrite(&mi->freq_thres, sizeof(float), 1, fp); fwrite(mi->len, 4, mi->n, fp); if (mi->name) { for (i = 0; i < mi->n; ++i) { uint8_t l; l = strlen(mi->name[i]); fwrite(&l, 1, 1, fp); fwrite(mi->name[i], 1, l, fp); } } for (i = 0; i < 1<b; ++i) { mm_idx_bucket_t *b = &mi->B[i]; khint_t k; idxhash_t *h = (idxhash_t*)b->h; uint32_t size = h? 
h->size : 0; fwrite(&b->n, 4, 1, fp); fwrite(b->p, 8, b->n, fp); fwrite(&size, 4, 1, fp); if (size == 0) continue; for (k = 0; k < kh_end(h); ++k) { uint64_t x[2]; if (!kh_exist(h, k)) continue; x[0] = kh_key(h, k), x[1] = kh_val(h, k); fwrite(x, 8, 2, fp); } } } mm_idx_t *mm_idx_load(FILE *fp) { int i; char magic[4]; uint32_t x[6]; mm_idx_t *mi; if (fread(magic, 1, 4, fp) != 4) return 0; if (strncmp(magic, MM_IDX_MAGIC, 4) != 0) return 0; if (fread(x, 4, 6, fp) != 6) return 0; mi = mm_idx_init(x[0], x[1], x[2]); mi->n = x[3], mi->max_occ = x[5]; fread(&mi->freq_thres, sizeof(float), 1, fp); mi->len = (int32_t*)malloc(mi->n * 4); fread(mi->len, 4, mi->n, fp); if (x[4]) { // has names mi->name = (char**)calloc(mi->n, sizeof(char*)); for (i = 0; i < mi->n; ++i) { uint8_t l; fread(&l, 1, 1, fp); mi->name[i] = (char*)malloc(l + 1); fread(mi->name[i], 1, l, fp); mi->name[i][l] = 0; } } for (i = 0; i < 1<b; ++i) { mm_idx_bucket_t *b = &mi->B[i]; uint32_t j, size; khint_t k; idxhash_t *h; fread(&b->n, 4, 1, fp); b->p = (uint64_t*)malloc(b->n * 8); fread(b->p, 8, b->n, fp); fread(&size, 4, 1, fp); if (size == 0) continue; b->h = h = kh_init(idx); kh_resize(idx, h, size); for (j = 0; j < size; ++j) { uint64_t x[2]; int absent; fread(x, 8, 2, fp); k = kh_put(idx, h, x[0], &absent); assert(absent); kh_val(h, k) = x[1]; } } return mi; } minimap-0.2/kdq.h000066400000000000000000000102601263116742000137360ustar00rootroot00000000000000#ifndef __AC_KDQ_H #define __AC_KDQ_H #include #include #define __KDQ_TYPE(type) \ typedef struct { \ size_t front:58, bits:6, count, mask; \ type *a; \ } kdq_##type##_t; #define kdq_t(type) kdq_##type##_t #define kdq_size(q) ((q)->count) #define kdq_first(q) ((q)->a[(q)->front]) #define kdq_last(q) ((q)->a[((q)->front + (q)->count - 1) & (q)->mask]) #define kdq_at(q, i) ((q)->a[((q)->front + (i)) & (q)->mask]) #define __KDQ_IMPL(type, SCOPE) \ SCOPE kdq_##type##_t *kdq_init_##type() \ { \ kdq_##type##_t *q; \ q = (kdq_##type##_t*)calloc(1, 
sizeof(kdq_##type##_t)); \ q->bits = 2, q->mask = (1ULL<bits) - 1; \ q->a = (type*)malloc((1<bits) * sizeof(type)); \ return q; \ } \ SCOPE void kdq_destroy_##type(kdq_##type##_t *q) \ { \ if (q == 0) return; \ free(q->a); free(q); \ } \ SCOPE int kdq_resize_##type(kdq_##type##_t *q, int new_bits) \ { \ size_t new_size = 1ULL<bits; \ if (new_size < q->count) { /* not big enough */ \ int i; \ for (i = 0; i < 64; ++i) \ if (1ULL< q->count) break; \ new_bits = i, new_size = 1ULL<bits) return q->bits; /* unchanged */ \ if (new_bits > q->bits) q->a = (type*)realloc(q->a, (1ULL<front + q->count <= old_size) { /* unwrapped */ \ if (q->front + q->count > new_size) /* only happens for shrinking */ \ memmove(q->a, q->a + new_size, (q->front + q->count - new_size) * sizeof(type)); \ } else { /* wrapped */ \ memmove(q->a + (new_size - (old_size - q->front)), q->a + q->front, (old_size - q->front) * sizeof(type)); \ q->front = new_size - (old_size - q->front); \ } \ q->bits = new_bits, q->mask = (1ULL<bits) - 1; \ if (new_bits < q->bits) q->a = (type*)realloc(q->a, (1ULL<bits; \ } \ SCOPE type *kdq_pushp_##type(kdq_##type##_t *q) \ { \ if (q->count == 1ULL<bits) kdq_resize_##type(q, q->bits + 1); \ return &q->a[((q->count++) + q->front) & (q)->mask]; \ } \ SCOPE void kdq_push_##type(kdq_##type##_t *q, type v) \ { \ if (q->count == 1ULL<bits) kdq_resize_##type(q, q->bits + 1); \ q->a[((q->count++) + q->front) & (q)->mask] = v; \ } \ SCOPE type *kdq_unshiftp_##type(kdq_##type##_t *q) \ { \ if (q->count == 1ULL<bits) kdq_resize_##type(q, q->bits + 1); \ ++q->count; \ q->front = q->front? q->front - 1 : (1ULL<bits) - 1; \ return &q->a[q->front]; \ } \ SCOPE void kdq_unshift_##type(kdq_##type##_t *q, type v) \ { \ type *p; \ p = kdq_unshiftp_##type(q); \ *p = v; \ } \ SCOPE type *kdq_pop_##type(kdq_##type##_t *q) \ { \ return q->count? 
&q->a[((--q->count) + q->front) & q->mask] : 0; \ } \ SCOPE type *kdq_shift_##type(kdq_##type##_t *q) \ { \ type *d = 0; \ if (q->count == 0) return 0; \ d = &q->a[q->front++]; \ q->front &= q->mask; \ --q->count; \ return d; \ } #define KDQ_INIT2(type, SCOPE) \ __KDQ_TYPE(type) \ __KDQ_IMPL(type, SCOPE) #ifndef klib_unused #if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) #define klib_unused __attribute__ ((__unused__)) #else #define klib_unused #endif #endif /* klib_unused */ #define KDQ_INIT(type) KDQ_INIT2(type, static inline klib_unused) #define KDQ_DECLARE(type) \ __KDQ_TYPE(type) \ kdq_##type##_t *kdq_init_##type(); \ void kdq_destroy_##type(kdq_##type##_t *q); \ int kdq_resize_##type(kdq_##type##_t *q, int new_bits); \ type *kdq_pushp_##type(kdq_##type##_t *q); \ void kdq_push_##type(kdq_##type##_t *q, type v); \ type *kdq_unshiftp_##type(kdq_##type##_t *q); \ void kdq_unshift_##type(kdq_##type##_t *q, type v); \ type *kdq_pop_##type(kdq_##type##_t *q); \ type *kdq_shift_##type(kdq_##type##_t *q); #define kdq_init(type) kdq_init_##type() #define kdq_destroy(type, q) kdq_destroy_##type(q) #define kdq_resize(type, q, new_bits) kdq_resize_##type(q, new_bits) #define kdq_pushp(type, q) kdq_pushp_##type(q) #define kdq_push(type, q, v) kdq_push_##type(q, v) #define kdq_pop(type, q) kdq_pop_##type(q) #define kdq_unshiftp(type, q) kdq_unshiftp_##type(q) #define kdq_unshift(type, q, v) kdq_unshift_##type(q, v) #define kdq_shift(type, q) kdq_shift_##type(q) #endif minimap-0.2/khash.h000066400000000000000000000515011263116742000142600ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008, 2009, 2011 by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 
copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* An example: #include "khash.h" KHASH_MAP_INIT_INT(32, char) int main() { int ret, is_missing; khiter_t k; khash_t(32) *h = kh_init(32); k = kh_put(32, h, 5, &ret); kh_value(h, k) = 10; k = kh_get(32, h, 10); is_missing = (k == kh_end(h)); k = kh_get(32, h, 5); kh_del(32, h, k); for (k = kh_begin(h); k != kh_end(h); ++k) if (kh_exist(h, k)) kh_value(h, k) = 1; kh_destroy(32, h); return 0; } */ /* 2013-05-02 (0.2.8): * Use quadratic probing. When the capacity is power of 2, stepping function i*(i+1)/2 guarantees to traverse each bucket. It is better than double hashing on cache performance and is more robust than linear probing. In theory, double hashing should be more robust than quadratic probing. However, my implementation is probably not for large hash tables, because the second hash function is closely tied to the first hash function, which reduce the effectiveness of double hashing. Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php 2011-12-29 (0.2.7): * Minor code clean up; no actual effect. 2011-09-16 (0.2.6): * The capacity is a power of 2. This seems to dramatically improve the speed for simple keys. Thank Zilong Tan for the suggestion. 
Reference: - http://code.google.com/p/ulib/ - http://nothings.org/computer/judy/ * Allow to optionally use linear probing which usually has better performance for random input. Double hashing is still the default as it is more robust to certain non-random input. * Added Wang's integer hash function (not used by default). This hash function is more robust to certain non-random input. 2011-02-14 (0.2.5): * Allow to declare global functions. 2009-09-26 (0.2.4): * Improve portability 2008-09-19 (0.2.3): * Corrected the example * Improved interfaces 2008-09-11 (0.2.2): * Improved speed a little in kh_put() 2008-09-10 (0.2.1): * Added kh_clear() * Fixed a compiling error 2008-09-02 (0.2.0): * Changed to token concatenation which increases flexibility. 2008-08-31 (0.1.2): * Fixed a bug in kh_get(), which has not been tested previously. 2008-08-31 (0.1.1): * Added destructor */ #ifndef __AC_KHASH_H #define __AC_KHASH_H /*! @header Generic hash table library. */ #define AC_VERSION_KHASH_H "0.2.8" #include #include #include /* compiler specific configuration */ #if UINT_MAX == 0xffffffffu typedef unsigned int khint32_t; #elif ULONG_MAX == 0xffffffffu typedef unsigned long khint32_t; #endif #if ULONG_MAX == ULLONG_MAX typedef unsigned long khint64_t; #else typedef unsigned long long khint64_t; #endif #ifndef kh_inline #ifdef _MSC_VER #define kh_inline __inline #else #define kh_inline inline #endif #endif /* kh_inline */ typedef khint32_t khint_t; typedef khint_t khiter_t; #define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) #define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) #define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) #define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) #define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) #define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) #define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) #define __ac_fsize(m) ((m) < 16? 
1 : (m)>>4) #ifndef kroundup32 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif #ifndef kcalloc #define kcalloc(N,Z) calloc(N,Z) #endif #ifndef kmalloc #define kmalloc(Z) malloc(Z) #endif #ifndef krealloc #define krealloc(P,Z) realloc(P,Z) #endif #ifndef kfree #define kfree(P) free(P) #endif static const double __ac_HASH_UPPER = 0.77; #define __KHASH_TYPE(name, khkey_t, khval_t) \ typedef struct kh_##name##_s { \ khint_t n_buckets, size, n_occupied, upper_bound; \ khint32_t *flags; \ khkey_t *keys; \ khval_t *vals; \ } kh_##name##_t; #define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ extern kh_##name##_t *kh_init_##name(void); \ extern void kh_destroy_##name(kh_##name##_t *h); \ extern void kh_clear_##name(kh_##name##_t *h); \ extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ extern void kh_del_##name(kh_##name##_t *h, khint_t x); #define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ SCOPE kh_##name##_t *kh_init_##name(void) { \ return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \ } \ SCOPE void kh_destroy_##name(kh_##name##_t *h) \ { \ if (h) { \ kfree((void *)h->keys); kfree(h->flags); \ kfree((void *)h->vals); \ kfree(h); \ } \ } \ SCOPE void kh_clear_##name(kh_##name##_t *h) \ { \ if (h && h->flags) { \ memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ h->size = h->n_occupied = 0; \ } \ } \ SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ { \ if (h->n_buckets) { \ khint_t k, i, last, mask, step = 0; \ mask = h->n_buckets - 1; \ k = __hash_func(key); i = k & mask; \ last = i; \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ i = (i + (++step)) & mask; \ if (i == last) return h->n_buckets; \ } \ return 
__ac_iseither(h->flags, i)? h->n_buckets : i; \ } else return 0; \ } \ SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ khint32_t *new_flags = 0; \ khint_t j = 1; \ { \ kroundup32(new_n_buckets); \ if (new_n_buckets < 4) new_n_buckets = 4; \ if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ else { /* hash table size to be changed (shrink or expand); rehash */ \ new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ if (!new_flags) return -1; \ memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ if (h->n_buckets < new_n_buckets) { /* expand */ \ khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ if (!new_keys) { kfree(new_flags); return -1; } \ h->keys = new_keys; \ if (kh_is_map) { \ khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ if (!new_vals) { kfree(new_flags); return -1; } \ h->vals = new_vals; \ } \ } /* otherwise shrink */ \ } \ } \ if (j) { /* rehashing is needed */ \ for (j = 0; j != h->n_buckets; ++j) { \ if (__ac_iseither(h->flags, j) == 0) { \ khkey_t key = h->keys[j]; \ khval_t val; \ khint_t new_mask; \ new_mask = new_n_buckets - 1; \ if (kh_is_map) val = h->vals[j]; \ __ac_set_isdel_true(h->flags, j); \ while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ khint_t k, i, step = 0; \ k = __hash_func(key); \ i = k & new_mask; \ while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \ __ac_set_isempty_false(new_flags, i); \ if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ __ac_set_isdel_true(h->flags, i); /* mark it as 
deleted in the old hash table */ \ } else { /* write the element and jump out of the loop */ \ h->keys[i] = key; \ if (kh_is_map) h->vals[i] = val; \ break; \ } \ } \ } \ } \ if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ } \ kfree(h->flags); /* free the working space */ \ h->flags = new_flags; \ h->n_buckets = new_n_buckets; \ h->n_occupied = h->size; \ h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ } \ return 0; \ } \ SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ { \ khint_t x; \ if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ if (h->n_buckets > (h->size<<1)) { \ if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ *ret = -1; return h->n_buckets; \ } \ } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ *ret = -1; return h->n_buckets; \ } \ } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ { \ khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \ x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ else { \ last = i; \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ if (__ac_isdel(h->flags, i)) site = i; \ i = (i + (++step)) & mask; \ if (i == last) { x = site; break; } \ } \ if (x == h->n_buckets) { \ if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ else x = i; \ } \ } \ } \ if (__ac_isempty(h->flags, x)) { /* not present at all */ \ h->keys[x] = key; \ __ac_set_isboth_false(h->flags, x); \ ++h->size; ++h->n_occupied; \ *ret = 1; \ } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ h->keys[x] = key; \ __ac_set_isboth_false(h->flags, x); \ ++h->size; \ 
*ret = 2; \ } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ return x; \ } \ SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ { \ if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ __ac_set_isdel_true(h->flags, x); \ --h->size; \ } \ } #define KHASH_DECLARE(name, khkey_t, khval_t) \ __KHASH_TYPE(name, khkey_t, khval_t) \ __KHASH_PROTOTYPES(name, khkey_t, khval_t) #define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ __KHASH_TYPE(name, khkey_t, khval_t) \ __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) #define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) /* --- BEGIN OF HASH FUNCTIONS --- */ /*! @function @abstract Integer hash function @param key The integer [khint32_t] @return The hash value [khint_t] */ #define kh_int_hash_func(key) (khint32_t)(key) /*! @function @abstract Integer comparison function */ #define kh_int_hash_equal(a, b) ((a) == (b)) /*! @function @abstract 64-bit integer hash function @param key The integer [khint64_t] @return The hash value [khint_t] */ #define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) /*! @function @abstract 64-bit integer comparison function */ #define kh_int64_hash_equal(a, b) ((a) == (b)) /*! @function @abstract const char* hash function @param s Pointer to a null terminated string @return The hash value */ static kh_inline khint_t __ac_X31_hash_string(const char *s) { khint_t h = (khint_t)*s; if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; return h; } /*! @function @abstract Another interface to const char* hash function @param key Pointer to a null terminated string [const char*] @return The hash value [khint_t] */ #define kh_str_hash_func(key) __ac_X31_hash_string(key) /*! 
@function @abstract Const char* comparison function */ #define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) static kh_inline khint_t __ac_Wang_hash(khint_t key) { key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16); return key; } #define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)key) /* --- END OF HASH FUNCTIONS --- */ /* Other convenient macros... */ /*! @abstract Type of the hash table. @param name Name of the hash table [symbol] */ #define khash_t(name) kh_##name##_t /*! @function @abstract Initiate a hash table. @param name Name of the hash table [symbol] @return Pointer to the hash table [khash_t(name)*] */ #define kh_init(name) kh_init_##name() /*! @function @abstract Destroy a hash table. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] */ #define kh_destroy(name, h) kh_destroy_##name(h) /*! @function @abstract Reset a hash table without deallocating memory. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] */ #define kh_clear(name, h) kh_clear_##name(h) /*! @function @abstract Resize a hash table. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] @param s New size [khint_t] */ #define kh_resize(name, h, s) kh_resize_##name(h, s) /*! @function @abstract Insert a key to the hash table. @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] @param k Key [type of keys] @param r Extra return code: -1 if the operation failed; 0 if the key is present in the hash table; 1 if the bucket is empty (never used); 2 if the element in the bucket has been deleted [int*] @return Iterator to the inserted element [khint_t] */ #define kh_put(name, h, k, r) kh_put_##name(h, k, r) /*! @function @abstract Retrieve a key from the hash table. 
@param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  k     Key [type of keys]
  @return       Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
 */
#define kh_get(name, h, k) kh_get_##name(h, k)

/*! @function
  @abstract     Remove a key from the hash table.
  @param  name  Name of the hash table [symbol]
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  k     Iterator to the element to be deleted [khint_t]
  @discussion   The argument is an iterator (as returned by kh_get() or
                kh_put()), not a key.
 */
#define kh_del(name, h, k) kh_del_##name(h, k)

/*! @function
  @abstract     Test whether a bucket contains data.
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  x     Iterator to the bucket [khint_t]
  @return       1 if containing data; 0 otherwise [int]
  @discussion   Use this to filter buckets when walking the table from
                kh_begin(h) to kh_end(h).
 */
#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))

/*! @function
  @abstract     Get key given an iterator
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  x     Iterator to the bucket [khint_t]
  @return       Key [type of keys]
  @discussion   Expands to an lvalue that indexes the keys array directly;
                no check is made that the bucket is in use.
 */
#define kh_key(h, x) ((h)->keys[x])

/*! @function
  @abstract     Get value given an iterator
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  x     Iterator to the bucket [khint_t]
  @return       Value [type of values]
  @discussion   For hash sets, calling this results in segfault.
 */
#define kh_val(h, x) ((h)->vals[x])

/*! @function
  @abstract     Alias of kh_val()
 */
#define kh_value(h, x) ((h)->vals[x])

/*! @function
  @abstract     Get the start iterator
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       The start iterator [khint_t]; always 0, the argument is unused
 */
#define kh_begin(h) (khint_t)(0)

/*! @function
  @abstract     Get the end iterator
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       The end iterator [khint_t]; equal to the number of buckets
 */
#define kh_end(h) ((h)->n_buckets)

/*! @function
  @abstract     Get the number of elements in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       Number of elements in the hash table [khint_t]
 */
#define kh_size(h) ((h)->size)

/*!
@function
  @abstract     Get the number of buckets in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @return       Number of buckets in the hash table [khint_t]
 */
#define kh_n_buckets(h) ((h)->n_buckets)

/*! @function
  @abstract     Iterate over the entries in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  kvar  Variable to which key will be assigned
  @param  vvar  Variable to which value will be assigned
  @param  code  Block of code to execute
  @discussion   Buckets for which kh_exist() is false are skipped. The value
                is read through kh_val(), so this is for hash maps only.
                The loop counter is a local named __i.
 */
#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \
    for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
        if (!kh_exist(h,__i)) continue; \
        (kvar) = kh_key(h,__i); \
        (vvar) = kh_val(h,__i); \
        code; \
    } }

/*! @function
  @abstract     Iterate over the values in the hash table
  @param  h     Pointer to the hash table [khash_t(name)*]
  @param  vvar  Variable to which value will be assigned
  @param  code  Block of code to execute
  @discussion   Same traversal as kh_foreach() but without copying the key.
 */
#define kh_foreach_value(h, vvar, code) { khint_t __i; \
    for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
        if (!kh_exist(h,__i)) continue; \
        (vvar) = kh_val(h,__i); \
        code; \
    } }

/* More convenient interfaces */

/*! @function
  @abstract     Instantiate a hash set containing integer keys
  @param  name  Name of the hash table [symbol]
 */
#define KHASH_SET_INIT_INT(name) \
    KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)

/*! @function
  @abstract     Instantiate a hash map containing integer keys
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
 */
#define KHASH_MAP_INIT_INT(name, khval_t) \
    KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)

/*! @function
  @abstract     Instantiate a hash set containing 64-bit integer keys
  @param  name  Name of the hash table [symbol]
 */
#define KHASH_SET_INIT_INT64(name) \
    KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)

/*!
@function
  @abstract     Instantiate a hash map containing 64-bit integer keys
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
 */
#define KHASH_MAP_INIT_INT64(name, khval_t) \
    KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)

/* Key type used by the string-keyed instantiations below. */
typedef const char *kh_cstr_t;
/*! @function
  @abstract     Instantiate a hash set containing const char* keys
  @param  name  Name of the hash table [symbol]
  @discussion   Keys are stored as pointers; they are hashed and compared by
                string content (kh_str_hash_func / kh_str_hash_equal).
 */
#define KHASH_SET_INIT_STR(name) \
    KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)

/*! @function
  @abstract     Instantiate a hash map containing const char* keys
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
 */
#define KHASH_MAP_INIT_STR(name, khval_t) \
    KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)

#endif /* __AC_KHASH_H */
minimap-0.2/kseq.h000066400000000000000000000210451263116742000141250ustar00rootroot00000000000000/* The MIT License

   Copyright (c) 2008, 2009, 2011 Attractive Chaos

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* Last Modified: 05MAR2012 */ #ifndef AC_KSEQ_H #define AC_KSEQ_H #include #include #include #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r #define KS_SEP_TAB 1 // isspace() && !' ' #define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) #define KS_SEP_MAX 2 #define __KS_TYPE(type_t) \ typedef struct __kstream_t { \ int begin, end; \ int is_eof:2, bufsize:30; \ type_t f; \ unsigned char *buf; \ } kstream_t; #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) #define __KS_BASIC(SCOPE, type_t, __bufsize) \ SCOPE kstream_t *ks_init(type_t f) \ { \ kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ ks->f = f; ks->bufsize = __bufsize; \ ks->buf = (unsigned char*)malloc(__bufsize); \ return ks; \ } \ SCOPE void ks_destroy(kstream_t *ks) \ { \ if (!ks) return; \ free(ks->buf); \ free(ks); \ } #define __KS_INLINED(__read) \ static inline int ks_getc(kstream_t *ks) \ { \ if (ks->is_eof && ks->begin >= ks->end) return -1; \ if (ks->begin >= ks->end) { \ ks->begin = 0; \ ks->end = __read(ks->f, ks->buf, ks->bufsize); \ if (ks->end < ks->bufsize) ks->is_eof = 1; \ if (ks->end == 0) return -1; \ } \ return (int)ks->buf[ks->begin++]; \ } \ static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ { return ks_getuntil2(ks, delimiter, str, dret, 0); } #ifndef KSTRING_T #define KSTRING_T kstring_t typedef struct __kstring_t { unsigned l, m; char *s; } kstring_t; #endif #ifndef kroundup32 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif #define __KS_GETUNTIL(SCOPE, __read) \ SCOPE int ks_getuntil2(kstream_t *ks, int 
delimiter, kstring_t *str, int *dret, int append) \ { \ if (dret) *dret = 0; \ str->l = append? str->l : 0; \ if (ks->begin >= ks->end && ks->is_eof) return -1; \ for (;;) { \ int i; \ if (ks->begin >= ks->end) { \ if (!ks->is_eof) { \ ks->begin = 0; \ ks->end = __read(ks->f, ks->buf, ks->bufsize); \ if (ks->end < ks->bufsize) ks->is_eof = 1; \ if (ks->end == 0) break; \ } else break; \ } \ if (delimiter == KS_SEP_LINE) { \ for (i = ks->begin; i < ks->end; ++i) \ if (ks->buf[i] == '\n') break; \ } else if (delimiter > KS_SEP_MAX) { \ for (i = ks->begin; i < ks->end; ++i) \ if (ks->buf[i] == delimiter) break; \ } else if (delimiter == KS_SEP_SPACE) { \ for (i = ks->begin; i < ks->end; ++i) \ if (isspace(ks->buf[i])) break; \ } else if (delimiter == KS_SEP_TAB) { \ for (i = ks->begin; i < ks->end; ++i) \ if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ } else i = 0; /* never come to here! */ \ if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ str->m = str->l + (i - ks->begin) + 1; \ kroundup32(str->m); \ str->s = (char*)realloc(str->s, str->m); \ } \ memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ str->l = str->l + (i - ks->begin); \ ks->begin = i + 1; \ if (i < ks->end) { \ if (dret) *dret = ks->buf[i]; \ break; \ } \ } \ if (str->s == 0) { \ str->m = 1; \ str->s = (char*)calloc(1, 1); \ } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ str->s[str->l] = '\0'; \ return str->l; \ } #define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \ __KS_TYPE(type_t) \ __KS_BASIC(SCOPE, type_t, __bufsize) \ __KS_GETUNTIL(SCOPE, __read) \ __KS_INLINED(__read) #define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize) #define KSTREAM_DECLARE(type_t, __read) \ __KS_TYPE(type_t) \ extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \ extern kstream_t *ks_init(type_t f); \ extern void ks_destroy(kstream_t *ks); \ __KS_INLINED(__read) 
/******************
 * FASTA/Q parser *
 ******************/

/* Reset the parser state and the underlying stream buffer so that the next
 * kseq_read() starts from scratch; the wrapped handle is not repositioned
 * here. */
#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)

/* kseq_init():    allocate a zeroed parser and a stream around fd.
 *                 The calloc() result is not checked before use.
 * kseq_destroy(): free the four string buffers, the stream and the parser;
 *                 a null pointer is accepted. */
#define __KSEQ_BASIC(SCOPE, type_t) \
    SCOPE kseq_t *kseq_init(type_t fd) \
    { \
        kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
        s->f = ks_init(fd); \
        return s; \
    } \
    SCOPE void kseq_destroy(kseq_t *ks) \
    { \
        if (!ks) return; \
        free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
        ks_destroy(ks->f); \
        free(ks); \
    }

/* kseq_read(): parse the next FASTA or FASTQ record into seq.
   Return value:
   >=0  length of the sequence (normal)
   -1   end-of-file
   -2   truncated quality string
   seq->last_char carries the header character ('>' or '@') that was already
   consumed while reading the previous record, or 0 if none. */
#define __KSEQ_READ(SCOPE) \
    SCOPE int kseq_read(kseq_t *seq) \
    { \
        int c; \
        kstream_t *ks = seq->f; \
        if (seq->last_char == 0) { /* then jump to the next header line */ \
            while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
            if (c == -1) return -1; /* end of file */ \
            seq->last_char = c; \
        } /* else: the first header char has been read in the previous call */ \
        seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
        if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
        if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
        if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
            seq->seq.m = 256; \
            seq->seq.s = (char*)malloc(seq->seq.m); \
        } \
        while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
            if (c == '\n') continue; /* skip empty lines */ \
            seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
            ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
        } \
        if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
        if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
            seq->seq.m = seq->seq.l + 2; \
            kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
            seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
        } \
        seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
        if (c != '+') return seq->seq.l; /* FASTA */ \
        if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
            seq->qual.m = seq->seq.m; \
            seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
        } \
        while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
        if (c == -1) return -2; /* error: no quality string */ \
        /* keep appending quality lines until as many bytes as the sequence have been read */ \
        while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
        seq->last_char = 0; /* we have not come to the next header line */ \
        if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
        return seq->seq.l; \
    }

/* One parsed record plus the parser state (last_char) and its input stream. */
#define __KSEQ_TYPE(type_t) \
    typedef struct { \
        kstring_t name, comment, seq, qual; \
        int last_char; \
        kstream_t *f; \
    } kseq_t;

/* Instantiate the stream (16384-byte buffer) and the parser for one handle type. */
#define KSEQ_INIT2(SCOPE, type_t, __read) \
    KSTREAM_INIT2(SCOPE, type_t, __read, 16384) \
    __KSEQ_TYPE(type_t) \
    __KSEQ_BASIC(SCOPE, type_t) \
    __KSEQ_READ(SCOPE)

#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)

/* Declarations only, for translation units that use a parser defined elsewhere. */
#define KSEQ_DECLARE(type_t) \
    __KS_TYPE(type_t) \
    __KSEQ_TYPE(type_t) \
    extern kseq_t *kseq_init(type_t fd); \
    void kseq_destroy(kseq_t *ks); \
    int kseq_read(kseq_t *seq);

#endif
minimap-0.2/ksort.h000066400000000000000000000124161263116742000143260ustar00rootroot00000000000000/* The MIT License

   Copyright (c) 2008, 2011 Attractive Chaos

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ // This is a simplified version of ksort.h #ifndef AC_KSORT_H #define AC_KSORT_H #include #include typedef struct { void *left, *right; int depth; } ks_isort_stack_t; #define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; } #define KSORT_INIT(name, type_t, __sort_lt) \ size_t ks_lis_##name(size_t n, const type_t *a, size_t *b, size_t *_p) \ { /* translated from: http://www.algorithmist.com/index.php/Longest_Increasing_Subsequence.cpp */ \ size_t i, u, v, *top = b, *p; \ if (n == 0) return 0; \ p = _p? _p : (size_t*)malloc(n * sizeof(size_t)); \ *top++ = 0; \ for (i = 1; i < n; i++) { \ if (__sort_lt(a[*(top-1)], a[i])) { \ p[i] = *(top-1); \ *top++ = i; \ continue; \ } \ for (u = 0, v = top - b - 1; u < v;) { \ size_t c = (u + v) >> 1; \ if (__sort_lt(a[b[c]], a[i])) u = c + 1; \ else v = c; \ } \ if (__sort_lt(a[i], a[b[u]])) { \ if (u > 0) p[i] = b[u-1]; \ b[u] = i; \ } \ } \ for (u = top - b, v = *(top-1); u--; v = p[v]) b[u] = v; \ if (!_p) free(p); \ return top - b; \ } \ type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \ { \ type_t *low, *high, *k, *ll, *hh, *mid; \ low = arr; high = arr + n - 1; k = arr + kk; \ for (;;) { \ if (high <= low) return *k; \ if (high == low + 1) { \ if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ return *k; \ } \ mid = low + (high - low) / 2; \ if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \ if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \ 
KSORT_SWAP(type_t, *mid, *(low+1)); \ ll = low + 1; hh = high; \ for (;;) { \ do ++ll; while (__sort_lt(*ll, *low)); \ do --hh; while (__sort_lt(*low, *hh)); \ if (hh < ll) break; \ KSORT_SWAP(type_t, *ll, *hh); \ } \ KSORT_SWAP(type_t, *low, *hh); \ if (hh <= k) low = ll; \ if (hh >= k) high = hh - 1; \ } \ } \ #define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k) #define ks_lt_generic(a, b) ((a) < (b)) #define ks_lt_str(a, b) (strcmp((a), (b)) < 0) typedef const char *ksstr_t; #define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic) #define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) #define RS_MIN_SIZE 64 #define KRADIX_SORT_INIT(name, rstype_t, rskey, sizeof_key) \ typedef struct { \ rstype_t *b, *e; \ } rsbucket_##name##_t; \ void rs_insertsort_##name(rstype_t *beg, rstype_t *end) \ { \ rstype_t *i; \ for (i = beg + 1; i < end; ++i) \ if (rskey(*i) < rskey(*(i - 1))) { \ rstype_t *j, tmp = *i; \ for (j = i; j > beg && rskey(tmp) < rskey(*(j-1)); --j) \ *j = *(j - 1); \ *j = tmp; \ } \ } \ void rs_sort_##name(rstype_t *beg, rstype_t *end, int n_bits, int s) \ { \ rstype_t *i; \ int size = 1<b = k->e = beg; \ for (i = beg; i != end; ++i) ++b[rskey(*i)>>s&m].e; \ for (k = b + 1; k != be; ++k) \ k->e += (k-1)->e - beg, k->b = (k-1)->e; \ for (k = b; k != be;) { \ if (k->b != k->e) { \ rsbucket_##name##_t *l; \ if ((l = b + (rskey(*k->b)>>s&m)) != k) { \ rstype_t tmp = *k->b, swap; \ do { \ swap = tmp; tmp = *l->b; *l->b++ = swap; \ l = b + (rskey(tmp)>>s&m); \ } while (l != k); \ *k->b++ = tmp; \ } else ++k->b; \ } else ++k; \ } \ for (b->b = beg, k = b + 1; k != be; ++k) k->b = (k-1)->e; \ if (s) { \ s = s > n_bits? 
s - n_bits : 0; \ for (k = b; k != be; ++k) \ if (k->e - k->b > RS_MIN_SIZE) rs_sort_##name(k->b, k->e, n_bits, s); \ else if (k->e - k->b > 1) rs_insertsort_##name(k->b, k->e); \ } \ } \ void radix_sort_##name(rstype_t *beg, rstype_t *end) \ { \ if (end - beg <= RS_MIN_SIZE) rs_insertsort_##name(beg, end); \ else rs_sort_##name(beg, end, 8, sizeof_key * 8 - 8); \ } #endif minimap-0.2/kthread.c000066400000000000000000000072141263116742000146010ustar00rootroot00000000000000#include #include #include /************ * kt_for() * ************/ struct kt_for_t; typedef struct { struct kt_for_t *t; long i; } ktf_worker_t; typedef struct kt_for_t { int n_threads; long n; ktf_worker_t *w; void (*func)(void*,long,int); void *data; } kt_for_t; static inline long steal_work(kt_for_t *t) { int i, min_i = -1; long k, min = LONG_MAX; for (i = 0; i < t->n_threads; ++i) if (min > t->w[i].i) min = t->w[i].i, min_i = i; k = __sync_fetch_and_add(&t->w[min_i].i, t->n_threads); return k >= t->n? -1 : k; } static void *ktf_worker(void *data) { ktf_worker_t *w = (ktf_worker_t*)data; long i; for (;;) { i = __sync_fetch_and_add(&w->i, w->t->n_threads); if (i >= w->t->n) break; w->t->func(w->t->data, i, w - w->t->w); } while ((i = steal_work(w->t)) >= 0) w->t->func(w->t->data, i, w - w->t->w); pthread_exit(0); } void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n) { int i; kt_for_t t; pthread_t *tid; t.func = func, t.data = data, t.n_threads = n_threads, t.n = n; t.w = (ktf_worker_t*)alloca(n_threads * sizeof(ktf_worker_t)); tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t)); for (i = 0; i < n_threads; ++i) t.w[i].t = &t, t.w[i].i = i; for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktf_worker, &t.w[i]); for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0); } /***************** * kt_pipeline() * *****************/ struct ktp_t; typedef struct { struct ktp_t *pl; int64_t index; int step; void *data; } ktp_worker_t; typedef struct ktp_t { void 
*shared; void *(*func)(void*, int, void*); int64_t index; int n_workers, n_steps; ktp_worker_t *workers; pthread_mutex_t mutex; pthread_cond_t cv; } ktp_t; static void *ktp_worker(void *data) { ktp_worker_t *w = (ktp_worker_t*)data; ktp_t *p = w->pl; while (w->step < p->n_steps) { // test whether we can kick off the job with this worker pthread_mutex_lock(&p->mutex); for (;;) { int i; // test whether another worker is doing the same step for (i = 0; i < p->n_workers; ++i) { if (w == &p->workers[i]) continue; // ignore itself if (p->workers[i].step <= w->step && p->workers[i].index < w->index) break; } if (i == p->n_workers) break; // no workers with smaller indices are doing w->step or the previous steps pthread_cond_wait(&p->cv, &p->mutex); } pthread_mutex_unlock(&p->mutex); // working on w->step w->data = p->func(p->shared, w->step, w->step? w->data : 0); // for the first step, input is NULL // update step and let other workers know pthread_mutex_lock(&p->mutex); w->step = w->step == p->n_steps - 1 || w->data? 
(w->step + 1) % p->n_steps : p->n_steps; if (w->step == 0) w->index = p->index++; pthread_cond_broadcast(&p->cv); pthread_mutex_unlock(&p->mutex); } pthread_exit(0); } void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps) { ktp_t aux; pthread_t *tid; int i; if (n_threads < 1) n_threads = 1; aux.n_workers = n_threads; aux.n_steps = n_steps; aux.func = func; aux.shared = shared_data; aux.index = 0; pthread_mutex_init(&aux.mutex, 0); pthread_cond_init(&aux.cv, 0); aux.workers = (ktp_worker_t*)alloca(n_threads * sizeof(ktp_worker_t)); for (i = 0; i < n_threads; ++i) { ktp_worker_t *w = &aux.workers[i]; w->step = 0; w->pl = &aux; w->data = 0; w->index = aux.index++; } tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t)); for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktp_worker, &aux.workers[i]); for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0); pthread_mutex_destroy(&aux.mutex); pthread_cond_destroy(&aux.cv); } minimap-0.2/kvec.h000066400000000000000000000063311263116742000141130ustar00rootroot00000000000000/* The MIT License Copyright (c) 2008, by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* An example: #include "kvec.h" int main() { kvec_t(int) array; kv_init(array); kv_push(int, array, 10); // append kv_a(int, array, 20) = 5; // dynamic kv_A(array, 20) = 4; // static kv_destroy(array); return 0; } */ /* 2008-09-22 (0.1.0): * The initial version. */ #ifndef AC_KVEC_H #define AC_KVEC_H #include #define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #define kvec_t(type) struct { size_t n, m; type *a; } #define kv_init(v) ((v).n = (v).m = 0, (v).a = 0) #define kv_destroy(v) free((v).a) #define kv_A(v, i) ((v).a[(i)]) #define kv_pop(v) ((v).a[--(v).n]) #define kv_size(v) ((v).n) #define kv_max(v) ((v).m) #define kv_resize(type, v, s) do { \ if ((v).m < (s)) { \ (v).m = (s); \ kv_roundup32((v).m); \ (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ } \ } while (0) #define kv_copy(type, v1, v0) do { \ if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \ (v1).n = (v0).n; \ memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \ } while (0) \ #define kv_push(type, v, x) do { \ if ((v).n == (v).m) { \ (v).m = (v).m? (v).m<<1 : 2; \ (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ } \ (v).a[(v).n++] = (x); \ } while (0) #define kv_pushp(type, v, p) do { \ if ((v).n == (v).m) { \ (v).m = (v).m? (v).m<<1 : 2; \ (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ } \ *(p) = &(v).a[(v).n++]; \ } while (0) #define kv_a(type, v, i) ((v).m <= (size_t)(i)? \ ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \ (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ : (v).n <= (size_t)(i)? 
(v).n = (i) \ : 0), (v).a[(i)] #define kv_reverse(type, v, start) do { \ if ((v).m > 0 && (v).n > (start)) { \ size_t __i, __end = (v).n - (start); \ type *__a = (v).a + (start); \ for (__i = 0; __i < __end>>1; ++__i) { \ type __t = __a[__end - 1 - __i]; \ __a[__end - 1 - __i] = __a[__i]; __a[__i] = __t; \ } \ } \ } while (0) #endif minimap-0.2/main.c000066400000000000000000000136111263116742000141010ustar00rootroot00000000000000#include #include #include #include #include #include #include "minimap.h" #define MM_VERSION "0.2-r123" void liftrlimit() { #ifdef __linux__ struct rlimit r; getrlimit(RLIMIT_AS, &r); r.rlim_cur = r.rlim_max; setrlimit(RLIMIT_AS, &r); #endif } int main(int argc, char *argv[]) { mm_mapopt_t opt; int i, c, k = 15, w = -1, b = MM_IDX_DEF_B, n_threads = 3, keep_name = 1, is_idx = 0; int tbatch_size = 100000000; uint64_t ibatch_size = 4000000000ULL; float f = 0.001; bseq_file_t *fp = 0; char *fnw = 0; FILE *fpr = 0, *fpw = 0; liftrlimit(); mm_realtime0 = realtime(); mm_mapopt_init(&opt); while ((c = getopt(argc, argv, "w:k:B:b:t:r:c:f:Vv:NOg:I:d:lRPST:m:L:Dx:")) >= 0) { if (c == 'w') w = atoi(optarg); else if (c == 'k') k = atoi(optarg); else if (c == 'b') b = atoi(optarg); else if (c == 'r') opt.radius = atoi(optarg); else if (c == 'c') opt.min_cnt = atoi(optarg); else if (c == 'm') opt.merge_frac = atof(optarg); else if (c == 'f') f = atof(optarg); else if (c == 't') n_threads = atoi(optarg); else if (c == 'v') mm_verbose = atoi(optarg); else if (c == 'g') opt.max_gap = atoi(optarg); else if (c == 'N') keep_name = 0; else if (c == 'd') fnw = optarg; else if (c == 'l') is_idx = 1; else if (c == 'R') opt.flag |= MM_F_WITH_REP; else if (c == 'P') opt.flag &= ~MM_F_WITH_REP; else if (c == 'D') opt.flag |= MM_F_NO_SELF; else if (c == 'O') opt.flag |= MM_F_NO_ISO; else if (c == 'S') opt.flag |= MM_F_AVA | MM_F_NO_SELF; else if (c == 'T') opt.sdust_thres = atoi(optarg); else if (c == 'L') opt.min_match = atoi(optarg); else if (c == 'V') { 
puts(MM_VERSION); return 0; } else if (c == 'B' || c == 'I') { double x; char *p; x = strtod(optarg, &p); if (*p == 'G' || *p == 'g') x *= 1e9; else if (*p == 'M' || *p == 'm') x *= 1e6; else if (*p == 'K' || *p == 'k') x *= 1e3; if (c == 'B') tbatch_size = (uint64_t)(x + .499); else ibatch_size = (uint64_t)(x + .499); } else if (c == 'x') { if (strcmp(optarg, "ava10k") == 0) { opt.flag |= MM_F_AVA | MM_F_NO_SELF; opt.min_match = 100; opt.merge_frac = 0.0; w = 5; } } } if (w < 0) w = (int)(.6666667 * k + .499); if (argc == optind) { fprintf(stderr, "Usage: minimap [options] [query.fa] [...]\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " Indexing:\n"); fprintf(stderr, " -k INT k-mer size [%d]\n", k); fprintf(stderr, " -w INT minizer window size [{-k}*2/3]\n"); fprintf(stderr, " -I NUM split index for every ~NUM input bases [4G]\n"); fprintf(stderr, " -d FILE dump index to FILE []\n"); fprintf(stderr, " -l the 1st argument is a index file (overriding -k, -w and -I)\n"); // fprintf(stderr, " -b INT bucket bits [%d]\n", b); // most users would care about this fprintf(stderr, " Mapping:\n"); fprintf(stderr, " -f FLOAT filter out top FLOAT fraction of repetitive minimizers [%.3f]\n", f); fprintf(stderr, " -r INT bandwidth [%d]\n", opt.radius); fprintf(stderr, " -m FLOAT merge two chains if FLOAT fraction of minimizers are shared [%.2f]\n", opt.merge_frac); fprintf(stderr, " -c INT retain a mapping if it consists of >=INT minimizers [%d]\n", opt.min_cnt); fprintf(stderr, " -L INT min matching length [%d]\n", opt.min_match); fprintf(stderr, " -g INT split a mapping if there is a gap longer than INT [%d]\n", opt.max_gap); fprintf(stderr, " -T INT SDUST threshold; 0 to disable SDUST [%d]\n", opt.sdust_thres); // fprintf(stderr, " -D skip self mappings but keep dual mappings\n"); // too confusing to expose to end users fprintf(stderr, " -S skip self and dual mappings\n"); fprintf(stderr, " -O drop isolated hits before chaining (EXPERIMENTAL)\n"); fprintf(stderr, " -P 
filtering potential repeats after mapping (EXPERIMENTAL)\n"); // fprintf(stderr, " -R skip post-mapping repeat filtering\n"); // deprecated option for backward compatibility fprintf(stderr, " -x STR preset (recommended to be applied before other options) []\n"); fprintf(stderr, " ava10k: -Sw5 -L100 -m0 (PacBio/ONT all-vs-all read mapping)\n"); fprintf(stderr, " Input/Output:\n"); fprintf(stderr, " -t INT number of threads [%d]\n", n_threads); // fprintf(stderr, " -B NUM process ~NUM bp in each batch [100M]\n"); // fprintf(stderr, " -v INT verbose level [%d]\n", mm_verbose); // fprintf(stderr, " -N use integer as target names\n"); fprintf(stderr, " -V show version number\n"); fprintf(stderr, "\nSee minimap.1 for detailed description of the command-line options.\n"); return 1; } if (is_idx) fpr = fopen(argv[optind], "rb"); else fp = bseq_open(argv[optind]); if (fnw) fpw = fopen(fnw, "wb"); for (;;) { mm_idx_t *mi = 0; if (fpr) mi = mm_idx_load(fpr); else if (!bseq_eof(fp)) mi = mm_idx_gen(fp, w, k, b, tbatch_size, n_threads, ibatch_size, keep_name); if (mi == 0) break; if (mm_verbose >= 3) fprintf(stderr, "[M::%s::%.3f*%.2f] loaded/built the index for %d target sequence(s)\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), mi->n); mm_idx_set_max_occ(mi, f); if (mm_verbose >= 3) fprintf(stderr, "[M::%s] max occurrences of a minimizer to consider: %d\n", __func__, mi->max_occ); if (fpw) mm_idx_dump(fpw, mi); for (i = optind + 1; i < argc; ++i) mm_map_file(mi, argv[i], &opt, n_threads, tbatch_size); mm_idx_destroy(mi); } if (fpw) fclose(fpw); if (fpr) fclose(fpr); if (fp) bseq_close(fp); fprintf(stderr, "[M::%s] Version: %s\n", __func__, MM_VERSION); fprintf(stderr, "[M::%s] CMD:", __func__); for (i = 0; i < argc; ++i) fprintf(stderr, " %s", argv[i]); fprintf(stderr, "\n[M::%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, realtime() - mm_realtime0, cputime()); return 0; } 
minimap-0.2/map.c000066400000000000000000000300271263116742000137320ustar00rootroot00000000000000#include #include #include #include "bseq.h" #include "kvec.h" #include "minimap.h" #include "sdust.h" void mm_mapopt_init(mm_mapopt_t *opt) { opt->radius = 500; opt->max_gap = 10000; opt->min_cnt = 4; opt->min_match = 40; opt->sdust_thres = 0; opt->flag = MM_F_WITH_REP; opt->merge_frac = .5; } /**************************** * Find approxiate mappings * ****************************/ struct mm_tbuf_s { // per-thread buffer mm128_v mini; // query minimizers mm128_v coef; // Hough transform coefficient mm128_v intv; // intervals on sorted coef uint32_v reg2mini; uint32_v rep_aux; sdust_buf_t *sdb; // the following are for computing LIS uint32_t n, m; uint64_t *a; size_t *b, *p; // final output kvec_t(mm_reg1_t) reg; }; mm_tbuf_t *mm_tbuf_init() { mm_tbuf_t *b; b = (mm_tbuf_t*)calloc(1, sizeof(mm_tbuf_t)); b->sdb = sdust_buf_init(); return b; } void mm_tbuf_destroy(mm_tbuf_t *b) { if (b == 0) return; free(b->mini.a); free(b->coef.a); free(b->intv.a); free(b->reg.a); free(b->reg2mini.a); free(b->rep_aux.a); free(b->a); free(b->b); free(b->p); sdust_buf_destroy(b->sdb); free(b); } #include "ksort.h" #define sort_key_64(a) (a) KRADIX_SORT_INIT(64, uint64_t, sort_key_64, 8) #define lt_low32(a, b) ((uint32_t)(a) < (uint32_t)(b)) KSORT_INIT(low32lt, uint64_t, lt_low32) #define gt_low32(a, b) ((uint32_t)(a) > (uint32_t)(b)) KSORT_INIT(low32gt, uint64_t, gt_low32) /* TODO: drop_rep() is not robust. For all-vs-all mapping but without the -S * flag, all minimizers have at least one hit. The _thres_ computed below will * be highly skewed. Some improvements need to be made. 
*/ static void drop_rep(mm_tbuf_t *b, int min_cnt) { int i, j, n, m; uint32_t thres; b->rep_aux.n = 0; for (i = 0; i < b->mini.n; ++i) if (b->mini.a[i].y>>32) kv_push(uint32_t, b->rep_aux, b->mini.a[i].y>>32); if (b->rep_aux.n < 3) return; thres = (uint32_t)(ks_ksmall_uint32_t(b->rep_aux.n, b->rep_aux.a, b->rep_aux.n>>1) * MM_DEREP_Q50 + .499); for (i = n = m = 0; i < b->reg.n; ++i) { int cnt = 0, all_cnt = b->reg.a[i].cnt; for (j = 0; j < all_cnt; ++j) if (b->mini.a[b->reg2mini.a[m + j]].y>>32 <= thres) ++cnt; if (cnt >= min_cnt) b->reg.a[n++] = b->reg.a[i]; m += all_cnt; } // printf("%ld=>%d\t%d\n", b->reg.n, n, thres); b->reg.n = n; } static void proc_intv(mm_tbuf_t *b, int which, int k, int min_cnt, int max_gap) { int i, j, l_lis, rid = -1, rev = 0, start = b->intv.a[which].y, end = start + b->intv.a[which].x; // make room for arrays needed by LIS (longest increasing sequence) if (end - start > b->m) { b->m = end - start; kv_roundup32(b->m); b->a = (uint64_t*)realloc(b->a, b->m * 8); b->b = (size_t*)realloc(b->b, b->m * sizeof(size_t)); b->p = (size_t*)realloc(b->p, b->m * sizeof(size_t)); } // prepare the input array _a_ for LIS b->n = 0; for (i = start; i < end; ++i) if (b->coef.a[i].x != UINT64_MAX) b->a[b->n++] = b->coef.a[i].y, rid = b->coef.a[i].x << 1 >> 33, rev = b->coef.a[i].x >> 63; if (b->n < min_cnt) return; radix_sort_64(b->a, b->a + b->n); // find the longest increasing sequence l_lis = rev? ks_lis_low32gt(b->n, b->a, b->b, b->p) : ks_lis_low32lt(b->n, b->a, b->b, b->p); // LIS if (l_lis < min_cnt) return; for (i = 1, j = 1; i < l_lis; ++i) // squeeze out minimizaers reused in the LIS sequence if (b->a[b->b[i]]>>32 != b->a[b->b[i-1]]>>32) b->a[b->b[j++]] = b->a[b->b[i]]; l_lis = j; if (l_lis < min_cnt) return; // convert LISes to regions; possibly break an LIS at a long gaps for (i = 1, start = 0; i <= l_lis; ++i) { int32_t qgap = i == l_lis? 
0 : ((uint32_t)b->mini.a[b->a[b->b[i]]>>32].y>>1) - ((uint32_t)b->mini.a[b->a[b->b[i-1]]>>32].y>>1); if (i == l_lis || (qgap > max_gap && abs((int32_t)b->a[b->b[i]] - (int32_t)b->a[b->b[i-1]]) > max_gap)) { if (i - start >= min_cnt) { uint32_t lq = 0, lr = 0, eq = 0, er = 0, sq = 0, sr = 0; mm_reg1_t *r; kv_pushp(mm_reg1_t, b->reg, &r); r->rid = rid, r->rev = rev, r->cnt = i - start, r->rep = 0; r->qs = ((uint32_t)b->mini.a[b->a[b->b[start]]>>32].y>>1) - (k - 1); r->qe = ((uint32_t)b->mini.a[b->a[b->b[i-1]]>>32].y>>1) + 1; r->rs = rev? (uint32_t)b->a[b->b[i-1]] : (uint32_t)b->a[b->b[start]]; r->re = rev? (uint32_t)b->a[b->b[start]] : (uint32_t)b->a[b->b[i-1]]; r->rs -= k - 1; r->re += 1; for (j = start; j < i; ++j) { // count the number of times each minimizer is used int jj = b->a[b->b[j]]>>32; b->mini.a[jj].y += 1ULL<<32; kv_push(uint32_t, b->reg2mini, jj); // keep minimizer<=>reg mapping for derep } for (j = start; j < i; ++j) { // compute ->len uint32_t q = ((uint32_t)b->mini.a[b->a[b->b[j]]>>32].y>>1) - (k - 1); uint32_t r = (uint32_t)b->a[b->b[j]]; r = !rev? r - (k - 1) : (0x80000000U - r); if (r > er) lr += er - sr, sr = r, er = sr + k; else er = r + k; if (q > eq) lq += eq - sq, sq = q, eq = sq + k; else eq = q + k; } lr += er - sr, lq += eq - sq; r->len = lr < lq? lr : lq; } start = i; } } } // merge or add a Hough interval; only used by get_reg() static inline void push_intv(mm128_v *intv, int start, int end, float merge_frac) { mm128_t *p; if (intv->n > 0) { // test overlap int last_start, last_end, min; p = &intv->a[intv->n-1]; last_start = p->y, last_end = p->x + last_start; min = end - start < last_end - last_start? 
end - start : last_end - last_start; if (last_end > start && last_end - start > min * merge_frac) { // large overlap; then merge p->x = end - last_start; return; } } kv_pushp(mm128_t, *intv, &p); // a new interval p->x = end - start, p->y = start; } // find mapping regions from a list of minimizer hits static void get_reg(mm_tbuf_t *b, int radius, int k, int min_cnt, int max_gap, float merge_frac, int flag) { const uint64_t v_kept = ~(1ULL<<31), v_dropped = 1ULL<<31; mm128_v *c = &b->coef; int i, j, start = 0, iso_dist = radius * 2; if (c->n < min_cnt) return; // drop isolated minimizer hits if (flag&MM_F_NO_ISO) { for (i = 0; i < c->n; ++i) c->a[i].y |= v_dropped; for (i = 1; i < c->n; ++i) { uint64_t x = c->a[i].x; int32_t rpos = (uint32_t)c->a[i].y; for (j = i - 1; j >= 0 && x - c->a[j].x < radius; --j) { int32_t y = c->a[j].y; if (abs(y - rpos) < iso_dist) { c->a[i].y &= v_kept, c->a[j].y &= v_kept; break; } } } for (i = j = 0; i < c->n; ++i) // squeeze out hits still marked as v_dropped if ((c->a[i].y&v_dropped) == 0) c->a[j++] = c->a[i]; c->n = j; } // identify (possibly overlapping) intervals within _radius_; an interval is a cluster of hits b->intv.n = 0; for (i = 1; i < c->n; ++i) { if (c->a[i].x - c->a[start].x > radius) { if (i - start >= min_cnt) push_intv(&b->intv, start, i, merge_frac); for (++start; start < i && c->a[i].x - c->a[start].x > radius; ++start); } } if (i - start >= min_cnt) push_intv(&b->intv, start, i, merge_frac); // sort by the size of the interval radix_sort_128x(b->intv.a, b->intv.a + b->intv.n); // generate hits, starting from the largest interval b->reg2mini.n = 0; for (i = b->intv.n - 1; i >= 0; --i) proc_intv(b, i, k, min_cnt, max_gap); // post repeat removal if (!(flag&MM_F_WITH_REP)) drop_rep(b, min_cnt); } const mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name) { int j, n_dreg = 0, u = 0; const uint64_t *dreg = 0; b->mini.n = b->coef.n = 0; 
mm_sketch(seq, l_seq, mi->w, mi->k, 0, &b->mini); if (opt->sdust_thres > 0) dreg = sdust_core((const uint8_t*)seq, l_seq, opt->sdust_thres, 64, &n_dreg, b->sdb); for (j = 0; j < b->mini.n; ++j) { int k, n; const uint64_t *r; int32_t qpos = (uint32_t)b->mini.a[j].y>>1, strand = b->mini.a[j].y&1; b->mini.a[j].y = b->mini.a[j].y<<32>>32; // clear the rid field if (dreg && n_dreg) { // test complexity int s = qpos - (mi->k - 1), e = s + mi->k; while (u < n_dreg && (uint32_t)dreg[u] <= s) ++u; if (u < n_dreg && dreg[u]>>32 < e) { int v, l = 0; for (v = u; v < n_dreg && dreg[v]>>32 < e; ++v) { // iterate over LCRs overlapping this minimizer int ss = s > dreg[v]>>32? s : dreg[v]>>32; int ee = e < (uint32_t)dreg[v]? e : (uint32_t)dreg[v]; l += ee - ss; } if (l > mi->k>>1) continue; } } r = mm_idx_get(mi, b->mini.a[j].x, &n); if (n > mi->max_occ) continue; for (k = 0; k < n; ++k) { int32_t rpos = (uint32_t)r[k] >> 1; mm128_t *p; if (name && (opt->flag&MM_F_NO_SELF) && mi->name && strcmp(name, mi->name[r[k]>>32]) == 0 && rpos == qpos) continue; if (name && (opt->flag&MM_F_AVA) && mi->name && strcmp(name, mi->name[r[k]>>32]) > 0) continue; kv_pushp(mm128_t, b->coef, &p); if ((r[k]&1) == strand) { // forward strand p->x = (uint64_t)r[k] >> 32 << 32 | (0x80000000U + rpos - qpos); p->y = (uint64_t)j << 32 | rpos; } else { // reverse strand p->x = (uint64_t)r[k] >> 32 << 32 | (rpos + qpos) | 1ULL<<63; p->y = (uint64_t)j << 32 | rpos; } } } radix_sort_128x(b->coef.a, b->coef.a + b->coef.n); b->reg.n = 0; get_reg(b, opt->radius, mi->k, opt->min_cnt, opt->max_gap, opt->merge_frac, opt->flag); *n_regs = b->reg.n; return b->reg.a; } /************************** * Multi-threaded mapping * **************************/ void kt_for(int n_threads, void (*func)(void*,long,int), void *data, long n); void kt_pipeline(int n_threads, void *(*func)(void*, int, void*), void *shared_data, int n_steps); typedef struct { int batch_size, n_processed, n_threads; const mm_mapopt_t *opt; bseq_file_t *fp; 
const mm_idx_t *mi; } pipeline_t; typedef struct { const pipeline_t *p; int n_seq; bseq1_t *seq; int *n_reg; mm_reg1_t **reg; mm_tbuf_t **buf; } step_t; static void worker_for(void *_data, long i, int tid) // kt_for() callback { step_t *step = (step_t*)_data; const mm_reg1_t *regs; int n_regs; regs = mm_map(step->p->mi, step->seq[i].l_seq, step->seq[i].seq, &n_regs, step->buf[tid], step->p->opt, step->seq[i].name); step->n_reg[i] = n_regs; if (n_regs > 0) { step->reg[i] = (mm_reg1_t*)malloc(n_regs * sizeof(mm_reg1_t)); memcpy(step->reg[i], regs, n_regs * sizeof(mm_reg1_t)); } } static void *worker_pipeline(void *shared, int step, void *in) { int i, j; pipeline_t *p = (pipeline_t*)shared; if (step == 0) { // step 0: read sequences step_t *s; s = (step_t*)calloc(1, sizeof(step_t)); s->seq = bseq_read(p->fp, p->batch_size, &s->n_seq); if (s->seq) { s->p = p; for (i = 0; i < s->n_seq; ++i) s->seq[i].rid = p->n_processed++; s->buf = (mm_tbuf_t**)calloc(p->n_threads, sizeof(mm_tbuf_t*)); for (i = 0; i < p->n_threads; ++i) s->buf[i] = mm_tbuf_init(); s->n_reg = (int*)calloc(s->n_seq, sizeof(int)); s->reg = (mm_reg1_t**)calloc(s->n_seq, sizeof(mm_reg1_t**)); return s; } else free(s); } else if (step == 1) { // step 1: map kt_for(p->n_threads, worker_for, in, ((step_t*)in)->n_seq); return in; } else if (step == 2) { // step 2: output step_t *s = (step_t*)in; const mm_idx_t *mi = p->mi; for (i = 0; i < p->n_threads; ++i) mm_tbuf_destroy(s->buf[i]); free(s->buf); for (i = 0; i < s->n_seq; ++i) { bseq1_t *t = &s->seq[i]; for (j = 0; j < s->n_reg[i]; ++j) { mm_reg1_t *r = &s->reg[i][j]; if (r->len < p->opt->min_match) continue; printf("%s\t%d\t%d\t%d\t%c\t", t->name, t->l_seq, r->qs, r->qe, "+-"[r->rev]); if (mi->name) fputs(mi->name[r->rid], stdout); else printf("%d", r->rid + 1); printf("\t%d\t%d\t%d\t%d\t%d\t255\tcm:i:%d\n", mi->len[r->rid], r->rs, r->re, r->len, r->re - r->rs > r->qe - r->qs? 
r->re - r->rs : r->qe - r->qs, r->cnt); } free(s->reg[i]); free(s->seq[i].seq); free(s->seq[i].name); } free(s->reg); free(s->n_reg); free(s->seq); free(s); } return 0; } int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads, int tbatch_size) { pipeline_t pl; memset(&pl, 0, sizeof(pipeline_t)); pl.fp = bseq_open(fn); if (pl.fp == 0) return -1; pl.opt = opt, pl.mi = idx; pl.n_threads = n_threads, pl.batch_size = tbatch_size; kt_pipeline(n_threads == 1? 1 : 2, worker_pipeline, &pl, 3); bseq_close(pl.fp); return 0; } minimap-0.2/minimap.1000066400000000000000000000125311263116742000145250ustar00rootroot00000000000000.TH minimap 1 "06 December 2015" "minimap-0.2" "Bioinformatics tools" .SH NAME .PP minimap - fast mapping between long DNA sequences .SH SYNOPSIS .PP minimap .RB [ -lSOV ] .RB [ -k .IR kmer ] .RB [ -w .IR winSize ] .RB [ -I .IR batchSize ] .RB [ -d .IR dumpFile ] .RB [ -f .IR occThres ] .RB [ -r .IR bandWidth ] .RB [ -m .IR minShared ] .RB [ -c .IR minCount ] .RB [ -L .IR minMatch ] .RB [ -g .IR maxGap ] .RB [ -T .IR dustThres ] .RB [ -t .IR nThreads ] .RB [ -x .IR preset ] .I target.fa .I query.fa > .I output.paf .SH DESCRIPTION .PP Minimap is a tool to efficiently find multiple approximate mapping positions between two sets of long sequences, such as between reads and reference genomes, between genomes and between long noisy reads. Minimap has an indexing and a mapping phase. In the indexing phase, it collects all minimizers of a large batch of target sequences in a hash table; in the mapping phase, it identifies good clusters of colinear minimizer hits. Minimap does not generate detailed alignments between the target and the query sequences. It only outputs the approximate start and the end coordinates of these clusters. .SH OPTIONS .SS Indexing options .TP 10 .BI -k \ INT Minimizer k-mer length [15] .TP .BI -w \ INT Minimizer window size [2/3 of k-mer length]. 
A minimizer is the smallest k-mer in a window of w consecutive k-mers. .TP .BI -I \ NUM Load at most .I NUM target bases into RAM for indexing [4G]. If there are more than .I NUM bases in .IR target.fa , minimap needs to read .I query.fa multiple times to map it against each batch of target sequences. .I NUM may be ending with k/K/m/M/g/G. .TP .BI -d \ FILE Dump minimizer index to .I FILE [no dump] .TP .B -l Indicate that .I target.fa is in fact a minimizer index generated by option .BR -d , not a FASTA or FASTQ file. .SS Mapping options .TP 10 .BI -f \ FLOAT Ignore top .I FLOAT fraction of most occurring minimizers [0.001] .TP .BI -r \ INT Approximate bandwidth for initial minimizer hits clustering [500]. A .I minimizer hit is a minimizer present in both the target and query sequences. A .I minimizer hit cluster is a group of potentially colinear minimizer hits between a target and a query sequence. .TP .BI -m \ FLOAT Merge initial minimizer hit clusters if .I FLOAT or higher fraction of minimizers are shared between the clusters [0.5] .TP .BI -c \ INT Retain a minimizer hit cluster if it contains .I INT or more minimizer hits [4] .TP .BI -L \ INT Discard a minimizer hit cluster if after colinearization, the number of matching bases is below .I INT [40]. This option mainly reduces the size of output. It has little effect on the speed and peak memory. .TP .BI -g \ INT Split a minimizer hit cluster at a gap .IR INT -bp or longer that does not contain any minimizer hits [10000] .TP .BI -T \ INT Mask regions on query sequences with SDUST score threshold .IR INT ; 0 to disable [0]. SDUST is an algorithm to identify low-complexity subsequences. It is not enabled by default. If SDUST is preferred, a value between 20 and 25 is recommended. A higher threshold masks less sequences. .TP .B -S Perform all-vs-all mapping. 
In this mode, if the query sequence name is lexicographically larger than the target sequence name, the hits between them will be suppressed; if the query sequence name is the same as the target name, diagonal minimizer hits will also be suppressed. .TP .B -O Drop a minimizer hit if it is far away from other hits (EXPERIMENTAL). This option is useful for mapping long chromosomes from two diverged species. .TP .BI -x \ STR Changing multiple settings based on .I STR [not set]. It is recommended to apply this option before other options, such that the following options may override the multiple settings modified by this option. .RS .TP 8 .B ava10k for PacBio or Oxford Nanopore all-vs-all read mapping (-Sw5 -L100 -m0). .RE .SS Input/output options .TP 10 .BI -t \ INT Number of threads [3]. Minimap uses at most three threads when collecting minimizers on target sequences, and uses up to .IR INT +1 threads when mapping (the extra thread is for I/O, which is frequently idle and takes little CPU time). .TP .B -V Print version number to stdout .SH OUTPUT FORMAT .PP Minimap outputs mapping positions in the Pairwise mApping Format (PAF). PAF is a TAB-delimited text format with each line consisting of at least 12 fields as are described in the following table: .TS center box; cb | cb | cb r | c | l . 
Col Type Description _ 1 string Query sequence name 2 int Query sequence length 3 int Query start coordinate (0-based) 4 int Query end coordinate (0-based) 5 char `+' if query and target on the same strand; `-' if opposite 6 string Target sequence name 7 int Target sequence length 8 int Target start coordinate on the original strand 9 int Target end coordinate on the original strand 10 int Number of matching bases in the mapping 11 int Number bases, including gaps, in the mapping 12 int Mapping quality (0-255 with 255 for missing) .TE .PP When the alignment is available, column 11 gives the total number of sequence matches, mismatches and gaps in the alignment; column 10 divided by column 11 gives the alignment identity. As minimap does not generate detailed alignment, these two columns are approximate. PAF may optionally have additional fields in the SAM-like typed key-value format. Minimap writes the number of minimizer hits in a cluster to the cm tag. .SH SEE ALSO .PP miniasm(1) minimap-0.2/minimap.h000066400000000000000000000057551263116742000146260ustar00rootroot00000000000000#ifndef MINIMAP_H #define MINIMAP_H #include #include #include #include "bseq.h" #define MM_IDX_DEF_B 14 #define MM_DEREP_Q50 5.0 #define MM_F_WITH_REP 0x1 #define MM_F_NO_SELF 0x2 #define MM_F_NO_ISO 0x4 #define MM_F_AVA 0x8 typedef struct { uint64_t x, y; } mm128_t; typedef struct { size_t n, m; mm128_t *a; } mm128_v; typedef struct { size_t n, m; uint64_t *a; } uint64_v; typedef struct { size_t n, m; uint32_t *a; } uint32_v; typedef struct { mm128_v a; // (minimizer, position) array int32_t n; // size of the _p_ array uint64_t *p; // position array for minimizers appearing >1 times void *h; // hash table indexing _p_ and minimizers appearing once } mm_idx_bucket_t; typedef struct { int b, w, k; uint32_t n; // number of reference sequences mm_idx_bucket_t *B; uint32_t max_occ; float freq_thres; int32_t *len; // length of each reference sequence char **name; // TODO: if this uses too 
much RAM, switch one concatenated string } mm_idx_t; typedef struct { uint32_t cnt:31, rev:1; uint32_t rid:31, rep:1; uint32_t len; int32_t qs, qe, rs, re; } mm_reg1_t; typedef struct { int radius; // bandwidth to cluster hits int max_gap; // break a chain if there are no minimizers in a max_gap window int min_cnt; // minimum number of minimizers to start a chain int min_match; int sdust_thres; // score threshold for SDUST; 0 to disable int flag; // see MM_F_* macros float merge_frac; // merge two chains if merge_frac fraction of minimzers are shared between the chains } mm_mapopt_t; extern int mm_verbose; extern double mm_realtime0; struct mm_tbuf_s; typedef struct mm_tbuf_s mm_tbuf_t; #ifdef __cplusplus extern "C" { #endif // compute minimizers void mm_sketch(const char *str, int len, int w, int k, uint32_t rid, mm128_v *p); // minimizer indexing mm_idx_t *mm_idx_init(int w, int k, int b); void mm_idx_destroy(mm_idx_t *mi); mm_idx_t *mm_idx_gen(bseq_file_t *fp, int w, int k, int b, int tbatch_size, int n_threads, uint64_t ibatch_size, int keep_name); void mm_idx_set_max_occ(mm_idx_t *mi, float f); const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n); mm_idx_t *mm_idx_build(const char *fn, int w, int k, int n_threads); // minimizer index I/O void mm_idx_dump(FILE *fp, const mm_idx_t *mi); mm_idx_t *mm_idx_load(FILE *fp); // mapping void mm_mapopt_init(mm_mapopt_t *opt); mm_tbuf_t *mm_tbuf_init(void); void mm_tbuf_destroy(mm_tbuf_t *b); const mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name); int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads, int tbatch_size); // private functions (may be moved to a "mmpriv.h" in future) double cputime(void); double realtime(void); void radix_sort_128x(mm128_t *beg, mm128_t *end); void radix_sort_64(uint64_t *beg, uint64_t *end); uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t 
kk); #ifdef __cplusplus } #endif #endif minimap-0.2/misc.c000066400000000000000000000010351263116742000141050ustar00rootroot00000000000000#include #include #include "minimap.h" int mm_verbose = 3; double mm_realtime0; double cputime() { struct rusage r; getrusage(RUSAGE_SELF, &r); return r.ru_utime.tv_sec + r.ru_stime.tv_sec + 1e-6 * (r.ru_utime.tv_usec + r.ru_stime.tv_usec); } double realtime() { struct timeval tp; struct timezone tzp; gettimeofday(&tp, &tzp); return tp.tv_sec + tp.tv_usec * 1e-6; } #include "ksort.h" #define sort_key_128x(a) ((a).x) KRADIX_SORT_INIT(128x, mm128_t, sort_key_128x, 8) KSORT_INIT_GENERIC(uint32_t) minimap-0.2/sdust.c000066400000000000000000000143571263116742000143270ustar00rootroot00000000000000#include #include #include #include "kdq.h" #include "kvec.h" #include "sdust.h" #define SD_WLEN 3 #define SD_WTOT (1<<(SD_WLEN<<1)) #define SD_WMSK (SD_WTOT - 1) typedef struct { int start, finish; int r, l; } perf_intv_t; typedef kvec_t(perf_intv_t) perf_intv_v; typedef kvec_t(uint64_t) uint64_v; KDQ_INIT(int) #if defined(_NO_NT4_TBL) || defined(_SDUST_MAIN) unsigned char seq_nt4_table[256] = { 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }; #else extern unsigned char seq_nt4_table[256]; #endif struct sdust_buf_s { kdq_t(int) *w; 
perf_intv_v P; // the list of perfect intervals for the current window, sorted by descending start and then by ascending finish uint64_v res; // the result }; sdust_buf_t *sdust_buf_init(void) { sdust_buf_t *buf; buf = (sdust_buf_t*)calloc(1, sizeof(sdust_buf_t)); buf->w = kdq_init(int); return buf; } void sdust_buf_destroy(sdust_buf_t *buf) { if (buf == 0) return; kdq_destroy(int, buf->w); free(buf->P.a); free(buf->res.a); free(buf); } static inline void shift_window(int t, kdq_t(int) *w, int T, int W, int *L, int *rw, int *rv, int *cw, int *cv) { int s; if (kdq_size(w) >= W - SD_WLEN + 1) { // TODO: is this right for SD_WLEN!=3? s = *kdq_shift(int, w); *rw -= --cw[s]; if (*L > kdq_size(w)) --*L, *rv -= --cv[s]; } kdq_push(int, w, t); ++*L; *rw += cw[t]++; *rv += cv[t]++; if (cv[t] * 10 > T<<1) { do { s = kdq_at(w, kdq_size(w) - *L); *rv -= --cv[s]; --*L; } while (s != t); } } static inline void save_masked_regions(uint64_v *res, perf_intv_v *P, int start) { int i, saved = 0; perf_intv_t *p; if (P->n == 0 || P->a[P->n - 1].start >= start) return; p = &P->a[P->n - 1]; if (res->n) { int s = res->a[res->n - 1]>>32, f = (uint32_t)res->a[res->n - 1]; if (p->start <= f) // if overlapping with or adjacent to the previous interval saved = 1, res->a[res->n - 1] = (uint64_t)s<<32 | (f > p->finish? 
f : p->finish); } if (!saved) kv_push(uint64_t, *res, (uint64_t)p->start<<32|p->finish); for (i = P->n - 1; i >= 0 && P->a[i].start < start; --i); // remove perfect intervals that have falled out of the window P->n = i + 1; } static void find_perfect(perf_intv_v *P, const kdq_t(int) *w, int T, int start, int L, int rv, const int *cv) { int c[SD_WTOT], r = rv, i, max_r = 0, max_l = 0; memcpy(c, cv, SD_WTOT * sizeof(int)); for (i = (long)kdq_size(w) - L - 1; i >= 0; --i) { int j, t = kdq_at(w, i), new_r, new_l; r += c[t]++; new_r = r, new_l = kdq_size(w) - i - 1; if (new_r * 10 > T * new_l) { for (j = 0; j < P->n && P->a[j].start >= i + start; ++j) { // find insertion position perf_intv_t *p = &P->a[j]; if (max_r == 0 || p->r * max_l > max_r * p->l) max_r = p->r, max_l = p->l; } if (max_r == 0 || new_r * max_l >= max_r * new_l) { // then insert max_r = new_r, max_l = new_l; if (P->n == P->m) kv_resize(perf_intv_t, *P, P->n + 1); memmove(&P->a[j+1], &P->a[j], (P->n - j) * sizeof(perf_intv_t)); // make room ++P->n; P->a[j].start = i + start, P->a[j].finish = kdq_size(w) + (SD_WLEN - 1) + start; P->a[j].r = new_r, P->a[j].l = new_l; } } } } const uint64_t *sdust_core(const uint8_t *seq, int l_seq, int T, int W, int *n, sdust_buf_t *buf) { int rv = 0, rw = 0, L = 0, cv[SD_WTOT], cw[SD_WTOT]; int i, start, l; // _start_: start of the current window; _l_: length of a contiguous A/C/G/T (sub)sequence unsigned t; // current word buf->P.n = buf->res.n = 0; buf->w->front = buf->w->count = 0; memset(cv, 0, SD_WTOT * sizeof(int)); memset(cw, 0, SD_WTOT * sizeof(int)); if (l_seq < 0) l_seq = strlen((const char*)seq); for (i = l = t = 0; i <= l_seq; ++i) { int b = i < l_seq? seq_nt4_table[seq[i]] : 4; if (b < 4) { // an A/C/G/T base ++l, t = (t<<2 | b) & SD_WMSK; if (l >= SD_WLEN) { // we have seen a word start = (l - W > 0? 
l - W : 0) + (i + 1 - l); // set the start of the current window save_masked_regions(&buf->res, &buf->P, start); // save intervals falling out of the current window? shift_window(t, buf->w, T, W, &L, &rw, &rv, cw, cv); if (rw * 10 > L * T) find_perfect(&buf->P, buf->w, T, start, L, rv, cv); } } else { // N or the end of sequence; N effectively breaks input into pieces of independent sequences start = (l - W + 1 > 0? l - W + 1 : 0) + (i + 1 - l); while (buf->P.n) save_masked_regions(&buf->res, &buf->P, start++); // clear up unsaved perfect intervals l = t = 0; } } *n = buf->res.n; return buf->res.a; } uint64_t *sdust(const uint8_t *seq, int l_seq, int T, int W, int *n) { uint64_t *ret; sdust_buf_t *buf; buf = sdust_buf_init(); ret = (uint64_t*)sdust_core(seq, l_seq, T, W, n, buf); buf->res.a = 0; sdust_buf_destroy(buf); return ret; } #ifdef _SDUST_MAIN #include #include #include #include "kseq.h" KSEQ_INIT(gzFile, gzread) int main(int argc, char *argv[]) { gzFile fp; kseq_t *ks; int W = 64, T = 20, c; while ((c = getopt(argc, argv, "w:t:")) >= 0) { if (c == 'w') W = atoi(optarg); else if (c == 't') T = atoi(optarg); } if (optind == argc) { fprintf(stderr, "Usage: sdust [-w %d] [-t %d] \n", W, T); return 1; } fp = strcmp(argv[optind], "-")? 
gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); ks = kseq_init(fp); while (kseq_read(ks) >= 0) { uint64_t *r; int i, n; r = sdust((uint8_t*)ks->seq.s, -1, T, W, &n); for (i = 0; i < n; ++i) printf("%s\t%d\t%d\n", ks->name.s, (int)(r[i]>>32), (int)r[i]); free(r); } kseq_destroy(ks); gzclose(fp); return 0; } #endif minimap-0.2/sdust.h000066400000000000000000000010331263116742000143170ustar00rootroot00000000000000#ifndef SDUST_H #define SDUST_H struct sdust_buf_s; typedef struct sdust_buf_s sdust_buf_t; #ifdef __cplusplus extern "C" { #endif // the simple interface uint64_t *sdust(const uint8_t *seq, int l_seq, int T, int W, int *n); // the following interface dramatically reduce heap allocations when sdust is frequently called. sdust_buf_t *sdust_buf_init(void); void sdust_buf_destroy(sdust_buf_t *buf); const uint64_t *sdust_core(const uint8_t *seq, int l_seq, int T, int W, int *n, sdust_buf_t *buf); #ifdef __cplusplus } #endif #endif minimap-0.2/sketch.c000066400000000000000000000105401263116742000144340ustar00rootroot00000000000000#include #include #include #include #include "kvec.h" #include "minimap.h" unsigned char seq_nt4_table[256] = { 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }; static inline uint64_t hash64(uint64_t key, uint64_t mask) { 
key = (~key + (key << 21)) & mask; // key = (key << 21) - key - 1; key = key ^ key >> 24; key = ((key + (key << 3)) + (key << 8)) & mask; // key * 265 key = key ^ key >> 14; key = ((key + (key << 2)) + (key << 4)) & mask; // key * 21 key = key ^ key >> 28; key = (key + (key << 31)) & mask; return key; } /** * Find symmetric (w,k)-minimizers on a DNA sequence * * @param str DNA sequence * @param len length of $str * @param w find a minimizer for every $w consecutive k-mers * @param k k-mer size * @param rid reference ID; will be copied to the output $p array * @param p minimizers; p->a[i].x is the 2k-bit hash value; * p->a[i].y = rid<<32 | lastPos<<1 | strand * where lastPos is the position of the last base of the i-th minimizer, * and strand indicates whether the minimizer comes from the top or the bottom strand. * Callers may want to set "p->n = 0"; otherwise results are appended to p */ void mm_sketch(const char *str, int len, int w, int k, uint32_t rid, mm128_v *p) { uint64_t shift1 = 2 * (k - 1), mask = (1ULL<<2*k) - 1, kmer[2] = {0,0}; int i, j, l, buf_pos, min_pos; mm128_t *buf, min = { UINT64_MAX, UINT64_MAX }; assert(len > 0 && w > 0 && k > 0); buf = (mm128_t*)alloca(w * 16); memset(buf, 0xff, w * 16); for (i = l = buf_pos = min_pos = 0; i < len; ++i) { int c = seq_nt4_table[(uint8_t)str[i]]; mm128_t info = { UINT64_MAX, UINT64_MAX }; if (c < 4) { // not an ambiguous base int z; kmer[0] = (kmer[0] << 2 | c) & mask; // forward k-mer kmer[1] = (kmer[1] >> 2) | (3ULL^c) << shift1; // reverse k-mer if (kmer[0] == kmer[1]) continue; // skip "symmetric k-mers" as we don't know it strand z = kmer[0] < kmer[1]? 
0 : 1; // strand if (++l >= k) info.x = hash64(kmer[z], mask), info.y = (uint64_t)rid<<32 | (uint32_t)i<<1 | z; } else l = 0; buf[buf_pos] = info; // need to do this here as appropriate buf_pos and buf[buf_pos] are needed below if (l == w + k - 1) { // special case for the first window - because identical k-mers are not stored yet for (j = buf_pos + 1; j < w; ++j) if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, *p, buf[j]); for (j = 0; j < buf_pos; ++j) if (min.x == buf[j].x && buf[j].y != min.y) kv_push(mm128_t, *p, buf[j]); } if (info.x <= min.x) { // a new minimum; then write the old min if (l >= w + k) kv_push(mm128_t, *p, min); min = info, min_pos = buf_pos; } else if (buf_pos == min_pos) { // old min has moved outside the window if (l >= w + k - 1) kv_push(mm128_t, *p, min); for (j = buf_pos + 1, min.x = UINT64_MAX; j < w; ++j) // the two loops are necessary when there are identical k-mers if (min.x >= buf[j].x) min = buf[j], min_pos = j; // >= is important s.t. min is always the closest k-mer for (j = 0; j <= buf_pos; ++j) if (min.x >= buf[j].x) min = buf[j], min_pos = j; if (l >= w + k - 1) { // write identical k-mers for (j = buf_pos + 1; j < w; ++j) // these two loops make sure the output is sorted if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, *p, buf[j]); for (j = 0; j <= buf_pos; ++j) if (min.x == buf[j].x && min.y != buf[j].y) kv_push(mm128_t, *p, buf[j]); } } if (++buf_pos == w) buf_pos = 0; } if (min.x != UINT64_MAX) kv_push(mm128_t, *p, min); }