pax_global_header00006660000000000000000000000064137523654610014526gustar00rootroot0000000000000052 comment=f03225bd9a9273c1a0b3711fd5e73cf1c81e778b hts-nim-tools-0.2.1/000077500000000000000000000000001375236546100142435ustar00rootroot00000000000000hts-nim-tools-0.2.1/.gitignore000066400000000000000000000000341375236546100162300ustar00rootroot00000000000000nimcache bin/ hts_nim_tools hts-nim-tools-0.2.1/LICENSE000066400000000000000000000020571375236546100152540ustar00rootroot00000000000000MIT License Copyright (c) 2018 Brent Pedersen Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. hts-nim-tools-0.2.1/README.md000066400000000000000000000072211375236546100155240ustar00rootroot00000000000000# hts-nim-tools This repository contains a number of tools created with [hts-nim](https://github.com/brentp/hts-nim/) intended to serve as examples for using `hts-nim` as well as to be useful tools. These tools are: ``` hts-nim utility programs. version: $version • bam-filter : filter BAM/CRAM/SAM files with a simple expression language • count-reads : count BAM/CRAM reads in regions given in a BED file • vcf-check : check regions of a VCF against a background for missing chunks ``` each of these is described in more detail below. # bam-filter Use simple expressions to filter a BAM/CRAM file: ``` bam-filter Usage: bam-filter [options] -t --threads number of BAM decompression threads [default: 0] -f --fasta fasta file for use with CRAM files [default: $env_fasta]. ``` valid expressions may access the bam attibutes: + `mapq `/ `start `/ `pos `/ `end `/ `flag `/ `insert_size ` (where pos is the 1-based start) + `is_aligned` `is_read1` `is_read2` `is_supplementary` `is_secondary` `is_dup` `is_qcfail` + `is_reverse` `is_mate_reverse` `is_pair` `is_proper_pair` `is_mate_unmapped` `is_unmapped` to use aux tags, indicate them prefixed with 'tag_', e.g.: tag_NM < 2. Any tag present in the bam can be used in this manner. example: ``` bam-filter "tag_NM == 2 && tag_RG == 'SRR741410' && is_proper_pair" tests/HG02002.bam ``` # count-reads Count reads reports the number of reads overlapping each interval in a BED file. ``` count-reads Usage: count-reads [options] Arguments: the bed file containing regions in which to count reads. the alignment file for which to calculate depth. Options: -t --threads number of BAM decompression threads [default: 0] -f --fasta fasta file for use with CRAM files [default: ]. -F --flag exclude reads with any of the bits in FLAG set [default: 1796] -Q --mapq mapping quality threshold [default: 0] -h --help show help ``` This is output a line with a count of reads for each line in . # vcf-check `vcf-check` is useful as a quality control for large projects which have done variant calling in regions where each region is called in parallel. With many regions, and large projects, some regions can error and this might be unknown to the analyst. This tools takes a background VCF, such as gnomad, that has full genome (though in some cases, users will instead want whole exome) coverage and uses that as an expectation of variants. **If the background has many variants across a long stretch of genome where the query VCF has no variation, we can expect that region is missed in the query VCF.** ``` Check a VCF against a background to make sure that there are no large missing chunks. vcf-check Usage: vcf-check [options] Arguments: population VCF/BCF with expected sites query VCF/BCF to check Options: -c --chunk chunk size for genome [default: 100000] -m --maf allele frequency cutoff [default: 0.1] ``` This will output a tab-delimited file of `chrom\tposition\tbackground-count\tquery-count`. The user can find regions that might be problematic by plotting or with some simple `awk` commands. hts-nim-tools-0.2.1/hts_nim_tools.nimble000066400000000000000000000005741375236546100203220ustar00rootroot00000000000000# Package version = "0.2.1" author = "Brent Pedersen" description = "hts-nim command-line tools" license = "MIT" # Dependencies requires "nim >= 0.17.2", "c2nim >= 0.9.10", "docopt", "lapper", "hts", "kexpr" srcDir = "src" bin = @["hts_nim_tools"] task named_build, "custom build": mkdir "bin" exec "nimble c --out:bin/hts-nim-tools src/hts_nim_tools" hts-nim-tools-0.2.1/src/000077500000000000000000000000001375236546100150325ustar00rootroot00000000000000hts-nim-tools-0.2.1/src/bam_filter.nim000066400000000000000000000074701375236546100176530ustar00rootroot00000000000000import os import hts import sequtils import strutils import tables import algorithm import docopt import kexpr import ./version proc bam_filter(argv: var seq[string]): int = let args = docopt(""" bam-filter Usage: bam-filter [options] -t --threads number of BAM decompression threads [default: 0] -f --fasta fasta file for use with CRAM files [default: $env_fasta]. valid expressions may contain: > mapq/start/pos/end/flag/insert_size (where pos is the 1-based start) > is_aligned is_read1 is_read2 is_supplementary is_secondary is_dup is_qcfail > is_reverse is_mate_reverse is_pair is_proper_pair is_mate_unmapped is_unmapped to use aux tags, indicate them prefixed with 'tag_', e.g.: tag_NM < 2. Any tag present in the bam can be used in this manner. example: bam-filter "tag_NM == 2 && tag_RG == 'SRR741410' && is_proper_pair" tests/HG02002.bam """, version=version(), argv=argv) #-O --format BAM/CRAM. [default: SAM] var ex = expression($args[""]) bam:Bam obam:Bam threads = parse_int($args["--threads"]) fasta: cstring format = "" if $args["--fasta"] != "nil": fasta = cstring($args["--fasta"]) if ex.ke == nil: stderr.write_line("[bam-filter] error parsing expression:" & $args[""]) stderr.write_line($ex.error()) quit(2) open(bam, $args[""], threads=threads, fai=fasta) open(obam, "-", threads=threads, mode="w" & format, fai=fasta) obam.write_header(bam.hdr) var tags = new_seq[string]() ## get the list of tags we need to pull from each record if ($args[""]).contains("tag_"): var et = ($args[""]).split("tag_") echo et for i in countup(1, len(et) - 1, 1): var k: int while k < et[i].len and et[i][k].isAlphaAscii: k += 1 assert k == 2 tags.add(et[i][0..= 4: prob = parse_float(cse[3]) else: prob = 0.5 return cnv(chrom: cse[0], start: s, stop: e, prob:prob) proc bed_to_cnv_table(bed: string): TableRef[string, seq[cnv]] = var bed_regions = newTable[string, seq[cnv]]() var hf = hts.hts_open(cstring(bed), "r") var kstr = kstring_t(l:0, m:0, s:nil) while hts_getline(hf, cint(10), addr kstr) > 0: if ($kstr.s).startswith("track "): continue if $kstr.s[0] == "#": continue var v = bed_line_to_cnv($kstr.s) if v == nil: continue discard bed_regions.hasKeyOrPut(v.chrom, new_seq[cnv]()) bed_regions[v.chrom].add(v) # since it is read into mem, can also well sort. for chrom, ivs in bed_regions.mpairs: sort(ivs, proc (a, b: cnv): int = a.start - b.start) hts.free(kstr.s) return bed_regions proc overlap_p(a: cnv, b:Record): float = var o = min(a.stop, b.stop) - max(a.start, b.start) if o < 0: return 0 return o.float / float(b.stop - a.start) proc internal_sampler(ibam:Bam, obam:var Bam, regions:TableRef[string, seq[cnv]]) = var last_chrom: string var lap:Lapper[cnv] var res = new_seq[cnv](20) randomize() for record in ibam: if not regions.contains(record.chrom): obam.write(record) continue if record.chrom != last_chrom: lap = lapify(regions[record.chrom]) last_chrom = record.chrom discard lap.find(record.start.int, record.stop.int, res) if len(res) == 0: obam.write(record) continue var po = overlap_p(res[0], record) if res[0].prob < 1 and rand(1.0) < res[0].prob: if po == 1 or po < 3.0 * rand(1.0): obam.write(record) continue # TODO: allow writing single-end reads. #if res[0].prob > 1: # obam.write_record(record) # if random(1.0) > 1 / prob: # obam.write_record(record) proc copy_number_sampler(argv: var seq[string]): int = let env_fasta = getEnv("REF_PATH") let doc = format(""" $version Usage: copy-number-sampler [options] BED format looks like: chr1\t1123\t345\t0.5 where the final column indicates the sampling probability. Arguments: the bed file containing regions in which to sample reads; 4th column is a float indicating sampling probability. the alignment file for which to calculate depth. Options: -f --fasta fasta file for use with CRAM files [default: $env_fasta]. -h --help show help """ % ["version", version(), "env_fasta", env_fasta]) let args = docopt(doc, version=version(), argv=argv) var fasta: cstring if $args["--fasta"] != "nil": fasta = cstring($args["--fasta"]) var bam:Bam open(bam, $args[""], fai=fasta, threads=1) var obam:Bam open(obam, "sampled.bam", fai=fasta, mode="wb", threads=1) obam.write_header(bam.hdr) var regions = bed_to_cnv_table($args[""]) internal_sampler(bam, obam, regions) obam.close() return 0 hts-nim-tools-0.2.1/src/count_reads.nim000066400000000000000000000071741375236546100200560ustar00rootroot00000000000000import os import hts import docopt import lapper import strutils import tables import algorithm import ./version type region_t = ref object chrom: string start: int stop: int name: string count: int proc inc_count(r:region_t) = inc(r.count) proc start(r: region_t): int {.inline.} = return r.start proc stop(r: region_t): int {.inline.} = return r.stop proc tostring(r: region_t, s:var string) {.inline.} = s.set_len(0) s.add(r.chrom & "\t" & $r.start & "\t" & $r.stop & "\t") if r.name != "": s.add(r.name & "\t") s.add($r.count) proc bed_line_to_region(line: string): region_t = var cse = line.strip().split('\t', 5) if len(cse) < 3: stderr.write_line("[mosdepth] skipping bad bed line:", line.strip()) return nil var s = parse_int(cse[1]) e = parse_int(cse[2]) reg = region_t(chrom: cse[0], start: s, stop: e, count:0) if len(cse) > 3: reg.name = cse[3] return reg proc bed_to_table(bed: string): TableRef[string, seq[region_t]] = var bed_regions = newTable[string, seq[region_t]]() var hf = hts.hts_open(cstring(bed), "r") var kstr: hts.kstring_t kstr.l = 0 kstr.m = 0 kstr.s = nil while hts_getline(hf, cint(10), addr kstr) > 0: if ($kstr.s).startswith("track "): continue if $kstr.s[0] == "#": continue var v = bed_line_to_region($kstr.s) if v == nil: continue discard bed_regions.hasKeyOrPut(v.chrom, new_seq[region_t]()) bed_regions[v.chrom].add(v) # since it is read into mem, can also well sort. for chrom, ivs in bed_regions.mpairs: sort(ivs, proc (a, b: region_t): int = a.start - b.start) hts.free(kstr.s) return bed_regions proc internal_count(bam:Bam, mapq:uint8, eflag:uint16, regions:TableRef[string, seq[region_t]]) = for chrom in regions.keys(): if not regions.contains(chrom) or regions[chrom].len == 0: continue var lap:Lapper[region_t] = lapify(regions[chrom]) for aln in bam.query(chrom): if aln.mapping_quality < mapq: continue if (aln.flag and eflag) != 0: continue lap.each_seek(aln.start.int, aln.stop.int, inc_count) var s = new_string_of_cap(1000) for region in regions[chrom]: region.tostring(s) echo s proc count_reads(argv: var seq[string]): int = let env_fasta = getEnv("REF_PATH") let doc = format(""" $version Usage: count-reads [options] Arguments: the bed file containing regions in which to count reads. the alignment file for which to calculate depth. Options: -t --threads number of BAM decompression threads [default: 0] -f --fasta fasta file for use with CRAM files [default: $env_fasta]. -F --flag exclude reads with any of the bits in FLAG set [default: 1796] -Q --mapq mapping quality threshold [default: 0] -h --help show help """ % ["version", version(), "env_fasta", env_fasta]) let args = docopt(doc, version=version(), argv=argv) let mapq = parse_int($args["--mapq"]) var fasta: cstring if $args["--fasta"] != "nil": fasta = cstring($args["--fasta"]) var eflag = uint16(parse_int($args["--flag"])) threads = parse_int($args["--threads"]) bam:Bam open(bam, $args[""], threads=threads, index=true, fai=fasta) if bam.idx == nil: stderr.write_line ("count-reads: requires bam/cram index") var regions = bed_to_table($args[""]) internal_count(bam, uint8(mapq), eflag, regions) return 0 hts-nim-tools-0.2.1/src/hts_nim_tools.nim000066400000000000000000000024641375236546100204260ustar00rootroot00000000000000import os import hts import sequtils import strutils import tables import algorithm import docopt import kexpr import ./version include ./bam_filter include ./count_reads include ./vcf_check include ./copy_number_sampler var progs = { "vcf-check": vcf_check, "bam-filter": bam_filter, "count-reads": count_reads, "copy-number-sampler": copy_number_sampler }.toTable proc main() = var helps = {"bam-filter": "filter BAM/CRAM/SAM files with a simple expression language", "count-reads": "count BAM/CRAM reads in regions given in a BED file", "vcf-check": "check regions of a VCF against a background for missing chunks", "copy-number-sampler": "sample BAM regions with a probability given in a BED file (for creating CN truth-sets)" }.toTable var args = commandLineParams() if len(args) < 1 or not progs.contains(args[0]): var hkeys = toSeq(keys(helps)) sort(hkeys, proc(a, b: string): int = if a < b: return -1 else: return 1 ) echo format("\nhts-nim utility programs.\nversion: $#\n", version()) for k in hkeys: echo format(" • $1: $2", k & repeat(" ", 20 - len(k)), helps[k]) echo "" else: var p = args[0]; args.delete(0) quit(progs[p](args)) when isMainModule: main() hts-nim-tools-0.2.1/src/vcf_check.nim000066400000000000000000000064061375236546100174600ustar00rootroot00000000000000import os import hts import docopt import strutils import ./version proc which_bin(pos:int, chunk_size:int): int {.inline.} = return int(pos / chunk_size) proc extend(L:var seq[uint32], newIdx:int) {.inline.} = var newLen = newIdx + 1 if newLen <= L.len: return var o = L.len L.set_len(newLen) for i in o.. Arguments: population VCF/BCF with expected sites query VCF/BCF to check Options: -c --chunk chunk size for genome [default: 100000] -m --maf allele frequency cutoff [default: 0.1] """ % ["version", version()]) let args = docopt(doc, version=version(), argv=argv) let chunk = parse_int($args["--chunk"]) let maf = parse_float($args["--maf"]) check_vcf($args[""], $args[""], chunk, maf.float32) return 0 hts-nim-tools-0.2.1/src/version.nim000066400000000000000000000000541375236546100172230ustar00rootroot00000000000000 proc version*(): string = return "0.2.0"