pax_global_header00006660000000000000000000000064141207074510014513gustar00rootroot0000000000000052 comment=6554f7a872db2f104d7a67e9c3e6026f8cccf878 nthash-0.4.0/000077500000000000000000000000001412070745100130015ustar00rootroot00000000000000nthash-0.4.0/.travis.yml000066400000000000000000000003171412070745100151130ustar00rootroot00000000000000language: go go: - 1.14.x before_install: - go get -t -v ./... script: - go test -race -coverprofile=coverage.txt -covermode=atomic ./... after_success: - bash <(curl -s https://codecov.io/bash) nthash-0.4.0/LICENSE000066400000000000000000000021121412070745100140020ustar00rootroot00000000000000The MIT License (MIT) Copyright © 2018 Will Rowe Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. nthash-0.4.0/README.md000066400000000000000000000043031412070745100142600ustar00rootroot00000000000000

ntHash

ntHash implementation in Go


travis GoDoc goreportcard codecov
---

## Overview

This is a Go implementation of the [ntHash](https://github.com/bcgsc/ntHash) recursive hash function for hashing all possible k-mers in a DNA/RNA sequence.

For more information, read the ntHash [paper](http://dx.doi.org/10.1093/bioinformatics/btw397) by Mohamadi et al. or check out their C++ [implementation](https://github.com/bcgsc/ntHash).

This implementation was inspired by [Luiz Irber](https://luizirber.org/) and his recent [blog post](https://blog.luizirber.org/2018/09/13/nthash/) on his cool [Rust ntHash implementation](https://github.com/luizirber/nthash).

I have coded this up in Go so that ntHash can be used in my [HULK](https://github.com/will-rowe/hulk) and [GROOT](https://github.com/will-rowe/groot) projects but feel free to use it for yourselves.

## Installation

```go
go get github.com/will-rowe/nthash
```

## Example usage

### range over ntHash values for a sequence

```go
package main

import (
	"log"

	"github.com/will-rowe/nthash"
)

var (
	sequence      = []byte("ACGTCGTCAGTCGATGCAGTACGTCGTCAGTCGATGCAGT")
	kmerSize uint = 11
)

func main() {

	// create the ntHash iterator using a pointer to the sequence and a k-mer size
	hasher, err := nthash.NewHasher(&sequence, kmerSize)

	// check for errors (e.g. bad k-mer size choice)
	if err != nil {
		log.Fatal(err)
	}

	// collect the hashes by ranging over the hash channel produced by the Hash method
	canonical := true
	for hash := range hasher.Hash(canonical) {
		log.Println(hash)
	}
}
```
nthash-0.4.0/go.mod000066400000000000000000000000541412070745100141060ustar00rootroot00000000000000
module github.com/will-rowe/nthash

go 1.14
nthash-0.4.0/ntHash.go000066400000000000000000000225651412070745100145670ustar00rootroot00000000000000
// Package nthash is a port of the ntHash (https://github.com/bcgsc/ntHash) recursive hash function for DNA kmers.
//
// It was inspired by the Rust port by Luiz Irber (https://github.com/luizirber/nthash)
//
package nthash

import (
	"fmt"
	"math"
	"sync"
)

const (
	// maxK is the maximum k-mer size permitted
	maxK uint = math.MaxUint32

	// bufferSize is the size of the buffer used by the channels returned by the Hash and MultiHash methods
	bufferSize uint = 128

	// offset is used as a mask to retrieve a base's complement in the seed table:
	// the low three bits of an A/C/G/T/U byte index the seed of the complementary base
	offset uint8 = 0x07

	// seedA is the 64-bit random seed corresponding to base A
	seedA uint64 = 0x3c8bfbb395c60474

	// seedC is the 64-bit random seed corresponding to base C
	seedC uint64 = 0x3193c18562a02b4c

	// seedG is the 64-bit random seed corresponding to base G
	seedG uint64 = 0x20323ed082572324

	// seedT is the 64-bit random seed corresponding to base T
	seedT uint64 = 0x295549f54be24456

	// seedN is the 64-bit random seed corresponding to N
	seedN uint64 = 0x0000000000000000

	// multiSeed is the seed used for generating multiple hash values
	multiSeed uint64 = 0x90b45d39fb6da1fa

	// multiShift is the right-shift used for generating multiple hash values
	multiShift uint = 27
)

// seedTab is the lookup table for the bases and their complements.
//
// Indexing with a base byte ('A', 'C', 'G', 'T', 'U' in either case) yields
// that base's seed. Indexing with the byte masked by offset lands in entries
// 0..7, which hold the seed of the complementary base (e.g. 'A'&offset == 1,
// and entry 1 is seedT). Every other entry is seedN.
var seedTab = [256]uint64{
	seedN, seedT, seedN, seedG, seedA, seedA, seedN, seedC, // 0..7
	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 8..15
	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 16..23
	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 24..31
	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 32..39
	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 40..47
	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 48..55
	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 56..63
	seedN, seedA, seedN, seedC, seedN, seedN, seedN, seedG, // 64..71
	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 72..79
	seedN, seedN, seedN, seedN, seedT, seedT, seedN, seedN, // 80..87
	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 88..95
	seedN, seedA, seedN, seedC, seedN, seedN, seedN, seedG, // 96..103
	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 104..111
	seedN, seedN, seedN, seedN, seedT, seedT, seedN, seedN, // 112..119
	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 120..127
	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 128..135
	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 136..143
	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 144..151
	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 152..159
	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 160..167
	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 168..175
	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 176..183
	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 184..191
	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 192..199
	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 200..207
	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 208..215
	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 216..223
	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 224..231
	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 232..239
	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 240..247
	seedN, seedN, seedN, seedN, seedN, seedN, seedN, seedN, // 248..255
}

// NTHi is the ntHash iterator
type NTHi struct {
	seq        *[]byte // the sequence being hashed
	k          uint    // the k-mer size
	fh         uint64  // the current forward hash value
	rh         uint64  // the current reverse hash value
	currentIdx uint    // the current index position in the sequence being hashed
	maxIdx     uint    // the maximum index position to hash up to
}

// poolNTHi is an object pool of iterators, used to reduce GC load when hashing a huge number of sequences.
var poolNTHi = &sync.Pool{New: func() interface{} { return &NTHi{} }} // NewHasher is the constructor function for the ntHash iterator // seq is a pointer to the sequence being hashed // k is the k-mer size to use func NewHasher(seq *[]byte, k uint) (*NTHi, error) { seqLen := uint(len(*seq)) if k > seqLen { return nil, fmt.Errorf("k size is greater than sequence length (%d vs %d)", k, seqLen) } if k > maxK { return nil, fmt.Errorf("k size is greater than the maximum allowed k size (%d vs %d)", k, maxK) } fh := ntf64((*seq)[0:k], 0, k) rh := ntr64((*seq)[0:k], 0, k) nthi := poolNTHi.Get().(*NTHi) nthi.seq = seq nthi.k = k nthi.fh = fh nthi.rh = rh nthi.currentIdx = 0 nthi.maxIdx = seqLen - (k - 1) return nthi, nil } // Next returns the next ntHash value from an ntHash iterator func (nthi *NTHi) Next(canonical bool) (uint64, bool) { // end the iterator if we have got to the maximum index position TODO: this needs to be done in a better way. if nthi.currentIdx >= nthi.maxIdx { poolNTHi.Put(nthi) return 0, false } // roll the hash if index>0 if nthi.currentIdx != 0 { prevBase := (*nthi.seq)[nthi.currentIdx-1] endBase := (*nthi.seq)[nthi.currentIdx+nthi.k-1] // alg 3. 
of ntHash paper nthi.fh = roL(nthi.fh, 1) nthi.fh ^= roL(seedTab[prevBase], nthi.k) nthi.fh ^= seedTab[endBase] nthi.rh = roR(nthi.rh, 1) nthi.rh ^= roR(seedTab[prevBase&offset], 1) nthi.rh ^= roL(seedTab[endBase&offset], nthi.k-1) } nthi.currentIdx++ if canonical { return nthi.getCanonical(), true } return nthi.fh, true } // Hash returns a channel to range over the canonical ntHash values of a sequence // canonical is set true to return the canonical k-mers, otherwise the forward hashes are returned func (nthi *NTHi) Hash(canonical bool) <-chan uint64 { hashChan := make(chan uint64, bufferSize) go func() { defer close(hashChan) // start the rolling hash for { // check that rolling can continue if nthi.currentIdx >= nthi.maxIdx { poolNTHi.Put(nthi) return } // start the hashing if nthi.currentIdx != 0 { prevBase := (*nthi.seq)[nthi.currentIdx-1] endBase := (*nthi.seq)[nthi.currentIdx+nthi.k-1] // alg 3. of ntHash paper nthi.fh = roL(nthi.fh, 1) nthi.fh ^= roL(seedTab[prevBase], nthi.k) nthi.fh ^= seedTab[endBase] nthi.rh = roR(nthi.rh, 1) nthi.rh ^= roR(seedTab[prevBase&offset], 1) nthi.rh ^= roL(seedTab[endBase&offset], nthi.k-1) } // calculate and return the canonical ntHash if requested if canonical { hashChan <- nthi.getCanonical() } else { hashChan <- nthi.fh } // increment the index nthi.currentIdx++ } }() return hashChan } // MultiHash returns a channel to range over the canonical multi ntHash values of a sequence // canonical is set true to return the canonical k-mers, otherwise the forward hashes are returned // numMultiHash sets the number of multi hashes to generate for each k-mer func (nthi *NTHi) MultiHash(canonical bool, numMultiHash uint) <-chan []uint64 { hashChan := make(chan []uint64, bufferSize) go func() { defer close(hashChan) // start the rolling hash for { // check that rolling can continue if nthi.currentIdx >= nthi.maxIdx { poolNTHi.Put(nthi) return } // start the hashing if nthi.currentIdx != 0 { prevBase := (*nthi.seq)[nthi.currentIdx-1] 
endBase := (*nthi.seq)[nthi.currentIdx+nthi.k-1] // alg 3. of ntHash paper nthi.fh = roL(nthi.fh, 1) nthi.fh ^= roL(seedTab[prevBase], nthi.k) nthi.fh ^= seedTab[endBase] nthi.rh = roR(nthi.rh, 1) nthi.rh ^= roR(seedTab[prevBase&offset], 1) nthi.rh ^= roL(seedTab[endBase&offset], nthi.k-1) } // set up the return slice multiHashes := make([]uint64, numMultiHash) if canonical { multiHashes[0] = nthi.getCanonical() } else { multiHashes[0] = nthi.fh } for i := uint64(1); i < uint64(numMultiHash); i++ { tVal := multiHashes[0] * (i ^ uint64(nthi.k)*multiSeed) tVal ^= tVal >> multiShift multiHashes[i] = tVal } // send the multihashes for this k-mer hashChan <- multiHashes // increment the index nthi.currentIdx++ } }() return hashChan } // getCanonical returns the canonical hash value currently held by the iterator func (nthi *NTHi) getCanonical() uint64 { if nthi.rh < nthi.fh { return nthi.rh } return nthi.fh } // roL is a function to bit shift to the left by "n" positions func roL(v uint64, n uint) uint64 { if (n & 63) == 0 { return v } return (v << n) | (v >> (64 - n)) } // roR is a function to bit shift to the right by "n" positions func roR(v uint64, n uint) uint64 { if (n & 63) == 0 { return v } return (v >> n) | (v << (64 - n)) } // ntf64 generates the ntHash for the forward strand of the kmer func ntf64(seq []byte, i, k uint) uint64 { var hv uint64 for i < k { hv = roL(hv, 1) hv ^= seedTab[seq[i]] i++ } return hv } // ntr64 generates the ntHash for the reverse strand of the kmer func ntr64(seq []byte, i, k uint) uint64 { var hv uint64 for i < k { hv = roL(hv, 1) hv ^= seedTab[seq[k-1-i]&offset] i++ } return hv } // ntc64 generates the canonical ntHash func ntc64(seq []byte, i, k uint) uint64 { fh := ntf64(seq, i, k) rh := ntr64(seq, i, k) if rh < fh { return rh } return fh } // nthash returns the canonical ntHash for each k-mer in a sequence // it does not use the rolling hash properties of ntHash func nthash(seq []byte, k int) []uint64 { hvs := make([]uint64, 
(len(seq) - (k - 1))) for i := 0; i <= (len(seq) - k); i++ { hvs[i] = ntc64(seq[i:i+k], 0, uint(k)) } return hvs } nthash-0.4.0/ntHash_test.go000066400000000000000000000073761412070745100156310ustar00rootroot00000000000000// test values have been lifted from Luiz Irber -- all credit and my thanks to him! // see https://github.com/luizirber/nthash/blob/master/src/lib.rs package nthash import ( "fmt" "testing" ) var ( kmer = []byte("TGCAG") sequence = []byte("ACGTCGTCAGTCGATGCAGT") kmer2 = []byte("ACTGC") ) // test seed lookup func TestSeedLookup(t *testing.T) { if seedTab[kmer[0]] != 0x295549f54be24456 { t.Fatal() } if seedTab[kmer[1]] != 0x20323ed082572324 { t.Fatal() } if seedTab[kmer[2]] != 0x3193c18562a02b4c { t.Fatal() } if seedTab[kmer[3]] != 0x3c8bfbb395c60474 { t.Fatal() } } // test forward ntHash func TestNTF64hash(t *testing.T) { hv := ntf64(kmer, 0, 5) t.Log(fmt.Printf("%x\n", hv)) if hv != 0xbafa6728fc6dabf { t.Fatal() } } // test reverse ntHash func TestNTR64(t *testing.T) { hv := ntr64(kmer, 0, 5) t.Log(fmt.Printf("%x\n", hv)) if hv != 0x8cf2d4072cca480e { t.Fatal() } } // test the canonical ntHash func TestNTC64(t *testing.T) { hv := ntc64(kmer, 0, 5) t.Log(fmt.Printf("%x\n", hv)) if hv != 0xbafa6728fc6dabf { t.Fatal() } } // test the ntHash function TODO: actually test this.... 
func TestNTHash(t *testing.T) { hvs := nthash(sequence, 5) for i, h := range hvs { t.Log(i, h) } } // test the ntHash iterator constructor func TestNewHasherNTHI(t *testing.T) { if _, err := NewHasher(&kmer, 10); err == nil { t.Fatal("should trigger k > seq error") } if _, err := NewHasher(&kmer, 200); err == nil { t.Fatal("should trigger k > max_k error") } nthi, err := NewHasher(&sequence, 5) if err != nil { t.Fatal() } t.Log(nthi) } // test the ntHash iterator next method func TestNext(t *testing.T) { nthi, err := NewHasher(&kmer2, 3) if err != nil { t.Fatal() } // should return the pre-calculated ntHash for the first canonical k-mer if h, _ := nthi.Next(true); h != 0x9b1eda9a185413ce { t.Fatal() } t.Log(nthi) // should calculate the next canonical k-mer ntHash and return it if h, _ := nthi.Next(true); h != 0x9f6acfa2235b86fc { t.Fatal() } // should calculate the final canonical k-mer ntHash and return it if h, _ := nthi.Next(true); h != 0xd4a29bf149877c5c { t.Fatal() } } // test the ntHash iterator hash method func TestHash(t *testing.T) { nthi, err := NewHasher(&kmer2, 3) if err != nil { t.Fatal() } counter := 0 // use the canonical switch for hash := range nthi.Hash(true) { t.Log(hash) counter++ switch counter { case 1: if hash != 0x9b1eda9a185413ce { t.Fatal() } case 2: if hash != 0x9f6acfa2235b86fc { t.Fatal() } case 3: if hash != 0xd4a29bf149877c5c { t.Fatal() } default: t.Fatal("unexpected output from nthi") } } if counter != 3 { t.Fatal("wrong iteration") } } // test the ntHash iterator multihash method func TestMultiHash(t *testing.T) { nthi, err := NewHasher(&kmer2, 3) if err != nil { t.Fatal() } counter := 0 // use the canonical switch and 3 multihashes for hashes := range nthi.MultiHash(true, 3) { t.Log(hashes) counter++ switch counter { case 1: if hashes[0] != 0x9b1eda9a185413ce { t.Fatal() } case 2: if hashes[0] != 0x9f6acfa2235b86fc { t.Fatal() } case 3: if hashes[0] != 0xd4a29bf149877c5c { t.Fatal() } default: t.Fatal("unexpected output from 
nthi") } } if counter != 3 { t.Fatal("wrong iteration") } } // run benchmarks of ntHash func BenchmarkHash(b *testing.B) { // run the ntHash iterator b.N times for n := 0; n < b.N; n++ { nthi, err := NewHasher(&sequence, 7) if err != nil { b.Fatal() } for range nthi.Hash(false) { } } } func BenchmarkCanonicalHash(b *testing.B) { // run the ntHash iterator b.N times for n := 0; n < b.N; n++ { nthi, err := NewHasher(&sequence, 7) if err != nil { b.Fatal() } for range nthi.Hash(true) { } } }