bwt-0.6.0/0000755000175000017500000000000014146115001011630 5ustar nileshnileshbwt-0.6.0/go.sum0000644000175000017500000000000014146115001012751 0ustar nileshnileshbwt-0.6.0/README.md0000644000175000017500000000104014146115001013102 0ustar nileshnilesh# bwt [![Go Reference](https://pkg.go.dev/badge/github.com/shenwei356/bwt.svg)](https://pkg.go.dev/github.com/shenwei356/bwt) [![Go Report Card](https://goreportcard.com/badge/github.com/shenwei356/bwt)](https://goreportcard.com/report/github.com/shenwei356/bwt) Burrows-Wheeler Transform and FM-index in golang ## Install This package is "go-gettable", just: go get -u github.com/shenwei356/bwt ## Licence Copyright (c) 2015-2021, Wei Shen (shenwei356@gmail.com) [MIT License](https://github.com/shenwei356/bwt/blob/master/LICENSE) bwt-0.6.0/util.go0000644000175000017500000000046014146115001013134 0ustar nileshnileshpackage bwt import "bytes" // SliceOfByteSlice is [][]byte type SliceOfByteSlice [][]byte func (s SliceOfByteSlice) Len() int { return len(s) } func (s SliceOfByteSlice) Less(i, j int) bool { return bytes.Compare(s[i], s[j]) < 0 } func (s SliceOfByteSlice) Swap(i, j int) { s[i], s[j] = s[j], s[i] } bwt-0.6.0/go.mod0000644000175000017500000000005214146115001012733 0ustar nileshnileshmodule github.com/shenwei356/bwt go 1.17 bwt-0.6.0/.gitignore0000755000175000017500000000041114146115001013617 0ustar nileshnilesh# Compiled Object files, Static and Dynamic libs (Shared Objects) *.o *.a *.so # Folders _obj _test # Architecture specific extensions/prefixes *.[568vq] [568vq].out *.cgo1.go *.cgo2.c _cgo_defun.c _cgo_gotypes.go _cgo_export.* _testmain.go *.exe *.directory bwt-0.6.0/bwt.go0000644000175000017500000000560214146115001012756 0ustar nileshnileshpackage bwt import ( "errors" "index/suffixarray" "reflect" "sort" ) // CheckEndSymbol is a global variable for checking end symbol before Burrows–Wheeler transform var CheckEndSymbol = true // ErrEndSymbolExisted means you should choose another EndSymbol var ErrEndSymbolExisted = errors.New("bwt: end-symbol existed in string") // Transform returns Burrows–Wheeler transform of a byte slice. // See https://en.wikipedia.org/wiki/Burrows%E2%80%93Wheeler_transform func Transform(s []byte, es byte) ([]byte, error) { if CheckEndSymbol { for _, c := range s { if c == es { return nil, ErrEndSymbolExisted } } } sa := SuffixArray(s) bwt, err := FromSuffixArray(s, sa, es) return bwt, err } // InverseTransform reverses the bwt to original byte slice. Not optimized yet. func InverseTransform(t []byte, es byte) []byte { n := len(t) lines := make([][]byte, n) for i := 0; i < n; i++ { lines[i] = make([]byte, n) } for i := 0; i < n; i++ { for j := 0; j < n; j++ { lines[j][n-1-i] = t[j] } sort.Sort(SliceOfByteSlice(lines)) } s := make([]byte, n-1) for _, line := range lines { if line[n-1] == es { s = line[0 : n-1] break } } return s } // SuffixArray returns the suffix array of s. // This function is the performance bottleneck of bwt and bwt/fmi package, with O(nlogn). func SuffixArray(s []byte) []int { // sa := make([]int, len(s)+1) // sa[0] = len(s) // for i := 0; i < len(s); i++ { // sa[i+1] = i // } // sort.Slice(sa[1:], func(i, j int) bool { // return bytes.Compare(s[sa[i+1]:], s[sa[j+1]:]) < 0 // }) // return sa // https://github.com/shenwei356/bwt/issues/3 . // nearly copy from https://github.com/crazyleg/burrow-wheelers-golang/blob/master/pkg/bwtgolang/suffixarrayBWT.go#L8 // It's 4X faster! // // benchmark old ns/op new ns/op delta // BenchmarkTransform-16 1339346 310706 -76.80% // // benchmark old allocs new allocs delta // BenchmarkTransform-16 4 5 +25.00% // // benchmark old bytes new bytes delta // BenchmarkTransform-16 55362 79962 +44.43% _sa := suffixarray.New(s) tmp := reflect.ValueOf(_sa).Elem().FieldByName("sa").FieldByIndex([]int{0}) var sa []int = make([]int, len(s)+1) sa[0] = len(s) for i := 0; i < len(s); i++ { sa[i+1] = int(tmp.Index(i).Int()) } return sa } // ErrInvalidSuffixArray means length of sa is not equal to 1+len(s) var ErrInvalidSuffixArray = errors.New("bwt: invalid suffix array") // FromSuffixArray compute BWT from sa func FromSuffixArray(s []byte, sa []int, es byte) ([]byte, error) { if len(s)+1 != len(sa) || sa[0] != len(s) { return nil, ErrInvalidSuffixArray } bwt := make([]byte, len(sa)) bwt[0] = s[len(s)-1] for i := 1; i < len(sa); i++ { if sa[i] == 0 { bwt[i] = es } else { bwt[i] = s[sa[i]-1] } } return bwt, nil } bwt-0.6.0/fmi/0000755000175000017500000000000014146115001012403 5ustar nileshnileshbwt-0.6.0/fmi/util.go0000644000175000017500000000077214146115001013715 0ustar nileshnileshpackage fmi type sMatch struct { query []byte start, end int mismatches int } // Stack struct type Stack []sMatch // Empty tell if it is empty func (s Stack) Empty() bool { return len(s) == 0 } // Peek return the last element func (s Stack) Peek() sMatch { return s[len(s)-1] } // Put puts element to stack func (s *Stack) Put(i sMatch) { (*s) = append((*s), i) } // Pop pops element from the stack func (s *Stack) Pop() sMatch { d := (*s)[len(*s)-1] (*s) = (*s)[:len(*s)-1] return d } bwt-0.6.0/fmi/fmi_test.go0000644000175000017500000000415214146115001014546 0ustar nileshnileshpackage fmi import ( "testing" ) type Case struct { s, q string m int r []int } var cases = []Case{ {"mississippi", "iss", 0, []int{1, 4}}, {"abcabcabc", "abc", 0, []int{0, 3, 6}}, {"abcabcabc", "gef", 0, []int{}}, {"abcabcabc", "gef", 0, []int{}}, {"abcabcabc", "xef", 0, []int{}}, {"abcabcabc", "xabcb", 1, []int{}}, {"abcabcabc", "xabcb", 2, []int{2}}, {"abcabd", "abc", 1, []int{0, 3}}, {"acctatac", "ac", 0, []int{0, 6}}, {"acctatac", "tac", 0, []int{5}}, {"acctatac", "tac", 1, []int{3, 5}}, {"acctatac", "taz", 1, []int{3, 5}}, {"ccctatac", "tzc", 1, []int{5}}, {"acctatac", "atac", 0, []int{4}}, {"acctatac", "acctatac", 0, []int{0}}, {"acctatac", "acctatac", 1, []int{0}}, {"acctatac", "cctatac", 1, []int{1}}, {"acctatac", "caa", 2, []int{1, 2, 3, 4, 5}}, {"acctatac", "caa", 3, []int{0, 1, 2, 3, 4, 5}}, } func TestLocate(t *testing.T) { var err error var match bool var fmi *FMIndex for i, c := range cases { fmi = NewFMIndex() _, err = fmi.Transform([]byte(c.s)) if err != nil { t.Errorf("case #%d: Transform: %s", i+1, err) return } match, err = fmi.Match([]byte(c.q), c.m) if err != nil { t.Errorf("case #%d: Locate: %s", i, err) return } if match != (len(c.r) > 0) { t.Errorf("case #%d: Match '%s' in '%s' (allow %d mismatch), result: %v. right answer: %v", i+1, c.q, c.s, c.m, match, len(c.r) > 0) return } } } func TestMatch(t *testing.T) { var err error var loc []int var fmi *FMIndex for i, c := range cases { fmi = NewFMIndex() _, err = fmi.Transform([]byte(c.s)) if err != nil { t.Errorf("case #%d: Transform: %s", i+1, err) return } loc, err = fmi.Locate([]byte(c.q), c.m) if err != nil { t.Errorf("case #%d: Locate: %s", i, err) return } if len(loc) != len(c.r) { t.Errorf("case #%d: Locate '%s' in '%s' (allow %d mismatch), result: %d. right answer: %d", i+1, c.q, c.s, c.m, loc, c.r) return } for j := 0; j < len(loc); j++ { if loc[j] != c.r[j] { t.Errorf("case #%d: Locate '%s' in '%s' (allow %d mismatch), result: %d. right answer: %d", i+1, c.q, c.s, c.m, loc, c.r) return } } } } bwt-0.6.0/fmi/fmi.go0000644000175000017500000002411314146115001013506 0ustar nileshnileshpackage fmi import ( "bytes" "fmt" "sort" "strings" "github.com/shenwei356/bwt" ) // FMIndex is Burrows-Wheeler Index type FMIndex struct { // EndSymbol EndSymbol byte // SuffixArray SuffixArray []int // Burrows-Wheeler Transform BWT []byte // First column of BWM F []byte // Alphabet in the BWT Alphabet []byte // Count of Letters in Alphabet. // CountOfLetters map[byte]int CountOfLetters []int // slice is faster han map // C[c] is a table that, for each character c in the alphabet, // contains the number of occurrences of lexically smaller characters // in the text. // C map[byte]int C []int // slice is faster han map // Occ(c, k) is the number of occurrences of character c in the // prefix L[1..k], k is 0-based. // Occ map[byte]*[]int32 Occ []*[]int32 // slice is faster han map } // NewFMIndex is constructor of FMIndex func NewFMIndex() *FMIndex { fmi := new(FMIndex) fmi.EndSymbol = byte(0) return fmi } // Transform return Burrows-Wheeler-Transform of s func (fmi *FMIndex) Transform(s []byte) ([]byte, error) { var err error sa := bwt.SuffixArray(s) fmi.SuffixArray = sa fmi.BWT, err = bwt.FromSuffixArray(s, fmi.SuffixArray, fmi.EndSymbol) if err != nil { return nil, err } F := make([]byte, len(s)+1) F[0] = fmi.EndSymbol for i := 1; i <= len(s); i++ { F[i] = s[sa[i]] } fmi.F = F // fmi.CountOfLetters = byteutil.CountOfByte(fmi.BWT) // delete(fmi.CountOfLetters, fmi.EndSymbol) count := make([]int, 128) for _, b := range fmi.BWT { count[b]++ } count[fmi.EndSymbol] = 0 fmi.CountOfLetters = count // fmi.Alphabet = byteutil.AlphabetFromCountOfByte(fmi.CountOfLetters) alphabet := make([]byte, 0, 128) for b, c := range count { if c > 0 { alphabet = append(alphabet, byte(b)) } } fmi.Alphabet = alphabet fmi.C = computeC(fmi.F) fmi.Occ = computeOccurrence(fmi.BWT, fmi.Alphabet) return fmi.BWT, nil } // Last2First mapping func (fmi *FMIndex) Last2First(i int) int { c := fmi.BWT[i] return fmi.C[c] + int((*fmi.Occ[c])[i]) } func (fmi *FMIndex) nextLetterInAlphabet(c byte) byte { var nextLetter byte for i, letter := range fmi.Alphabet { if letter == c { if i < len(fmi.Alphabet)-1 { nextLetter = fmi.Alphabet[i+1] } else { nextLetter = fmi.Alphabet[i] } break } } return nextLetter } // Locate locates the pattern func (fmi *FMIndex) Locate(query []byte, mismatches int) ([]int, error) { var locations []int locationsMap := make(map[int]struct{}) if mismatches == 0 { // letters := byteutil.Alphabet(query) count := make([]int, 128) for _, b := range query { if count[b] == 0 { count[b]++ } } letters := make([]byte, 0, 128) for b, c := range count { if c > 0 { letters = append(letters, byte(b)) } } for _, letter := range letters { // query having letter not in alphabet // if _, ok := fmi.CountOfLetters[letter]; !ok { if fmi.CountOfLetters[letter] == 0 { return locations, nil } } } n := len(fmi.BWT) var matches Stack // start and end are 0-based matches.Put(sMatch{query: query, start: 0, end: n - 1, mismatches: mismatches}) // fmt.Printf("====%s====\n", query) // fmt.Println(fmi) var match sMatch var last, c byte var start, end int var m int var letters []byte // var ok bool for !matches.Empty() { match = matches.Pop() query = match.query[0 : len(match.query)-1] last = match.query[len(match.query)-1] if match.mismatches == 0 { letters = []byte{last} } else { letters = fmi.Alphabet } // fmt.Println("\n--------------------------------------------") // fmt.Printf("%s, %s, %c\n", match.query, query, last) // fmt.Printf("query: %s, last: %c\n", query, last) for _, c = range letters { // if _, ok = fmi.CountOfLetters[c]; !ok { // letter not in alphabet if fmi.CountOfLetters[c] == 0 { continue } // fmt.Printf("letter: %c, start: %d, end: %d, mismatches: %d\n", c, match.start, match.end, match.mismatches) if match.start == 0 { start = fmi.C[c] + 0 } else { start = fmi.C[c] + int((*fmi.Occ[c])[match.start-1]) } end = fmi.C[c] + int((*fmi.Occ[c])[match.end]-1) // fmt.Printf(" s: %d, e: %d\n", start, end) if start > end { continue } if len(query) == 0 { for _, i := range fmi.SuffixArray[start : end+1] { // fmt.Printf(" >>> found: %d\n", i) locationsMap[i] = struct{}{} } } else { m = match.mismatches if c != last { if match.mismatches > 1 { m = match.mismatches - 1 } else { m = 0 } } // fmt.Printf(" >>> candidate: query: %s, start: %d, end: %d, m: %d\n", query, start, end, m) matches.Put(sMatch{query: query, start: start, end: end, mismatches: m}) } } } i := 0 locations = make([]int, len(locationsMap)) for loc := range locationsMap { locations[i] = loc i++ } sort.Ints(locations) return locations, nil } // Match is a simple version of Locate, which returns immediately for a match. func (fmi *FMIndex) Match(query []byte, mismatches int) (bool, error) { if mismatches == 0 { // letters := byteutil.Alphabet(query) count := make([]int, 128) for _, b := range query { if count[b] == 0 { count[b]++ } } letters := make([]byte, 0, 128) for b, c := range count { if c > 0 { letters = append(letters, byte(b)) } } for _, letter := range letters { // query having letter not in alphabet // if _, ok := fmi.CountOfLetters[letter]; !ok { if fmi.CountOfLetters[letter] == 0 { return false, nil } } } n := len(fmi.BWT) var matches Stack // start and end are 0-based matches.Put(sMatch{query: query, start: 0, end: n - 1, mismatches: mismatches}) // fmt.Printf("====%s====\n", query) // fmt.Println(fmi) var match sMatch var last, c byte var start, end int var m int var letters []byte // var ok bool for !matches.Empty() { match = matches.Pop() query = match.query[0 : len(match.query)-1] last = match.query[len(match.query)-1] if match.mismatches == 0 { letters = []byte{last} } else { letters = fmi.Alphabet } // fmt.Println("\n--------------------------------------------") // fmt.Printf("%s, %s, %c\n", match.query, query, last) // fmt.Printf("query: %s, last: %c\n", query, last) for _, c = range letters { // if _, ok = fmi.CountOfLetters[c]; !ok { // letter not in alphabet if fmi.CountOfLetters[c] == 0 { continue } // fmt.Printf("letter: %c, start: %d, end: %d, mismatches: %d\n", c, match.start, match.end, match.mismatches) if match.start == 0 { start = fmi.C[c] + 0 } else { start = fmi.C[c] + int((*fmi.Occ[c])[match.start-1]) } end = fmi.C[c] + int((*fmi.Occ[c])[match.end]-1) // fmt.Printf(" s: %d, e: %d\n", start, end) if start > end { continue } if len(query) == 0 { return true, nil } else { m = match.mismatches if c != last { if match.mismatches > 1 { m = match.mismatches - 1 } else { m = 0 } } // fmt.Printf(" >>> candidate: query: %s, start: %d, end: %d, m: %d\n", query, start, end, m) matches.Put(sMatch{query: query, start: start, end: end, mismatches: m}) } } } return false, nil } func (fmi *FMIndex) String() string { var buffer bytes.Buffer buffer.WriteString(fmt.Sprintf("EndSymbol: %c\n", fmi.EndSymbol)) buffer.WriteString(fmt.Sprintf("BWT: %s\n", string(fmi.BWT))) buffer.WriteString(fmt.Sprintf("Alphabet: %s\n", string(fmi.Alphabet))) buffer.WriteString("F:\n") buffer.WriteString(string(fmi.F) + "\n") buffer.WriteString("C:\n") for _, letter := range fmi.Alphabet { buffer.WriteString(fmt.Sprintf(" %c: %d\n", letter, fmi.C[letter])) } buffer.WriteString("Occ:\n") buffer.WriteString(fmt.Sprintf(" BWT[%s]\n", strings.Join(strings.Split(string(fmi.BWT), ""), " "))) for _, letter := range fmi.Alphabet { buffer.WriteString(fmt.Sprintf(" %c: %v\n", letter, fmi.Occ[letter])) } buffer.WriteString("SA:\n") buffer.WriteString(fmt.Sprintf(" %d\n", fmi.SuffixArray)) return buffer.String() } // ComputeC computes C. // C[c] is a table that, for each character c in the alphabet, // contains the number of occurrences of lexically smaller characters // in the text. // func ComputeC(L []byte, alphabet []byte) map[byte]int { // if alphabet == nil { // alphabet = byteutil.Alphabet(L) // } // C := make(map[byte]int, len(alphabet)) // count := 0 // for _, c := range L { // if _, ok := C[c]; !ok { // C[c] = count // } // count++ // } // return C // } func computeC(L []byte) []int { C := make([]int, 128) count := 0 for _, c := range L { if C[c] == 0 { C[c] = count } count++ } return C } // ComputeOccurrence returns occurrence information. // Occ(c, k) is the number of occurrences of character c in the prefix L[1..k] // func ComputeOccurrence(bwt []byte, letters []byte) map[byte]*[]int32 { // if letters == nil { // letters = byteutil.Alphabet(bwt) // } // occ := make(map[byte]*[]int32, len(letters)-1) // for _, letter := range letters { // t := make([]int32, 1, len(bwt)) // t[0] = 0 // occ[letter] = &t // } // t := make([]int32, 1, len(bwt)) // t[0] = 1 // occ[bwt[0]] = &t // var letter, k byte // var v *[]int32 // for _, letter = range bwt[1:] { // for k, v = range occ { // if k == letter { // *v = append(*v, (*v)[len(*v)-1]+1) // } else { // *v = append(*v, (*v)[len(*v)-1]) // } // } // } // return occ // } func computeOccurrence(bwt []byte, letters []byte) []*[]int32 { if letters == nil { count := make([]int, 128) for _, b := range bwt { if count[b] == 0 { count[b]++ } } letters = make([]byte, 0, 128) for b, c := range count { if c > 0 { letters = append(letters, byte(b)) } } } occ := make([]*[]int32, 128) for _, letter := range letters { t := make([]int32, 1, len(bwt)) t[0] = 0 occ[letter] = &t } t := make([]int32, 1, len(bwt)) t[0] = 1 occ[bwt[0]] = &t var letter byte var k, letterInt int var v *[]int32 for _, letter = range bwt[1:] { letterInt = int(letter) for k, v = range occ { if v == nil { continue } if k == letterInt { *v = append(*v, (*v)[len(*v)-1]+1) } else { *v = append(*v, (*v)[len(*v)-1]) } } } return occ } bwt-0.6.0/LICENSE0000644000175000017500000000211114146115001012630 0ustar nileshnileshCopyright (c) 2015-2021 Wei Shen (shenwei356@gmail.com) The MIT License Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. bwt-0.6.0/bwt_test.go0000644000175000017500000000326414146115001014017 0ustar nileshnileshpackage bwt import ( "fmt" "math/rand" "testing" ) func TestTransformAndInverseTransform(t *testing.T) { s := "abracadabra" trans := "ard$rcaaaabb" tr, err := Transform([]byte(s), '$') if err != nil { t.Error(err) } if string(tr) != trans { t.Error("Test failed: Transform") } if string(InverseTransform([]byte(trans), '$')) != s { t.Error("Test failed: InverseTransform") } } func TestFromSuffixArray(t *testing.T) { s := "GATGCGAGAGATG" trans := "GGGGGGTCAA$TAA" sa := SuffixArray([]byte(s)) B, err := FromSuffixArray([]byte(s), sa, '$') if err != nil { t.Error("Test failed: FromSuffixArray error") } if string(B) != trans { t.Error("Test failed: FromSuffixArray returns wrong result") } } func TestSA(t *testing.T) { s := "mississippi" sa := SuffixArray([]byte(s)) sa1 := []int{11, 10, 7, 4, 1, 0, 9, 8, 6, 3, 5, 2} // fmt.Printf("%s\nanswer: %v, result: %v", s, sa1, sa) if len(sa) != len(sa1) { t.Error(fmt.Errorf("sa error. answer: %v, result: %v", sa1, sa)) return } for i := range sa { if sa[i] != sa1[i] { t.Error(fmt.Errorf("sa error. answer: %v, result: %v", sa1, sa)) return } } } var cases [][]byte func init() { rand.Seed(1) alphabet := "ACGT" n := len(alphabet) scales := []float32{1e3, 1e5} cases = make([][]byte, len(scales)) for i, scale := range scales { l := rand.Float32() * scale * 10 buf := make([]byte, int(l)) for j := 0; j < int(l); j++ { buf[j] = alphabet[rand.Intn(n)] } cases[i] = buf } } var result []byte func BenchmarkTransform(t *testing.B) { var r []byte var err error for i := 0; i < t.N; i++ { r, err = Transform(cases[0], '$') if err != nil { t.Error(err) return } } result = r }