pax_global_header00006660000000000000000000000064136402171750014520gustar00rootroot0000000000000052 comment=8ba29bd255b740aee4eb4e4ddb5d7ec0b4d9f23e codesearch-1.2.0/000077500000000000000000000000001364021717500136205ustar00rootroot00000000000000codesearch-1.2.0/AUTHORS000066400000000000000000000002221364021717500146640ustar00rootroot00000000000000# This source code is copyright "The Go Authors", # as defined by the AUTHORS file in the root of the Go tree. # # http://tip.golang.org/AUTHORS. codesearch-1.2.0/CONTRIBUTORS000066400000000000000000000003241364021717500154770ustar00rootroot00000000000000# The official list of people who can contribute code to the repository # is maintained in the standard Go repository as the CONTRIBUTORS # file in the root of the Go tree. # # http://tip.golang.org/CONTRIBUTORS codesearch-1.2.0/LICENSE000066400000000000000000000027071364021717500146330ustar00rootroot00000000000000Copyright (c) 2011 The Go Authors. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Google Inc. nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. codesearch-1.2.0/README000066400000000000000000000006331364021717500145020ustar00rootroot00000000000000Code Search is a tool for indexing and then performing regular expression searches over large bodies of source code. It is a set of command-line programs written in Go. For background and an overview of the commands, see http://swtch.com/~rsc/regexp/regexp4.html. To install: go get github.com/google/codesearch/cmd/... Use "go get -u" to update an existing installation. Russ Cox rsc@swtch.com June 2015 codesearch-1.2.0/cmd/000077500000000000000000000000001364021717500143635ustar00rootroot00000000000000codesearch-1.2.0/cmd/cgrep/000077500000000000000000000000001364021717500154635ustar00rootroot00000000000000codesearch-1.2.0/cmd/cgrep/cgrep.go000066400000000000000000000027471364021717500171240ustar00rootroot00000000000000// Copyright 2011 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package main import ( "flag" "fmt" "log" "os" "runtime/pprof" "github.com/google/codesearch/regexp" ) var usageMessage = `usage: cgrep [-c] [-h] [-i] [-l] [-n] regexp [file...] Cgrep behaves like grep, searching for regexp, an RE2 (nearly PCRE) regular expression. The -c, -h, -i, -l, and -n flags are as in grep, although note that as per Go's flag parsing convention, they cannot be combined: the option pair -i -n cannot be abbreviated to -in. 
` func usage() { fmt.Fprintf(os.Stderr, usageMessage) os.Exit(2) } var ( iflag = flag.Bool("i", false, "case-insensitive match") cpuProfile = flag.String("cpuprofile", "", "write cpu profile to this file") ) func main() { var g regexp.Grep g.AddFlags() g.Stdout = os.Stdout g.Stderr = os.Stderr flag.Usage = usage flag.Parse() args := flag.Args() if len(args) == 0 { flag.Usage() } if *cpuProfile != "" { f, err := os.Create(*cpuProfile) if err != nil { log.Fatal(err) } defer f.Close() pprof.StartCPUProfile(f) defer pprof.StopCPUProfile() } pat := "(?m)" + args[0] if *iflag { pat = "(?i)" + pat } re, err := regexp.Compile(pat) if err != nil { log.Fatal(err) } g.Regexp = re if len(args) == 1 { g.Reader(os.Stdin, "") } else { for _, arg := range args[1:] { g.File(arg) } } if !g.Match { os.Exit(1) } } codesearch-1.2.0/cmd/cindex/000077500000000000000000000000001364021717500156355ustar00rootroot00000000000000codesearch-1.2.0/cmd/cindex/cindex.go000066400000000000000000000070521364021717500174420ustar00rootroot00000000000000// Copyright 2011 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package main import ( "flag" "fmt" "log" "os" "path/filepath" "runtime/pprof" "sort" "github.com/google/codesearch/index" ) var usageMessage = `usage: cindex [-list] [-reset] [path...] Cindex prepares the trigram index for use by csearch. The index is the file named by $CSEARCHINDEX, or else $HOME/.csearchindex. The simplest invocation is cindex path... which adds the file or directory tree named by each path to the index. For example: cindex $HOME/src /usr/include or, equivalently: cindex $HOME/src cindex /usr/include If cindex is invoked with no paths, it reindexes the paths that have already been added, in case the files have changed. Thus, 'cindex' by itself is a useful command to run in a nightly cron job. The -list flag causes cindex to list the paths it has indexed and exit. 
By default cindex adds the named paths to the index but preserves information about other paths that might already be indexed (the ones printed by cindex -list). The -reset flag causes cindex to delete the existing index before indexing the new paths. With no path arguments, cindex -reset removes the index. ` func usage() { fmt.Fprintf(os.Stderr, usageMessage) os.Exit(2) } var ( listFlag = flag.Bool("list", false, "list indexed paths and exit") resetFlag = flag.Bool("reset", false, "discard existing index") verboseFlag = flag.Bool("verbose", false, "print extra information") cpuProfile = flag.String("cpuprofile", "", "write cpu profile to this file") ) func main() { flag.Usage = usage flag.Parse() args := flag.Args() if *listFlag { ix := index.Open(index.File()) for _, arg := range ix.Paths() { fmt.Printf("%s\n", arg) } return } if *cpuProfile != "" { f, err := os.Create(*cpuProfile) if err != nil { log.Fatal(err) } defer f.Close() pprof.StartCPUProfile(f) defer pprof.StopCPUProfile() } if *resetFlag && len(args) == 0 { os.Remove(index.File()) return } if len(args) == 0 { ix := index.Open(index.File()) for _, arg := range ix.Paths() { args = append(args, arg) } } // Translate paths to absolute paths so that we can // generate the file list in sorted order. for i, arg := range args { a, err := filepath.Abs(arg) if err != nil { log.Printf("%s: %s", arg, err) args[i] = "" continue } args[i] = a } sort.Strings(args) for len(args) > 0 && args[0] == "" { args = args[1:] } master := index.File() if _, err := os.Stat(master); err != nil { // Does not exist. *resetFlag = true } file := master if !*resetFlag { file += "~" } ix := index.Create(file) ix.Verbose = *verboseFlag ix.AddPaths(args) for _, arg := range args { log.Printf("index %s", arg) filepath.Walk(arg, func(path string, info os.FileInfo, err error) error { if _, elem := filepath.Split(path); elem != "" { // Skip various temporary or "hidden" files or directories. if elem[0] == '.' 
|| elem[0] == '#' || elem[0] == '~' || elem[len(elem)-1] == '~' { if info.IsDir() { return filepath.SkipDir } return nil } } if err != nil { log.Printf("%s: %s", path, err) return nil } if info != nil && info.Mode()&os.ModeType == 0 { ix.AddFile(path) } return nil }) } log.Printf("flush index") ix.Flush() if !*resetFlag { log.Printf("merge %s %s", master, file) index.Merge(file+"~", master, file) os.Remove(file) os.Rename(file+"~", master) } log.Printf("done") return } codesearch-1.2.0/cmd/csearch/000077500000000000000000000000001364021717500157735ustar00rootroot00000000000000codesearch-1.2.0/cmd/csearch/csearch.go000066400000000000000000000062541364021717500177410ustar00rootroot00000000000000// Copyright 2011 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package main import ( "flag" "fmt" "log" "os" "runtime/pprof" "github.com/google/codesearch/index" "github.com/google/codesearch/regexp" ) var usageMessage = `usage: csearch [-c] [-f fileregexp] [-h] [-i] [-l] [-n] regexp Csearch behaves like grep over all indexed files, searching for regexp, an RE2 (nearly PCRE) regular expression. The -c, -h, -i, -l, and -n flags are as in grep, although note that as per Go's flag parsing convention, they cannot be combined: the option pair -i -n cannot be abbreviated to -in. The -f flag restricts the search to files whose names match the RE2 regular expression fileregexp. Csearch relies on the existence of an up-to-date index created ahead of time. To build or rebuild the index that csearch uses, run: cindex path... where path... is a list of directories or individual files to be included in the index. If no index exists, this command creates one. If an index already exists, cindex overwrites it. Run cindex -help for more. Csearch uses the index stored in $CSEARCHINDEX or, if that variable is unset or empty, $HOME/.csearchindex. 
` func usage() { fmt.Fprintf(os.Stderr, usageMessage) os.Exit(2) } var ( fFlag = flag.String("f", "", "search only files with names matching this regexp") iFlag = flag.Bool("i", false, "case-insensitive search") verboseFlag = flag.Bool("verbose", false, "print extra information") bruteFlag = flag.Bool("brute", false, "brute force - search all files in index") cpuProfile = flag.String("cpuprofile", "", "write cpu profile to this file") matches bool ) func Main() { g := regexp.Grep{ Stdout: os.Stdout, Stderr: os.Stderr, } g.AddFlags() flag.Usage = usage flag.Parse() args := flag.Args() if len(args) != 1 { usage() } if *cpuProfile != "" { f, err := os.Create(*cpuProfile) if err != nil { log.Fatal(err) } defer f.Close() pprof.StartCPUProfile(f) defer pprof.StopCPUProfile() } pat := "(?m)" + args[0] if *iFlag { pat = "(?i)" + pat } re, err := regexp.Compile(pat) if err != nil { log.Fatal(err) } g.Regexp = re var fre *regexp.Regexp if *fFlag != "" { fre, err = regexp.Compile(*fFlag) if err != nil { log.Fatal(err) } } q := index.RegexpQuery(re.Syntax) if *verboseFlag { log.Printf("query: %s\n", q) } ix := index.Open(index.File()) ix.Verbose = *verboseFlag var post []uint32 if *bruteFlag { post = ix.PostingQuery(&index.Query{Op: index.QAll}) } else { post = ix.PostingQuery(q) } if *verboseFlag { log.Printf("post query identified %d possible files\n", len(post)) } if fre != nil { fnames := make([]uint32, 0, len(post)) for _, fileid := range post { name := ix.Name(fileid) if fre.MatchString(name, true, true) < 0 { continue } fnames = append(fnames, fileid) } if *verboseFlag { log.Printf("filename regexp matched %d files\n", len(fnames)) } post = fnames } for _, fileid := range post { name := ix.Name(fileid) g.File(name) } matches = g.Match } func main() { Main() if !matches { os.Exit(1) } os.Exit(0) } codesearch-1.2.0/go.mod000066400000000000000000000000551364021717500147260ustar00rootroot00000000000000module github.com/google/codesearch go 1.13 
codesearch-1.2.0/index/000077500000000000000000000000001364021717500147275ustar00rootroot00000000000000codesearch-1.2.0/index/merge.go000066400000000000000000000170401364021717500163570ustar00rootroot00000000000000// Copyright 2011 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package index // Merging indexes. // // To merge two indexes A and B (newer) into a combined index C: // // Load the path list from B and determine for each path the docid ranges // that it will replace in A. // // Read A's and B's name lists together, merging them into C's name list. // Discard the identified ranges from A during the merge. Also during the merge, // record the mapping from A's docids to C's docids, and also the mapping from // B's docids to C's docids. Both mappings can be summarized in a table like // // 10-14 map to 20-24 // 15-24 is deleted // 25-34 maps to 40-49 // // The number of ranges will be at most the combined number of paths. // Also during the merge, write the name index to a temporary file as usual. // // Now merge the posting lists (this is why they begin with the trigram). // During the merge, translate the docid numbers to the new C docid space. // Also during the merge, write the posting list index to a temporary file as usual. // // Copy the name index and posting list index into C's index and write the trailer. // Rename C's index onto the new index. import ( "encoding/binary" "os" "strings" ) // An idrange records that the half-open interval [lo, hi) maps to [new, new+hi-lo). type idrange struct { lo, hi, new uint32 } type postIndex struct { tri uint32 count uint32 offset uint32 } // Merge creates a new index in the file dst that corresponds to merging // the two indices src1 and src2. If both src1 and src2 claim responsibility // for a path, src2 is assumed to be newer and is given preference. 
func Merge(dst, src1, src2 string) { ix1 := Open(src1) ix2 := Open(src2) paths1 := ix1.Paths() paths2 := ix2.Paths() // Build docid maps. var i1, i2, new uint32 var map1, map2 []idrange for _, path := range paths2 { // Determine range shadowed by this path. old := i1 for i1 < uint32(ix1.numName) && ix1.Name(i1) < path { i1++ } lo := i1 limit := path[:len(path)-1] + string(path[len(path)-1]+1) for i1 < uint32(ix1.numName) && ix1.Name(i1) < limit { i1++ } hi := i1 // Record range before the shadow. if old < lo { map1 = append(map1, idrange{old, lo, new}) new += lo - old } // Determine range defined by this path. // Because we are iterating over the ix2 paths, // there can't be gaps, so it must start at i2. if i2 < uint32(ix2.numName) && ix2.Name(i2) < path { panic("merge: inconsistent index") } lo = i2 for i2 < uint32(ix2.numName) && ix2.Name(i2) < limit { i2++ } hi = i2 if lo < hi { map2 = append(map2, idrange{lo, hi, new}) new += hi - lo } } if i1 < uint32(ix1.numName) { map1 = append(map1, idrange{i1, uint32(ix1.numName), new}) new += uint32(ix1.numName) - i1 } if i2 < uint32(ix2.numName) { panic("merge: inconsistent index") } numName := new ix3 := bufCreate(dst) ix3.writeString(magic) // Merged list of paths. pathData := ix3.offset() mi1 := 0 mi2 := 0 last := "\x00" // not a prefix of anything for mi1 < len(paths1) || mi2 < len(paths2) { var p string if mi2 >= len(paths2) || mi1 < len(paths1) && paths1[mi1] <= paths2[mi2] { p = paths1[mi1] mi1++ } else { p = paths2[mi2] mi2++ } if strings.HasPrefix(p, last) { continue } last = p ix3.writeString(p) ix3.writeString("\x00") } ix3.writeString("\x00") // Merged list of names. 
nameData := ix3.offset() nameIndexFile := bufCreate("") new = 0 mi1 = 0 mi2 = 0 for new < numName { if mi1 < len(map1) && map1[mi1].new == new { for i := map1[mi1].lo; i < map1[mi1].hi; i++ { name := ix1.Name(i) nameIndexFile.writeUint32(ix3.offset() - nameData) ix3.writeString(name) ix3.writeString("\x00") new++ } mi1++ } else if mi2 < len(map2) && map2[mi2].new == new { for i := map2[mi2].lo; i < map2[mi2].hi; i++ { name := ix2.Name(i) nameIndexFile.writeUint32(ix3.offset() - nameData) ix3.writeString(name) ix3.writeString("\x00") new++ } mi2++ } else { panic("merge: inconsistent index") } } if new*4 != nameIndexFile.offset() { panic("merge: inconsistent index") } nameIndexFile.writeUint32(ix3.offset()) // Merged list of posting lists. postData := ix3.offset() var r1 postMapReader var r2 postMapReader var w postDataWriter r1.init(ix1, map1) r2.init(ix2, map2) w.init(ix3) for { if r1.trigram < r2.trigram { w.trigram(r1.trigram) for r1.nextId() { w.fileid(r1.fileid) } r1.nextTrigram() w.endTrigram() } else if r2.trigram < r1.trigram { w.trigram(r2.trigram) for r2.nextId() { w.fileid(r2.fileid) } r2.nextTrigram() w.endTrigram() } else { if r1.trigram == ^uint32(0) { break } w.trigram(r1.trigram) r1.nextId() r2.nextId() for r1.fileid < ^uint32(0) || r2.fileid < ^uint32(0) { if r1.fileid < r2.fileid { w.fileid(r1.fileid) r1.nextId() } else if r2.fileid < r1.fileid { w.fileid(r2.fileid) r2.nextId() } else { panic("merge: inconsistent index") } } r1.nextTrigram() r2.nextTrigram() w.endTrigram() } } // Name index nameIndex := ix3.offset() copyFile(ix3, nameIndexFile) // Posting list index postIndex := ix3.offset() copyFile(ix3, w.postIndexFile) ix3.writeUint32(pathData) ix3.writeUint32(nameData) ix3.writeUint32(postData) ix3.writeUint32(nameIndex) ix3.writeUint32(postIndex) ix3.writeString(trailerMagic) ix3.flush() os.Remove(nameIndexFile.name) os.Remove(w.postIndexFile.name) } type postMapReader struct { ix *Index idmap []idrange triNum uint32 trigram uint32 count 
uint32 offset uint32 d []byte oldid uint32 fileid uint32 i int } func (r *postMapReader) init(ix *Index, idmap []idrange) { r.ix = ix r.idmap = idmap r.trigram = ^uint32(0) r.load() } func (r *postMapReader) nextTrigram() { r.triNum++ r.load() } func (r *postMapReader) load() { if r.triNum >= uint32(r.ix.numPost) { r.trigram = ^uint32(0) r.count = 0 r.fileid = ^uint32(0) return } r.trigram, r.count, r.offset = r.ix.listAt(r.triNum * postEntrySize) if r.count == 0 { r.fileid = ^uint32(0) return } r.d = r.ix.slice(r.ix.postData+r.offset+3, -1) r.oldid = ^uint32(0) r.i = 0 } func (r *postMapReader) nextId() bool { for r.count > 0 { r.count-- delta64, n := binary.Uvarint(r.d) delta := uint32(delta64) if n <= 0 || delta == 0 { corrupt() } r.d = r.d[n:] r.oldid += delta for r.i < len(r.idmap) && r.idmap[r.i].hi <= r.oldid { r.i++ } if r.i >= len(r.idmap) { r.count = 0 break } if r.oldid < r.idmap[r.i].lo { continue } r.fileid = r.idmap[r.i].new + r.oldid - r.idmap[r.i].lo return true } r.fileid = ^uint32(0) return false } type postDataWriter struct { out *bufWriter postIndexFile *bufWriter buf [10]byte base uint32 count, offset uint32 last uint32 t uint32 } func (w *postDataWriter) init(out *bufWriter) { w.out = out w.postIndexFile = bufCreate("") w.base = out.offset() } func (w *postDataWriter) trigram(t uint32) { w.offset = w.out.offset() w.count = 0 w.t = t w.last = ^uint32(0) } func (w *postDataWriter) fileid(id uint32) { if w.count == 0 { w.out.writeTrigram(w.t) } w.out.writeUvarint(id - w.last) w.last = id w.count++ } func (w *postDataWriter) endTrigram() { if w.count == 0 { return } w.out.writeUvarint(0) w.postIndexFile.writeTrigram(w.t) w.postIndexFile.writeUint32(w.count) w.postIndexFile.writeUint32(w.offset - w.base) } codesearch-1.2.0/index/merge_test.go000066400000000000000000000042261364021717500174200ustar00rootroot00000000000000// Copyright 2011 The Go Authors. All rights reserved. 
// Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package index import ( "io/ioutil" "os" "testing" ) var mergePaths1 = []string{ "/a", "/b", "/c", } var mergePaths2 = []string{ "/b", "/cc", } var mergeFiles1 = map[string]string{ "/a/x": "hello world", "/a/y": "goodbye world", "/b/xx": "now is the time", "/b/xy": "for all good men", "/c/ab": "give me all the potatoes", "/c/de": "or give me death now", } var mergeFiles2 = map[string]string{ "/b/www": "world wide indeed", "/b/xx": "no, not now", "/b/yy": "first potatoes, now liberty?", "/cc": "come to the aid of his potatoes", } func TestMerge(t *testing.T) { f1, _ := ioutil.TempFile("", "index-test") f2, _ := ioutil.TempFile("", "index-test") f3, _ := ioutil.TempFile("", "index-test") defer os.Remove(f1.Name()) defer os.Remove(f2.Name()) defer os.Remove(f3.Name()) out1 := f1.Name() out2 := f2.Name() out3 := f3.Name() buildIndex(out1, mergePaths1, mergeFiles1) buildIndex(out2, mergePaths2, mergeFiles2) Merge(out3, out1, out2) ix1 := Open(out1) ix2 := Open(out2) ix3 := Open(out3) nameof := func(ix *Index) string { switch { case ix == ix1: return "ix1" case ix == ix2: return "ix2" case ix == ix3: return "ix3" } return "???" 
} checkFiles := func(ix *Index, l ...string) { for i, s := range l { if n := ix.Name(uint32(i)); n != s { t.Errorf("%s: Name(%d) = %s, want %s", nameof(ix), i, n, s) } } } checkFiles(ix1, "/a/x", "/a/y", "/b/xx", "/b/xy", "/c/ab", "/c/de") checkFiles(ix2, "/b/www", "/b/xx", "/b/yy", "/cc") checkFiles(ix3, "/a/x", "/a/y", "/b/www", "/b/xx", "/b/yy", "/c/ab", "/c/de", "/cc") check := func(ix *Index, trig string, l ...uint32) { l1 := ix.PostingList(tri(trig[0], trig[1], trig[2])) if !equalList(l1, l) { t.Errorf("PostingList(%s, %s) = %v, want %v", nameof(ix), trig, l1, l) } } check(ix1, "wor", 0, 1) check(ix1, "now", 2, 5) check(ix1, "all", 3, 4) check(ix2, "now", 1, 2) check(ix3, "all", 5) check(ix3, "wor", 0, 1, 2) check(ix3, "now", 3, 4, 6) check(ix3, "pot", 4, 5, 7) } codesearch-1.2.0/index/mmap_bsd.go000066400000000000000000000014321364021717500170400ustar00rootroot00000000000000// Copyright 2011 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // +build darwin freebsd openbsd netbsd package index import ( "log" "os" "syscall" ) // missing from package syscall on freebsd, openbsd const ( _PROT_READ = 1 _MAP_SHARED = 1 ) func mmapFile(f *os.File) mmapData { st, err := f.Stat() if err != nil { log.Fatal(err) } size := st.Size() if int64(int(size+4095)) != size+4095 { log.Fatalf("%s: too large for mmap", f.Name()) } n := int(size) if n == 0 { return mmapData{f, nil} } data, err := syscall.Mmap(int(f.Fd()), 0, (n+4095)&^4095, _PROT_READ, _MAP_SHARED) if err != nil { log.Fatalf("mmap %s: %v", f.Name(), err) } return mmapData{f, data[:n]} } codesearch-1.2.0/index/mmap_linux.go000066400000000000000000000012361364021717500174310ustar00rootroot00000000000000// Copyright 2011 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
package index import ( "log" "os" "syscall" ) func mmapFile(f *os.File) mmapData { st, err := f.Stat() if err != nil { log.Fatal(err) } size := st.Size() if int64(int(size+4095)) != size+4095 { log.Fatalf("%s: too large for mmap", f.Name()) } n := int(size) if n == 0 { return mmapData{f, nil} } data, err := syscall.Mmap(int(f.Fd()), 0, (n+4095)&^4095, syscall.PROT_READ, syscall.MAP_SHARED) if err != nil { log.Fatalf("mmap %s: %v", f.Name(), err) } return mmapData{f, data[:n]} } codesearch-1.2.0/index/mmap_windows.go000066400000000000000000000016071364021717500177660ustar00rootroot00000000000000// Copyright 2011 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package index import ( "log" "os" "syscall" "unsafe" ) func mmapFile(f *os.File) mmapData { st, err := f.Stat() if err != nil { log.Fatal(err) } size := st.Size() if int64(int(size+4095)) != size+4095 { log.Fatalf("%s: too large for mmap", f.Name()) } if size == 0 { return mmapData{f, nil} } h, err := syscall.CreateFileMapping(syscall.Handle(f.Fd()), nil, syscall.PAGE_READONLY, uint32(size>>32), uint32(size), nil) if err != nil { log.Fatalf("CreateFileMapping %s: %v", f.Name(), err) } addr, err := syscall.MapViewOfFile(h, syscall.FILE_MAP_READ, 0, 0, 0) if err != nil { log.Fatalf("MapViewOfFile %s: %v", f.Name(), err) } data := (*[1 << 30]byte)(unsafe.Pointer(addr)) return mmapData{f, data[:size]} } codesearch-1.2.0/index/read.go000066400000000000000000000245771364021717500162100ustar00rootroot00000000000000// Copyright 2011 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package index // Index format. 
// // An index stored on disk has the format: // // "csearch index 1\n" // list of paths // list of names // list of posting lists // name index // posting list index // trailer // // The list of paths is a sorted sequence of NUL-terminated file or directory names. // The index covers the file trees rooted at those paths. // The list ends with an empty name ("\x00"). // // The list of names is a sorted sequence of NUL-terminated file names. // The initial entry in the list corresponds to file #0, // the next to file #1, and so on. The list ends with an // empty name ("\x00"). // // The list of posting lists are a sequence of posting lists. // Each posting list has the form: // // trigram [3] // deltas [v]... // // The trigram gives the 3 byte trigram that this list describes. The // delta list is a sequence of varint-encoded deltas between file // IDs, ending with a zero delta. For example, the delta list [2,5,1,1,0] // encodes the file ID list 1, 6, 7, 8. The delta list [0] would // encode the empty file ID list, but empty posting lists are usually // not recorded at all. The list of posting lists ends with an entry // with trigram "\xff\xff\xff" and a delta list consisting a single zero. // // The indexes enable efficient random access to the lists. The name // index is a sequence of 4-byte big-endian values listing the byte // offset in the name list where each name begins. The posting list // index is a sequence of index entries describing each successive // posting list. Each index entry has the form: // // trigram [3] // file count [4] // offset [4] // // Index entries are only written for the non-empty posting lists, // so finding the posting list for a specific trigram requires a // binary search over the posting list index. In practice, the majority // of the possible trigrams are never seen, so omitting the missing // ones represents a significant storage savings. 
// // The trailer has the form: // // offset of path list [4] // offset of name list [4] // offset of posting lists [4] // offset of name index [4] // offset of posting list index [4] // "\ncsearch trailr\n" import ( "bytes" "encoding/binary" "log" "os" "path/filepath" "runtime" "sort" ) const ( magic = "csearch index 1\n" trailerMagic = "\ncsearch trailr\n" ) // An Index implements read-only access to a trigram index. type Index struct { Verbose bool data mmapData pathData uint32 nameData uint32 postData uint32 nameIndex uint32 postIndex uint32 numName int numPost int } const postEntrySize = 3 + 4 + 4 func Open(file string) *Index { mm := mmap(file) if len(mm.d) < 4*4+len(trailerMagic) || string(mm.d[len(mm.d)-len(trailerMagic):]) != trailerMagic { corrupt() } n := uint32(len(mm.d) - len(trailerMagic) - 5*4) ix := &Index{data: mm} ix.pathData = ix.uint32(n) ix.nameData = ix.uint32(n + 4) ix.postData = ix.uint32(n + 8) ix.nameIndex = ix.uint32(n + 12) ix.postIndex = ix.uint32(n + 16) ix.numName = int((ix.postIndex-ix.nameIndex)/4) - 1 ix.numPost = int((n - ix.postIndex) / postEntrySize) return ix } // slice returns the slice of index data starting at the given byte offset. // If n >= 0, the slice must have length at least n and is truncated to length n. func (ix *Index) slice(off uint32, n int) []byte { o := int(off) if uint32(o) != off || n >= 0 && o+n > len(ix.data.d) { corrupt() } if n < 0 { return ix.data.d[o:] } return ix.data.d[o : o+n] } // uint32 returns the uint32 value at the given offset in the index data. func (ix *Index) uint32(off uint32) uint32 { return binary.BigEndian.Uint32(ix.slice(off, 4)) } // uvarint returns the varint value at the given offset in the index data. func (ix *Index) uvarint(off uint32) uint32 { v, n := binary.Uvarint(ix.slice(off, -1)) if n <= 0 { corrupt() } return uint32(v) } // Paths returns the list of indexed paths. 
func (ix *Index) Paths() []string { off := ix.pathData var x []string for { s := ix.str(off) if len(s) == 0 { break } x = append(x, string(s)) off += uint32(len(s) + 1) } return x } // NameBytes returns the name corresponding to the given fileid. func (ix *Index) NameBytes(fileid uint32) []byte { off := ix.uint32(ix.nameIndex + 4*fileid) return ix.str(ix.nameData + off) } func (ix *Index) str(off uint32) []byte { str := ix.slice(off, -1) i := bytes.IndexByte(str, '\x00') if i < 0 { corrupt() } return str[:i] } // Name returns the name corresponding to the given fileid. func (ix *Index) Name(fileid uint32) string { return string(ix.NameBytes(fileid)) } // listAt returns the index list entry at the given offset. func (ix *Index) listAt(off uint32) (trigram, count, offset uint32) { d := ix.slice(ix.postIndex+off, postEntrySize) trigram = uint32(d[0])<<16 | uint32(d[1])<<8 | uint32(d[2]) count = binary.BigEndian.Uint32(d[3:]) offset = binary.BigEndian.Uint32(d[3+4:]) return } func (ix *Index) dumpPosting() { d := ix.slice(ix.postIndex, postEntrySize*ix.numPost) for i := 0; i < ix.numPost; i++ { j := i * postEntrySize t := uint32(d[j])<<16 | uint32(d[j+1])<<8 | uint32(d[j+2]) count := int(binary.BigEndian.Uint32(d[j+3:])) offset := binary.BigEndian.Uint32(d[j+3+4:]) log.Printf("%#x: %d at %d", t, count, offset) } } func (ix *Index) findList(trigram uint32) (count int, offset uint32) { // binary search d := ix.slice(ix.postIndex, postEntrySize*ix.numPost) i := sort.Search(ix.numPost, func(i int) bool { i *= postEntrySize t := uint32(d[i])<<16 | uint32(d[i+1])<<8 | uint32(d[i+2]) return t >= trigram }) if i >= ix.numPost { return 0, 0 } i *= postEntrySize t := uint32(d[i])<<16 | uint32(d[i+1])<<8 | uint32(d[i+2]) if t != trigram { return 0, 0 } count = int(binary.BigEndian.Uint32(d[i+3:])) offset = binary.BigEndian.Uint32(d[i+3+4:]) return } type postReader struct { ix *Index count int offset uint32 fileid uint32 d []byte restrict []uint32 } func (r *postReader) init(ix 
*Index, trigram uint32, restrict []uint32) { count, offset := ix.findList(trigram) if count == 0 { return } r.ix = ix r.count = count r.offset = offset r.fileid = ^uint32(0) r.d = ix.slice(ix.postData+offset+3, -1) r.restrict = restrict } func (r *postReader) max() int { return int(r.count) } func (r *postReader) next() bool { for r.count > 0 { r.count-- delta64, n := binary.Uvarint(r.d) delta := uint32(delta64) if n <= 0 || delta == 0 { corrupt() } r.d = r.d[n:] r.fileid += delta if r.restrict != nil { i := 0 for i < len(r.restrict) && r.restrict[i] < r.fileid { i++ } r.restrict = r.restrict[i:] if len(r.restrict) == 0 || r.restrict[0] != r.fileid { continue } } return true } // list should end with terminating 0 delta if r.d != nil && (len(r.d) == 0 || r.d[0] != 0) { corrupt() } r.fileid = ^uint32(0) return false } func (ix *Index) PostingList(trigram uint32) []uint32 { return ix.postingList(trigram, nil) } func (ix *Index) postingList(trigram uint32, restrict []uint32) []uint32 { var r postReader r.init(ix, trigram, restrict) x := make([]uint32, 0, r.max()) for r.next() { x = append(x, r.fileid) } return x } func (ix *Index) PostingAnd(list []uint32, trigram uint32) []uint32 { return ix.postingAnd(list, trigram, nil) } func (ix *Index) postingAnd(list []uint32, trigram uint32, restrict []uint32) []uint32 { var r postReader r.init(ix, trigram, restrict) x := list[:0] i := 0 for r.next() { fileid := r.fileid for i < len(list) && list[i] < fileid { i++ } if i < len(list) && list[i] == fileid { x = append(x, fileid) i++ } } return x } func (ix *Index) PostingOr(list []uint32, trigram uint32) []uint32 { return ix.postingOr(list, trigram, nil) } func (ix *Index) postingOr(list []uint32, trigram uint32, restrict []uint32) []uint32 { var r postReader r.init(ix, trigram, restrict) x := make([]uint32, 0, len(list)+r.max()) i := 0 for r.next() { fileid := r.fileid for i < len(list) && list[i] < fileid { x = append(x, list[i]) i++ } x = append(x, fileid) if i < len(list) && 
list[i] == fileid { i++ } } x = append(x, list[i:]...) return x } func (ix *Index) PostingQuery(q *Query) []uint32 { return ix.postingQuery(q, nil) } func (ix *Index) postingQuery(q *Query, restrict []uint32) (ret []uint32) { var list []uint32 switch q.Op { case QNone: // nothing case QAll: if restrict != nil { return restrict } list = make([]uint32, ix.numName) for i := range list { list[i] = uint32(i) } return list case QAnd: for _, t := range q.Trigram { tri := uint32(t[0])<<16 | uint32(t[1])<<8 | uint32(t[2]) if list == nil { list = ix.postingList(tri, restrict) } else { list = ix.postingAnd(list, tri, restrict) } if len(list) == 0 { return nil } } for _, sub := range q.Sub { if list == nil { list = restrict } list = ix.postingQuery(sub, list) if len(list) == 0 { return nil } } case QOr: for _, t := range q.Trigram { tri := uint32(t[0])<<16 | uint32(t[1])<<8 | uint32(t[2]) if list == nil { list = ix.postingList(tri, restrict) } else { list = ix.postingOr(list, tri, restrict) } } for _, sub := range q.Sub { list1 := ix.postingQuery(sub, restrict) list = mergeOr(list, list1) } } return list } func mergeOr(l1, l2 []uint32) []uint32 { var l []uint32 i := 0 j := 0 for i < len(l1) || j < len(l2) { switch { case j == len(l2) || (i < len(l1) && l1[i] < l2[j]): l = append(l, l1[i]) i++ case i == len(l1) || (j < len(l2) && l1[i] > l2[j]): l = append(l, l2[j]) j++ case l1[i] == l2[j]: l = append(l, l1[i]) i++ j++ } } return l } func corrupt() { log.Fatal("corrupt index: remove " + File()) } // An mmapData is mmap'ed read-only data from a file. type mmapData struct { f *os.File d []byte } // mmap maps the given file into memory. func mmap(file string) mmapData { f, err := os.Open(file) if err != nil { log.Fatal(err) } return mmapFile(f) } // File returns the name of the index file to use. // It is either $CSEARCHINDEX or $HOME/.csearchindex. 
func File() string {
	// An explicit $CSEARCHINDEX always wins.
	f := os.Getenv("CSEARCHINDEX")
	if f != "" {
		return f
	}
	var home string
	home = os.Getenv("HOME")
	// On Windows, fall back to %USERPROFILE% when $HOME is unset.
	if runtime.GOOS == "windows" && home == "" {
		home = os.Getenv("USERPROFILE")
	}
	return filepath.Clean(home + "/.csearchindex")
}
codesearch-1.2.0/index/read_test.go000066400000000000000000000033271364021717500172350ustar00rootroot00000000000000
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package index

import (
	"io/ioutil"
	"os"
	"testing"
)

// postFiles maps file names to contents for the posting-list test below.
var postFiles = map[string]string{
	"file0": "",
	"file1": "Google Code Search",
	"file2": "Google Code Project Hosting",
	"file3": "Google Web Search",
}

// tri packs three bytes into the 24-bit trigram value used as an index key.
func tri(x, y, z byte) uint32 {
	return uint32(x)<<16 | uint32(y)<<8 | uint32(z)
}

// TestTrivialPosting builds an index from postFiles and checks PostingList,
// PostingAnd and PostingOr for the trigrams "Sea" and "Goo".
func TestTrivialPosting(t *testing.T) {
	// NOTE(review): the TempFile error is discarded; a failure here would
	// surface as a nil-pointer panic on f.Name().
	f, _ := ioutil.TempFile("", "index-test")
	defer os.Remove(f.Name())
	out := f.Name()
	buildIndex(out, nil, postFiles)

	ix := Open(out)
	if l := ix.PostingList(tri('S', 'e', 'a')); !equalList(l, []uint32{1, 3}) {
		t.Errorf("PostingList(Sea) = %v, want [1 3]", l)
	}
	if l := ix.PostingList(tri('G', 'o', 'o')); !equalList(l, []uint32{1, 2, 3}) {
		t.Errorf("PostingList(Goo) = %v, want [1 2 3]", l)
	}
	if l := ix.PostingAnd(ix.PostingList(tri('S', 'e', 'a')), tri('G', 'o', 'o')); !equalList(l, []uint32{1, 3}) {
		t.Errorf("PostingList(Sea&Goo) = %v, want [1 3]", l)
	}
	if l := ix.PostingAnd(ix.PostingList(tri('G', 'o', 'o')), tri('S', 'e', 'a')); !equalList(l, []uint32{1, 3}) {
		t.Errorf("PostingList(Goo&Sea) = %v, want [1 3]", l)
	}
	if l := ix.PostingOr(ix.PostingList(tri('S', 'e', 'a')), tri('G', 'o', 'o')); !equalList(l, []uint32{1, 2, 3}) {
		t.Errorf("PostingList(Sea|Goo) = %v, want [1 2 3]", l)
	}
	if l := ix.PostingOr(ix.PostingList(tri('G', 'o', 'o')), tri('S', 'e', 'a')); !equalList(l, []uint32{1, 2, 3}) {
		t.Errorf("PostingList(Goo|Sea) = %v, want [1 2 3]", l)
	}
}

// equalList reports whether x and y have the same length and elements.
func equalList(x, y []uint32) bool {
	if len(x) != len(y) {
		return false
	}
	for i, xi := range x {
		if xi != y[i] {
			return false
		}
	}
	return true
}
codesearch-1.2.0/index/regexp.go000066400000000000000000000500371364021717500165550ustar00rootroot00000000000000
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package index

import (
	"regexp/syntax"
	"sort"
	"strconv"
	"strings"
	"unicode"
)

// A Query is a matching machine, like a regular expression,
// that matches some text and not other text. When we compute a
// Query from a regexp, the Query is a conservative version of the
// regexp: it matches everything the regexp would match, and probably
// quite a bit more. We can then filter target files by whether they match
// the Query (using a trigram index) before running the comparatively
// more expensive regexp machinery.
type Query struct {
	Op      QueryOp
	Trigram []string
	Sub     []*Query
}

// A QueryOp says how the Trigram and Sub lists of a Query are combined.
type QueryOp int

const (
	QAll  QueryOp = iota // Everything matches
	QNone                // Nothing matches
	QAnd                 // All in Sub and Trigram must match
	QOr                  // At least one in Sub or Trigram must match
)

// Shared constant queries for "matches everything" and "matches nothing".
var allQuery = &Query{Op: QAll}
var noneQuery = &Query{Op: QNone}

// and returns the query q AND r, possibly reusing q's and r's storage.
func (q *Query) and(r *Query) *Query {
	return q.andOr(r, QAnd)
}

// or returns the query q OR r, possibly reusing q's and r's storage.
func (q *Query) or(r *Query) *Query {
	return q.andOr(r, QOr)
}

// andOr returns the query q AND r or q OR r, possibly reusing q's and r's storage.
// It works hard to avoid creating unnecessarily complicated structures.
//
// Both q and r may be modified in place (Op, Trigram and Sub are rewritten
// on several paths below), so callers must not rely on them afterwards.
func (q *Query) andOr(r *Query, op QueryOp) (out *Query) {
	// opstr is only consumed by the commented-out debug prints.
	opstr := "&"
	if op == QOr {
		opstr = "|"
	}
	//println("andOr", q.String(), opstr, r.String())
	//defer func() { println(" ->", out.String()) }()
	_ = opstr

	// Unwrap a node that holds nothing but a single sub-query.
	if len(q.Trigram) == 0 && len(q.Sub) == 1 {
		q = q.Sub[0]
	}
	if len(r.Trigram) == 0 && len(r.Sub) == 1 {
		r = r.Sub[0]
	}

	// Boolean simplification.
	// If q ⇒ r, q AND r ≡ q.
	// If q ⇒ r, q OR r ≡ r.
	if q.implies(r) {
		//println(q.String(), "implies", r.String())
		if op == QAnd {
			return q
		}
		return r
	}
	if r.implies(q) {
		//println(r.String(), "implies", q.String())
		if op == QAnd {
			return r
		}
		return q
	}

	// Both q and r are QAnd or QOr.
	// If they match or can be made to match, merge.
	// An "atom" is a node with exactly one trigram and no sub-queries;
	// its Op is irrelevant, so it can be folded into either kind of node.
	qAtom := len(q.Trigram) == 1 && len(q.Sub) == 0
	rAtom := len(r.Trigram) == 1 && len(r.Sub) == 0
	if q.Op == op && (r.Op == op || rAtom) {
		q.Trigram = stringSet.union(q.Trigram, r.Trigram, false)
		q.Sub = append(q.Sub, r.Sub...)
		return q
	}
	if r.Op == op && qAtom {
		r.Trigram = stringSet.union(r.Trigram, q.Trigram, false)
		return r
	}
	if qAtom && rAtom {
		q.Op = op
		// NOTE(review): unlike the union calls above, this append does not
		// sort the result — confirm whether callers rely on sorted Trigram.
		q.Trigram = append(q.Trigram, r.Trigram...)
		return q
	}

	// If one matches the op, add the other to it.
	if q.Op == op {
		q.Sub = append(q.Sub, r)
		return q
	}
	if r.Op == op {
		r.Sub = append(r.Sub, q)
		return r
	}

	// We are creating an AND of ORs or an OR of ANDs.
	// Factor out common trigrams, if any.
	// This is a merge walk over the two trigram lists: unique entries are
	// compacted in place (write cursors wi, wj) and shared ones move to common.
	common := stringSet{}
	i, j := 0, 0
	wi, wj := 0, 0
	for i < len(q.Trigram) && j < len(r.Trigram) {
		qt, rt := q.Trigram[i], r.Trigram[j]
		if qt < rt {
			q.Trigram[wi] = qt
			wi++
			i++
		} else if qt > rt {
			r.Trigram[wj] = rt
			wj++
			j++
		} else {
			common = append(common, qt)
			i++
			j++
		}
	}
	// Keep whatever remains on either side after the other ran out.
	for ; i < len(q.Trigram); i++ {
		q.Trigram[wi] = q.Trigram[i]
		wi++
	}
	for ; j < len(r.Trigram); j++ {
		r.Trigram[wj] = r.Trigram[j]
		wj++
	}
	q.Trigram = q.Trigram[:wi]
	r.Trigram = r.Trigram[:wj]
	if len(common) > 0 {
		// If there were common trigrams, rewrite
		//
		//	(abc|def|ghi|jkl) AND (abc|def|mno|prs) =>
		//		(abc|def) OR ((ghi|jkl) AND (mno|prs))
		//
		//	(abc&def&ghi&jkl) OR (abc&def&mno&prs) =>
		//		(abc&def) AND ((ghi&jkl) OR (mno&prs))
		//
		// Build up the right one of
		//	(ghi|jkl) AND (mno|prs)
		//	(ghi&jkl) OR (mno&prs)
		// Call andOr recursively in case q and r can now be simplified
		// (we removed some trigrams).
		s := q.andOr(r, op)

		// Add in factored trigrams.
		// QAnd + QOr - op yields the opposite of op (QAnd <-> QOr).
		otherOp := QAnd + QOr - op
		t := &Query{Op: otherOp, Trigram: common}
		return t.andOr(s, t.Op)
	}

	// Otherwise just create the op.
	return &Query{Op: op, Sub: []*Query{q, r}}
}

// implies reports whether q implies r.
// It is okay for it to return false negatives.
func (q *Query) implies(r *Query) bool {
	if q.Op == QNone || r.Op == QAll {
		// False implies everything.
		// Everything implies True.
		return true
	}
	if q.Op == QAll || r.Op == QNone {
		// True implies nothing.
		// Nothing implies False.
		return false
	}

	// An AND node (or a single-trigram OR) guarantees all of its trigrams,
	// so check whether those trigrams alone satisfy r.
	if q.Op == QAnd || (q.Op == QOr && len(q.Trigram) == 1 && len(q.Sub) == 0) {
		return trigramsImply(q.Trigram, r)
	}

	// An OR of plain trigrams implies any OR that lists a superset of them.
	if q.Op == QOr && r.Op == QOr &&
		len(q.Trigram) > 0 && len(q.Sub) == 0 &&
		stringSet.isSubsetOf(q.Trigram, r.Trigram) {
		return true
	}
	return false
}

// trigramsImply reports whether the presence of every trigram in t
// is enough to satisfy q. Only QOr and QAnd nodes are examined;
// any other Op yields false.
func trigramsImply(t []string, q *Query) bool {
	switch q.Op {
	case QOr:
		// One satisfied alternative is enough.
		for _, qq := range q.Sub {
			if trigramsImply(t, qq) {
				return true
			}
		}
		for i := range t {
			if stringSet.isSubsetOf(t[i:i+1], q.Trigram) {
				return true
			}
		}
		return false
	case QAnd:
		// Every sub-query and every trigram of q must be covered by t.
		for _, qq := range q.Sub {
			if !trigramsImply(t, qq) {
				return false
			}
		}
		if !stringSet.isSubsetOf(q.Trigram, t) {
			return false
		}
		return true
	}
	return false
}

// maybeRewrite rewrites q to use op if it is possible to do so
// without changing the meaning. It also simplifies if the node
// is an empty OR or AND.
func (q *Query) maybeRewrite(op QueryOp) {
	if q.Op != QAnd && q.Op != QOr {
		return
	}

	// AND/OR doing real work? Can't rewrite.
	n := len(q.Sub) + len(q.Trigram)
	if n > 1 {
		return
	}

	// Nothing left in the AND/OR?
	if n == 0 {
		if q.Op == QAnd {
			q.Op = QAll
		} else {
			q.Op = QNone
		}
		return
	}

	// Just a sub-node: throw away wrapper.
	if len(q.Sub) == 1 {
		*q = *q.Sub[0]
	}

	// Just a trigram: can use either op.
	q.Op = op
}

// andTrigrams returns q AND the OR of the AND of the trigrams present in each string.
func (q *Query) andTrigrams(t stringSet) *Query {
	if t.minLen() < 3 {
		// If there is a short string, we can't guarantee
		// that any trigrams must be present, so use ALL.
		// q AND ALL = q.
		return q
	}

	//println("andtrigrams", strings.Join(t, ","))
	or := noneQuery
	for _, tt := range t {
		// Collect every 3-byte window of tt.
		var trig stringSet
		for i := 0; i+3 <= len(tt); i++ {
			trig.add(tt[i : i+3])
		}
		trig.clean(false)
		//println(tt, "trig", strings.Join(trig, ","))
		or = or.or(&Query{Op: QAnd, Trigram: trig})
	}
	q = q.and(or)
	return q
}

// String returns a compact text form of q: "?" for nil, "-" for QNone,
// "+" for QAll, a quoted trigram for a single-trigram node, space-joined
// terms for QAnd, and parenthesized "|"-joined terms for QOr.
func (q *Query) String() string {
	if q == nil {
		return "?"
	}
	if q.Op == QNone {
		return "-"
	}
	if q.Op == QAll {
		return "+"
	}

	if len(q.Sub) == 0 && len(q.Trigram) == 1 {
		return strconv.Quote(q.Trigram[0])
	}

	var (
		s     string // accumulated output
		sjoin string // separator between sub-queries
		end   string // closing text
		tjoin string // separator between trigrams
	)
	if q.Op == QAnd {
		sjoin = " "
		tjoin = " "
	} else {
		s = "("
		sjoin = ")|("
		end = ")"
		tjoin = "|"
	}
	for i, t := range q.Trigram {
		if i > 0 {
			s += tjoin
		}
		s += strconv.Quote(t)
	}
	if len(q.Sub) > 0 {
		if len(q.Trigram) > 0 {
			s += sjoin
		}
		s += q.Sub[0].String()
		for i := 1; i < len(q.Sub); i++ {
			s += sjoin + q.Sub[i].String()
		}
	}
	s += end
	return s
}

// RegexpQuery returns a Query for the given regexp.
func RegexpQuery(re *syntax.Regexp) *Query {
	info := analyze(re)
	// Force any remaining exact strings of length >= 3 into the match query.
	info.simplify(true)
	info.addExact()
	return info.match
}

// A regexpInfo summarizes the results of analyzing a regexp.
type regexpInfo struct {
	// canEmpty records whether the regexp matches the empty string
	canEmpty bool

	// exact is the exact set of strings matching the regexp.
	exact stringSet

	// if exact is nil, prefix is the set of possible match prefixes,
	// and suffix is the set of possible match suffixes.
	prefix stringSet // otherwise: the exact set of matching prefixes ...
	suffix stringSet // ... and suffixes

	// match records a query that must be satisfied by any
	// match for the regexp, in addition to the information
	// recorded above.
	match *Query
}

const (
	// Exact sets are limited to maxExact strings.
	// If they get too big, simplify will rewrite the regexpInfo
	// to use prefix and suffix instead. It's not worthwhile for
	// this to be bigger than maxSet.
	// Because we allow the maximum length of an exact string
	// to grow to 5 below (see simplify), it helps to avoid ridiculous
	// alternations if maxExact is sized so that 3 case-insensitive letters
	// triggers a flush.
	maxExact = 7

	// Prefix and suffix sets are limited to maxSet strings.
	// If they get too big, simplify will replace groups of strings
	// sharing a common leading prefix (or trailing suffix) with
	// that common prefix (or suffix). It is useful for maxSet
	// to be at least 2³ = 8 so that we can exactly
	// represent a case-insensitive abc by the set
	// {abc, abC, aBc, aBC, Abc, AbC, ABc, ABC}.
	maxSet = 20
)

// anyMatch returns the regexpInfo describing a regexp that
// matches any string.
func anyMatch() regexpInfo {
	return regexpInfo{
		canEmpty: true,
		prefix:   []string{""},
		suffix:   []string{""},
		match:    allQuery,
	}
}

// anyChar returns the regexpInfo describing a regexp that
// matches any single character.
func anyChar() regexpInfo {
	return regexpInfo{
		prefix: []string{""},
		suffix: []string{""},
		match:  allQuery,
	}
}

// noMatch returns the regexpInfo describing a regexp that
// matches no strings at all.
func noMatch() regexpInfo {
	return regexpInfo{
		match: noneQuery,
	}
}

// emptyString returns the regexpInfo describing a regexp that
// matches only the empty string.
func emptyString() regexpInfo {
	return regexpInfo{
		canEmpty: true,
		exact:    []string{""},
		match:    allQuery,
	}
}

// analyze returns the regexpInfo for the regexp re.
func analyze(re *syntax.Regexp) (ret regexpInfo) {
	//println("analyze", re.String())
	//defer func() { println("->", ret.String()) }()
	var info regexpInfo
	switch re.Op {
	case syntax.OpNoMatch:
		return noMatch()

	// Zero-width assertions consume no text, so they behave like the
	// empty string for trigram purposes.
	case syntax.OpEmptyMatch,
		syntax.OpBeginLine, syntax.OpEndLine,
		syntax.OpBeginText, syntax.OpEndText,
		syntax.OpWordBoundary, syntax.OpNoWordBoundary:
		return emptyString()

	case syntax.OpLiteral:
		if re.Flags&syntax.FoldCase != 0 {
			switch len(re.Rune) {
			case 0:
				return emptyString()
			case 1:
				// Single-letter case-folded string:
				// rewrite into char class and analyze.
				re1 := &syntax.Regexp{
					Op: syntax.OpCharClass,
				}
				re1.Rune = re1.Rune0[:0]
				r0 := re.Rune[0]
				re1.Rune = append(re1.Rune, r0, r0)
				// SimpleFold cycles through the case variants of r0;
				// add each as a one-rune range until the cycle closes.
				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
					re1.Rune = append(re1.Rune, r1, r1)
				}
				info = analyze(re1)
				return info
			}
			// Multi-letter case-folded string:
			// treat as concatenation of single-letter case-folded strings.
			re1 := &syntax.Regexp{
				Op:    syntax.OpLiteral,
				Flags: syntax.FoldCase,
			}
			info = emptyString()
			for i := range re.Rune {
				re1.Rune = re.Rune[i : i+1]
				info = concat(info, analyze(re1))
			}
			return info
		}
		// Case-sensitive literal: exactly one possible string.
		info.exact = stringSet{string(re.Rune)}
		info.match = allQuery

	case syntax.OpAnyCharNotNL, syntax.OpAnyChar:
		return anyChar()

	case syntax.OpCapture:
		return analyze(re.Sub[0])

	case syntax.OpConcat:
		return fold(concat, re.Sub, emptyString())

	case syntax.OpAlternate:
		return fold(alternate, re.Sub, noMatch())

	case syntax.OpQuest:
		// x? is x|(empty).
		return alternate(analyze(re.Sub[0]), emptyString())

	case syntax.OpStar:
		// We don't know anything, so assume the worst.
		return anyMatch()

	case syntax.OpRepeat:
		if re.Min == 0 {
			// Like OpStar
			return anyMatch()
		}
		fallthrough
	case syntax.OpPlus:
		// x+
		// Since there has to be at least one x, the prefixes and suffixes
		// stay the same. If x was exact, it isn't anymore.
		info = analyze(re.Sub[0])
		if info.exact.have() {
			info.prefix = info.exact
			// copy so prefix and suffix do not share backing storage.
			info.suffix = info.exact.copy()
			info.exact = nil
		}

	case syntax.OpCharClass:
		info.match = allQuery

		// Special case.
		if len(re.Rune) == 0 {
			return noMatch()
		}

		// Special case.
		if len(re.Rune) == 1 {
			info.exact = stringSet{string(re.Rune[0])}
			break
		}

		// re.Rune is read as (lo, hi) pairs; n sums hi-lo over all pairs.
		n := 0
		for i := 0; i < len(re.Rune); i += 2 {
			n += int(re.Rune[i+1] - re.Rune[i])
		}
		// If the class is too large, it's okay to overestimate.
		if n > 100 {
			return anyChar()
		}

		// Enumerate every rune in the class as a one-character exact string.
		info.exact = []string{}
		for i := 0; i < len(re.Rune); i += 2 {
			lo, hi := re.Rune[i], re.Rune[i+1]
			for rr := lo; rr <= hi; rr++ {
				info.exact.add(string(rr))
			}
		}
	}

	info.simplify(false)
	return info
}

// fold is the usual higher-order function.
// It left-folds f over analyze(sub[i]); zero is returned only when sub is
// empty, and a single element is analyzed without calling f at all.
func fold(f func(x, y regexpInfo) regexpInfo, sub []*syntax.Regexp, zero regexpInfo) regexpInfo {
	if len(sub) == 0 {
		return zero
	}
	if len(sub) == 1 {
		return analyze(sub[0])
	}
	info := f(analyze(sub[0]), analyze(sub[1]))
	for i := 2; i < len(sub); i++ {
		info = f(info, analyze(sub[i]))
	}
	return info
}

// concat returns the regexp info for xy given x and y.
func concat(x, y regexpInfo) (out regexpInfo) {
	//println("concat", x.String(), "...", y.String())
	//defer func() { println("->", out.String()) }()
	var xy regexpInfo
	xy.match = x.match.and(y.match)
	if x.exact.have() && y.exact.have() {
		xy.exact = x.exact.cross(y.exact, false)
	} else {
		// Prefixes of xy: x's exact strings extended by y's prefixes,
		// or x's prefixes (plus y's when x can match empty).
		if x.exact.have() {
			xy.prefix = x.exact.cross(y.prefix, false)
		} else {
			xy.prefix = x.prefix
			if x.canEmpty {
				xy.prefix = xy.prefix.union(y.prefix, false)
			}
		}
		// Suffixes of xy, symmetrically.
		if y.exact.have() {
			xy.suffix = x.suffix.cross(y.exact, true)
		} else {
			xy.suffix = y.suffix
			if y.canEmpty {
				xy.suffix = xy.suffix.union(x.suffix, true)
			}
		}
	}

	// If all the possible strings in the cross product of x.suffix
	// and y.prefix are long enough, then the trigram for one
	// of them must be present and would not necessarily be
	// accounted for in xy.prefix or xy.suffix yet. Cut things off
	// at maxSet just to keep the sets manageable.
	if !x.exact.have() && !y.exact.have() &&
		x.suffix.size() <= maxSet && y.prefix.size() <= maxSet &&
		x.suffix.minLen()+y.prefix.minLen() >= 3 {
		xy.match = xy.match.andTrigrams(x.suffix.cross(y.prefix, false))
	}

	xy.simplify(false)
	return xy
}

// alternate returns the regexpInfo for x|y given x and y.
func alternate(x, y regexpInfo) (out regexpInfo) {
	//println("alternate", x.String(), "...", y.String())
	//defer func() { println("->", out.String()) }()
	var xy regexpInfo
	if x.exact.have() && y.exact.have() {
		xy.exact = x.exact.union(y.exact, false)
	} else if x.exact.have() {
		xy.prefix = x.exact.union(y.prefix, false)
		xy.suffix = x.exact.union(y.suffix, true)
		// x's exact set is being dropped, so fold its trigrams into
		// x.match before the two match queries are OR'ed below.
		x.addExact()
	} else if y.exact.have() {
		xy.prefix = x.prefix.union(y.exact, false)
		xy.suffix = x.suffix.union(y.exact.copy(), true)
		y.addExact()
	} else {
		xy.prefix = x.prefix.union(y.prefix, false)
		xy.suffix = x.suffix.union(y.suffix, true)
	}
	xy.canEmpty = x.canEmpty || y.canEmpty
	xy.match = x.match.or(y.match)

	xy.simplify(false)
	return xy
}

// addExact adds to the match query the trigrams for matching info.exact.
func (info *regexpInfo) addExact() {
	if info.exact.have() {
		info.match = info.match.andTrigrams(info.exact)
	}
}

// simplify simplifies the regexpInfo when the exact set gets too large.
// With force set, an exact set whose shortest string has at least three
// bytes is converted as well.
func (info *regexpInfo) simplify(force bool) {
	//println(" simplify", info.String(), " force=", force)
	//defer func() { println(" ->", info.String()) }()
	// If there are now too many exact strings,
	// loop over them, adding trigrams and moving
	// the relevant pieces into prefix and suffix.
	info.exact.clean(false)
	if len(info.exact) > maxExact || (info.exact.minLen() >= 3 && force) || info.exact.minLen() >= 4 {
		info.addExact()
		for _, s := range info.exact {
			n := len(s)
			if n < 3 {
				info.prefix.add(s)
				info.suffix.add(s)
			} else {
				// Two bytes at each end suffice: joined with a neighbour
				// they can still form a trigram across the boundary.
				info.prefix.add(s[:2])
				info.suffix.add(s[n-2:])
			}
		}
		info.exact = nil
	}

	if !info.exact.have() {
		info.simplifySet(&info.prefix)
		info.simplifySet(&info.suffix)
	}
}

// simplifySet reduces the size of the given set (either prefix or suffix).
// There is no need to pass around enormous prefix or suffix sets, since
// they will only be used to create trigrams. As they get too big, simplifySet
// moves the information they contain into the match query, which is
// more efficient to pass around.
//
// s must be &info.prefix or &info.suffix; the pointer identity selects
// prefix or suffix handling throughout.
func (info *regexpInfo) simplifySet(s *stringSet) {
	t := *s
	t.clean(s == &info.suffix)

	// Add the OR of the current prefix/suffix set to the query.
	info.match = info.match.andTrigrams(t)

	// The loop always runs once with n == 3 (truncating to 2 bytes) and
	// keeps shortening while the set is still larger than maxSet.
	for n := 3; n == 3 || t.size() > maxSet; n-- {
		// Replace set by strings of length n-1.
		w := 0
		for _, str := range t {
			if len(str) >= n {
				if s == &info.prefix {
					str = str[:n-1]
				} else {
					str = str[len(str)-n+1:]
				}
			}
			if w == 0 || t[w-1] != str {
				t[w] = str
				w++
			}
		}
		t = t[:w]
		t.clean(s == &info.suffix)
	}

	// Now make sure that the prefix/suffix sets aren't redundant.
	// For example, if we know "ab" is a possible prefix, then it
	// doesn't help at all to know that "abc" is also a possible
	// prefix, so delete "abc".
	w := 0
	f := strings.HasPrefix
	if s == &info.suffix {
		f = strings.HasSuffix
	}
	for _, str := range t {
		if w == 0 || !f(str, t[w-1]) {
			t[w] = str
			w++
		}
	}
	t = t[:w]

	*s = t
}

// String returns a debugging description of info: the canEmpty flag,
// either the exact set or the prefix and suffix sets, and the match query.
func (info regexpInfo) String() string {
	s := ""
	if info.canEmpty {
		s += "canempty "
	}
	if info.exact.have() {
		s += "exact:" + strings.Join(info.exact, ",")
	} else {
		s += "prefix:" + strings.Join(info.prefix, ",")
		s += " suffix:" + strings.Join(info.suffix, ",")
	}
	s += " match: " + info.match.String()
	return s
}

// A stringSet is a set of strings.
// The nil stringSet indicates not having a set. // The non-nil but empty stringSet is the empty set. type stringSet []string // have reports whether we have a stringSet. func (s stringSet) have() bool { return s != nil } // contains reports whether s contains str. func (s stringSet) contains(str string) bool { for _, ss := range s { if ss == str { return true } } return false } type byPrefix []string func (x *byPrefix) Len() int { return len(*x) } func (x *byPrefix) Swap(i, j int) { (*x)[i], (*x)[j] = (*x)[j], (*x)[i] } func (x *byPrefix) Less(i, j int) bool { return (*x)[i] < (*x)[j] } type bySuffix []string func (x *bySuffix) Len() int { return len(*x) } func (x *bySuffix) Swap(i, j int) { (*x)[i], (*x)[j] = (*x)[j], (*x)[i] } func (x *bySuffix) Less(i, j int) bool { s := (*x)[i] t := (*x)[j] for i := 1; i <= len(s) && i <= len(t); i++ { si := s[len(s)-i] ti := t[len(t)-i] if si < ti { return true } if si > ti { return false } } return len(s) < len(t) } // add adds str to the set. func (s *stringSet) add(str string) { *s = append(*s, str) } // clean removes duplicates from the stringSet. func (s *stringSet) clean(isSuffix bool) { t := *s if isSuffix { sort.Sort((*bySuffix)(s)) } else { sort.Sort((*byPrefix)(s)) } w := 0 for _, str := range t { if w == 0 || t[w-1] != str { t[w] = str w++ } } *s = t[:w] } // size returns the number of strings in s. func (s stringSet) size() int { return len(s) } // minLen returns the length of the shortest string in s. func (s stringSet) minLen() int { if len(s) == 0 { return 0 } m := len(s[0]) for _, str := range s { if m > len(str) { m = len(str) } } return m } // maxLen returns the length of the longest string in s. func (s stringSet) maxLen() int { if len(s) == 0 { return 0 } m := len(s[0]) for _, str := range s { if m < len(str) { m = len(str) } } return m } // union returns the union of s and t, reusing s's storage. func (s stringSet) union(t stringSet, isSuffix bool) stringSet { s = append(s, t...) 
s.clean(isSuffix) return s } // cross returns the cross product of s and t. func (s stringSet) cross(t stringSet, isSuffix bool) stringSet { p := stringSet{} for _, ss := range s { for _, tt := range t { p.add(ss + tt) } } p.clean(isSuffix) return p } // clear empties the set but preserves the storage. func (s *stringSet) clear() { *s = (*s)[:0] } // copy returns a copy of the set that does not share storage with the original. func (s stringSet) copy() stringSet { return append(stringSet{}, s...) } // isSubsetOf returns true if all strings in s are also in t. // It assumes both sets are sorted. func (s stringSet) isSubsetOf(t stringSet) bool { j := 0 for _, ss := range s { for j < len(t) && t[j] < ss { j++ } if j >= len(t) || t[j] != ss { return false } } return true } codesearch-1.2.0/index/regexp_test.go000066400000000000000000000054231364021717500176130ustar00rootroot00000000000000// Copyright 2011 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package index import ( "regexp/syntax" "testing" ) var queryTests = []struct { re string q string }{ {`Abcdef`, `"Abc" "bcd" "cde" "def"`}, {`(abc)(def)`, `"abc" "bcd" "cde" "def"`}, {`abc.*(def|ghi)`, `"abc" ("def"|"ghi")`}, {`abc(def|ghi)`, `"abc" ("bcd" "cde" "def")|("bcg" "cgh" "ghi")`}, {`a+hello`, `"ahe" "ell" "hel" "llo"`}, {`(a+hello|b+world)`, `("ahe" "ell" "hel" "llo")|("bwo" "orl" "rld" "wor")`}, {`a*bbb`, `"bbb"`}, {`a?bbb`, `"bbb"`}, {`(bbb)a?`, `"bbb"`}, {`(bbb)a*`, `"bbb"`}, {`^abc`, `"abc"`}, {`abc$`, `"abc"`}, {`ab[cde]f`, `("abc" "bcf")|("abd" "bdf")|("abe" "bef")`}, {`(abc|bac)de`, `"cde" ("abc" "bcd")|("acd" "bac")`}, // These don't have enough letters for a trigram, so they return the // always matching query "+". {`ab[^cde]f`, `+`}, {`ab.f`, `+`}, {`.`, `+`}, {`()`, `+`}, // No matches. {`[^\s\S]`, `-`}, // Factoring works. 
{`(abc|abc)`, `"abc"`}, {`(ab|ab)c`, `"abc"`}, {`ab(cab|cat)`, `"abc" "bca" ("cab"|"cat")`}, {`(z*(abc|def)z*)(z*(abc|def)z*)`, `("abc"|"def")`}, {`(z*abcz*defz*)|(z*abcz*defz*)`, `"abc" "def"`}, {`(z*abcz*defz*(ghi|jkl)z*)|(z*abcz*defz*(mno|prs)z*)`, `"abc" "def" ("ghi"|"jkl"|"mno"|"prs")`}, {`(z*(abcz*def)|(ghiz*jkl)z*)|(z*(mnoz*prs)|(tuvz*wxy)z*)`, `("abc" "def")|("ghi" "jkl")|("mno" "prs")|("tuv" "wxy")`}, {`(z*abcz*defz*)(z*(ghi|jkl)z*)`, `"abc" "def" ("ghi"|"jkl")`}, {`(z*abcz*defz*)|(z*(ghi|jkl)z*)`, `("ghi"|"jkl")|("abc" "def")`}, // analyze keeps track of multiple possible prefix/suffixes. {`[ab][cd][ef]`, `("ace"|"acf"|"ade"|"adf"|"bce"|"bcf"|"bde"|"bdf")`}, {`ab[cd]e`, `("abc" "bce")|("abd" "bde")`}, // Different sized suffixes. {`(a|ab)cde`, `"cde" ("abc" "bcd")|("acd")`}, {`(a|b|c|d)(ef|g|hi|j)`, `+`}, {`(?s).`, `+`}, // Expanding case. {`(?i)a~~`, `("A~~"|"a~~")`}, {`(?i)ab~`, `("AB~"|"Ab~"|"aB~"|"ab~")`}, {`(?i)abc`, `("ABC"|"ABc"|"AbC"|"Abc"|"aBC"|"aBc"|"abC"|"abc")`}, {`(?i)abc|def`, `("ABC"|"ABc"|"AbC"|"Abc"|"DEF"|"DEf"|"DeF"|"Def"|"aBC"|"aBc"|"abC"|"abc"|"dEF"|"dEf"|"deF"|"def")`}, {`(?i)abcd`, `("ABC"|"ABc"|"AbC"|"Abc"|"aBC"|"aBc"|"abC"|"abc") ("BCD"|"BCd"|"BcD"|"Bcd"|"bCD"|"bCd"|"bcD"|"bcd")`}, {`(?i)abc|abc`, `("ABC"|"ABc"|"AbC"|"Abc"|"aBC"|"aBc"|"abC"|"abc")`}, // Word boundary. {`\b`, `+`}, {`\B`, `+`}, {`\babc`, `"abc"`}, {`\Babc`, `"abc"`}, {`abc\b`, `"abc"`}, {`abc\B`, `"abc"`}, {`ab\bc`, `"abc"`}, {`ab\Bc`, `"abc"`}, } func TestQuery(t *testing.T) { for _, tt := range queryTests { re, err := syntax.Parse(tt.re, syntax.Perl) if err != nil { t.Fatal(err) } q := RegexpQuery(re).String() if q != tt.q { t.Errorf("RegexpQuery(%#q) = %#q, want %#q", tt.re, q, tt.q) } } } codesearch-1.2.0/index/write.go000066400000000000000000000346151364021717500164210ustar00rootroot00000000000000// Copyright 2011 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
package index

import (
	"io"
	"io/ioutil"
	"log"
	"os"
	"strings"
	"unsafe"

	"github.com/google/codesearch/sparse"
)

// Index writing. See read.go for details of on-disk format.
//
// It would suffice to make a single large list of (trigram, file#) pairs
// while processing the files one at a time, sort that list by trigram,
// and then create the posting lists from subsequences of the list.
// However, we do not assume that the entire index fits in memory.
// Instead, we sort and flush the list to a new temporary file each time
// it reaches its maximum in-memory size, and then at the end we
// create the final posting lists by merging the temporary files as we
// read them back in.
//
// It would also be useful to be able to create an index for a subset
// of the files and then merge that index into an existing one. This would
// allow incremental updating of an existing index when a directory changes.
// But we have not implemented that.

// An IndexWriter creates an on-disk index corresponding to a set of files.
type IndexWriter struct {
	LogSkip bool // log information about skipped files
	Verbose bool // log status using package log

	trigram *sparse.Set // trigrams for the current file
	buf     [8]byte     // scratch buffer

	paths []string // paths recorded by AddPaths; written to the index by Flush

	nameData   *bufWriter // temp file holding list of names
	nameLen    uint32     // number of bytes written to nameData
	nameIndex  *bufWriter // temp file holding name index
	numName    int        // number of names written
	totalBytes int64      // total size of all indexed files; reported by Flush

	post      []postEntry // list of (trigram, file#) pairs
	postFile  []*os.File  // flushed post entries
	postIndex *bufWriter  // temp file holding posting list index

	inbuf []byte     // input buffer
	main  *bufWriter // main index file
}

// npost is the capacity, in entries, of the in-memory post list.
// Each postEntry is 8 bytes.
const npost = 64 << 20 / 8 // 64 MB worth of post entries

// Create returns a new IndexWriter that will write the index to file.
func Create(file string) *IndexWriter {
	ix := new(IndexWriter)
	ix.trigram = sparse.NewSet(1 << 24)
	// An empty name makes bufCreate use a temporary file.
	ix.nameData = bufCreate("")
	ix.nameIndex = bufCreate("")
	ix.postIndex = bufCreate("")
	ix.main = bufCreate(file)
	ix.post = make([]postEntry, 0, npost)
	ix.inbuf = make([]byte, 16384)
	return ix
}

// A postEntry is an in-memory (trigram, file#) pair:
// the trigram occupies the high 32 bits and the file ID the low 32 bits.
type postEntry uint64

// trigram returns the trigram half of the pair.
func (p postEntry) trigram() uint32 {
	return uint32(uint64(p) >> 32)
}

// fileid returns the file ID half of the pair.
func (p postEntry) fileid() uint32 {
	return uint32(p & 0xFFFFFFFF)
}

// makePostEntry packs trigram and fileid into a single postEntry.
func makePostEntry(trigram, fileid uint32) postEntry {
	hi := postEntry(trigram) << 32
	lo := postEntry(fileid)
	return hi | lo
}

// Tuning constants for detecting text files.
// A file is assumed not to be a text file (and thus is not indexed)
// if it contains an invalid UTF-8 sequence, if it is longer than maxFileLen
// bytes, if it contains a line longer than maxLineLen bytes,
// or if it contains more than maxTextTrigrams distinct trigrams.
const (
	maxFileLen      = 1 << 30
	maxLineLen      = 2000
	maxTextTrigrams = 20000
)

// AddPaths appends the given paths to the index's list of paths.
func (ix *IndexWriter) AddPaths(paths []string) {
	for _, p := range paths {
		ix.paths = append(ix.paths, p)
	}
}

// AddFile opens the named file with os.Open and adds its contents
// to the index. Errors are reported through package log.
func (ix *IndexWriter) AddFile(name string) {
	file, openErr := os.Open(name)
	if openErr != nil {
		log.Print(openErr)
		return
	}
	defer file.Close()
	ix.Add(name, file)
}

// Add adds the file f to the index under the given name.
// It logs errors using package log.
func (ix *IndexWriter) Add(name string, f io.Reader) { ix.trigram.Reset() var ( c = byte(0) i = 0 buf = ix.inbuf[:0] tv = uint32(0) n = int64(0) linelen = 0 ) for { tv = (tv << 8) & (1<<24 - 1) if i >= len(buf) { n, err := f.Read(buf[:cap(buf)]) if n == 0 { if err != nil { if err == io.EOF { break } log.Printf("%s: %v\n", name, err) return } log.Printf("%s: 0-length read\n", name) return } buf = buf[:n] i = 0 } c = buf[i] i++ tv |= uint32(c) if n++; n >= 3 { ix.trigram.Add(tv) } if !validUTF8((tv>>8)&0xFF, tv&0xFF) { if ix.LogSkip { log.Printf("%s: invalid UTF-8, ignoring\n", name) } return } if n > maxFileLen { if ix.LogSkip { log.Printf("%s: too long, ignoring\n", name) } return } if linelen++; linelen > maxLineLen { if ix.LogSkip { log.Printf("%s: very long lines, ignoring\n", name) } return } if c == '\n' { linelen = 0 } } if ix.trigram.Len() > maxTextTrigrams { if ix.LogSkip { log.Printf("%s: too many trigrams, probably not text, ignoring\n", name) } return } ix.totalBytes += n if ix.Verbose { log.Printf("%d %d %s\n", n, ix.trigram.Len(), name) } fileid := ix.addName(name) for _, trigram := range ix.trigram.Dense() { if len(ix.post) >= cap(ix.post) { ix.flushPost() } ix.post = append(ix.post, makePostEntry(trigram, fileid)) } } // Flush flushes the index entry to the target file. 
func (ix *IndexWriter) Flush() { ix.addName("") var off [5]uint32 ix.main.writeString(magic) off[0] = ix.main.offset() for _, p := range ix.paths { ix.main.writeString(p) ix.main.writeString("\x00") } ix.main.writeString("\x00") off[1] = ix.main.offset() copyFile(ix.main, ix.nameData) off[2] = ix.main.offset() ix.mergePost(ix.main) off[3] = ix.main.offset() copyFile(ix.main, ix.nameIndex) off[4] = ix.main.offset() copyFile(ix.main, ix.postIndex) for _, v := range off { ix.main.writeUint32(v) } ix.main.writeString(trailerMagic) os.Remove(ix.nameData.name) for _, f := range ix.postFile { os.Remove(f.Name()) } os.Remove(ix.nameIndex.name) os.Remove(ix.postIndex.name) log.Printf("%d data bytes, %d index bytes", ix.totalBytes, ix.main.offset()) ix.main.flush() } func copyFile(dst, src *bufWriter) { dst.flush() _, err := io.Copy(dst.file, src.finish()) if err != nil { log.Fatalf("copying %s to %s: %v", src.name, dst.name, err) } } // addName adds the file with the given name to the index. // It returns the assigned file ID number. func (ix *IndexWriter) addName(name string) uint32 { if strings.Contains(name, "\x00") { log.Fatalf("%q: file has NUL byte in name", name) } ix.nameIndex.writeUint32(ix.nameData.offset()) ix.nameData.writeString(name) ix.nameData.writeByte(0) id := ix.numName ix.numName++ return uint32(id) } // flushPost writes ix.post to a new temporary file and // clears the slice. func (ix *IndexWriter) flushPost() { w, err := ioutil.TempFile("", "csearch-index") if err != nil { log.Fatal(err) } if ix.Verbose { log.Printf("flush %d entries to %s", len(ix.post), w.Name()) } sortPost(ix.post) // Write the raw ix.post array to disk as is. // This process is the one reading it back in, so byte order is not a concern. 
data := (*[npost * 8]byte)(unsafe.Pointer(&ix.post[0]))[:len(ix.post)*8] if n, err := w.Write(data); err != nil || n < len(data) { if err != nil { log.Fatal(err) } log.Fatalf("short write writing %s", w.Name()) } ix.post = ix.post[:0] w.Seek(0, 0) ix.postFile = append(ix.postFile, w) } // mergePost reads the flushed index entries and merges them // into posting lists, writing the resulting lists to out. func (ix *IndexWriter) mergePost(out *bufWriter) { var h postHeap log.Printf("merge %d files + mem", len(ix.postFile)) for _, f := range ix.postFile { h.addFile(f) } sortPost(ix.post) h.addMem(ix.post) npost := 0 e := h.next() offset0 := out.offset() for { npost++ offset := out.offset() - offset0 trigram := e.trigram() ix.buf[0] = byte(trigram >> 16) ix.buf[1] = byte(trigram >> 8) ix.buf[2] = byte(trigram) // posting list fileid := ^uint32(0) nfile := uint32(0) out.write(ix.buf[:3]) for ; e.trigram() == trigram && trigram != 1<<24-1; e = h.next() { out.writeUvarint(e.fileid() - fileid) fileid = e.fileid() nfile++ } out.writeUvarint(0) // index entry ix.postIndex.write(ix.buf[:3]) ix.postIndex.writeUint32(nfile) ix.postIndex.writeUint32(offset) if trigram == 1<<24-1 { break } } } // A postChunk represents a chunk of post entries flushed to disk or // still in memory. type postChunk struct { e postEntry // next entry m []postEntry // remaining entries after e } const postBuf = 4096 // A postHeap is a heap (priority queue) of postChunks. type postHeap struct { ch []*postChunk } func (h *postHeap) addFile(f *os.File) { data := mmapFile(f).d m := (*[npost]postEntry)(unsafe.Pointer(&data[0]))[:len(data)/8] h.addMem(m) } func (h *postHeap) addMem(x []postEntry) { h.add(&postChunk{m: x}) } // step reads the next entry from ch and saves it in ch.e. // It returns false if ch is over. 
func (h *postHeap) step(ch *postChunk) bool { old := ch.e m := ch.m if len(m) == 0 { return false } ch.e = postEntry(m[0]) m = m[1:] ch.m = m if old >= ch.e { panic("bad sort") } return true } // add adds the chunk to the postHeap. // All adds must be called before the first call to next. func (h *postHeap) add(ch *postChunk) { if len(ch.m) > 0 { ch.e = ch.m[0] ch.m = ch.m[1:] h.push(ch) } } // empty reports whether the postHeap is empty. func (h *postHeap) empty() bool { return len(h.ch) == 0 } // next returns the next entry from the postHeap. // It returns a postEntry with trigram == 1<<24 - 1 if h is empty. func (h *postHeap) next() postEntry { if len(h.ch) == 0 { return makePostEntry(1<<24-1, 0) } ch := h.ch[0] e := ch.e m := ch.m if len(m) == 0 { h.pop() } else { ch.e = m[0] ch.m = m[1:] h.siftDown(0) } return e } func (h *postHeap) pop() *postChunk { ch := h.ch[0] n := len(h.ch) - 1 h.ch[0] = h.ch[n] h.ch = h.ch[:n] if n > 1 { h.siftDown(0) } return ch } func (h *postHeap) push(ch *postChunk) { n := len(h.ch) h.ch = append(h.ch, ch) if len(h.ch) >= 2 { h.siftUp(n) } } func (h *postHeap) siftDown(i int) { ch := h.ch for { j1 := 2*i + 1 if j1 >= len(ch) { break } j := j1 if j2 := j1 + 1; j2 < len(ch) && ch[j1].e >= ch[j2].e { j = j2 } if ch[i].e < ch[j].e { break } ch[i], ch[j] = ch[j], ch[i] i = j } } func (h *postHeap) siftUp(j int) { ch := h.ch for { i := (j - 1) / 2 if i == j || ch[i].e < ch[j].e { break } ch[i], ch[j] = ch[j], ch[i] j = i } } // A bufWriter is a convenience wrapper: a closeable bufio.Writer. type bufWriter struct { name string file *os.File buf []byte tmp [8]byte } // bufCreate creates a new file with the given name and returns a // corresponding bufWriter. If name is empty, bufCreate uses a // temporary file. 
func bufCreate(name string) *bufWriter { var ( f *os.File err error ) if name != "" { f, err = os.OpenFile(name, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0600) } else { f, err = ioutil.TempFile("", "csearch") } if err != nil { log.Fatal(err) } return &bufWriter{ name: f.Name(), buf: make([]byte, 0, 256<<10), file: f, } } func (b *bufWriter) write(x []byte) { n := cap(b.buf) - len(b.buf) if len(x) > n { b.flush() if len(x) >= cap(b.buf) { if _, err := b.file.Write(x); err != nil { log.Fatalf("writing %s: %v", b.name, err) } return } } b.buf = append(b.buf, x...) } func (b *bufWriter) writeByte(x byte) { if len(b.buf) >= cap(b.buf) { b.flush() } b.buf = append(b.buf, x) } func (b *bufWriter) writeString(s string) { n := cap(b.buf) - len(b.buf) if len(s) > n { b.flush() if len(s) >= cap(b.buf) { if _, err := b.file.WriteString(s); err != nil { log.Fatalf("writing %s: %v", b.name, err) } return } } b.buf = append(b.buf, s...) } // offset returns the current write offset. func (b *bufWriter) offset() uint32 { off, _ := b.file.Seek(0, 1) off += int64(len(b.buf)) if int64(uint32(off)) != off { log.Fatalf("index is larger than 4GB") } return uint32(off) } func (b *bufWriter) flush() { if len(b.buf) == 0 { return } _, err := b.file.Write(b.buf) if err != nil { log.Fatalf("writing %s: %v", b.name, err) } b.buf = b.buf[:0] } // finish flushes the file to disk and returns an open file ready for reading. 
func (b *bufWriter) finish() *os.File { b.flush() f := b.file f.Seek(0, 0) return f } func (b *bufWriter) writeTrigram(t uint32) { if cap(b.buf)-len(b.buf) < 3 { b.flush() } b.buf = append(b.buf, byte(t>>16), byte(t>>8), byte(t)) } func (b *bufWriter) writeUint32(x uint32) { if cap(b.buf)-len(b.buf) < 4 { b.flush() } b.buf = append(b.buf, byte(x>>24), byte(x>>16), byte(x>>8), byte(x)) } func (b *bufWriter) writeUvarint(x uint32) { if cap(b.buf)-len(b.buf) < 5 { b.flush() } switch { case x < 1<<7: b.buf = append(b.buf, byte(x)) case x < 1<<14: b.buf = append(b.buf, byte(x|0x80), byte(x>>7)) case x < 1<<21: b.buf = append(b.buf, byte(x|0x80), byte(x>>7|0x80), byte(x>>14)) case x < 1<<28: b.buf = append(b.buf, byte(x|0x80), byte(x>>7|0x80), byte(x>>14|0x80), byte(x>>21)) default: b.buf = append(b.buf, byte(x|0x80), byte(x>>7|0x80), byte(x>>14|0x80), byte(x>>21|0x80), byte(x>>28)) } } // validUTF8 reports whether the byte pair can appear in a // valid sequence of UTF-8-encoded code points. func validUTF8(c1, c2 uint32) bool { switch { case c1 < 0x80: // 1-byte, must be followed by 1-byte or first of multi-byte return c2 < 0x80 || 0xc0 <= c2 && c2 < 0xf8 case c1 < 0xc0: // continuation byte, can be followed by nearly anything return c2 < 0xf8 case c1 < 0xf8: // first of multi-byte, must be followed by continuation byte return 0x80 <= c2 && c2 < 0xc0 } return false } // sortPost sorts the postentry list. // The list is already sorted by fileid (bottom 32 bits) // and the top 8 bits are always zero, so there are only // 24 bits to sort. Run two rounds of 12-bit radix sort. 
const sortK = 12 var sortTmp []postEntry var sortN [1 << sortK]int func sortPost(post []postEntry) { if len(post) > len(sortTmp) { sortTmp = make([]postEntry, len(post)) } tmp := sortTmp[:len(post)] const k = sortK for i := range sortN { sortN[i] = 0 } for _, p := range post { r := uintptr(p>>32) & (1<>32) & (1<>(32+k)) & (1<>(32+k)) & (1<> 24) buf[1] = byte(x >> 16) buf[2] = byte(x >> 8) buf[3] = byte(x) return string(buf[:]) } func fileList(list ...uint32) string { var buf []byte last := ^uint32(0) for _, x := range list { delta := x - last for delta >= 0x80 { buf = append(buf, byte(delta)|0x80) delta >>= 7 } buf = append(buf, byte(delta)) last = x } buf = append(buf, 0) return string(buf) } func buildFlushIndex(out string, paths []string, doFlush bool, fileData map[string]string) { ix := Create(out) ix.AddPaths(paths) var files []string for name := range fileData { files = append(files, name) } sort.Strings(files) for _, name := range files { ix.Add(name, strings.NewReader(fileData[name])) } if doFlush { ix.flushPost() } ix.Flush() } func buildIndex(name string, paths []string, fileData map[string]string) { buildFlushIndex(name, paths, false, fileData) } func testTrivialWrite(t *testing.T, doFlush bool) { f, _ := ioutil.TempFile("", "index-test") defer os.Remove(f.Name()) out := f.Name() buildFlushIndex(out, nil, doFlush, trivialFiles) data, err := ioutil.ReadFile(out) if err != nil { t.Fatalf("reading _test/index.triv: %v", err) } want := []byte(trivialIndex) if !bytes.Equal(data, want) { i := 0 for i < len(data) && i < len(want) && data[i] == want[i] { i++ } t.Fatalf("wrong index:\nhave: %q %q\nwant: %q %q", data[:i], data[i:], want[:i], want[i:]) } } func TestTrivialWrite(t *testing.T) { testTrivialWrite(t, false) } func TestTrivialWriteDisk(t *testing.T) { testTrivialWrite(t, true) } func TestHeap(t *testing.T) { h := &postHeap{} es := []postEntry{7, 4, 3, 2, 4} for _, e := range es { h.addMem([]postEntry{e}) } if len(h.ch) != len(es) { t.Fatalf("wrong heap 
size: %d, want %d", len(h.ch), len(es)) } for a, b := h.next(), h.next(); b.trigram() != (1<<24 - 1); a, b = b, h.next() { if a > b { t.Fatalf("%d should <= %d", a, b) } } } codesearch-1.2.0/lib/000077500000000000000000000000001364021717500143665ustar00rootroot00000000000000codesearch-1.2.0/lib/README.template000066400000000000000000000006511364021717500170620ustar00rootroot00000000000000These are the command-line Code Search tools from https://github.com/google/codesearch. These binaries are for ARCH systems running OPERSYS. To get started, run cindex with a list of directories to index: cindex /usr/include $HOME/src Then run csearch to run grep over all the indexed sources: csearch DATAKIT For details, run either command with the -help option, and read http://swtch.com/~rsc/regexp/regexp4.html. codesearch-1.2.0/lib/buildall000077500000000000000000000015641364021717500161120ustar00rootroot00000000000000#!/bin/bash # This script builds the code search binaries for a variety of OS/architecture combinations. . 
./setup for i in {5,6,8}{c,g,a,l} do go tool dist install cmd/$i done build() { echo "# $1" goos=$(echo $1 | sed 's;/.*;;') goarch=$(echo $1 | sed 's;.*/;;') GOOS=$goos GOARCH=$goarch CGO_ENABLED=0 \ go install -a code.google.com/p/codesearch/cmd/{cgrep,cindex,csearch} rm -rf codesearch-$version mkdir codesearch-$version mv ~/g/bin/{cgrep,cindex,csearch}* codesearch-$version chmod +x codesearch-$version/* cat README.template | sed "s/ARCH/$(arch $goarch)/; s/OPERSYS/$(os $goos)/" >codesearch-$version/README.txt rm -f codesearch-$version-$goos-$goarch.zip zip -z -r codesearch-$version-$goos-$goarch.zip codesearch-$version < codesearch-$version/README.txt rm -rf codesearch-0.01 } for i in {linux,darwin,freebsd,windows}/{amd64,386} do build $i done codesearch-1.2.0/lib/setup000066400000000000000000000004731364021717500154550ustar00rootroot00000000000000set -e os() { case "$1" in freebsd) echo FreeBSD;; linux) echo Linux;; darwin) echo Mac OS X;; openbsd) echo OpenBSD;; netbsd) echo NetBSD;; windows) echo Windows;; *) echo $1;; esac } arch() { case "$1" in 386) echo 32-bit x86;; amd64) echo 64-bit x86;; *) echo $1;; esac } version=$(cat version) codesearch-1.2.0/lib/uploadall000066400000000000000000000010011364021717500162560ustar00rootroot00000000000000#!/bin/sh # gcodeup is a copy of $GOROOT/misc/dashboard/googlecode_upload.py. . 
./setup user=$(sed -n 's/^re2.username = //' ~/.hgrc) password=$(sed -n 's/^re2\.password = //' ~/.hgrc) upload() { goos=$(echo $1 | sed "s/codesearch-$version-//; s/-.*//") goarch=$(echo $1 | sed "s/codesearch-$version-//; s/[a-z0-9]*-//; s/-.*//") gcodeup -s "binaries for $(os $goos) $(arch $goarch)" -p codesearch -u "$user" -w "$password" codesearch-$version-$1-$2.zip } for i in codesearch-$version-* do upload $i done codesearch-1.2.0/lib/version000066400000000000000000000000051364021717500157710ustar00rootroot000000000000000.01 codesearch-1.2.0/regexp/000077500000000000000000000000001364021717500151125ustar00rootroot00000000000000codesearch-1.2.0/regexp/copy.go000066400000000000000000000113601364021717500164140ustar00rootroot00000000000000// Copyright 2011 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // Copied from Go's regexp/syntax. // Formatters edited to handle instByteRange. package regexp import ( "bytes" "fmt" "regexp/syntax" "sort" "strconv" "unicode" ) // cleanClass sorts the ranges (pairs of elements of r), // merges them, and eliminates duplicates. func cleanClass(rp *[]rune) []rune { // Sort by lo increasing, hi decreasing to break ties. sort.Sort(ranges{rp}) r := *rp if len(r) < 2 { return r } // Merge abutting, overlapping. w := 2 // write index for i := 2; i < len(r); i += 2 { lo, hi := r[i], r[i+1] if lo <= r[w-1]+1 { // merge with previous range if hi > r[w-1] { r[w-1] = hi } continue } // new disjoint range r[w] = lo r[w+1] = hi w += 2 } return r[:w] } // appendRange returns the result of appending the range lo-hi to the class r. func appendRange(r []rune, lo, hi rune) []rune { // Expand last range or next to last range if it overlaps or abuts. // Checking two ranges helps when appending case-folded // alphabets, so that one range can be expanding A-Z and the // other expanding a-z. 
n := len(r) for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4 if n >= i { rlo, rhi := r[n-i], r[n-i+1] if lo <= rhi+1 && rlo <= hi+1 { if lo < rlo { r[n-i] = lo } if hi > rhi { r[n-i+1] = hi } return r } } } return append(r, lo, hi) } const ( // minimum and maximum runes involved in folding. // checked during test. minFold = 0x0041 maxFold = 0x1044f ) // appendFoldedRange returns the result of appending the range lo-hi // and its case folding-equivalent runes to the class r. func appendFoldedRange(r []rune, lo, hi rune) []rune { // Optimizations. if lo <= minFold && hi >= maxFold { // Range is full: folding can't add more. return appendRange(r, lo, hi) } if hi < minFold || lo > maxFold { // Range is outside folding possibilities. return appendRange(r, lo, hi) } if lo < minFold { // [lo, minFold-1] needs no folding. r = appendRange(r, lo, minFold-1) lo = minFold } if hi > maxFold { // [maxFold+1, hi] needs no folding. r = appendRange(r, maxFold+1, hi) hi = maxFold } // Brute force. Depend on appendRange to coalesce ranges on the fly. for c := lo; c <= hi; c++ { r = appendRange(r, c, c) f := unicode.SimpleFold(c) for f != c { r = appendRange(r, f, f) f = unicode.SimpleFold(f) } } return r } // ranges implements sort.Interface on a []rune. // The choice of receiver type definition is strange // but avoids an allocation since we already have // a *[]rune. 
type ranges struct { p *[]rune } func (ra ranges) Less(i, j int) bool { p := *ra.p i *= 2 j *= 2 return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1] } func (ra ranges) Len() int { return len(*ra.p) / 2 } func (ra ranges) Swap(i, j int) { p := *ra.p i *= 2 j *= 2 p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1] } func progString(p *syntax.Prog) string { var b bytes.Buffer dumpProg(&b, p) return b.String() } func instString(i *syntax.Inst) string { var b bytes.Buffer dumpInst(&b, i) return b.String() } func bw(b *bytes.Buffer, args ...string) { for _, s := range args { b.WriteString(s) } } func dumpProg(b *bytes.Buffer, p *syntax.Prog) { for j := range p.Inst { i := &p.Inst[j] pc := strconv.Itoa(j) if len(pc) < 3 { b.WriteString(" "[len(pc):]) } if j == p.Start { pc += "*" } bw(b, pc, "\t") dumpInst(b, i) bw(b, "\n") } } func u32(i uint32) string { return strconv.FormatUint(uint64(i), 10) } func dumpInst(b *bytes.Buffer, i *syntax.Inst) { switch i.Op { case syntax.InstAlt: bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg)) case syntax.InstAltMatch: bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg)) case syntax.InstCapture: bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out)) case syntax.InstEmptyWidth: bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out)) case syntax.InstMatch: bw(b, "match") case syntax.InstFail: bw(b, "fail") case syntax.InstNop: bw(b, "nop -> ", u32(i.Out)) case instByteRange: fmt.Fprintf(b, "byte %02x-%02x", (i.Arg>>8)&0xFF, i.Arg&0xFF) if i.Arg&argFold != 0 { bw(b, "/i") } bw(b, " -> ", u32(i.Out)) // Should not happen case syntax.InstRune: if i.Rune == nil { // shouldn't happen bw(b, "rune ") } bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune))) if syntax.Flags(i.Arg)&syntax.FoldCase != 0 { bw(b, "/i") } bw(b, " -> ", u32(i.Out)) case syntax.InstRune1: bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out)) case syntax.InstRuneAny: bw(b, "any -> ", u32(i.Out)) case syntax.InstRuneAnyNotNL: bw(b, "anynotnl -> ", u32(i.Out)) } } 
codesearch-1.2.0/regexp/match.go000066400000000000000000000244341364021717500165440ustar00rootroot00000000000000// Copyright 2011 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package regexp import ( "bytes" "encoding/binary" "flag" "fmt" "io" "os" "regexp/syntax" "sort" "github.com/google/codesearch/sparse" ) // A matcher holds the state for running regular expression search. type matcher struct { prog *syntax.Prog // compiled program dstate map[string]*dstate // dstate cache start *dstate // start state startLine *dstate // start state for beginning of line z1, z2 nstate // two temporary nstates } // An nstate corresponds to an NFA state. type nstate struct { q sparse.Set // queue of program instructions partial rune // partially decoded rune (TODO) flag flags // flags (TODO) } // The flags record state about a position between bytes in the text. type flags uint32 const ( flagBOL flags = 1 << iota // beginning of line flagEOL // end of line flagBOT // beginning of text flagEOT // end of text flagWord // last byte was word byte ) // A dstate corresponds to a DFA state. type dstate struct { next [256]*dstate // next state, per byte enc string // encoded nstate matchNL bool // match when next byte is \n matchEOT bool // match in this state at end of text } func (z *nstate) String() string { return fmt.Sprintf("%v/%#x+%#x", z.q.Dense(), z.flag, z.partial) } // enc encodes z as a string. func (z *nstate) enc() string { var buf []byte var v [10]byte last := ^uint32(0) n := binary.PutUvarint(v[:], uint64(z.partial)) buf = append(buf, v[:n]...) n = binary.PutUvarint(v[:], uint64(z.flag)) buf = append(buf, v[:n]...) dense := z.q.Dense() ids := make([]int, 0, len(dense)) for _, id := range z.q.Dense() { ids = append(ids, int(id)) } sort.Ints(ids) for _, id := range ids { n := binary.PutUvarint(v[:], uint64(uint32(id)-last)) buf = append(buf, v[:n]...) 
last = uint32(id) } return string(buf) } // dec decodes the encoding s into z. func (z *nstate) dec(s string) { b := []byte(s) i, n := binary.Uvarint(b) if n <= 0 { bug() } b = b[n:] z.partial = rune(i) i, n = binary.Uvarint(b) if n <= 0 { bug() } b = b[n:] z.flag = flags(i) z.q.Reset() last := ^uint32(0) for len(b) > 0 { i, n = binary.Uvarint(b) if n <= 0 { bug() } b = b[n:] last += uint32(i) z.q.Add(last) } } // dmatch is the state we're in when we've seen a match and are just // waiting for the end of the line. var dmatch = dstate{ matchNL: true, matchEOT: true, } func init() { var z nstate dmatch.enc = z.enc() for i := range dmatch.next { if i != '\n' { dmatch.next[i] = &dmatch } } } // init initializes the matcher. func (m *matcher) init(prog *syntax.Prog) error { m.prog = prog m.dstate = make(map[string]*dstate) m.z1.q.Init(uint32(len(prog.Inst))) m.z2.q.Init(uint32(len(prog.Inst))) m.addq(&m.z1.q, uint32(prog.Start), syntax.EmptyBeginLine|syntax.EmptyBeginText) m.z1.flag = flagBOL | flagBOT m.start = m.cache(&m.z1) m.z1.q.Reset() m.addq(&m.z1.q, uint32(prog.Start), syntax.EmptyBeginLine) m.z1.flag = flagBOL m.startLine = m.cache(&m.z1) return nil } // stepEmpty steps runq to nextq expanding according to flag. func (m *matcher) stepEmpty(runq, nextq *sparse.Set, flag syntax.EmptyOp) { nextq.Reset() for _, id := range runq.Dense() { m.addq(nextq, id, flag) } } // stepByte steps runq to nextq consuming c and then expanding according to flag. // It returns true if a match ends immediately before c. // c is either an input byte or endText. 
func (m *matcher) stepByte(runq, nextq *sparse.Set, c int, flag syntax.EmptyOp) (match bool) {
	nextq.Reset()
	// The start instruction is re-added on every step.
	m.addq(nextq, uint32(m.prog.Start), flag)
	for _, id := range runq.Dense() {
		i := &m.prog.Inst[id]
		switch i.Op {
		default:
			continue
		case syntax.InstMatch:
			match = true
			continue
		case instByteRange:
			if c == endText {
				// No byte to consume; this break leaves the switch only.
				break
			}
			// Arg packs the range as lo<<8 | hi.
			lo := int((i.Arg >> 8) & 0xFF)
			hi := int(i.Arg & 0xFF)
			ch := c
			// With argFold set, a lower-case ASCII byte is compared in
			// its upper-case form.
			if i.Arg&argFold != 0 && 'a' <= ch && ch <= 'z' {
				ch += 'A' - 'a'
			}
			if lo <= ch && ch <= hi {
				m.addq(nextq, i.Out, flag)
			}
		}
	}
	return
}

// addq adds id to the queue, expanding according to flag.
// Instructions already in q are skipped. Capture and nop instructions
// are followed to their Out, alternations to both Out and Arg, and an
// empty-width instruction to its Out only when every condition in its
// Arg is present in flag.
func (m *matcher) addq(q *sparse.Set, id uint32, flag syntax.EmptyOp) {
	if q.Has(id) {
		return
	}
	q.Add(id)
	i := &m.prog.Inst[id]
	switch i.Op {
	case syntax.InstCapture, syntax.InstNop:
		m.addq(q, i.Out, flag)
	case syntax.InstAlt, syntax.InstAltMatch:
		m.addq(q, i.Out, flag)
		m.addq(q, i.Arg, flag)
	case syntax.InstEmptyWidth:
		if syntax.EmptyOp(i.Arg)&^flag == 0 {
			m.addq(q, i.Out, flag)
		}
	}
}

// endText is the value passed as c in place of an input byte to signal
// the end of the text.
const endText = -1

// computeNext computes the next DFA state if we're in d reading c (an input byte or endText).
// It uses m.z1 and m.z2 as scratch space and returns &dmatch when
// stepByte reports a match.
func (m *matcher) computeNext(d *dstate, c int) *dstate {
	this, next := &m.z1, &m.z2
	this.dec(d.enc)

	// compute flags in effect before c
	flag := syntax.EmptyOp(0)
	if this.flag&flagBOL != 0 {
		flag |= syntax.EmptyBeginLine
	}
	if this.flag&flagBOT != 0 {
		flag |= syntax.EmptyBeginText
	}
	// A word boundary exists when exactly one of the previous byte
	// (recorded in flagWord) and c is a word byte.
	if this.flag&flagWord != 0 {
		if !isWordByte(c) {
			flag |= syntax.EmptyWordBoundary
		} else {
			flag |= syntax.EmptyNoWordBoundary
		}
	} else {
		if isWordByte(c) {
			flag |= syntax.EmptyWordBoundary
		} else {
			flag |= syntax.EmptyNoWordBoundary
		}
	}
	if c == '\n' {
		flag |= syntax.EmptyEndLine
	}
	if c == endText {
		flag |= syntax.EmptyEndLine | syntax.EmptyEndText
	}

	// re-expand queue using new flags.
	// TODO: only do this when it matters
	// (something is gating on word boundaries).
	m.stepEmpty(&this.q, &next.q, flag)
	// The re-expanded queue becomes the input of the byte step below.
	this, next = next, this

	// now compute flags after c.
	flag = 0
	next.flag = 0
	if c == '\n' {
		flag |= syntax.EmptyBeginLine
		next.flag |= flagBOL
	}
	if isWordByte(c) {
		next.flag |= flagWord
	}

	// re-add start, process rune + expand according to flags.
	if m.stepByte(&this.q, &next.q, c, flag) {
		return &dmatch
	}
	return m.cache(next)
}

// cache returns the dstate for z, keyed by z's encoding, creating and
// recording it in m.dstate if it does not exist yet.
// A new dstate is stored in the map before matchNL and matchEOT are
// filled in, because computing them calls computeNext, which calls
// cache again.
func (m *matcher) cache(z *nstate) *dstate {
	enc := z.enc()
	d := m.dstate[enc]
	if d != nil {
		return d
	}

	d = &dstate{enc: enc}
	m.dstate[enc] = d
	d.matchNL = m.computeNext(d, '\n') == &dmatch
	d.matchEOT = m.computeNext(d, endText) == &dmatch
	return d
}

// match runs the DFA over b, starting in m.start when beginText is set
// and in m.startLine otherwise.
// It returns the index of the newline that ends the first matching
// line, len(b) if the final state matches at the end of b (at end of
// text only when endText is set), or -1 if nothing matches.
// Missing transitions are computed on demand and stored in d.next.
// Note that the endText parameter shadows the package-level constant.
func (m *matcher) match(b []byte, beginText, endText bool) (end int) {
	//	fmt.Printf("%v\n", m.prog)

	d := m.startLine
	if beginText {
		d = m.start
	}
	//	m.z1.dec(d.enc)
	//	fmt.Printf("%v (%v)\n", &m.z1, d==&dmatch)
	for i, c := range b {
		d1 := d.next[c]
		if d1 == nil {
			if c == '\n' {
				if d.matchNL {
					return i
				}
				// No match on this line: restart at the beginning of the next one.
				d1 = m.startLine
			} else {
				d1 = m.computeNext(d, int(c))
			}
			d.next[c] = d1
		}
		d = d1
		//	m.z1.dec(d.enc)
		//	fmt.Printf("%#U: %v (%v, %v, %v)\n", c, &m.z1, d==&dmatch, d.matchNL, d.matchEOT)
	}
	if d.matchNL || endText && d.matchEOT {
		return len(b)
	}
	return -1
}

// matchString is the same as match but takes its input as a string.
func (m *matcher) matchString(b string, beginText, endText bool) (end int) {
	d := m.startLine
	if beginText {
		d = m.start
	}
	for i := 0; i < len(b); i++ {
		c := b[i]
		d1 := d.next[c]
		if d1 == nil {
			if c == '\n' {
				if d.matchNL {
					return i
				}
				d1 = m.startLine
			} else {
				d1 = m.computeNext(d, int(c))
			}
			d.next[c] = d1
		}
		d = d1
	}
	if d.matchNL || endText && d.matchEOT {
		return len(b)
	}
	return -1
}

// isWordByte reports whether the byte c is a word character: ASCII only.
// This is used to implement \b and \B.
This is not right for Unicode, but: // - it's hard to get right in a byte-at-a-time matching world // (the DFA has only one-byte lookahead) // - this crude approximation is the same one PCRE uses func isWordByte(c int) bool { return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' || '0' <= c && c <= '9' || c == '_' } // TODO: type Grep struct { Regexp *Regexp // regexp to search for Stdout io.Writer // output target Stderr io.Writer // error target L bool // L flag - print file names only C bool // C flag - print count of matches N bool // N flag - print line numbers H bool // H flag - do not print file names Match bool buf []byte } func (g *Grep) AddFlags() { flag.BoolVar(&g.L, "l", false, "list matching files only") flag.BoolVar(&g.C, "c", false, "print match counts only") flag.BoolVar(&g.N, "n", false, "show line numbers") flag.BoolVar(&g.H, "h", false, "omit file names") } func (g *Grep) File(name string) { f, err := os.Open(name) if err != nil { fmt.Fprintf(g.Stderr, "%s\n", err) return } defer f.Close() g.Reader(f, name) } var nl = []byte{'\n'} func countNL(b []byte) int { n := 0 for { i := bytes.IndexByte(b, '\n') if i < 0 { break } n++ b = b[i+1:] } return n } func (g *Grep) Reader(r io.Reader, name string) { if g.buf == nil { g.buf = make([]byte, 1<<20) } var ( buf = g.buf[:0] needLineno = g.N lineno = 1 count = 0 prefix = "" beginText = true endText = false ) if !g.H { prefix = name + ":" } for { n, err := io.ReadFull(r, buf[len(buf):cap(buf)]) buf = buf[:len(buf)+n] end := len(buf) if err == nil { i := bytes.LastIndex(buf, nl) if i >= 0 { end = i + 1 } } else { endText = true } chunkStart := 0 for chunkStart < end { m1 := g.Regexp.Match(buf[chunkStart:end], beginText, endText) + chunkStart beginText = false if m1 < chunkStart { break } g.Match = true if g.L { fmt.Fprintf(g.Stdout, "%s\n", name) return } lineStart := bytes.LastIndex(buf[chunkStart:m1], nl) + 1 + chunkStart lineEnd := m1 + 1 if lineEnd > end { lineEnd = end } if needLineno { lineno += 
countNL(buf[chunkStart:lineStart]) } line := buf[lineStart:lineEnd] nl := "" if len(line) == 0 || line[len(line)-1] != '\n' { nl = "\n" } switch { case g.C: count++ case g.N: fmt.Fprintf(g.Stdout, "%s%d:%s%s", prefix, lineno, line, nl) default: fmt.Fprintf(g.Stdout, "%s%s%s", prefix, line, nl) } if needLineno { lineno++ } chunkStart = lineEnd } if needLineno && err == nil { lineno += countNL(buf[chunkStart:end]) } n = copy(buf, buf[end:]) buf = buf[:n] if len(buf) == 0 && err != nil { if err != io.EOF && err != io.ErrUnexpectedEOF { fmt.Fprintf(g.Stderr, "%s: %v\n", name, err) } break } } if g.C && count > 0 { fmt.Fprintf(g.Stdout, "%s: %d\n", name, count) } } codesearch-1.2.0/regexp/regexp.go000066400000000000000000000027401364021717500167360ustar00rootroot00000000000000// Copyright 2011 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // Package regexp implements regular expression search tuned for // use in grep-like programs. package regexp import "regexp/syntax" func bug() { panic("codesearch/regexp: internal error") } // Regexp is the representation of a compiled regular expression. // A Regexp is NOT SAFE for concurrent use by multiple goroutines. type Regexp struct { Syntax *syntax.Regexp expr string // original expression m matcher } // String returns the source text used to compile the regular expression. func (re *Regexp) String() string { return re.expr } // Compile parses a regular expression and returns, if successful, // a Regexp object that can be used to match against lines of text. 
func Compile(expr string) (*Regexp, error) { re, err := syntax.Parse(expr, syntax.Perl) if err != nil { return nil, err } sre := re.Simplify() prog, err := syntax.Compile(sre) if err != nil { return nil, err } if err := toByteProg(prog); err != nil { return nil, err } r := &Regexp{ Syntax: re, expr: expr, } if err := r.m.init(prog); err != nil { return nil, err } return r, nil } func (r *Regexp) Match(b []byte, beginText, endText bool) (end int) { return r.m.match(b, beginText, endText) } func (r *Regexp) MatchString(s string, beginText, endText bool) (end int) { return r.m.matchString(s, beginText, endText) } codesearch-1.2.0/regexp/regexp_test.go000066400000000000000000000121211364021717500177670ustar00rootroot00000000000000// Copyright 2011 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package regexp import ( "bytes" "reflect" "strings" "testing" ) var nstateTests = []struct { q []uint32 partial rune }{ {[]uint32{1, 2, 3}, 1}, {[]uint32{1}, 1}, {[]uint32{}, 0}, {[]uint32{1, 2, 8}, 0x10FFF}, } func TestNstateEnc(t *testing.T) { var n1, n2 nstate n1.q.Init(10) n2.q.Init(10) for _, tt := range nstateTests { n1.q.Reset() n1.partial = tt.partial for _, id := range tt.q { n1.q.Add(id) } enc := n1.enc() n2.dec(enc) if n2.partial != n1.partial || !reflect.DeepEqual(n1.q.Dense(), n2.q.Dense()) { t.Errorf("%v.enc.dec = %v", &n1, &n2) } } } var matchTests = []struct { re string s string m []int }{ // Adapted from go/src/pkg/regexp/find_test.go. 
{`a+`, "abc\ndef\nghi\n", []int{1}}, {``, ``, []int{1}}, {`^abcdefg`, "abcdefg", []int{1}}, {`a+`, "baaab", []int{1}}, {"abcd..", "abcdef", []int{1}}, {`a`, "a", []int{1}}, {`x`, "y", nil}, {`b`, "abc", []int{1}}, {`.`, "a", []int{1}}, {`.*`, "abcdef", []int{1}}, {`^`, "abcde", []int{1}}, {`$`, "abcde", []int{1}}, {`^abcd$`, "abcd", []int{1}}, {`^bcd'`, "abcdef", nil}, {`^abcd$`, "abcde", nil}, {`a+`, "baaab", []int{1}}, {`a*`, "baaab", []int{1}}, {`[a-z]+`, "abcd", []int{1}}, {`[^a-z]+`, "ab1234cd", []int{1}}, {`[a\-\]z]+`, "az]-bcz", []int{1}}, {`[^\n]+`, "abcd\n", []int{1}}, {`[日本語]+`, "日本語日本語", []int{1}}, {`日本語+`, "日本語", []int{1}}, {`日本語+`, "日本語語語語", []int{1}}, {`()`, "", []int{1}}, {`(a)`, "a", []int{1}}, {`(.)(.)`, "日a", []int{1}}, {`(.*)`, "", []int{1}}, {`(.*)`, "abcd", []int{1}}, {`(..)(..)`, "abcd", []int{1}}, {`(([^xyz]*)(d))`, "abcd", []int{1}}, {`((a|b|c)*(d))`, "abcd", []int{1}}, {`(((a|b|c)*)(d))`, "abcd", []int{1}}, {`\a\f\r\t\v`, "\a\f\r\t\v", []int{1}}, {`[\a\f\n\r\t\v]+`, "\a\f\r\t\v", []int{1}}, {`a*(|(b))c*`, "aacc", []int{1}}, {`(.*).*`, "ab", []int{1}}, {`[.]`, ".", []int{1}}, {`/$`, "/abc/", []int{1}}, {`/$`, "/abc", nil}, // multiple matches {`.`, "abc", []int{1}}, {`(.)`, "abc", []int{1}}, {`.(.)`, "abcd", []int{1}}, {`ab*`, "abbaab", []int{1}}, {`a(b*)`, "abbaab", []int{1}}, // fixed bugs {`ab$`, "cab", []int{1}}, {`axxb$`, "axxcb", nil}, {`data`, "daXY data", []int{1}}, {`da(.)a$`, "daXY data", []int{1}}, {`zx+`, "zzx", []int{1}}, {`ab$`, "abcab", []int{1}}, {`(aa)*$`, "a", []int{1}}, {`(?:.|(?:.a))`, "", nil}, {`(?:A(?:A|a))`, "Aa", []int{1}}, {`(?:A|(?:A|a))`, "a", []int{1}}, {`(a){0}`, "", []int{1}}, // {`(?-s)(?:(?:^).)`, "\n", nil}, // {`(?s)(?:(?:^).)`, "\n", []int{1}}, // {`(?:(?:^).)`, "\n", nil}, {`\b`, "x", []int{1}}, {`\b`, "xx", []int{1}}, {`\b`, "x y", []int{1}}, {`\b`, "xx yy", []int{1}}, {`\B`, "x", nil}, {`\B`, "xx", []int{1}}, {`\B`, "x y", nil}, {`\B`, "xx yy", []int{1}}, {`(?im)^[abc]+$`, "abcABC", []int{1}}, 
{`(?im)^[α]+$`, "αΑ", []int{1}}, {`[Aa]BC`, "abc", nil}, {`[Aa]bc`, "abc", []int{1}}, // RE2 tests {`[^\S\s]`, "abcd", nil}, {`[^\S[:space:]]`, "abcd", nil}, {`[^\D\d]`, "abcd", nil}, {`[^\D[:digit:]]`, "abcd", nil}, {`(?i)\W`, "x", nil}, {`(?i)\W`, "k", nil}, {`(?i)\W`, "s", nil}, // can backslash-escape any punctuation {`\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~`, `!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, []int{1}}, {`[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~]+`, `!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, []int{1}}, {"\\`", "`", []int{1}}, {"[\\`]+", "`", []int{1}}, // long set of matches (longer than startSize) { ".", "qwertyuiopasdfghjklzxcvbnm1234567890", []int{1}, }, } func TestMatch(t *testing.T) { for _, tt := range matchTests { re, err := Compile("(?m)" + tt.re) if err != nil { t.Errorf("Compile(%#q): %v", tt.re, err) continue } b := []byte(tt.s) lines := grep(re, b) if !reflect.DeepEqual(lines, tt.m) { t.Errorf("grep(%#q, %q) = %v, want %v", tt.re, tt.s, lines, tt.m) } } } func grep(re *Regexp, b []byte) []int { var m []int lineno := 1 for { i := re.Match(b, true, true) if i < 0 { break } start := bytes.LastIndex(b[:i], nl) + 1 end := i + 1 if end > len(b) { end = len(b) } lineno += bytes.Count(b[:start], nl) m = append(m, lineno) if start < end && b[end-1] == '\n' { lineno++ } b = b[end:] if len(b) == 0 { break } } return m } var grepTests = []struct { re string s string out string err string g Grep }{ {re: `a+`, s: "abc\ndef\nghalloo\n", out: "input:abc\ninput:ghalloo\n"}, {re: `x.*y`, s: "xay\nxa\ny\n", out: "input:xay\n"}, } func TestGrep(t *testing.T) { for i, tt := range grepTests { re, err := Compile("(?m)" + tt.re) if err != nil { t.Errorf("Compile(%#q): %v", tt.re, err) continue } g := tt.g g.Regexp = re var out, errb bytes.Buffer g.Stdout = &out g.Stderr = &errb g.Reader(strings.NewReader(tt.s), "input") if out.String() != tt.out || errb.String() != tt.err { t.Errorf("#%d: grep(%#q, %q) = %q, %q, want %q, %q", i, 
tt.re, tt.s, out.String(), errb.String(), tt.out, tt.err) } } } codesearch-1.2.0/regexp/utf.go000066400000000000000000000131041364021717500162360ustar00rootroot00000000000000// Copyright 2011 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package regexp import ( "regexp/syntax" "unicode" "unicode/utf8" ) const ( instFail = syntax.InstFail instAlt = syntax.InstAlt instByteRange = syntax.InstRune | 0x80 // local opcode argFold = 1 << 16 ) func toByteProg(prog *syntax.Prog) error { var b runeBuilder for pc := range prog.Inst { i := &prog.Inst[pc] switch i.Op { case syntax.InstRune, syntax.InstRune1: // General rune range. PIA. // TODO: Pick off single-byte case. if lo, hi, fold, ok := oneByteRange(i); ok { i.Op = instByteRange i.Arg = uint32(lo)<<8 | uint32(hi) if fold { i.Arg |= argFold } break } r := i.Rune if syntax.Flags(i.Arg)&syntax.FoldCase != 0 { // Build folded list. var rr []rune if len(r) == 1 { rr = appendFoldedRange(rr, r[0], r[0]) } else { for j := 0; j < len(r); j += 2 { rr = appendFoldedRange(rr, r[j], r[j+1]) } } r = rr } b.init(prog, uint32(pc), i.Out) if len(r) == 1 { b.addRange(r[0], r[0], false) } else { for j := 0; j < len(r); j += 2 { b.addRange(r[j], r[j+1], false) } } case syntax.InstRuneAny, syntax.InstRuneAnyNotNL: // All runes. // AnyNotNL should exclude \n but the line-at-a-time // execution takes care of that for us. 
b.init(prog, uint32(pc), i.Out) b.addRange(0, unicode.MaxRune, false) } } return nil } func oneByteRange(i *syntax.Inst) (lo, hi byte, fold, ok bool) { if i.Op == syntax.InstRune1 { r := i.Rune[0] if r < utf8.RuneSelf { return byte(r), byte(r), false, true } } if i.Op != syntax.InstRune { return } fold = syntax.Flags(i.Arg)&syntax.FoldCase != 0 if len(i.Rune) == 1 || len(i.Rune) == 2 && i.Rune[0] == i.Rune[1] { r := i.Rune[0] if r >= utf8.RuneSelf { return } if fold && !asciiFold(r) { return } return byte(r), byte(r), fold, true } if len(i.Rune) == 2 && i.Rune[1] < utf8.RuneSelf { if fold { for r := i.Rune[0]; r <= i.Rune[1]; r++ { if asciiFold(r) { return } } } return byte(i.Rune[0]), byte(i.Rune[1]), fold, true } if len(i.Rune) == 4 && i.Rune[0] == i.Rune[1] && i.Rune[2] == i.Rune[3] && unicode.SimpleFold(i.Rune[0]) == i.Rune[2] && unicode.SimpleFold(i.Rune[2]) == i.Rune[0] { return byte(i.Rune[0]), byte(i.Rune[0]), true, true } return } func asciiFold(r rune) bool { if r >= utf8.RuneSelf { return false } r1 := unicode.SimpleFold(r) if r1 >= utf8.RuneSelf { return false } if r1 == r { return true } return unicode.SimpleFold(r1) == r } func maxRune(n int) rune { b := 0 if n == 1 { b = 7 } else { b = 8 - (n + 1) + 6*(n-1) } return 1< 0xbf { // Not a continuation byte, no need to cache. return b.uncachedSuffix(lo, hi, fold, next) } key := cacheKey{lo, hi, fold, next} if pc, ok := b.cache[key]; ok { return pc } pc := b.uncachedSuffix(lo, hi, fold, next) b.cache[key] = pc return pc } func (b *runeBuilder) addBranch(pc uint32) { // Add pc to the branch at the beginning. 
i := &b.p.Inst[b.begin] switch i.Op { case syntax.InstFail: i.Op = syntax.InstNop i.Out = pc return case syntax.InstNop: i.Op = syntax.InstAlt i.Arg = pc return case syntax.InstAlt: apc := uint32(len(b.p.Inst)) b.p.Inst = append(b.p.Inst, syntax.Inst{Op: instAlt, Out: i.Arg, Arg: pc}) i = &b.p.Inst[b.begin] i.Arg = apc b.begin = apc } } func (b *runeBuilder) addRange(lo, hi rune, fold bool) { if lo > hi { return } // TODO: Pick off 80-10FFFF for special handling? if lo == 0x80 && hi == 0x10FFFF { } // Split range into same-length sized ranges. for i := 1; i < utf8.UTFMax; i++ { max := maxRune(i) if lo <= max && max < hi { b.addRange(lo, max, fold) b.addRange(max+1, hi, fold) return } } // ASCII range is special. if hi < utf8.RuneSelf { b.addBranch(b.suffix(byte(lo), byte(hi), fold, 0)) return } // Split range into sections that agree on leading bytes. for i := 1; i < utf8.UTFMax; i++ { m := rune(1)<= 0; i-- { pc = b.suffix(ulo[i], uhi[i], false, pc) } b.addBranch(pc) } codesearch-1.2.0/sparse/000077500000000000000000000000001364021717500151155ustar00rootroot00000000000000codesearch-1.2.0/sparse/set.go000066400000000000000000000032401364021717500162360ustar00rootroot00000000000000// Copyright 2011 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // Package sparse implements sparse sets. package sparse // For comparison: running cindex over the Linux 2.6 kernel with this // implementation of trigram sets takes 11 seconds. If I change it to // a bitmap (which must be cleared between files) it takes 25 seconds. // A Set is a sparse set of uint32 values. // http://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html type Set struct { dense []uint32 sparse []uint32 } // NewSet returns a new Set with a given maximum size. // The set can contain numbers in [0, max-1]. 
func NewSet(max uint32) *Set {
	return &Set{
		sparse: make([]uint32, max),
	}
}

// Init initializes a Set to have a given maximum size.
// The set can contain numbers in [0, max-1].
// Any values already in the set are discarded, so Init may also be
// used to re-initialize a Set that has been used before.
func (s *Set) Init(max uint32) {
	s.sparse = make([]uint32, max)
	// Without this, old dense entries would still be reported as
	// members against the fresh sparse array.
	s.dense = s.dense[:0]
}

// Reset clears (empties) the set.
// Only dense is truncated; stale sparse entries are harmless because
// Has checks them against dense.
func (s *Set) Reset() {
	s.dense = s.dense[:0]
}

// Add adds x to the set if it is not already there.
// x must be less than the maximum size given to NewSet or Init.
func (s *Set) Add(x uint32) {
	if s.Has(x) {
		return
	}
	s.sparse[x] = uint32(len(s.dense))
	s.dense = append(s.dense, x)
}

// Has reports whether x is in the set.
// x is a member when sparse[x] is a valid index into dense and the
// dense entry at that index is x itself.
func (s *Set) Has(x uint32) bool {
	v := s.sparse[x]
	return v < uint32(len(s.dense)) && s.dense[v] == x
}

// Dense returns the values in the set.
// The values are listed in the order in which they
// were inserted.
// The returned slice is the set's own storage, not a copy.
func (s *Set) Dense() []uint32 {
	return s.dense
}

// Len returns the number of values in the set.
func (s *Set) Len() int {
	return len(s.dense)
}