pax_global_header00006660000000000000000000000064141206324170014512gustar00rootroot0000000000000052 comment=30ef4bf031a83c2f81bd772a8b4259412a14cbc5 breader-0.3.1/000077500000000000000000000000001412063241700131175ustar00rootroot00000000000000breader-0.3.1/.gitignore000077500000000000000000000004111412063241700151060ustar00rootroot00000000000000# Compiled Object files, Static and Dynamic libs (Shared Objects) *.o *.a *.so # Folders _obj _test # Architecture specific extensions/prefixes *.[568vq] [568vq].out *.cgo1.go *.cgo2.c _cgo_defun.c _cgo_gotypes.go _cgo_export.* _testmain.go *.exe *.directory breader-0.3.1/BufferedReader.go000066400000000000000000000123571412063241700163230ustar00rootroot00000000000000/*Package breader (Buffered File Reader), asynchronous parsing and pre-processing while reading file. Safe cancellation is also supported. Detail: https://github.com/shenwei356/breader */ package breader import ( "errors" "runtime" "strings" "sync" "github.com/shenwei356/xopen" "github.com/twotwotwo/sorts/sortutil" ) // Chunk is a struct compossing with slice of data and error as status type Chunk struct { ID uint64 // useful for keeping the order of chunk in downstream process Data []interface{} Err error } // chunk is chunk of lines type linesChunk struct { ID uint64 // useful for keeping the order of chunk in downstream process Data []string } // BufferedReader is BufferedReader type BufferedReader struct { reader *xopen.Reader BufferSize int ChunkSize int ProcessFunc func(string) (interface{}, bool, error) Ch chan Chunk done chan struct{} finished bool cancelled bool } // NewDefaultBufferedReader creates BufferedReader with default parameter func NewDefaultBufferedReader(file string) (*BufferedReader, error) { reader, err := initBufferedReader(file, runtime.NumCPU(), 100, DefaultFunc) if err != nil { if err == xopen.ErrNoContent { reader.Ch = make(chan Chunk, 100) close(reader.Ch) return reader, nil } return reader, err } reader.run() return reader, nil } // NewBufferedReader is the constructor of BufferedReader with full parameters func NewBufferedReader(file string, bufferSize int, chunkSize int, fn func(line string) (interface{}, bool, error)) (*BufferedReader, error) { reader, err := initBufferedReader(file, bufferSize, chunkSize, fn) if err != nil { if err == xopen.ErrNoContent { reader.Ch = make(chan Chunk, bufferSize) close(reader.Ch) return reader, nil } return reader, err } reader.run() return reader, nil } // DefaultFunc just trim the new line symbol var DefaultFunc = func(line string) (interface{}, bool, error) { line = strings.TrimRight(line, "\r\n") return line, true, nil } func initBufferedReader(file string, bufferSize int, chunkSize int, fn func(line string) (interface{}, bool, error)) (*BufferedReader, error) { if bufferSize < 1 { bufferSize = 1 } if chunkSize < 1 { chunkSize = 1 } reader := new(BufferedReader) fh, err := xopen.Ropen(file) if err != nil { return reader, err } reader.reader = fh reader.BufferSize = bufferSize reader.ChunkSize = chunkSize reader.ProcessFunc = fn reader.Ch = make(chan Chunk, bufferSize) reader.done = make(chan struct{}) reader.finished = false reader.cancelled = false return reader, nil } // ErrorCanceled means that the reading process is canceled var ErrorCanceled = errors.New("reading canceled") func (reader *BufferedReader) run() { ch2 := make(chan Chunk, reader.BufferSize) // receive processed chunks and return them in order go func() { var id uint64 chunks := make(map[uint64]Chunk) for chunk := range ch2 { if chunk.Err != nil { reader.Ch <- chunk close(reader.Ch) return } if chunk.ID == id { reader.Ch <- chunk id++ } else { // check bufferd result for true { if chunk1, ok := chunks[id]; ok { reader.Ch <- chunk1 delete(chunks, chunk1.ID) id++ } else { break } } chunks[chunk.ID] = chunk } } if len(chunks) > 0 { ids := make([]uint64, len(chunks)) i := 0 for id := range chunks { ids[i] = id i++ } sortutil.Uint64s(ids) for _, id := range ids { chunk := chunks[id] reader.Ch <- chunk } } close(reader.Ch) }() // receive lines and process with ProcessFunc ch := make(chan linesChunk, reader.BufferSize) go func() { defer close(ch2) var wg sync.WaitGroup tokens := make(chan int, reader.BufferSize) errDetectedCh := make(chan int) var once sync.Once for chunk := range ch { tokens <- 1 wg.Add(1) go func(chunk linesChunk) { defer func() { wg.Done() <-tokens }() var chunkData []interface{} for _, line := range chunk.Data { result, ok, err := reader.ProcessFunc(line) if err != nil { select { case <-errDetectedCh: default: ch2 <- Chunk{chunk.ID, chunkData, err} once.Do(func() { close(errDetectedCh) }) } return } if ok { chunkData = append(chunkData, result) } } ch2 <- Chunk{chunk.ID, chunkData, nil} }(chunk) } wg.Wait() }() // read lines go func() { var ( i int id uint64 line string err error ) chunkData := make([]string, reader.ChunkSize) for { select { case <-reader.done: if !reader.finished { reader.finished = true reader.reader.Close() close(ch) return } default: } line, err = reader.reader.ReadString('\n') if err != nil { chunkData[i] = line i++ ch <- linesChunk{id, chunkData[0:i]} reader.finished = true reader.reader.Close() close(ch) return } chunkData[i] = line i++ if i == reader.ChunkSize { ch <- linesChunk{id, chunkData[0:i]} id++ chunkData = make([]string, reader.ChunkSize) i = 0 } } }() } // Cancel method cancel the reading process func (reader *BufferedReader) Cancel() { if !reader.finished && !reader.cancelled { close(reader.done) reader.cancelled = true } } breader-0.3.1/BufferedReader_test.go000066400000000000000000000062101412063241700173510ustar00rootroot00000000000000package breader import ( "io/ioutil" "os" "strconv" "strings" "testing" ) var testfile = "testdata.tsv" func TestUnprocessedText(t *testing.T) { var text []string fn := func(line string) (interface{}, bool, error) { return line, true, nil } reader, err := NewBufferedReader(testfile, 2, 4, fn) if err != nil { t.Error(err) return } for chunk := range reader.Ch { if chunk.Err != nil { t.Error(chunk.Err) return } for _, data := range chunk.Data { text = append(text, data.(string)) } } originalText, err := readFileText(testfile) if err != nil { t.Error(err) return } if strings.Join(text, "") != originalText { t.Error("text unmatch") } } func TestProcessedText(t *testing.T) { type Slice []string fn := func(line string) (interface{}, bool, error) { line = strings.TrimRight(line, "\n") if line == "" || line[0] == '#' { return "", false, nil } items := strings.Split(line, "\t") if len(items) != 2 { return items, false, nil } return Slice(items), true, nil } reader, err := NewBufferedReader(testfile, 2, 4, fn) if err != nil { t.Error(err) return } n := 0 for chunk := range reader.Ch { if chunk.Err != nil { t.Error(chunk.Err) return } // for _, data := range chunk.Data { // switch reflect.TypeOf(data).Kind() { // case reflect.Slice: // s := reflect.ValueOf(data) // items := make([]string, s.Len()) // for i := 0; i < s.Len(); i++ { // items[i] = s.Index(i).String() // } // fmt.Println(items) // n++ // } // fmt.Println(data.(Slice)) // n++ // } n += len(chunk.Data) } if n != 9 { t.Error("testing TestProcessedText failed") } } func TestCancellation(t *testing.T) { reader, err := NewBufferedReader(testfile, 1, 1, DefaultFunc) if err != nil { t.Error(err) return } // note that range is bufferd. using range will be failed // for chunk := range reader.Ch { LOOP: for { select { case chunk := <-reader.Ch: if chunk.Err != nil { t.Log(chunk.Err) return } reader.Cancel() break LOOP default: } } } func TestProcessedTextReturnObject(t *testing.T) { type string2int struct { id string value int } fn := func(line string) (interface{}, bool, error) { line = strings.TrimRight(line, "\n") if line == "" || line[0] == '#' { return nil, false, nil } items := strings.Split(line, "\t") if len(items) != 2 { return nil, false, nil } if items[0] == "" || items[1] == "" { return nil, false, nil } id := items[0] value, err := strconv.Atoi(items[1]) if err != nil { return nil, false, err } return string2int{id, value}, true, nil } reader, err := NewBufferedReader(testfile, 2, 4, fn) if err != nil { t.Error(err) return } n := 0 for chunk := range reader.Ch { if chunk.Err != nil { t.Error(chunk.Err) return } for _, data := range chunk.Data { _ = data.(string2int) n++ } } if n != 7 { t.Error("testing TestProcessedTextReturnObject failed") } } func readFileText(file string) (string, error) { fh, err := os.Open(file) defer fh.Close() if err != nil { return "", err } bs, _ := ioutil.ReadAll(fh) return string(bs), nil } breader-0.3.1/LICENSE000066400000000000000000000020621412063241700141240ustar00rootroot00000000000000The MIT License (MIT) Copyright © 2016 Wei Shen Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. breader-0.3.1/README.md000066400000000000000000000063011412063241700143760ustar00rootroot00000000000000# breader [![GoDoc](https://godoc.org/github.com/shenwei356/breader?status.svg)](https://godoc.org/github.com/shenwei356/breader) [![Go Report Card](https://goreportcard.com/badge/github.com/shenwei356/breader)](https://goreportcard.com/report/github.com/shenwei356/breader) breader (Buffered File Reader), asynchronous parsing and pre-processing while reading file. Safe cancellation is also supported. ## Example 1). Simple example with default parameters (`ChunkSize`: 100; `BufferSize`: #. of CPUs, `ProcessFunc`: trimming new-line symbol) ```go import "github.com/shenwei356/breader" reader, err := breader.NewDefaultBufferedReader(file) checkErr(err) for chunk := range reader.Ch { checkError(chunk.Err) for _, data := range chunk.Data { line := data.(string) fmt.Println(line) } } ``` 2). Example with custom pre-processing function: splitting line to slice. **Note the processing of interface{} containing slice, using a custom struct is recommended**. ```go type Slice []string // custom type fn := func(line string) (interface{}, bool, error) { line = strings.TrimRight(line, "\n") if line == "" || line[0] == '#' { // ignoring blank line and comment line return "", false, nil } items := strings.Split(line, "\t") if len(items) != 2 { return items, false, nil } return Slice(items), true, nil } reader, err := breader.NewBufferedReader(file, runtime.NumCPU(), 100, fn) checkErr(err) for chunk := range reader.Ch { checkError(chunk.Err) for _, data := range chunk.Data { // do not simply use: data.(slice) fmt.Println(data.(Slice)) } } ``` 3). Example with custom pre-processing function: creating object from line data. ```go type string2int struct { id string value int } fn := func(line string) (interface{}, bool, error) { line = strings.TrimRight(line, "\n") if line == "" || line[0] == '#' { return nil, false, nil } items := strings.Split(line, "\t") if len(items) != 2 { return nil, false, nil } if items[0] == "" || items[1] == "" { return nil, false, nil } id := items[0] value, err := strconv.Atoi(items[1]) if err != nil { return nil, false, err } return string2int{id, value}, true, nil } reader, err := breader.NewBufferedReader(file, runtime.NumCPU(), 100, fn) checkErr(err) for chunk := range reader.Ch { checkError(chunk.Err) for _, data := range chunk.Data { obj := data.(string2int) // handle of the string2int object } } ``` 4). Example of cancellation. **Note that `range chanel` is buffered, therefore, `for-select-case` is used.** ```go reader, err := breader.NewBufferedReader(testfile, 0, 1, breader.DefaultFunc) checkErr(err) // note that range is bufferd. using range will be failed // for chunk := range reader.Ch { LOOP: for { select { case chunk := <-reader.Ch: if chunk.Err != nil { t.Log(chunk.Err) return } reader.Cancel() break LOOP default: } } ``` ## License [MIT License](https://github.com/shenwei356/breader/blob/master/LICENSE) breader-0.3.1/go.mod000066400000000000000000000005021412063241700142220ustar00rootroot00000000000000module github.com/shenwei356/breader go 1.16 require ( github.com/klauspost/compress v1.13.6 // indirect github.com/klauspost/pgzip v1.2.5 // indirect github.com/shenwei356/xopen v0.1.0 github.com/twotwotwo/sorts v0.0.0-20160814051341-bf5c1f2b8553 gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect ) breader-0.3.1/go.sum000066400000000000000000000025111412063241700142510ustar00rootroot00000000000000github.com/klauspost/compress v1.13.6 h1:P76CopJELS0TiO2mebmnzgWaajssP/EszplttgQxcgc= github.com/klauspost/compress v1.13.6/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk= github.com/klauspost/pgzip v1.2.5 h1:qnWYvvKqedOF2ulHpMG72XQol4ILEJ8k2wwRl/Km8oE= github.com/klauspost/pgzip v1.2.5/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs= github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI= github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/shenwei356/xopen v0.1.0 h1:PizY52rLA7A6EdkwKZ6A8h8/a+c9DCBXqfLtwVzsWnM= github.com/shenwei356/xopen v0.1.0/go.mod h1:6EQUa6I7Zsl2GQKqcL9qGLrTzVE+oZyly+uhzovQYSk= github.com/twotwotwo/sorts v0.0.0-20160814051341-bf5c1f2b8553 h1:DRC1ubdb3ZmyyIeCSTxjZIQAnpLPfKVgYrLETQuOPjo= github.com/twotwotwo/sorts v0.0.0-20160814051341-bf5c1f2b8553/go.mod h1:Rj7Csq/tZ/egz+Ltc2IVpsA5309AmSMEswjkTZmq2Xc= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= breader-0.3.1/testdata.tsv000066400000000000000000000001211412063241700154600ustar00rootroot00000000000000#id value abc 123 def 456 adf 457 asdf 999 agfaf 111 adfaf adfaf 233 aaa 8980