reedsolomon-1.9.13/000077500000000000000000000000001406411035300141305ustar00rootroot00000000000000reedsolomon-1.9.13/.gitignore000066400000000000000000000004201406411035300161140ustar00rootroot00000000000000
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so

# Folders
_obj
_test

# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out

*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*

_testmain.go

*.exe
*.test
*.prof
.idea
reedsolomon-1.9.13/.travis.yml000066400000000000000000000030141406411035300162370ustar00rootroot00000000000000
language: go

os:
  - linux
  - osx
  - windows

arch:
  - amd64
  - arm64
  - ppc64le
  - s390x

go:
  - 1.14.x
  - 1.15.x
  - 1.16.x
  - master

env:
  - GO111MODULE=off CGO_ENABLED=0

install:
  - go get ./...

script:
  - go vet ./...
  - go test -cpu=1,2 .
  - go test -tags=noasm -cpu=1,2 .
  - go build examples/simple-decoder.go
  - go build examples/simple-encoder.go
  - go build examples/stream-decoder.go
  - go build examples/stream-encoder.go

jobs:
  allow_failures:
    - go: 'master'
    - arch: s390x
  fast_finish: true
  include:
    - stage: other
      go: 1.16.x
      os: linux
      arch: amd64
      script:
        - diff <(gofmt -d .) <(printf "")
        - diff <(gofmt -d ./examples) <(printf "")
        - go get github.com/klauspost/asmfmt&&go install github.com/klauspost/asmfmt/cmd/asmfmt
        - diff <(asmfmt -d .) <(printf "")
        - CGO_ENABLED=1 go test -cpu=1 -short -race .
        - CGO_ENABLED=1 go test -cpu=2 -short -race .
        - CGO_ENABLED=1 go test -tags=noasm -cpu=1 -short -race .
        - CGO_ENABLED=1 go test -tags=noasm -cpu=4 -short -race .
        - CGO_ENABLED=1 go test -no-avx512 -short -race .
        - CGO_ENABLED=1 go test -no-avx512 -no-avx2 -short -race .
        - CGO_ENABLED=1 go test -no-avx512 -no-avx2 -no-ssse3 -short -race .
        - GOOS=linux GOARCH=386 go test -short .
    - stage: other
      go: 1.15.x
      os: linux
      arch: amd64
      script:
        - go test -no-avx512
        - go test -no-avx512 -no-avx2
        - go test -no-avx512 -no-avx2 -no-ssse3
reedsolomon-1.9.13/LICENSE000066400000000000000000000021231406411035300151330ustar00rootroot00000000000000
The MIT License (MIT)

Copyright (c) 2015 Klaus Post
Copyright (c) 2015 Backblaze

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
reedsolomon-1.9.13/README.md000066400000000000000000000404211406411035300154100ustar00rootroot00000000000000
# Reed-Solomon

[![GoDoc][1]][2] [![Build Status][3]][4]

[1]: https://godoc.org/github.com/klauspost/reedsolomon?status.svg
[2]: https://pkg.go.dev/github.com/klauspost/reedsolomon?tab=doc
[3]: https://travis-ci.org/klauspost/reedsolomon.svg?branch=master
[4]: https://travis-ci.org/klauspost/reedsolomon

Reed-Solomon Erasure Coding in Go, with speeds exceeding 1GB/s/cpu core implemented in pure Go.

This is a Go port of the [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon) library released by [Backblaze](http://backblaze.com), with some additional optimizations.

For an introduction on erasure coding, see the post on the [Backblaze blog](https://www.backblaze.com/blog/reed-solomon/).

Package home: https://github.com/klauspost/reedsolomon
Godoc: https://pkg.go.dev/github.com/klauspost/reedsolomon?tab=doc

# Installation

To get the package use the standard:

```bash
go get -u github.com/klauspost/reedsolomon
```

Using Go modules is recommended.

# Changes

## May 2020

* ARM64 optimizations, up to 2.5x faster.
* Added [WithFastOneParityMatrix](https://pkg.go.dev/github.com/klauspost/reedsolomon?tab=doc#WithFastOneParityMatrix) for faster operation with 1 parity shard.
* Much better performance when using a limited number of goroutines.
* AVX512 is now using multiple cores.
* Stream processing overhaul, big speedups in most cases.
* AVX512 optimizations

## March 6, 2019

The pure Go implementation is about 30% faster. Minor tweaks to assembler implementations.

## February 8, 2019

AVX512 accelerated version added for Intel Skylake CPUs. This can give up to a 4x speed improvement as compared to AVX2. See [here](https://github.com/klauspost/reedsolomon#performance-on-avx512) for more details.

## December 18, 2018

Assembly code for ppc64le has been contributed; this boosts performance by about 10x on this platform.

## November 18, 2017

Added [WithAutoGoroutines](https://godoc.org/github.com/klauspost/reedsolomon#WithAutoGoroutines) which will attempt to calculate the optimal number of goroutines to use based on your expected shard size and detected CPU.

## October 1, 2017

* [Cauchy Matrix](https://godoc.org/github.com/klauspost/reedsolomon#WithCauchyMatrix) is now an option. Thanks to [templexxx](https://github.com/templexxx) for the basis of this.
* Default maximum number of [goroutines](https://godoc.org/github.com/klauspost/reedsolomon#WithMaxGoroutines) has been increased for better multi-core scaling.
* After several requests, Reconstruct and ReconstructData now allow slices of zero length but sufficient capacity to be used instead of allocating new memory.

## August 26, 2017

* The [`Encoder()`](https://godoc.org/github.com/klauspost/reedsolomon#Encoder) now contains an `Update` function contributed by [chenzhongtao](https://github.com/chenzhongtao).
* [Frank Wessels](https://github.com/fwessels) kindly contributed ARM 64 bit assembly, which gives a huge performance boost on this platform.

## July 20, 2017

`ReconstructData` added to [`Encoder`](https://godoc.org/github.com/klauspost/reedsolomon#Encoder) interface. This can cause compatibility issues if you implement your own Encoder. A simple workaround can be added:

```Go
func (e *YourEnc) ReconstructData(shards [][]byte) error {
	return ReconstructData(shards)
}
```

You can of course also do your own implementation.
The [`StreamEncoder`](https://godoc.org/github.com/klauspost/reedsolomon#StreamEncoder) handles this without modifying the interface. This is a good lesson on why returning interfaces is not a good design.

# Usage

This section assumes you know the basics of Reed-Solomon encoding. A good start is this [Backblaze blog post](https://www.backblaze.com/blog/reed-solomon/).

This package performs the calculation of the parity sets. The usage is therefore relatively simple.

First of all, you need to choose your distribution of data and parity shards. A 'good' distribution is very subjective, and will depend a lot on your usage scenario. A good starting point is above 5 and below 257 data shards (the maximum supported number), and the number of parity shards to be 2 or above, and below the number of data shards.

To create an encoder with 10 data shards (where your data goes) and 3 parity shards (calculated):

```Go
enc, err := reedsolomon.New(10, 3)
```

This encoder will work for all parity sets with this distribution of data and parity shards. The error will only be set if you specify 0 or negative values in any of the parameters, or if you specify more than 256 data shards.

If you will primarily be using it with one shard size it is recommended to use [`WithAutoGoroutines(shardSize)`](https://pkg.go.dev/github.com/klauspost/reedsolomon?tab=doc#WithAutoGoroutines) as an additional parameter. This will attempt to calculate the optimal number of goroutines to use for the best speed. It is not required that all shards are this size.

The data you send and receive is a simple slice of byte slices; `[][]byte`. In the example above, the top slice must have a length of 13.

```Go
data := make([][]byte, 13)
```

You should then fill the first 10 slices with *equally sized* data, and create parity shards that will be populated with parity data. In this case we create the data in memory, but you could for instance also use [mmap](https://github.com/edsrzf/mmap-go) to map files.

```Go
// Create all shards, size them at 50000 each
for i := range data {
	data[i] = make([]byte, 50000)
}

// Fill some data into the data shards
for i, in := range data[:10] {
	for j := range in {
		in[j] = byte((i + j) & 0xff)
	}
}
```

To populate the parity shards, you simply call `Encode()` with your data.

```Go
err = enc.Encode(data)
```

The only case where you should get an error is if the data shards aren't of equal size. The last 3 shards now contain parity data. You can verify this by calling `Verify()`:

```Go
ok, err = enc.Verify(data)
```

The final (and important) part is to be able to reconstruct missing shards. For this to work, you need to know which parts of your data are missing. The encoder *does not know which parts are invalid*, so if data corruption is a likely scenario, you need to implement a hash check for each shard. If a byte has changed in your set, and you don't know which it is, there is no way to reconstruct the data set.

To indicate missing data, you set the shard to nil before calling `Reconstruct()`:

```Go
// Delete two data shards
data[3] = nil
data[7] = nil

// Reconstruct the missing shards
err := enc.Reconstruct(data)
```

The missing data and parity shards will be recreated. If more than 3 shards are missing, the reconstruction will fail.
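
The hash check itself is outside the scope of this package. A minimal sketch of one approach is shown below, assuming you recorded a SHA-256 hash per shard in your own metadata; `dropCorrupt` and `storedHashes` are hypothetical names, not part of this package:

```Go
// import "crypto/sha256"

// dropCorrupt nils out every shard whose SHA-256 no longer matches the
// hash you recorded at encoding time, so Reconstruct() will recreate it.
func dropCorrupt(shards [][]byte, storedHashes [][sha256.Size]byte) {
	for i, shard := range shards {
		if shard != nil && sha256.Sum256(shard) != storedHashes[i] {
			shards[i] = nil // mark as missing
		}
	}
}
```

After such a check, the corrupted shards are treated like any other missing shards by `Reconstruct()`.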

If you are only interested in the data shards (for reading purposes) you can call `ReconstructData()`:

```Go
// Delete two data shards
data[3] = nil
data[7] = nil

// Reconstruct just the missing data shards
err := enc.ReconstructData(data)
```

So to sum up reconstruction:

* The number of data/parity shards must match the numbers used for encoding.
* The order of shards must be the same as used when encoding.
* You may only supply data you know is valid.
* Invalid shards should be set to nil.

For complete examples of an encoder and decoder see the [examples folder](https://github.com/klauspost/reedsolomon/tree/master/examples).

# Splitting/Joining Data

You might have a large slice of data. To help you split this, there are some helper functions that can split and join a single byte slice.

```Go
bigfile, _ := ioutil.ReadFile("myfile.data")

// Split the file
split, err := enc.Split(bigfile)
```

This will split the file into the number of data shards set when creating the encoder and create empty parity shards.

An important thing to note is that you have to *keep track of the exact input size*. If the size of the input isn't divisible by the number of data shards, extra zeros will be inserted in the last shard.

To join a data set, use the `Join()` function, which will join the shards and write it to the `io.Writer` you supply:

```Go
// Join a data set and write it to io.Discard.
err = enc.Join(io.Discard, data, len(bigfile))
```

# Streaming/Merging

It might seem like a limitation that all data should be in memory, but an important property is that *as long as the number of data/parity shards is the same, you can merge/split data sets*, and they will remain valid as a separate set.

```Go
// Split the data set of 50000 elements into two of 25000
splitA := make([][]byte, 13)
splitB := make([][]byte, 13)

// Merge into a 100000 element set
merged := make([][]byte, 13)

for i := range data {
	splitA[i] = data[i][:25000]
	splitB[i] = data[i][25000:]

	// Concatenate it to itself
	merged[i] = append(make([]byte, 0, len(data[i])*2), data[i]...)
	merged[i] = append(merged[i], data[i]...)
}

// Each part should still verify as ok.
ok, err := enc.Verify(splitA)
if ok && err == nil {
	log.Println("splitA ok")
}

ok, err = enc.Verify(splitB)
if ok && err == nil {
	log.Println("splitB ok")
}

ok, err = enc.Verify(merged)
if ok && err == nil {
	log.Println("merge ok")
}
```

This means that if you have a data set that may not fit into memory, you can split processing into smaller blocks. For the best throughput, don't use too small blocks.

This also means that you can divide big input up into smaller blocks, and do reconstruction on parts of your data. This doesn't give the same flexibility as a higher number of data shards, but it will be much more performant.

# Streaming API

Support has been added for a streaming API, to help perform fully streaming operations, which enables you to do the same operations, but on streams.

To use the stream API, use the [`NewStream`](https://godoc.org/github.com/klauspost/reedsolomon#NewStream) function to create the encoding/decoding interfaces.

You can use [`WithConcurrentStreams`](https://godoc.org/github.com/klauspost/reedsolomon#WithConcurrentStreams) to obtain an interface that reads/writes concurrently from the streams.

You can specify the size of each operation using [`WithStreamBlockSize`](https://godoc.org/github.com/klauspost/reedsolomon#WithStreamBlockSize). This will set the size of each read/write operation.
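
For instance, a stream interface that processes shards concurrently in 1 MiB blocks might be created like this (a sketch; the option values are illustrative, not tuned recommendations):

```Go
enc, err := reedsolomon.NewStream(10, 3,
	reedsolomon.WithConcurrentStreams(true),
	reedsolomon.WithStreamBlockSize(1<<20)) // 1 MiB per read/write
```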

Input is delivered as `[]io.Reader`, output as `[]io.Writer`, and functionality corresponds to the in-memory API. Each stream must supply the same amount of data, similar to how each slice must be the same size with the in-memory API. If an error occurs in relation to a stream, a [`StreamReadError`](https://godoc.org/github.com/klauspost/reedsolomon#StreamReadError) or [`StreamWriteError`](https://godoc.org/github.com/klauspost/reedsolomon#StreamWriteError) will help you determine which stream was the offender.

There is no buffering or timeouts/retry specified. If you want to add that, you need to add it to the Reader/Writer.

For complete examples of a streaming encoder and decoder see the [examples folder](https://github.com/klauspost/reedsolomon/tree/master/examples).

# Advanced Options

You can modify internal options which affect how jobs are split between and processed by goroutines.

To create options, use the WithXXX functions. You can supply options to `New` and `NewStream`. If no Options are supplied, default options are used.

Example of how to supply options:

```Go
enc, err := reedsolomon.New(10, 3, WithMaxGoroutines(25))
```

# Performance

Performance depends mainly on the number of parity shards. In rough terms, doubling the number of parity shards will double the encoding time.

Here are the throughput numbers with some different selections of data and parity shards. For reference, each shard is 1MB of random data, and 16 CPU cores are used for encoding.

| Data | Parity | Go MB/s | SSSE3 MB/s | AVX2 MB/s |
|------|--------|---------|------------|-----------|
| 5    | 2      | 14287   | 66355      | 108755    |
| 8    | 8      | 5569    | 34298      | 70516     |
| 10   | 4      | 6766    | 48237      | 93875     |
| 50   | 20     | 1540    | 12130      | 22090     |

The throughput numbers here are the size of the encoded data and parity shards.

If `runtime.GOMAXPROCS()` is set to a value higher than 1, the encoder will use multiple goroutines to perform the calculations in `Verify`, `Encode` and `Reconstruct`.

Example of performance scaling on AMD Ryzen 3950X - 16 physical cores, 32 logical cores, AVX2. The example uses 10 blocks with 1MB data each and 4 parity blocks.

| Threads | Speed      |
|---------|------------|
| 1       | 9979 MB/s  |
| 2       | 18870 MB/s |
| 4       | 33697 MB/s |
| 8       | 51531 MB/s |
| 16      | 59204 MB/s |

Benchmarking `Reconstruct()` followed by a `Verify()` (=`all`) versus just calling `ReconstructData()` (=`data`) gives the following result:

```
benchmark                            all MB/s     data MB/s    speedup
BenchmarkReconstruct10x2x10000-8     2011.67      10530.10     5.23x
BenchmarkReconstruct50x5x50000-8     4585.41      14301.60     3.12x
BenchmarkReconstruct10x2x1M-8        8081.15      28216.41     3.49x
BenchmarkReconstruct5x2x1M-8         5780.07      28015.37     4.85x
BenchmarkReconstruct10x4x1M-8        4352.56      14367.61     3.30x
BenchmarkReconstruct50x20x1M-8       1364.35      4189.79      3.07x
BenchmarkReconstruct10x4x16M-8       1484.35      5779.53      3.89x
```

# Performance on AVX512

The performance on AVX512 has been accelerated for Intel CPUs. This gives speedups on a per-core basis typically up to 2x compared to AVX2 as can be seen in the following table:

```
[...]
```

This speedup has been achieved by computing multiple parity blocks in parallel as opposed to one after the other. In doing so it is possible to minimize the memory bandwidth required for loading all data shards. At the same time the calculations are performed in the 512-bit wide ZMM registers and the surplus of ZMM registers (32 in total) is used to keep more data around (most notably the matrix coefficients).

# Performance on ARM64 NEON

By exploiting NEON instructions the performance for ARM has been accelerated. Below are the performance numbers for a single core on an EC2 m6g.16xlarge (Graviton2) instance (Amazon Linux 2):

```
BenchmarkGalois128K-64        119562     10028 ns/op    13070.78 MB/s
BenchmarkGalois1M-64           14380     83424 ns/op    12569.22 MB/s
BenchmarkGaloisXor128K-64      96508     12432 ns/op    10543.29 MB/s
BenchmarkGaloisXor1M-64        10000    100322 ns/op    10452.13 MB/s
```

# Performance on ppc64le

The performance for ppc64le has been accelerated. This gives roughly a 10x performance improvement on this architecture as can be seen below:

```
benchmark                      old MB/s     new MB/s     speedup
BenchmarkGalois128K-160        948.87       8878.85      9.36x
BenchmarkGalois1M-160          968.85       9041.92      9.33x
BenchmarkGaloisXor128K-160     862.02       7905.00      9.17x
BenchmarkGaloisXor1M-160       784.60       6296.65      8.03x
```

# asm2plan9s

[asm2plan9s](https://github.com/fwessels/asm2plan9s) is used for assembling the AVX2 instructions into their BYTE/WORD/LONG equivalents.

# Links

* [Backblaze Open Sources Reed-Solomon Erasure Coding Source Code](https://www.backblaze.com/blog/reed-solomon/).
* [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon). Compatible Java library by Backblaze.
* [ocaml-reed-solomon-erasure](https://gitlab.com/darrenldl/ocaml-reed-solomon-erasure). Compatible OCaml implementation.
* [reedsolomon-c](https://github.com/jannson/reedsolomon-c). C version, compatible with output from this package.
* [Reed-Solomon Erasure Coding in Haskell](https://github.com/NicolasT/reedsolomon). Haskell port of the package with similar performance.
* [reed-solomon-erasure](https://github.com/darrenldl/reed-solomon-erasure). Compatible Rust implementation.
* [go-erasure](https://github.com/somethingnew2-0/go-erasure). A similar library using cgo, slower in my tests.
* [Screaming Fast Galois Field Arithmetic](http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf). Basis for SSE3 optimizations.

# License

This code, as the original [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon), is published under an MIT license. See LICENSE file for more information.
reedsolomon-1.9.13/_gen/000077500000000000000000000000001406411035300150405ustar00rootroot00000000000000reedsolomon-1.9.13/_gen/gen.go000066400000000000000000000335621406411035300161470ustar00rootroot00000000000000
//+build generate

//go:generate go run gen.go -out ../galois_gen_amd64.s -stubs ../galois_gen_amd64.go -pkg=reedsolomon
//go:generate gofmt -w ../galois_gen_switch_amd64.go

package main

import (
	"bufio"
	"fmt"
	"os"

	"github.com/mmcloughlin/avo/attr"
	. "github.com/mmcloughlin/avo/build"
	"github.com/mmcloughlin/avo/buildtags"
	. "github.com/mmcloughlin/avo/operand"
	"github.com/mmcloughlin/avo/reg"
)

// Technically we can do slightly bigger, but we stay reasonable.
const inputMax = 10
const outputMax = 10

var switchDefs [inputMax][outputMax]string
var switchDefsX [inputMax][outputMax]string

// Prefetch offsets, set to 0 to disable.
// Disabled since they appear to be consistently slower.
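// When enabled, these are byte displacements passed to PREFETCHT0
// relative to the current source/destination pointers.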
const prefetchSrc = 0 const prefetchDst = 0 func main() { Constraint(buildtags.Not("appengine").ToConstraint()) Constraint(buildtags.Not("noasm").ToConstraint()) Constraint(buildtags.Not("nogen").ToConstraint()) Constraint(buildtags.Term("gc").ToConstraint()) const perLoopBits = 5 const perLoop = 1 << perLoopBits for i := 1; i <= inputMax; i++ { for j := 1; j <= outputMax; j++ { //genMulAvx2(fmt.Sprintf("mulAvxTwoXor_%dx%d", i, j), i, j, true) genMulAvx2(fmt.Sprintf("mulAvxTwo_%dx%d", i, j), i, j, false) genMulAvx2Sixty64(fmt.Sprintf("mulAvxTwo_%dx%d_64", i, j), i, j, false) } } f, err := os.Create("../galois_gen_switch_amd64.go") if err != nil { panic(err) } defer f.Close() w := bufio.NewWriter(f) defer w.Flush() w.WriteString(`// Code generated by command: go generate ` + os.Getenv("GOFILE") + `. DO NOT EDIT. // +build !appengine // +build !noasm // +build gc // +build !nogen package reedsolomon import "fmt" `) w.WriteString("const avx2CodeGen = true\n") w.WriteString(fmt.Sprintf("const maxAvx2Inputs = %d\nconst maxAvx2Outputs = %d\n", inputMax, outputMax)) w.WriteString(` func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { n := stop-start `) w.WriteString(fmt.Sprintf("n = (n>>%d)<<%d\n\n", perLoopBits, perLoopBits)) w.WriteString(`switch len(in) { `) for in, defs := range switchDefs[:] { w.WriteString(fmt.Sprintf(" case %d:\n switch len(out) {\n", in+1)) for out, def := range defs[:] { w.WriteString(fmt.Sprintf(" case %d:\n", out+1)) w.WriteString(def) } w.WriteString("}\n") } w.WriteString(`} panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) } `) Generate() } func genMulAvx2(name string, inputs int, outputs int, xor bool) { const perLoopBits = 5 const perLoop = 1 << perLoopBits total := inputs * outputs doc := []string{ fmt.Sprintf("%s takes %d inputs and produces %d outputs.", name, inputs, outputs), } if !xor { doc = append(doc, "The output is initialized to 0.") } // Load shuffle masks on every use. var loadNone bool // Use registers for destination registers. var regDst = true var reloadLength = false // lo, hi, 1 in, 1 out, 2 tmp, 1 mask est := total*2 + outputs + 5 if outputs == 1 { // We don't need to keep a copy of the input if only 1 output. est -= 2 } if est > 16 { loadNone = true // We run out of GP registers first, now. if inputs+outputs > 13 { regDst = false } // Save one register by reloading length. if inputs+outputs > 12 && regDst { reloadLength = true } } TEXT(name, attr.NOSPLIT, fmt.Sprintf("func(matrix []byte, in [][]byte, out [][]byte, start, n int)")) // SWITCH DEFINITION: s := fmt.Sprintf(" mulAvxTwo_%dx%d(matrix, in, out, start, n)\n", inputs, outputs) s += fmt.Sprintf("\t\t\t\treturn n\n") switchDefs[inputs-1][outputs-1] = s if loadNone { Comment("Loading no tables to registers") } else { // loadNone == false Comment("Loading all tables to registers") } if regDst { Comment("Destination kept in GP registers") } else { Comment("Destination kept on stack") } Doc(doc...) 
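	// Emit a //go:noescape directive on the generated stub, so slice
	// headers passed to the assembly routine are not treated as escaping.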
Pragma("noescape") Commentf("Full registers estimated %d YMM used", est) length := Load(Param("n"), GP64()) matrixBase := GP64() addr, err := Param("matrix").Base().Resolve() if err != nil { panic(err) } MOVQ(addr.Addr, matrixBase) SHRQ(U8(perLoopBits), length) TESTQ(length, length) JZ(LabelRef(name + "_end")) inLo := make([]reg.VecVirtual, total) inHi := make([]reg.VecVirtual, total) for i := range inLo { if loadNone { break } tableLo := YMM() tableHi := YMM() VMOVDQU(Mem{Base: matrixBase, Disp: i * 64}, tableLo) VMOVDQU(Mem{Base: matrixBase, Disp: i*64 + 32}, tableHi) inLo[i] = tableLo inHi[i] = tableHi } inPtrs := make([]reg.GPVirtual, inputs) inSlicePtr := GP64() addr, err = Param("in").Base().Resolve() if err != nil { panic(err) } MOVQ(addr.Addr, inSlicePtr) for i := range inPtrs { ptr := GP64() MOVQ(Mem{Base: inSlicePtr, Disp: i * 24}, ptr) inPtrs[i] = ptr } // Destination dst := make([]reg.VecVirtual, outputs) dstPtr := make([]reg.GPVirtual, outputs) addr, err = Param("out").Base().Resolve() if err != nil { panic(err) } outBase := addr.Addr outSlicePtr := GP64() MOVQ(addr.Addr, outSlicePtr) for i := range dst { dst[i] = YMM() if !regDst { continue } ptr := GP64() MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr) dstPtr[i] = ptr } offset := GP64() addr, err = Param("start").Resolve() if err != nil { panic(err) } MOVQ(addr.Addr, offset) if regDst { Comment("Add start offset to output") for _, ptr := range dstPtr { ADDQ(offset, ptr) } } Comment("Add start offset to input") for _, ptr := range inPtrs { ADDQ(offset, ptr) } // Offset no longer needed unless not regdst tmpMask := GP64() MOVQ(U32(15), tmpMask) lowMask := YMM() MOVQ(tmpMask, lowMask.AsX()) VPBROADCASTB(lowMask.AsX(), lowMask) if reloadLength { length = Load(Param("n"), GP64()) SHRQ(U8(perLoopBits), length) } Label(name + "_loop") if xor { Commentf("Load %d outputs", outputs) } else { Commentf("Clear %d outputs", outputs) } for i := range dst { if xor { if regDst { VMOVDQU(Mem{Base: dstPtr[i]}, dst[i]) if prefetchDst > 0 { PREFETCHT0(Mem{Base: dstPtr[i], Disp: prefetchDst}) } continue } ptr := GP64() MOVQ(outBase, ptr) VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 1}, dst[i]) if prefetchDst > 0 { PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1}) } } else { VPXOR(dst[i], dst[i], dst[i]) } } lookLow, lookHigh := YMM(), YMM() inLow, inHigh := YMM(), YMM() for i := range inPtrs { Commentf("Load and process 32 bytes from input %d to %d outputs", i, outputs) VMOVDQU(Mem{Base: inPtrs[i]}, inLow) if prefetchSrc > 0 { PREFETCHT0(Mem{Base: inPtrs[i], Disp: prefetchSrc}) } ADDQ(U8(perLoop), inPtrs[i]) VPSRLQ(U8(4), inLow, inHigh) VPAND(lowMask, inLow, inLow) VPAND(lowMask, inHigh, inHigh) for j := range dst { if loadNone { VMOVDQU(Mem{Base: matrixBase, Disp: 64 * (i*outputs + j)}, lookLow) VMOVDQU(Mem{Base: matrixBase, Disp: 32 + 64*(i*outputs+j)}, lookHigh) VPSHUFB(inLow, lookLow, lookLow) VPSHUFB(inHigh, lookHigh, lookHigh) } else { VPSHUFB(inLow, inLo[i*outputs+j], lookLow) VPSHUFB(inHigh, inHi[i*outputs+j], lookHigh) } VPXOR(lookLow, lookHigh, lookLow) VPXOR(lookLow, dst[j], dst[j]) } } Commentf("Store %d outputs", outputs) for i := range dst { if regDst { VMOVDQU(dst[i], Mem{Base: dstPtr[i]}) if prefetchDst > 0 && !xor { PREFETCHT0(Mem{Base: dstPtr[i], Disp: prefetchDst}) } ADDQ(U8(perLoop), dstPtr[i]) continue } ptr := GP64() MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr) VMOVDQU(dst[i], Mem{Base: ptr, Index: offset, Scale: 1}) if prefetchDst > 0 && !xor { PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: 
offset, Scale: 1}) } } Comment("Prepare for next loop") if !regDst { ADDQ(U8(perLoop), offset) } DECQ(length) JNZ(LabelRef(name + "_loop")) VZEROUPPER() Label(name + "_end") RET() } func genMulAvx2Sixty64(name string, inputs int, outputs int, xor bool) { if outputs >= 4 { return } const perLoopBits = 6 const perLoop = 1 << perLoopBits total := inputs * outputs doc := []string{ fmt.Sprintf("%s takes %d inputs and produces %d outputs.", name, inputs, outputs), } if !xor { doc = append(doc, "The output is initialized to 0.") } // Load shuffle masks on every use. var loadNone bool // Use registers for destination registers. var regDst = false var reloadLength = false // lo, hi, 1 in, 1 out, 2 tmp, 1 mask est := total*2 + outputs + 5 if outputs == 1 { // We don't need to keep a copy of the input if only 1 output. est -= 2 } if true || est > 16 { loadNone = true // We run out of GP registers first, now. if inputs+outputs > 13 { regDst = false } // Save one register by reloading length. if true || inputs+outputs > 12 && regDst { reloadLength = true } } TEXT(name, 0, fmt.Sprintf("func(matrix []byte, in [][]byte, out [][]byte, start, n int)")) // SWITCH DEFINITION: s := fmt.Sprintf("n = (n>>%d)<<%d\n", perLoopBits, perLoopBits) s += fmt.Sprintf(" mulAvxTwo_%dx%d_64(matrix, in, out, start, n)\n", inputs, outputs) s += fmt.Sprintf("\t\t\t\treturn n\n") switchDefs[inputs-1][outputs-1] = s if loadNone { Comment("Loading no tables to registers") } else { // loadNone == false Comment("Loading all tables to registers") } if regDst { Comment("Destination kept in GP registers") } else { Comment("Destination kept on stack") } Doc(doc...) Pragma("noescape") Commentf("Full registers estimated %d YMM used", est) length := Load(Param("n"), GP64()) matrixBase := GP64() addr, err := Param("matrix").Base().Resolve() if err != nil { panic(err) } MOVQ(addr.Addr, matrixBase) SHRQ(U8(perLoopBits), length) TESTQ(length, length) JZ(LabelRef(name + "_end")) inLo := make([]reg.VecVirtual, total) inHi := make([]reg.VecVirtual, total) for i := range inLo { if loadNone { break } tableLo := YMM() tableHi := YMM() VMOVDQU(Mem{Base: matrixBase, Disp: i * 64}, tableLo) VMOVDQU(Mem{Base: matrixBase, Disp: i*64 + 32}, tableHi) inLo[i] = tableLo inHi[i] = tableHi } inPtrs := make([]reg.GPVirtual, inputs) inSlicePtr := GP64() addr, err = Param("in").Base().Resolve() if err != nil { panic(err) } MOVQ(addr.Addr, inSlicePtr) for i := range inPtrs { ptr := GP64() MOVQ(Mem{Base: inSlicePtr, Disp: i * 24}, ptr) inPtrs[i] = ptr } // Destination dst := make([]reg.VecVirtual, outputs) dst2 := make([]reg.VecVirtual, outputs) dstPtr := make([]reg.GPVirtual, outputs) addr, err = Param("out").Base().Resolve() if err != nil { panic(err) } outBase := addr.Addr outSlicePtr := GP64() MOVQ(addr.Addr, outSlicePtr) MOVQ(outBase, outSlicePtr) for i := range dst { dst[i] = YMM() dst2[i] = YMM() if !regDst { continue } ptr := GP64() MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr) dstPtr[i] = ptr } offset := GP64() addr, err = Param("start").Resolve() if err != nil { panic(err) } MOVQ(addr.Addr, offset) if regDst { Comment("Add start offset to output") for _, ptr := range dstPtr { ADDQ(offset, ptr) } } Comment("Add start offset to input") for _, ptr := range inPtrs { ADDQ(offset, ptr) } // Offset no longer needed unless not regdst tmpMask := GP64() MOVQ(U32(15), tmpMask) lowMask := YMM() MOVQ(tmpMask, lowMask.AsX()) VPBROADCASTB(lowMask.AsX(), lowMask) if reloadLength { length = Load(Param("n"), GP64()) SHRQ(U8(perLoopBits), length) } Label(name + "_loop") 
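	// Main loop: each iteration consumes perLoop (64) bytes from every
	// input shard and accumulates the products into two YMM registers
	// per output shard.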
if xor { Commentf("Load %d outputs", outputs) } else { Commentf("Clear %d outputs", outputs) } for i := range dst { if xor { if regDst { VMOVDQU(Mem{Base: dstPtr[i]}, dst[i]) if prefetchDst > 0 { PREFETCHT0(Mem{Base: dstPtr[i], Disp: prefetchDst}) } continue } ptr := GP64() MOVQ(outBase, ptr) VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 1}, dst[i]) if prefetchDst > 0 { PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1}) } } else { VPXOR(dst[i], dst[i], dst[i]) VPXOR(dst2[i], dst2[i], dst2[i]) } } lookLow, lookHigh := YMM(), YMM() lookLow2, lookHigh2 := YMM(), YMM() inLow, inHigh := YMM(), YMM() in2Low, in2High := YMM(), YMM() for i := range inPtrs { Commentf("Load and process 64 bytes from input %d to %d outputs", i, outputs) VMOVDQU(Mem{Base: inPtrs[i]}, inLow) VMOVDQU(Mem{Base: inPtrs[i], Disp: 32}, in2Low) if prefetchSrc > 0 { PREFETCHT0(Mem{Base: inPtrs[i], Disp: prefetchSrc}) } ADDQ(U8(perLoop), inPtrs[i]) VPSRLQ(U8(4), inLow, inHigh) VPSRLQ(U8(4), in2Low, in2High) VPAND(lowMask, inLow, inLow) VPAND(lowMask, in2Low, in2Low) VPAND(lowMask, inHigh, inHigh) VPAND(lowMask, in2High, in2High) for j := range dst { if loadNone { VMOVDQU(Mem{Base: matrixBase, Disp: 64 * (i*outputs + j)}, lookLow) VMOVDQU(Mem{Base: matrixBase, Disp: 32 + 64*(i*outputs+j)}, lookHigh) VPSHUFB(in2Low, lookLow, lookLow2) VPSHUFB(inLow, lookLow, lookLow) VPSHUFB(in2High, lookHigh, lookHigh2) VPSHUFB(inHigh, lookHigh, lookHigh) } else { VPSHUFB(inLow, inLo[i*outputs+j], lookLow) VPSHUFB(in2Low, inLo[i*outputs+j], lookLow2) VPSHUFB(inHigh, inHi[i*outputs+j], lookHigh) VPSHUFB(in2High, inHi[i*outputs+j], lookHigh2) } VPXOR(lookLow, lookHigh, lookLow) VPXOR(lookLow2, lookHigh2, lookLow2) VPXOR(lookLow, dst[j], dst[j]) VPXOR(lookLow2, dst2[j], dst2[j]) } } Commentf("Store %d outputs", outputs) for i := range dst { if regDst { VMOVDQU(dst[i], Mem{Base: dstPtr[i]}) VMOVDQU(dst2[i], Mem{Base: dstPtr[i], Disp: 32}) if prefetchDst > 0 && !xor { PREFETCHT0(Mem{Base: dstPtr[i], Disp: prefetchDst}) } ADDQ(U8(perLoop), dstPtr[i]) continue } ptr := GP64() MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr) VMOVDQU(dst[i], Mem{Base: ptr, Index: offset, Scale: 1}) VMOVDQU(dst2[i], Mem{Base: ptr, Index: offset, Scale: 1, Disp: 32}) if prefetchDst > 0 && !xor { PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1}) } } Comment("Prepare for next loop") if !regDst { ADDQ(U8(perLoop), offset) } DECQ(length) JNZ(LabelRef(name + "_loop")) VZEROUPPER() Label(name + "_end") RET() } reedsolomon-1.9.13/_gen/go.mod000066400000000000000000000001411406411035300161420ustar00rootroot00000000000000module github.com/klauspost/reedsolomon/_gen go 1.14 require github.com/mmcloughlin/avo v0.2.0 reedsolomon-1.9.13/_gen/go.sum000066400000000000000000000057661406411035300162110ustar00rootroot00000000000000github.com/mmcloughlin/avo v0.2.0 h1:6vhoSaKtxb6f4RiH+LK2qL6GSMpFzhEwJYTTSZNy09w= github.com/mmcloughlin/avo v0.2.0/go.mod h1:5tidO2Z9Z7N6X7UMcGg+1KTj51O8OxYDCMHxCZTVpEA= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= golang.org/x/arch v0.0.0-20210405154355-08b684f594a5/go.mod h1:flIaEI6LNU6xOCD5PaJvn9wGP0agmIOqjrtsKGRguv4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/mod v0.3.0 
h1:RM4zey1++hCTbCVQfnWeKs9/IEsaBLA8vTkd0WVtmH4=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210403161142-5e06dd20ab57 h1:F5Gozwx4I1xtr/sr/8CFbb57iKi3297KFs0QDbGN60A=
golang.org/x/sys v0.0.0-20210403161142-5e06dd20ab57/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.0 h1:po9/4sTYwZU9lPhi1tOrb4hCv3qrhiQ77LZfGa2OjwY=
golang.org/x/tools v0.1.0/go.mod h1:xkSsbof2nBLbhDlRMhhhyNLN/zl3eTqcnHD5viDpcZ0=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4=
reedsolomon-1.9.13/appveyor.yml000066400000000000000000000005151406411035300165210ustar00rootroot00000000000000
os: Visual Studio 2015

platform: x64

clone_folder: c:\gopath\src\github.com\klauspost\reedsolomon

# environment variables
environment:
  GOPATH: c:\gopath

install:
  - echo %PATH%
  - echo %GOPATH%
  - go version
  - go env
  - go get -d ./...

build_script:
  - go test -v -cpu=2 ./...
  - go test -cpu=1,2,4 -short -race ./...
reedsolomon-1.9.13/examples/000077500000000000000000000000001406411035300157465ustar00rootroot00000000000000reedsolomon-1.9.13/examples/README.md000066400000000000000000000026301406411035300172260ustar00rootroot00000000000000
# Examples

This folder contains usage examples of the Reed-Solomon encoder.

# Simple Encoder/Decoder

Shows basic use of the encoder, and will encode a single file into a number of data and parity shards. This is meant as an example and is not meant for production use since there are a number of shortcomings noted below.

To build an executable use:

```bash
go build simple-decoder.go
go build simple-encoder.go
```

# Streaming API examples

There are streaming examples of the same functionality, which stream data instead of keeping it in memory.

To build the executables use:

```bash
go build stream-decoder.go
go build stream-encoder.go
```

## Shortcomings

* If the file size of the input isn't divisible by the number of data shards, the output will contain extra zeroes.
* If the shard numbers aren't the same for the decoder as in the encoder, invalid output will be generated.
* If values have changed in a shard, it cannot be reconstructed.
* If two shards have been swapped, reconstruction will always fail. You need to supply the shards in the same order as they were given to you.

The solution for this is to save a metadata file containing:

* File size.
* The number of data/parity shards.
* HASH of each shard.
* Order of the shards.

If you save these properties, you should be able to detect file corruption in a shard and be able to reconstruct your data if you have the needed number of shards left.
reedsolomon-1.9.13/examples/simple-decoder.go000066400000000000000000000060541406411035300211760ustar00rootroot00000000000000
//+build ignore

// Copyright 2015, Klaus Post, see LICENSE for details.
//
// Simple decoder example.
//
// The decoder reverses the process of "simple-encoder.go"
//
// To build an executable use:
//
// go build simple-decoder.go
//
// Simple Encoder/Decoder Shortcomings:
// * If the file size of the input isn't divisible by the number of data shards
//   the output will contain extra zeroes
//
// * If the shard numbers aren't the same for the decoder as in the
//   encoder, invalid output will be generated.
//
// * If values have changed in a shard, it cannot be reconstructed.
//
// * If two shards have been swapped, reconstruction will always fail.
//   You need to supply the shards in the same order as they were given to you.
//
// The solution for this is to save a metadata file containing:
//
// * File size.
// * The number of data/parity shards.
// * HASH of each shard.
// * Order of the shards.
//
// If you save these properties, you should be able to detect file corruption
// in a shard and be able to reconstruct your data if you have the needed number of shards left.

package main

import (
	"flag"
	"fmt"
	"io/ioutil"
	"os"

	"github.com/klauspost/reedsolomon"
)

var dataShards = flag.Int("data", 4, "Number of shards to split the data into")
var parShards = flag.Int("par", 2, "Number of parity shards")
var outFile = flag.String("out", "", "Alternative output path/file")

func init() {
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage of %s:\n", os.Args[0])
		fmt.Fprintf(os.Stderr, "  simple-decoder [-flags] basefile.ext\nDo not add the number to the filename.\n")
		fmt.Fprintf(os.Stderr, "Valid flags:\n")
		flag.PrintDefaults()
	}
}

func main() {
	// Parse flags
	flag.Parse()
	args := flag.Args()
	if len(args) != 1 {
		fmt.Fprintf(os.Stderr, "Error: No filenames given\n")
		flag.Usage()
		os.Exit(1)
	}
	fname := args[0]

	// Create matrix
	enc, err := reedsolomon.New(*dataShards, *parShards)
	checkErr(err)

	// Create shards and load the data.
	shards := make([][]byte, *dataShards+*parShards)
	for i := range shards {
		infn := fmt.Sprintf("%s.%d", fname, i)
		fmt.Println("Opening", infn)
		shards[i], err = ioutil.ReadFile(infn)
		if err != nil {
			fmt.Println("Error reading file", err)
			shards[i] = nil
		}
	}

	// Verify the shards
	ok, err := enc.Verify(shards)
	if ok {
		fmt.Println("No reconstruction needed")
	} else {
		fmt.Println("Verification failed. Reconstructing data")
		err = enc.Reconstruct(shards)
		if err != nil {
			fmt.Println("Reconstruct failed -", err)
			os.Exit(1)
		}
		ok, err = enc.Verify(shards)
		if !ok {
			fmt.Println("Verification failed after reconstruction, data likely corrupted.")
			os.Exit(1)
		}
		checkErr(err)
	}

	// Join the shards and write them
	outfn := *outFile
	if outfn == "" {
		outfn = fname
	}

	fmt.Println("Writing data to", outfn)
	f, err := os.Create(outfn)
	checkErr(err)

	// We don't know the exact filesize.
	err = enc.Join(f, shards, len(shards[0])**dataShards)
	checkErr(err)
}

func checkErr(err error) {
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error: %s", err.Error())
		os.Exit(2)
	}
}
reedsolomon-1.9.13/examples/simple-encoder.go000066400000000000000000000055701406411035300212110ustar00rootroot00000000000000
//+build ignore

// Copyright 2015, Klaus Post, see LICENSE for details.
//
// Simple encoder example
//
// The encoder encodes a single file into a number of shards
// To reverse the process see "simple-decoder.go"
//
// To build an executable use:
//
// go build simple-encoder.go
//
// Simple Encoder/Decoder Shortcomings:
// * If the file size of the input isn't divisible by the number of data shards
//   the output will contain extra zeroes
//
// * If the shard numbers aren't the same for the decoder as in the
//   encoder, invalid output will be generated.
//
// * If values have changed in a shard, it cannot be reconstructed.
//
// * If two shards have been swapped, reconstruction will always fail.
//   You need to supply the shards in the same order as they were given to you.
//
// The solution for this is to save a metadata file containing:
//
// * File size.
// * The number of data/parity shards.
// * HASH of each shard.
// * Order of the shards.
//
// If you save these properties, you should be able to detect file corruption
// in a shard and be able to reconstruct your data if you have the needed number of shards left.

package main

import (
	"flag"
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"

	"github.com/klauspost/reedsolomon"
)

var dataShards = flag.Int("data", 4, "Number of shards to split the data into, must be below 257.")
var parShards = flag.Int("par", 2, "Number of parity shards")
var outDir = flag.String("out", "", "Alternative output directory")

func init() {
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage of %s:\n", os.Args[0])
		fmt.Fprintf(os.Stderr, "  simple-encoder [-flags] filename.ext\n\n")
		fmt.Fprintf(os.Stderr, "Valid flags:\n")
		flag.PrintDefaults()
	}
}

func main() {
	// Parse command line parameters.
	flag.Parse()
	args := flag.Args()
	if len(args) != 1 {
		fmt.Fprintf(os.Stderr, "Error: No input filename given\n")
		flag.Usage()
		os.Exit(1)
	}
	if (*dataShards + *parShards) > 256 {
		fmt.Fprintf(os.Stderr, "Error: sum of data and parity shards cannot exceed 256\n")
		os.Exit(1)
	}
	fname := args[0]

	// Create encoding matrix.
	enc, err := reedsolomon.New(*dataShards, *parShards)
	checkErr(err)

	fmt.Println("Opening", fname)
	b, err := ioutil.ReadFile(fname)
	checkErr(err)

	// Split the file into equally sized shards.
	shards, err := enc.Split(b)
	checkErr(err)
	fmt.Printf("File split into %d data+parity shards with %d bytes/shard.\n", len(shards), len(shards[0]))

	// Encode parity
	err = enc.Encode(shards)
	checkErr(err)

	// Write out the resulting files.
	dir, file := filepath.Split(fname)
	if *outDir != "" {
		dir = *outDir
	}
	for i, shard := range shards {
		outfn := fmt.Sprintf("%s.%d", file, i)
		fmt.Println("Writing to", outfn)
		err = ioutil.WriteFile(filepath.Join(dir, outfn), shard, 0644)
		checkErr(err)
	}
}

func checkErr(err error) {
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error: %s", err.Error())
		os.Exit(2)
	}
}
reedsolomon-1.9.13/examples/stream-decoder.go000066400000000000000000000100141406411035300211700ustar00rootroot00000000000000
//+build ignore

// Copyright 2015, Klaus Post, see LICENSE for details.
//
// Stream decoder example.
//
// The decoder reverses the process of "stream-encoder.go"
//
// To build an executable use:
//
// go build stream-decoder.go
//
// Simple Encoder/Decoder Shortcomings:
// * If the file size of the input isn't divisible by the number of data shards
//   the output will contain extra zeroes
//
// * If the shard numbers aren't the same for the decoder as in the
//   encoder, invalid output will be generated.
//
// * If values have changed in a shard, it cannot be reconstructed.
//
// * If two shards have been swapped, reconstruction will always fail.
//   You need to supply the shards in the same order as they were given to you.
//
// The solution for this is to save a metadata file containing:
//
// * File size.
// * The number of data/parity shards.
// * HASH of each shard.
// * Order of the shards.
//
// If you save these properties, you should be able to detect file corruption
// in a shard and be able to reconstruct your data if you have the needed number of shards left.

package main

import (
	"flag"
	"fmt"
	"io"
	"os"

	"github.com/klauspost/reedsolomon"
)

var dataShards = flag.Int("data", 4, "Number of shards to split the data into")
var parShards = flag.Int("par", 2, "Number of parity shards")
var outFile = flag.String("out", "", "Alternative output path/file")

func init() {
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage of %s:\n", os.Args[0])
		fmt.Fprintf(os.Stderr, "  %s [-flags] basefile.ext\nDo not add the number to the filename.\n", os.Args[0])
		fmt.Fprintf(os.Stderr, "Valid flags:\n")
		flag.PrintDefaults()
	}
}

func main() {
	// Parse flags
	flag.Parse()
	args := flag.Args()
	if len(args) != 1 {
		fmt.Fprintf(os.Stderr, "Error: No filenames given\n")
		flag.Usage()
		os.Exit(1)
	}
	fname := args[0]

	// Create matrix
	enc, err := reedsolomon.NewStream(*dataShards, *parShards)
	checkErr(err)

	// Open the inputs
	shards, size, err := openInput(*dataShards, *parShards, fname)
	checkErr(err)

	// Verify the shards
	ok, err := enc.Verify(shards)
	if ok {
		fmt.Println("No reconstruction needed")
	} else {
		fmt.Println("Verification failed. Reconstructing data")
		shards, size, err = openInput(*dataShards, *parShards, fname)
		checkErr(err)

		// Create out destination writers
		out := make([]io.Writer, len(shards))
		for i := range out {
			if shards[i] == nil {
				outfn := fmt.Sprintf("%s.%d", fname, i)
				fmt.Println("Creating", outfn)
				out[i], err = os.Create(outfn)
				checkErr(err)
			}
		}
		err = enc.Reconstruct(shards, out)
		if err != nil {
			fmt.Println("Reconstruct failed -", err)
			os.Exit(1)
		}

		// Close output.
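		// The reconstructed shards were written as streams; they must be
		// closed (and later re-opened) before the set can be verified.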
		for i := range out {
			if out[i] != nil {
				err := out[i].(*os.File).Close()
				checkErr(err)
			}
		}

		shards, size, err = openInput(*dataShards, *parShards, fname)
		ok, err = enc.Verify(shards)
		if !ok {
			fmt.Println("Verification failed after reconstruction, data likely corrupted:", err)
			os.Exit(1)
		}
		checkErr(err)
	}

	// Join the shards and write them
	outfn := *outFile
	if outfn == "" {
		outfn = fname
	}

	fmt.Println("Writing data to", outfn)
	f, err := os.Create(outfn)
	checkErr(err)

	shards, size, err = openInput(*dataShards, *parShards, fname)
	checkErr(err)

	// We don't know the exact filesize.
	err = enc.Join(f, shards, int64(*dataShards)*size)
	checkErr(err)
}

func openInput(dataShards, parShards int, fname string) (r []io.Reader, size int64, err error) {
	// Create shards and load the data.
	shards := make([]io.Reader, dataShards+parShards)
	for i := range shards {
		infn := fmt.Sprintf("%s.%d", fname, i)
		fmt.Println("Opening", infn)
		f, err := os.Open(infn)
		if err != nil {
			fmt.Println("Error reading file", err)
			shards[i] = nil
			continue
		} else {
			shards[i] = f
		}
		stat, err := f.Stat()
		checkErr(err)
		if stat.Size() > 0 {
			size = stat.Size()
		} else {
			shards[i] = nil
		}
	}
	return shards, size, nil
}

func checkErr(err error) {
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error: %s", err.Error())
		os.Exit(2)
	}
}
reedsolomon-1.9.13/examples/stream-encoder.go000066400000000000000000000066201406411035300212110ustar00rootroot00000000000000
//+build ignore

// Copyright 2015, Klaus Post, see LICENSE for details.
//
// Simple stream encoder example
//
// The encoder encodes a single file into a number of shards
// To reverse the process see "stream-decoder.go"
//
// To build an executable use:
//
// go build stream-encoder.go
//
// Simple Encoder/Decoder Shortcomings:
// * If the file size of the input isn't divisible by the number of data shards
//   the output will contain extra zeroes
//
// * If the shard numbers aren't the same for the decoder as in the
//   encoder, invalid output will be generated.
//
// * If values have changed in a shard, it cannot be reconstructed.
//
// * If two shards have been swapped, reconstruction will always fail.
//   You need to supply the shards in the same order as they were given to you.
//
// The solution for this is to save a metadata file containing:
//
// * File size.
// * The number of data/parity shards.
// * HASH of each shard.
// * Order of the shards.
//
// If you save these properties, you should be able to detect file corruption
// in a shard and be able to reconstruct your data if you have the needed number of shards left.

package main

import (
	"flag"
	"fmt"
	"os"
	"path/filepath"

	"io"

	"github.com/klauspost/reedsolomon"
)

var dataShards = flag.Int("data", 4, "Number of shards to split the data into, must be below 257.")
var parShards = flag.Int("par", 2, "Number of parity shards")
var outDir = flag.String("out", "", "Alternative output directory")

func init() {
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage of %s:\n", os.Args[0])
		fmt.Fprintf(os.Stderr, "  %s [-flags] filename.ext\n\n", os.Args[0])
		fmt.Fprintf(os.Stderr, "Valid flags:\n")
		flag.PrintDefaults()
	}
}

func main() {
	// Parse command line parameters.
	flag.Parse()
	args := flag.Args()
	if len(args) != 1 {
		fmt.Fprintf(os.Stderr, "Error: No input filename given\n")
		flag.Usage()
		os.Exit(1)
	}
	if (*dataShards + *parShards) > 256 {
		fmt.Fprintf(os.Stderr, "Error: sum of data and parity shards cannot exceed 256\n")
		os.Exit(1)
	}
	fname := args[0]

	// Create encoding matrix.
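	// NewStream mirrors the in-memory API, but reads shards from
	// io.Reader and writes them to io.Writer.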
	enc, err := reedsolomon.NewStream(*dataShards, *parShards)
	checkErr(err)

	fmt.Println("Opening", fname)
	f, err := os.Open(fname)
	checkErr(err)

	instat, err := f.Stat()
	checkErr(err)

	shards := *dataShards + *parShards
	out := make([]*os.File, shards)

	// Create the resulting files.
	dir, file := filepath.Split(fname)
	if *outDir != "" {
		dir = *outDir
	}
	for i := range out {
		outfn := fmt.Sprintf("%s.%d", file, i)
		fmt.Println("Creating", outfn)
		out[i], err = os.Create(filepath.Join(dir, outfn))
		checkErr(err)
	}

	// Split into files.
	data := make([]io.Writer, *dataShards)
	for i := range data {
		data[i] = out[i]
	}
	// Do the split
	err = enc.Split(f, data, instat.Size())
	checkErr(err)

	// Close and re-open the files.
	input := make([]io.Reader, *dataShards)

	for i := range data {
		out[i].Close()
		f, err := os.Open(out[i].Name())
		checkErr(err)
		input[i] = f
		defer f.Close()
	}

	// Create parity output writers
	parity := make([]io.Writer, *parShards)
	for i := range parity {
		parity[i] = out[*dataShards+i]
		defer out[*dataShards+i].Close()
	}

	// Encode parity
	err = enc.Encode(input, parity)
	checkErr(err)
	fmt.Printf("File split into %d data + %d parity shards.\n", *dataShards, *parShards)
}

func checkErr(err error) {
	if err != nil {
		fmt.Fprintf(os.Stderr, "Error: %s", err.Error())
		os.Exit(2)
	}
}
reedsolomon-1.9.13/examples_test.go000066400000000000000000000106561406411035300173430ustar00rootroot00000000000000
package reedsolomon_test

import (
	"bytes"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"math/rand"

	"github.com/klauspost/reedsolomon"
)

func fillRandom(p []byte) {
	for i := 0; i < len(p); i += 7 {
		val := rand.Int63()
		for j := 0; i+j < len(p) && j < 7; j++ {
			p[i+j] = byte(val)
			val >>= 8
		}
	}
}

// Simple example of how to use all functions of the Encoder.
// Note that all error checks have been removed to keep it short.
func ExampleEncoder() {
	// Create some sample data
	var data = make([]byte, 250000)
	fillRandom(data)

	// Create an encoder with 17 data and 3 parity slices.
	enc, _ := reedsolomon.New(17, 3)

	// Split the data into shards
	shards, _ := enc.Split(data)

	// Encode the parity set
	_ = enc.Encode(shards)

	// Verify the parity set
	ok, _ := enc.Verify(shards)
	if ok {
		fmt.Println("ok")
	}

	// Delete two shards
	shards[10], shards[11] = nil, nil

	// Reconstruct the shards
	_ = enc.Reconstruct(shards)

	// Verify the data set
	ok, _ = enc.Verify(shards)
	if ok {
		fmt.Println("ok")
	}
	// Output: ok
	// ok
}

// This demonstrates that shards can be arbitrarily sliced and
// merged and still remain valid.
func ExampleEncoder_slicing() {
	// Create some sample data
	var data = make([]byte, 250000)
	fillRandom(data)

	// Create 5 data slices of 50000 elements each
	enc, _ := reedsolomon.New(5, 3)
	shards, _ := enc.Split(data)
	err := enc.Encode(shards)
	if err != nil {
		panic(err)
	}

	// Check that it verifies
	ok, err := enc.Verify(shards)
	if ok && err == nil {
		fmt.Println("encode ok")
	}

	// Split the data set of 50000 elements into two of 25000
	splitA := make([][]byte, 8)
	splitB := make([][]byte, 8)

	// Merge into a 100000 element set
	merged := make([][]byte, 8)

	// Split/merge the shards
	for i := range shards {
		splitA[i] = shards[i][:25000]
		splitB[i] = shards[i][25000:]

		// Concatenate it to itself
		merged[i] = append(make([]byte, 0, len(shards[i])*2), shards[i]...)
		merged[i] = append(merged[i], shards[i]...)
	}

	// Each part should still verify as ok.
	ok, err = enc.Verify(splitA)
	if ok && err == nil {
		fmt.Println("splitA ok")
	}

	ok, err = enc.Verify(splitB)
	if ok && err == nil {
		fmt.Println("splitB ok")
	}

	ok, err = enc.Verify(merged)
	if ok && err == nil {
		fmt.Println("merge ok")
	}
	// Output: encode ok
	// splitA ok
	// splitB ok
	// merge ok
}

// This demonstrates that shards can be xor'ed and
// still remain a valid set.
//
// The xor value must be the same for element 'n' in each shard,
// except if you xor with a similar sized encoded shard set.
func ExampleEncoder_xor() {
	// Create some sample data
	var data = make([]byte, 25000)
	fillRandom(data)

	// Create 5 data slices of 5000 elements each
	enc, _ := reedsolomon.New(5, 3)
	shards, _ := enc.Split(data)
	err := enc.Encode(shards)
	if err != nil {
		panic(err)
	}

	// Check that it verifies
	ok, err := enc.Verify(shards)
	if !ok || err != nil {
		fmt.Println("failed initial verify", err)
	}

	// Create an xor'ed set
	xored := make([][]byte, 8)

	// We xor by the index, so you can see that the xor can change,
	// It should however be constant vertically through your slices.
	for i := range shards {
		xored[i] = make([]byte, len(shards[i]))
		for j := range xored[i] {
			xored[i][j] = shards[i][j] ^ byte(j&0xff)
		}
	}

	// Each part should still verify as ok.
	ok, err = enc.Verify(xored)
	if ok && err == nil {
		fmt.Println("verified ok after xor")
	}
	// Output: verified ok after xor
}

// This will show a simple stream encoder where we encode from
// a []io.Reader which contains a reader for each shard.
//
// Input and output can be exchanged with files, network streams
// or what may suit your needs.
func ExampleStreamEncoder() {
	dataShards := 5
	parityShards := 2

	// Create a StreamEncoder with the number of data and
	// parity shards.
	rs, err := reedsolomon.NewStream(dataShards, parityShards)
	if err != nil {
		log.Fatal(err)
	}

	shardSize := 50000

	// Create input data shards.
	input := make([][]byte, dataShards)
	for s := range input {
		input[s] = make([]byte, shardSize)
		fillRandom(input[s])
	}

	// Convert our buffers to io.Readers
	readers := make([]io.Reader, dataShards)
	for i := range readers {
		readers[i] = io.Reader(bytes.NewBuffer(input[i]))
	}

	// Create our output io.Writers
	out := make([]io.Writer, parityShards)
	for i := range out {
		out[i] = ioutil.Discard
	}

	// Encode from input to output.
	err = rs.Encode(readers, out)
	if err != nil {
		log.Fatal(err)
	}

	fmt.Println("ok")
	// OUTPUT: ok
}
reedsolomon-1.9.13/galois.go000066400000000000000000015526011406411035300157470ustar00rootroot00000000000000
/**
 * 8-bit Galois Field
 * Copyright 2015, Klaus Post
 * Copyright 2015, Backblaze, Inc. All rights reserved.
 */

package reedsolomon

const (
	// The number of elements in the field.
	fieldSize = 256

	// The polynomial used to generate the logarithm table.
	//
	// There are a number of polynomials that work to generate
	// a Galois field of 256 elements. The choice is arbitrary,
	// and we just use the first one.
	//
	// The possibilities are: 29, 43, 45, 77, 95, 99, 101, 105,
	// 113, 135, 141, 169, 195, 207, 231, and 245.
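	// 29 is 0x1d: the low byte of the full field polynomial
	// x^8 + x^4 + x^3 + x^2 + 1 (0x11d).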
generatingPolynomial = 29 ) var logTable = [fieldSize]byte{ 0, 0, 1, 25, 2, 50, 26, 198, 3, 223, 51, 238, 27, 104, 199, 75, 4, 100, 224, 14, 52, 141, 239, 129, 28, 193, 105, 248, 200, 8, 76, 113, 5, 138, 101, 47, 225, 36, 15, 33, 53, 147, 142, 218, 240, 18, 130, 69, 29, 181, 194, 125, 106, 39, 249, 185, 201, 154, 9, 120, 77, 228, 114, 166, 6, 191, 139, 98, 102, 221, 48, 253, 226, 152, 37, 179, 16, 145, 34, 136, 54, 208, 148, 206, 143, 150, 219, 189, 241, 210, 19, 92, 131, 56, 70, 64, 30, 66, 182, 163, 195, 72, 126, 110, 107, 58, 40, 84, 250, 133, 186, 61, 202, 94, 155, 159, 10, 21, 121, 43, 78, 212, 229, 172, 115, 243, 167, 87, 7, 112, 192, 247, 140, 128, 99, 13, 103, 74, 222, 237, 49, 197, 254, 24, 227, 165, 153, 119, 38, 184, 180, 124, 17, 68, 146, 217, 35, 32, 137, 46, 55, 63, 209, 91, 149, 188, 207, 205, 144, 135, 151, 178, 220, 252, 190, 97, 242, 86, 211, 171, 20, 42, 93, 158, 132, 60, 57, 83, 71, 109, 65, 162, 31, 45, 67, 216, 183, 123, 164, 118, 196, 23, 73, 236, 127, 12, 111, 246, 108, 161, 59, 82, 41, 157, 85, 170, 251, 96, 134, 177, 187, 204, 62, 90, 203, 89, 95, 176, 156, 169, 160, 81, 11, 245, 22, 235, 122, 117, 44, 215, 79, 174, 213, 233, 230, 231, 173, 232, 116, 214, 244, 234, 168, 80, 88, 175, } /** * Inverse of the logarithm table. Maps integer logarithms * to members of the field. There is no entry for 255 * because the highest log is 254. * * This table was generated by `go run gentables.go` */ var expTable = []byte{0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 0x8f, 0x3, 0x6, 0xc, 0x18, 0x30, 0x60, 0xc0, 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 0x46, 0x8c, 0x5, 0xa, 0x14, 0x28, 0x50, 0xa0, 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 0x65, 0xca, 0x89, 0xf, 0x1e, 0x3c, 0x78, 0xf0, 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 0xd, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x7, 0xe, 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x9, 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0xb, 0x16, 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 0x8f, 0x3, 0x6, 0xc, 0x18, 0x30, 0x60, 0xc0, 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 0x46, 0x8c, 0x5, 0xa, 0x14, 0x28, 0x50, 0xa0, 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 0x65, 0xca, 0x89, 0xf, 0x1e, 0x3c, 0x78, 0xf0, 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 0xd9, 0xaf, 
0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 0xd, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x7, 0xe, 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x9, 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0xb, 0x16, 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e} func galAdd(a, b byte) byte { return a ^ b } func galSub(a, b byte) byte { return a ^ b } // Table from https://github.com/templexxx/reedsolomon var invTable = [256]byte{0x0, 0x1, 0x8e, 0xf4, 0x47, 0xa7, 0x7a, 0xba, 0xad, 0x9d, 0xdd, 0x98, 0x3d, 0xaa, 0x5d, 0x96, 0xd8, 0x72, 0xc0, 0x58, 0xe0, 0x3e, 0x4c, 0x66, 0x90, 0xde, 0x55, 0x80, 0xa0, 0x83, 0x4b, 0x2a, 0x6c, 0xed, 0x39, 0x51, 0x60, 0x56, 0x2c, 0x8a, 0x70, 0xd0, 0x1f, 0x4a, 0x26, 0x8b, 0x33, 0x6e, 0x48, 0x89, 0x6f, 0x2e, 0xa4, 0xc3, 0x40, 0x5e, 0x50, 0x22, 0xcf, 0xa9, 0xab, 0xc, 0x15, 0xe1, 0x36, 0x5f, 0xf8, 0xd5, 0x92, 0x4e, 0xa6, 0x4, 0x30, 0x88, 0x2b, 0x1e, 0x16, 0x67, 0x45, 0x93, 0x38, 0x23, 0x68, 0x8c, 0x81, 0x1a, 0x25, 0x61, 0x13, 0xc1, 0xcb, 0x63, 0x97, 0xe, 0x37, 0x41, 0x24, 0x57, 0xca, 0x5b, 0xb9, 0xc4, 0x17, 0x4d, 0x52, 0x8d, 0xef, 0xb3, 0x20, 0xec, 0x2f, 0x32, 0x28, 0xd1, 0x11, 0xd9, 0xe9, 0xfb, 0xda, 0x79, 0xdb, 0x77, 0x6, 0xbb, 0x84, 0xcd, 0xfe, 0xfc, 0x1b, 0x54, 0xa1, 0x1d, 0x7c, 0xcc, 0xe4, 0xb0, 0x49, 0x31, 0x27, 0x2d, 0x53, 0x69, 0x2, 0xf5, 0x18, 0xdf, 0x44, 0x4f, 0x9b, 0xbc, 0xf, 0x5c, 0xb, 0xdc, 0xbd, 0x94, 0xac, 0x9, 0xc7, 0xa2, 0x1c, 0x82, 0x9f, 0xc6, 0x34, 0xc2, 0x46, 0x5, 0xce, 0x3b, 0xd, 0x3c, 0x9c, 0x8, 0xbe, 0xb7, 0x87, 0xe5, 0xee, 0x6b, 0xeb, 0xf2, 0xbf, 0xaf, 0xc5, 0x64, 0x7, 0x7b, 0x95, 0x9a, 0xae, 0xb6, 0x12, 0x59, 0xa5, 0x35, 0x65, 0xb8, 0xa3, 0x9e, 0xd2, 0xf7, 0x62, 0x5a, 0x85, 0x7d, 0xa8, 0x3a, 0x29, 0x71, 0xc8, 0xf6, 0xf9, 0x43, 0xd7, 0xd6, 0x10, 0x73, 0x76, 0x78, 0x99, 0xa, 0x19, 0x91, 0x14, 0x3f, 0xe6, 0xf0, 0x86, 0xb1, 0xe2, 0xf1, 0xfa, 0x74, 0xf3, 0xb4, 0x6d, 0x21, 0xb2, 0x6a, 0xe3, 0xe7, 0xb5, 0xea, 0x3, 0x8f, 0xd3, 0xc9, 0x42, 0xd4, 0xe8, 0x75, 0x7f, 0xff, 0x7e, 0xfd} var mulTable = [256][256]uint8{[256]uint8{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff}, {0x0, 0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e, 0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, 0x50, 0x52, 0x54, 0x56, 0x58, 0x5a, 0x5c, 0x5e, 0x60, 0x62, 0x64, 0x66, 0x68, 0x6a, 0x6c, 0x6e, 0x70, 0x72, 0x74, 0x76, 0x78, 0x7a, 0x7c, 0x7e, 0x80, 0x82, 0x84, 0x86, 0x88, 0x8a, 0x8c, 0x8e, 0x90, 0x92, 0x94, 0x96, 0x98, 0x9a, 0x9c, 0x9e, 0xa0, 0xa2, 0xa4, 0xa6, 0xa8, 0xaa, 0xac, 0xae, 0xb0, 0xb2, 0xb4, 0xb6, 0xb8, 0xba, 0xbc, 0xbe, 0xc0, 0xc2, 0xc4, 0xc6, 0xc8, 0xca, 0xcc, 0xce, 0xd0, 0xd2, 0xd4, 0xd6, 0xd8, 0xda, 0xdc, 0xde, 0xe0, 0xe2, 0xe4, 0xe6, 0xe8, 0xea, 0xec, 0xee, 0xf0, 0xf2, 0xf4, 0xf6, 0xf8, 0xfa, 0xfc, 0xfe, 0x1d, 0x1f, 0x19, 0x1b, 0x15, 0x17, 0x11, 0x13, 0xd, 0xf, 0x9, 0xb, 0x5, 0x7, 0x1, 0x3, 0x3d, 0x3f, 0x39, 0x3b, 0x35, 0x37, 0x31, 0x33, 0x2d, 0x2f, 0x29, 0x2b, 0x25, 0x27, 0x21, 0x23, 0x5d, 0x5f, 0x59, 0x5b, 0x55, 0x57, 0x51, 0x53, 0x4d, 0x4f, 0x49, 0x4b, 0x45, 0x47, 0x41, 0x43, 0x7d, 0x7f, 0x79, 0x7b, 0x75, 0x77, 0x71, 0x73, 0x6d, 0x6f, 0x69, 0x6b, 0x65, 0x67, 0x61, 0x63, 0x9d, 0x9f, 0x99, 0x9b, 0x95, 0x97, 0x91, 0x93, 0x8d, 0x8f, 0x89, 0x8b, 0x85, 0x87, 0x81, 0x83, 0xbd, 0xbf, 0xb9, 0xbb, 0xb5, 0xb7, 0xb1, 0xb3, 0xad, 0xaf, 0xa9, 0xab, 0xa5, 0xa7, 0xa1, 0xa3, 0xdd, 0xdf, 0xd9, 0xdb, 0xd5, 0xd7, 0xd1, 0xd3, 0xcd, 0xcf, 0xc9, 0xcb, 0xc5, 0xc7, 0xc1, 0xc3, 0xfd, 0xff, 0xf9, 0xfb, 0xf5, 0xf7, 0xf1, 0xf3, 0xed, 0xef, 0xe9, 0xeb, 0xe5, 0xe7, 0xe1, 0xe3}, {0x0, 0x3, 0x6, 0x5, 0xc, 0xf, 0xa, 0x9, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11, 0x30, 0x33, 0x36, 0x35, 
0x3c, 0x3f, 0x3a, 0x39, 0x28, 0x2b, 0x2e, 0x2d, 0x24, 0x27, 0x22, 0x21, 0x60, 0x63, 0x66, 0x65, 0x6c, 0x6f, 0x6a, 0x69, 0x78, 0x7b, 0x7e, 0x7d, 0x74, 0x77, 0x72, 0x71, 0x50, 0x53, 0x56, 0x55, 0x5c, 0x5f, 0x5a, 0x59, 0x48, 0x4b, 0x4e, 0x4d, 0x44, 0x47, 0x42, 0x41, 0xc0, 0xc3, 0xc6, 0xc5, 0xcc, 0xcf, 0xca, 0xc9, 0xd8, 0xdb, 0xde, 0xdd, 0xd4, 0xd7, 0xd2, 0xd1, 0xf0, 0xf3, 0xf6, 0xf5, 0xfc, 0xff, 0xfa, 0xf9, 0xe8, 0xeb, 0xee, 0xed, 0xe4, 0xe7, 0xe2, 0xe1, 0xa0, 0xa3, 0xa6, 0xa5, 0xac, 0xaf, 0xaa, 0xa9, 0xb8, 0xbb, 0xbe, 0xbd, 0xb4, 0xb7, 0xb2, 0xb1, 0x90, 0x93, 0x96, 0x95, 0x9c, 0x9f, 0x9a, 0x99, 0x88, 0x8b, 0x8e, 0x8d, 0x84, 0x87, 0x82, 0x81, 0x9d, 0x9e, 0x9b, 0x98, 0x91, 0x92, 0x97, 0x94, 0x85, 0x86, 0x83, 0x80, 0x89, 0x8a, 0x8f, 0x8c, 0xad, 0xae, 0xab, 0xa8, 0xa1, 0xa2, 0xa7, 0xa4, 0xb5, 0xb6, 0xb3, 0xb0, 0xb9, 0xba, 0xbf, 0xbc, 0xfd, 0xfe, 0xfb, 0xf8, 0xf1, 0xf2, 0xf7, 0xf4, 0xe5, 0xe6, 0xe3, 0xe0, 0xe9, 0xea, 0xef, 0xec, 0xcd, 0xce, 0xcb, 0xc8, 0xc1, 0xc2, 0xc7, 0xc4, 0xd5, 0xd6, 0xd3, 0xd0, 0xd9, 0xda, 0xdf, 0xdc, 0x5d, 0x5e, 0x5b, 0x58, 0x51, 0x52, 0x57, 0x54, 0x45, 0x46, 0x43, 0x40, 0x49, 0x4a, 0x4f, 0x4c, 0x6d, 0x6e, 0x6b, 0x68, 0x61, 0x62, 0x67, 0x64, 0x75, 0x76, 0x73, 0x70, 0x79, 0x7a, 0x7f, 0x7c, 0x3d, 0x3e, 0x3b, 0x38, 0x31, 0x32, 0x37, 0x34, 0x25, 0x26, 0x23, 0x20, 0x29, 0x2a, 0x2f, 0x2c, 0xd, 0xe, 0xb, 0x8, 0x1, 0x2, 0x7, 0x4, 0x15, 0x16, 0x13, 0x10, 0x19, 0x1a, 0x1f, 0x1c}, {0x0, 0x4, 0x8, 0xc, 0x10, 0x14, 0x18, 0x1c, 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c, 0x40, 0x44, 0x48, 0x4c, 0x50, 0x54, 0x58, 0x5c, 0x60, 0x64, 0x68, 0x6c, 0x70, 0x74, 0x78, 0x7c, 0x80, 0x84, 0x88, 0x8c, 0x90, 0x94, 0x98, 0x9c, 0xa0, 0xa4, 0xa8, 0xac, 0xb0, 0xb4, 0xb8, 0xbc, 0xc0, 0xc4, 0xc8, 0xcc, 0xd0, 0xd4, 0xd8, 0xdc, 0xe0, 0xe4, 0xe8, 0xec, 0xf0, 0xf4, 0xf8, 0xfc, 0x1d, 0x19, 0x15, 0x11, 0xd, 0x9, 0x5, 0x1, 0x3d, 0x39, 0x35, 0x31, 0x2d, 0x29, 0x25, 0x21, 0x5d, 0x59, 0x55, 0x51, 0x4d, 0x49, 0x45, 0x41, 0x7d, 0x79, 0x75, 0x71, 0x6d, 0x69, 0x65, 0x61, 0x9d, 0x99, 0x95, 0x91, 0x8d, 0x89, 0x85, 0x81, 0xbd, 0xb9, 0xb5, 0xb1, 0xad, 0xa9, 0xa5, 0xa1, 0xdd, 0xd9, 0xd5, 0xd1, 0xcd, 0xc9, 0xc5, 0xc1, 0xfd, 0xf9, 0xf5, 0xf1, 0xed, 0xe9, 0xe5, 0xe1, 0x3a, 0x3e, 0x32, 0x36, 0x2a, 0x2e, 0x22, 0x26, 0x1a, 0x1e, 0x12, 0x16, 0xa, 0xe, 0x2, 0x6, 0x7a, 0x7e, 0x72, 0x76, 0x6a, 0x6e, 0x62, 0x66, 0x5a, 0x5e, 0x52, 0x56, 0x4a, 0x4e, 0x42, 0x46, 0xba, 0xbe, 0xb2, 0xb6, 0xaa, 0xae, 0xa2, 0xa6, 0x9a, 0x9e, 0x92, 0x96, 0x8a, 0x8e, 0x82, 0x86, 0xfa, 0xfe, 0xf2, 0xf6, 0xea, 0xee, 0xe2, 0xe6, 0xda, 0xde, 0xd2, 0xd6, 0xca, 0xce, 0xc2, 0xc6, 0x27, 0x23, 0x2f, 0x2b, 0x37, 0x33, 0x3f, 0x3b, 0x7, 0x3, 0xf, 0xb, 0x17, 0x13, 0x1f, 0x1b, 0x67, 0x63, 0x6f, 0x6b, 0x77, 0x73, 0x7f, 0x7b, 0x47, 0x43, 0x4f, 0x4b, 0x57, 0x53, 0x5f, 0x5b, 0xa7, 0xa3, 0xaf, 0xab, 0xb7, 0xb3, 0xbf, 0xbb, 0x87, 0x83, 0x8f, 0x8b, 0x97, 0x93, 0x9f, 0x9b, 0xe7, 0xe3, 0xef, 0xeb, 0xf7, 0xf3, 0xff, 0xfb, 0xc7, 0xc3, 0xcf, 0xcb, 0xd7, 0xd3, 0xdf, 0xdb}, {0x0, 0x5, 0xa, 0xf, 0x14, 0x11, 0x1e, 0x1b, 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33, 0x50, 0x55, 0x5a, 0x5f, 0x44, 0x41, 0x4e, 0x4b, 0x78, 0x7d, 0x72, 0x77, 0x6c, 0x69, 0x66, 0x63, 0xa0, 0xa5, 0xaa, 0xaf, 0xb4, 0xb1, 0xbe, 0xbb, 0x88, 0x8d, 0x82, 0x87, 0x9c, 0x99, 0x96, 0x93, 0xf0, 0xf5, 0xfa, 0xff, 0xe4, 0xe1, 0xee, 0xeb, 0xd8, 0xdd, 0xd2, 0xd7, 0xcc, 0xc9, 0xc6, 0xc3, 0x5d, 0x58, 0x57, 0x52, 0x49, 0x4c, 0x43, 0x46, 0x75, 0x70, 0x7f, 0x7a, 0x61, 0x64, 0x6b, 0x6e, 0xd, 0x8, 0x7, 0x2, 0x19, 0x1c, 0x13, 0x16, 0x25, 0x20, 0x2f, 0x2a, 0x31, 0x34, 0x3b, 0x3e, 0xfd, 0xf8, 0xf7, 0xf2, 0xe9, 0xec, 0xe3, 0xe6, 0xd5, 
0xd0, 0xdf, 0xda, 0xc1, 0xc4, 0xcb, 0xce, 0xad, 0xa8, 0xa7, 0xa2, 0xb9, 0xbc, 0xb3, 0xb6, 0x85, 0x80, 0x8f, 0x8a, 0x91, 0x94, 0x9b, 0x9e, 0xba, 0xbf, 0xb0, 0xb5, 0xae, 0xab, 0xa4, 0xa1, 0x92, 0x97, 0x98, 0x9d, 0x86, 0x83, 0x8c, 0x89, 0xea, 0xef, 0xe0, 0xe5, 0xfe, 0xfb, 0xf4, 0xf1, 0xc2, 0xc7, 0xc8, 0xcd, 0xd6, 0xd3, 0xdc, 0xd9, 0x1a, 0x1f, 0x10, 0x15, 0xe, 0xb, 0x4, 0x1, 0x32, 0x37, 0x38, 0x3d, 0x26, 0x23, 0x2c, 0x29, 0x4a, 0x4f, 0x40, 0x45, 0x5e, 0x5b, 0x54, 0x51, 0x62, 0x67, 0x68, 0x6d, 0x76, 0x73, 0x7c, 0x79, 0xe7, 0xe2, 0xed, 0xe8, 0xf3, 0xf6, 0xf9, 0xfc, 0xcf, 0xca, 0xc5, 0xc0, 0xdb, 0xde, 0xd1, 0xd4, 0xb7, 0xb2, 0xbd, 0xb8, 0xa3, 0xa6, 0xa9, 0xac, 0x9f, 0x9a, 0x95, 0x90, 0x8b, 0x8e, 0x81, 0x84, 0x47, 0x42, 0x4d, 0x48, 0x53, 0x56, 0x59, 0x5c, 0x6f, 0x6a, 0x65, 0x60, 0x7b, 0x7e, 0x71, 0x74, 0x17, 0x12, 0x1d, 0x18, 0x3, 0x6, 0x9, 0xc, 0x3f, 0x3a, 0x35, 0x30, 0x2b, 0x2e, 0x21, 0x24}, {0x0, 0x6, 0xc, 0xa, 0x18, 0x1e, 0x14, 0x12, 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22, 0x60, 0x66, 0x6c, 0x6a, 0x78, 0x7e, 0x74, 0x72, 0x50, 0x56, 0x5c, 0x5a, 0x48, 0x4e, 0x44, 0x42, 0xc0, 0xc6, 0xcc, 0xca, 0xd8, 0xde, 0xd4, 0xd2, 0xf0, 0xf6, 0xfc, 0xfa, 0xe8, 0xee, 0xe4, 0xe2, 0xa0, 0xa6, 0xac, 0xaa, 0xb8, 0xbe, 0xb4, 0xb2, 0x90, 0x96, 0x9c, 0x9a, 0x88, 0x8e, 0x84, 0x82, 0x9d, 0x9b, 0x91, 0x97, 0x85, 0x83, 0x89, 0x8f, 0xad, 0xab, 0xa1, 0xa7, 0xb5, 0xb3, 0xb9, 0xbf, 0xfd, 0xfb, 0xf1, 0xf7, 0xe5, 0xe3, 0xe9, 0xef, 0xcd, 0xcb, 0xc1, 0xc7, 0xd5, 0xd3, 0xd9, 0xdf, 0x5d, 0x5b, 0x51, 0x57, 0x45, 0x43, 0x49, 0x4f, 0x6d, 0x6b, 0x61, 0x67, 0x75, 0x73, 0x79, 0x7f, 0x3d, 0x3b, 0x31, 0x37, 0x25, 0x23, 0x29, 0x2f, 0xd, 0xb, 0x1, 0x7, 0x15, 0x13, 0x19, 0x1f, 0x27, 0x21, 0x2b, 0x2d, 0x3f, 0x39, 0x33, 0x35, 0x17, 0x11, 0x1b, 0x1d, 0xf, 0x9, 0x3, 0x5, 0x47, 0x41, 0x4b, 0x4d, 0x5f, 0x59, 0x53, 0x55, 0x77, 0x71, 0x7b, 0x7d, 0x6f, 0x69, 0x63, 0x65, 0xe7, 0xe1, 0xeb, 0xed, 0xff, 0xf9, 0xf3, 0xf5, 0xd7, 0xd1, 0xdb, 0xdd, 0xcf, 0xc9, 0xc3, 0xc5, 0x87, 0x81, 0x8b, 0x8d, 0x9f, 0x99, 0x93, 0x95, 0xb7, 0xb1, 0xbb, 0xbd, 0xaf, 0xa9, 0xa3, 0xa5, 0xba, 0xbc, 0xb6, 0xb0, 0xa2, 0xa4, 0xae, 0xa8, 0x8a, 0x8c, 0x86, 0x80, 0x92, 0x94, 0x9e, 0x98, 0xda, 0xdc, 0xd6, 0xd0, 0xc2, 0xc4, 0xce, 0xc8, 0xea, 0xec, 0xe6, 0xe0, 0xf2, 0xf4, 0xfe, 0xf8, 0x7a, 0x7c, 0x76, 0x70, 0x62, 0x64, 0x6e, 0x68, 0x4a, 0x4c, 0x46, 0x40, 0x52, 0x54, 0x5e, 0x58, 0x1a, 0x1c, 0x16, 0x10, 0x2, 0x4, 0xe, 0x8, 0x2a, 0x2c, 0x26, 0x20, 0x32, 0x34, 0x3e, 0x38}, {0x0, 0x7, 0xe, 0x9, 0x1c, 0x1b, 0x12, 0x15, 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d, 0x70, 0x77, 0x7e, 0x79, 0x6c, 0x6b, 0x62, 0x65, 0x48, 0x4f, 0x46, 0x41, 0x54, 0x53, 0x5a, 0x5d, 0xe0, 0xe7, 0xee, 0xe9, 0xfc, 0xfb, 0xf2, 0xf5, 0xd8, 0xdf, 0xd6, 0xd1, 0xc4, 0xc3, 0xca, 0xcd, 0x90, 0x97, 0x9e, 0x99, 0x8c, 0x8b, 0x82, 0x85, 0xa8, 0xaf, 0xa6, 0xa1, 0xb4, 0xb3, 0xba, 0xbd, 0xdd, 0xda, 0xd3, 0xd4, 0xc1, 0xc6, 0xcf, 0xc8, 0xe5, 0xe2, 0xeb, 0xec, 0xf9, 0xfe, 0xf7, 0xf0, 0xad, 0xaa, 0xa3, 0xa4, 0xb1, 0xb6, 0xbf, 0xb8, 0x95, 0x92, 0x9b, 0x9c, 0x89, 0x8e, 0x87, 0x80, 0x3d, 0x3a, 0x33, 0x34, 0x21, 0x26, 0x2f, 0x28, 0x5, 0x2, 0xb, 0xc, 0x19, 0x1e, 0x17, 0x10, 0x4d, 0x4a, 0x43, 0x44, 0x51, 0x56, 0x5f, 0x58, 0x75, 0x72, 0x7b, 0x7c, 0x69, 0x6e, 0x67, 0x60, 0xa7, 0xa0, 0xa9, 0xae, 0xbb, 0xbc, 0xb5, 0xb2, 0x9f, 0x98, 0x91, 0x96, 0x83, 0x84, 0x8d, 0x8a, 0xd7, 0xd0, 0xd9, 0xde, 0xcb, 0xcc, 0xc5, 0xc2, 0xef, 0xe8, 0xe1, 0xe6, 0xf3, 0xf4, 0xfd, 0xfa, 0x47, 0x40, 0x49, 0x4e, 0x5b, 0x5c, 0x55, 0x52, 0x7f, 0x78, 0x71, 0x76, 0x63, 0x64, 0x6d, 0x6a, 0x37, 0x30, 0x39, 0x3e, 0x2b, 0x2c, 0x25, 0x22, 0xf, 0x8, 0x1, 0x6, 0x13, 0x14, 
0x1d, 0x1a, 0x7a, 0x7d, 0x74, 0x73, 0x66, 0x61, 0x68, 0x6f, 0x42, 0x45, 0x4c, 0x4b, 0x5e, 0x59, 0x50, 0x57, 0xa, 0xd, 0x4, 0x3, 0x16, 0x11, 0x18, 0x1f, 0x32, 0x35, 0x3c, 0x3b, 0x2e, 0x29, 0x20, 0x27, 0x9a, 0x9d, 0x94, 0x93, 0x86, 0x81, 0x88, 0x8f, 0xa2, 0xa5, 0xac, 0xab, 0xbe, 0xb9, 0xb0, 0xb7, 0xea, 0xed, 0xe4, 0xe3, 0xf6, 0xf1, 0xf8, 0xff, 0xd2, 0xd5, 0xdc, 0xdb, 0xce, 0xc9, 0xc0, 0xc7}, {0x0, 0x8, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38, 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78, 0x80, 0x88, 0x90, 0x98, 0xa0, 0xa8, 0xb0, 0xb8, 0xc0, 0xc8, 0xd0, 0xd8, 0xe0, 0xe8, 0xf0, 0xf8, 0x1d, 0x15, 0xd, 0x5, 0x3d, 0x35, 0x2d, 0x25, 0x5d, 0x55, 0x4d, 0x45, 0x7d, 0x75, 0x6d, 0x65, 0x9d, 0x95, 0x8d, 0x85, 0xbd, 0xb5, 0xad, 0xa5, 0xdd, 0xd5, 0xcd, 0xc5, 0xfd, 0xf5, 0xed, 0xe5, 0x3a, 0x32, 0x2a, 0x22, 0x1a, 0x12, 0xa, 0x2, 0x7a, 0x72, 0x6a, 0x62, 0x5a, 0x52, 0x4a, 0x42, 0xba, 0xb2, 0xaa, 0xa2, 0x9a, 0x92, 0x8a, 0x82, 0xfa, 0xf2, 0xea, 0xe2, 0xda, 0xd2, 0xca, 0xc2, 0x27, 0x2f, 0x37, 0x3f, 0x7, 0xf, 0x17, 0x1f, 0x67, 0x6f, 0x77, 0x7f, 0x47, 0x4f, 0x57, 0x5f, 0xa7, 0xaf, 0xb7, 0xbf, 0x87, 0x8f, 0x97, 0x9f, 0xe7, 0xef, 0xf7, 0xff, 0xc7, 0xcf, 0xd7, 0xdf, 0x74, 0x7c, 0x64, 0x6c, 0x54, 0x5c, 0x44, 0x4c, 0x34, 0x3c, 0x24, 0x2c, 0x14, 0x1c, 0x4, 0xc, 0xf4, 0xfc, 0xe4, 0xec, 0xd4, 0xdc, 0xc4, 0xcc, 0xb4, 0xbc, 0xa4, 0xac, 0x94, 0x9c, 0x84, 0x8c, 0x69, 0x61, 0x79, 0x71, 0x49, 0x41, 0x59, 0x51, 0x29, 0x21, 0x39, 0x31, 0x9, 0x1, 0x19, 0x11, 0xe9, 0xe1, 0xf9, 0xf1, 0xc9, 0xc1, 0xd9, 0xd1, 0xa9, 0xa1, 0xb9, 0xb1, 0x89, 0x81, 0x99, 0x91, 0x4e, 0x46, 0x5e, 0x56, 0x6e, 0x66, 0x7e, 0x76, 0xe, 0x6, 0x1e, 0x16, 0x2e, 0x26, 0x3e, 0x36, 0xce, 0xc6, 0xde, 0xd6, 0xee, 0xe6, 0xfe, 0xf6, 0x8e, 0x86, 0x9e, 0x96, 0xae, 0xa6, 0xbe, 0xb6, 0x53, 0x5b, 0x43, 0x4b, 0x73, 0x7b, 0x63, 0x6b, 0x13, 0x1b, 0x3, 0xb, 0x33, 0x3b, 0x23, 0x2b, 0xd3, 0xdb, 0xc3, 0xcb, 0xf3, 0xfb, 0xe3, 0xeb, 0x93, 0x9b, 0x83, 0x8b, 0xb3, 0xbb, 0xa3, 0xab}, {0x0, 0x9, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77, 0x90, 0x99, 0x82, 0x8b, 0xb4, 0xbd, 0xa6, 0xaf, 0xd8, 0xd1, 0xca, 0xc3, 0xfc, 0xf5, 0xee, 0xe7, 0x3d, 0x34, 0x2f, 0x26, 0x19, 0x10, 0xb, 0x2, 0x75, 0x7c, 0x67, 0x6e, 0x51, 0x58, 0x43, 0x4a, 0xad, 0xa4, 0xbf, 0xb6, 0x89, 0x80, 0x9b, 0x92, 0xe5, 0xec, 0xf7, 0xfe, 0xc1, 0xc8, 0xd3, 0xda, 0x7a, 0x73, 0x68, 0x61, 0x5e, 0x57, 0x4c, 0x45, 0x32, 0x3b, 0x20, 0x29, 0x16, 0x1f, 0x4, 0xd, 0xea, 0xe3, 0xf8, 0xf1, 0xce, 0xc7, 0xdc, 0xd5, 0xa2, 0xab, 0xb0, 0xb9, 0x86, 0x8f, 0x94, 0x9d, 0x47, 0x4e, 0x55, 0x5c, 0x63, 0x6a, 0x71, 0x78, 0xf, 0x6, 0x1d, 0x14, 0x2b, 0x22, 0x39, 0x30, 0xd7, 0xde, 0xc5, 0xcc, 0xf3, 0xfa, 0xe1, 0xe8, 0x9f, 0x96, 0x8d, 0x84, 0xbb, 0xb2, 0xa9, 0xa0, 0xf4, 0xfd, 0xe6, 0xef, 0xd0, 0xd9, 0xc2, 0xcb, 0xbc, 0xb5, 0xae, 0xa7, 0x98, 0x91, 0x8a, 0x83, 0x64, 0x6d, 0x76, 0x7f, 0x40, 0x49, 0x52, 0x5b, 0x2c, 0x25, 0x3e, 0x37, 0x8, 0x1, 0x1a, 0x13, 0xc9, 0xc0, 0xdb, 0xd2, 0xed, 0xe4, 0xff, 0xf6, 0x81, 0x88, 0x93, 0x9a, 0xa5, 0xac, 0xb7, 0xbe, 0x59, 0x50, 0x4b, 0x42, 0x7d, 0x74, 0x6f, 0x66, 0x11, 0x18, 0x3, 0xa, 0x35, 0x3c, 0x27, 0x2e, 0x8e, 0x87, 0x9c, 0x95, 0xaa, 0xa3, 0xb8, 0xb1, 0xc6, 0xcf, 0xd4, 0xdd, 0xe2, 0xeb, 0xf0, 0xf9, 0x1e, 0x17, 0xc, 0x5, 0x3a, 0x33, 0x28, 0x21, 0x56, 0x5f, 0x44, 0x4d, 0x72, 0x7b, 0x60, 0x69, 0xb3, 0xba, 0xa1, 0xa8, 0x97, 0x9e, 0x85, 0x8c, 0xfb, 0xf2, 0xe9, 0xe0, 0xdf, 0xd6, 0xcd, 0xc4, 0x23, 0x2a, 0x31, 0x38, 0x7, 0xe, 0x15, 0x1c, 0x6b, 0x62, 0x79, 0x70, 0x4f, 0x46, 0x5d, 0x54}, {0x0, 0xa, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36, 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66, 0xa0, 0xaa, 0xb4, 
0xbe, 0x88, 0x82, 0x9c, 0x96, 0xf0, 0xfa, 0xe4, 0xee, 0xd8, 0xd2, 0xcc, 0xc6, 0x5d, 0x57, 0x49, 0x43, 0x75, 0x7f, 0x61, 0x6b, 0xd, 0x7, 0x19, 0x13, 0x25, 0x2f, 0x31, 0x3b, 0xfd, 0xf7, 0xe9, 0xe3, 0xd5, 0xdf, 0xc1, 0xcb, 0xad, 0xa7, 0xb9, 0xb3, 0x85, 0x8f, 0x91, 0x9b, 0xba, 0xb0, 0xae, 0xa4, 0x92, 0x98, 0x86, 0x8c, 0xea, 0xe0, 0xfe, 0xf4, 0xc2, 0xc8, 0xd6, 0xdc, 0x1a, 0x10, 0xe, 0x4, 0x32, 0x38, 0x26, 0x2c, 0x4a, 0x40, 0x5e, 0x54, 0x62, 0x68, 0x76, 0x7c, 0xe7, 0xed, 0xf3, 0xf9, 0xcf, 0xc5, 0xdb, 0xd1, 0xb7, 0xbd, 0xa3, 0xa9, 0x9f, 0x95, 0x8b, 0x81, 0x47, 0x4d, 0x53, 0x59, 0x6f, 0x65, 0x7b, 0x71, 0x17, 0x1d, 0x3, 0x9, 0x3f, 0x35, 0x2b, 0x21, 0x69, 0x63, 0x7d, 0x77, 0x41, 0x4b, 0x55, 0x5f, 0x39, 0x33, 0x2d, 0x27, 0x11, 0x1b, 0x5, 0xf, 0xc9, 0xc3, 0xdd, 0xd7, 0xe1, 0xeb, 0xf5, 0xff, 0x99, 0x93, 0x8d, 0x87, 0xb1, 0xbb, 0xa5, 0xaf, 0x34, 0x3e, 0x20, 0x2a, 0x1c, 0x16, 0x8, 0x2, 0x64, 0x6e, 0x70, 0x7a, 0x4c, 0x46, 0x58, 0x52, 0x94, 0x9e, 0x80, 0x8a, 0xbc, 0xb6, 0xa8, 0xa2, 0xc4, 0xce, 0xd0, 0xda, 0xec, 0xe6, 0xf8, 0xf2, 0xd3, 0xd9, 0xc7, 0xcd, 0xfb, 0xf1, 0xef, 0xe5, 0x83, 0x89, 0x97, 0x9d, 0xab, 0xa1, 0xbf, 0xb5, 0x73, 0x79, 0x67, 0x6d, 0x5b, 0x51, 0x4f, 0x45, 0x23, 0x29, 0x37, 0x3d, 0xb, 0x1, 0x1f, 0x15, 0x8e, 0x84, 0x9a, 0x90, 0xa6, 0xac, 0xb2, 0xb8, 0xde, 0xd4, 0xca, 0xc0, 0xf6, 0xfc, 0xe2, 0xe8, 0x2e, 0x24, 0x3a, 0x30, 0x6, 0xc, 0x12, 0x18, 0x7e, 0x74, 0x6a, 0x60, 0x56, 0x5c, 0x42, 0x48}, {0x0, 0xb, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69, 0xb0, 0xbb, 0xa6, 0xad, 0x9c, 0x97, 0x8a, 0x81, 0xe8, 0xe3, 0xfe, 0xf5, 0xc4, 0xcf, 0xd2, 0xd9, 0x7d, 0x76, 0x6b, 0x60, 0x51, 0x5a, 0x47, 0x4c, 0x25, 0x2e, 0x33, 0x38, 0x9, 0x2, 0x1f, 0x14, 0xcd, 0xc6, 0xdb, 0xd0, 0xe1, 0xea, 0xf7, 0xfc, 0x95, 0x9e, 0x83, 0x88, 0xb9, 0xb2, 0xaf, 0xa4, 0xfa, 0xf1, 0xec, 0xe7, 0xd6, 0xdd, 0xc0, 0xcb, 0xa2, 0xa9, 0xb4, 0xbf, 0x8e, 0x85, 0x98, 0x93, 0x4a, 0x41, 0x5c, 0x57, 0x66, 0x6d, 0x70, 0x7b, 0x12, 0x19, 0x4, 0xf, 0x3e, 0x35, 0x28, 0x23, 0x87, 0x8c, 0x91, 0x9a, 0xab, 0xa0, 0xbd, 0xb6, 0xdf, 0xd4, 0xc9, 0xc2, 0xf3, 0xf8, 0xe5, 0xee, 0x37, 0x3c, 0x21, 0x2a, 0x1b, 0x10, 0xd, 0x6, 0x6f, 0x64, 0x79, 0x72, 0x43, 0x48, 0x55, 0x5e, 0xe9, 0xe2, 0xff, 0xf4, 0xc5, 0xce, 0xd3, 0xd8, 0xb1, 0xba, 0xa7, 0xac, 0x9d, 0x96, 0x8b, 0x80, 0x59, 0x52, 0x4f, 0x44, 0x75, 0x7e, 0x63, 0x68, 0x1, 0xa, 0x17, 0x1c, 0x2d, 0x26, 0x3b, 0x30, 0x94, 0x9f, 0x82, 0x89, 0xb8, 0xb3, 0xae, 0xa5, 0xcc, 0xc7, 0xda, 0xd1, 0xe0, 0xeb, 0xf6, 0xfd, 0x24, 0x2f, 0x32, 0x39, 0x8, 0x3, 0x1e, 0x15, 0x7c, 0x77, 0x6a, 0x61, 0x50, 0x5b, 0x46, 0x4d, 0x13, 0x18, 0x5, 0xe, 0x3f, 0x34, 0x29, 0x22, 0x4b, 0x40, 0x5d, 0x56, 0x67, 0x6c, 0x71, 0x7a, 0xa3, 0xa8, 0xb5, 0xbe, 0x8f, 0x84, 0x99, 0x92, 0xfb, 0xf0, 0xed, 0xe6, 0xd7, 0xdc, 0xc1, 0xca, 0x6e, 0x65, 0x78, 0x73, 0x42, 0x49, 0x54, 0x5f, 0x36, 0x3d, 0x20, 0x2b, 0x1a, 0x11, 0xc, 0x7, 0xde, 0xd5, 0xc8, 0xc3, 0xf2, 0xf9, 0xe4, 0xef, 0x86, 0x8d, 0x90, 0x9b, 0xaa, 0xa1, 0xbc, 0xb7}, {0x0, 0xc, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24, 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44, 0xc0, 0xcc, 0xd8, 0xd4, 0xf0, 0xfc, 0xe8, 0xe4, 0xa0, 0xac, 0xb8, 0xb4, 0x90, 0x9c, 0x88, 0x84, 0x9d, 0x91, 0x85, 0x89, 0xad, 0xa1, 0xb5, 0xb9, 0xfd, 0xf1, 0xe5, 0xe9, 0xcd, 0xc1, 0xd5, 0xd9, 0x5d, 0x51, 0x45, 0x49, 0x6d, 0x61, 0x75, 0x79, 0x3d, 0x31, 0x25, 0x29, 0xd, 0x1, 0x15, 0x19, 0x27, 0x2b, 0x3f, 0x33, 0x17, 0x1b, 0xf, 0x3, 0x47, 0x4b, 0x5f, 0x53, 0x77, 0x7b, 0x6f, 0x63, 0xe7, 0xeb, 0xff, 0xf3, 0xd7, 0xdb, 0xcf, 0xc3, 0x87, 0x8b, 0x9f, 0x93, 0xb7, 0xbb, 0xaf, 0xa3, 0xba, 0xb6, 0xa2, 0xae, 0x8a, 0x86, 0x92, 0x9e, 
0xda, 0xd6, 0xc2, 0xce, 0xea, 0xe6, 0xf2, 0xfe, 0x7a, 0x76, 0x62, 0x6e, 0x4a, 0x46, 0x52, 0x5e, 0x1a, 0x16, 0x2, 0xe, 0x2a, 0x26, 0x32, 0x3e, 0x4e, 0x42, 0x56, 0x5a, 0x7e, 0x72, 0x66, 0x6a, 0x2e, 0x22, 0x36, 0x3a, 0x1e, 0x12, 0x6, 0xa, 0x8e, 0x82, 0x96, 0x9a, 0xbe, 0xb2, 0xa6, 0xaa, 0xee, 0xe2, 0xf6, 0xfa, 0xde, 0xd2, 0xc6, 0xca, 0xd3, 0xdf, 0xcb, 0xc7, 0xe3, 0xef, 0xfb, 0xf7, 0xb3, 0xbf, 0xab, 0xa7, 0x83, 0x8f, 0x9b, 0x97, 0x13, 0x1f, 0xb, 0x7, 0x23, 0x2f, 0x3b, 0x37, 0x73, 0x7f, 0x6b, 0x67, 0x43, 0x4f, 0x5b, 0x57, 0x69, 0x65, 0x71, 0x7d, 0x59, 0x55, 0x41, 0x4d, 0x9, 0x5, 0x11, 0x1d, 0x39, 0x35, 0x21, 0x2d, 0xa9, 0xa5, 0xb1, 0xbd, 0x99, 0x95, 0x81, 0x8d, 0xc9, 0xc5, 0xd1, 0xdd, 0xf9, 0xf5, 0xe1, 0xed, 0xf4, 0xf8, 0xec, 0xe0, 0xc4, 0xc8, 0xdc, 0xd0, 0x94, 0x98, 0x8c, 0x80, 0xa4, 0xa8, 0xbc, 0xb0, 0x34, 0x38, 0x2c, 0x20, 0x4, 0x8, 0x1c, 0x10, 0x54, 0x58, 0x4c, 0x40, 0x64, 0x68, 0x7c, 0x70}, {0x0, 0xd, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b, 0xd0, 0xdd, 0xca, 0xc7, 0xe4, 0xe9, 0xfe, 0xf3, 0xb8, 0xb5, 0xa2, 0xaf, 0x8c, 0x81, 0x96, 0x9b, 0xbd, 0xb0, 0xa7, 0xaa, 0x89, 0x84, 0x93, 0x9e, 0xd5, 0xd8, 0xcf, 0xc2, 0xe1, 0xec, 0xfb, 0xf6, 0x6d, 0x60, 0x77, 0x7a, 0x59, 0x54, 0x43, 0x4e, 0x5, 0x8, 0x1f, 0x12, 0x31, 0x3c, 0x2b, 0x26, 0x67, 0x6a, 0x7d, 0x70, 0x53, 0x5e, 0x49, 0x44, 0xf, 0x2, 0x15, 0x18, 0x3b, 0x36, 0x21, 0x2c, 0xb7, 0xba, 0xad, 0xa0, 0x83, 0x8e, 0x99, 0x94, 0xdf, 0xd2, 0xc5, 0xc8, 0xeb, 0xe6, 0xf1, 0xfc, 0xda, 0xd7, 0xc0, 0xcd, 0xee, 0xe3, 0xf4, 0xf9, 0xb2, 0xbf, 0xa8, 0xa5, 0x86, 0x8b, 0x9c, 0x91, 0xa, 0x7, 0x10, 0x1d, 0x3e, 0x33, 0x24, 0x29, 0x62, 0x6f, 0x78, 0x75, 0x56, 0x5b, 0x4c, 0x41, 0xce, 0xc3, 0xd4, 0xd9, 0xfa, 0xf7, 0xe0, 0xed, 0xa6, 0xab, 0xbc, 0xb1, 0x92, 0x9f, 0x88, 0x85, 0x1e, 0x13, 0x4, 0x9, 0x2a, 0x27, 0x30, 0x3d, 0x76, 0x7b, 0x6c, 0x61, 0x42, 0x4f, 0x58, 0x55, 0x73, 0x7e, 0x69, 0x64, 0x47, 0x4a, 0x5d, 0x50, 0x1b, 0x16, 0x1, 0xc, 0x2f, 0x22, 0x35, 0x38, 0xa3, 0xae, 0xb9, 0xb4, 0x97, 0x9a, 0x8d, 0x80, 0xcb, 0xc6, 0xd1, 0xdc, 0xff, 0xf2, 0xe5, 0xe8, 0xa9, 0xa4, 0xb3, 0xbe, 0x9d, 0x90, 0x87, 0x8a, 0xc1, 0xcc, 0xdb, 0xd6, 0xf5, 0xf8, 0xef, 0xe2, 0x79, 0x74, 0x63, 0x6e, 0x4d, 0x40, 0x57, 0x5a, 0x11, 0x1c, 0xb, 0x6, 0x25, 0x28, 0x3f, 0x32, 0x14, 0x19, 0xe, 0x3, 0x20, 0x2d, 0x3a, 0x37, 0x7c, 0x71, 0x66, 0x6b, 0x48, 0x45, 0x52, 0x5f, 0xc4, 0xc9, 0xde, 0xd3, 0xf0, 0xfd, 0xea, 0xe7, 0xac, 0xa1, 0xb6, 0xbb, 0x98, 0x95, 0x82, 0x8f}, {0x0, 0xe, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a, 0xe0, 0xee, 0xfc, 0xf2, 0xd8, 0xd6, 0xc4, 0xca, 0x90, 0x9e, 0x8c, 0x82, 0xa8, 0xa6, 0xb4, 0xba, 0xdd, 0xd3, 0xc1, 0xcf, 0xe5, 0xeb, 0xf9, 0xf7, 0xad, 0xa3, 0xb1, 0xbf, 0x95, 0x9b, 0x89, 0x87, 0x3d, 0x33, 0x21, 0x2f, 0x5, 0xb, 0x19, 0x17, 0x4d, 0x43, 0x51, 0x5f, 0x75, 0x7b, 0x69, 0x67, 0xa7, 0xa9, 0xbb, 0xb5, 0x9f, 0x91, 0x83, 0x8d, 0xd7, 0xd9, 0xcb, 0xc5, 0xef, 0xe1, 0xf3, 0xfd, 0x47, 0x49, 0x5b, 0x55, 0x7f, 0x71, 0x63, 0x6d, 0x37, 0x39, 0x2b, 0x25, 0xf, 0x1, 0x13, 0x1d, 0x7a, 0x74, 0x66, 0x68, 0x42, 0x4c, 0x5e, 0x50, 0xa, 0x4, 0x16, 0x18, 0x32, 0x3c, 0x2e, 0x20, 0x9a, 0x94, 0x86, 0x88, 0xa2, 0xac, 0xbe, 0xb0, 0xea, 0xe4, 0xf6, 0xf8, 0xd2, 0xdc, 0xce, 0xc0, 0x53, 0x5d, 0x4f, 0x41, 0x6b, 0x65, 0x77, 0x79, 0x23, 0x2d, 0x3f, 0x31, 0x1b, 0x15, 0x7, 0x9, 0xb3, 0xbd, 0xaf, 0xa1, 0x8b, 0x85, 0x97, 0x99, 0xc3, 0xcd, 0xdf, 0xd1, 0xfb, 0xf5, 0xe7, 0xe9, 0x8e, 0x80, 0x92, 0x9c, 0xb6, 0xb8, 0xaa, 0xa4, 0xfe, 0xf0, 0xe2, 0xec, 0xc6, 0xc8, 0xda, 0xd4, 0x6e, 0x60, 0x72, 0x7c, 0x56, 0x58, 0x4a, 0x44, 0x1e, 0x10, 0x2, 0xc, 0x26, 0x28, 
0x3a, 0x34, 0xf4, 0xfa, 0xe8, 0xe6, 0xcc, 0xc2, 0xd0, 0xde, 0x84, 0x8a, 0x98, 0x96, 0xbc, 0xb2, 0xa0, 0xae, 0x14, 0x1a, 0x8, 0x6, 0x2c, 0x22, 0x30, 0x3e, 0x64, 0x6a, 0x78, 0x76, 0x5c, 0x52, 0x40, 0x4e, 0x29, 0x27, 0x35, 0x3b, 0x11, 0x1f, 0xd, 0x3, 0x59, 0x57, 0x45, 0x4b, 0x61, 0x6f, 0x7d, 0x73, 0xc9, 0xc7, 0xd5, 0xdb, 0xf1, 0xff, 0xed, 0xe3, 0xb9, 0xb7, 0xa5, 0xab, 0x81, 0x8f, 0x9d, 0x93}, {0x0, 0xf, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d, 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55, 0xf0, 0xff, 0xee, 0xe1, 0xcc, 0xc3, 0xd2, 0xdd, 0x88, 0x87, 0x96, 0x99, 0xb4, 0xbb, 0xaa, 0xa5, 0xfd, 0xf2, 0xe3, 0xec, 0xc1, 0xce, 0xdf, 0xd0, 0x85, 0x8a, 0x9b, 0x94, 0xb9, 0xb6, 0xa7, 0xa8, 0xd, 0x2, 0x13, 0x1c, 0x31, 0x3e, 0x2f, 0x20, 0x75, 0x7a, 0x6b, 0x64, 0x49, 0x46, 0x57, 0x58, 0xe7, 0xe8, 0xf9, 0xf6, 0xdb, 0xd4, 0xc5, 0xca, 0x9f, 0x90, 0x81, 0x8e, 0xa3, 0xac, 0xbd, 0xb2, 0x17, 0x18, 0x9, 0x6, 0x2b, 0x24, 0x35, 0x3a, 0x6f, 0x60, 0x71, 0x7e, 0x53, 0x5c, 0x4d, 0x42, 0x1a, 0x15, 0x4, 0xb, 0x26, 0x29, 0x38, 0x37, 0x62, 0x6d, 0x7c, 0x73, 0x5e, 0x51, 0x40, 0x4f, 0xea, 0xe5, 0xf4, 0xfb, 0xd6, 0xd9, 0xc8, 0xc7, 0x92, 0x9d, 0x8c, 0x83, 0xae, 0xa1, 0xb0, 0xbf, 0xd3, 0xdc, 0xcd, 0xc2, 0xef, 0xe0, 0xf1, 0xfe, 0xab, 0xa4, 0xb5, 0xba, 0x97, 0x98, 0x89, 0x86, 0x23, 0x2c, 0x3d, 0x32, 0x1f, 0x10, 0x1, 0xe, 0x5b, 0x54, 0x45, 0x4a, 0x67, 0x68, 0x79, 0x76, 0x2e, 0x21, 0x30, 0x3f, 0x12, 0x1d, 0xc, 0x3, 0x56, 0x59, 0x48, 0x47, 0x6a, 0x65, 0x74, 0x7b, 0xde, 0xd1, 0xc0, 0xcf, 0xe2, 0xed, 0xfc, 0xf3, 0xa6, 0xa9, 0xb8, 0xb7, 0x9a, 0x95, 0x84, 0x8b, 0x34, 0x3b, 0x2a, 0x25, 0x8, 0x7, 0x16, 0x19, 0x4c, 0x43, 0x52, 0x5d, 0x70, 0x7f, 0x6e, 0x61, 0xc4, 0xcb, 0xda, 0xd5, 0xf8, 0xf7, 0xe6, 0xe9, 0xbc, 0xb3, 0xa2, 0xad, 0x80, 0x8f, 0x9e, 0x91, 0xc9, 0xc6, 0xd7, 0xd8, 0xf5, 0xfa, 0xeb, 0xe4, 0xb1, 0xbe, 0xaf, 0xa0, 0x8d, 0x82, 0x93, 0x9c, 0x39, 0x36, 0x27, 0x28, 0x5, 0xa, 0x1b, 0x14, 0x41, 0x4e, 0x5f, 0x50, 0x7d, 0x72, 0x63, 0x6c}, {0x0, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0, 0x1d, 0xd, 0x3d, 0x2d, 0x5d, 0x4d, 0x7d, 0x6d, 0x9d, 0x8d, 0xbd, 0xad, 0xdd, 0xcd, 0xfd, 0xed, 0x3a, 0x2a, 0x1a, 0xa, 0x7a, 0x6a, 0x5a, 0x4a, 0xba, 0xaa, 0x9a, 0x8a, 0xfa, 0xea, 0xda, 0xca, 0x27, 0x37, 0x7, 0x17, 0x67, 0x77, 0x47, 0x57, 0xa7, 0xb7, 0x87, 0x97, 0xe7, 0xf7, 0xc7, 0xd7, 0x74, 0x64, 0x54, 0x44, 0x34, 0x24, 0x14, 0x4, 0xf4, 0xe4, 0xd4, 0xc4, 0xb4, 0xa4, 0x94, 0x84, 0x69, 0x79, 0x49, 0x59, 0x29, 0x39, 0x9, 0x19, 0xe9, 0xf9, 0xc9, 0xd9, 0xa9, 0xb9, 0x89, 0x99, 0x4e, 0x5e, 0x6e, 0x7e, 0xe, 0x1e, 0x2e, 0x3e, 0xce, 0xde, 0xee, 0xfe, 0x8e, 0x9e, 0xae, 0xbe, 0x53, 0x43, 0x73, 0x63, 0x13, 0x3, 0x33, 0x23, 0xd3, 0xc3, 0xf3, 0xe3, 0x93, 0x83, 0xb3, 0xa3, 0xe8, 0xf8, 0xc8, 0xd8, 0xa8, 0xb8, 0x88, 0x98, 0x68, 0x78, 0x48, 0x58, 0x28, 0x38, 0x8, 0x18, 0xf5, 0xe5, 0xd5, 0xc5, 0xb5, 0xa5, 0x95, 0x85, 0x75, 0x65, 0x55, 0x45, 0x35, 0x25, 0x15, 0x5, 0xd2, 0xc2, 0xf2, 0xe2, 0x92, 0x82, 0xb2, 0xa2, 0x52, 0x42, 0x72, 0x62, 0x12, 0x2, 0x32, 0x22, 0xcf, 0xdf, 0xef, 0xff, 0x8f, 0x9f, 0xaf, 0xbf, 0x4f, 0x5f, 0x6f, 0x7f, 0xf, 0x1f, 0x2f, 0x3f, 0x9c, 0x8c, 0xbc, 0xac, 0xdc, 0xcc, 0xfc, 0xec, 0x1c, 0xc, 0x3c, 0x2c, 0x5c, 0x4c, 0x7c, 0x6c, 0x81, 0x91, 0xa1, 0xb1, 0xc1, 0xd1, 0xe1, 0xf1, 0x1, 0x11, 0x21, 0x31, 0x41, 0x51, 0x61, 0x71, 0xa6, 0xb6, 0x86, 0x96, 0xe6, 0xf6, 0xc6, 0xd6, 0x26, 0x36, 0x6, 0x16, 0x66, 0x76, 0x46, 0x56, 0xbb, 0xab, 0x9b, 0x8b, 0xfb, 0xeb, 0xdb, 0xcb, 0x3b, 0x2b, 0x1b, 0xb, 0x7b, 0x6b, 0x5b, 0x4b}, {0x0, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0xd, 0x1c, 0x2f, 
0x3e, 0x49, 0x58, 0x6b, 0x7a, 0x85, 0x94, 0xa7, 0xb6, 0xc1, 0xd0, 0xe3, 0xf2, 0x1a, 0xb, 0x38, 0x29, 0x5e, 0x4f, 0x7c, 0x6d, 0x92, 0x83, 0xb0, 0xa1, 0xd6, 0xc7, 0xf4, 0xe5, 0x17, 0x6, 0x35, 0x24, 0x53, 0x42, 0x71, 0x60, 0x9f, 0x8e, 0xbd, 0xac, 0xdb, 0xca, 0xf9, 0xe8, 0x34, 0x25, 0x16, 0x7, 0x70, 0x61, 0x52, 0x43, 0xbc, 0xad, 0x9e, 0x8f, 0xf8, 0xe9, 0xda, 0xcb, 0x39, 0x28, 0x1b, 0xa, 0x7d, 0x6c, 0x5f, 0x4e, 0xb1, 0xa0, 0x93, 0x82, 0xf5, 0xe4, 0xd7, 0xc6, 0x2e, 0x3f, 0xc, 0x1d, 0x6a, 0x7b, 0x48, 0x59, 0xa6, 0xb7, 0x84, 0x95, 0xe2, 0xf3, 0xc0, 0xd1, 0x23, 0x32, 0x1, 0x10, 0x67, 0x76, 0x45, 0x54, 0xab, 0xba, 0x89, 0x98, 0xef, 0xfe, 0xcd, 0xdc, 0x68, 0x79, 0x4a, 0x5b, 0x2c, 0x3d, 0xe, 0x1f, 0xe0, 0xf1, 0xc2, 0xd3, 0xa4, 0xb5, 0x86, 0x97, 0x65, 0x74, 0x47, 0x56, 0x21, 0x30, 0x3, 0x12, 0xed, 0xfc, 0xcf, 0xde, 0xa9, 0xb8, 0x8b, 0x9a, 0x72, 0x63, 0x50, 0x41, 0x36, 0x27, 0x14, 0x5, 0xfa, 0xeb, 0xd8, 0xc9, 0xbe, 0xaf, 0x9c, 0x8d, 0x7f, 0x6e, 0x5d, 0x4c, 0x3b, 0x2a, 0x19, 0x8, 0xf7, 0xe6, 0xd5, 0xc4, 0xb3, 0xa2, 0x91, 0x80, 0x5c, 0x4d, 0x7e, 0x6f, 0x18, 0x9, 0x3a, 0x2b, 0xd4, 0xc5, 0xf6, 0xe7, 0x90, 0x81, 0xb2, 0xa3, 0x51, 0x40, 0x73, 0x62, 0x15, 0x4, 0x37, 0x26, 0xd9, 0xc8, 0xfb, 0xea, 0x9d, 0x8c, 0xbf, 0xae, 0x46, 0x57, 0x64, 0x75, 0x2, 0x13, 0x20, 0x31, 0xce, 0xdf, 0xec, 0xfd, 0x8a, 0x9b, 0xa8, 0xb9, 0x4b, 0x5a, 0x69, 0x78, 0xf, 0x1e, 0x2d, 0x3c, 0xc3, 0xd2, 0xe1, 0xf0, 0x87, 0x96, 0xa5, 0xb4}, {0x0, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e, 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee, 0x3d, 0x2f, 0x19, 0xb, 0x75, 0x67, 0x51, 0x43, 0xad, 0xbf, 0x89, 0x9b, 0xe5, 0xf7, 0xc1, 0xd3, 0x7a, 0x68, 0x5e, 0x4c, 0x32, 0x20, 0x16, 0x4, 0xea, 0xf8, 0xce, 0xdc, 0xa2, 0xb0, 0x86, 0x94, 0x47, 0x55, 0x63, 0x71, 0xf, 0x1d, 0x2b, 0x39, 0xd7, 0xc5, 0xf3, 0xe1, 0x9f, 0x8d, 0xbb, 0xa9, 0xf4, 0xe6, 0xd0, 0xc2, 0xbc, 0xae, 0x98, 0x8a, 0x64, 0x76, 0x40, 0x52, 0x2c, 0x3e, 0x8, 0x1a, 0xc9, 0xdb, 0xed, 0xff, 0x81, 0x93, 0xa5, 0xb7, 0x59, 0x4b, 0x7d, 0x6f, 0x11, 0x3, 0x35, 0x27, 0x8e, 0x9c, 0xaa, 0xb8, 0xc6, 0xd4, 0xe2, 0xf0, 0x1e, 0xc, 0x3a, 0x28, 0x56, 0x44, 0x72, 0x60, 0xb3, 0xa1, 0x97, 0x85, 0xfb, 0xe9, 0xdf, 0xcd, 0x23, 0x31, 0x7, 0x15, 0x6b, 0x79, 0x4f, 0x5d, 0xf5, 0xe7, 0xd1, 0xc3, 0xbd, 0xaf, 0x99, 0x8b, 0x65, 0x77, 0x41, 0x53, 0x2d, 0x3f, 0x9, 0x1b, 0xc8, 0xda, 0xec, 0xfe, 0x80, 0x92, 0xa4, 0xb6, 0x58, 0x4a, 0x7c, 0x6e, 0x10, 0x2, 0x34, 0x26, 0x8f, 0x9d, 0xab, 0xb9, 0xc7, 0xd5, 0xe3, 0xf1, 0x1f, 0xd, 0x3b, 0x29, 0x57, 0x45, 0x73, 0x61, 0xb2, 0xa0, 0x96, 0x84, 0xfa, 0xe8, 0xde, 0xcc, 0x22, 0x30, 0x6, 0x14, 0x6a, 0x78, 0x4e, 0x5c, 0x1, 0x13, 0x25, 0x37, 0x49, 0x5b, 0x6d, 0x7f, 0x91, 0x83, 0xb5, 0xa7, 0xd9, 0xcb, 0xfd, 0xef, 0x3c, 0x2e, 0x18, 0xa, 0x74, 0x66, 0x50, 0x42, 0xac, 0xbe, 0x88, 0x9a, 0xe4, 0xf6, 0xc0, 0xd2, 0x7b, 0x69, 0x5f, 0x4d, 0x33, 0x21, 0x17, 0x5, 0xeb, 0xf9, 0xcf, 0xdd, 0xa3, 0xb1, 0x87, 0x95, 0x46, 0x54, 0x62, 0x70, 0xe, 0x1c, 0x2a, 0x38, 0xd6, 0xc4, 0xf2, 0xe0, 0x9e, 0x8c, 0xba, 0xa8}, {0x0, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79, 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1, 0x2d, 0x3e, 0xb, 0x18, 0x61, 0x72, 0x47, 0x54, 0xb5, 0xa6, 0x93, 0x80, 0xf9, 0xea, 0xdf, 0xcc, 0x5a, 0x49, 0x7c, 0x6f, 0x16, 0x5, 0x30, 0x23, 0xc2, 0xd1, 0xe4, 0xf7, 0x8e, 0x9d, 0xa8, 0xbb, 0x77, 0x64, 0x51, 0x42, 0x3b, 0x28, 0x1d, 0xe, 0xef, 0xfc, 0xc9, 0xda, 0xa3, 0xb0, 0x85, 0x96, 0xb4, 0xa7, 0x92, 0x81, 0xf8, 0xeb, 0xde, 0xcd, 0x2c, 0x3f, 0xa, 0x19, 0x60, 0x73, 0x46, 0x55, 0x99, 0x8a, 0xbf, 0xac, 0xd5, 0xc6, 0xf3, 0xe0, 0x1, 0x12, 0x27, 0x34, 0x4d, 0x5e, 0x6b, 0x78, 0xee, 0xfd, 0xc8, 0xdb, 0xa2, 0xb1, 0x84, 0x97, 
0x76, 0x65, 0x50, 0x43, 0x3a, 0x29, 0x1c, 0xf, 0xc3, 0xd0, 0xe5, 0xf6, 0x8f, 0x9c, 0xa9, 0xba, 0x5b, 0x48, 0x7d, 0x6e, 0x17, 0x4, 0x31, 0x22, 0x75, 0x66, 0x53, 0x40, 0x39, 0x2a, 0x1f, 0xc, 0xed, 0xfe, 0xcb, 0xd8, 0xa1, 0xb2, 0x87, 0x94, 0x58, 0x4b, 0x7e, 0x6d, 0x14, 0x7, 0x32, 0x21, 0xc0, 0xd3, 0xe6, 0xf5, 0x8c, 0x9f, 0xaa, 0xb9, 0x2f, 0x3c, 0x9, 0x1a, 0x63, 0x70, 0x45, 0x56, 0xb7, 0xa4, 0x91, 0x82, 0xfb, 0xe8, 0xdd, 0xce, 0x2, 0x11, 0x24, 0x37, 0x4e, 0x5d, 0x68, 0x7b, 0x9a, 0x89, 0xbc, 0xaf, 0xd6, 0xc5, 0xf0, 0xe3, 0xc1, 0xd2, 0xe7, 0xf4, 0x8d, 0x9e, 0xab, 0xb8, 0x59, 0x4a, 0x7f, 0x6c, 0x15, 0x6, 0x33, 0x20, 0xec, 0xff, 0xca, 0xd9, 0xa0, 0xb3, 0x86, 0x95, 0x74, 0x67, 0x52, 0x41, 0x38, 0x2b, 0x1e, 0xd, 0x9b, 0x88, 0xbd, 0xae, 0xd7, 0xc4, 0xf1, 0xe2, 0x3, 0x10, 0x25, 0x36, 0x4f, 0x5c, 0x69, 0x7a, 0xb6, 0xa5, 0x90, 0x83, 0xfa, 0xe9, 0xdc, 0xcf, 0x2e, 0x3d, 0x8, 0x1b, 0x62, 0x71, 0x44, 0x57}, {0x0, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c, 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc, 0x5d, 0x49, 0x75, 0x61, 0xd, 0x19, 0x25, 0x31, 0xfd, 0xe9, 0xd5, 0xc1, 0xad, 0xb9, 0x85, 0x91, 0xba, 0xae, 0x92, 0x86, 0xea, 0xfe, 0xc2, 0xd6, 0x1a, 0xe, 0x32, 0x26, 0x4a, 0x5e, 0x62, 0x76, 0xe7, 0xf3, 0xcf, 0xdb, 0xb7, 0xa3, 0x9f, 0x8b, 0x47, 0x53, 0x6f, 0x7b, 0x17, 0x3, 0x3f, 0x2b, 0x69, 0x7d, 0x41, 0x55, 0x39, 0x2d, 0x11, 0x5, 0xc9, 0xdd, 0xe1, 0xf5, 0x99, 0x8d, 0xb1, 0xa5, 0x34, 0x20, 0x1c, 0x8, 0x64, 0x70, 0x4c, 0x58, 0x94, 0x80, 0xbc, 0xa8, 0xc4, 0xd0, 0xec, 0xf8, 0xd3, 0xc7, 0xfb, 0xef, 0x83, 0x97, 0xab, 0xbf, 0x73, 0x67, 0x5b, 0x4f, 0x23, 0x37, 0xb, 0x1f, 0x8e, 0x9a, 0xa6, 0xb2, 0xde, 0xca, 0xf6, 0xe2, 0x2e, 0x3a, 0x6, 0x12, 0x7e, 0x6a, 0x56, 0x42, 0xd2, 0xc6, 0xfa, 0xee, 0x82, 0x96, 0xaa, 0xbe, 0x72, 0x66, 0x5a, 0x4e, 0x22, 0x36, 0xa, 0x1e, 0x8f, 0x9b, 0xa7, 0xb3, 0xdf, 0xcb, 0xf7, 0xe3, 0x2f, 0x3b, 0x7, 0x13, 0x7f, 0x6b, 0x57, 0x43, 0x68, 0x7c, 0x40, 0x54, 0x38, 0x2c, 0x10, 0x4, 0xc8, 0xdc, 0xe0, 0xf4, 0x98, 0x8c, 0xb0, 0xa4, 0x35, 0x21, 0x1d, 0x9, 0x65, 0x71, 0x4d, 0x59, 0x95, 0x81, 0xbd, 0xa9, 0xc5, 0xd1, 0xed, 0xf9, 0xbb, 0xaf, 0x93, 0x87, 0xeb, 0xff, 0xc3, 0xd7, 0x1b, 0xf, 0x33, 0x27, 0x4b, 0x5f, 0x63, 0x77, 0xe6, 0xf2, 0xce, 0xda, 0xb6, 0xa2, 0x9e, 0x8a, 0x46, 0x52, 0x6e, 0x7a, 0x16, 0x2, 0x3e, 0x2a, 0x1, 0x15, 0x29, 0x3d, 0x51, 0x45, 0x79, 0x6d, 0xa1, 0xb5, 0x89, 0x9d, 0xf1, 0xe5, 0xd9, 0xcd, 0x5c, 0x48, 0x74, 0x60, 0xc, 0x18, 0x24, 0x30, 0xfc, 0xe8, 0xd4, 0xc0, 0xac, 0xb8, 0x84, 0x90}, {0x0, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b, 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3, 0x4d, 0x58, 0x67, 0x72, 0x19, 0xc, 0x33, 0x26, 0xe5, 0xf0, 0xcf, 0xda, 0xb1, 0xa4, 0x9b, 0x8e, 0x9a, 0x8f, 0xb0, 0xa5, 0xce, 0xdb, 0xe4, 0xf1, 0x32, 0x27, 0x18, 0xd, 0x66, 0x73, 0x4c, 0x59, 0xd7, 0xc2, 0xfd, 0xe8, 0x83, 0x96, 0xa9, 0xbc, 0x7f, 0x6a, 0x55, 0x40, 0x2b, 0x3e, 0x1, 0x14, 0x29, 0x3c, 0x3, 0x16, 0x7d, 0x68, 0x57, 0x42, 0x81, 0x94, 0xab, 0xbe, 0xd5, 0xc0, 0xff, 0xea, 0x64, 0x71, 0x4e, 0x5b, 0x30, 0x25, 0x1a, 0xf, 0xcc, 0xd9, 0xe6, 0xf3, 0x98, 0x8d, 0xb2, 0xa7, 0xb3, 0xa6, 0x99, 0x8c, 0xe7, 0xf2, 0xcd, 0xd8, 0x1b, 0xe, 0x31, 0x24, 0x4f, 0x5a, 0x65, 0x70, 0xfe, 0xeb, 0xd4, 0xc1, 0xaa, 0xbf, 0x80, 0x95, 0x56, 0x43, 0x7c, 0x69, 0x2, 0x17, 0x28, 0x3d, 0x52, 0x47, 0x78, 0x6d, 0x6, 0x13, 0x2c, 0x39, 0xfa, 0xef, 0xd0, 0xc5, 0xae, 0xbb, 0x84, 0x91, 0x1f, 0xa, 0x35, 0x20, 0x4b, 0x5e, 0x61, 0x74, 0xb7, 0xa2, 0x9d, 0x88, 0xe3, 0xf6, 0xc9, 0xdc, 0xc8, 0xdd, 0xe2, 0xf7, 0x9c, 0x89, 0xb6, 0xa3, 0x60, 0x75, 0x4a, 0x5f, 0x34, 0x21, 0x1e, 0xb, 0x85, 0x90, 0xaf, 0xba, 0xd1, 0xc4, 0xfb, 0xee, 0x2d, 0x38, 0x7, 0x12, 0x79, 0x6c, 
0x53, 0x46, 0x7b, 0x6e, 0x51, 0x44, 0x2f, 0x3a, 0x5, 0x10, 0xd3, 0xc6, 0xf9, 0xec, 0x87, 0x92, 0xad, 0xb8, 0x36, 0x23, 0x1c, 0x9, 0x62, 0x77, 0x48, 0x5d, 0x9e, 0x8b, 0xb4, 0xa1, 0xca, 0xdf, 0xe0, 0xf5, 0xe1, 0xf4, 0xcb, 0xde, 0xb5, 0xa0, 0x9f, 0x8a, 0x49, 0x5c, 0x63, 0x76, 0x1d, 0x8, 0x37, 0x22, 0xac, 0xb9, 0x86, 0x93, 0xf8, 0xed, 0xd2, 0xc7, 0x4, 0x11, 0x2e, 0x3b, 0x50, 0x45, 0x7a, 0x6f}, {0x0, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62, 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2, 0x7d, 0x6b, 0x51, 0x47, 0x25, 0x33, 0x9, 0x1f, 0xcd, 0xdb, 0xe1, 0xf7, 0x95, 0x83, 0xb9, 0xaf, 0xfa, 0xec, 0xd6, 0xc0, 0xa2, 0xb4, 0x8e, 0x98, 0x4a, 0x5c, 0x66, 0x70, 0x12, 0x4, 0x3e, 0x28, 0x87, 0x91, 0xab, 0xbd, 0xdf, 0xc9, 0xf3, 0xe5, 0x37, 0x21, 0x1b, 0xd, 0x6f, 0x79, 0x43, 0x55, 0xe9, 0xff, 0xc5, 0xd3, 0xb1, 0xa7, 0x9d, 0x8b, 0x59, 0x4f, 0x75, 0x63, 0x1, 0x17, 0x2d, 0x3b, 0x94, 0x82, 0xb8, 0xae, 0xcc, 0xda, 0xe0, 0xf6, 0x24, 0x32, 0x8, 0x1e, 0x7c, 0x6a, 0x50, 0x46, 0x13, 0x5, 0x3f, 0x29, 0x4b, 0x5d, 0x67, 0x71, 0xa3, 0xb5, 0x8f, 0x99, 0xfb, 0xed, 0xd7, 0xc1, 0x6e, 0x78, 0x42, 0x54, 0x36, 0x20, 0x1a, 0xc, 0xde, 0xc8, 0xf2, 0xe4, 0x86, 0x90, 0xaa, 0xbc, 0xcf, 0xd9, 0xe3, 0xf5, 0x97, 0x81, 0xbb, 0xad, 0x7f, 0x69, 0x53, 0x45, 0x27, 0x31, 0xb, 0x1d, 0xb2, 0xa4, 0x9e, 0x88, 0xea, 0xfc, 0xc6, 0xd0, 0x2, 0x14, 0x2e, 0x38, 0x5a, 0x4c, 0x76, 0x60, 0x35, 0x23, 0x19, 0xf, 0x6d, 0x7b, 0x41, 0x57, 0x85, 0x93, 0xa9, 0xbf, 0xdd, 0xcb, 0xf1, 0xe7, 0x48, 0x5e, 0x64, 0x72, 0x10, 0x6, 0x3c, 0x2a, 0xf8, 0xee, 0xd4, 0xc2, 0xa0, 0xb6, 0x8c, 0x9a, 0x26, 0x30, 0xa, 0x1c, 0x7e, 0x68, 0x52, 0x44, 0x96, 0x80, 0xba, 0xac, 0xce, 0xd8, 0xe2, 0xf4, 0x5b, 0x4d, 0x77, 0x61, 0x3, 0x15, 0x2f, 0x39, 0xeb, 0xfd, 0xc7, 0xd1, 0xb3, 0xa5, 0x9f, 0x89, 0xdc, 0xca, 0xf0, 0xe6, 0x84, 0x92, 0xa8, 0xbe, 0x6c, 0x7a, 0x40, 0x56, 0x34, 0x22, 0x18, 0xe, 0xa1, 0xb7, 0x8d, 0x9b, 0xf9, 0xef, 0xd5, 0xc3, 0x11, 0x7, 0x3d, 0x2b, 0x49, 0x5f, 0x65, 0x73}, {0x0, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65, 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd, 0x6d, 0x7a, 0x43, 0x54, 0x31, 0x26, 0x1f, 0x8, 0xd5, 0xc2, 0xfb, 0xec, 0x89, 0x9e, 0xa7, 0xb0, 0xda, 0xcd, 0xf4, 0xe3, 0x86, 0x91, 0xa8, 0xbf, 0x62, 0x75, 0x4c, 0x5b, 0x3e, 0x29, 0x10, 0x7, 0xb7, 0xa0, 0x99, 0x8e, 0xeb, 0xfc, 0xc5, 0xd2, 0xf, 0x18, 0x21, 0x36, 0x53, 0x44, 0x7d, 0x6a, 0xa9, 0xbe, 0x87, 0x90, 0xf5, 0xe2, 0xdb, 0xcc, 0x11, 0x6, 0x3f, 0x28, 0x4d, 0x5a, 0x63, 0x74, 0xc4, 0xd3, 0xea, 0xfd, 0x98, 0x8f, 0xb6, 0xa1, 0x7c, 0x6b, 0x52, 0x45, 0x20, 0x37, 0xe, 0x19, 0x73, 0x64, 0x5d, 0x4a, 0x2f, 0x38, 0x1, 0x16, 0xcb, 0xdc, 0xe5, 0xf2, 0x97, 0x80, 0xb9, 0xae, 0x1e, 0x9, 0x30, 0x27, 0x42, 0x55, 0x6c, 0x7b, 0xa6, 0xb1, 0x88, 0x9f, 0xfa, 0xed, 0xd4, 0xc3, 0x4f, 0x58, 0x61, 0x76, 0x13, 0x4, 0x3d, 0x2a, 0xf7, 0xe0, 0xd9, 0xce, 0xab, 0xbc, 0x85, 0x92, 0x22, 0x35, 0xc, 0x1b, 0x7e, 0x69, 0x50, 0x47, 0x9a, 0x8d, 0xb4, 0xa3, 0xc6, 0xd1, 0xe8, 0xff, 0x95, 0x82, 0xbb, 0xac, 0xc9, 0xde, 0xe7, 0xf0, 0x2d, 0x3a, 0x3, 0x14, 0x71, 0x66, 0x5f, 0x48, 0xf8, 0xef, 0xd6, 0xc1, 0xa4, 0xb3, 0x8a, 0x9d, 0x40, 0x57, 0x6e, 0x79, 0x1c, 0xb, 0x32, 0x25, 0xe6, 0xf1, 0xc8, 0xdf, 0xba, 0xad, 0x94, 0x83, 0x5e, 0x49, 0x70, 0x67, 0x2, 0x15, 0x2c, 0x3b, 0x8b, 0x9c, 0xa5, 0xb2, 0xd7, 0xc0, 0xf9, 0xee, 0x33, 0x24, 0x1d, 0xa, 0x6f, 0x78, 0x41, 0x56, 0x3c, 0x2b, 0x12, 0x5, 0x60, 0x77, 0x4e, 0x59, 0x84, 0x93, 0xaa, 0xbd, 0xd8, 0xcf, 0xf6, 0xe1, 0x51, 0x46, 0x7f, 0x68, 0xd, 0x1a, 0x23, 0x34, 0xe9, 0xfe, 0xc7, 0xd0, 0xb5, 0xa2, 0x9b, 0x8c}, {0x0, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48, 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88, 0x9d, 0x85, 0xad, 
0xb5, 0xfd, 0xe5, 0xcd, 0xd5, 0x5d, 0x45, 0x6d, 0x75, 0x3d, 0x25, 0xd, 0x15, 0x27, 0x3f, 0x17, 0xf, 0x47, 0x5f, 0x77, 0x6f, 0xe7, 0xff, 0xd7, 0xcf, 0x87, 0x9f, 0xb7, 0xaf, 0xba, 0xa2, 0x8a, 0x92, 0xda, 0xc2, 0xea, 0xf2, 0x7a, 0x62, 0x4a, 0x52, 0x1a, 0x2, 0x2a, 0x32, 0x4e, 0x56, 0x7e, 0x66, 0x2e, 0x36, 0x1e, 0x6, 0x8e, 0x96, 0xbe, 0xa6, 0xee, 0xf6, 0xde, 0xc6, 0xd3, 0xcb, 0xe3, 0xfb, 0xb3, 0xab, 0x83, 0x9b, 0x13, 0xb, 0x23, 0x3b, 0x73, 0x6b, 0x43, 0x5b, 0x69, 0x71, 0x59, 0x41, 0x9, 0x11, 0x39, 0x21, 0xa9, 0xb1, 0x99, 0x81, 0xc9, 0xd1, 0xf9, 0xe1, 0xf4, 0xec, 0xc4, 0xdc, 0x94, 0x8c, 0xa4, 0xbc, 0x34, 0x2c, 0x4, 0x1c, 0x54, 0x4c, 0x64, 0x7c, 0x9c, 0x84, 0xac, 0xb4, 0xfc, 0xe4, 0xcc, 0xd4, 0x5c, 0x44, 0x6c, 0x74, 0x3c, 0x24, 0xc, 0x14, 0x1, 0x19, 0x31, 0x29, 0x61, 0x79, 0x51, 0x49, 0xc1, 0xd9, 0xf1, 0xe9, 0xa1, 0xb9, 0x91, 0x89, 0xbb, 0xa3, 0x8b, 0x93, 0xdb, 0xc3, 0xeb, 0xf3, 0x7b, 0x63, 0x4b, 0x53, 0x1b, 0x3, 0x2b, 0x33, 0x26, 0x3e, 0x16, 0xe, 0x46, 0x5e, 0x76, 0x6e, 0xe6, 0xfe, 0xd6, 0xce, 0x86, 0x9e, 0xb6, 0xae, 0xd2, 0xca, 0xe2, 0xfa, 0xb2, 0xaa, 0x82, 0x9a, 0x12, 0xa, 0x22, 0x3a, 0x72, 0x6a, 0x42, 0x5a, 0x4f, 0x57, 0x7f, 0x67, 0x2f, 0x37, 0x1f, 0x7, 0x8f, 0x97, 0xbf, 0xa7, 0xef, 0xf7, 0xdf, 0xc7, 0xf5, 0xed, 0xc5, 0xdd, 0x95, 0x8d, 0xa5, 0xbd, 0x35, 0x2d, 0x5, 0x1d, 0x55, 0x4d, 0x65, 0x7d, 0x68, 0x70, 0x58, 0x40, 0x8, 0x10, 0x38, 0x20, 0xa8, 0xb0, 0x98, 0x80, 0xc8, 0xd0, 0xf8, 0xe0}, {0x0, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f, 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87, 0x8d, 0x94, 0xbf, 0xa6, 0xe9, 0xf0, 0xdb, 0xc2, 0x45, 0x5c, 0x77, 0x6e, 0x21, 0x38, 0x13, 0xa, 0x7, 0x1e, 0x35, 0x2c, 0x63, 0x7a, 0x51, 0x48, 0xcf, 0xd6, 0xfd, 0xe4, 0xab, 0xb2, 0x99, 0x80, 0x8a, 0x93, 0xb8, 0xa1, 0xee, 0xf7, 0xdc, 0xc5, 0x42, 0x5b, 0x70, 0x69, 0x26, 0x3f, 0x14, 0xd, 0xe, 0x17, 0x3c, 0x25, 0x6a, 0x73, 0x58, 0x41, 0xc6, 0xdf, 0xf4, 0xed, 0xa2, 0xbb, 0x90, 0x89, 0x83, 0x9a, 0xb1, 0xa8, 0xe7, 0xfe, 0xd5, 0xcc, 0x4b, 0x52, 0x79, 0x60, 0x2f, 0x36, 0x1d, 0x4, 0x9, 0x10, 0x3b, 0x22, 0x6d, 0x74, 0x5f, 0x46, 0xc1, 0xd8, 0xf3, 0xea, 0xa5, 0xbc, 0x97, 0x8e, 0x84, 0x9d, 0xb6, 0xaf, 0xe0, 0xf9, 0xd2, 0xcb, 0x4c, 0x55, 0x7e, 0x67, 0x28, 0x31, 0x1a, 0x3, 0x1c, 0x5, 0x2e, 0x37, 0x78, 0x61, 0x4a, 0x53, 0xd4, 0xcd, 0xe6, 0xff, 0xb0, 0xa9, 0x82, 0x9b, 0x91, 0x88, 0xa3, 0xba, 0xf5, 0xec, 0xc7, 0xde, 0x59, 0x40, 0x6b, 0x72, 0x3d, 0x24, 0xf, 0x16, 0x1b, 0x2, 0x29, 0x30, 0x7f, 0x66, 0x4d, 0x54, 0xd3, 0xca, 0xe1, 0xf8, 0xb7, 0xae, 0x85, 0x9c, 0x96, 0x8f, 0xa4, 0xbd, 0xf2, 0xeb, 0xc0, 0xd9, 0x5e, 0x47, 0x6c, 0x75, 0x3a, 0x23, 0x8, 0x11, 0x12, 0xb, 0x20, 0x39, 0x76, 0x6f, 0x44, 0x5d, 0xda, 0xc3, 0xe8, 0xf1, 0xbe, 0xa7, 0x8c, 0x95, 0x9f, 0x86, 0xad, 0xb4, 0xfb, 0xe2, 0xc9, 0xd0, 0x57, 0x4e, 0x65, 0x7c, 0x33, 0x2a, 0x1, 0x18, 0x15, 0xc, 0x27, 0x3e, 0x71, 0x68, 0x43, 0x5a, 0xdd, 0xc4, 0xef, 0xf6, 0xb9, 0xa0, 0x8b, 0x92, 0x98, 0x81, 0xaa, 0xb3, 0xfc, 0xe5, 0xce, 0xd7, 0x50, 0x49, 0x62, 0x7b, 0x34, 0x2d, 0x6, 0x1f}, {0x0, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46, 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96, 0xbd, 0xa7, 0x89, 0x93, 0xd5, 0xcf, 0xe1, 0xfb, 0x6d, 0x77, 0x59, 0x43, 0x5, 0x1f, 0x31, 0x2b, 0x67, 0x7d, 0x53, 0x49, 0xf, 0x15, 0x3b, 0x21, 0xb7, 0xad, 0x83, 0x99, 0xdf, 0xc5, 0xeb, 0xf1, 0xda, 0xc0, 0xee, 0xf4, 0xb2, 0xa8, 0x86, 0x9c, 0xa, 0x10, 0x3e, 0x24, 0x62, 0x78, 0x56, 0x4c, 0xce, 0xd4, 0xfa, 0xe0, 0xa6, 0xbc, 0x92, 0x88, 0x1e, 0x4, 0x2a, 0x30, 0x76, 0x6c, 0x42, 0x58, 0x73, 0x69, 0x47, 0x5d, 0x1b, 0x1, 0x2f, 0x35, 0xa3, 0xb9, 0x97, 0x8d, 0xcb, 0xd1, 0xff, 0xe5, 0xa9, 0xb3, 0x9d, 0x87, 0xc1, 0xdb, 0xf5, 0xef, 0x79, 
0x63, 0x4d, 0x57, 0x11, 0xb, 0x25, 0x3f, 0x14, 0xe, 0x20, 0x3a, 0x7c, 0x66, 0x48, 0x52, 0xc4, 0xde, 0xf0, 0xea, 0xac, 0xb6, 0x98, 0x82, 0x81, 0x9b, 0xb5, 0xaf, 0xe9, 0xf3, 0xdd, 0xc7, 0x51, 0x4b, 0x65, 0x7f, 0x39, 0x23, 0xd, 0x17, 0x3c, 0x26, 0x8, 0x12, 0x54, 0x4e, 0x60, 0x7a, 0xec, 0xf6, 0xd8, 0xc2, 0x84, 0x9e, 0xb0, 0xaa, 0xe6, 0xfc, 0xd2, 0xc8, 0x8e, 0x94, 0xba, 0xa0, 0x36, 0x2c, 0x2, 0x18, 0x5e, 0x44, 0x6a, 0x70, 0x5b, 0x41, 0x6f, 0x75, 0x33, 0x29, 0x7, 0x1d, 0x8b, 0x91, 0xbf, 0xa5, 0xe3, 0xf9, 0xd7, 0xcd, 0x4f, 0x55, 0x7b, 0x61, 0x27, 0x3d, 0x13, 0x9, 0x9f, 0x85, 0xab, 0xb1, 0xf7, 0xed, 0xc3, 0xd9, 0xf2, 0xe8, 0xc6, 0xdc, 0x9a, 0x80, 0xae, 0xb4, 0x22, 0x38, 0x16, 0xc, 0x4a, 0x50, 0x7e, 0x64, 0x28, 0x32, 0x1c, 0x6, 0x40, 0x5a, 0x74, 0x6e, 0xf8, 0xe2, 0xcc, 0xd6, 0x90, 0x8a, 0xa4, 0xbe, 0x95, 0x8f, 0xa1, 0xbb, 0xfd, 0xe7, 0xc9, 0xd3, 0x45, 0x5f, 0x71, 0x6b, 0x2d, 0x37, 0x19, 0x3}, {0x0, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99, 0xad, 0xb6, 0x9b, 0x80, 0xc1, 0xda, 0xf7, 0xec, 0x75, 0x6e, 0x43, 0x58, 0x19, 0x2, 0x2f, 0x34, 0x47, 0x5c, 0x71, 0x6a, 0x2b, 0x30, 0x1d, 0x6, 0x9f, 0x84, 0xa9, 0xb2, 0xf3, 0xe8, 0xc5, 0xde, 0xea, 0xf1, 0xdc, 0xc7, 0x86, 0x9d, 0xb0, 0xab, 0x32, 0x29, 0x4, 0x1f, 0x5e, 0x45, 0x68, 0x73, 0x8e, 0x95, 0xb8, 0xa3, 0xe2, 0xf9, 0xd4, 0xcf, 0x56, 0x4d, 0x60, 0x7b, 0x3a, 0x21, 0xc, 0x17, 0x23, 0x38, 0x15, 0xe, 0x4f, 0x54, 0x79, 0x62, 0xfb, 0xe0, 0xcd, 0xd6, 0x97, 0x8c, 0xa1, 0xba, 0xc9, 0xd2, 0xff, 0xe4, 0xa5, 0xbe, 0x93, 0x88, 0x11, 0xa, 0x27, 0x3c, 0x7d, 0x66, 0x4b, 0x50, 0x64, 0x7f, 0x52, 0x49, 0x8, 0x13, 0x3e, 0x25, 0xbc, 0xa7, 0x8a, 0x91, 0xd0, 0xcb, 0xe6, 0xfd, 0x1, 0x1a, 0x37, 0x2c, 0x6d, 0x76, 0x5b, 0x40, 0xd9, 0xc2, 0xef, 0xf4, 0xb5, 0xae, 0x83, 0x98, 0xac, 0xb7, 0x9a, 0x81, 0xc0, 0xdb, 0xf6, 0xed, 0x74, 0x6f, 0x42, 0x59, 0x18, 0x3, 0x2e, 0x35, 0x46, 0x5d, 0x70, 0x6b, 0x2a, 0x31, 0x1c, 0x7, 0x9e, 0x85, 0xa8, 0xb3, 0xf2, 0xe9, 0xc4, 0xdf, 0xeb, 0xf0, 0xdd, 0xc6, 0x87, 0x9c, 0xb1, 0xaa, 0x33, 0x28, 0x5, 0x1e, 0x5f, 0x44, 0x69, 0x72, 0x8f, 0x94, 0xb9, 0xa2, 0xe3, 0xf8, 0xd5, 0xce, 0x57, 0x4c, 0x61, 0x7a, 0x3b, 0x20, 0xd, 0x16, 0x22, 0x39, 0x14, 0xf, 0x4e, 0x55, 0x78, 0x63, 0xfa, 0xe1, 0xcc, 0xd7, 0x96, 0x8d, 0xa0, 0xbb, 0xc8, 0xd3, 0xfe, 0xe5, 0xa4, 0xbf, 0x92, 0x89, 0x10, 0xb, 0x26, 0x3d, 0x7c, 0x67, 0x4a, 0x51, 0x65, 0x7e, 0x53, 0x48, 0x9, 0x12, 0x3f, 0x24, 0xbd, 0xa6, 0x8b, 0x90, 0xd1, 0xca, 0xe7, 0xfc}, {0x0, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54, 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4, 0xdd, 0xc1, 0xe5, 0xf9, 0xad, 0xb1, 0x95, 0x89, 0x3d, 0x21, 0x5, 0x19, 0x4d, 0x51, 0x75, 0x69, 0xa7, 0xbb, 0x9f, 0x83, 0xd7, 0xcb, 0xef, 0xf3, 0x47, 0x5b, 0x7f, 0x63, 0x37, 0x2b, 0xf, 0x13, 0x7a, 0x66, 0x42, 0x5e, 0xa, 0x16, 0x32, 0x2e, 0x9a, 0x86, 0xa2, 0xbe, 0xea, 0xf6, 0xd2, 0xce, 0x53, 0x4f, 0x6b, 0x77, 0x23, 0x3f, 0x1b, 0x7, 0xb3, 0xaf, 0x8b, 0x97, 0xc3, 0xdf, 0xfb, 0xe7, 0x8e, 0x92, 0xb6, 0xaa, 0xfe, 0xe2, 0xc6, 0xda, 0x6e, 0x72, 0x56, 0x4a, 0x1e, 0x2, 0x26, 0x3a, 0xf4, 0xe8, 0xcc, 0xd0, 0x84, 0x98, 0xbc, 0xa0, 0x14, 0x8, 0x2c, 0x30, 0x64, 0x78, 0x5c, 0x40, 0x29, 0x35, 0x11, 0xd, 0x59, 0x45, 0x61, 0x7d, 0xc9, 0xd5, 0xf1, 0xed, 0xb9, 0xa5, 0x81, 0x9d, 0xa6, 0xba, 0x9e, 0x82, 0xd6, 0xca, 0xee, 0xf2, 0x46, 0x5a, 0x7e, 0x62, 0x36, 0x2a, 0xe, 0x12, 0x7b, 0x67, 0x43, 0x5f, 0xb, 0x17, 0x33, 0x2f, 0x9b, 0x87, 0xa3, 0xbf, 0xeb, 0xf7, 0xd3, 0xcf, 0x1, 0x1d, 0x39, 0x25, 0x71, 0x6d, 0x49, 0x55, 0xe1, 0xfd, 0xd9, 0xc5, 0x91, 0x8d, 0xa9, 0xb5, 0xdc, 0xc0, 0xe4, 0xf8, 0xac, 0xb0, 0x94, 0x88, 0x3c, 0x20, 0x4, 0x18, 0x4c, 0x50, 0x74, 
0x68, 0xf5, 0xe9, 0xcd, 0xd1, 0x85, 0x99, 0xbd, 0xa1, 0x15, 0x9, 0x2d, 0x31, 0x65, 0x79, 0x5d, 0x41, 0x28, 0x34, 0x10, 0xc, 0x58, 0x44, 0x60, 0x7c, 0xc8, 0xd4, 0xf0, 0xec, 0xb8, 0xa4, 0x80, 0x9c, 0x52, 0x4e, 0x6a, 0x76, 0x22, 0x3e, 0x1a, 0x6, 0xb2, 0xae, 0x8a, 0x96, 0xc2, 0xde, 0xfa, 0xe6, 0x8f, 0x93, 0xb7, 0xab, 0xff, 0xe3, 0xc7, 0xdb, 0x6f, 0x73, 0x57, 0x4b, 0x1f, 0x3, 0x27, 0x3b}, {0x0, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb, 0xcd, 0xd0, 0xf7, 0xea, 0xb9, 0xa4, 0x83, 0x9e, 0x25, 0x38, 0x1f, 0x2, 0x51, 0x4c, 0x6b, 0x76, 0x87, 0x9a, 0xbd, 0xa0, 0xf3, 0xee, 0xc9, 0xd4, 0x6f, 0x72, 0x55, 0x48, 0x1b, 0x6, 0x21, 0x3c, 0x4a, 0x57, 0x70, 0x6d, 0x3e, 0x23, 0x4, 0x19, 0xa2, 0xbf, 0x98, 0x85, 0xd6, 0xcb, 0xec, 0xf1, 0x13, 0xe, 0x29, 0x34, 0x67, 0x7a, 0x5d, 0x40, 0xfb, 0xe6, 0xc1, 0xdc, 0x8f, 0x92, 0xb5, 0xa8, 0xde, 0xc3, 0xe4, 0xf9, 0xaa, 0xb7, 0x90, 0x8d, 0x36, 0x2b, 0xc, 0x11, 0x42, 0x5f, 0x78, 0x65, 0x94, 0x89, 0xae, 0xb3, 0xe0, 0xfd, 0xda, 0xc7, 0x7c, 0x61, 0x46, 0x5b, 0x8, 0x15, 0x32, 0x2f, 0x59, 0x44, 0x63, 0x7e, 0x2d, 0x30, 0x17, 0xa, 0xb1, 0xac, 0x8b, 0x96, 0xc5, 0xd8, 0xff, 0xe2, 0x26, 0x3b, 0x1c, 0x1, 0x52, 0x4f, 0x68, 0x75, 0xce, 0xd3, 0xf4, 0xe9, 0xba, 0xa7, 0x80, 0x9d, 0xeb, 0xf6, 0xd1, 0xcc, 0x9f, 0x82, 0xa5, 0xb8, 0x3, 0x1e, 0x39, 0x24, 0x77, 0x6a, 0x4d, 0x50, 0xa1, 0xbc, 0x9b, 0x86, 0xd5, 0xc8, 0xef, 0xf2, 0x49, 0x54, 0x73, 0x6e, 0x3d, 0x20, 0x7, 0x1a, 0x6c, 0x71, 0x56, 0x4b, 0x18, 0x5, 0x22, 0x3f, 0x84, 0x99, 0xbe, 0xa3, 0xf0, 0xed, 0xca, 0xd7, 0x35, 0x28, 0xf, 0x12, 0x41, 0x5c, 0x7b, 0x66, 0xdd, 0xc0, 0xe7, 0xfa, 0xa9, 0xb4, 0x93, 0x8e, 0xf8, 0xe5, 0xc2, 0xdf, 0x8c, 0x91, 0xb6, 0xab, 0x10, 0xd, 0x2a, 0x37, 0x64, 0x79, 0x5e, 0x43, 0xb2, 0xaf, 0x88, 0x95, 0xc6, 0xdb, 0xfc, 0xe1, 0x5a, 0x47, 0x60, 0x7d, 0x2e, 0x33, 0x14, 0x9, 0x7f, 0x62, 0x45, 0x58, 0xb, 0x16, 0x31, 0x2c, 0x97, 0x8a, 0xad, 0xb0, 0xe3, 0xfe, 0xd9, 0xc4}, {0x0, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a, 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa, 0xfd, 0xe3, 0xc1, 0xdf, 0x85, 0x9b, 0xb9, 0xa7, 0xd, 0x13, 0x31, 0x2f, 0x75, 0x6b, 0x49, 0x57, 0xe7, 0xf9, 0xdb, 0xc5, 0x9f, 0x81, 0xa3, 0xbd, 0x17, 0x9, 0x2b, 0x35, 0x6f, 0x71, 0x53, 0x4d, 0x1a, 0x4, 0x26, 0x38, 0x62, 0x7c, 0x5e, 0x40, 0xea, 0xf4, 0xd6, 0xc8, 0x92, 0x8c, 0xae, 0xb0, 0xd3, 0xcd, 0xef, 0xf1, 0xab, 0xb5, 0x97, 0x89, 0x23, 0x3d, 0x1f, 0x1, 0x5b, 0x45, 0x67, 0x79, 0x2e, 0x30, 0x12, 0xc, 0x56, 0x48, 0x6a, 0x74, 0xde, 0xc0, 0xe2, 0xfc, 0xa6, 0xb8, 0x9a, 0x84, 0x34, 0x2a, 0x8, 0x16, 0x4c, 0x52, 0x70, 0x6e, 0xc4, 0xda, 0xf8, 0xe6, 0xbc, 0xa2, 0x80, 0x9e, 0xc9, 0xd7, 0xf5, 0xeb, 0xb1, 0xaf, 0x8d, 0x93, 0x39, 0x27, 0x5, 0x1b, 0x41, 0x5f, 0x7d, 0x63, 0xbb, 0xa5, 0x87, 0x99, 0xc3, 0xdd, 0xff, 0xe1, 0x4b, 0x55, 0x77, 0x69, 0x33, 0x2d, 0xf, 0x11, 0x46, 0x58, 0x7a, 0x64, 0x3e, 0x20, 0x2, 0x1c, 0xb6, 0xa8, 0x8a, 0x94, 0xce, 0xd0, 0xf2, 0xec, 0x5c, 0x42, 0x60, 0x7e, 0x24, 0x3a, 0x18, 0x6, 0xac, 0xb2, 0x90, 0x8e, 0xd4, 0xca, 0xe8, 0xf6, 0xa1, 0xbf, 0x9d, 0x83, 0xd9, 0xc7, 0xe5, 0xfb, 0x51, 0x4f, 0x6d, 0x73, 0x29, 0x37, 0x15, 0xb, 0x68, 0x76, 0x54, 0x4a, 0x10, 0xe, 0x2c, 0x32, 0x98, 0x86, 0xa4, 0xba, 0xe0, 0xfe, 0xdc, 0xc2, 0x95, 0x8b, 0xa9, 0xb7, 0xed, 0xf3, 0xd1, 0xcf, 0x65, 0x7b, 0x59, 0x47, 0x1d, 0x3, 0x21, 0x3f, 0x8f, 0x91, 0xb3, 0xad, 0xf7, 0xe9, 0xcb, 0xd5, 0x7f, 0x61, 0x43, 0x5d, 0x7, 0x19, 0x3b, 0x25, 0x72, 0x6c, 0x4e, 0x50, 0xa, 0x14, 0x36, 0x28, 0x82, 0x9c, 0xbe, 0xa0, 0xfa, 0xe4, 0xc6, 0xd8}, {0x0, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d, 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5, 0xed, 0xf2, 0xd3, 0xcc, 
0x91, 0x8e, 0xaf, 0xb0, 0x15, 0xa, 0x2b, 0x34, 0x69, 0x76, 0x57, 0x48, 0xc7, 0xd8, 0xf9, 0xe6, 0xbb, 0xa4, 0x85, 0x9a, 0x3f, 0x20, 0x1, 0x1e, 0x43, 0x5c, 0x7d, 0x62, 0x2a, 0x35, 0x14, 0xb, 0x56, 0x49, 0x68, 0x77, 0xd2, 0xcd, 0xec, 0xf3, 0xae, 0xb1, 0x90, 0x8f, 0x93, 0x8c, 0xad, 0xb2, 0xef, 0xf0, 0xd1, 0xce, 0x6b, 0x74, 0x55, 0x4a, 0x17, 0x8, 0x29, 0x36, 0x7e, 0x61, 0x40, 0x5f, 0x2, 0x1d, 0x3c, 0x23, 0x86, 0x99, 0xb8, 0xa7, 0xfa, 0xe5, 0xc4, 0xdb, 0x54, 0x4b, 0x6a, 0x75, 0x28, 0x37, 0x16, 0x9, 0xac, 0xb3, 0x92, 0x8d, 0xd0, 0xcf, 0xee, 0xf1, 0xb9, 0xa6, 0x87, 0x98, 0xc5, 0xda, 0xfb, 0xe4, 0x41, 0x5e, 0x7f, 0x60, 0x3d, 0x22, 0x3, 0x1c, 0x3b, 0x24, 0x5, 0x1a, 0x47, 0x58, 0x79, 0x66, 0xc3, 0xdc, 0xfd, 0xe2, 0xbf, 0xa0, 0x81, 0x9e, 0xd6, 0xc9, 0xe8, 0xf7, 0xaa, 0xb5, 0x94, 0x8b, 0x2e, 0x31, 0x10, 0xf, 0x52, 0x4d, 0x6c, 0x73, 0xfc, 0xe3, 0xc2, 0xdd, 0x80, 0x9f, 0xbe, 0xa1, 0x4, 0x1b, 0x3a, 0x25, 0x78, 0x67, 0x46, 0x59, 0x11, 0xe, 0x2f, 0x30, 0x6d, 0x72, 0x53, 0x4c, 0xe9, 0xf6, 0xd7, 0xc8, 0x95, 0x8a, 0xab, 0xb4, 0xa8, 0xb7, 0x96, 0x89, 0xd4, 0xcb, 0xea, 0xf5, 0x50, 0x4f, 0x6e, 0x71, 0x2c, 0x33, 0x12, 0xd, 0x45, 0x5a, 0x7b, 0x64, 0x39, 0x26, 0x7, 0x18, 0xbd, 0xa2, 0x83, 0x9c, 0xc1, 0xde, 0xff, 0xe0, 0x6f, 0x70, 0x51, 0x4e, 0x13, 0xc, 0x2d, 0x32, 0x97, 0x88, 0xa9, 0xb6, 0xeb, 0xf4, 0xd5, 0xca, 0x82, 0x9d, 0xbc, 0xa3, 0xfe, 0xe1, 0xc0, 0xdf, 0x7a, 0x65, 0x44, 0x5b, 0x6, 0x19, 0x38, 0x27}, {0x0, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x1d, 0x3d, 0x5d, 0x7d, 0x9d, 0xbd, 0xdd, 0xfd, 0x3a, 0x1a, 0x7a, 0x5a, 0xba, 0x9a, 0xfa, 0xda, 0x27, 0x7, 0x67, 0x47, 0xa7, 0x87, 0xe7, 0xc7, 0x74, 0x54, 0x34, 0x14, 0xf4, 0xd4, 0xb4, 0x94, 0x69, 0x49, 0x29, 0x9, 0xe9, 0xc9, 0xa9, 0x89, 0x4e, 0x6e, 0xe, 0x2e, 0xce, 0xee, 0x8e, 0xae, 0x53, 0x73, 0x13, 0x33, 0xd3, 0xf3, 0x93, 0xb3, 0xe8, 0xc8, 0xa8, 0x88, 0x68, 0x48, 0x28, 0x8, 0xf5, 0xd5, 0xb5, 0x95, 0x75, 0x55, 0x35, 0x15, 0xd2, 0xf2, 0x92, 0xb2, 0x52, 0x72, 0x12, 0x32, 0xcf, 0xef, 0x8f, 0xaf, 0x4f, 0x6f, 0xf, 0x2f, 0x9c, 0xbc, 0xdc, 0xfc, 0x1c, 0x3c, 0x5c, 0x7c, 0x81, 0xa1, 0xc1, 0xe1, 0x1, 0x21, 0x41, 0x61, 0xa6, 0x86, 0xe6, 0xc6, 0x26, 0x6, 0x66, 0x46, 0xbb, 0x9b, 0xfb, 0xdb, 0x3b, 0x1b, 0x7b, 0x5b, 0xcd, 0xed, 0x8d, 0xad, 0x4d, 0x6d, 0xd, 0x2d, 0xd0, 0xf0, 0x90, 0xb0, 0x50, 0x70, 0x10, 0x30, 0xf7, 0xd7, 0xb7, 0x97, 0x77, 0x57, 0x37, 0x17, 0xea, 0xca, 0xaa, 0x8a, 0x6a, 0x4a, 0x2a, 0xa, 0xb9, 0x99, 0xf9, 0xd9, 0x39, 0x19, 0x79, 0x59, 0xa4, 0x84, 0xe4, 0xc4, 0x24, 0x4, 0x64, 0x44, 0x83, 0xa3, 0xc3, 0xe3, 0x3, 0x23, 0x43, 0x63, 0x9e, 0xbe, 0xde, 0xfe, 0x1e, 0x3e, 0x5e, 0x7e, 0x25, 0x5, 0x65, 0x45, 0xa5, 0x85, 0xe5, 0xc5, 0x38, 0x18, 0x78, 0x58, 0xb8, 0x98, 0xf8, 0xd8, 0x1f, 0x3f, 0x5f, 0x7f, 0x9f, 0xbf, 0xdf, 0xff, 0x2, 0x22, 0x42, 0x62, 0x82, 0xa2, 0xc2, 0xe2, 0x51, 0x71, 0x11, 0x31, 0xd1, 0xf1, 0x91, 0xb1, 0x4c, 0x6c, 0xc, 0x2c, 0xcc, 0xec, 0x8c, 0xac, 0x6b, 0x4b, 0x2b, 0xb, 0xeb, 0xcb, 0xab, 0x8b, 0x76, 0x56, 0x36, 0x16, 0xf6, 0xd6, 0xb6, 0x96}, {0x0, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7, 0x15, 0x34, 0x57, 0x76, 0x91, 0xb0, 0xd3, 0xf2, 0x2a, 0xb, 0x68, 0x49, 0xae, 0x8f, 0xec, 0xcd, 0x3f, 0x1e, 0x7d, 0x5c, 0xbb, 0x9a, 0xf9, 0xd8, 0x54, 0x75, 0x16, 0x37, 0xd0, 0xf1, 0x92, 0xb3, 0x41, 0x60, 0x3, 0x22, 0xc5, 0xe4, 0x87, 0xa6, 0x7e, 0x5f, 0x3c, 0x1d, 0xfa, 0xdb, 0xb8, 0x99, 0x6b, 0x4a, 0x29, 0x8, 0xef, 0xce, 0xad, 0x8c, 0xa8, 0x89, 0xea, 0xcb, 0x2c, 0xd, 0x6e, 0x4f, 0xbd, 0x9c, 0xff, 0xde, 0x39, 0x18, 0x7b, 0x5a, 0x82, 0xa3, 0xc0, 0xe1, 0x6, 0x27, 0x44, 0x65, 0x97, 0xb6, 0xd5, 0xf4, 0x13, 0x32, 0x51, 0x70, 0xfc, 0xdd, 0xbe, 0x9f, 0x78, 0x59, 0x3a, 0x1b, 0xe9, 0xc8, 
0xab, 0x8a, 0x6d, 0x4c, 0x2f, 0xe, 0xd6, 0xf7, 0x94, 0xb5, 0x52, 0x73, 0x10, 0x31, 0xc3, 0xe2, 0x81, 0xa0, 0x47, 0x66, 0x5, 0x24, 0x4d, 0x6c, 0xf, 0x2e, 0xc9, 0xe8, 0x8b, 0xaa, 0x58, 0x79, 0x1a, 0x3b, 0xdc, 0xfd, 0x9e, 0xbf, 0x67, 0x46, 0x25, 0x4, 0xe3, 0xc2, 0xa1, 0x80, 0x72, 0x53, 0x30, 0x11, 0xf6, 0xd7, 0xb4, 0x95, 0x19, 0x38, 0x5b, 0x7a, 0x9d, 0xbc, 0xdf, 0xfe, 0xc, 0x2d, 0x4e, 0x6f, 0x88, 0xa9, 0xca, 0xeb, 0x33, 0x12, 0x71, 0x50, 0xb7, 0x96, 0xf5, 0xd4, 0x26, 0x7, 0x64, 0x45, 0xa2, 0x83, 0xe0, 0xc1, 0xe5, 0xc4, 0xa7, 0x86, 0x61, 0x40, 0x23, 0x2, 0xf0, 0xd1, 0xb2, 0x93, 0x74, 0x55, 0x36, 0x17, 0xcf, 0xee, 0x8d, 0xac, 0x4b, 0x6a, 0x9, 0x28, 0xda, 0xfb, 0x98, 0xb9, 0x5e, 0x7f, 0x1c, 0x3d, 0xb1, 0x90, 0xf3, 0xd2, 0x35, 0x14, 0x77, 0x56, 0xa4, 0x85, 0xe6, 0xc7, 0x20, 0x1, 0x62, 0x43, 0x9b, 0xba, 0xd9, 0xf8, 0x1f, 0x3e, 0x5d, 0x7c, 0x8e, 0xaf, 0xcc, 0xed, 0xa, 0x2b, 0x48, 0x69}, {0x0, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee, 0xd, 0x2f, 0x49, 0x6b, 0x85, 0xa7, 0xc1, 0xe3, 0x1a, 0x38, 0x5e, 0x7c, 0x92, 0xb0, 0xd6, 0xf4, 0x17, 0x35, 0x53, 0x71, 0x9f, 0xbd, 0xdb, 0xf9, 0x34, 0x16, 0x70, 0x52, 0xbc, 0x9e, 0xf8, 0xda, 0x39, 0x1b, 0x7d, 0x5f, 0xb1, 0x93, 0xf5, 0xd7, 0x2e, 0xc, 0x6a, 0x48, 0xa6, 0x84, 0xe2, 0xc0, 0x23, 0x1, 0x67, 0x45, 0xab, 0x89, 0xef, 0xcd, 0x68, 0x4a, 0x2c, 0xe, 0xe0, 0xc2, 0xa4, 0x86, 0x65, 0x47, 0x21, 0x3, 0xed, 0xcf, 0xa9, 0x8b, 0x72, 0x50, 0x36, 0x14, 0xfa, 0xd8, 0xbe, 0x9c, 0x7f, 0x5d, 0x3b, 0x19, 0xf7, 0xd5, 0xb3, 0x91, 0x5c, 0x7e, 0x18, 0x3a, 0xd4, 0xf6, 0x90, 0xb2, 0x51, 0x73, 0x15, 0x37, 0xd9, 0xfb, 0x9d, 0xbf, 0x46, 0x64, 0x2, 0x20, 0xce, 0xec, 0x8a, 0xa8, 0x4b, 0x69, 0xf, 0x2d, 0xc3, 0xe1, 0x87, 0xa5, 0xd0, 0xf2, 0x94, 0xb6, 0x58, 0x7a, 0x1c, 0x3e, 0xdd, 0xff, 0x99, 0xbb, 0x55, 0x77, 0x11, 0x33, 0xca, 0xe8, 0x8e, 0xac, 0x42, 0x60, 0x6, 0x24, 0xc7, 0xe5, 0x83, 0xa1, 0x4f, 0x6d, 0xb, 0x29, 0xe4, 0xc6, 0xa0, 0x82, 0x6c, 0x4e, 0x28, 0xa, 0xe9, 0xcb, 0xad, 0x8f, 0x61, 0x43, 0x25, 0x7, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10, 0xf3, 0xd1, 0xb7, 0x95, 0x7b, 0x59, 0x3f, 0x1d, 0xb8, 0x9a, 0xfc, 0xde, 0x30, 0x12, 0x74, 0x56, 0xb5, 0x97, 0xf1, 0xd3, 0x3d, 0x1f, 0x79, 0x5b, 0xa2, 0x80, 0xe6, 0xc4, 0x2a, 0x8, 0x6e, 0x4c, 0xaf, 0x8d, 0xeb, 0xc9, 0x27, 0x5, 0x63, 0x41, 0x8c, 0xae, 0xc8, 0xea, 0x4, 0x26, 0x40, 0x62, 0x81, 0xa3, 0xc5, 0xe7, 0x9, 0x2b, 0x4d, 0x6f, 0x96, 0xb4, 0xd2, 0xf0, 0x1e, 0x3c, 0x5a, 0x78, 0x9b, 0xb9, 0xdf, 0xfd, 0x13, 0x31, 0x57, 0x75}, {0x0, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9, 0x5, 0x26, 0x43, 0x60, 0x89, 0xaa, 0xcf, 0xec, 0xa, 0x29, 0x4c, 0x6f, 0x86, 0xa5, 0xc0, 0xe3, 0xf, 0x2c, 0x49, 0x6a, 0x83, 0xa0, 0xc5, 0xe6, 0x14, 0x37, 0x52, 0x71, 0x98, 0xbb, 0xde, 0xfd, 0x11, 0x32, 0x57, 0x74, 0x9d, 0xbe, 0xdb, 0xf8, 0x1e, 0x3d, 0x58, 0x7b, 0x92, 0xb1, 0xd4, 0xf7, 0x1b, 0x38, 0x5d, 0x7e, 0x97, 0xb4, 0xd1, 0xf2, 0x28, 0xb, 0x6e, 0x4d, 0xa4, 0x87, 0xe2, 0xc1, 0x2d, 0xe, 0x6b, 0x48, 0xa1, 0x82, 0xe7, 0xc4, 0x22, 0x1, 0x64, 0x47, 0xae, 0x8d, 0xe8, 0xcb, 0x27, 0x4, 0x61, 0x42, 0xab, 0x88, 0xed, 0xce, 0x3c, 0x1f, 0x7a, 0x59, 0xb0, 0x93, 0xf6, 0xd5, 0x39, 0x1a, 0x7f, 0x5c, 0xb5, 0x96, 0xf3, 0xd0, 0x36, 0x15, 0x70, 0x53, 0xba, 0x99, 0xfc, 0xdf, 0x33, 0x10, 0x75, 0x56, 0xbf, 0x9c, 0xf9, 0xda, 0x50, 0x73, 0x16, 0x35, 0xdc, 0xff, 0x9a, 0xb9, 0x55, 0x76, 0x13, 0x30, 0xd9, 0xfa, 0x9f, 0xbc, 0x5a, 0x79, 0x1c, 0x3f, 0xd6, 0xf5, 0x90, 0xb3, 0x5f, 0x7c, 0x19, 0x3a, 0xd3, 0xf0, 0x95, 0xb6, 0x44, 0x67, 0x2, 0x21, 0xc8, 0xeb, 0x8e, 0xad, 0x41, 0x62, 0x7, 0x24, 0xcd, 0xee, 0x8b, 0xa8, 0x4e, 0x6d, 0x8, 0x2b, 0xc2, 0xe1, 0x84, 0xa7, 0x4b, 0x68, 0xd, 0x2e, 0xc7, 0xe4, 0x81, 0xa2, 
0x78, 0x5b, 0x3e, 0x1d, 0xf4, 0xd7, 0xb2, 0x91, 0x7d, 0x5e, 0x3b, 0x18, 0xf1, 0xd2, 0xb7, 0x94, 0x72, 0x51, 0x34, 0x17, 0xfe, 0xdd, 0xb8, 0x9b, 0x77, 0x54, 0x31, 0x12, 0xfb, 0xd8, 0xbd, 0x9e, 0x6c, 0x4f, 0x2a, 0x9, 0xe0, 0xc3, 0xa6, 0x85, 0x69, 0x4a, 0x2f, 0xc, 0xe5, 0xc6, 0xa3, 0x80, 0x66, 0x45, 0x20, 0x3, 0xea, 0xc9, 0xac, 0x8f, 0x63, 0x40, 0x25, 0x6, 0xef, 0xcc, 0xa9, 0x8a}, {0x0, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc, 0x3d, 0x19, 0x75, 0x51, 0xad, 0x89, 0xe5, 0xc1, 0x7a, 0x5e, 0x32, 0x16, 0xea, 0xce, 0xa2, 0x86, 0x47, 0x63, 0xf, 0x2b, 0xd7, 0xf3, 0x9f, 0xbb, 0xf4, 0xd0, 0xbc, 0x98, 0x64, 0x40, 0x2c, 0x8, 0xc9, 0xed, 0x81, 0xa5, 0x59, 0x7d, 0x11, 0x35, 0x8e, 0xaa, 0xc6, 0xe2, 0x1e, 0x3a, 0x56, 0x72, 0xb3, 0x97, 0xfb, 0xdf, 0x23, 0x7, 0x6b, 0x4f, 0xf5, 0xd1, 0xbd, 0x99, 0x65, 0x41, 0x2d, 0x9, 0xc8, 0xec, 0x80, 0xa4, 0x58, 0x7c, 0x10, 0x34, 0x8f, 0xab, 0xc7, 0xe3, 0x1f, 0x3b, 0x57, 0x73, 0xb2, 0x96, 0xfa, 0xde, 0x22, 0x6, 0x6a, 0x4e, 0x1, 0x25, 0x49, 0x6d, 0x91, 0xb5, 0xd9, 0xfd, 0x3c, 0x18, 0x74, 0x50, 0xac, 0x88, 0xe4, 0xc0, 0x7b, 0x5f, 0x33, 0x17, 0xeb, 0xcf, 0xa3, 0x87, 0x46, 0x62, 0xe, 0x2a, 0xd6, 0xf2, 0x9e, 0xba, 0xf7, 0xd3, 0xbf, 0x9b, 0x67, 0x43, 0x2f, 0xb, 0xca, 0xee, 0x82, 0xa6, 0x5a, 0x7e, 0x12, 0x36, 0x8d, 0xa9, 0xc5, 0xe1, 0x1d, 0x39, 0x55, 0x71, 0xb0, 0x94, 0xf8, 0xdc, 0x20, 0x4, 0x68, 0x4c, 0x3, 0x27, 0x4b, 0x6f, 0x93, 0xb7, 0xdb, 0xff, 0x3e, 0x1a, 0x76, 0x52, 0xae, 0x8a, 0xe6, 0xc2, 0x79, 0x5d, 0x31, 0x15, 0xe9, 0xcd, 0xa1, 0x85, 0x44, 0x60, 0xc, 0x28, 0xd4, 0xf0, 0x9c, 0xb8, 0x2, 0x26, 0x4a, 0x6e, 0x92, 0xb6, 0xda, 0xfe, 0x3f, 0x1b, 0x77, 0x53, 0xaf, 0x8b, 0xe7, 0xc3, 0x78, 0x5c, 0x30, 0x14, 0xe8, 0xcc, 0xa0, 0x84, 0x45, 0x61, 0xd, 0x29, 0xd5, 0xf1, 0x9d, 0xb9, 0xf6, 0xd2, 0xbe, 0x9a, 0x66, 0x42, 0x2e, 0xa, 0xcb, 0xef, 0x83, 0xa7, 0x5b, 0x7f, 0x13, 0x37, 0x8c, 0xa8, 0xc4, 0xe0, 0x1c, 0x38, 0x54, 0x70, 0xb1, 0x95, 0xf9, 0xdd, 0x21, 0x5, 0x69, 0x4d}, {0x0, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb, 0x35, 0x10, 0x7f, 0x5a, 0xa1, 0x84, 0xeb, 0xce, 0x6a, 0x4f, 0x20, 0x5, 0xfe, 0xdb, 0xb4, 0x91, 0x5f, 0x7a, 0x15, 0x30, 0xcb, 0xee, 0x81, 0xa4, 0xd4, 0xf1, 0x9e, 0xbb, 0x40, 0x65, 0xa, 0x2f, 0xe1, 0xc4, 0xab, 0x8e, 0x75, 0x50, 0x3f, 0x1a, 0xbe, 0x9b, 0xf4, 0xd1, 0x2a, 0xf, 0x60, 0x45, 0x8b, 0xae, 0xc1, 0xe4, 0x1f, 0x3a, 0x55, 0x70, 0xb5, 0x90, 0xff, 0xda, 0x21, 0x4, 0x6b, 0x4e, 0x80, 0xa5, 0xca, 0xef, 0x14, 0x31, 0x5e, 0x7b, 0xdf, 0xfa, 0x95, 0xb0, 0x4b, 0x6e, 0x1, 0x24, 0xea, 0xcf, 0xa0, 0x85, 0x7e, 0x5b, 0x34, 0x11, 0x61, 0x44, 0x2b, 0xe, 0xf5, 0xd0, 0xbf, 0x9a, 0x54, 0x71, 0x1e, 0x3b, 0xc0, 0xe5, 0x8a, 0xaf, 0xb, 0x2e, 0x41, 0x64, 0x9f, 0xba, 0xd5, 0xf0, 0x3e, 0x1b, 0x74, 0x51, 0xaa, 0x8f, 0xe0, 0xc5, 0x77, 0x52, 0x3d, 0x18, 0xe3, 0xc6, 0xa9, 0x8c, 0x42, 0x67, 0x8, 0x2d, 0xd6, 0xf3, 0x9c, 0xb9, 0x1d, 0x38, 0x57, 0x72, 0x89, 0xac, 0xc3, 0xe6, 0x28, 0xd, 0x62, 0x47, 0xbc, 0x99, 0xf6, 0xd3, 0xa3, 0x86, 0xe9, 0xcc, 0x37, 0x12, 0x7d, 0x58, 0x96, 0xb3, 0xdc, 0xf9, 0x2, 0x27, 0x48, 0x6d, 0xc9, 0xec, 0x83, 0xa6, 0x5d, 0x78, 0x17, 0x32, 0xfc, 0xd9, 0xb6, 0x93, 0x68, 0x4d, 0x22, 0x7, 0xc2, 0xe7, 0x88, 0xad, 0x56, 0x73, 0x1c, 0x39, 0xf7, 0xd2, 0xbd, 0x98, 0x63, 0x46, 0x29, 0xc, 0xa8, 0x8d, 0xe2, 0xc7, 0x3c, 0x19, 0x76, 0x53, 0x9d, 0xb8, 0xd7, 0xf2, 0x9, 0x2c, 0x43, 0x66, 0x16, 0x33, 0x5c, 0x79, 0x82, 0xa7, 0xc8, 0xed, 0x23, 0x6, 0x69, 0x4c, 0xb7, 0x92, 0xfd, 0xd8, 0x7c, 0x59, 0x36, 0x13, 0xe8, 0xcd, 0xa2, 0x87, 0x49, 0x6c, 0x3, 0x26, 0xdd, 0xf8, 0x97, 0xb2}, {0x0, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2, 0x2d, 0xb, 0x61, 0x47, 0xb5, 0x93, 0xf9, 0xdf, 0x5a, 0x7c, 0x16, 0x30, 0xc2, 
0xe4, 0x8e, 0xa8, 0x77, 0x51, 0x3b, 0x1d, 0xef, 0xc9, 0xa3, 0x85, 0xb4, 0x92, 0xf8, 0xde, 0x2c, 0xa, 0x60, 0x46, 0x99, 0xbf, 0xd5, 0xf3, 0x1, 0x27, 0x4d, 0x6b, 0xee, 0xc8, 0xa2, 0x84, 0x76, 0x50, 0x3a, 0x1c, 0xc3, 0xe5, 0x8f, 0xa9, 0x5b, 0x7d, 0x17, 0x31, 0x75, 0x53, 0x39, 0x1f, 0xed, 0xcb, 0xa1, 0x87, 0x58, 0x7e, 0x14, 0x32, 0xc0, 0xe6, 0x8c, 0xaa, 0x2f, 0x9, 0x63, 0x45, 0xb7, 0x91, 0xfb, 0xdd, 0x2, 0x24, 0x4e, 0x68, 0x9a, 0xbc, 0xd6, 0xf0, 0xc1, 0xe7, 0x8d, 0xab, 0x59, 0x7f, 0x15, 0x33, 0xec, 0xca, 0xa0, 0x86, 0x74, 0x52, 0x38, 0x1e, 0x9b, 0xbd, 0xd7, 0xf1, 0x3, 0x25, 0x4f, 0x69, 0xb6, 0x90, 0xfa, 0xdc, 0x2e, 0x8, 0x62, 0x44, 0xea, 0xcc, 0xa6, 0x80, 0x72, 0x54, 0x3e, 0x18, 0xc7, 0xe1, 0x8b, 0xad, 0x5f, 0x79, 0x13, 0x35, 0xb0, 0x96, 0xfc, 0xda, 0x28, 0xe, 0x64, 0x42, 0x9d, 0xbb, 0xd1, 0xf7, 0x5, 0x23, 0x49, 0x6f, 0x5e, 0x78, 0x12, 0x34, 0xc6, 0xe0, 0x8a, 0xac, 0x73, 0x55, 0x3f, 0x19, 0xeb, 0xcd, 0xa7, 0x81, 0x4, 0x22, 0x48, 0x6e, 0x9c, 0xba, 0xd0, 0xf6, 0x29, 0xf, 0x65, 0x43, 0xb1, 0x97, 0xfd, 0xdb, 0x9f, 0xb9, 0xd3, 0xf5, 0x7, 0x21, 0x4b, 0x6d, 0xb2, 0x94, 0xfe, 0xd8, 0x2a, 0xc, 0x66, 0x40, 0xc5, 0xe3, 0x89, 0xaf, 0x5d, 0x7b, 0x11, 0x37, 0xe8, 0xce, 0xa4, 0x82, 0x70, 0x56, 0x3c, 0x1a, 0x2b, 0xd, 0x67, 0x41, 0xb3, 0x95, 0xff, 0xd9, 0x6, 0x20, 0x4a, 0x6c, 0x9e, 0xb8, 0xd2, 0xf4, 0x71, 0x57, 0x3d, 0x1b, 0xe9, 0xcf, 0xa5, 0x83, 0x5c, 0x7a, 0x10, 0x36, 0xc4, 0xe2, 0x88, 0xae}, {0x0, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x25, 0x2, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0, 0x4a, 0x6d, 0x4, 0x23, 0xd6, 0xf1, 0x98, 0xbf, 0x6f, 0x48, 0x21, 0x6, 0xf3, 0xd4, 0xbd, 0x9a, 0x94, 0xb3, 0xda, 0xfd, 0x8, 0x2f, 0x46, 0x61, 0xb1, 0x96, 0xff, 0xd8, 0x2d, 0xa, 0x63, 0x44, 0xde, 0xf9, 0x90, 0xb7, 0x42, 0x65, 0xc, 0x2b, 0xfb, 0xdc, 0xb5, 0x92, 0x67, 0x40, 0x29, 0xe, 0x35, 0x12, 0x7b, 0x5c, 0xa9, 0x8e, 0xe7, 0xc0, 0x10, 0x37, 0x5e, 0x79, 0x8c, 0xab, 0xc2, 0xe5, 0x7f, 0x58, 0x31, 0x16, 0xe3, 0xc4, 0xad, 0x8a, 0x5a, 0x7d, 0x14, 0x33, 0xc6, 0xe1, 0x88, 0xaf, 0xa1, 0x86, 0xef, 0xc8, 0x3d, 0x1a, 0x73, 0x54, 0x84, 0xa3, 0xca, 0xed, 0x18, 0x3f, 0x56, 0x71, 0xeb, 0xcc, 0xa5, 0x82, 0x77, 0x50, 0x39, 0x1e, 0xce, 0xe9, 0x80, 0xa7, 0x52, 0x75, 0x1c, 0x3b, 0x6a, 0x4d, 0x24, 0x3, 0xf6, 0xd1, 0xb8, 0x9f, 0x4f, 0x68, 0x1, 0x26, 0xd3, 0xf4, 0x9d, 0xba, 0x20, 0x7, 0x6e, 0x49, 0xbc, 0x9b, 0xf2, 0xd5, 0x5, 0x22, 0x4b, 0x6c, 0x99, 0xbe, 0xd7, 0xf0, 0xfe, 0xd9, 0xb0, 0x97, 0x62, 0x45, 0x2c, 0xb, 0xdb, 0xfc, 0x95, 0xb2, 0x47, 0x60, 0x9, 0x2e, 0xb4, 0x93, 0xfa, 0xdd, 0x28, 0xf, 0x66, 0x41, 0x91, 0xb6, 0xdf, 0xf8, 0xd, 0x2a, 0x43, 0x64, 0x5f, 0x78, 0x11, 0x36, 0xc3, 0xe4, 0x8d, 0xaa, 0x7a, 0x5d, 0x34, 0x13, 0xe6, 0xc1, 0xa8, 0x8f, 0x15, 0x32, 0x5b, 0x7c, 0x89, 0xae, 0xc7, 0xe0, 0x30, 0x17, 0x7e, 0x59, 0xac, 0x8b, 0xe2, 0xc5, 0xcb, 0xec, 0x85, 0xa2, 0x57, 0x70, 0x19, 0x3e, 0xee, 0xc9, 0xa0, 0x87, 0x72, 0x55, 0x3c, 0x1b, 0x81, 0xa6, 0xcf, 0xe8, 0x1d, 0x3a, 0x53, 0x74, 0xa4, 0x83, 0xea, 0xcd, 0x38, 0x1f, 0x76, 0x51}, {0x0, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8, 0x5d, 0x75, 0xd, 0x25, 0xfd, 0xd5, 0xad, 0x85, 0xba, 0x92, 0xea, 0xc2, 0x1a, 0x32, 0x4a, 0x62, 0xe7, 0xcf, 0xb7, 0x9f, 0x47, 0x6f, 0x17, 0x3f, 0x69, 0x41, 0x39, 0x11, 0xc9, 0xe1, 0x99, 0xb1, 0x34, 0x1c, 0x64, 0x4c, 0x94, 0xbc, 0xc4, 0xec, 0xd3, 0xfb, 0x83, 0xab, 0x73, 0x5b, 0x23, 0xb, 0x8e, 0xa6, 0xde, 0xf6, 0x2e, 0x6, 0x7e, 0x56, 0xd2, 0xfa, 0x82, 0xaa, 0x72, 0x5a, 0x22, 0xa, 0x8f, 0xa7, 0xdf, 0xf7, 0x2f, 0x7, 0x7f, 0x57, 0x68, 0x40, 0x38, 0x10, 0xc8, 0xe0, 0x98, 0xb0, 0x35, 0x1d, 0x65, 0x4d, 0x95, 0xbd, 0xc5, 0xed, 0xbb, 0x93, 0xeb, 0xc3, 0x1b, 0x33, 0x4b, 0x63, 0xe6, 0xce, 
0xb6, 0x9e, 0x46, 0x6e, 0x16, 0x3e, 0x1, 0x29, 0x51, 0x79, 0xa1, 0x89, 0xf1, 0xd9, 0x5c, 0x74, 0xc, 0x24, 0xfc, 0xd4, 0xac, 0x84, 0xb9, 0x91, 0xe9, 0xc1, 0x19, 0x31, 0x49, 0x61, 0xe4, 0xcc, 0xb4, 0x9c, 0x44, 0x6c, 0x14, 0x3c, 0x3, 0x2b, 0x53, 0x7b, 0xa3, 0x8b, 0xf3, 0xdb, 0x5e, 0x76, 0xe, 0x26, 0xfe, 0xd6, 0xae, 0x86, 0xd0, 0xf8, 0x80, 0xa8, 0x70, 0x58, 0x20, 0x8, 0x8d, 0xa5, 0xdd, 0xf5, 0x2d, 0x5, 0x7d, 0x55, 0x6a, 0x42, 0x3a, 0x12, 0xca, 0xe2, 0x9a, 0xb2, 0x37, 0x1f, 0x67, 0x4f, 0x97, 0xbf, 0xc7, 0xef, 0x6b, 0x43, 0x3b, 0x13, 0xcb, 0xe3, 0x9b, 0xb3, 0x36, 0x1e, 0x66, 0x4e, 0x96, 0xbe, 0xc6, 0xee, 0xd1, 0xf9, 0x81, 0xa9, 0x71, 0x59, 0x21, 0x9, 0x8c, 0xa4, 0xdc, 0xf4, 0x2c, 0x4, 0x7c, 0x54, 0x2, 0x2a, 0x52, 0x7a, 0xa2, 0x8a, 0xf2, 0xda, 0x5f, 0x77, 0xf, 0x27, 0xff, 0xd7, 0xaf, 0x87, 0xb8, 0x90, 0xe8, 0xc0, 0x18, 0x30, 0x48, 0x60, 0xe5, 0xcd, 0xb5, 0x9d, 0x45, 0x6d, 0x15, 0x3d}, {0x0, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf, 0x55, 0x7c, 0x7, 0x2e, 0xf1, 0xd8, 0xa3, 0x8a, 0xaa, 0x83, 0xf8, 0xd1, 0xe, 0x27, 0x5c, 0x75, 0xff, 0xd6, 0xad, 0x84, 0x5b, 0x72, 0x9, 0x20, 0x49, 0x60, 0x1b, 0x32, 0xed, 0xc4, 0xbf, 0x96, 0x1c, 0x35, 0x4e, 0x67, 0xb8, 0x91, 0xea, 0xc3, 0xe3, 0xca, 0xb1, 0x98, 0x47, 0x6e, 0x15, 0x3c, 0xb6, 0x9f, 0xe4, 0xcd, 0x12, 0x3b, 0x40, 0x69, 0x92, 0xbb, 0xc0, 0xe9, 0x36, 0x1f, 0x64, 0x4d, 0xc7, 0xee, 0x95, 0xbc, 0x63, 0x4a, 0x31, 0x18, 0x38, 0x11, 0x6a, 0x43, 0x9c, 0xb5, 0xce, 0xe7, 0x6d, 0x44, 0x3f, 0x16, 0xc9, 0xe0, 0x9b, 0xb2, 0xdb, 0xf2, 0x89, 0xa0, 0x7f, 0x56, 0x2d, 0x4, 0x8e, 0xa7, 0xdc, 0xf5, 0x2a, 0x3, 0x78, 0x51, 0x71, 0x58, 0x23, 0xa, 0xd5, 0xfc, 0x87, 0xae, 0x24, 0xd, 0x76, 0x5f, 0x80, 0xa9, 0xd2, 0xfb, 0x39, 0x10, 0x6b, 0x42, 0x9d, 0xb4, 0xcf, 0xe6, 0x6c, 0x45, 0x3e, 0x17, 0xc8, 0xe1, 0x9a, 0xb3, 0x93, 0xba, 0xc1, 0xe8, 0x37, 0x1e, 0x65, 0x4c, 0xc6, 0xef, 0x94, 0xbd, 0x62, 0x4b, 0x30, 0x19, 0x70, 0x59, 0x22, 0xb, 0xd4, 0xfd, 0x86, 0xaf, 0x25, 0xc, 0x77, 0x5e, 0x81, 0xa8, 0xd3, 0xfa, 0xda, 0xf3, 0x88, 0xa1, 0x7e, 0x57, 0x2c, 0x5, 0x8f, 0xa6, 0xdd, 0xf4, 0x2b, 0x2, 0x79, 0x50, 0xab, 0x82, 0xf9, 0xd0, 0xf, 0x26, 0x5d, 0x74, 0xfe, 0xd7, 0xac, 0x85, 0x5a, 0x73, 0x8, 0x21, 0x1, 0x28, 0x53, 0x7a, 0xa5, 0x8c, 0xf7, 0xde, 0x54, 0x7d, 0x6, 0x2f, 0xf0, 0xd9, 0xa2, 0x8b, 0xe2, 0xcb, 0xb0, 0x99, 0x46, 0x6f, 0x14, 0x3d, 0xb7, 0x9e, 0xe5, 0xcc, 0x13, 0x3a, 0x41, 0x68, 0x48, 0x61, 0x1a, 0x33, 0xec, 0xc5, 0xbe, 0x97, 0x1d, 0x34, 0x4f, 0x66, 0xb9, 0x90, 0xeb, 0xc2}, {0x0, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6, 0x4d, 0x67, 0x19, 0x33, 0xe5, 0xcf, 0xb1, 0x9b, 0x9a, 0xb0, 0xce, 0xe4, 0x32, 0x18, 0x66, 0x4c, 0xd7, 0xfd, 0x83, 0xa9, 0x7f, 0x55, 0x2b, 0x1, 0x29, 0x3, 0x7d, 0x57, 0x81, 0xab, 0xd5, 0xff, 0x64, 0x4e, 0x30, 0x1a, 0xcc, 0xe6, 0x98, 0xb2, 0xb3, 0x99, 0xe7, 0xcd, 0x1b, 0x31, 0x4f, 0x65, 0xfe, 0xd4, 0xaa, 0x80, 0x56, 0x7c, 0x2, 0x28, 0x52, 0x78, 0x6, 0x2c, 0xfa, 0xd0, 0xae, 0x84, 0x1f, 0x35, 0x4b, 0x61, 0xb7, 0x9d, 0xe3, 0xc9, 0xc8, 0xe2, 0x9c, 0xb6, 0x60, 0x4a, 0x34, 0x1e, 0x85, 0xaf, 0xd1, 0xfb, 0x2d, 0x7, 0x79, 0x53, 0x7b, 0x51, 0x2f, 0x5, 0xd3, 0xf9, 0x87, 0xad, 0x36, 0x1c, 0x62, 0x48, 0x9e, 0xb4, 0xca, 0xe0, 0xe1, 0xcb, 0xb5, 0x9f, 0x49, 0x63, 0x1d, 0x37, 0xac, 0x86, 0xf8, 0xd2, 0x4, 0x2e, 0x50, 0x7a, 0xa4, 0x8e, 0xf0, 0xda, 0xc, 0x26, 0x58, 0x72, 0xe9, 0xc3, 0xbd, 0x97, 0x41, 0x6b, 0x15, 0x3f, 0x3e, 0x14, 0x6a, 0x40, 0x96, 0xbc, 0xc2, 0xe8, 0x73, 0x59, 0x27, 0xd, 0xdb, 0xf1, 0x8f, 0xa5, 0x8d, 0xa7, 0xd9, 0xf3, 0x25, 0xf, 0x71, 0x5b, 0xc0, 0xea, 0x94, 0xbe, 0x68, 0x42, 0x3c, 0x16, 0x17, 0x3d, 0x43, 0x69, 0xbf, 0x95, 0xeb, 0xc1, 0x5a, 0x70, 0xe, 0x24, 0xf2, 0xd8, 0xa6, 0x8c, 
0xf6, 0xdc, 0xa2, 0x88, 0x5e, 0x74, 0xa, 0x20, 0xbb, 0x91, 0xef, 0xc5, 0x13, 0x39, 0x47, 0x6d, 0x6c, 0x46, 0x38, 0x12, 0xc4, 0xee, 0x90, 0xba, 0x21, 0xb, 0x75, 0x5f, 0x89, 0xa3, 0xdd, 0xf7, 0xdf, 0xf5, 0x8b, 0xa1, 0x77, 0x5d, 0x23, 0x9, 0x92, 0xb8, 0xc6, 0xec, 0x3a, 0x10, 0x6e, 0x44, 0x45, 0x6f, 0x11, 0x3b, 0xed, 0xc7, 0xb9, 0x93, 0x8, 0x22, 0x5c, 0x76, 0xa0, 0x8a, 0xf4, 0xde}, {0x0, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1, 0x45, 0x6e, 0x13, 0x38, 0xe9, 0xc2, 0xbf, 0x94, 0x8a, 0xa1, 0xdc, 0xf7, 0x26, 0xd, 0x70, 0x5b, 0xcf, 0xe4, 0x99, 0xb2, 0x63, 0x48, 0x35, 0x1e, 0x9, 0x22, 0x5f, 0x74, 0xa5, 0x8e, 0xf3, 0xd8, 0x4c, 0x67, 0x1a, 0x31, 0xe0, 0xcb, 0xb6, 0x9d, 0x83, 0xa8, 0xd5, 0xfe, 0x2f, 0x4, 0x79, 0x52, 0xc6, 0xed, 0x90, 0xbb, 0x6a, 0x41, 0x3c, 0x17, 0x12, 0x39, 0x44, 0x6f, 0xbe, 0x95, 0xe8, 0xc3, 0x57, 0x7c, 0x1, 0x2a, 0xfb, 0xd0, 0xad, 0x86, 0x98, 0xb3, 0xce, 0xe5, 0x34, 0x1f, 0x62, 0x49, 0xdd, 0xf6, 0x8b, 0xa0, 0x71, 0x5a, 0x27, 0xc, 0x1b, 0x30, 0x4d, 0x66, 0xb7, 0x9c, 0xe1, 0xca, 0x5e, 0x75, 0x8, 0x23, 0xf2, 0xd9, 0xa4, 0x8f, 0x91, 0xba, 0xc7, 0xec, 0x3d, 0x16, 0x6b, 0x40, 0xd4, 0xff, 0x82, 0xa9, 0x78, 0x53, 0x2e, 0x5, 0x24, 0xf, 0x72, 0x59, 0x88, 0xa3, 0xde, 0xf5, 0x61, 0x4a, 0x37, 0x1c, 0xcd, 0xe6, 0x9b, 0xb0, 0xae, 0x85, 0xf8, 0xd3, 0x2, 0x29, 0x54, 0x7f, 0xeb, 0xc0, 0xbd, 0x96, 0x47, 0x6c, 0x11, 0x3a, 0x2d, 0x6, 0x7b, 0x50, 0x81, 0xaa, 0xd7, 0xfc, 0x68, 0x43, 0x3e, 0x15, 0xc4, 0xef, 0x92, 0xb9, 0xa7, 0x8c, 0xf1, 0xda, 0xb, 0x20, 0x5d, 0x76, 0xe2, 0xc9, 0xb4, 0x9f, 0x4e, 0x65, 0x18, 0x33, 0x36, 0x1d, 0x60, 0x4b, 0x9a, 0xb1, 0xcc, 0xe7, 0x73, 0x58, 0x25, 0xe, 0xdf, 0xf4, 0x89, 0xa2, 0xbc, 0x97, 0xea, 0xc1, 0x10, 0x3b, 0x46, 0x6d, 0xf9, 0xd2, 0xaf, 0x84, 0x55, 0x7e, 0x3, 0x28, 0x3f, 0x14, 0x69, 0x42, 0x93, 0xb8, 0xc5, 0xee, 0x7a, 0x51, 0x2c, 0x7, 0xd6, 0xfd, 0x80, 0xab, 0xb5, 0x9e, 0xe3, 0xc8, 0x19, 0x32, 0x4f, 0x64, 0xf0, 0xdb, 0xa6, 0x8d, 0x5c, 0x77, 0xa, 0x21}, {0x0, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4, 0x7d, 0x51, 0x25, 0x9, 0xcd, 0xe1, 0x95, 0xb9, 0xfa, 0xd6, 0xa2, 0x8e, 0x4a, 0x66, 0x12, 0x3e, 0x87, 0xab, 0xdf, 0xf3, 0x37, 0x1b, 0x6f, 0x43, 0xe9, 0xc5, 0xb1, 0x9d, 0x59, 0x75, 0x1, 0x2d, 0x94, 0xb8, 0xcc, 0xe0, 0x24, 0x8, 0x7c, 0x50, 0x13, 0x3f, 0x4b, 0x67, 0xa3, 0x8f, 0xfb, 0xd7, 0x6e, 0x42, 0x36, 0x1a, 0xde, 0xf2, 0x86, 0xaa, 0xcf, 0xe3, 0x97, 0xbb, 0x7f, 0x53, 0x27, 0xb, 0xb2, 0x9e, 0xea, 0xc6, 0x2, 0x2e, 0x5a, 0x76, 0x35, 0x19, 0x6d, 0x41, 0x85, 0xa9, 0xdd, 0xf1, 0x48, 0x64, 0x10, 0x3c, 0xf8, 0xd4, 0xa0, 0x8c, 0x26, 0xa, 0x7e, 0x52, 0x96, 0xba, 0xce, 0xe2, 0x5b, 0x77, 0x3, 0x2f, 0xeb, 0xc7, 0xb3, 0x9f, 0xdc, 0xf0, 0x84, 0xa8, 0x6c, 0x40, 0x34, 0x18, 0xa1, 0x8d, 0xf9, 0xd5, 0x11, 0x3d, 0x49, 0x65, 0x83, 0xaf, 0xdb, 0xf7, 0x33, 0x1f, 0x6b, 0x47, 0xfe, 0xd2, 0xa6, 0x8a, 0x4e, 0x62, 0x16, 0x3a, 0x79, 0x55, 0x21, 0xd, 0xc9, 0xe5, 0x91, 0xbd, 0x4, 0x28, 0x5c, 0x70, 0xb4, 0x98, 0xec, 0xc0, 0x6a, 0x46, 0x32, 0x1e, 0xda, 0xf6, 0x82, 0xae, 0x17, 0x3b, 0x4f, 0x63, 0xa7, 0x8b, 0xff, 0xd3, 0x90, 0xbc, 0xc8, 0xe4, 0x20, 0xc, 0x78, 0x54, 0xed, 0xc1, 0xb5, 0x99, 0x5d, 0x71, 0x5, 0x29, 0x4c, 0x60, 0x14, 0x38, 0xfc, 0xd0, 0xa4, 0x88, 0x31, 0x1d, 0x69, 0x45, 0x81, 0xad, 0xd9, 0xf5, 0xb6, 0x9a, 0xee, 0xc2, 0x6, 0x2a, 0x5e, 0x72, 0xcb, 0xe7, 0x93, 0xbf, 0x7b, 0x57, 0x23, 0xf, 0xa5, 0x89, 0xfd, 0xd1, 0x15, 0x39, 0x4d, 0x61, 0xd8, 0xf4, 0x80, 0xac, 0x68, 0x44, 0x30, 0x1c, 0x5f, 0x73, 0x7, 0x2b, 0xef, 0xc3, 0xb7, 0x9b, 0x22, 0xe, 0x7a, 0x56, 0x92, 0xbe, 0xca, 0xe6}, {0x0, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3, 0x75, 0x58, 0x2f, 0x2, 0xc1, 0xec, 0x9b, 0xb6, 0xea, 0xc7, 0xb0, 0x9d, 0x5e, 
0x73, 0x4, 0x29, 0x9f, 0xb2, 0xc5, 0xe8, 0x2b, 0x6, 0x71, 0x5c, 0xc9, 0xe4, 0x93, 0xbe, 0x7d, 0x50, 0x27, 0xa, 0xbc, 0x91, 0xe6, 0xcb, 0x8, 0x25, 0x52, 0x7f, 0x23, 0xe, 0x79, 0x54, 0x97, 0xba, 0xcd, 0xe0, 0x56, 0x7b, 0xc, 0x21, 0xe2, 0xcf, 0xb8, 0x95, 0x8f, 0xa2, 0xd5, 0xf8, 0x3b, 0x16, 0x61, 0x4c, 0xfa, 0xd7, 0xa0, 0x8d, 0x4e, 0x63, 0x14, 0x39, 0x65, 0x48, 0x3f, 0x12, 0xd1, 0xfc, 0x8b, 0xa6, 0x10, 0x3d, 0x4a, 0x67, 0xa4, 0x89, 0xfe, 0xd3, 0x46, 0x6b, 0x1c, 0x31, 0xf2, 0xdf, 0xa8, 0x85, 0x33, 0x1e, 0x69, 0x44, 0x87, 0xaa, 0xdd, 0xf0, 0xac, 0x81, 0xf6, 0xdb, 0x18, 0x35, 0x42, 0x6f, 0xd9, 0xf4, 0x83, 0xae, 0x6d, 0x40, 0x37, 0x1a, 0x3, 0x2e, 0x59, 0x74, 0xb7, 0x9a, 0xed, 0xc0, 0x76, 0x5b, 0x2c, 0x1, 0xc2, 0xef, 0x98, 0xb5, 0xe9, 0xc4, 0xb3, 0x9e, 0x5d, 0x70, 0x7, 0x2a, 0x9c, 0xb1, 0xc6, 0xeb, 0x28, 0x5, 0x72, 0x5f, 0xca, 0xe7, 0x90, 0xbd, 0x7e, 0x53, 0x24, 0x9, 0xbf, 0x92, 0xe5, 0xc8, 0xb, 0x26, 0x51, 0x7c, 0x20, 0xd, 0x7a, 0x57, 0x94, 0xb9, 0xce, 0xe3, 0x55, 0x78, 0xf, 0x22, 0xe1, 0xcc, 0xbb, 0x96, 0x8c, 0xa1, 0xd6, 0xfb, 0x38, 0x15, 0x62, 0x4f, 0xf9, 0xd4, 0xa3, 0x8e, 0x4d, 0x60, 0x17, 0x3a, 0x66, 0x4b, 0x3c, 0x11, 0xd2, 0xff, 0x88, 0xa5, 0x13, 0x3e, 0x49, 0x64, 0xa7, 0x8a, 0xfd, 0xd0, 0x45, 0x68, 0x1f, 0x32, 0xf1, 0xdc, 0xab, 0x86, 0x30, 0x1d, 0x6a, 0x47, 0x84, 0xa9, 0xde, 0xf3, 0xaf, 0x82, 0xf5, 0xd8, 0x1b, 0x36, 0x41, 0x6c, 0xda, 0xf7, 0x80, 0xad, 0x6e, 0x43, 0x34, 0x19}, {0x0, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca, 0x6d, 0x43, 0x31, 0x1f, 0xd5, 0xfb, 0x89, 0xa7, 0xda, 0xf4, 0x86, 0xa8, 0x62, 0x4c, 0x3e, 0x10, 0xb7, 0x99, 0xeb, 0xc5, 0xf, 0x21, 0x53, 0x7d, 0xa9, 0x87, 0xf5, 0xdb, 0x11, 0x3f, 0x4d, 0x63, 0xc4, 0xea, 0x98, 0xb6, 0x7c, 0x52, 0x20, 0xe, 0x73, 0x5d, 0x2f, 0x1, 0xcb, 0xe5, 0x97, 0xb9, 0x1e, 0x30, 0x42, 0x6c, 0xa6, 0x88, 0xfa, 0xd4, 0x4f, 0x61, 0x13, 0x3d, 0xf7, 0xd9, 0xab, 0x85, 0x22, 0xc, 0x7e, 0x50, 0x9a, 0xb4, 0xc6, 0xe8, 0x95, 0xbb, 0xc9, 0xe7, 0x2d, 0x3, 0x71, 0x5f, 0xf8, 0xd6, 0xa4, 0x8a, 0x40, 0x6e, 0x1c, 0x32, 0xe6, 0xc8, 0xba, 0x94, 0x5e, 0x70, 0x2, 0x2c, 0x8b, 0xa5, 0xd7, 0xf9, 0x33, 0x1d, 0x6f, 0x41, 0x3c, 0x12, 0x60, 0x4e, 0x84, 0xaa, 0xd8, 0xf6, 0x51, 0x7f, 0xd, 0x23, 0xe9, 0xc7, 0xb5, 0x9b, 0x9e, 0xb0, 0xc2, 0xec, 0x26, 0x8, 0x7a, 0x54, 0xf3, 0xdd, 0xaf, 0x81, 0x4b, 0x65, 0x17, 0x39, 0x44, 0x6a, 0x18, 0x36, 0xfc, 0xd2, 0xa0, 0x8e, 0x29, 0x7, 0x75, 0x5b, 0x91, 0xbf, 0xcd, 0xe3, 0x37, 0x19, 0x6b, 0x45, 0x8f, 0xa1, 0xd3, 0xfd, 0x5a, 0x74, 0x6, 0x28, 0xe2, 0xcc, 0xbe, 0x90, 0xed, 0xc3, 0xb1, 0x9f, 0x55, 0x7b, 0x9, 0x27, 0x80, 0xae, 0xdc, 0xf2, 0x38, 0x16, 0x64, 0x4a, 0xd1, 0xff, 0x8d, 0xa3, 0x69, 0x47, 0x35, 0x1b, 0xbc, 0x92, 0xe0, 0xce, 0x4, 0x2a, 0x58, 0x76, 0xb, 0x25, 0x57, 0x79, 0xb3, 0x9d, 0xef, 0xc1, 0x66, 0x48, 0x3a, 0x14, 0xde, 0xf0, 0x82, 0xac, 0x78, 0x56, 0x24, 0xa, 0xc0, 0xee, 0x9c, 0xb2, 0x15, 0x3b, 0x49, 0x67, 0xad, 0x83, 0xf1, 0xdf, 0xa2, 0x8c, 0xfe, 0xd0, 0x1a, 0x34, 0x46, 0x68, 0xcf, 0xe1, 0x93, 0xbd, 0x77, 0x59, 0x2b, 0x5}, {0x0, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd, 0x65, 0x4a, 0x3b, 0x14, 0xd9, 0xf6, 0x87, 0xa8, 0xca, 0xe5, 0x94, 0xbb, 0x76, 0x59, 0x28, 0x7, 0xaf, 0x80, 0xf1, 0xde, 0x13, 0x3c, 0x4d, 0x62, 0x89, 0xa6, 0xd7, 0xf8, 0x35, 0x1a, 0x6b, 0x44, 0xec, 0xc3, 0xb2, 0x9d, 0x50, 0x7f, 0xe, 0x21, 0x43, 0x6c, 0x1d, 0x32, 0xff, 0xd0, 0xa1, 0x8e, 0x26, 0x9, 0x78, 0x57, 0x9a, 0xb5, 0xc4, 0xeb, 0xf, 0x20, 0x51, 0x7e, 0xb3, 0x9c, 0xed, 0xc2, 0x6a, 0x45, 0x34, 0x1b, 0xd6, 0xf9, 0x88, 0xa7, 0xc5, 0xea, 0x9b, 0xb4, 0x79, 0x56, 0x27, 0x8, 0xa0, 0x8f, 0xfe, 0xd1, 0x1c, 0x33, 0x42, 0x6d, 0x86, 0xa9, 0xd8, 0xf7, 0x3a, 0x15, 0x64, 0x4b, 0xe3, 0xcc, 
0xbd, 0x92, 0x5f, 0x70, 0x1, 0x2e, 0x4c, 0x63, 0x12, 0x3d, 0xf0, 0xdf, 0xae, 0x81, 0x29, 0x6, 0x77, 0x58, 0x95, 0xba, 0xcb, 0xe4, 0x1e, 0x31, 0x40, 0x6f, 0xa2, 0x8d, 0xfc, 0xd3, 0x7b, 0x54, 0x25, 0xa, 0xc7, 0xe8, 0x99, 0xb6, 0xd4, 0xfb, 0x8a, 0xa5, 0x68, 0x47, 0x36, 0x19, 0xb1, 0x9e, 0xef, 0xc0, 0xd, 0x22, 0x53, 0x7c, 0x97, 0xb8, 0xc9, 0xe6, 0x2b, 0x4, 0x75, 0x5a, 0xf2, 0xdd, 0xac, 0x83, 0x4e, 0x61, 0x10, 0x3f, 0x5d, 0x72, 0x3, 0x2c, 0xe1, 0xce, 0xbf, 0x90, 0x38, 0x17, 0x66, 0x49, 0x84, 0xab, 0xda, 0xf5, 0x11, 0x3e, 0x4f, 0x60, 0xad, 0x82, 0xf3, 0xdc, 0x74, 0x5b, 0x2a, 0x5, 0xc8, 0xe7, 0x96, 0xb9, 0xdb, 0xf4, 0x85, 0xaa, 0x67, 0x48, 0x39, 0x16, 0xbe, 0x91, 0xe0, 0xcf, 0x2, 0x2d, 0x5c, 0x73, 0x98, 0xb7, 0xc6, 0xe9, 0x24, 0xb, 0x7a, 0x55, 0xfd, 0xd2, 0xa3, 0x8c, 0x41, 0x6e, 0x1f, 0x30, 0x52, 0x7d, 0xc, 0x23, 0xee, 0xc1, 0xb0, 0x9f, 0x37, 0x18, 0x69, 0x46, 0x8b, 0xa4, 0xd5, 0xfa}, {0x0, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x9d, 0xad, 0xfd, 0xcd, 0x5d, 0x6d, 0x3d, 0xd, 0x27, 0x17, 0x47, 0x77, 0xe7, 0xd7, 0x87, 0xb7, 0xba, 0x8a, 0xda, 0xea, 0x7a, 0x4a, 0x1a, 0x2a, 0x4e, 0x7e, 0x2e, 0x1e, 0x8e, 0xbe, 0xee, 0xde, 0xd3, 0xe3, 0xb3, 0x83, 0x13, 0x23, 0x73, 0x43, 0x69, 0x59, 0x9, 0x39, 0xa9, 0x99, 0xc9, 0xf9, 0xf4, 0xc4, 0x94, 0xa4, 0x34, 0x4, 0x54, 0x64, 0x9c, 0xac, 0xfc, 0xcc, 0x5c, 0x6c, 0x3c, 0xc, 0x1, 0x31, 0x61, 0x51, 0xc1, 0xf1, 0xa1, 0x91, 0xbb, 0x8b, 0xdb, 0xeb, 0x7b, 0x4b, 0x1b, 0x2b, 0x26, 0x16, 0x46, 0x76, 0xe6, 0xd6, 0x86, 0xb6, 0xd2, 0xe2, 0xb2, 0x82, 0x12, 0x22, 0x72, 0x42, 0x4f, 0x7f, 0x2f, 0x1f, 0x8f, 0xbf, 0xef, 0xdf, 0xf5, 0xc5, 0x95, 0xa5, 0x35, 0x5, 0x55, 0x65, 0x68, 0x58, 0x8, 0x38, 0xa8, 0x98, 0xc8, 0xf8, 0x25, 0x15, 0x45, 0x75, 0xe5, 0xd5, 0x85, 0xb5, 0xb8, 0x88, 0xd8, 0xe8, 0x78, 0x48, 0x18, 0x28, 0x2, 0x32, 0x62, 0x52, 0xc2, 0xf2, 0xa2, 0x92, 0x9f, 0xaf, 0xff, 0xcf, 0x5f, 0x6f, 0x3f, 0xf, 0x6b, 0x5b, 0xb, 0x3b, 0xab, 0x9b, 0xcb, 0xfb, 0xf6, 0xc6, 0x96, 0xa6, 0x36, 0x6, 0x56, 0x66, 0x4c, 0x7c, 0x2c, 0x1c, 0x8c, 0xbc, 0xec, 0xdc, 0xd1, 0xe1, 0xb1, 0x81, 0x11, 0x21, 0x71, 0x41, 0xb9, 0x89, 0xd9, 0xe9, 0x79, 0x49, 0x19, 0x29, 0x24, 0x14, 0x44, 0x74, 0xe4, 0xd4, 0x84, 0xb4, 0x9e, 0xae, 0xfe, 0xce, 0x5e, 0x6e, 0x3e, 0xe, 0x3, 0x33, 0x63, 0x53, 0xc3, 0xf3, 0xa3, 0x93, 0xf7, 0xc7, 0x97, 0xa7, 0x37, 0x7, 0x57, 0x67, 0x6a, 0x5a, 0xa, 0x3a, 0xaa, 0x9a, 0xca, 0xfa, 0xd0, 0xe0, 0xb0, 0x80, 0x10, 0x20, 0x70, 0x40, 0x4d, 0x7d, 0x2d, 0x1d, 0x8d, 0xbd, 0xed, 0xdd}, {0x0, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97, 0x95, 0xa4, 0xf7, 0xc6, 0x51, 0x60, 0x33, 0x2, 0x37, 0x6, 0x55, 0x64, 0xf3, 0xc2, 0x91, 0xa0, 0xa2, 0x93, 0xc0, 0xf1, 0x66, 0x57, 0x4, 0x35, 0x6e, 0x5f, 0xc, 0x3d, 0xaa, 0x9b, 0xc8, 0xf9, 0xfb, 0xca, 0x99, 0xa8, 0x3f, 0xe, 0x5d, 0x6c, 0x59, 0x68, 0x3b, 0xa, 0x9d, 0xac, 0xff, 0xce, 0xcc, 0xfd, 0xae, 0x9f, 0x8, 0x39, 0x6a, 0x5b, 0xdc, 0xed, 0xbe, 0x8f, 0x18, 0x29, 0x7a, 0x4b, 0x49, 0x78, 0x2b, 0x1a, 0x8d, 0xbc, 0xef, 0xde, 0xeb, 0xda, 0x89, 0xb8, 0x2f, 0x1e, 0x4d, 0x7c, 0x7e, 0x4f, 0x1c, 0x2d, 0xba, 0x8b, 0xd8, 0xe9, 0xb2, 0x83, 0xd0, 0xe1, 0x76, 0x47, 0x14, 0x25, 0x27, 0x16, 0x45, 0x74, 0xe3, 0xd2, 0x81, 0xb0, 0x85, 0xb4, 0xe7, 0xd6, 0x41, 0x70, 0x23, 0x12, 0x10, 0x21, 0x72, 0x43, 0xd4, 0xe5, 0xb6, 0x87, 0xa5, 0x94, 0xc7, 0xf6, 0x61, 0x50, 0x3, 0x32, 0x30, 0x1, 0x52, 0x63, 0xf4, 0xc5, 0x96, 0xa7, 0x92, 0xa3, 0xf0, 0xc1, 0x56, 0x67, 0x34, 0x5, 0x7, 0x36, 0x65, 0x54, 0xc3, 0xf2, 0xa1, 0x90, 0xcb, 0xfa, 0xa9, 0x98, 0xf, 0x3e, 0x6d, 0x5c, 0x5e, 0x6f, 0x3c, 0xd, 0x9a, 0xab, 0xf8, 0xc9, 0xfc, 0xcd, 0x9e, 0xaf, 0x38, 0x9, 0x5a, 0x6b, 0x69, 0x58, 0xb, 0x3a, 0xad, 0x9c, 0xcf, 0xfe, 
0x79, 0x48, 0x1b, 0x2a, 0xbd, 0x8c, 0xdf, 0xee, 0xec, 0xdd, 0x8e, 0xbf, 0x28, 0x19, 0x4a, 0x7b, 0x4e, 0x7f, 0x2c, 0x1d, 0x8a, 0xbb, 0xe8, 0xd9, 0xdb, 0xea, 0xb9, 0x88, 0x1f, 0x2e, 0x7d, 0x4c, 0x17, 0x26, 0x75, 0x44, 0xd3, 0xe2, 0xb1, 0x80, 0x82, 0xb3, 0xe0, 0xd1, 0x46, 0x77, 0x24, 0x15, 0x20, 0x11, 0x42, 0x73, 0xe4, 0xd5, 0x86, 0xb7, 0xb5, 0x84, 0xd7, 0xe6, 0x71, 0x40, 0x13, 0x22}, {0x0, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e, 0x8d, 0xbf, 0xe9, 0xdb, 0x45, 0x77, 0x21, 0x13, 0x7, 0x35, 0x63, 0x51, 0xcf, 0xfd, 0xab, 0x99, 0x8a, 0xb8, 0xee, 0xdc, 0x42, 0x70, 0x26, 0x14, 0xe, 0x3c, 0x6a, 0x58, 0xc6, 0xf4, 0xa2, 0x90, 0x83, 0xb1, 0xe7, 0xd5, 0x4b, 0x79, 0x2f, 0x1d, 0x9, 0x3b, 0x6d, 0x5f, 0xc1, 0xf3, 0xa5, 0x97, 0x84, 0xb6, 0xe0, 0xd2, 0x4c, 0x7e, 0x28, 0x1a, 0x1c, 0x2e, 0x78, 0x4a, 0xd4, 0xe6, 0xb0, 0x82, 0x91, 0xa3, 0xf5, 0xc7, 0x59, 0x6b, 0x3d, 0xf, 0x1b, 0x29, 0x7f, 0x4d, 0xd3, 0xe1, 0xb7, 0x85, 0x96, 0xa4, 0xf2, 0xc0, 0x5e, 0x6c, 0x3a, 0x8, 0x12, 0x20, 0x76, 0x44, 0xda, 0xe8, 0xbe, 0x8c, 0x9f, 0xad, 0xfb, 0xc9, 0x57, 0x65, 0x33, 0x1, 0x15, 0x27, 0x71, 0x43, 0xdd, 0xef, 0xb9, 0x8b, 0x98, 0xaa, 0xfc, 0xce, 0x50, 0x62, 0x34, 0x6, 0x38, 0xa, 0x5c, 0x6e, 0xf0, 0xc2, 0x94, 0xa6, 0xb5, 0x87, 0xd1, 0xe3, 0x7d, 0x4f, 0x19, 0x2b, 0x3f, 0xd, 0x5b, 0x69, 0xf7, 0xc5, 0x93, 0xa1, 0xb2, 0x80, 0xd6, 0xe4, 0x7a, 0x48, 0x1e, 0x2c, 0x36, 0x4, 0x52, 0x60, 0xfe, 0xcc, 0x9a, 0xa8, 0xbb, 0x89, 0xdf, 0xed, 0x73, 0x41, 0x17, 0x25, 0x31, 0x3, 0x55, 0x67, 0xf9, 0xcb, 0x9d, 0xaf, 0xbc, 0x8e, 0xd8, 0xea, 0x74, 0x46, 0x10, 0x22, 0x24, 0x16, 0x40, 0x72, 0xec, 0xde, 0x88, 0xba, 0xa9, 0x9b, 0xcd, 0xff, 0x61, 0x53, 0x5, 0x37, 0x23, 0x11, 0x47, 0x75, 0xeb, 0xd9, 0x8f, 0xbd, 0xae, 0x9c, 0xca, 0xf8, 0x66, 0x54, 0x2, 0x30, 0x2a, 0x18, 0x4e, 0x7c, 0xe2, 0xd0, 0x86, 0xb4, 0xa7, 0x95, 0xc3, 0xf1, 0x6f, 0x5d, 0xb, 0x39, 0x2d, 0x1f, 0x49, 0x7b, 0xe5, 0xd7, 0x81, 0xb3, 0xa0, 0x92, 0xc4, 0xf6, 0x68, 0x5a, 0xc, 0x3e}, {0x0, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99, 0x85, 0xb6, 0xe3, 0xd0, 0x49, 0x7a, 0x2f, 0x1c, 0x17, 0x24, 0x71, 0x42, 0xdb, 0xe8, 0xbd, 0x8e, 0x92, 0xa1, 0xf4, 0xc7, 0x5e, 0x6d, 0x38, 0xb, 0x2e, 0x1d, 0x48, 0x7b, 0xe2, 0xd1, 0x84, 0xb7, 0xab, 0x98, 0xcd, 0xfe, 0x67, 0x54, 0x1, 0x32, 0x39, 0xa, 0x5f, 0x6c, 0xf5, 0xc6, 0x93, 0xa0, 0xbc, 0x8f, 0xda, 0xe9, 0x70, 0x43, 0x16, 0x25, 0x5c, 0x6f, 0x3a, 0x9, 0x90, 0xa3, 0xf6, 0xc5, 0xd9, 0xea, 0xbf, 0x8c, 0x15, 0x26, 0x73, 0x40, 0x4b, 0x78, 0x2d, 0x1e, 0x87, 0xb4, 0xe1, 0xd2, 0xce, 0xfd, 0xa8, 0x9b, 0x2, 0x31, 0x64, 0x57, 0x72, 0x41, 0x14, 0x27, 0xbe, 0x8d, 0xd8, 0xeb, 0xf7, 0xc4, 0x91, 0xa2, 0x3b, 0x8, 0x5d, 0x6e, 0x65, 0x56, 0x3, 0x30, 0xa9, 0x9a, 0xcf, 0xfc, 0xe0, 0xd3, 0x86, 0xb5, 0x2c, 0x1f, 0x4a, 0x79, 0xb8, 0x8b, 0xde, 0xed, 0x74, 0x47, 0x12, 0x21, 0x3d, 0xe, 0x5b, 0x68, 0xf1, 0xc2, 0x97, 0xa4, 0xaf, 0x9c, 0xc9, 0xfa, 0x63, 0x50, 0x5, 0x36, 0x2a, 0x19, 0x4c, 0x7f, 0xe6, 0xd5, 0x80, 0xb3, 0x96, 0xa5, 0xf0, 0xc3, 0x5a, 0x69, 0x3c, 0xf, 0x13, 0x20, 0x75, 0x46, 0xdf, 0xec, 0xb9, 0x8a, 0x81, 0xb2, 0xe7, 0xd4, 0x4d, 0x7e, 0x2b, 0x18, 0x4, 0x37, 0x62, 0x51, 0xc8, 0xfb, 0xae, 0x9d, 0xe4, 0xd7, 0x82, 0xb1, 0x28, 0x1b, 0x4e, 0x7d, 0x61, 0x52, 0x7, 0x34, 0xad, 0x9e, 0xcb, 0xf8, 0xf3, 0xc0, 0x95, 0xa6, 0x3f, 0xc, 0x59, 0x6a, 0x76, 0x45, 0x10, 0x23, 0xba, 0x89, 0xdc, 0xef, 0xca, 0xf9, 0xac, 0x9f, 0x6, 0x35, 0x60, 0x53, 0x4f, 0x7c, 0x29, 0x1a, 0x83, 0xb0, 0xe5, 0xd6, 0xdd, 0xee, 0xbb, 0x88, 0x11, 0x22, 0x77, 0x44, 0x58, 0x6b, 0x3e, 0xd, 0x94, 0xa7, 0xf2, 0xc1}, {0x0, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c, 0xbd, 0x89, 0xd5, 0xe1, 0x6d, 0x59, 0x5, 0x31, 0x67, 0x53, 0xf, 0x3b, 0xb7, 
0x83, 0xdf, 0xeb, 0xda, 0xee, 0xb2, 0x86, 0xa, 0x3e, 0x62, 0x56, 0xce, 0xfa, 0xa6, 0x92, 0x1e, 0x2a, 0x76, 0x42, 0x73, 0x47, 0x1b, 0x2f, 0xa3, 0x97, 0xcb, 0xff, 0xa9, 0x9d, 0xc1, 0xf5, 0x79, 0x4d, 0x11, 0x25, 0x14, 0x20, 0x7c, 0x48, 0xc4, 0xf0, 0xac, 0x98, 0x81, 0xb5, 0xe9, 0xdd, 0x51, 0x65, 0x39, 0xd, 0x3c, 0x8, 0x54, 0x60, 0xec, 0xd8, 0x84, 0xb0, 0xe6, 0xd2, 0x8e, 0xba, 0x36, 0x2, 0x5e, 0x6a, 0x5b, 0x6f, 0x33, 0x7, 0x8b, 0xbf, 0xe3, 0xd7, 0x4f, 0x7b, 0x27, 0x13, 0x9f, 0xab, 0xf7, 0xc3, 0xf2, 0xc6, 0x9a, 0xae, 0x22, 0x16, 0x4a, 0x7e, 0x28, 0x1c, 0x40, 0x74, 0xf8, 0xcc, 0x90, 0xa4, 0x95, 0xa1, 0xfd, 0xc9, 0x45, 0x71, 0x2d, 0x19, 0x1f, 0x2b, 0x77, 0x43, 0xcf, 0xfb, 0xa7, 0x93, 0xa2, 0x96, 0xca, 0xfe, 0x72, 0x46, 0x1a, 0x2e, 0x78, 0x4c, 0x10, 0x24, 0xa8, 0x9c, 0xc0, 0xf4, 0xc5, 0xf1, 0xad, 0x99, 0x15, 0x21, 0x7d, 0x49, 0xd1, 0xe5, 0xb9, 0x8d, 0x1, 0x35, 0x69, 0x5d, 0x6c, 0x58, 0x4, 0x30, 0xbc, 0x88, 0xd4, 0xe0, 0xb6, 0x82, 0xde, 0xea, 0x66, 0x52, 0xe, 0x3a, 0xb, 0x3f, 0x63, 0x57, 0xdb, 0xef, 0xb3, 0x87, 0x9e, 0xaa, 0xf6, 0xc2, 0x4e, 0x7a, 0x26, 0x12, 0x23, 0x17, 0x4b, 0x7f, 0xf3, 0xc7, 0x9b, 0xaf, 0xf9, 0xcd, 0x91, 0xa5, 0x29, 0x1d, 0x41, 0x75, 0x44, 0x70, 0x2c, 0x18, 0x94, 0xa0, 0xfc, 0xc8, 0x50, 0x64, 0x38, 0xc, 0x80, 0xb4, 0xe8, 0xdc, 0xed, 0xd9, 0x85, 0xb1, 0x3d, 0x9, 0x55, 0x61, 0x37, 0x3, 0x5f, 0x6b, 0xe7, 0xd3, 0x8f, 0xbb, 0x8a, 0xbe, 0xe2, 0xd6, 0x5a, 0x6e, 0x32, 0x6}, {0x0, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b, 0xb5, 0x80, 0xdf, 0xea, 0x61, 0x54, 0xb, 0x3e, 0x77, 0x42, 0x1d, 0x28, 0xa3, 0x96, 0xc9, 0xfc, 0xc2, 0xf7, 0xa8, 0x9d, 0x16, 0x23, 0x7c, 0x49, 0xee, 0xdb, 0x84, 0xb1, 0x3a, 0xf, 0x50, 0x65, 0x5b, 0x6e, 0x31, 0x4, 0x8f, 0xba, 0xe5, 0xd0, 0x99, 0xac, 0xf3, 0xc6, 0x4d, 0x78, 0x27, 0x12, 0x2c, 0x19, 0x46, 0x73, 0xf8, 0xcd, 0x92, 0xa7, 0xc1, 0xf4, 0xab, 0x9e, 0x15, 0x20, 0x7f, 0x4a, 0x74, 0x41, 0x1e, 0x2b, 0xa0, 0x95, 0xca, 0xff, 0xb6, 0x83, 0xdc, 0xe9, 0x62, 0x57, 0x8, 0x3d, 0x3, 0x36, 0x69, 0x5c, 0xd7, 0xe2, 0xbd, 0x88, 0x2f, 0x1a, 0x45, 0x70, 0xfb, 0xce, 0x91, 0xa4, 0x9a, 0xaf, 0xf0, 0xc5, 0x4e, 0x7b, 0x24, 0x11, 0x58, 0x6d, 0x32, 0x7, 0x8c, 0xb9, 0xe6, 0xd3, 0xed, 0xd8, 0x87, 0xb2, 0x39, 0xc, 0x53, 0x66, 0x9f, 0xaa, 0xf5, 0xc0, 0x4b, 0x7e, 0x21, 0x14, 0x2a, 0x1f, 0x40, 0x75, 0xfe, 0xcb, 0x94, 0xa1, 0xe8, 0xdd, 0x82, 0xb7, 0x3c, 0x9, 0x56, 0x63, 0x5d, 0x68, 0x37, 0x2, 0x89, 0xbc, 0xe3, 0xd6, 0x71, 0x44, 0x1b, 0x2e, 0xa5, 0x90, 0xcf, 0xfa, 0xc4, 0xf1, 0xae, 0x9b, 0x10, 0x25, 0x7a, 0x4f, 0x6, 0x33, 0x6c, 0x59, 0xd2, 0xe7, 0xb8, 0x8d, 0xb3, 0x86, 0xd9, 0xec, 0x67, 0x52, 0xd, 0x38, 0x5e, 0x6b, 0x34, 0x1, 0x8a, 0xbf, 0xe0, 0xd5, 0xeb, 0xde, 0x81, 0xb4, 0x3f, 0xa, 0x55, 0x60, 0x29, 0x1c, 0x43, 0x76, 0xfd, 0xc8, 0x97, 0xa2, 0x9c, 0xa9, 0xf6, 0xc3, 0x48, 0x7d, 0x22, 0x17, 0xb0, 0x85, 0xda, 0xef, 0x64, 0x51, 0xe, 0x3b, 0x5, 0x30, 0x6f, 0x5a, 0xd1, 0xe4, 0xbb, 0x8e, 0xc7, 0xf2, 0xad, 0x98, 0x13, 0x26, 0x79, 0x4c, 0x72, 0x47, 0x18, 0x2d, 0xa6, 0x93, 0xcc, 0xf9}, {0x0, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82, 0xad, 0x9b, 0xc1, 0xf7, 0x75, 0x43, 0x19, 0x2f, 0x47, 0x71, 0x2b, 0x1d, 0x9f, 0xa9, 0xf3, 0xc5, 0xea, 0xdc, 0x86, 0xb0, 0x32, 0x4, 0x5e, 0x68, 0x8e, 0xb8, 0xe2, 0xd4, 0x56, 0x60, 0x3a, 0xc, 0x23, 0x15, 0x4f, 0x79, 0xfb, 0xcd, 0x97, 0xa1, 0xc9, 0xff, 0xa5, 0x93, 0x11, 0x27, 0x7d, 0x4b, 0x64, 0x52, 0x8, 0x3e, 0xbc, 0x8a, 0xd0, 0xe6, 0x1, 0x37, 0x6d, 0x5b, 0xd9, 0xef, 0xb5, 0x83, 0xac, 0x9a, 0xc0, 0xf6, 0x74, 0x42, 0x18, 0x2e, 0x46, 0x70, 0x2a, 0x1c, 0x9e, 0xa8, 0xf2, 0xc4, 0xeb, 0xdd, 0x87, 0xb1, 0x33, 0x5, 0x5f, 0x69, 0x8f, 0xb9, 0xe3, 0xd5, 0x57, 0x61, 0x3b, 0xd, 0x22, 0x14, 
0x4e, 0x78, 0xfa, 0xcc, 0x96, 0xa0, 0xc8, 0xfe, 0xa4, 0x92, 0x10, 0x26, 0x7c, 0x4a, 0x65, 0x53, 0x9, 0x3f, 0xbd, 0x8b, 0xd1, 0xe7, 0x2, 0x34, 0x6e, 0x58, 0xda, 0xec, 0xb6, 0x80, 0xaf, 0x99, 0xc3, 0xf5, 0x77, 0x41, 0x1b, 0x2d, 0x45, 0x73, 0x29, 0x1f, 0x9d, 0xab, 0xf1, 0xc7, 0xe8, 0xde, 0x84, 0xb2, 0x30, 0x6, 0x5c, 0x6a, 0x8c, 0xba, 0xe0, 0xd6, 0x54, 0x62, 0x38, 0xe, 0x21, 0x17, 0x4d, 0x7b, 0xf9, 0xcf, 0x95, 0xa3, 0xcb, 0xfd, 0xa7, 0x91, 0x13, 0x25, 0x7f, 0x49, 0x66, 0x50, 0xa, 0x3c, 0xbe, 0x88, 0xd2, 0xe4, 0x3, 0x35, 0x6f, 0x59, 0xdb, 0xed, 0xb7, 0x81, 0xae, 0x98, 0xc2, 0xf4, 0x76, 0x40, 0x1a, 0x2c, 0x44, 0x72, 0x28, 0x1e, 0x9c, 0xaa, 0xf0, 0xc6, 0xe9, 0xdf, 0x85, 0xb3, 0x31, 0x7, 0x5d, 0x6b, 0x8d, 0xbb, 0xe1, 0xd7, 0x55, 0x63, 0x39, 0xf, 0x20, 0x16, 0x4c, 0x7a, 0xf8, 0xce, 0x94, 0xa2, 0xca, 0xfc, 0xa6, 0x90, 0x12, 0x24, 0x7e, 0x48, 0x67, 0x51, 0xb, 0x3d, 0xbf, 0x89, 0xd3, 0xe5}, {0x0, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85, 0xa5, 0x92, 0xcb, 0xfc, 0x79, 0x4e, 0x17, 0x20, 0x57, 0x60, 0x39, 0xe, 0x8b, 0xbc, 0xe5, 0xd2, 0xf2, 0xc5, 0x9c, 0xab, 0x2e, 0x19, 0x40, 0x77, 0xae, 0x99, 0xc0, 0xf7, 0x72, 0x45, 0x1c, 0x2b, 0xb, 0x3c, 0x65, 0x52, 0xd7, 0xe0, 0xb9, 0x8e, 0xf9, 0xce, 0x97, 0xa0, 0x25, 0x12, 0x4b, 0x7c, 0x5c, 0x6b, 0x32, 0x5, 0x80, 0xb7, 0xee, 0xd9, 0x41, 0x76, 0x2f, 0x18, 0x9d, 0xaa, 0xf3, 0xc4, 0xe4, 0xd3, 0x8a, 0xbd, 0x38, 0xf, 0x56, 0x61, 0x16, 0x21, 0x78, 0x4f, 0xca, 0xfd, 0xa4, 0x93, 0xb3, 0x84, 0xdd, 0xea, 0x6f, 0x58, 0x1, 0x36, 0xef, 0xd8, 0x81, 0xb6, 0x33, 0x4, 0x5d, 0x6a, 0x4a, 0x7d, 0x24, 0x13, 0x96, 0xa1, 0xf8, 0xcf, 0xb8, 0x8f, 0xd6, 0xe1, 0x64, 0x53, 0xa, 0x3d, 0x1d, 0x2a, 0x73, 0x44, 0xc1, 0xf6, 0xaf, 0x98, 0x82, 0xb5, 0xec, 0xdb, 0x5e, 0x69, 0x30, 0x7, 0x27, 0x10, 0x49, 0x7e, 0xfb, 0xcc, 0x95, 0xa2, 0xd5, 0xe2, 0xbb, 0x8c, 0x9, 0x3e, 0x67, 0x50, 0x70, 0x47, 0x1e, 0x29, 0xac, 0x9b, 0xc2, 0xf5, 0x2c, 0x1b, 0x42, 0x75, 0xf0, 0xc7, 0x9e, 0xa9, 0x89, 0xbe, 0xe7, 0xd0, 0x55, 0x62, 0x3b, 0xc, 0x7b, 0x4c, 0x15, 0x22, 0xa7, 0x90, 0xc9, 0xfe, 0xde, 0xe9, 0xb0, 0x87, 0x2, 0x35, 0x6c, 0x5b, 0xc3, 0xf4, 0xad, 0x9a, 0x1f, 0x28, 0x71, 0x46, 0x66, 0x51, 0x8, 0x3f, 0xba, 0x8d, 0xd4, 0xe3, 0x94, 0xa3, 0xfa, 0xcd, 0x48, 0x7f, 0x26, 0x11, 0x31, 0x6, 0x5f, 0x68, 0xed, 0xda, 0x83, 0xb4, 0x6d, 0x5a, 0x3, 0x34, 0xb1, 0x86, 0xdf, 0xe8, 0xc8, 0xff, 0xa6, 0x91, 0x14, 0x23, 0x7a, 0x4d, 0x3a, 0xd, 0x54, 0x63, 0xe6, 0xd1, 0x88, 0xbf, 0x9f, 0xa8, 0xf1, 0xc6, 0x43, 0x74, 0x2d, 0x1a}, {0x0, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8, 0xdd, 0xe5, 0xad, 0x95, 0x3d, 0x5, 0x4d, 0x75, 0xa7, 0x9f, 0xd7, 0xef, 0x47, 0x7f, 0x37, 0xf, 0x7a, 0x42, 0xa, 0x32, 0x9a, 0xa2, 0xea, 0xd2, 0x53, 0x6b, 0x23, 0x1b, 0xb3, 0x8b, 0xc3, 0xfb, 0x8e, 0xb6, 0xfe, 0xc6, 0x6e, 0x56, 0x1e, 0x26, 0xf4, 0xcc, 0x84, 0xbc, 0x14, 0x2c, 0x64, 0x5c, 0x29, 0x11, 0x59, 0x61, 0xc9, 0xf1, 0xb9, 0x81, 0xa6, 0x9e, 0xd6, 0xee, 0x46, 0x7e, 0x36, 0xe, 0x7b, 0x43, 0xb, 0x33, 0x9b, 0xa3, 0xeb, 0xd3, 0x1, 0x39, 0x71, 0x49, 0xe1, 0xd9, 0x91, 0xa9, 0xdc, 0xe4, 0xac, 0x94, 0x3c, 0x4, 0x4c, 0x74, 0xf5, 0xcd, 0x85, 0xbd, 0x15, 0x2d, 0x65, 0x5d, 0x28, 0x10, 0x58, 0x60, 0xc8, 0xf0, 0xb8, 0x80, 0x52, 0x6a, 0x22, 0x1a, 0xb2, 0x8a, 0xc2, 0xfa, 0x8f, 0xb7, 0xff, 0xc7, 0x6f, 0x57, 0x1f, 0x27, 0x51, 0x69, 0x21, 0x19, 0xb1, 0x89, 0xc1, 0xf9, 0x8c, 0xb4, 0xfc, 0xc4, 0x6c, 0x54, 0x1c, 0x24, 0xf6, 0xce, 0x86, 0xbe, 0x16, 0x2e, 0x66, 0x5e, 0x2b, 0x13, 0x5b, 0x63, 0xcb, 0xf3, 0xbb, 0x83, 0x2, 0x3a, 0x72, 0x4a, 0xe2, 0xda, 0x92, 0xaa, 0xdf, 0xe7, 0xaf, 0x97, 0x3f, 0x7, 0x4f, 0x77, 0xa5, 0x9d, 0xd5, 0xed, 0x45, 0x7d, 0x35, 0xd, 0x78, 0x40, 0x8, 0x30, 0x98, 0xa0, 0xe8, 0xd0, 
0xf7, 0xcf, 0x87, 0xbf, 0x17, 0x2f, 0x67, 0x5f, 0x2a, 0x12, 0x5a, 0x62, 0xca, 0xf2, 0xba, 0x82, 0x50, 0x68, 0x20, 0x18, 0xb0, 0x88, 0xc0, 0xf8, 0x8d, 0xb5, 0xfd, 0xc5, 0x6d, 0x55, 0x1d, 0x25, 0xa4, 0x9c, 0xd4, 0xec, 0x44, 0x7c, 0x34, 0xc, 0x79, 0x41, 0x9, 0x31, 0x99, 0xa1, 0xe9, 0xd1, 0x3, 0x3b, 0x73, 0x4b, 0xe3, 0xdb, 0x93, 0xab, 0xde, 0xe6, 0xae, 0x96, 0x3e, 0x6, 0x4e, 0x76}, {0x0, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf, 0xd5, 0xec, 0xa7, 0x9e, 0x31, 0x8, 0x43, 0x7a, 0xb7, 0x8e, 0xc5, 0xfc, 0x53, 0x6a, 0x21, 0x18, 0x62, 0x5b, 0x10, 0x29, 0x86, 0xbf, 0xf4, 0xcd, 0x73, 0x4a, 0x1, 0x38, 0x97, 0xae, 0xe5, 0xdc, 0xa6, 0x9f, 0xd4, 0xed, 0x42, 0x7b, 0x30, 0x9, 0xc4, 0xfd, 0xb6, 0x8f, 0x20, 0x19, 0x52, 0x6b, 0x11, 0x28, 0x63, 0x5a, 0xf5, 0xcc, 0x87, 0xbe, 0xe6, 0xdf, 0x94, 0xad, 0x2, 0x3b, 0x70, 0x49, 0x33, 0xa, 0x41, 0x78, 0xd7, 0xee, 0xa5, 0x9c, 0x51, 0x68, 0x23, 0x1a, 0xb5, 0x8c, 0xc7, 0xfe, 0x84, 0xbd, 0xf6, 0xcf, 0x60, 0x59, 0x12, 0x2b, 0x95, 0xac, 0xe7, 0xde, 0x71, 0x48, 0x3, 0x3a, 0x40, 0x79, 0x32, 0xb, 0xa4, 0x9d, 0xd6, 0xef, 0x22, 0x1b, 0x50, 0x69, 0xc6, 0xff, 0xb4, 0x8d, 0xf7, 0xce, 0x85, 0xbc, 0x13, 0x2a, 0x61, 0x58, 0xd1, 0xe8, 0xa3, 0x9a, 0x35, 0xc, 0x47, 0x7e, 0x4, 0x3d, 0x76, 0x4f, 0xe0, 0xd9, 0x92, 0xab, 0x66, 0x5f, 0x14, 0x2d, 0x82, 0xbb, 0xf0, 0xc9, 0xb3, 0x8a, 0xc1, 0xf8, 0x57, 0x6e, 0x25, 0x1c, 0xa2, 0x9b, 0xd0, 0xe9, 0x46, 0x7f, 0x34, 0xd, 0x77, 0x4e, 0x5, 0x3c, 0x93, 0xaa, 0xe1, 0xd8, 0x15, 0x2c, 0x67, 0x5e, 0xf1, 0xc8, 0x83, 0xba, 0xc0, 0xf9, 0xb2, 0x8b, 0x24, 0x1d, 0x56, 0x6f, 0x37, 0xe, 0x45, 0x7c, 0xd3, 0xea, 0xa1, 0x98, 0xe2, 0xdb, 0x90, 0xa9, 0x6, 0x3f, 0x74, 0x4d, 0x80, 0xb9, 0xf2, 0xcb, 0x64, 0x5d, 0x16, 0x2f, 0x55, 0x6c, 0x27, 0x1e, 0xb1, 0x88, 0xc3, 0xfa, 0x44, 0x7d, 0x36, 0xf, 0xa0, 0x99, 0xd2, 0xeb, 0x91, 0xa8, 0xe3, 0xda, 0x75, 0x4c, 0x7, 0x3e, 0xf3, 0xca, 0x81, 0xb8, 0x17, 0x2e, 0x65, 0x5c, 0x26, 0x1f, 0x54, 0x6d, 0xc2, 0xfb, 0xb0, 0x89}, {0x0, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b, 0x87, 0xbd, 0xf3, 0xc9, 0x6f, 0x55, 0x1b, 0x21, 0x4a, 0x70, 0x3e, 0x4, 0xa2, 0x98, 0xd6, 0xec, 0x13, 0x29, 0x67, 0x5d, 0xfb, 0xc1, 0x8f, 0xb5, 0xde, 0xe4, 0xaa, 0x90, 0x36, 0xc, 0x42, 0x78, 0x94, 0xae, 0xe0, 0xda, 0x7c, 0x46, 0x8, 0x32, 0x59, 0x63, 0x2d, 0x17, 0xb1, 0x8b, 0xc5, 0xff, 0x26, 0x1c, 0x52, 0x68, 0xce, 0xf4, 0xba, 0x80, 0xeb, 0xd1, 0x9f, 0xa5, 0x3, 0x39, 0x77, 0x4d, 0xa1, 0x9b, 0xd5, 0xef, 0x49, 0x73, 0x3d, 0x7, 0x6c, 0x56, 0x18, 0x22, 0x84, 0xbe, 0xf0, 0xca, 0x35, 0xf, 0x41, 0x7b, 0xdd, 0xe7, 0xa9, 0x93, 0xf8, 0xc2, 0x8c, 0xb6, 0x10, 0x2a, 0x64, 0x5e, 0xb2, 0x88, 0xc6, 0xfc, 0x5a, 0x60, 0x2e, 0x14, 0x7f, 0x45, 0xb, 0x31, 0x97, 0xad, 0xe3, 0xd9, 0x4c, 0x76, 0x38, 0x2, 0xa4, 0x9e, 0xd0, 0xea, 0x81, 0xbb, 0xf5, 0xcf, 0x69, 0x53, 0x1d, 0x27, 0xcb, 0xf1, 0xbf, 0x85, 0x23, 0x19, 0x57, 0x6d, 0x6, 0x3c, 0x72, 0x48, 0xee, 0xd4, 0x9a, 0xa0, 0x5f, 0x65, 0x2b, 0x11, 0xb7, 0x8d, 0xc3, 0xf9, 0x92, 0xa8, 0xe6, 0xdc, 0x7a, 0x40, 0xe, 0x34, 0xd8, 0xe2, 0xac, 0x96, 0x30, 0xa, 0x44, 0x7e, 0x15, 0x2f, 0x61, 0x5b, 0xfd, 0xc7, 0x89, 0xb3, 0x6a, 0x50, 0x1e, 0x24, 0x82, 0xb8, 0xf6, 0xcc, 0xa7, 0x9d, 0xd3, 0xe9, 0x4f, 0x75, 0x3b, 0x1, 0xed, 0xd7, 0x99, 0xa3, 0x5, 0x3f, 0x71, 0x4b, 0x20, 0x1a, 0x54, 0x6e, 0xc8, 0xf2, 0xbc, 0x86, 0x79, 0x43, 0xd, 0x37, 0x91, 0xab, 0xe5, 0xdf, 0xb4, 0x8e, 0xc0, 0xfa, 0x5c, 0x66, 0x28, 0x12, 0xfe, 0xc4, 0x8a, 0xb0, 0x16, 0x2c, 0x62, 0x58, 0x33, 0x9, 0x47, 0x7d, 0xdb, 0xe1, 0xaf, 0x95}, {0x0, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1, 0xc5, 0xfe, 0xb3, 0x88, 0x29, 0x12, 0x5f, 0x64, 0x97, 0xac, 0xe1, 0xda, 0x7b, 
0x40, 0xd, 0x36, 0x52, 0x69, 0x24, 0x1f, 0xbe, 0x85, 0xc8, 0xf3, 0x33, 0x8, 0x45, 0x7e, 0xdf, 0xe4, 0xa9, 0x92, 0xf6, 0xcd, 0x80, 0xbb, 0x1a, 0x21, 0x6c, 0x57, 0xa4, 0x9f, 0xd2, 0xe9, 0x48, 0x73, 0x3e, 0x5, 0x61, 0x5a, 0x17, 0x2c, 0x8d, 0xb6, 0xfb, 0xc0, 0x66, 0x5d, 0x10, 0x2b, 0x8a, 0xb1, 0xfc, 0xc7, 0xa3, 0x98, 0xd5, 0xee, 0x4f, 0x74, 0x39, 0x2, 0xf1, 0xca, 0x87, 0xbc, 0x1d, 0x26, 0x6b, 0x50, 0x34, 0xf, 0x42, 0x79, 0xd8, 0xe3, 0xae, 0x95, 0x55, 0x6e, 0x23, 0x18, 0xb9, 0x82, 0xcf, 0xf4, 0x90, 0xab, 0xe6, 0xdd, 0x7c, 0x47, 0xa, 0x31, 0xc2, 0xf9, 0xb4, 0x8f, 0x2e, 0x15, 0x58, 0x63, 0x7, 0x3c, 0x71, 0x4a, 0xeb, 0xd0, 0x9d, 0xa6, 0xcc, 0xf7, 0xba, 0x81, 0x20, 0x1b, 0x56, 0x6d, 0x9, 0x32, 0x7f, 0x44, 0xe5, 0xde, 0x93, 0xa8, 0x5b, 0x60, 0x2d, 0x16, 0xb7, 0x8c, 0xc1, 0xfa, 0x9e, 0xa5, 0xe8, 0xd3, 0x72, 0x49, 0x4, 0x3f, 0xff, 0xc4, 0x89, 0xb2, 0x13, 0x28, 0x65, 0x5e, 0x3a, 0x1, 0x4c, 0x77, 0xd6, 0xed, 0xa0, 0x9b, 0x68, 0x53, 0x1e, 0x25, 0x84, 0xbf, 0xf2, 0xc9, 0xad, 0x96, 0xdb, 0xe0, 0x41, 0x7a, 0x37, 0xc, 0xaa, 0x91, 0xdc, 0xe7, 0x46, 0x7d, 0x30, 0xb, 0x6f, 0x54, 0x19, 0x22, 0x83, 0xb8, 0xf5, 0xce, 0x3d, 0x6, 0x4b, 0x70, 0xd1, 0xea, 0xa7, 0x9c, 0xf8, 0xc3, 0x8e, 0xb5, 0x14, 0x2f, 0x62, 0x59, 0x99, 0xa2, 0xef, 0xd4, 0x75, 0x4e, 0x3, 0x38, 0x5c, 0x67, 0x2a, 0x11, 0xb0, 0x8b, 0xc6, 0xfd, 0xe, 0x35, 0x78, 0x43, 0xe2, 0xd9, 0x94, 0xaf, 0xcb, 0xf0, 0xbd, 0x86, 0x27, 0x1c, 0x51, 0x6a}, {0x0, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4, 0xfd, 0xc1, 0x85, 0xb9, 0xd, 0x31, 0x75, 0x49, 0xe7, 0xdb, 0x9f, 0xa3, 0x17, 0x2b, 0x6f, 0x53, 0x1a, 0x26, 0x62, 0x5e, 0xea, 0xd6, 0x92, 0xae, 0xd3, 0xef, 0xab, 0x97, 0x23, 0x1f, 0x5b, 0x67, 0x2e, 0x12, 0x56, 0x6a, 0xde, 0xe2, 0xa6, 0x9a, 0x34, 0x8, 0x4c, 0x70, 0xc4, 0xf8, 0xbc, 0x80, 0xc9, 0xf5, 0xb1, 0x8d, 0x39, 0x5, 0x41, 0x7d, 0xbb, 0x87, 0xc3, 0xff, 0x4b, 0x77, 0x33, 0xf, 0x46, 0x7a, 0x3e, 0x2, 0xb6, 0x8a, 0xce, 0xf2, 0x5c, 0x60, 0x24, 0x18, 0xac, 0x90, 0xd4, 0xe8, 0xa1, 0x9d, 0xd9, 0xe5, 0x51, 0x6d, 0x29, 0x15, 0x68, 0x54, 0x10, 0x2c, 0x98, 0xa4, 0xe0, 0xdc, 0x95, 0xa9, 0xed, 0xd1, 0x65, 0x59, 0x1d, 0x21, 0x8f, 0xb3, 0xf7, 0xcb, 0x7f, 0x43, 0x7, 0x3b, 0x72, 0x4e, 0xa, 0x36, 0x82, 0xbe, 0xfa, 0xc6, 0x6b, 0x57, 0x13, 0x2f, 0x9b, 0xa7, 0xe3, 0xdf, 0x96, 0xaa, 0xee, 0xd2, 0x66, 0x5a, 0x1e, 0x22, 0x8c, 0xb0, 0xf4, 0xc8, 0x7c, 0x40, 0x4, 0x38, 0x71, 0x4d, 0x9, 0x35, 0x81, 0xbd, 0xf9, 0xc5, 0xb8, 0x84, 0xc0, 0xfc, 0x48, 0x74, 0x30, 0xc, 0x45, 0x79, 0x3d, 0x1, 0xb5, 0x89, 0xcd, 0xf1, 0x5f, 0x63, 0x27, 0x1b, 0xaf, 0x93, 0xd7, 0xeb, 0xa2, 0x9e, 0xda, 0xe6, 0x52, 0x6e, 0x2a, 0x16, 0xd0, 0xec, 0xa8, 0x94, 0x20, 0x1c, 0x58, 0x64, 0x2d, 0x11, 0x55, 0x69, 0xdd, 0xe1, 0xa5, 0x99, 0x37, 0xb, 0x4f, 0x73, 0xc7, 0xfb, 0xbf, 0x83, 0xca, 0xf6, 0xb2, 0x8e, 0x3a, 0x6, 0x42, 0x7e, 0x3, 0x3f, 0x7b, 0x47, 0xf3, 0xcf, 0x8b, 0xb7, 0xfe, 0xc2, 0x86, 0xba, 0xe, 0x32, 0x76, 0x4a, 0xe4, 0xd8, 0x9c, 0xa0, 0x14, 0x28, 0x6c, 0x50, 0x19, 0x25, 0x61, 0x5d, 0xe9, 0xd5, 0x91, 0xad}, {0x0, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3, 0xf5, 0xc8, 0x8f, 0xb2, 0x1, 0x3c, 0x7b, 0x46, 0xf7, 0xca, 0x8d, 0xb0, 0x3, 0x3e, 0x79, 0x44, 0x2, 0x3f, 0x78, 0x45, 0xf6, 0xcb, 0x8c, 0xb1, 0xf3, 0xce, 0x89, 0xb4, 0x7, 0x3a, 0x7d, 0x40, 0x6, 0x3b, 0x7c, 0x41, 0xf2, 0xcf, 0x88, 0xb5, 0x4, 0x39, 0x7e, 0x43, 0xf0, 0xcd, 0x8a, 0xb7, 0xf1, 0xcc, 0x8b, 0xb6, 0x5, 0x38, 0x7f, 0x42, 0xfb, 0xc6, 0x81, 0xbc, 0xf, 0x32, 0x75, 0x48, 0xe, 0x33, 0x74, 0x49, 0xfa, 0xc7, 0x80, 0xbd, 0xc, 0x31, 0x76, 0x4b, 0xf8, 0xc5, 0x82, 0xbf, 0xf9, 0xc4, 0x83, 0xbe, 0xd, 0x30, 0x77, 0x4a, 0x8, 0x35, 0x72, 0x4f, 0xfc, 0xc1, 0x86, 0xbb, 0xfd, 0xc0, 0x87, 0xba, 
0x9, 0x34, 0x73, 0x4e, 0xff, 0xc2, 0x85, 0xb8, 0xb, 0x36, 0x71, 0x4c, 0xa, 0x37, 0x70, 0x4d, 0xfe, 0xc3, 0x84, 0xb9, 0xeb, 0xd6, 0x91, 0xac, 0x1f, 0x22, 0x65, 0x58, 0x1e, 0x23, 0x64, 0x59, 0xea, 0xd7, 0x90, 0xad, 0x1c, 0x21, 0x66, 0x5b, 0xe8, 0xd5, 0x92, 0xaf, 0xe9, 0xd4, 0x93, 0xae, 0x1d, 0x20, 0x67, 0x5a, 0x18, 0x25, 0x62, 0x5f, 0xec, 0xd1, 0x96, 0xab, 0xed, 0xd0, 0x97, 0xaa, 0x19, 0x24, 0x63, 0x5e, 0xef, 0xd2, 0x95, 0xa8, 0x1b, 0x26, 0x61, 0x5c, 0x1a, 0x27, 0x60, 0x5d, 0xee, 0xd3, 0x94, 0xa9, 0x10, 0x2d, 0x6a, 0x57, 0xe4, 0xd9, 0x9e, 0xa3, 0xe5, 0xd8, 0x9f, 0xa2, 0x11, 0x2c, 0x6b, 0x56, 0xe7, 0xda, 0x9d, 0xa0, 0x13, 0x2e, 0x69, 0x54, 0x12, 0x2f, 0x68, 0x55, 0xe6, 0xdb, 0x9c, 0xa1, 0xe3, 0xde, 0x99, 0xa4, 0x17, 0x2a, 0x6d, 0x50, 0x16, 0x2b, 0x6c, 0x51, 0xe2, 0xdf, 0x98, 0xa5, 0x14, 0x29, 0x6e, 0x53, 0xe0, 0xdd, 0x9a, 0xa7, 0xe1, 0xdc, 0x9b, 0xa6, 0x15, 0x28, 0x6f, 0x52}, {0x0, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba, 0xed, 0xd3, 0x91, 0xaf, 0x15, 0x2b, 0x69, 0x57, 0xc7, 0xf9, 0xbb, 0x85, 0x3f, 0x1, 0x43, 0x7d, 0x2a, 0x14, 0x56, 0x68, 0xd2, 0xec, 0xae, 0x90, 0x93, 0xad, 0xef, 0xd1, 0x6b, 0x55, 0x17, 0x29, 0x7e, 0x40, 0x2, 0x3c, 0x86, 0xb8, 0xfa, 0xc4, 0x54, 0x6a, 0x28, 0x16, 0xac, 0x92, 0xd0, 0xee, 0xb9, 0x87, 0xc5, 0xfb, 0x41, 0x7f, 0x3d, 0x3, 0x3b, 0x5, 0x47, 0x79, 0xc3, 0xfd, 0xbf, 0x81, 0xd6, 0xe8, 0xaa, 0x94, 0x2e, 0x10, 0x52, 0x6c, 0xfc, 0xc2, 0x80, 0xbe, 0x4, 0x3a, 0x78, 0x46, 0x11, 0x2f, 0x6d, 0x53, 0xe9, 0xd7, 0x95, 0xab, 0xa8, 0x96, 0xd4, 0xea, 0x50, 0x6e, 0x2c, 0x12, 0x45, 0x7b, 0x39, 0x7, 0xbd, 0x83, 0xc1, 0xff, 0x6f, 0x51, 0x13, 0x2d, 0x97, 0xa9, 0xeb, 0xd5, 0x82, 0xbc, 0xfe, 0xc0, 0x7a, 0x44, 0x6, 0x38, 0x76, 0x48, 0xa, 0x34, 0x8e, 0xb0, 0xf2, 0xcc, 0x9b, 0xa5, 0xe7, 0xd9, 0x63, 0x5d, 0x1f, 0x21, 0xb1, 0x8f, 0xcd, 0xf3, 0x49, 0x77, 0x35, 0xb, 0x5c, 0x62, 0x20, 0x1e, 0xa4, 0x9a, 0xd8, 0xe6, 0xe5, 0xdb, 0x99, 0xa7, 0x1d, 0x23, 0x61, 0x5f, 0x8, 0x36, 0x74, 0x4a, 0xf0, 0xce, 0x8c, 0xb2, 0x22, 0x1c, 0x5e, 0x60, 0xda, 0xe4, 0xa6, 0x98, 0xcf, 0xf1, 0xb3, 0x8d, 0x37, 0x9, 0x4b, 0x75, 0x4d, 0x73, 0x31, 0xf, 0xb5, 0x8b, 0xc9, 0xf7, 0xa0, 0x9e, 0xdc, 0xe2, 0x58, 0x66, 0x24, 0x1a, 0x8a, 0xb4, 0xf6, 0xc8, 0x72, 0x4c, 0xe, 0x30, 0x67, 0x59, 0x1b, 0x25, 0x9f, 0xa1, 0xe3, 0xdd, 0xde, 0xe0, 0xa2, 0x9c, 0x26, 0x18, 0x5a, 0x64, 0x33, 0xd, 0x4f, 0x71, 0xcb, 0xf5, 0xb7, 0x89, 0x19, 0x27, 0x65, 0x5b, 0xe1, 0xdf, 0x9d, 0xa3, 0xf4, 0xca, 0x88, 0xb6, 0xc, 0x32, 0x70, 0x4e}, {0x0, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd, 0xe5, 0xda, 0x9b, 0xa4, 0x19, 0x26, 0x67, 0x58, 0xd7, 0xe8, 0xa9, 0x96, 0x2b, 0x14, 0x55, 0x6a, 0x32, 0xd, 0x4c, 0x73, 0xce, 0xf1, 0xb0, 0x8f, 0xb3, 0x8c, 0xcd, 0xf2, 0x4f, 0x70, 0x31, 0xe, 0x56, 0x69, 0x28, 0x17, 0xaa, 0x95, 0xd4, 0xeb, 0x64, 0x5b, 0x1a, 0x25, 0x98, 0xa7, 0xe6, 0xd9, 0x81, 0xbe, 0xff, 0xc0, 0x7d, 0x42, 0x3, 0x3c, 0x7b, 0x44, 0x5, 0x3a, 0x87, 0xb8, 0xf9, 0xc6, 0x9e, 0xa1, 0xe0, 0xdf, 0x62, 0x5d, 0x1c, 0x23, 0xac, 0x93, 0xd2, 0xed, 0x50, 0x6f, 0x2e, 0x11, 0x49, 0x76, 0x37, 0x8, 0xb5, 0x8a, 0xcb, 0xf4, 0xc8, 0xf7, 0xb6, 0x89, 0x34, 0xb, 0x4a, 0x75, 0x2d, 0x12, 0x53, 0x6c, 0xd1, 0xee, 0xaf, 0x90, 0x1f, 0x20, 0x61, 0x5e, 0xe3, 0xdc, 0x9d, 0xa2, 0xfa, 0xc5, 0x84, 0xbb, 0x6, 0x39, 0x78, 0x47, 0xf6, 0xc9, 0x88, 0xb7, 0xa, 0x35, 0x74, 0x4b, 0x13, 0x2c, 0x6d, 0x52, 0xef, 0xd0, 0x91, 0xae, 0x21, 0x1e, 0x5f, 0x60, 0xdd, 0xe2, 0xa3, 0x9c, 0xc4, 0xfb, 0xba, 0x85, 0x38, 0x7, 0x46, 0x79, 0x45, 0x7a, 0x3b, 0x4, 0xb9, 0x86, 0xc7, 0xf8, 0xa0, 0x9f, 0xde, 0xe1, 0x5c, 0x63, 0x22, 0x1d, 0x92, 0xad, 0xec, 0xd3, 0x6e, 0x51, 0x10, 0x2f, 0x77, 0x48, 0x9, 0x36, 0x8b, 0xb4, 0xf5, 0xca, 0x8d, 
0xb2, 0xf3, 0xcc, 0x71, 0x4e, 0xf, 0x30, 0x68, 0x57, 0x16, 0x29, 0x94, 0xab, 0xea, 0xd5, 0x5a, 0x65, 0x24, 0x1b, 0xa6, 0x99, 0xd8, 0xe7, 0xbf, 0x80, 0xc1, 0xfe, 0x43, 0x7c, 0x3d, 0x2, 0x3e, 0x1, 0x40, 0x7f, 0xc2, 0xfd, 0xbc, 0x83, 0xdb, 0xe4, 0xa5, 0x9a, 0x27, 0x18, 0x59, 0x66, 0xe9, 0xd6, 0x97, 0xa8, 0x15, 0x2a, 0x6b, 0x54, 0xc, 0x33, 0x72, 0x4d, 0xf0, 0xcf, 0x8e, 0xb1}, {0x0, 0x40, 0x80, 0xc0, 0x1d, 0x5d, 0x9d, 0xdd, 0x3a, 0x7a, 0xba, 0xfa, 0x27, 0x67, 0xa7, 0xe7, 0x74, 0x34, 0xf4, 0xb4, 0x69, 0x29, 0xe9, 0xa9, 0x4e, 0xe, 0xce, 0x8e, 0x53, 0x13, 0xd3, 0x93, 0xe8, 0xa8, 0x68, 0x28, 0xf5, 0xb5, 0x75, 0x35, 0xd2, 0x92, 0x52, 0x12, 0xcf, 0x8f, 0x4f, 0xf, 0x9c, 0xdc, 0x1c, 0x5c, 0x81, 0xc1, 0x1, 0x41, 0xa6, 0xe6, 0x26, 0x66, 0xbb, 0xfb, 0x3b, 0x7b, 0xcd, 0x8d, 0x4d, 0xd, 0xd0, 0x90, 0x50, 0x10, 0xf7, 0xb7, 0x77, 0x37, 0xea, 0xaa, 0x6a, 0x2a, 0xb9, 0xf9, 0x39, 0x79, 0xa4, 0xe4, 0x24, 0x64, 0x83, 0xc3, 0x3, 0x43, 0x9e, 0xde, 0x1e, 0x5e, 0x25, 0x65, 0xa5, 0xe5, 0x38, 0x78, 0xb8, 0xf8, 0x1f, 0x5f, 0x9f, 0xdf, 0x2, 0x42, 0x82, 0xc2, 0x51, 0x11, 0xd1, 0x91, 0x4c, 0xc, 0xcc, 0x8c, 0x6b, 0x2b, 0xeb, 0xab, 0x76, 0x36, 0xf6, 0xb6, 0x87, 0xc7, 0x7, 0x47, 0x9a, 0xda, 0x1a, 0x5a, 0xbd, 0xfd, 0x3d, 0x7d, 0xa0, 0xe0, 0x20, 0x60, 0xf3, 0xb3, 0x73, 0x33, 0xee, 0xae, 0x6e, 0x2e, 0xc9, 0x89, 0x49, 0x9, 0xd4, 0x94, 0x54, 0x14, 0x6f, 0x2f, 0xef, 0xaf, 0x72, 0x32, 0xf2, 0xb2, 0x55, 0x15, 0xd5, 0x95, 0x48, 0x8, 0xc8, 0x88, 0x1b, 0x5b, 0x9b, 0xdb, 0x6, 0x46, 0x86, 0xc6, 0x21, 0x61, 0xa1, 0xe1, 0x3c, 0x7c, 0xbc, 0xfc, 0x4a, 0xa, 0xca, 0x8a, 0x57, 0x17, 0xd7, 0x97, 0x70, 0x30, 0xf0, 0xb0, 0x6d, 0x2d, 0xed, 0xad, 0x3e, 0x7e, 0xbe, 0xfe, 0x23, 0x63, 0xa3, 0xe3, 0x4, 0x44, 0x84, 0xc4, 0x19, 0x59, 0x99, 0xd9, 0xa2, 0xe2, 0x22, 0x62, 0xbf, 0xff, 0x3f, 0x7f, 0x98, 0xd8, 0x18, 0x58, 0x85, 0xc5, 0x5, 0x45, 0xd6, 0x96, 0x56, 0x16, 0xcb, 0x8b, 0x4b, 0xb, 0xec, 0xac, 0x6c, 0x2c, 0xf1, 0xb1, 0x71, 0x31}, {0x0, 0x41, 0x82, 0xc3, 0x19, 0x58, 0x9b, 0xda, 0x32, 0x73, 0xb0, 0xf1, 0x2b, 0x6a, 0xa9, 0xe8, 0x64, 0x25, 0xe6, 0xa7, 0x7d, 0x3c, 0xff, 0xbe, 0x56, 0x17, 0xd4, 0x95, 0x4f, 0xe, 0xcd, 0x8c, 0xc8, 0x89, 0x4a, 0xb, 0xd1, 0x90, 0x53, 0x12, 0xfa, 0xbb, 0x78, 0x39, 0xe3, 0xa2, 0x61, 0x20, 0xac, 0xed, 0x2e, 0x6f, 0xb5, 0xf4, 0x37, 0x76, 0x9e, 0xdf, 0x1c, 0x5d, 0x87, 0xc6, 0x5, 0x44, 0x8d, 0xcc, 0xf, 0x4e, 0x94, 0xd5, 0x16, 0x57, 0xbf, 0xfe, 0x3d, 0x7c, 0xa6, 0xe7, 0x24, 0x65, 0xe9, 0xa8, 0x6b, 0x2a, 0xf0, 0xb1, 0x72, 0x33, 0xdb, 0x9a, 0x59, 0x18, 0xc2, 0x83, 0x40, 0x1, 0x45, 0x4, 0xc7, 0x86, 0x5c, 0x1d, 0xde, 0x9f, 0x77, 0x36, 0xf5, 0xb4, 0x6e, 0x2f, 0xec, 0xad, 0x21, 0x60, 0xa3, 0xe2, 0x38, 0x79, 0xba, 0xfb, 0x13, 0x52, 0x91, 0xd0, 0xa, 0x4b, 0x88, 0xc9, 0x7, 0x46, 0x85, 0xc4, 0x1e, 0x5f, 0x9c, 0xdd, 0x35, 0x74, 0xb7, 0xf6, 0x2c, 0x6d, 0xae, 0xef, 0x63, 0x22, 0xe1, 0xa0, 0x7a, 0x3b, 0xf8, 0xb9, 0x51, 0x10, 0xd3, 0x92, 0x48, 0x9, 0xca, 0x8b, 0xcf, 0x8e, 0x4d, 0xc, 0xd6, 0x97, 0x54, 0x15, 0xfd, 0xbc, 0x7f, 0x3e, 0xe4, 0xa5, 0x66, 0x27, 0xab, 0xea, 0x29, 0x68, 0xb2, 0xf3, 0x30, 0x71, 0x99, 0xd8, 0x1b, 0x5a, 0x80, 0xc1, 0x2, 0x43, 0x8a, 0xcb, 0x8, 0x49, 0x93, 0xd2, 0x11, 0x50, 0xb8, 0xf9, 0x3a, 0x7b, 0xa1, 0xe0, 0x23, 0x62, 0xee, 0xaf, 0x6c, 0x2d, 0xf7, 0xb6, 0x75, 0x34, 0xdc, 0x9d, 0x5e, 0x1f, 0xc5, 0x84, 0x47, 0x6, 0x42, 0x3, 0xc0, 0x81, 0x5b, 0x1a, 0xd9, 0x98, 0x70, 0x31, 0xf2, 0xb3, 0x69, 0x28, 0xeb, 0xaa, 0x26, 0x67, 0xa4, 0xe5, 0x3f, 0x7e, 0xbd, 0xfc, 0x14, 0x55, 0x96, 0xd7, 0xd, 0x4c, 0x8f, 0xce}, {0x0, 0x42, 0x84, 0xc6, 0x15, 0x57, 0x91, 0xd3, 0x2a, 0x68, 0xae, 0xec, 0x3f, 0x7d, 0xbb, 0xf9, 0x54, 0x16, 0xd0, 0x92, 0x41, 0x3, 
0xc5, 0x87, 0x7e, 0x3c, 0xfa, 0xb8, 0x6b, 0x29, 0xef, 0xad, 0xa8, 0xea, 0x2c, 0x6e, 0xbd, 0xff, 0x39, 0x7b, 0x82, 0xc0, 0x6, 0x44, 0x97, 0xd5, 0x13, 0x51, 0xfc, 0xbe, 0x78, 0x3a, 0xe9, 0xab, 0x6d, 0x2f, 0xd6, 0x94, 0x52, 0x10, 0xc3, 0x81, 0x47, 0x5, 0x4d, 0xf, 0xc9, 0x8b, 0x58, 0x1a, 0xdc, 0x9e, 0x67, 0x25, 0xe3, 0xa1, 0x72, 0x30, 0xf6, 0xb4, 0x19, 0x5b, 0x9d, 0xdf, 0xc, 0x4e, 0x88, 0xca, 0x33, 0x71, 0xb7, 0xf5, 0x26, 0x64, 0xa2, 0xe0, 0xe5, 0xa7, 0x61, 0x23, 0xf0, 0xb2, 0x74, 0x36, 0xcf, 0x8d, 0x4b, 0x9, 0xda, 0x98, 0x5e, 0x1c, 0xb1, 0xf3, 0x35, 0x77, 0xa4, 0xe6, 0x20, 0x62, 0x9b, 0xd9, 0x1f, 0x5d, 0x8e, 0xcc, 0xa, 0x48, 0x9a, 0xd8, 0x1e, 0x5c, 0x8f, 0xcd, 0xb, 0x49, 0xb0, 0xf2, 0x34, 0x76, 0xa5, 0xe7, 0x21, 0x63, 0xce, 0x8c, 0x4a, 0x8, 0xdb, 0x99, 0x5f, 0x1d, 0xe4, 0xa6, 0x60, 0x22, 0xf1, 0xb3, 0x75, 0x37, 0x32, 0x70, 0xb6, 0xf4, 0x27, 0x65, 0xa3, 0xe1, 0x18, 0x5a, 0x9c, 0xde, 0xd, 0x4f, 0x89, 0xcb, 0x66, 0x24, 0xe2, 0xa0, 0x73, 0x31, 0xf7, 0xb5, 0x4c, 0xe, 0xc8, 0x8a, 0x59, 0x1b, 0xdd, 0x9f, 0xd7, 0x95, 0x53, 0x11, 0xc2, 0x80, 0x46, 0x4, 0xfd, 0xbf, 0x79, 0x3b, 0xe8, 0xaa, 0x6c, 0x2e, 0x83, 0xc1, 0x7, 0x45, 0x96, 0xd4, 0x12, 0x50, 0xa9, 0xeb, 0x2d, 0x6f, 0xbc, 0xfe, 0x38, 0x7a, 0x7f, 0x3d, 0xfb, 0xb9, 0x6a, 0x28, 0xee, 0xac, 0x55, 0x17, 0xd1, 0x93, 0x40, 0x2, 0xc4, 0x86, 0x2b, 0x69, 0xaf, 0xed, 0x3e, 0x7c, 0xba, 0xf8, 0x1, 0x43, 0x85, 0xc7, 0x14, 0x56, 0x90, 0xd2}, {0x0, 0x43, 0x86, 0xc5, 0x11, 0x52, 0x97, 0xd4, 0x22, 0x61, 0xa4, 0xe7, 0x33, 0x70, 0xb5, 0xf6, 0x44, 0x7, 0xc2, 0x81, 0x55, 0x16, 0xd3, 0x90, 0x66, 0x25, 0xe0, 0xa3, 0x77, 0x34, 0xf1, 0xb2, 0x88, 0xcb, 0xe, 0x4d, 0x99, 0xda, 0x1f, 0x5c, 0xaa, 0xe9, 0x2c, 0x6f, 0xbb, 0xf8, 0x3d, 0x7e, 0xcc, 0x8f, 0x4a, 0x9, 0xdd, 0x9e, 0x5b, 0x18, 0xee, 0xad, 0x68, 0x2b, 0xff, 0xbc, 0x79, 0x3a, 0xd, 0x4e, 0x8b, 0xc8, 0x1c, 0x5f, 0x9a, 0xd9, 0x2f, 0x6c, 0xa9, 0xea, 0x3e, 0x7d, 0xb8, 0xfb, 0x49, 0xa, 0xcf, 0x8c, 0x58, 0x1b, 0xde, 0x9d, 0x6b, 0x28, 0xed, 0xae, 0x7a, 0x39, 0xfc, 0xbf, 0x85, 0xc6, 0x3, 0x40, 0x94, 0xd7, 0x12, 0x51, 0xa7, 0xe4, 0x21, 0x62, 0xb6, 0xf5, 0x30, 0x73, 0xc1, 0x82, 0x47, 0x4, 0xd0, 0x93, 0x56, 0x15, 0xe3, 0xa0, 0x65, 0x26, 0xf2, 0xb1, 0x74, 0x37, 0x1a, 0x59, 0x9c, 0xdf, 0xb, 0x48, 0x8d, 0xce, 0x38, 0x7b, 0xbe, 0xfd, 0x29, 0x6a, 0xaf, 0xec, 0x5e, 0x1d, 0xd8, 0x9b, 0x4f, 0xc, 0xc9, 0x8a, 0x7c, 0x3f, 0xfa, 0xb9, 0x6d, 0x2e, 0xeb, 0xa8, 0x92, 0xd1, 0x14, 0x57, 0x83, 0xc0, 0x5, 0x46, 0xb0, 0xf3, 0x36, 0x75, 0xa1, 0xe2, 0x27, 0x64, 0xd6, 0x95, 0x50, 0x13, 0xc7, 0x84, 0x41, 0x2, 0xf4, 0xb7, 0x72, 0x31, 0xe5, 0xa6, 0x63, 0x20, 0x17, 0x54, 0x91, 0xd2, 0x6, 0x45, 0x80, 0xc3, 0x35, 0x76, 0xb3, 0xf0, 0x24, 0x67, 0xa2, 0xe1, 0x53, 0x10, 0xd5, 0x96, 0x42, 0x1, 0xc4, 0x87, 0x71, 0x32, 0xf7, 0xb4, 0x60, 0x23, 0xe6, 0xa5, 0x9f, 0xdc, 0x19, 0x5a, 0x8e, 0xcd, 0x8, 0x4b, 0xbd, 0xfe, 0x3b, 0x78, 0xac, 0xef, 0x2a, 0x69, 0xdb, 0x98, 0x5d, 0x1e, 0xca, 0x89, 0x4c, 0xf, 0xf9, 0xba, 0x7f, 0x3c, 0xe8, 0xab, 0x6e, 0x2d}, {0x0, 0x44, 0x88, 0xcc, 0xd, 0x49, 0x85, 0xc1, 0x1a, 0x5e, 0x92, 0xd6, 0x17, 0x53, 0x9f, 0xdb, 0x34, 0x70, 0xbc, 0xf8, 0x39, 0x7d, 0xb1, 0xf5, 0x2e, 0x6a, 0xa6, 0xe2, 0x23, 0x67, 0xab, 0xef, 0x68, 0x2c, 0xe0, 0xa4, 0x65, 0x21, 0xed, 0xa9, 0x72, 0x36, 0xfa, 0xbe, 0x7f, 0x3b, 0xf7, 0xb3, 0x5c, 0x18, 0xd4, 0x90, 0x51, 0x15, 0xd9, 0x9d, 0x46, 0x2, 0xce, 0x8a, 0x4b, 0xf, 0xc3, 0x87, 0xd0, 0x94, 0x58, 0x1c, 0xdd, 0x99, 0x55, 0x11, 0xca, 0x8e, 0x42, 0x6, 0xc7, 0x83, 0x4f, 0xb, 0xe4, 0xa0, 0x6c, 0x28, 0xe9, 0xad, 0x61, 0x25, 0xfe, 0xba, 0x76, 0x32, 0xf3, 0xb7, 0x7b, 0x3f, 0xb8, 0xfc, 0x30, 0x74, 0xb5, 0xf1, 0x3d, 0x79, 0xa2, 0xe6, 0x2a, 
0x6e, 0xaf, 0xeb, 0x27, 0x63, 0x8c, 0xc8, 0x4, 0x40, 0x81, 0xc5, 0x9, 0x4d, 0x96, 0xd2, 0x1e, 0x5a, 0x9b, 0xdf, 0x13, 0x57, 0xbd, 0xf9, 0x35, 0x71, 0xb0, 0xf4, 0x38, 0x7c, 0xa7, 0xe3, 0x2f, 0x6b, 0xaa, 0xee, 0x22, 0x66, 0x89, 0xcd, 0x1, 0x45, 0x84, 0xc0, 0xc, 0x48, 0x93, 0xd7, 0x1b, 0x5f, 0x9e, 0xda, 0x16, 0x52, 0xd5, 0x91, 0x5d, 0x19, 0xd8, 0x9c, 0x50, 0x14, 0xcf, 0x8b, 0x47, 0x3, 0xc2, 0x86, 0x4a, 0xe, 0xe1, 0xa5, 0x69, 0x2d, 0xec, 0xa8, 0x64, 0x20, 0xfb, 0xbf, 0x73, 0x37, 0xf6, 0xb2, 0x7e, 0x3a, 0x6d, 0x29, 0xe5, 0xa1, 0x60, 0x24, 0xe8, 0xac, 0x77, 0x33, 0xff, 0xbb, 0x7a, 0x3e, 0xf2, 0xb6, 0x59, 0x1d, 0xd1, 0x95, 0x54, 0x10, 0xdc, 0x98, 0x43, 0x7, 0xcb, 0x8f, 0x4e, 0xa, 0xc6, 0x82, 0x5, 0x41, 0x8d, 0xc9, 0x8, 0x4c, 0x80, 0xc4, 0x1f, 0x5b, 0x97, 0xd3, 0x12, 0x56, 0x9a, 0xde, 0x31, 0x75, 0xb9, 0xfd, 0x3c, 0x78, 0xb4, 0xf0, 0x2b, 0x6f, 0xa3, 0xe7, 0x26, 0x62, 0xae, 0xea}, {0x0, 0x45, 0x8a, 0xcf, 0x9, 0x4c, 0x83, 0xc6, 0x12, 0x57, 0x98, 0xdd, 0x1b, 0x5e, 0x91, 0xd4, 0x24, 0x61, 0xae, 0xeb, 0x2d, 0x68, 0xa7, 0xe2, 0x36, 0x73, 0xbc, 0xf9, 0x3f, 0x7a, 0xb5, 0xf0, 0x48, 0xd, 0xc2, 0x87, 0x41, 0x4, 0xcb, 0x8e, 0x5a, 0x1f, 0xd0, 0x95, 0x53, 0x16, 0xd9, 0x9c, 0x6c, 0x29, 0xe6, 0xa3, 0x65, 0x20, 0xef, 0xaa, 0x7e, 0x3b, 0xf4, 0xb1, 0x77, 0x32, 0xfd, 0xb8, 0x90, 0xd5, 0x1a, 0x5f, 0x99, 0xdc, 0x13, 0x56, 0x82, 0xc7, 0x8, 0x4d, 0x8b, 0xce, 0x1, 0x44, 0xb4, 0xf1, 0x3e, 0x7b, 0xbd, 0xf8, 0x37, 0x72, 0xa6, 0xe3, 0x2c, 0x69, 0xaf, 0xea, 0x25, 0x60, 0xd8, 0x9d, 0x52, 0x17, 0xd1, 0x94, 0x5b, 0x1e, 0xca, 0x8f, 0x40, 0x5, 0xc3, 0x86, 0x49, 0xc, 0xfc, 0xb9, 0x76, 0x33, 0xf5, 0xb0, 0x7f, 0x3a, 0xee, 0xab, 0x64, 0x21, 0xe7, 0xa2, 0x6d, 0x28, 0x3d, 0x78, 0xb7, 0xf2, 0x34, 0x71, 0xbe, 0xfb, 0x2f, 0x6a, 0xa5, 0xe0, 0x26, 0x63, 0xac, 0xe9, 0x19, 0x5c, 0x93, 0xd6, 0x10, 0x55, 0x9a, 0xdf, 0xb, 0x4e, 0x81, 0xc4, 0x2, 0x47, 0x88, 0xcd, 0x75, 0x30, 0xff, 0xba, 0x7c, 0x39, 0xf6, 0xb3, 0x67, 0x22, 0xed, 0xa8, 0x6e, 0x2b, 0xe4, 0xa1, 0x51, 0x14, 0xdb, 0x9e, 0x58, 0x1d, 0xd2, 0x97, 0x43, 0x6, 0xc9, 0x8c, 0x4a, 0xf, 0xc0, 0x85, 0xad, 0xe8, 0x27, 0x62, 0xa4, 0xe1, 0x2e, 0x6b, 0xbf, 0xfa, 0x35, 0x70, 0xb6, 0xf3, 0x3c, 0x79, 0x89, 0xcc, 0x3, 0x46, 0x80, 0xc5, 0xa, 0x4f, 0x9b, 0xde, 0x11, 0x54, 0x92, 0xd7, 0x18, 0x5d, 0xe5, 0xa0, 0x6f, 0x2a, 0xec, 0xa9, 0x66, 0x23, 0xf7, 0xb2, 0x7d, 0x38, 0xfe, 0xbb, 0x74, 0x31, 0xc1, 0x84, 0x4b, 0xe, 0xc8, 0x8d, 0x42, 0x7, 0xd3, 0x96, 0x59, 0x1c, 0xda, 0x9f, 0x50, 0x15}, {0x0, 0x46, 0x8c, 0xca, 0x5, 0x43, 0x89, 0xcf, 0xa, 0x4c, 0x86, 0xc0, 0xf, 0x49, 0x83, 0xc5, 0x14, 0x52, 0x98, 0xde, 0x11, 0x57, 0x9d, 0xdb, 0x1e, 0x58, 0x92, 0xd4, 0x1b, 0x5d, 0x97, 0xd1, 0x28, 0x6e, 0xa4, 0xe2, 0x2d, 0x6b, 0xa1, 0xe7, 0x22, 0x64, 0xae, 0xe8, 0x27, 0x61, 0xab, 0xed, 0x3c, 0x7a, 0xb0, 0xf6, 0x39, 0x7f, 0xb5, 0xf3, 0x36, 0x70, 0xba, 0xfc, 0x33, 0x75, 0xbf, 0xf9, 0x50, 0x16, 0xdc, 0x9a, 0x55, 0x13, 0xd9, 0x9f, 0x5a, 0x1c, 0xd6, 0x90, 0x5f, 0x19, 0xd3, 0x95, 0x44, 0x2, 0xc8, 0x8e, 0x41, 0x7, 0xcd, 0x8b, 0x4e, 0x8, 0xc2, 0x84, 0x4b, 0xd, 0xc7, 0x81, 0x78, 0x3e, 0xf4, 0xb2, 0x7d, 0x3b, 0xf1, 0xb7, 0x72, 0x34, 0xfe, 0xb8, 0x77, 0x31, 0xfb, 0xbd, 0x6c, 0x2a, 0xe0, 0xa6, 0x69, 0x2f, 0xe5, 0xa3, 0x66, 0x20, 0xea, 0xac, 0x63, 0x25, 0xef, 0xa9, 0xa0, 0xe6, 0x2c, 0x6a, 0xa5, 0xe3, 0x29, 0x6f, 0xaa, 0xec, 0x26, 0x60, 0xaf, 0xe9, 0x23, 0x65, 0xb4, 0xf2, 0x38, 0x7e, 0xb1, 0xf7, 0x3d, 0x7b, 0xbe, 0xf8, 0x32, 0x74, 0xbb, 0xfd, 0x37, 0x71, 0x88, 0xce, 0x4, 0x42, 0x8d, 0xcb, 0x1, 0x47, 0x82, 0xc4, 0xe, 0x48, 0x87, 0xc1, 0xb, 0x4d, 0x9c, 0xda, 0x10, 0x56, 0x99, 0xdf, 0x15, 0x53, 0x96, 0xd0, 0x1a, 0x5c, 0x93, 0xd5, 0x1f, 0x59, 0xf0, 
0xb6, 0x7c, 0x3a, 0xf5, 0xb3, 0x79, 0x3f, 0xfa, 0xbc, 0x76, 0x30, 0xff, 0xb9, 0x73, 0x35, 0xe4, 0xa2, 0x68, 0x2e, 0xe1, 0xa7, 0x6d, 0x2b, 0xee, 0xa8, 0x62, 0x24, 0xeb, 0xad, 0x67, 0x21, 0xd8, 0x9e, 0x54, 0x12, 0xdd, 0x9b, 0x51, 0x17, 0xd2, 0x94, 0x5e, 0x18, 0xd7, 0x91, 0x5b, 0x1d, 0xcc, 0x8a, 0x40, 0x6, 0xc9, 0x8f, 0x45, 0x3, 0xc6, 0x80, 0x4a, 0xc, 0xc3, 0x85, 0x4f, 0x9}, {0x0, 0x47, 0x8e, 0xc9, 0x1, 0x46, 0x8f, 0xc8, 0x2, 0x45, 0x8c, 0xcb, 0x3, 0x44, 0x8d, 0xca, 0x4, 0x43, 0x8a, 0xcd, 0x5, 0x42, 0x8b, 0xcc, 0x6, 0x41, 0x88, 0xcf, 0x7, 0x40, 0x89, 0xce, 0x8, 0x4f, 0x86, 0xc1, 0x9, 0x4e, 0x87, 0xc0, 0xa, 0x4d, 0x84, 0xc3, 0xb, 0x4c, 0x85, 0xc2, 0xc, 0x4b, 0x82, 0xc5, 0xd, 0x4a, 0x83, 0xc4, 0xe, 0x49, 0x80, 0xc7, 0xf, 0x48, 0x81, 0xc6, 0x10, 0x57, 0x9e, 0xd9, 0x11, 0x56, 0x9f, 0xd8, 0x12, 0x55, 0x9c, 0xdb, 0x13, 0x54, 0x9d, 0xda, 0x14, 0x53, 0x9a, 0xdd, 0x15, 0x52, 0x9b, 0xdc, 0x16, 0x51, 0x98, 0xdf, 0x17, 0x50, 0x99, 0xde, 0x18, 0x5f, 0x96, 0xd1, 0x19, 0x5e, 0x97, 0xd0, 0x1a, 0x5d, 0x94, 0xd3, 0x1b, 0x5c, 0x95, 0xd2, 0x1c, 0x5b, 0x92, 0xd5, 0x1d, 0x5a, 0x93, 0xd4, 0x1e, 0x59, 0x90, 0xd7, 0x1f, 0x58, 0x91, 0xd6, 0x20, 0x67, 0xae, 0xe9, 0x21, 0x66, 0xaf, 0xe8, 0x22, 0x65, 0xac, 0xeb, 0x23, 0x64, 0xad, 0xea, 0x24, 0x63, 0xaa, 0xed, 0x25, 0x62, 0xab, 0xec, 0x26, 0x61, 0xa8, 0xef, 0x27, 0x60, 0xa9, 0xee, 0x28, 0x6f, 0xa6, 0xe1, 0x29, 0x6e, 0xa7, 0xe0, 0x2a, 0x6d, 0xa4, 0xe3, 0x2b, 0x6c, 0xa5, 0xe2, 0x2c, 0x6b, 0xa2, 0xe5, 0x2d, 0x6a, 0xa3, 0xe4, 0x2e, 0x69, 0xa0, 0xe7, 0x2f, 0x68, 0xa1, 0xe6, 0x30, 0x77, 0xbe, 0xf9, 0x31, 0x76, 0xbf, 0xf8, 0x32, 0x75, 0xbc, 0xfb, 0x33, 0x74, 0xbd, 0xfa, 0x34, 0x73, 0xba, 0xfd, 0x35, 0x72, 0xbb, 0xfc, 0x36, 0x71, 0xb8, 0xff, 0x37, 0x70, 0xb9, 0xfe, 0x38, 0x7f, 0xb6, 0xf1, 0x39, 0x7e, 0xb7, 0xf0, 0x3a, 0x7d, 0xb4, 0xf3, 0x3b, 0x7c, 0xb5, 0xf2, 0x3c, 0x7b, 0xb2, 0xf5, 0x3d, 0x7a, 0xb3, 0xf4, 0x3e, 0x79, 0xb0, 0xf7, 0x3f, 0x78, 0xb1, 0xf6}, {0x0, 0x48, 0x90, 0xd8, 0x3d, 0x75, 0xad, 0xe5, 0x7a, 0x32, 0xea, 0xa2, 0x47, 0xf, 0xd7, 0x9f, 0xf4, 0xbc, 0x64, 0x2c, 0xc9, 0x81, 0x59, 0x11, 0x8e, 0xc6, 0x1e, 0x56, 0xb3, 0xfb, 0x23, 0x6b, 0xf5, 0xbd, 0x65, 0x2d, 0xc8, 0x80, 0x58, 0x10, 0x8f, 0xc7, 0x1f, 0x57, 0xb2, 0xfa, 0x22, 0x6a, 0x1, 0x49, 0x91, 0xd9, 0x3c, 0x74, 0xac, 0xe4, 0x7b, 0x33, 0xeb, 0xa3, 0x46, 0xe, 0xd6, 0x9e, 0xf7, 0xbf, 0x67, 0x2f, 0xca, 0x82, 0x5a, 0x12, 0x8d, 0xc5, 0x1d, 0x55, 0xb0, 0xf8, 0x20, 0x68, 0x3, 0x4b, 0x93, 0xdb, 0x3e, 0x76, 0xae, 0xe6, 0x79, 0x31, 0xe9, 0xa1, 0x44, 0xc, 0xd4, 0x9c, 0x2, 0x4a, 0x92, 0xda, 0x3f, 0x77, 0xaf, 0xe7, 0x78, 0x30, 0xe8, 0xa0, 0x45, 0xd, 0xd5, 0x9d, 0xf6, 0xbe, 0x66, 0x2e, 0xcb, 0x83, 0x5b, 0x13, 0x8c, 0xc4, 0x1c, 0x54, 0xb1, 0xf9, 0x21, 0x69, 0xf3, 0xbb, 0x63, 0x2b, 0xce, 0x86, 0x5e, 0x16, 0x89, 0xc1, 0x19, 0x51, 0xb4, 0xfc, 0x24, 0x6c, 0x7, 0x4f, 0x97, 0xdf, 0x3a, 0x72, 0xaa, 0xe2, 0x7d, 0x35, 0xed, 0xa5, 0x40, 0x8, 0xd0, 0x98, 0x6, 0x4e, 0x96, 0xde, 0x3b, 0x73, 0xab, 0xe3, 0x7c, 0x34, 0xec, 0xa4, 0x41, 0x9, 0xd1, 0x99, 0xf2, 0xba, 0x62, 0x2a, 0xcf, 0x87, 0x5f, 0x17, 0x88, 0xc0, 0x18, 0x50, 0xb5, 0xfd, 0x25, 0x6d, 0x4, 0x4c, 0x94, 0xdc, 0x39, 0x71, 0xa9, 0xe1, 0x7e, 0x36, 0xee, 0xa6, 0x43, 0xb, 0xd3, 0x9b, 0xf0, 0xb8, 0x60, 0x28, 0xcd, 0x85, 0x5d, 0x15, 0x8a, 0xc2, 0x1a, 0x52, 0xb7, 0xff, 0x27, 0x6f, 0xf1, 0xb9, 0x61, 0x29, 0xcc, 0x84, 0x5c, 0x14, 0x8b, 0xc3, 0x1b, 0x53, 0xb6, 0xfe, 0x26, 0x6e, 0x5, 0x4d, 0x95, 0xdd, 0x38, 0x70, 0xa8, 0xe0, 0x7f, 0x37, 0xef, 0xa7, 0x42, 0xa, 0xd2, 0x9a}, {0x0, 0x49, 0x92, 0xdb, 0x39, 0x70, 0xab, 0xe2, 0x72, 0x3b, 0xe0, 0xa9, 0x4b, 0x2, 0xd9, 0x90, 0xe4, 0xad, 0x76, 0x3f, 0xdd, 0x94, 
0x4f, 0x6, 0x96, 0xdf, 0x4, 0x4d, 0xaf, 0xe6, 0x3d, 0x74, 0xd5, 0x9c, 0x47, 0xe, 0xec, 0xa5, 0x7e, 0x37, 0xa7, 0xee, 0x35, 0x7c, 0x9e, 0xd7, 0xc, 0x45, 0x31, 0x78, 0xa3, 0xea, 0x8, 0x41, 0x9a, 0xd3, 0x43, 0xa, 0xd1, 0x98, 0x7a, 0x33, 0xe8, 0xa1, 0xb7, 0xfe, 0x25, 0x6c, 0x8e, 0xc7, 0x1c, 0x55, 0xc5, 0x8c, 0x57, 0x1e, 0xfc, 0xb5, 0x6e, 0x27, 0x53, 0x1a, 0xc1, 0x88, 0x6a, 0x23, 0xf8, 0xb1, 0x21, 0x68, 0xb3, 0xfa, 0x18, 0x51, 0x8a, 0xc3, 0x62, 0x2b, 0xf0, 0xb9, 0x5b, 0x12, 0xc9, 0x80, 0x10, 0x59, 0x82, 0xcb, 0x29, 0x60, 0xbb, 0xf2, 0x86, 0xcf, 0x14, 0x5d, 0xbf, 0xf6, 0x2d, 0x64, 0xf4, 0xbd, 0x66, 0x2f, 0xcd, 0x84, 0x5f, 0x16, 0x73, 0x3a, 0xe1, 0xa8, 0x4a, 0x3, 0xd8, 0x91, 0x1, 0x48, 0x93, 0xda, 0x38, 0x71, 0xaa, 0xe3, 0x97, 0xde, 0x5, 0x4c, 0xae, 0xe7, 0x3c, 0x75, 0xe5, 0xac, 0x77, 0x3e, 0xdc, 0x95, 0x4e, 0x7, 0xa6, 0xef, 0x34, 0x7d, 0x9f, 0xd6, 0xd, 0x44, 0xd4, 0x9d, 0x46, 0xf, 0xed, 0xa4, 0x7f, 0x36, 0x42, 0xb, 0xd0, 0x99, 0x7b, 0x32, 0xe9, 0xa0, 0x30, 0x79, 0xa2, 0xeb, 0x9, 0x40, 0x9b, 0xd2, 0xc4, 0x8d, 0x56, 0x1f, 0xfd, 0xb4, 0x6f, 0x26, 0xb6, 0xff, 0x24, 0x6d, 0x8f, 0xc6, 0x1d, 0x54, 0x20, 0x69, 0xb2, 0xfb, 0x19, 0x50, 0x8b, 0xc2, 0x52, 0x1b, 0xc0, 0x89, 0x6b, 0x22, 0xf9, 0xb0, 0x11, 0x58, 0x83, 0xca, 0x28, 0x61, 0xba, 0xf3, 0x63, 0x2a, 0xf1, 0xb8, 0x5a, 0x13, 0xc8, 0x81, 0xf5, 0xbc, 0x67, 0x2e, 0xcc, 0x85, 0x5e, 0x17, 0x87, 0xce, 0x15, 0x5c, 0xbe, 0xf7, 0x2c, 0x65}, {0x0, 0x4a, 0x94, 0xde, 0x35, 0x7f, 0xa1, 0xeb, 0x6a, 0x20, 0xfe, 0xb4, 0x5f, 0x15, 0xcb, 0x81, 0xd4, 0x9e, 0x40, 0xa, 0xe1, 0xab, 0x75, 0x3f, 0xbe, 0xf4, 0x2a, 0x60, 0x8b, 0xc1, 0x1f, 0x55, 0xb5, 0xff, 0x21, 0x6b, 0x80, 0xca, 0x14, 0x5e, 0xdf, 0x95, 0x4b, 0x1, 0xea, 0xa0, 0x7e, 0x34, 0x61, 0x2b, 0xf5, 0xbf, 0x54, 0x1e, 0xc0, 0x8a, 0xb, 0x41, 0x9f, 0xd5, 0x3e, 0x74, 0xaa, 0xe0, 0x77, 0x3d, 0xe3, 0xa9, 0x42, 0x8, 0xd6, 0x9c, 0x1d, 0x57, 0x89, 0xc3, 0x28, 0x62, 0xbc, 0xf6, 0xa3, 0xe9, 0x37, 0x7d, 0x96, 0xdc, 0x2, 0x48, 0xc9, 0x83, 0x5d, 0x17, 0xfc, 0xb6, 0x68, 0x22, 0xc2, 0x88, 0x56, 0x1c, 0xf7, 0xbd, 0x63, 0x29, 0xa8, 0xe2, 0x3c, 0x76, 0x9d, 0xd7, 0x9, 0x43, 0x16, 0x5c, 0x82, 0xc8, 0x23, 0x69, 0xb7, 0xfd, 0x7c, 0x36, 0xe8, 0xa2, 0x49, 0x3, 0xdd, 0x97, 0xee, 0xa4, 0x7a, 0x30, 0xdb, 0x91, 0x4f, 0x5, 0x84, 0xce, 0x10, 0x5a, 0xb1, 0xfb, 0x25, 0x6f, 0x3a, 0x70, 0xae, 0xe4, 0xf, 0x45, 0x9b, 0xd1, 0x50, 0x1a, 0xc4, 0x8e, 0x65, 0x2f, 0xf1, 0xbb, 0x5b, 0x11, 0xcf, 0x85, 0x6e, 0x24, 0xfa, 0xb0, 0x31, 0x7b, 0xa5, 0xef, 0x4, 0x4e, 0x90, 0xda, 0x8f, 0xc5, 0x1b, 0x51, 0xba, 0xf0, 0x2e, 0x64, 0xe5, 0xaf, 0x71, 0x3b, 0xd0, 0x9a, 0x44, 0xe, 0x99, 0xd3, 0xd, 0x47, 0xac, 0xe6, 0x38, 0x72, 0xf3, 0xb9, 0x67, 0x2d, 0xc6, 0x8c, 0x52, 0x18, 0x4d, 0x7, 0xd9, 0x93, 0x78, 0x32, 0xec, 0xa6, 0x27, 0x6d, 0xb3, 0xf9, 0x12, 0x58, 0x86, 0xcc, 0x2c, 0x66, 0xb8, 0xf2, 0x19, 0x53, 0x8d, 0xc7, 0x46, 0xc, 0xd2, 0x98, 0x73, 0x39, 0xe7, 0xad, 0xf8, 0xb2, 0x6c, 0x26, 0xcd, 0x87, 0x59, 0x13, 0x92, 0xd8, 0x6, 0x4c, 0xa7, 0xed, 0x33, 0x79}, {0x0, 0x4b, 0x96, 0xdd, 0x31, 0x7a, 0xa7, 0xec, 0x62, 0x29, 0xf4, 0xbf, 0x53, 0x18, 0xc5, 0x8e, 0xc4, 0x8f, 0x52, 0x19, 0xf5, 0xbe, 0x63, 0x28, 0xa6, 0xed, 0x30, 0x7b, 0x97, 0xdc, 0x1, 0x4a, 0x95, 0xde, 0x3, 0x48, 0xa4, 0xef, 0x32, 0x79, 0xf7, 0xbc, 0x61, 0x2a, 0xc6, 0x8d, 0x50, 0x1b, 0x51, 0x1a, 0xc7, 0x8c, 0x60, 0x2b, 0xf6, 0xbd, 0x33, 0x78, 0xa5, 0xee, 0x2, 0x49, 0x94, 0xdf, 0x37, 0x7c, 0xa1, 0xea, 0x6, 0x4d, 0x90, 0xdb, 0x55, 0x1e, 0xc3, 0x88, 0x64, 0x2f, 0xf2, 0xb9, 0xf3, 0xb8, 0x65, 0x2e, 0xc2, 0x89, 0x54, 0x1f, 0x91, 0xda, 0x7, 0x4c, 0xa0, 0xeb, 0x36, 0x7d, 0xa2, 0xe9, 0x34, 0x7f, 0x93, 0xd8, 0x5, 0x4e, 0xc0, 0x8b, 0x56, 0x1d, 
0xf1, 0xba, 0x67, 0x2c, 0x66, 0x2d, 0xf0, 0xbb, 0x57, 0x1c, 0xc1, 0x8a, 0x4, 0x4f, 0x92, 0xd9, 0x35, 0x7e, 0xa3, 0xe8, 0x6e, 0x25, 0xf8, 0xb3, 0x5f, 0x14, 0xc9, 0x82, 0xc, 0x47, 0x9a, 0xd1, 0x3d, 0x76, 0xab, 0xe0, 0xaa, 0xe1, 0x3c, 0x77, 0x9b, 0xd0, 0xd, 0x46, 0xc8, 0x83, 0x5e, 0x15, 0xf9, 0xb2, 0x6f, 0x24, 0xfb, 0xb0, 0x6d, 0x26, 0xca, 0x81, 0x5c, 0x17, 0x99, 0xd2, 0xf, 0x44, 0xa8, 0xe3, 0x3e, 0x75, 0x3f, 0x74, 0xa9, 0xe2, 0xe, 0x45, 0x98, 0xd3, 0x5d, 0x16, 0xcb, 0x80, 0x6c, 0x27, 0xfa, 0xb1, 0x59, 0x12, 0xcf, 0x84, 0x68, 0x23, 0xfe, 0xb5, 0x3b, 0x70, 0xad, 0xe6, 0xa, 0x41, 0x9c, 0xd7, 0x9d, 0xd6, 0xb, 0x40, 0xac, 0xe7, 0x3a, 0x71, 0xff, 0xb4, 0x69, 0x22, 0xce, 0x85, 0x58, 0x13, 0xcc, 0x87, 0x5a, 0x11, 0xfd, 0xb6, 0x6b, 0x20, 0xae, 0xe5, 0x38, 0x73, 0x9f, 0xd4, 0x9, 0x42, 0x8, 0x43, 0x9e, 0xd5, 0x39, 0x72, 0xaf, 0xe4, 0x6a, 0x21, 0xfc, 0xb7, 0x5b, 0x10, 0xcd, 0x86}, {0x0, 0x4c, 0x98, 0xd4, 0x2d, 0x61, 0xb5, 0xf9, 0x5a, 0x16, 0xc2, 0x8e, 0x77, 0x3b, 0xef, 0xa3, 0xb4, 0xf8, 0x2c, 0x60, 0x99, 0xd5, 0x1, 0x4d, 0xee, 0xa2, 0x76, 0x3a, 0xc3, 0x8f, 0x5b, 0x17, 0x75, 0x39, 0xed, 0xa1, 0x58, 0x14, 0xc0, 0x8c, 0x2f, 0x63, 0xb7, 0xfb, 0x2, 0x4e, 0x9a, 0xd6, 0xc1, 0x8d, 0x59, 0x15, 0xec, 0xa0, 0x74, 0x38, 0x9b, 0xd7, 0x3, 0x4f, 0xb6, 0xfa, 0x2e, 0x62, 0xea, 0xa6, 0x72, 0x3e, 0xc7, 0x8b, 0x5f, 0x13, 0xb0, 0xfc, 0x28, 0x64, 0x9d, 0xd1, 0x5, 0x49, 0x5e, 0x12, 0xc6, 0x8a, 0x73, 0x3f, 0xeb, 0xa7, 0x4, 0x48, 0x9c, 0xd0, 0x29, 0x65, 0xb1, 0xfd, 0x9f, 0xd3, 0x7, 0x4b, 0xb2, 0xfe, 0x2a, 0x66, 0xc5, 0x89, 0x5d, 0x11, 0xe8, 0xa4, 0x70, 0x3c, 0x2b, 0x67, 0xb3, 0xff, 0x6, 0x4a, 0x9e, 0xd2, 0x71, 0x3d, 0xe9, 0xa5, 0x5c, 0x10, 0xc4, 0x88, 0xc9, 0x85, 0x51, 0x1d, 0xe4, 0xa8, 0x7c, 0x30, 0x93, 0xdf, 0xb, 0x47, 0xbe, 0xf2, 0x26, 0x6a, 0x7d, 0x31, 0xe5, 0xa9, 0x50, 0x1c, 0xc8, 0x84, 0x27, 0x6b, 0xbf, 0xf3, 0xa, 0x46, 0x92, 0xde, 0xbc, 0xf0, 0x24, 0x68, 0x91, 0xdd, 0x9, 0x45, 0xe6, 0xaa, 0x7e, 0x32, 0xcb, 0x87, 0x53, 0x1f, 0x8, 0x44, 0x90, 0xdc, 0x25, 0x69, 0xbd, 0xf1, 0x52, 0x1e, 0xca, 0x86, 0x7f, 0x33, 0xe7, 0xab, 0x23, 0x6f, 0xbb, 0xf7, 0xe, 0x42, 0x96, 0xda, 0x79, 0x35, 0xe1, 0xad, 0x54, 0x18, 0xcc, 0x80, 0x97, 0xdb, 0xf, 0x43, 0xba, 0xf6, 0x22, 0x6e, 0xcd, 0x81, 0x55, 0x19, 0xe0, 0xac, 0x78, 0x34, 0x56, 0x1a, 0xce, 0x82, 0x7b, 0x37, 0xe3, 0xaf, 0xc, 0x40, 0x94, 0xd8, 0x21, 0x6d, 0xb9, 0xf5, 0xe2, 0xae, 0x7a, 0x36, 0xcf, 0x83, 0x57, 0x1b, 0xb8, 0xf4, 0x20, 0x6c, 0x95, 0xd9, 0xd, 0x41}, {0x0, 0x4d, 0x9a, 0xd7, 0x29, 0x64, 0xb3, 0xfe, 0x52, 0x1f, 0xc8, 0x85, 0x7b, 0x36, 0xe1, 0xac, 0xa4, 0xe9, 0x3e, 0x73, 0x8d, 0xc0, 0x17, 0x5a, 0xf6, 0xbb, 0x6c, 0x21, 0xdf, 0x92, 0x45, 0x8, 0x55, 0x18, 0xcf, 0x82, 0x7c, 0x31, 0xe6, 0xab, 0x7, 0x4a, 0x9d, 0xd0, 0x2e, 0x63, 0xb4, 0xf9, 0xf1, 0xbc, 0x6b, 0x26, 0xd8, 0x95, 0x42, 0xf, 0xa3, 0xee, 0x39, 0x74, 0x8a, 0xc7, 0x10, 0x5d, 0xaa, 0xe7, 0x30, 0x7d, 0x83, 0xce, 0x19, 0x54, 0xf8, 0xb5, 0x62, 0x2f, 0xd1, 0x9c, 0x4b, 0x6, 0xe, 0x43, 0x94, 0xd9, 0x27, 0x6a, 0xbd, 0xf0, 0x5c, 0x11, 0xc6, 0x8b, 0x75, 0x38, 0xef, 0xa2, 0xff, 0xb2, 0x65, 0x28, 0xd6, 0x9b, 0x4c, 0x1, 0xad, 0xe0, 0x37, 0x7a, 0x84, 0xc9, 0x1e, 0x53, 0x5b, 0x16, 0xc1, 0x8c, 0x72, 0x3f, 0xe8, 0xa5, 0x9, 0x44, 0x93, 0xde, 0x20, 0x6d, 0xba, 0xf7, 0x49, 0x4, 0xd3, 0x9e, 0x60, 0x2d, 0xfa, 0xb7, 0x1b, 0x56, 0x81, 0xcc, 0x32, 0x7f, 0xa8, 0xe5, 0xed, 0xa0, 0x77, 0x3a, 0xc4, 0x89, 0x5e, 0x13, 0xbf, 0xf2, 0x25, 0x68, 0x96, 0xdb, 0xc, 0x41, 0x1c, 0x51, 0x86, 0xcb, 0x35, 0x78, 0xaf, 0xe2, 0x4e, 0x3, 0xd4, 0x99, 0x67, 0x2a, 0xfd, 0xb0, 0xb8, 0xf5, 0x22, 0x6f, 0x91, 0xdc, 0xb, 0x46, 0xea, 0xa7, 0x70, 0x3d, 0xc3, 0x8e, 0x59, 0x14, 0xe3, 0xae, 
0x79, 0x34, 0xca, 0x87, 0x50, 0x1d, 0xb1, 0xfc, 0x2b, 0x66, 0x98, 0xd5, 0x2, 0x4f, 0x47, 0xa, 0xdd, 0x90, 0x6e, 0x23, 0xf4, 0xb9, 0x15, 0x58, 0x8f, 0xc2, 0x3c, 0x71, 0xa6, 0xeb, 0xb6, 0xfb, 0x2c, 0x61, 0x9f, 0xd2, 0x5, 0x48, 0xe4, 0xa9, 0x7e, 0x33, 0xcd, 0x80, 0x57, 0x1a, 0x12, 0x5f, 0x88, 0xc5, 0x3b, 0x76, 0xa1, 0xec, 0x40, 0xd, 0xda, 0x97, 0x69, 0x24, 0xf3, 0xbe}, {0x0, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, 0x4a, 0x4, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd, 0x94, 0xda, 0x8, 0x46, 0xb1, 0xff, 0x2d, 0x63, 0xde, 0x90, 0x42, 0xc, 0xfb, 0xb5, 0x67, 0x29, 0x35, 0x7b, 0xa9, 0xe7, 0x10, 0x5e, 0x8c, 0xc2, 0x7f, 0x31, 0xe3, 0xad, 0x5a, 0x14, 0xc6, 0x88, 0xa1, 0xef, 0x3d, 0x73, 0x84, 0xca, 0x18, 0x56, 0xeb, 0xa5, 0x77, 0x39, 0xce, 0x80, 0x52, 0x1c, 0x6a, 0x24, 0xf6, 0xb8, 0x4f, 0x1, 0xd3, 0x9d, 0x20, 0x6e, 0xbc, 0xf2, 0x5, 0x4b, 0x99, 0xd7, 0xfe, 0xb0, 0x62, 0x2c, 0xdb, 0x95, 0x47, 0x9, 0xb4, 0xfa, 0x28, 0x66, 0x91, 0xdf, 0xd, 0x43, 0x5f, 0x11, 0xc3, 0x8d, 0x7a, 0x34, 0xe6, 0xa8, 0x15, 0x5b, 0x89, 0xc7, 0x30, 0x7e, 0xac, 0xe2, 0xcb, 0x85, 0x57, 0x19, 0xee, 0xa0, 0x72, 0x3c, 0x81, 0xcf, 0x1d, 0x53, 0xa4, 0xea, 0x38, 0x76, 0xd4, 0x9a, 0x48, 0x6, 0xf1, 0xbf, 0x6d, 0x23, 0x9e, 0xd0, 0x2, 0x4c, 0xbb, 0xf5, 0x27, 0x69, 0x40, 0xe, 0xdc, 0x92, 0x65, 0x2b, 0xf9, 0xb7, 0xa, 0x44, 0x96, 0xd8, 0x2f, 0x61, 0xb3, 0xfd, 0xe1, 0xaf, 0x7d, 0x33, 0xc4, 0x8a, 0x58, 0x16, 0xab, 0xe5, 0x37, 0x79, 0x8e, 0xc0, 0x12, 0x5c, 0x75, 0x3b, 0xe9, 0xa7, 0x50, 0x1e, 0xcc, 0x82, 0x3f, 0x71, 0xa3, 0xed, 0x1a, 0x54, 0x86, 0xc8, 0xbe, 0xf0, 0x22, 0x6c, 0x9b, 0xd5, 0x7, 0x49, 0xf4, 0xba, 0x68, 0x26, 0xd1, 0x9f, 0x4d, 0x3, 0x2a, 0x64, 0xb6, 0xf8, 0xf, 0x41, 0x93, 0xdd, 0x60, 0x2e, 0xfc, 0xb2, 0x45, 0xb, 0xd9, 0x97, 0x8b, 0xc5, 0x17, 0x59, 0xae, 0xe0, 0x32, 0x7c, 0xc1, 0x8f, 0x5d, 0x13, 0xe4, 0xaa, 0x78, 0x36, 0x1f, 0x51, 0x83, 0xcd, 0x3a, 0x74, 0xa6, 0xe8, 0x55, 0x1b, 0xc9, 0x87, 0x70, 0x3e, 0xec, 0xa2}, {0x0, 0x4f, 0x9e, 0xd1, 0x21, 0x6e, 0xbf, 0xf0, 0x42, 0xd, 0xdc, 0x93, 0x63, 0x2c, 0xfd, 0xb2, 0x84, 0xcb, 0x1a, 0x55, 0xa5, 0xea, 0x3b, 0x74, 0xc6, 0x89, 0x58, 0x17, 0xe7, 0xa8, 0x79, 0x36, 0x15, 0x5a, 0x8b, 0xc4, 0x34, 0x7b, 0xaa, 0xe5, 0x57, 0x18, 0xc9, 0x86, 0x76, 0x39, 0xe8, 0xa7, 0x91, 0xde, 0xf, 0x40, 0xb0, 0xff, 0x2e, 0x61, 0xd3, 0x9c, 0x4d, 0x2, 0xf2, 0xbd, 0x6c, 0x23, 0x2a, 0x65, 0xb4, 0xfb, 0xb, 0x44, 0x95, 0xda, 0x68, 0x27, 0xf6, 0xb9, 0x49, 0x6, 0xd7, 0x98, 0xae, 0xe1, 0x30, 0x7f, 0x8f, 0xc0, 0x11, 0x5e, 0xec, 0xa3, 0x72, 0x3d, 0xcd, 0x82, 0x53, 0x1c, 0x3f, 0x70, 0xa1, 0xee, 0x1e, 0x51, 0x80, 0xcf, 0x7d, 0x32, 0xe3, 0xac, 0x5c, 0x13, 0xc2, 0x8d, 0xbb, 0xf4, 0x25, 0x6a, 0x9a, 0xd5, 0x4, 0x4b, 0xf9, 0xb6, 0x67, 0x28, 0xd8, 0x97, 0x46, 0x9, 0x54, 0x1b, 0xca, 0x85, 0x75, 0x3a, 0xeb, 0xa4, 0x16, 0x59, 0x88, 0xc7, 0x37, 0x78, 0xa9, 0xe6, 0xd0, 0x9f, 0x4e, 0x1, 0xf1, 0xbe, 0x6f, 0x20, 0x92, 0xdd, 0xc, 0x43, 0xb3, 0xfc, 0x2d, 0x62, 0x41, 0xe, 0xdf, 0x90, 0x60, 0x2f, 0xfe, 0xb1, 0x3, 0x4c, 0x9d, 0xd2, 0x22, 0x6d, 0xbc, 0xf3, 0xc5, 0x8a, 0x5b, 0x14, 0xe4, 0xab, 0x7a, 0x35, 0x87, 0xc8, 0x19, 0x56, 0xa6, 0xe9, 0x38, 0x77, 0x7e, 0x31, 0xe0, 0xaf, 0x5f, 0x10, 0xc1, 0x8e, 0x3c, 0x73, 0xa2, 0xed, 0x1d, 0x52, 0x83, 0xcc, 0xfa, 0xb5, 0x64, 0x2b, 0xdb, 0x94, 0x45, 0xa, 0xb8, 0xf7, 0x26, 0x69, 0x99, 0xd6, 0x7, 0x48, 0x6b, 0x24, 0xf5, 0xba, 0x4a, 0x5, 0xd4, 0x9b, 0x29, 0x66, 0xb7, 0xf8, 0x8, 0x47, 0x96, 0xd9, 0xef, 0xa0, 0x71, 0x3e, 0xce, 0x81, 0x50, 0x1f, 0xad, 0xe2, 0x33, 0x7c, 0x8c, 0xc3, 0x12, 0x5d}, {0x0, 0x50, 0xa0, 0xf0, 0x5d, 0xd, 0xfd, 0xad, 0xba, 0xea, 0x1a, 0x4a, 0xe7, 0xb7, 0x47, 0x17, 0x69, 0x39, 0xc9, 0x99, 0x34, 0x64, 0x94, 
0xc4, 0xd3, 0x83, 0x73, 0x23, 0x8e, 0xde, 0x2e, 0x7e, 0xd2, 0x82, 0x72, 0x22, 0x8f, 0xdf, 0x2f, 0x7f, 0x68, 0x38, 0xc8, 0x98, 0x35, 0x65, 0x95, 0xc5, 0xbb, 0xeb, 0x1b, 0x4b, 0xe6, 0xb6, 0x46, 0x16, 0x1, 0x51, 0xa1, 0xf1, 0x5c, 0xc, 0xfc, 0xac, 0xb9, 0xe9, 0x19, 0x49, 0xe4, 0xb4, 0x44, 0x14, 0x3, 0x53, 0xa3, 0xf3, 0x5e, 0xe, 0xfe, 0xae, 0xd0, 0x80, 0x70, 0x20, 0x8d, 0xdd, 0x2d, 0x7d, 0x6a, 0x3a, 0xca, 0x9a, 0x37, 0x67, 0x97, 0xc7, 0x6b, 0x3b, 0xcb, 0x9b, 0x36, 0x66, 0x96, 0xc6, 0xd1, 0x81, 0x71, 0x21, 0x8c, 0xdc, 0x2c, 0x7c, 0x2, 0x52, 0xa2, 0xf2, 0x5f, 0xf, 0xff, 0xaf, 0xb8, 0xe8, 0x18, 0x48, 0xe5, 0xb5, 0x45, 0x15, 0x6f, 0x3f, 0xcf, 0x9f, 0x32, 0x62, 0x92, 0xc2, 0xd5, 0x85, 0x75, 0x25, 0x88, 0xd8, 0x28, 0x78, 0x6, 0x56, 0xa6, 0xf6, 0x5b, 0xb, 0xfb, 0xab, 0xbc, 0xec, 0x1c, 0x4c, 0xe1, 0xb1, 0x41, 0x11, 0xbd, 0xed, 0x1d, 0x4d, 0xe0, 0xb0, 0x40, 0x10, 0x7, 0x57, 0xa7, 0xf7, 0x5a, 0xa, 0xfa, 0xaa, 0xd4, 0x84, 0x74, 0x24, 0x89, 0xd9, 0x29, 0x79, 0x6e, 0x3e, 0xce, 0x9e, 0x33, 0x63, 0x93, 0xc3, 0xd6, 0x86, 0x76, 0x26, 0x8b, 0xdb, 0x2b, 0x7b, 0x6c, 0x3c, 0xcc, 0x9c, 0x31, 0x61, 0x91, 0xc1, 0xbf, 0xef, 0x1f, 0x4f, 0xe2, 0xb2, 0x42, 0x12, 0x5, 0x55, 0xa5, 0xf5, 0x58, 0x8, 0xf8, 0xa8, 0x4, 0x54, 0xa4, 0xf4, 0x59, 0x9, 0xf9, 0xa9, 0xbe, 0xee, 0x1e, 0x4e, 0xe3, 0xb3, 0x43, 0x13, 0x6d, 0x3d, 0xcd, 0x9d, 0x30, 0x60, 0x90, 0xc0, 0xd7, 0x87, 0x77, 0x27, 0x8a, 0xda, 0x2a, 0x7a}, {0x0, 0x51, 0xa2, 0xf3, 0x59, 0x8, 0xfb, 0xaa, 0xb2, 0xe3, 0x10, 0x41, 0xeb, 0xba, 0x49, 0x18, 0x79, 0x28, 0xdb, 0x8a, 0x20, 0x71, 0x82, 0xd3, 0xcb, 0x9a, 0x69, 0x38, 0x92, 0xc3, 0x30, 0x61, 0xf2, 0xa3, 0x50, 0x1, 0xab, 0xfa, 0x9, 0x58, 0x40, 0x11, 0xe2, 0xb3, 0x19, 0x48, 0xbb, 0xea, 0x8b, 0xda, 0x29, 0x78, 0xd2, 0x83, 0x70, 0x21, 0x39, 0x68, 0x9b, 0xca, 0x60, 0x31, 0xc2, 0x93, 0xf9, 0xa8, 0x5b, 0xa, 0xa0, 0xf1, 0x2, 0x53, 0x4b, 0x1a, 0xe9, 0xb8, 0x12, 0x43, 0xb0, 0xe1, 0x80, 0xd1, 0x22, 0x73, 0xd9, 0x88, 0x7b, 0x2a, 0x32, 0x63, 0x90, 0xc1, 0x6b, 0x3a, 0xc9, 0x98, 0xb, 0x5a, 0xa9, 0xf8, 0x52, 0x3, 0xf0, 0xa1, 0xb9, 0xe8, 0x1b, 0x4a, 0xe0, 0xb1, 0x42, 0x13, 0x72, 0x23, 0xd0, 0x81, 0x2b, 0x7a, 0x89, 0xd8, 0xc0, 0x91, 0x62, 0x33, 0x99, 0xc8, 0x3b, 0x6a, 0xef, 0xbe, 0x4d, 0x1c, 0xb6, 0xe7, 0x14, 0x45, 0x5d, 0xc, 0xff, 0xae, 0x4, 0x55, 0xa6, 0xf7, 0x96, 0xc7, 0x34, 0x65, 0xcf, 0x9e, 0x6d, 0x3c, 0x24, 0x75, 0x86, 0xd7, 0x7d, 0x2c, 0xdf, 0x8e, 0x1d, 0x4c, 0xbf, 0xee, 0x44, 0x15, 0xe6, 0xb7, 0xaf, 0xfe, 0xd, 0x5c, 0xf6, 0xa7, 0x54, 0x5, 0x64, 0x35, 0xc6, 0x97, 0x3d, 0x6c, 0x9f, 0xce, 0xd6, 0x87, 0x74, 0x25, 0x8f, 0xde, 0x2d, 0x7c, 0x16, 0x47, 0xb4, 0xe5, 0x4f, 0x1e, 0xed, 0xbc, 0xa4, 0xf5, 0x6, 0x57, 0xfd, 0xac, 0x5f, 0xe, 0x6f, 0x3e, 0xcd, 0x9c, 0x36, 0x67, 0x94, 0xc5, 0xdd, 0x8c, 0x7f, 0x2e, 0x84, 0xd5, 0x26, 0x77, 0xe4, 0xb5, 0x46, 0x17, 0xbd, 0xec, 0x1f, 0x4e, 0x56, 0x7, 0xf4, 0xa5, 0xf, 0x5e, 0xad, 0xfc, 0x9d, 0xcc, 0x3f, 0x6e, 0xc4, 0x95, 0x66, 0x37, 0x2f, 0x7e, 0x8d, 0xdc, 0x76, 0x27, 0xd4, 0x85}, {0x0, 0x52, 0xa4, 0xf6, 0x55, 0x7, 0xf1, 0xa3, 0xaa, 0xf8, 0xe, 0x5c, 0xff, 0xad, 0x5b, 0x9, 0x49, 0x1b, 0xed, 0xbf, 0x1c, 0x4e, 0xb8, 0xea, 0xe3, 0xb1, 0x47, 0x15, 0xb6, 0xe4, 0x12, 0x40, 0x92, 0xc0, 0x36, 0x64, 0xc7, 0x95, 0x63, 0x31, 0x38, 0x6a, 0x9c, 0xce, 0x6d, 0x3f, 0xc9, 0x9b, 0xdb, 0x89, 0x7f, 0x2d, 0x8e, 0xdc, 0x2a, 0x78, 0x71, 0x23, 0xd5, 0x87, 0x24, 0x76, 0x80, 0xd2, 0x39, 0x6b, 0x9d, 0xcf, 0x6c, 0x3e, 0xc8, 0x9a, 0x93, 0xc1, 0x37, 0x65, 0xc6, 0x94, 0x62, 0x30, 0x70, 0x22, 0xd4, 0x86, 0x25, 0x77, 0x81, 0xd3, 0xda, 0x88, 0x7e, 0x2c, 0x8f, 0xdd, 0x2b, 0x79, 0xab, 0xf9, 0xf, 0x5d, 0xfe, 0xac, 0x5a, 0x8, 0x1, 0x53, 0xa5, 0xf7, 0x54, 
0x6, 0xf0, 0xa2, 0xe2, 0xb0, 0x46, 0x14, 0xb7, 0xe5, 0x13, 0x41, 0x48, 0x1a, 0xec, 0xbe, 0x1d, 0x4f, 0xb9, 0xeb, 0x72, 0x20, 0xd6, 0x84, 0x27, 0x75, 0x83, 0xd1, 0xd8, 0x8a, 0x7c, 0x2e, 0x8d, 0xdf, 0x29, 0x7b, 0x3b, 0x69, 0x9f, 0xcd, 0x6e, 0x3c, 0xca, 0x98, 0x91, 0xc3, 0x35, 0x67, 0xc4, 0x96, 0x60, 0x32, 0xe0, 0xb2, 0x44, 0x16, 0xb5, 0xe7, 0x11, 0x43, 0x4a, 0x18, 0xee, 0xbc, 0x1f, 0x4d, 0xbb, 0xe9, 0xa9, 0xfb, 0xd, 0x5f, 0xfc, 0xae, 0x58, 0xa, 0x3, 0x51, 0xa7, 0xf5, 0x56, 0x4, 0xf2, 0xa0, 0x4b, 0x19, 0xef, 0xbd, 0x1e, 0x4c, 0xba, 0xe8, 0xe1, 0xb3, 0x45, 0x17, 0xb4, 0xe6, 0x10, 0x42, 0x2, 0x50, 0xa6, 0xf4, 0x57, 0x5, 0xf3, 0xa1, 0xa8, 0xfa, 0xc, 0x5e, 0xfd, 0xaf, 0x59, 0xb, 0xd9, 0x8b, 0x7d, 0x2f, 0x8c, 0xde, 0x28, 0x7a, 0x73, 0x21, 0xd7, 0x85, 0x26, 0x74, 0x82, 0xd0, 0x90, 0xc2, 0x34, 0x66, 0xc5, 0x97, 0x61, 0x33, 0x3a, 0x68, 0x9e, 0xcc, 0x6f, 0x3d, 0xcb, 0x99}, {0x0, 0x53, 0xa6, 0xf5, 0x51, 0x2, 0xf7, 0xa4, 0xa2, 0xf1, 0x4, 0x57, 0xf3, 0xa0, 0x55, 0x6, 0x59, 0xa, 0xff, 0xac, 0x8, 0x5b, 0xae, 0xfd, 0xfb, 0xa8, 0x5d, 0xe, 0xaa, 0xf9, 0xc, 0x5f, 0xb2, 0xe1, 0x14, 0x47, 0xe3, 0xb0, 0x45, 0x16, 0x10, 0x43, 0xb6, 0xe5, 0x41, 0x12, 0xe7, 0xb4, 0xeb, 0xb8, 0x4d, 0x1e, 0xba, 0xe9, 0x1c, 0x4f, 0x49, 0x1a, 0xef, 0xbc, 0x18, 0x4b, 0xbe, 0xed, 0x79, 0x2a, 0xdf, 0x8c, 0x28, 0x7b, 0x8e, 0xdd, 0xdb, 0x88, 0x7d, 0x2e, 0x8a, 0xd9, 0x2c, 0x7f, 0x20, 0x73, 0x86, 0xd5, 0x71, 0x22, 0xd7, 0x84, 0x82, 0xd1, 0x24, 0x77, 0xd3, 0x80, 0x75, 0x26, 0xcb, 0x98, 0x6d, 0x3e, 0x9a, 0xc9, 0x3c, 0x6f, 0x69, 0x3a, 0xcf, 0x9c, 0x38, 0x6b, 0x9e, 0xcd, 0x92, 0xc1, 0x34, 0x67, 0xc3, 0x90, 0x65, 0x36, 0x30, 0x63, 0x96, 0xc5, 0x61, 0x32, 0xc7, 0x94, 0xf2, 0xa1, 0x54, 0x7, 0xa3, 0xf0, 0x5, 0x56, 0x50, 0x3, 0xf6, 0xa5, 0x1, 0x52, 0xa7, 0xf4, 0xab, 0xf8, 0xd, 0x5e, 0xfa, 0xa9, 0x5c, 0xf, 0x9, 0x5a, 0xaf, 0xfc, 0x58, 0xb, 0xfe, 0xad, 0x40, 0x13, 0xe6, 0xb5, 0x11, 0x42, 0xb7, 0xe4, 0xe2, 0xb1, 0x44, 0x17, 0xb3, 0xe0, 0x15, 0x46, 0x19, 0x4a, 0xbf, 0xec, 0x48, 0x1b, 0xee, 0xbd, 0xbb, 0xe8, 0x1d, 0x4e, 0xea, 0xb9, 0x4c, 0x1f, 0x8b, 0xd8, 0x2d, 0x7e, 0xda, 0x89, 0x7c, 0x2f, 0x29, 0x7a, 0x8f, 0xdc, 0x78, 0x2b, 0xde, 0x8d, 0xd2, 0x81, 0x74, 0x27, 0x83, 0xd0, 0x25, 0x76, 0x70, 0x23, 0xd6, 0x85, 0x21, 0x72, 0x87, 0xd4, 0x39, 0x6a, 0x9f, 0xcc, 0x68, 0x3b, 0xce, 0x9d, 0x9b, 0xc8, 0x3d, 0x6e, 0xca, 0x99, 0x6c, 0x3f, 0x60, 0x33, 0xc6, 0x95, 0x31, 0x62, 0x97, 0xc4, 0xc2, 0x91, 0x64, 0x37, 0x93, 0xc0, 0x35, 0x66}, {0x0, 0x54, 0xa8, 0xfc, 0x4d, 0x19, 0xe5, 0xb1, 0x9a, 0xce, 0x32, 0x66, 0xd7, 0x83, 0x7f, 0x2b, 0x29, 0x7d, 0x81, 0xd5, 0x64, 0x30, 0xcc, 0x98, 0xb3, 0xe7, 0x1b, 0x4f, 0xfe, 0xaa, 0x56, 0x2, 0x52, 0x6, 0xfa, 0xae, 0x1f, 0x4b, 0xb7, 0xe3, 0xc8, 0x9c, 0x60, 0x34, 0x85, 0xd1, 0x2d, 0x79, 0x7b, 0x2f, 0xd3, 0x87, 0x36, 0x62, 0x9e, 0xca, 0xe1, 0xb5, 0x49, 0x1d, 0xac, 0xf8, 0x4, 0x50, 0xa4, 0xf0, 0xc, 0x58, 0xe9, 0xbd, 0x41, 0x15, 0x3e, 0x6a, 0x96, 0xc2, 0x73, 0x27, 0xdb, 0x8f, 0x8d, 0xd9, 0x25, 0x71, 0xc0, 0x94, 0x68, 0x3c, 0x17, 0x43, 0xbf, 0xeb, 0x5a, 0xe, 0xf2, 0xa6, 0xf6, 0xa2, 0x5e, 0xa, 0xbb, 0xef, 0x13, 0x47, 0x6c, 0x38, 0xc4, 0x90, 0x21, 0x75, 0x89, 0xdd, 0xdf, 0x8b, 0x77, 0x23, 0x92, 0xc6, 0x3a, 0x6e, 0x45, 0x11, 0xed, 0xb9, 0x8, 0x5c, 0xa0, 0xf4, 0x55, 0x1, 0xfd, 0xa9, 0x18, 0x4c, 0xb0, 0xe4, 0xcf, 0x9b, 0x67, 0x33, 0x82, 0xd6, 0x2a, 0x7e, 0x7c, 0x28, 0xd4, 0x80, 0x31, 0x65, 0x99, 0xcd, 0xe6, 0xb2, 0x4e, 0x1a, 0xab, 0xff, 0x3, 0x57, 0x7, 0x53, 0xaf, 0xfb, 0x4a, 0x1e, 0xe2, 0xb6, 0x9d, 0xc9, 0x35, 0x61, 0xd0, 0x84, 0x78, 0x2c, 0x2e, 0x7a, 0x86, 0xd2, 0x63, 0x37, 0xcb, 0x9f, 0xb4, 0xe0, 0x1c, 0x48, 0xf9, 0xad, 0x51, 0x5, 0xf1, 0xa5, 0x59, 
0xd, 0xbc, 0xe8, 0x14, 0x40, 0x6b, 0x3f, 0xc3, 0x97, 0x26, 0x72, 0x8e, 0xda, 0xd8, 0x8c, 0x70, 0x24, 0x95, 0xc1, 0x3d, 0x69, 0x42, 0x16, 0xea, 0xbe, 0xf, 0x5b, 0xa7, 0xf3, 0xa3, 0xf7, 0xb, 0x5f, 0xee, 0xba, 0x46, 0x12, 0x39, 0x6d, 0x91, 0xc5, 0x74, 0x20, 0xdc, 0x88, 0x8a, 0xde, 0x22, 0x76, 0xc7, 0x93, 0x6f, 0x3b, 0x10, 0x44, 0xb8, 0xec, 0x5d, 0x9, 0xf5, 0xa1}, {0x0, 0x55, 0xaa, 0xff, 0x49, 0x1c, 0xe3, 0xb6, 0x92, 0xc7, 0x38, 0x6d, 0xdb, 0x8e, 0x71, 0x24, 0x39, 0x6c, 0x93, 0xc6, 0x70, 0x25, 0xda, 0x8f, 0xab, 0xfe, 0x1, 0x54, 0xe2, 0xb7, 0x48, 0x1d, 0x72, 0x27, 0xd8, 0x8d, 0x3b, 0x6e, 0x91, 0xc4, 0xe0, 0xb5, 0x4a, 0x1f, 0xa9, 0xfc, 0x3, 0x56, 0x4b, 0x1e, 0xe1, 0xb4, 0x2, 0x57, 0xa8, 0xfd, 0xd9, 0x8c, 0x73, 0x26, 0x90, 0xc5, 0x3a, 0x6f, 0xe4, 0xb1, 0x4e, 0x1b, 0xad, 0xf8, 0x7, 0x52, 0x76, 0x23, 0xdc, 0x89, 0x3f, 0x6a, 0x95, 0xc0, 0xdd, 0x88, 0x77, 0x22, 0x94, 0xc1, 0x3e, 0x6b, 0x4f, 0x1a, 0xe5, 0xb0, 0x6, 0x53, 0xac, 0xf9, 0x96, 0xc3, 0x3c, 0x69, 0xdf, 0x8a, 0x75, 0x20, 0x4, 0x51, 0xae, 0xfb, 0x4d, 0x18, 0xe7, 0xb2, 0xaf, 0xfa, 0x5, 0x50, 0xe6, 0xb3, 0x4c, 0x19, 0x3d, 0x68, 0x97, 0xc2, 0x74, 0x21, 0xde, 0x8b, 0xd5, 0x80, 0x7f, 0x2a, 0x9c, 0xc9, 0x36, 0x63, 0x47, 0x12, 0xed, 0xb8, 0xe, 0x5b, 0xa4, 0xf1, 0xec, 0xb9, 0x46, 0x13, 0xa5, 0xf0, 0xf, 0x5a, 0x7e, 0x2b, 0xd4, 0x81, 0x37, 0x62, 0x9d, 0xc8, 0xa7, 0xf2, 0xd, 0x58, 0xee, 0xbb, 0x44, 0x11, 0x35, 0x60, 0x9f, 0xca, 0x7c, 0x29, 0xd6, 0x83, 0x9e, 0xcb, 0x34, 0x61, 0xd7, 0x82, 0x7d, 0x28, 0xc, 0x59, 0xa6, 0xf3, 0x45, 0x10, 0xef, 0xba, 0x31, 0x64, 0x9b, 0xce, 0x78, 0x2d, 0xd2, 0x87, 0xa3, 0xf6, 0x9, 0x5c, 0xea, 0xbf, 0x40, 0x15, 0x8, 0x5d, 0xa2, 0xf7, 0x41, 0x14, 0xeb, 0xbe, 0x9a, 0xcf, 0x30, 0x65, 0xd3, 0x86, 0x79, 0x2c, 0x43, 0x16, 0xe9, 0xbc, 0xa, 0x5f, 0xa0, 0xf5, 0xd1, 0x84, 0x7b, 0x2e, 0x98, 0xcd, 0x32, 0x67, 0x7a, 0x2f, 0xd0, 0x85, 0x33, 0x66, 0x99, 0xcc, 0xe8, 0xbd, 0x42, 0x17, 0xa1, 0xf4, 0xb, 0x5e}, {0x0, 0x56, 0xac, 0xfa, 0x45, 0x13, 0xe9, 0xbf, 0x8a, 0xdc, 0x26, 0x70, 0xcf, 0x99, 0x63, 0x35, 0x9, 0x5f, 0xa5, 0xf3, 0x4c, 0x1a, 0xe0, 0xb6, 0x83, 0xd5, 0x2f, 0x79, 0xc6, 0x90, 0x6a, 0x3c, 0x12, 0x44, 0xbe, 0xe8, 0x57, 0x1, 0xfb, 0xad, 0x98, 0xce, 0x34, 0x62, 0xdd, 0x8b, 0x71, 0x27, 0x1b, 0x4d, 0xb7, 0xe1, 0x5e, 0x8, 0xf2, 0xa4, 0x91, 0xc7, 0x3d, 0x6b, 0xd4, 0x82, 0x78, 0x2e, 0x24, 0x72, 0x88, 0xde, 0x61, 0x37, 0xcd, 0x9b, 0xae, 0xf8, 0x2, 0x54, 0xeb, 0xbd, 0x47, 0x11, 0x2d, 0x7b, 0x81, 0xd7, 0x68, 0x3e, 0xc4, 0x92, 0xa7, 0xf1, 0xb, 0x5d, 0xe2, 0xb4, 0x4e, 0x18, 0x36, 0x60, 0x9a, 0xcc, 0x73, 0x25, 0xdf, 0x89, 0xbc, 0xea, 0x10, 0x46, 0xf9, 0xaf, 0x55, 0x3, 0x3f, 0x69, 0x93, 0xc5, 0x7a, 0x2c, 0xd6, 0x80, 0xb5, 0xe3, 0x19, 0x4f, 0xf0, 0xa6, 0x5c, 0xa, 0x48, 0x1e, 0xe4, 0xb2, 0xd, 0x5b, 0xa1, 0xf7, 0xc2, 0x94, 0x6e, 0x38, 0x87, 0xd1, 0x2b, 0x7d, 0x41, 0x17, 0xed, 0xbb, 0x4, 0x52, 0xa8, 0xfe, 0xcb, 0x9d, 0x67, 0x31, 0x8e, 0xd8, 0x22, 0x74, 0x5a, 0xc, 0xf6, 0xa0, 0x1f, 0x49, 0xb3, 0xe5, 0xd0, 0x86, 0x7c, 0x2a, 0x95, 0xc3, 0x39, 0x6f, 0x53, 0x5, 0xff, 0xa9, 0x16, 0x40, 0xba, 0xec, 0xd9, 0x8f, 0x75, 0x23, 0x9c, 0xca, 0x30, 0x66, 0x6c, 0x3a, 0xc0, 0x96, 0x29, 0x7f, 0x85, 0xd3, 0xe6, 0xb0, 0x4a, 0x1c, 0xa3, 0xf5, 0xf, 0x59, 0x65, 0x33, 0xc9, 0x9f, 0x20, 0x76, 0x8c, 0xda, 0xef, 0xb9, 0x43, 0x15, 0xaa, 0xfc, 0x6, 0x50, 0x7e, 0x28, 0xd2, 0x84, 0x3b, 0x6d, 0x97, 0xc1, 0xf4, 0xa2, 0x58, 0xe, 0xb1, 0xe7, 0x1d, 0x4b, 0x77, 0x21, 0xdb, 0x8d, 0x32, 0x64, 0x9e, 0xc8, 0xfd, 0xab, 0x51, 0x7, 0xb8, 0xee, 0x14, 0x42}, {0x0, 0x57, 0xae, 0xf9, 0x41, 0x16, 0xef, 0xb8, 0x82, 0xd5, 0x2c, 0x7b, 0xc3, 0x94, 0x6d, 0x3a, 0x19, 0x4e, 0xb7, 0xe0, 0x58, 0xf, 0xf6, 0xa1, 
0x9b, 0xcc, 0x35, 0x62, 0xda, 0x8d, 0x74, 0x23, 0x32, 0x65, 0x9c, 0xcb, 0x73, 0x24, 0xdd, 0x8a, 0xb0, 0xe7, 0x1e, 0x49, 0xf1, 0xa6, 0x5f, 0x8, 0x2b, 0x7c, 0x85, 0xd2, 0x6a, 0x3d, 0xc4, 0x93, 0xa9, 0xfe, 0x7, 0x50, 0xe8, 0xbf, 0x46, 0x11, 0x64, 0x33, 0xca, 0x9d, 0x25, 0x72, 0x8b, 0xdc, 0xe6, 0xb1, 0x48, 0x1f, 0xa7, 0xf0, 0x9, 0x5e, 0x7d, 0x2a, 0xd3, 0x84, 0x3c, 0x6b, 0x92, 0xc5, 0xff, 0xa8, 0x51, 0x6, 0xbe, 0xe9, 0x10, 0x47, 0x56, 0x1, 0xf8, 0xaf, 0x17, 0x40, 0xb9, 0xee, 0xd4, 0x83, 0x7a, 0x2d, 0x95, 0xc2, 0x3b, 0x6c, 0x4f, 0x18, 0xe1, 0xb6, 0xe, 0x59, 0xa0, 0xf7, 0xcd, 0x9a, 0x63, 0x34, 0x8c, 0xdb, 0x22, 0x75, 0xc8, 0x9f, 0x66, 0x31, 0x89, 0xde, 0x27, 0x70, 0x4a, 0x1d, 0xe4, 0xb3, 0xb, 0x5c, 0xa5, 0xf2, 0xd1, 0x86, 0x7f, 0x28, 0x90, 0xc7, 0x3e, 0x69, 0x53, 0x4, 0xfd, 0xaa, 0x12, 0x45, 0xbc, 0xeb, 0xfa, 0xad, 0x54, 0x3, 0xbb, 0xec, 0x15, 0x42, 0x78, 0x2f, 0xd6, 0x81, 0x39, 0x6e, 0x97, 0xc0, 0xe3, 0xb4, 0x4d, 0x1a, 0xa2, 0xf5, 0xc, 0x5b, 0x61, 0x36, 0xcf, 0x98, 0x20, 0x77, 0x8e, 0xd9, 0xac, 0xfb, 0x2, 0x55, 0xed, 0xba, 0x43, 0x14, 0x2e, 0x79, 0x80, 0xd7, 0x6f, 0x38, 0xc1, 0x96, 0xb5, 0xe2, 0x1b, 0x4c, 0xf4, 0xa3, 0x5a, 0xd, 0x37, 0x60, 0x99, 0xce, 0x76, 0x21, 0xd8, 0x8f, 0x9e, 0xc9, 0x30, 0x67, 0xdf, 0x88, 0x71, 0x26, 0x1c, 0x4b, 0xb2, 0xe5, 0x5d, 0xa, 0xf3, 0xa4, 0x87, 0xd0, 0x29, 0x7e, 0xc6, 0x91, 0x68, 0x3f, 0x5, 0x52, 0xab, 0xfc, 0x44, 0x13, 0xea, 0xbd}, {0x0, 0x58, 0xb0, 0xe8, 0x7d, 0x25, 0xcd, 0x95, 0xfa, 0xa2, 0x4a, 0x12, 0x87, 0xdf, 0x37, 0x6f, 0xe9, 0xb1, 0x59, 0x1, 0x94, 0xcc, 0x24, 0x7c, 0x13, 0x4b, 0xa3, 0xfb, 0x6e, 0x36, 0xde, 0x86, 0xcf, 0x97, 0x7f, 0x27, 0xb2, 0xea, 0x2, 0x5a, 0x35, 0x6d, 0x85, 0xdd, 0x48, 0x10, 0xf8, 0xa0, 0x26, 0x7e, 0x96, 0xce, 0x5b, 0x3, 0xeb, 0xb3, 0xdc, 0x84, 0x6c, 0x34, 0xa1, 0xf9, 0x11, 0x49, 0x83, 0xdb, 0x33, 0x6b, 0xfe, 0xa6, 0x4e, 0x16, 0x79, 0x21, 0xc9, 0x91, 0x4, 0x5c, 0xb4, 0xec, 0x6a, 0x32, 0xda, 0x82, 0x17, 0x4f, 0xa7, 0xff, 0x90, 0xc8, 0x20, 0x78, 0xed, 0xb5, 0x5d, 0x5, 0x4c, 0x14, 0xfc, 0xa4, 0x31, 0x69, 0x81, 0xd9, 0xb6, 0xee, 0x6, 0x5e, 0xcb, 0x93, 0x7b, 0x23, 0xa5, 0xfd, 0x15, 0x4d, 0xd8, 0x80, 0x68, 0x30, 0x5f, 0x7, 0xef, 0xb7, 0x22, 0x7a, 0x92, 0xca, 0x1b, 0x43, 0xab, 0xf3, 0x66, 0x3e, 0xd6, 0x8e, 0xe1, 0xb9, 0x51, 0x9, 0x9c, 0xc4, 0x2c, 0x74, 0xf2, 0xaa, 0x42, 0x1a, 0x8f, 0xd7, 0x3f, 0x67, 0x8, 0x50, 0xb8, 0xe0, 0x75, 0x2d, 0xc5, 0x9d, 0xd4, 0x8c, 0x64, 0x3c, 0xa9, 0xf1, 0x19, 0x41, 0x2e, 0x76, 0x9e, 0xc6, 0x53, 0xb, 0xe3, 0xbb, 0x3d, 0x65, 0x8d, 0xd5, 0x40, 0x18, 0xf0, 0xa8, 0xc7, 0x9f, 0x77, 0x2f, 0xba, 0xe2, 0xa, 0x52, 0x98, 0xc0, 0x28, 0x70, 0xe5, 0xbd, 0x55, 0xd, 0x62, 0x3a, 0xd2, 0x8a, 0x1f, 0x47, 0xaf, 0xf7, 0x71, 0x29, 0xc1, 0x99, 0xc, 0x54, 0xbc, 0xe4, 0x8b, 0xd3, 0x3b, 0x63, 0xf6, 0xae, 0x46, 0x1e, 0x57, 0xf, 0xe7, 0xbf, 0x2a, 0x72, 0x9a, 0xc2, 0xad, 0xf5, 0x1d, 0x45, 0xd0, 0x88, 0x60, 0x38, 0xbe, 0xe6, 0xe, 0x56, 0xc3, 0x9b, 0x73, 0x2b, 0x44, 0x1c, 0xf4, 0xac, 0x39, 0x61, 0x89, 0xd1}, {0x0, 0x59, 0xb2, 0xeb, 0x79, 0x20, 0xcb, 0x92, 0xf2, 0xab, 0x40, 0x19, 0x8b, 0xd2, 0x39, 0x60, 0xf9, 0xa0, 0x4b, 0x12, 0x80, 0xd9, 0x32, 0x6b, 0xb, 0x52, 0xb9, 0xe0, 0x72, 0x2b, 0xc0, 0x99, 0xef, 0xb6, 0x5d, 0x4, 0x96, 0xcf, 0x24, 0x7d, 0x1d, 0x44, 0xaf, 0xf6, 0x64, 0x3d, 0xd6, 0x8f, 0x16, 0x4f, 0xa4, 0xfd, 0x6f, 0x36, 0xdd, 0x84, 0xe4, 0xbd, 0x56, 0xf, 0x9d, 0xc4, 0x2f, 0x76, 0xc3, 0x9a, 0x71, 0x28, 0xba, 0xe3, 0x8, 0x51, 0x31, 0x68, 0x83, 0xda, 0x48, 0x11, 0xfa, 0xa3, 0x3a, 0x63, 0x88, 0xd1, 0x43, 0x1a, 0xf1, 0xa8, 0xc8, 0x91, 0x7a, 0x23, 0xb1, 0xe8, 0x3, 0x5a, 0x2c, 0x75, 0x9e, 0xc7, 0x55, 0xc, 0xe7, 0xbe, 0xde, 0x87, 0x6c, 0x35, 0xa7, 0xfe, 
0x15, 0x4c, 0xd5, 0x8c, 0x67, 0x3e, 0xac, 0xf5, 0x1e, 0x47, 0x27, 0x7e, 0x95, 0xcc, 0x5e, 0x7, 0xec, 0xb5, 0x9b, 0xc2, 0x29, 0x70, 0xe2, 0xbb, 0x50, 0x9, 0x69, 0x30, 0xdb, 0x82, 0x10, 0x49, 0xa2, 0xfb, 0x62, 0x3b, 0xd0, 0x89, 0x1b, 0x42, 0xa9, 0xf0, 0x90, 0xc9, 0x22, 0x7b, 0xe9, 0xb0, 0x5b, 0x2, 0x74, 0x2d, 0xc6, 0x9f, 0xd, 0x54, 0xbf, 0xe6, 0x86, 0xdf, 0x34, 0x6d, 0xff, 0xa6, 0x4d, 0x14, 0x8d, 0xd4, 0x3f, 0x66, 0xf4, 0xad, 0x46, 0x1f, 0x7f, 0x26, 0xcd, 0x94, 0x6, 0x5f, 0xb4, 0xed, 0x58, 0x1, 0xea, 0xb3, 0x21, 0x78, 0x93, 0xca, 0xaa, 0xf3, 0x18, 0x41, 0xd3, 0x8a, 0x61, 0x38, 0xa1, 0xf8, 0x13, 0x4a, 0xd8, 0x81, 0x6a, 0x33, 0x53, 0xa, 0xe1, 0xb8, 0x2a, 0x73, 0x98, 0xc1, 0xb7, 0xee, 0x5, 0x5c, 0xce, 0x97, 0x7c, 0x25, 0x45, 0x1c, 0xf7, 0xae, 0x3c, 0x65, 0x8e, 0xd7, 0x4e, 0x17, 0xfc, 0xa5, 0x37, 0x6e, 0x85, 0xdc, 0xbc, 0xe5, 0xe, 0x57, 0xc5, 0x9c, 0x77, 0x2e}, {0x0, 0x5a, 0xb4, 0xee, 0x75, 0x2f, 0xc1, 0x9b, 0xea, 0xb0, 0x5e, 0x4, 0x9f, 0xc5, 0x2b, 0x71, 0xc9, 0x93, 0x7d, 0x27, 0xbc, 0xe6, 0x8, 0x52, 0x23, 0x79, 0x97, 0xcd, 0x56, 0xc, 0xe2, 0xb8, 0x8f, 0xd5, 0x3b, 0x61, 0xfa, 0xa0, 0x4e, 0x14, 0x65, 0x3f, 0xd1, 0x8b, 0x10, 0x4a, 0xa4, 0xfe, 0x46, 0x1c, 0xf2, 0xa8, 0x33, 0x69, 0x87, 0xdd, 0xac, 0xf6, 0x18, 0x42, 0xd9, 0x83, 0x6d, 0x37, 0x3, 0x59, 0xb7, 0xed, 0x76, 0x2c, 0xc2, 0x98, 0xe9, 0xb3, 0x5d, 0x7, 0x9c, 0xc6, 0x28, 0x72, 0xca, 0x90, 0x7e, 0x24, 0xbf, 0xe5, 0xb, 0x51, 0x20, 0x7a, 0x94, 0xce, 0x55, 0xf, 0xe1, 0xbb, 0x8c, 0xd6, 0x38, 0x62, 0xf9, 0xa3, 0x4d, 0x17, 0x66, 0x3c, 0xd2, 0x88, 0x13, 0x49, 0xa7, 0xfd, 0x45, 0x1f, 0xf1, 0xab, 0x30, 0x6a, 0x84, 0xde, 0xaf, 0xf5, 0x1b, 0x41, 0xda, 0x80, 0x6e, 0x34, 0x6, 0x5c, 0xb2, 0xe8, 0x73, 0x29, 0xc7, 0x9d, 0xec, 0xb6, 0x58, 0x2, 0x99, 0xc3, 0x2d, 0x77, 0xcf, 0x95, 0x7b, 0x21, 0xba, 0xe0, 0xe, 0x54, 0x25, 0x7f, 0x91, 0xcb, 0x50, 0xa, 0xe4, 0xbe, 0x89, 0xd3, 0x3d, 0x67, 0xfc, 0xa6, 0x48, 0x12, 0x63, 0x39, 0xd7, 0x8d, 0x16, 0x4c, 0xa2, 0xf8, 0x40, 0x1a, 0xf4, 0xae, 0x35, 0x6f, 0x81, 0xdb, 0xaa, 0xf0, 0x1e, 0x44, 0xdf, 0x85, 0x6b, 0x31, 0x5, 0x5f, 0xb1, 0xeb, 0x70, 0x2a, 0xc4, 0x9e, 0xef, 0xb5, 0x5b, 0x1, 0x9a, 0xc0, 0x2e, 0x74, 0xcc, 0x96, 0x78, 0x22, 0xb9, 0xe3, 0xd, 0x57, 0x26, 0x7c, 0x92, 0xc8, 0x53, 0x9, 0xe7, 0xbd, 0x8a, 0xd0, 0x3e, 0x64, 0xff, 0xa5, 0x4b, 0x11, 0x60, 0x3a, 0xd4, 0x8e, 0x15, 0x4f, 0xa1, 0xfb, 0x43, 0x19, 0xf7, 0xad, 0x36, 0x6c, 0x82, 0xd8, 0xa9, 0xf3, 0x1d, 0x47, 0xdc, 0x86, 0x68, 0x32}, {0x0, 0x5b, 0xb6, 0xed, 0x71, 0x2a, 0xc7, 0x9c, 0xe2, 0xb9, 0x54, 0xf, 0x93, 0xc8, 0x25, 0x7e, 0xd9, 0x82, 0x6f, 0x34, 0xa8, 0xf3, 0x1e, 0x45, 0x3b, 0x60, 0x8d, 0xd6, 0x4a, 0x11, 0xfc, 0xa7, 0xaf, 0xf4, 0x19, 0x42, 0xde, 0x85, 0x68, 0x33, 0x4d, 0x16, 0xfb, 0xa0, 0x3c, 0x67, 0x8a, 0xd1, 0x76, 0x2d, 0xc0, 0x9b, 0x7, 0x5c, 0xb1, 0xea, 0x94, 0xcf, 0x22, 0x79, 0xe5, 0xbe, 0x53, 0x8, 0x43, 0x18, 0xf5, 0xae, 0x32, 0x69, 0x84, 0xdf, 0xa1, 0xfa, 0x17, 0x4c, 0xd0, 0x8b, 0x66, 0x3d, 0x9a, 0xc1, 0x2c, 0x77, 0xeb, 0xb0, 0x5d, 0x6, 0x78, 0x23, 0xce, 0x95, 0x9, 0x52, 0xbf, 0xe4, 0xec, 0xb7, 0x5a, 0x1, 0x9d, 0xc6, 0x2b, 0x70, 0xe, 0x55, 0xb8, 0xe3, 0x7f, 0x24, 0xc9, 0x92, 0x35, 0x6e, 0x83, 0xd8, 0x44, 0x1f, 0xf2, 0xa9, 0xd7, 0x8c, 0x61, 0x3a, 0xa6, 0xfd, 0x10, 0x4b, 0x86, 0xdd, 0x30, 0x6b, 0xf7, 0xac, 0x41, 0x1a, 0x64, 0x3f, 0xd2, 0x89, 0x15, 0x4e, 0xa3, 0xf8, 0x5f, 0x4, 0xe9, 0xb2, 0x2e, 0x75, 0x98, 0xc3, 0xbd, 0xe6, 0xb, 0x50, 0xcc, 0x97, 0x7a, 0x21, 0x29, 0x72, 0x9f, 0xc4, 0x58, 0x3, 0xee, 0xb5, 0xcb, 0x90, 0x7d, 0x26, 0xba, 0xe1, 0xc, 0x57, 0xf0, 0xab, 0x46, 0x1d, 0x81, 0xda, 0x37, 0x6c, 0x12, 0x49, 0xa4, 0xff, 0x63, 0x38, 0xd5, 0x8e, 0xc5, 0x9e, 0x73, 0x28, 
0xb4, 0xef, 0x2, 0x59, 0x27, 0x7c, 0x91, 0xca, 0x56, 0xd, 0xe0, 0xbb, 0x1c, 0x47, 0xaa, 0xf1, 0x6d, 0x36, 0xdb, 0x80, 0xfe, 0xa5, 0x48, 0x13, 0x8f, 0xd4, 0x39, 0x62, 0x6a, 0x31, 0xdc, 0x87, 0x1b, 0x40, 0xad, 0xf6, 0x88, 0xd3, 0x3e, 0x65, 0xf9, 0xa2, 0x4f, 0x14, 0xb3, 0xe8, 0x5, 0x5e, 0xc2, 0x99, 0x74, 0x2f, 0x51, 0xa, 0xe7, 0xbc, 0x20, 0x7b, 0x96, 0xcd}, {0x0, 0x5c, 0xb8, 0xe4, 0x6d, 0x31, 0xd5, 0x89, 0xda, 0x86, 0x62, 0x3e, 0xb7, 0xeb, 0xf, 0x53, 0xa9, 0xf5, 0x11, 0x4d, 0xc4, 0x98, 0x7c, 0x20, 0x73, 0x2f, 0xcb, 0x97, 0x1e, 0x42, 0xa6, 0xfa, 0x4f, 0x13, 0xf7, 0xab, 0x22, 0x7e, 0x9a, 0xc6, 0x95, 0xc9, 0x2d, 0x71, 0xf8, 0xa4, 0x40, 0x1c, 0xe6, 0xba, 0x5e, 0x2, 0x8b, 0xd7, 0x33, 0x6f, 0x3c, 0x60, 0x84, 0xd8, 0x51, 0xd, 0xe9, 0xb5, 0x9e, 0xc2, 0x26, 0x7a, 0xf3, 0xaf, 0x4b, 0x17, 0x44, 0x18, 0xfc, 0xa0, 0x29, 0x75, 0x91, 0xcd, 0x37, 0x6b, 0x8f, 0xd3, 0x5a, 0x6, 0xe2, 0xbe, 0xed, 0xb1, 0x55, 0x9, 0x80, 0xdc, 0x38, 0x64, 0xd1, 0x8d, 0x69, 0x35, 0xbc, 0xe0, 0x4, 0x58, 0xb, 0x57, 0xb3, 0xef, 0x66, 0x3a, 0xde, 0x82, 0x78, 0x24, 0xc0, 0x9c, 0x15, 0x49, 0xad, 0xf1, 0xa2, 0xfe, 0x1a, 0x46, 0xcf, 0x93, 0x77, 0x2b, 0x21, 0x7d, 0x99, 0xc5, 0x4c, 0x10, 0xf4, 0xa8, 0xfb, 0xa7, 0x43, 0x1f, 0x96, 0xca, 0x2e, 0x72, 0x88, 0xd4, 0x30, 0x6c, 0xe5, 0xb9, 0x5d, 0x1, 0x52, 0xe, 0xea, 0xb6, 0x3f, 0x63, 0x87, 0xdb, 0x6e, 0x32, 0xd6, 0x8a, 0x3, 0x5f, 0xbb, 0xe7, 0xb4, 0xe8, 0xc, 0x50, 0xd9, 0x85, 0x61, 0x3d, 0xc7, 0x9b, 0x7f, 0x23, 0xaa, 0xf6, 0x12, 0x4e, 0x1d, 0x41, 0xa5, 0xf9, 0x70, 0x2c, 0xc8, 0x94, 0xbf, 0xe3, 0x7, 0x5b, 0xd2, 0x8e, 0x6a, 0x36, 0x65, 0x39, 0xdd, 0x81, 0x8, 0x54, 0xb0, 0xec, 0x16, 0x4a, 0xae, 0xf2, 0x7b, 0x27, 0xc3, 0x9f, 0xcc, 0x90, 0x74, 0x28, 0xa1, 0xfd, 0x19, 0x45, 0xf0, 0xac, 0x48, 0x14, 0x9d, 0xc1, 0x25, 0x79, 0x2a, 0x76, 0x92, 0xce, 0x47, 0x1b, 0xff, 0xa3, 0x59, 0x5, 0xe1, 0xbd, 0x34, 0x68, 0x8c, 0xd0, 0x83, 0xdf, 0x3b, 0x67, 0xee, 0xb2, 0x56, 0xa}, {0x0, 0x5d, 0xba, 0xe7, 0x69, 0x34, 0xd3, 0x8e, 0xd2, 0x8f, 0x68, 0x35, 0xbb, 0xe6, 0x1, 0x5c, 0xb9, 0xe4, 0x3, 0x5e, 0xd0, 0x8d, 0x6a, 0x37, 0x6b, 0x36, 0xd1, 0x8c, 0x2, 0x5f, 0xb8, 0xe5, 0x6f, 0x32, 0xd5, 0x88, 0x6, 0x5b, 0xbc, 0xe1, 0xbd, 0xe0, 0x7, 0x5a, 0xd4, 0x89, 0x6e, 0x33, 0xd6, 0x8b, 0x6c, 0x31, 0xbf, 0xe2, 0x5, 0x58, 0x4, 0x59, 0xbe, 0xe3, 0x6d, 0x30, 0xd7, 0x8a, 0xde, 0x83, 0x64, 0x39, 0xb7, 0xea, 0xd, 0x50, 0xc, 0x51, 0xb6, 0xeb, 0x65, 0x38, 0xdf, 0x82, 0x67, 0x3a, 0xdd, 0x80, 0xe, 0x53, 0xb4, 0xe9, 0xb5, 0xe8, 0xf, 0x52, 0xdc, 0x81, 0x66, 0x3b, 0xb1, 0xec, 0xb, 0x56, 0xd8, 0x85, 0x62, 0x3f, 0x63, 0x3e, 0xd9, 0x84, 0xa, 0x57, 0xb0, 0xed, 0x8, 0x55, 0xb2, 0xef, 0x61, 0x3c, 0xdb, 0x86, 0xda, 0x87, 0x60, 0x3d, 0xb3, 0xee, 0x9, 0x54, 0xa1, 0xfc, 0x1b, 0x46, 0xc8, 0x95, 0x72, 0x2f, 0x73, 0x2e, 0xc9, 0x94, 0x1a, 0x47, 0xa0, 0xfd, 0x18, 0x45, 0xa2, 0xff, 0x71, 0x2c, 0xcb, 0x96, 0xca, 0x97, 0x70, 0x2d, 0xa3, 0xfe, 0x19, 0x44, 0xce, 0x93, 0x74, 0x29, 0xa7, 0xfa, 0x1d, 0x40, 0x1c, 0x41, 0xa6, 0xfb, 0x75, 0x28, 0xcf, 0x92, 0x77, 0x2a, 0xcd, 0x90, 0x1e, 0x43, 0xa4, 0xf9, 0xa5, 0xf8, 0x1f, 0x42, 0xcc, 0x91, 0x76, 0x2b, 0x7f, 0x22, 0xc5, 0x98, 0x16, 0x4b, 0xac, 0xf1, 0xad, 0xf0, 0x17, 0x4a, 0xc4, 0x99, 0x7e, 0x23, 0xc6, 0x9b, 0x7c, 0x21, 0xaf, 0xf2, 0x15, 0x48, 0x14, 0x49, 0xae, 0xf3, 0x7d, 0x20, 0xc7, 0x9a, 0x10, 0x4d, 0xaa, 0xf7, 0x79, 0x24, 0xc3, 0x9e, 0xc2, 0x9f, 0x78, 0x25, 0xab, 0xf6, 0x11, 0x4c, 0xa9, 0xf4, 0x13, 0x4e, 0xc0, 0x9d, 0x7a, 0x27, 0x7b, 0x26, 0xc1, 0x9c, 0x12, 0x4f, 0xa8, 0xf5}, {0x0, 0x5e, 0xbc, 0xe2, 0x65, 0x3b, 0xd9, 0x87, 0xca, 0x94, 0x76, 0x28, 0xaf, 0xf1, 0x13, 0x4d, 0x89, 0xd7, 0x35, 0x6b, 0xec, 0xb2, 0x50, 0xe, 0x43, 
0x1d, 0xff, 0xa1, 0x26, 0x78, 0x9a, 0xc4, 0xf, 0x51, 0xb3, 0xed, 0x6a, 0x34, 0xd6, 0x88, 0xc5, 0x9b, 0x79, 0x27, 0xa0, 0xfe, 0x1c, 0x42, 0x86, 0xd8, 0x3a, 0x64, 0xe3, 0xbd, 0x5f, 0x1, 0x4c, 0x12, 0xf0, 0xae, 0x29, 0x77, 0x95, 0xcb, 0x1e, 0x40, 0xa2, 0xfc, 0x7b, 0x25, 0xc7, 0x99, 0xd4, 0x8a, 0x68, 0x36, 0xb1, 0xef, 0xd, 0x53, 0x97, 0xc9, 0x2b, 0x75, 0xf2, 0xac, 0x4e, 0x10, 0x5d, 0x3, 0xe1, 0xbf, 0x38, 0x66, 0x84, 0xda, 0x11, 0x4f, 0xad, 0xf3, 0x74, 0x2a, 0xc8, 0x96, 0xdb, 0x85, 0x67, 0x39, 0xbe, 0xe0, 0x2, 0x5c, 0x98, 0xc6, 0x24, 0x7a, 0xfd, 0xa3, 0x41, 0x1f, 0x52, 0xc, 0xee, 0xb0, 0x37, 0x69, 0x8b, 0xd5, 0x3c, 0x62, 0x80, 0xde, 0x59, 0x7, 0xe5, 0xbb, 0xf6, 0xa8, 0x4a, 0x14, 0x93, 0xcd, 0x2f, 0x71, 0xb5, 0xeb, 0x9, 0x57, 0xd0, 0x8e, 0x6c, 0x32, 0x7f, 0x21, 0xc3, 0x9d, 0x1a, 0x44, 0xa6, 0xf8, 0x33, 0x6d, 0x8f, 0xd1, 0x56, 0x8, 0xea, 0xb4, 0xf9, 0xa7, 0x45, 0x1b, 0x9c, 0xc2, 0x20, 0x7e, 0xba, 0xe4, 0x6, 0x58, 0xdf, 0x81, 0x63, 0x3d, 0x70, 0x2e, 0xcc, 0x92, 0x15, 0x4b, 0xa9, 0xf7, 0x22, 0x7c, 0x9e, 0xc0, 0x47, 0x19, 0xfb, 0xa5, 0xe8, 0xb6, 0x54, 0xa, 0x8d, 0xd3, 0x31, 0x6f, 0xab, 0xf5, 0x17, 0x49, 0xce, 0x90, 0x72, 0x2c, 0x61, 0x3f, 0xdd, 0x83, 0x4, 0x5a, 0xb8, 0xe6, 0x2d, 0x73, 0x91, 0xcf, 0x48, 0x16, 0xf4, 0xaa, 0xe7, 0xb9, 0x5b, 0x5, 0x82, 0xdc, 0x3e, 0x60, 0xa4, 0xfa, 0x18, 0x46, 0xc1, 0x9f, 0x7d, 0x23, 0x6e, 0x30, 0xd2, 0x8c, 0xb, 0x55, 0xb7, 0xe9}, {0x0, 0x5f, 0xbe, 0xe1, 0x61, 0x3e, 0xdf, 0x80, 0xc2, 0x9d, 0x7c, 0x23, 0xa3, 0xfc, 0x1d, 0x42, 0x99, 0xc6, 0x27, 0x78, 0xf8, 0xa7, 0x46, 0x19, 0x5b, 0x4, 0xe5, 0xba, 0x3a, 0x65, 0x84, 0xdb, 0x2f, 0x70, 0x91, 0xce, 0x4e, 0x11, 0xf0, 0xaf, 0xed, 0xb2, 0x53, 0xc, 0x8c, 0xd3, 0x32, 0x6d, 0xb6, 0xe9, 0x8, 0x57, 0xd7, 0x88, 0x69, 0x36, 0x74, 0x2b, 0xca, 0x95, 0x15, 0x4a, 0xab, 0xf4, 0x5e, 0x1, 0xe0, 0xbf, 0x3f, 0x60, 0x81, 0xde, 0x9c, 0xc3, 0x22, 0x7d, 0xfd, 0xa2, 0x43, 0x1c, 0xc7, 0x98, 0x79, 0x26, 0xa6, 0xf9, 0x18, 0x47, 0x5, 0x5a, 0xbb, 0xe4, 0x64, 0x3b, 0xda, 0x85, 0x71, 0x2e, 0xcf, 0x90, 0x10, 0x4f, 0xae, 0xf1, 0xb3, 0xec, 0xd, 0x52, 0xd2, 0x8d, 0x6c, 0x33, 0xe8, 0xb7, 0x56, 0x9, 0x89, 0xd6, 0x37, 0x68, 0x2a, 0x75, 0x94, 0xcb, 0x4b, 0x14, 0xf5, 0xaa, 0xbc, 0xe3, 0x2, 0x5d, 0xdd, 0x82, 0x63, 0x3c, 0x7e, 0x21, 0xc0, 0x9f, 0x1f, 0x40, 0xa1, 0xfe, 0x25, 0x7a, 0x9b, 0xc4, 0x44, 0x1b, 0xfa, 0xa5, 0xe7, 0xb8, 0x59, 0x6, 0x86, 0xd9, 0x38, 0x67, 0x93, 0xcc, 0x2d, 0x72, 0xf2, 0xad, 0x4c, 0x13, 0x51, 0xe, 0xef, 0xb0, 0x30, 0x6f, 0x8e, 0xd1, 0xa, 0x55, 0xb4, 0xeb, 0x6b, 0x34, 0xd5, 0x8a, 0xc8, 0x97, 0x76, 0x29, 0xa9, 0xf6, 0x17, 0x48, 0xe2, 0xbd, 0x5c, 0x3, 0x83, 0xdc, 0x3d, 0x62, 0x20, 0x7f, 0x9e, 0xc1, 0x41, 0x1e, 0xff, 0xa0, 0x7b, 0x24, 0xc5, 0x9a, 0x1a, 0x45, 0xa4, 0xfb, 0xb9, 0xe6, 0x7, 0x58, 0xd8, 0x87, 0x66, 0x39, 0xcd, 0x92, 0x73, 0x2c, 0xac, 0xf3, 0x12, 0x4d, 0xf, 0x50, 0xb1, 0xee, 0x6e, 0x31, 0xd0, 0x8f, 0x54, 0xb, 0xea, 0xb5, 0x35, 0x6a, 0x8b, 0xd4, 0x96, 0xc9, 0x28, 0x77, 0xf7, 0xa8, 0x49, 0x16}, {0x0, 0x60, 0xc0, 0xa0, 0x9d, 0xfd, 0x5d, 0x3d, 0x27, 0x47, 0xe7, 0x87, 0xba, 0xda, 0x7a, 0x1a, 0x4e, 0x2e, 0x8e, 0xee, 0xd3, 0xb3, 0x13, 0x73, 0x69, 0x9, 0xa9, 0xc9, 0xf4, 0x94, 0x34, 0x54, 0x9c, 0xfc, 0x5c, 0x3c, 0x1, 0x61, 0xc1, 0xa1, 0xbb, 0xdb, 0x7b, 0x1b, 0x26, 0x46, 0xe6, 0x86, 0xd2, 0xb2, 0x12, 0x72, 0x4f, 0x2f, 0x8f, 0xef, 0xf5, 0x95, 0x35, 0x55, 0x68, 0x8, 0xa8, 0xc8, 0x25, 0x45, 0xe5, 0x85, 0xb8, 0xd8, 0x78, 0x18, 0x2, 0x62, 0xc2, 0xa2, 0x9f, 0xff, 0x5f, 0x3f, 0x6b, 0xb, 0xab, 0xcb, 0xf6, 0x96, 0x36, 0x56, 0x4c, 0x2c, 0x8c, 0xec, 0xd1, 0xb1, 0x11, 0x71, 0xb9, 0xd9, 0x79, 0x19, 0x24, 0x44, 0xe4, 0x84, 0x9e, 0xfe, 0x5e, 0x3e, 0x3, 0x63, 0xc3, 
0xa3, 0xf7, 0x97, 0x37, 0x57, 0x6a, 0xa, 0xaa, 0xca, 0xd0, 0xb0, 0x10, 0x70, 0x4d, 0x2d, 0x8d, 0xed, 0x4a, 0x2a, 0x8a, 0xea, 0xd7, 0xb7, 0x17, 0x77, 0x6d, 0xd, 0xad, 0xcd, 0xf0, 0x90, 0x30, 0x50, 0x4, 0x64, 0xc4, 0xa4, 0x99, 0xf9, 0x59, 0x39, 0x23, 0x43, 0xe3, 0x83, 0xbe, 0xde, 0x7e, 0x1e, 0xd6, 0xb6, 0x16, 0x76, 0x4b, 0x2b, 0x8b, 0xeb, 0xf1, 0x91, 0x31, 0x51, 0x6c, 0xc, 0xac, 0xcc, 0x98, 0xf8, 0x58, 0x38, 0x5, 0x65, 0xc5, 0xa5, 0xbf, 0xdf, 0x7f, 0x1f, 0x22, 0x42, 0xe2, 0x82, 0x6f, 0xf, 0xaf, 0xcf, 0xf2, 0x92, 0x32, 0x52, 0x48, 0x28, 0x88, 0xe8, 0xd5, 0xb5, 0x15, 0x75, 0x21, 0x41, 0xe1, 0x81, 0xbc, 0xdc, 0x7c, 0x1c, 0x6, 0x66, 0xc6, 0xa6, 0x9b, 0xfb, 0x5b, 0x3b, 0xf3, 0x93, 0x33, 0x53, 0x6e, 0xe, 0xae, 0xce, 0xd4, 0xb4, 0x14, 0x74, 0x49, 0x29, 0x89, 0xe9, 0xbd, 0xdd, 0x7d, 0x1d, 0x20, 0x40, 0xe0, 0x80, 0x9a, 0xfa, 0x5a, 0x3a, 0x7, 0x67, 0xc7, 0xa7}, {0x0, 0x61, 0xc2, 0xa3, 0x99, 0xf8, 0x5b, 0x3a, 0x2f, 0x4e, 0xed, 0x8c, 0xb6, 0xd7, 0x74, 0x15, 0x5e, 0x3f, 0x9c, 0xfd, 0xc7, 0xa6, 0x5, 0x64, 0x71, 0x10, 0xb3, 0xd2, 0xe8, 0x89, 0x2a, 0x4b, 0xbc, 0xdd, 0x7e, 0x1f, 0x25, 0x44, 0xe7, 0x86, 0x93, 0xf2, 0x51, 0x30, 0xa, 0x6b, 0xc8, 0xa9, 0xe2, 0x83, 0x20, 0x41, 0x7b, 0x1a, 0xb9, 0xd8, 0xcd, 0xac, 0xf, 0x6e, 0x54, 0x35, 0x96, 0xf7, 0x65, 0x4, 0xa7, 0xc6, 0xfc, 0x9d, 0x3e, 0x5f, 0x4a, 0x2b, 0x88, 0xe9, 0xd3, 0xb2, 0x11, 0x70, 0x3b, 0x5a, 0xf9, 0x98, 0xa2, 0xc3, 0x60, 0x1, 0x14, 0x75, 0xd6, 0xb7, 0x8d, 0xec, 0x4f, 0x2e, 0xd9, 0xb8, 0x1b, 0x7a, 0x40, 0x21, 0x82, 0xe3, 0xf6, 0x97, 0x34, 0x55, 0x6f, 0xe, 0xad, 0xcc, 0x87, 0xe6, 0x45, 0x24, 0x1e, 0x7f, 0xdc, 0xbd, 0xa8, 0xc9, 0x6a, 0xb, 0x31, 0x50, 0xf3, 0x92, 0xca, 0xab, 0x8, 0x69, 0x53, 0x32, 0x91, 0xf0, 0xe5, 0x84, 0x27, 0x46, 0x7c, 0x1d, 0xbe, 0xdf, 0x94, 0xf5, 0x56, 0x37, 0xd, 0x6c, 0xcf, 0xae, 0xbb, 0xda, 0x79, 0x18, 0x22, 0x43, 0xe0, 0x81, 0x76, 0x17, 0xb4, 0xd5, 0xef, 0x8e, 0x2d, 0x4c, 0x59, 0x38, 0x9b, 0xfa, 0xc0, 0xa1, 0x2, 0x63, 0x28, 0x49, 0xea, 0x8b, 0xb1, 0xd0, 0x73, 0x12, 0x7, 0x66, 0xc5, 0xa4, 0x9e, 0xff, 0x5c, 0x3d, 0xaf, 0xce, 0x6d, 0xc, 0x36, 0x57, 0xf4, 0x95, 0x80, 0xe1, 0x42, 0x23, 0x19, 0x78, 0xdb, 0xba, 0xf1, 0x90, 0x33, 0x52, 0x68, 0x9, 0xaa, 0xcb, 0xde, 0xbf, 0x1c, 0x7d, 0x47, 0x26, 0x85, 0xe4, 0x13, 0x72, 0xd1, 0xb0, 0x8a, 0xeb, 0x48, 0x29, 0x3c, 0x5d, 0xfe, 0x9f, 0xa5, 0xc4, 0x67, 0x6, 0x4d, 0x2c, 0x8f, 0xee, 0xd4, 0xb5, 0x16, 0x77, 0x62, 0x3, 0xa0, 0xc1, 0xfb, 0x9a, 0x39, 0x58}, {0x0, 0x62, 0xc4, 0xa6, 0x95, 0xf7, 0x51, 0x33, 0x37, 0x55, 0xf3, 0x91, 0xa2, 0xc0, 0x66, 0x4, 0x6e, 0xc, 0xaa, 0xc8, 0xfb, 0x99, 0x3f, 0x5d, 0x59, 0x3b, 0x9d, 0xff, 0xcc, 0xae, 0x8, 0x6a, 0xdc, 0xbe, 0x18, 0x7a, 0x49, 0x2b, 0x8d, 0xef, 0xeb, 0x89, 0x2f, 0x4d, 0x7e, 0x1c, 0xba, 0xd8, 0xb2, 0xd0, 0x76, 0x14, 0x27, 0x45, 0xe3, 0x81, 0x85, 0xe7, 0x41, 0x23, 0x10, 0x72, 0xd4, 0xb6, 0xa5, 0xc7, 0x61, 0x3, 0x30, 0x52, 0xf4, 0x96, 0x92, 0xf0, 0x56, 0x34, 0x7, 0x65, 0xc3, 0xa1, 0xcb, 0xa9, 0xf, 0x6d, 0x5e, 0x3c, 0x9a, 0xf8, 0xfc, 0x9e, 0x38, 0x5a, 0x69, 0xb, 0xad, 0xcf, 0x79, 0x1b, 0xbd, 0xdf, 0xec, 0x8e, 0x28, 0x4a, 0x4e, 0x2c, 0x8a, 0xe8, 0xdb, 0xb9, 0x1f, 0x7d, 0x17, 0x75, 0xd3, 0xb1, 0x82, 0xe0, 0x46, 0x24, 0x20, 0x42, 0xe4, 0x86, 0xb5, 0xd7, 0x71, 0x13, 0x57, 0x35, 0x93, 0xf1, 0xc2, 0xa0, 0x6, 0x64, 0x60, 0x2, 0xa4, 0xc6, 0xf5, 0x97, 0x31, 0x53, 0x39, 0x5b, 0xfd, 0x9f, 0xac, 0xce, 0x68, 0xa, 0xe, 0x6c, 0xca, 0xa8, 0x9b, 0xf9, 0x5f, 0x3d, 0x8b, 0xe9, 0x4f, 0x2d, 0x1e, 0x7c, 0xda, 0xb8, 0xbc, 0xde, 0x78, 0x1a, 0x29, 0x4b, 0xed, 0x8f, 0xe5, 0x87, 0x21, 0x43, 0x70, 0x12, 0xb4, 0xd6, 0xd2, 0xb0, 0x16, 0x74, 0x47, 0x25, 0x83, 0xe1, 0xf2, 0x90, 0x36, 0x54, 0x67, 
0x5, 0xa3, 0xc1, 0xc5, 0xa7, 0x1, 0x63, 0x50, 0x32, 0x94, 0xf6, 0x9c, 0xfe, 0x58, 0x3a, 0x9, 0x6b, 0xcd, 0xaf, 0xab, 0xc9, 0x6f, 0xd, 0x3e, 0x5c, 0xfa, 0x98, 0x2e, 0x4c, 0xea, 0x88, 0xbb, 0xd9, 0x7f, 0x1d, 0x19, 0x7b, 0xdd, 0xbf, 0x8c, 0xee, 0x48, 0x2a, 0x40, 0x22, 0x84, 0xe6, 0xd5, 0xb7, 0x11, 0x73, 0x77, 0x15, 0xb3, 0xd1, 0xe2, 0x80, 0x26, 0x44}, {0x0, 0x63, 0xc6, 0xa5, 0x91, 0xf2, 0x57, 0x34, 0x3f, 0x5c, 0xf9, 0x9a, 0xae, 0xcd, 0x68, 0xb, 0x7e, 0x1d, 0xb8, 0xdb, 0xef, 0x8c, 0x29, 0x4a, 0x41, 0x22, 0x87, 0xe4, 0xd0, 0xb3, 0x16, 0x75, 0xfc, 0x9f, 0x3a, 0x59, 0x6d, 0xe, 0xab, 0xc8, 0xc3, 0xa0, 0x5, 0x66, 0x52, 0x31, 0x94, 0xf7, 0x82, 0xe1, 0x44, 0x27, 0x13, 0x70, 0xd5, 0xb6, 0xbd, 0xde, 0x7b, 0x18, 0x2c, 0x4f, 0xea, 0x89, 0xe5, 0x86, 0x23, 0x40, 0x74, 0x17, 0xb2, 0xd1, 0xda, 0xb9, 0x1c, 0x7f, 0x4b, 0x28, 0x8d, 0xee, 0x9b, 0xf8, 0x5d, 0x3e, 0xa, 0x69, 0xcc, 0xaf, 0xa4, 0xc7, 0x62, 0x1, 0x35, 0x56, 0xf3, 0x90, 0x19, 0x7a, 0xdf, 0xbc, 0x88, 0xeb, 0x4e, 0x2d, 0x26, 0x45, 0xe0, 0x83, 0xb7, 0xd4, 0x71, 0x12, 0x67, 0x4, 0xa1, 0xc2, 0xf6, 0x95, 0x30, 0x53, 0x58, 0x3b, 0x9e, 0xfd, 0xc9, 0xaa, 0xf, 0x6c, 0xd7, 0xb4, 0x11, 0x72, 0x46, 0x25, 0x80, 0xe3, 0xe8, 0x8b, 0x2e, 0x4d, 0x79, 0x1a, 0xbf, 0xdc, 0xa9, 0xca, 0x6f, 0xc, 0x38, 0x5b, 0xfe, 0x9d, 0x96, 0xf5, 0x50, 0x33, 0x7, 0x64, 0xc1, 0xa2, 0x2b, 0x48, 0xed, 0x8e, 0xba, 0xd9, 0x7c, 0x1f, 0x14, 0x77, 0xd2, 0xb1, 0x85, 0xe6, 0x43, 0x20, 0x55, 0x36, 0x93, 0xf0, 0xc4, 0xa7, 0x2, 0x61, 0x6a, 0x9, 0xac, 0xcf, 0xfb, 0x98, 0x3d, 0x5e, 0x32, 0x51, 0xf4, 0x97, 0xa3, 0xc0, 0x65, 0x6, 0xd, 0x6e, 0xcb, 0xa8, 0x9c, 0xff, 0x5a, 0x39, 0x4c, 0x2f, 0x8a, 0xe9, 0xdd, 0xbe, 0x1b, 0x78, 0x73, 0x10, 0xb5, 0xd6, 0xe2, 0x81, 0x24, 0x47, 0xce, 0xad, 0x8, 0x6b, 0x5f, 0x3c, 0x99, 0xfa, 0xf1, 0x92, 0x37, 0x54, 0x60, 0x3, 0xa6, 0xc5, 0xb0, 0xd3, 0x76, 0x15, 0x21, 0x42, 0xe7, 0x84, 0x8f, 0xec, 0x49, 0x2a, 0x1e, 0x7d, 0xd8, 0xbb}, {0x0, 0x64, 0xc8, 0xac, 0x8d, 0xe9, 0x45, 0x21, 0x7, 0x63, 0xcf, 0xab, 0x8a, 0xee, 0x42, 0x26, 0xe, 0x6a, 0xc6, 0xa2, 0x83, 0xe7, 0x4b, 0x2f, 0x9, 0x6d, 0xc1, 0xa5, 0x84, 0xe0, 0x4c, 0x28, 0x1c, 0x78, 0xd4, 0xb0, 0x91, 0xf5, 0x59, 0x3d, 0x1b, 0x7f, 0xd3, 0xb7, 0x96, 0xf2, 0x5e, 0x3a, 0x12, 0x76, 0xda, 0xbe, 0x9f, 0xfb, 0x57, 0x33, 0x15, 0x71, 0xdd, 0xb9, 0x98, 0xfc, 0x50, 0x34, 0x38, 0x5c, 0xf0, 0x94, 0xb5, 0xd1, 0x7d, 0x19, 0x3f, 0x5b, 0xf7, 0x93, 0xb2, 0xd6, 0x7a, 0x1e, 0x36, 0x52, 0xfe, 0x9a, 0xbb, 0xdf, 0x73, 0x17, 0x31, 0x55, 0xf9, 0x9d, 0xbc, 0xd8, 0x74, 0x10, 0x24, 0x40, 0xec, 0x88, 0xa9, 0xcd, 0x61, 0x5, 0x23, 0x47, 0xeb, 0x8f, 0xae, 0xca, 0x66, 0x2, 0x2a, 0x4e, 0xe2, 0x86, 0xa7, 0xc3, 0x6f, 0xb, 0x2d, 0x49, 0xe5, 0x81, 0xa0, 0xc4, 0x68, 0xc, 0x70, 0x14, 0xb8, 0xdc, 0xfd, 0x99, 0x35, 0x51, 0x77, 0x13, 0xbf, 0xdb, 0xfa, 0x9e, 0x32, 0x56, 0x7e, 0x1a, 0xb6, 0xd2, 0xf3, 0x97, 0x3b, 0x5f, 0x79, 0x1d, 0xb1, 0xd5, 0xf4, 0x90, 0x3c, 0x58, 0x6c, 0x8, 0xa4, 0xc0, 0xe1, 0x85, 0x29, 0x4d, 0x6b, 0xf, 0xa3, 0xc7, 0xe6, 0x82, 0x2e, 0x4a, 0x62, 0x6, 0xaa, 0xce, 0xef, 0x8b, 0x27, 0x43, 0x65, 0x1, 0xad, 0xc9, 0xe8, 0x8c, 0x20, 0x44, 0x48, 0x2c, 0x80, 0xe4, 0xc5, 0xa1, 0xd, 0x69, 0x4f, 0x2b, 0x87, 0xe3, 0xc2, 0xa6, 0xa, 0x6e, 0x46, 0x22, 0x8e, 0xea, 0xcb, 0xaf, 0x3, 0x67, 0x41, 0x25, 0x89, 0xed, 0xcc, 0xa8, 0x4, 0x60, 0x54, 0x30, 0x9c, 0xf8, 0xd9, 0xbd, 0x11, 0x75, 0x53, 0x37, 0x9b, 0xff, 0xde, 0xba, 0x16, 0x72, 0x5a, 0x3e, 0x92, 0xf6, 0xd7, 0xb3, 0x1f, 0x7b, 0x5d, 0x39, 0x95, 0xf1, 0xd0, 0xb4, 0x18, 0x7c}, {0x0, 0x65, 0xca, 0xaf, 0x89, 0xec, 0x43, 0x26, 0xf, 0x6a, 0xc5, 0xa0, 0x86, 0xe3, 0x4c, 0x29, 0x1e, 0x7b, 0xd4, 0xb1, 0x97, 0xf2, 0x5d, 0x38, 0x11, 0x74, 
0xdb, 0xbe, 0x98, 0xfd, 0x52, 0x37, 0x3c, 0x59, 0xf6, 0x93, 0xb5, 0xd0, 0x7f, 0x1a, 0x33, 0x56, 0xf9, 0x9c, 0xba, 0xdf, 0x70, 0x15, 0x22, 0x47, 0xe8, 0x8d, 0xab, 0xce, 0x61, 0x4, 0x2d, 0x48, 0xe7, 0x82, 0xa4, 0xc1, 0x6e, 0xb, 0x78, 0x1d, 0xb2, 0xd7, 0xf1, 0x94, 0x3b, 0x5e, 0x77, 0x12, 0xbd, 0xd8, 0xfe, 0x9b, 0x34, 0x51, 0x66, 0x3, 0xac, 0xc9, 0xef, 0x8a, 0x25, 0x40, 0x69, 0xc, 0xa3, 0xc6, 0xe0, 0x85, 0x2a, 0x4f, 0x44, 0x21, 0x8e, 0xeb, 0xcd, 0xa8, 0x7, 0x62, 0x4b, 0x2e, 0x81, 0xe4, 0xc2, 0xa7, 0x8, 0x6d, 0x5a, 0x3f, 0x90, 0xf5, 0xd3, 0xb6, 0x19, 0x7c, 0x55, 0x30, 0x9f, 0xfa, 0xdc, 0xb9, 0x16, 0x73, 0xf0, 0x95, 0x3a, 0x5f, 0x79, 0x1c, 0xb3, 0xd6, 0xff, 0x9a, 0x35, 0x50, 0x76, 0x13, 0xbc, 0xd9, 0xee, 0x8b, 0x24, 0x41, 0x67, 0x2, 0xad, 0xc8, 0xe1, 0x84, 0x2b, 0x4e, 0x68, 0xd, 0xa2, 0xc7, 0xcc, 0xa9, 0x6, 0x63, 0x45, 0x20, 0x8f, 0xea, 0xc3, 0xa6, 0x9, 0x6c, 0x4a, 0x2f, 0x80, 0xe5, 0xd2, 0xb7, 0x18, 0x7d, 0x5b, 0x3e, 0x91, 0xf4, 0xdd, 0xb8, 0x17, 0x72, 0x54, 0x31, 0x9e, 0xfb, 0x88, 0xed, 0x42, 0x27, 0x1, 0x64, 0xcb, 0xae, 0x87, 0xe2, 0x4d, 0x28, 0xe, 0x6b, 0xc4, 0xa1, 0x96, 0xf3, 0x5c, 0x39, 0x1f, 0x7a, 0xd5, 0xb0, 0x99, 0xfc, 0x53, 0x36, 0x10, 0x75, 0xda, 0xbf, 0xb4, 0xd1, 0x7e, 0x1b, 0x3d, 0x58, 0xf7, 0x92, 0xbb, 0xde, 0x71, 0x14, 0x32, 0x57, 0xf8, 0x9d, 0xaa, 0xcf, 0x60, 0x5, 0x23, 0x46, 0xe9, 0x8c, 0xa5, 0xc0, 0x6f, 0xa, 0x2c, 0x49, 0xe6, 0x83}, {0x0, 0x66, 0xcc, 0xaa, 0x85, 0xe3, 0x49, 0x2f, 0x17, 0x71, 0xdb, 0xbd, 0x92, 0xf4, 0x5e, 0x38, 0x2e, 0x48, 0xe2, 0x84, 0xab, 0xcd, 0x67, 0x1, 0x39, 0x5f, 0xf5, 0x93, 0xbc, 0xda, 0x70, 0x16, 0x5c, 0x3a, 0x90, 0xf6, 0xd9, 0xbf, 0x15, 0x73, 0x4b, 0x2d, 0x87, 0xe1, 0xce, 0xa8, 0x2, 0x64, 0x72, 0x14, 0xbe, 0xd8, 0xf7, 0x91, 0x3b, 0x5d, 0x65, 0x3, 0xa9, 0xcf, 0xe0, 0x86, 0x2c, 0x4a, 0xb8, 0xde, 0x74, 0x12, 0x3d, 0x5b, 0xf1, 0x97, 0xaf, 0xc9, 0x63, 0x5, 0x2a, 0x4c, 0xe6, 0x80, 0x96, 0xf0, 0x5a, 0x3c, 0x13, 0x75, 0xdf, 0xb9, 0x81, 0xe7, 0x4d, 0x2b, 0x4, 0x62, 0xc8, 0xae, 0xe4, 0x82, 0x28, 0x4e, 0x61, 0x7, 0xad, 0xcb, 0xf3, 0x95, 0x3f, 0x59, 0x76, 0x10, 0xba, 0xdc, 0xca, 0xac, 0x6, 0x60, 0x4f, 0x29, 0x83, 0xe5, 0xdd, 0xbb, 0x11, 0x77, 0x58, 0x3e, 0x94, 0xf2, 0x6d, 0xb, 0xa1, 0xc7, 0xe8, 0x8e, 0x24, 0x42, 0x7a, 0x1c, 0xb6, 0xd0, 0xff, 0x99, 0x33, 0x55, 0x43, 0x25, 0x8f, 0xe9, 0xc6, 0xa0, 0xa, 0x6c, 0x54, 0x32, 0x98, 0xfe, 0xd1, 0xb7, 0x1d, 0x7b, 0x31, 0x57, 0xfd, 0x9b, 0xb4, 0xd2, 0x78, 0x1e, 0x26, 0x40, 0xea, 0x8c, 0xa3, 0xc5, 0x6f, 0x9, 0x1f, 0x79, 0xd3, 0xb5, 0x9a, 0xfc, 0x56, 0x30, 0x8, 0x6e, 0xc4, 0xa2, 0x8d, 0xeb, 0x41, 0x27, 0xd5, 0xb3, 0x19, 0x7f, 0x50, 0x36, 0x9c, 0xfa, 0xc2, 0xa4, 0xe, 0x68, 0x47, 0x21, 0x8b, 0xed, 0xfb, 0x9d, 0x37, 0x51, 0x7e, 0x18, 0xb2, 0xd4, 0xec, 0x8a, 0x20, 0x46, 0x69, 0xf, 0xa5, 0xc3, 0x89, 0xef, 0x45, 0x23, 0xc, 0x6a, 0xc0, 0xa6, 0x9e, 0xf8, 0x52, 0x34, 0x1b, 0x7d, 0xd7, 0xb1, 0xa7, 0xc1, 0x6b, 0xd, 0x22, 0x44, 0xee, 0x88, 0xb0, 0xd6, 0x7c, 0x1a, 0x35, 0x53, 0xf9, 0x9f}, {0x0, 0x67, 0xce, 0xa9, 0x81, 0xe6, 0x4f, 0x28, 0x1f, 0x78, 0xd1, 0xb6, 0x9e, 0xf9, 0x50, 0x37, 0x3e, 0x59, 0xf0, 0x97, 0xbf, 0xd8, 0x71, 0x16, 0x21, 0x46, 0xef, 0x88, 0xa0, 0xc7, 0x6e, 0x9, 0x7c, 0x1b, 0xb2, 0xd5, 0xfd, 0x9a, 0x33, 0x54, 0x63, 0x4, 0xad, 0xca, 0xe2, 0x85, 0x2c, 0x4b, 0x42, 0x25, 0x8c, 0xeb, 0xc3, 0xa4, 0xd, 0x6a, 0x5d, 0x3a, 0x93, 0xf4, 0xdc, 0xbb, 0x12, 0x75, 0xf8, 0x9f, 0x36, 0x51, 0x79, 0x1e, 0xb7, 0xd0, 0xe7, 0x80, 0x29, 0x4e, 0x66, 0x1, 0xa8, 0xcf, 0xc6, 0xa1, 0x8, 0x6f, 0x47, 0x20, 0x89, 0xee, 0xd9, 0xbe, 0x17, 0x70, 0x58, 0x3f, 0x96, 0xf1, 0x84, 0xe3, 0x4a, 0x2d, 0x5, 0x62, 0xcb, 0xac, 0x9b, 0xfc, 0x55, 0x32, 0x1a, 0x7d, 0xd4, 0xb3, 
0xba, 0xdd, 0x74, 0x13, 0x3b, 0x5c, 0xf5, 0x92, 0xa5, 0xc2, 0x6b, 0xc, 0x24, 0x43, 0xea, 0x8d, 0xed, 0x8a, 0x23, 0x44, 0x6c, 0xb, 0xa2, 0xc5, 0xf2, 0x95, 0x3c, 0x5b, 0x73, 0x14, 0xbd, 0xda, 0xd3, 0xb4, 0x1d, 0x7a, 0x52, 0x35, 0x9c, 0xfb, 0xcc, 0xab, 0x2, 0x65, 0x4d, 0x2a, 0x83, 0xe4, 0x91, 0xf6, 0x5f, 0x38, 0x10, 0x77, 0xde, 0xb9, 0x8e, 0xe9, 0x40, 0x27, 0xf, 0x68, 0xc1, 0xa6, 0xaf, 0xc8, 0x61, 0x6, 0x2e, 0x49, 0xe0, 0x87, 0xb0, 0xd7, 0x7e, 0x19, 0x31, 0x56, 0xff, 0x98, 0x15, 0x72, 0xdb, 0xbc, 0x94, 0xf3, 0x5a, 0x3d, 0xa, 0x6d, 0xc4, 0xa3, 0x8b, 0xec, 0x45, 0x22, 0x2b, 0x4c, 0xe5, 0x82, 0xaa, 0xcd, 0x64, 0x3, 0x34, 0x53, 0xfa, 0x9d, 0xb5, 0xd2, 0x7b, 0x1c, 0x69, 0xe, 0xa7, 0xc0, 0xe8, 0x8f, 0x26, 0x41, 0x76, 0x11, 0xb8, 0xdf, 0xf7, 0x90, 0x39, 0x5e, 0x57, 0x30, 0x99, 0xfe, 0xd6, 0xb1, 0x18, 0x7f, 0x48, 0x2f, 0x86, 0xe1, 0xc9, 0xae, 0x7, 0x60}, {0x0, 0x68, 0xd0, 0xb8, 0xbd, 0xd5, 0x6d, 0x5, 0x67, 0xf, 0xb7, 0xdf, 0xda, 0xb2, 0xa, 0x62, 0xce, 0xa6, 0x1e, 0x76, 0x73, 0x1b, 0xa3, 0xcb, 0xa9, 0xc1, 0x79, 0x11, 0x14, 0x7c, 0xc4, 0xac, 0x81, 0xe9, 0x51, 0x39, 0x3c, 0x54, 0xec, 0x84, 0xe6, 0x8e, 0x36, 0x5e, 0x5b, 0x33, 0x8b, 0xe3, 0x4f, 0x27, 0x9f, 0xf7, 0xf2, 0x9a, 0x22, 0x4a, 0x28, 0x40, 0xf8, 0x90, 0x95, 0xfd, 0x45, 0x2d, 0x1f, 0x77, 0xcf, 0xa7, 0xa2, 0xca, 0x72, 0x1a, 0x78, 0x10, 0xa8, 0xc0, 0xc5, 0xad, 0x15, 0x7d, 0xd1, 0xb9, 0x1, 0x69, 0x6c, 0x4, 0xbc, 0xd4, 0xb6, 0xde, 0x66, 0xe, 0xb, 0x63, 0xdb, 0xb3, 0x9e, 0xf6, 0x4e, 0x26, 0x23, 0x4b, 0xf3, 0x9b, 0xf9, 0x91, 0x29, 0x41, 0x44, 0x2c, 0x94, 0xfc, 0x50, 0x38, 0x80, 0xe8, 0xed, 0x85, 0x3d, 0x55, 0x37, 0x5f, 0xe7, 0x8f, 0x8a, 0xe2, 0x5a, 0x32, 0x3e, 0x56, 0xee, 0x86, 0x83, 0xeb, 0x53, 0x3b, 0x59, 0x31, 0x89, 0xe1, 0xe4, 0x8c, 0x34, 0x5c, 0xf0, 0x98, 0x20, 0x48, 0x4d, 0x25, 0x9d, 0xf5, 0x97, 0xff, 0x47, 0x2f, 0x2a, 0x42, 0xfa, 0x92, 0xbf, 0xd7, 0x6f, 0x7, 0x2, 0x6a, 0xd2, 0xba, 0xd8, 0xb0, 0x8, 0x60, 0x65, 0xd, 0xb5, 0xdd, 0x71, 0x19, 0xa1, 0xc9, 0xcc, 0xa4, 0x1c, 0x74, 0x16, 0x7e, 0xc6, 0xae, 0xab, 0xc3, 0x7b, 0x13, 0x21, 0x49, 0xf1, 0x99, 0x9c, 0xf4, 0x4c, 0x24, 0x46, 0x2e, 0x96, 0xfe, 0xfb, 0x93, 0x2b, 0x43, 0xef, 0x87, 0x3f, 0x57, 0x52, 0x3a, 0x82, 0xea, 0x88, 0xe0, 0x58, 0x30, 0x35, 0x5d, 0xe5, 0x8d, 0xa0, 0xc8, 0x70, 0x18, 0x1d, 0x75, 0xcd, 0xa5, 0xc7, 0xaf, 0x17, 0x7f, 0x7a, 0x12, 0xaa, 0xc2, 0x6e, 0x6, 0xbe, 0xd6, 0xd3, 0xbb, 0x3, 0x6b, 0x9, 0x61, 0xd9, 0xb1, 0xb4, 0xdc, 0x64, 0xc}, {0x0, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x2, 0x6f, 0x6, 0xbd, 0xd4, 0xd6, 0xbf, 0x4, 0x6d, 0xde, 0xb7, 0xc, 0x65, 0x67, 0xe, 0xb5, 0xdc, 0xb1, 0xd8, 0x63, 0xa, 0x8, 0x61, 0xda, 0xb3, 0xa1, 0xc8, 0x73, 0x1a, 0x18, 0x71, 0xca, 0xa3, 0xce, 0xa7, 0x1c, 0x75, 0x77, 0x1e, 0xa5, 0xcc, 0x7f, 0x16, 0xad, 0xc4, 0xc6, 0xaf, 0x14, 0x7d, 0x10, 0x79, 0xc2, 0xab, 0xa9, 0xc0, 0x7b, 0x12, 0x5f, 0x36, 0x8d, 0xe4, 0xe6, 0x8f, 0x34, 0x5d, 0x30, 0x59, 0xe2, 0x8b, 0x89, 0xe0, 0x5b, 0x32, 0x81, 0xe8, 0x53, 0x3a, 0x38, 0x51, 0xea, 0x83, 0xee, 0x87, 0x3c, 0x55, 0x57, 0x3e, 0x85, 0xec, 0xfe, 0x97, 0x2c, 0x45, 0x47, 0x2e, 0x95, 0xfc, 0x91, 0xf8, 0x43, 0x2a, 0x28, 0x41, 0xfa, 0x93, 0x20, 0x49, 0xf2, 0x9b, 0x99, 0xf0, 0x4b, 0x22, 0x4f, 0x26, 0x9d, 0xf4, 0xf6, 0x9f, 0x24, 0x4d, 0xbe, 0xd7, 0x6c, 0x5, 0x7, 0x6e, 0xd5, 0xbc, 0xd1, 0xb8, 0x3, 0x6a, 0x68, 0x1, 0xba, 0xd3, 0x60, 0x9, 0xb2, 0xdb, 0xd9, 0xb0, 0xb, 0x62, 0xf, 0x66, 0xdd, 0xb4, 0xb6, 0xdf, 0x64, 0xd, 0x1f, 0x76, 0xcd, 0xa4, 0xa6, 0xcf, 0x74, 0x1d, 0x70, 0x19, 0xa2, 0xcb, 0xc9, 0xa0, 0x1b, 0x72, 0xc1, 0xa8, 0x13, 0x7a, 0x78, 0x11, 0xaa, 0xc3, 0xae, 0xc7, 0x7c, 0x15, 0x17, 0x7e, 0xc5, 0xac, 0xe1, 0x88, 0x33, 0x5a, 0x58, 0x31, 
0x8a, 0xe3, 0x8e, 0xe7, 0x5c, 0x35, 0x37, 0x5e, 0xe5, 0x8c, 0x3f, 0x56, 0xed, 0x84, 0x86, 0xef, 0x54, 0x3d, 0x50, 0x39, 0x82, 0xeb, 0xe9, 0x80, 0x3b, 0x52, 0x40, 0x29, 0x92, 0xfb, 0xf9, 0x90, 0x2b, 0x42, 0x2f, 0x46, 0xfd, 0x94, 0x96, 0xff, 0x44, 0x2d, 0x9e, 0xf7, 0x4c, 0x25, 0x27, 0x4e, 0xf5, 0x9c, 0xf1, 0x98, 0x23, 0x4a, 0x48, 0x21, 0x9a, 0xf3}, {0x0, 0x6a, 0xd4, 0xbe, 0xb5, 0xdf, 0x61, 0xb, 0x77, 0x1d, 0xa3, 0xc9, 0xc2, 0xa8, 0x16, 0x7c, 0xee, 0x84, 0x3a, 0x50, 0x5b, 0x31, 0x8f, 0xe5, 0x99, 0xf3, 0x4d, 0x27, 0x2c, 0x46, 0xf8, 0x92, 0xc1, 0xab, 0x15, 0x7f, 0x74, 0x1e, 0xa0, 0xca, 0xb6, 0xdc, 0x62, 0x8, 0x3, 0x69, 0xd7, 0xbd, 0x2f, 0x45, 0xfb, 0x91, 0x9a, 0xf0, 0x4e, 0x24, 0x58, 0x32, 0x8c, 0xe6, 0xed, 0x87, 0x39, 0x53, 0x9f, 0xf5, 0x4b, 0x21, 0x2a, 0x40, 0xfe, 0x94, 0xe8, 0x82, 0x3c, 0x56, 0x5d, 0x37, 0x89, 0xe3, 0x71, 0x1b, 0xa5, 0xcf, 0xc4, 0xae, 0x10, 0x7a, 0x6, 0x6c, 0xd2, 0xb8, 0xb3, 0xd9, 0x67, 0xd, 0x5e, 0x34, 0x8a, 0xe0, 0xeb, 0x81, 0x3f, 0x55, 0x29, 0x43, 0xfd, 0x97, 0x9c, 0xf6, 0x48, 0x22, 0xb0, 0xda, 0x64, 0xe, 0x5, 0x6f, 0xd1, 0xbb, 0xc7, 0xad, 0x13, 0x79, 0x72, 0x18, 0xa6, 0xcc, 0x23, 0x49, 0xf7, 0x9d, 0x96, 0xfc, 0x42, 0x28, 0x54, 0x3e, 0x80, 0xea, 0xe1, 0x8b, 0x35, 0x5f, 0xcd, 0xa7, 0x19, 0x73, 0x78, 0x12, 0xac, 0xc6, 0xba, 0xd0, 0x6e, 0x4, 0xf, 0x65, 0xdb, 0xb1, 0xe2, 0x88, 0x36, 0x5c, 0x57, 0x3d, 0x83, 0xe9, 0x95, 0xff, 0x41, 0x2b, 0x20, 0x4a, 0xf4, 0x9e, 0xc, 0x66, 0xd8, 0xb2, 0xb9, 0xd3, 0x6d, 0x7, 0x7b, 0x11, 0xaf, 0xc5, 0xce, 0xa4, 0x1a, 0x70, 0xbc, 0xd6, 0x68, 0x2, 0x9, 0x63, 0xdd, 0xb7, 0xcb, 0xa1, 0x1f, 0x75, 0x7e, 0x14, 0xaa, 0xc0, 0x52, 0x38, 0x86, 0xec, 0xe7, 0x8d, 0x33, 0x59, 0x25, 0x4f, 0xf1, 0x9b, 0x90, 0xfa, 0x44, 0x2e, 0x7d, 0x17, 0xa9, 0xc3, 0xc8, 0xa2, 0x1c, 0x76, 0xa, 0x60, 0xde, 0xb4, 0xbf, 0xd5, 0x6b, 0x1, 0x93, 0xf9, 0x47, 0x2d, 0x26, 0x4c, 0xf2, 0x98, 0xe4, 0x8e, 0x30, 0x5a, 0x51, 0x3b, 0x85, 0xef}, {0x0, 0x6b, 0xd6, 0xbd, 0xb1, 0xda, 0x67, 0xc, 0x7f, 0x14, 0xa9, 0xc2, 0xce, 0xa5, 0x18, 0x73, 0xfe, 0x95, 0x28, 0x43, 0x4f, 0x24, 0x99, 0xf2, 0x81, 0xea, 0x57, 0x3c, 0x30, 0x5b, 0xe6, 0x8d, 0xe1, 0x8a, 0x37, 0x5c, 0x50, 0x3b, 0x86, 0xed, 0x9e, 0xf5, 0x48, 0x23, 0x2f, 0x44, 0xf9, 0x92, 0x1f, 0x74, 0xc9, 0xa2, 0xae, 0xc5, 0x78, 0x13, 0x60, 0xb, 0xb6, 0xdd, 0xd1, 0xba, 0x7, 0x6c, 0xdf, 0xb4, 0x9, 0x62, 0x6e, 0x5, 0xb8, 0xd3, 0xa0, 0xcb, 0x76, 0x1d, 0x11, 0x7a, 0xc7, 0xac, 0x21, 0x4a, 0xf7, 0x9c, 0x90, 0xfb, 0x46, 0x2d, 0x5e, 0x35, 0x88, 0xe3, 0xef, 0x84, 0x39, 0x52, 0x3e, 0x55, 0xe8, 0x83, 0x8f, 0xe4, 0x59, 0x32, 0x41, 0x2a, 0x97, 0xfc, 0xf0, 0x9b, 0x26, 0x4d, 0xc0, 0xab, 0x16, 0x7d, 0x71, 0x1a, 0xa7, 0xcc, 0xbf, 0xd4, 0x69, 0x2, 0xe, 0x65, 0xd8, 0xb3, 0xa3, 0xc8, 0x75, 0x1e, 0x12, 0x79, 0xc4, 0xaf, 0xdc, 0xb7, 0xa, 0x61, 0x6d, 0x6, 0xbb, 0xd0, 0x5d, 0x36, 0x8b, 0xe0, 0xec, 0x87, 0x3a, 0x51, 0x22, 0x49, 0xf4, 0x9f, 0x93, 0xf8, 0x45, 0x2e, 0x42, 0x29, 0x94, 0xff, 0xf3, 0x98, 0x25, 0x4e, 0x3d, 0x56, 0xeb, 0x80, 0x8c, 0xe7, 0x5a, 0x31, 0xbc, 0xd7, 0x6a, 0x1, 0xd, 0x66, 0xdb, 0xb0, 0xc3, 0xa8, 0x15, 0x7e, 0x72, 0x19, 0xa4, 0xcf, 0x7c, 0x17, 0xaa, 0xc1, 0xcd, 0xa6, 0x1b, 0x70, 0x3, 0x68, 0xd5, 0xbe, 0xb2, 0xd9, 0x64, 0xf, 0x82, 0xe9, 0x54, 0x3f, 0x33, 0x58, 0xe5, 0x8e, 0xfd, 0x96, 0x2b, 0x40, 0x4c, 0x27, 0x9a, 0xf1, 0x9d, 0xf6, 0x4b, 0x20, 0x2c, 0x47, 0xfa, 0x91, 0xe2, 0x89, 0x34, 0x5f, 0x53, 0x38, 0x85, 0xee, 0x63, 0x8, 0xb5, 0xde, 0xd2, 0xb9, 0x4, 0x6f, 0x1c, 0x77, 0xca, 0xa1, 0xad, 0xc6, 0x7b, 0x10}, {0x0, 0x6c, 0xd8, 0xb4, 0xad, 0xc1, 0x75, 0x19, 0x47, 0x2b, 0x9f, 0xf3, 0xea, 0x86, 0x32, 0x5e, 0x8e, 0xe2, 0x56, 0x3a, 0x23, 0x4f, 0xfb, 0x97, 0xc9, 0xa5, 0x11, 
0x7d, 0x64, 0x8, 0xbc, 0xd0, 0x1, 0x6d, 0xd9, 0xb5, 0xac, 0xc0, 0x74, 0x18, 0x46, 0x2a, 0x9e, 0xf2, 0xeb, 0x87, 0x33, 0x5f, 0x8f, 0xe3, 0x57, 0x3b, 0x22, 0x4e, 0xfa, 0x96, 0xc8, 0xa4, 0x10, 0x7c, 0x65, 0x9, 0xbd, 0xd1, 0x2, 0x6e, 0xda, 0xb6, 0xaf, 0xc3, 0x77, 0x1b, 0x45, 0x29, 0x9d, 0xf1, 0xe8, 0x84, 0x30, 0x5c, 0x8c, 0xe0, 0x54, 0x38, 0x21, 0x4d, 0xf9, 0x95, 0xcb, 0xa7, 0x13, 0x7f, 0x66, 0xa, 0xbe, 0xd2, 0x3, 0x6f, 0xdb, 0xb7, 0xae, 0xc2, 0x76, 0x1a, 0x44, 0x28, 0x9c, 0xf0, 0xe9, 0x85, 0x31, 0x5d, 0x8d, 0xe1, 0x55, 0x39, 0x20, 0x4c, 0xf8, 0x94, 0xca, 0xa6, 0x12, 0x7e, 0x67, 0xb, 0xbf, 0xd3, 0x4, 0x68, 0xdc, 0xb0, 0xa9, 0xc5, 0x71, 0x1d, 0x43, 0x2f, 0x9b, 0xf7, 0xee, 0x82, 0x36, 0x5a, 0x8a, 0xe6, 0x52, 0x3e, 0x27, 0x4b, 0xff, 0x93, 0xcd, 0xa1, 0x15, 0x79, 0x60, 0xc, 0xb8, 0xd4, 0x5, 0x69, 0xdd, 0xb1, 0xa8, 0xc4, 0x70, 0x1c, 0x42, 0x2e, 0x9a, 0xf6, 0xef, 0x83, 0x37, 0x5b, 0x8b, 0xe7, 0x53, 0x3f, 0x26, 0x4a, 0xfe, 0x92, 0xcc, 0xa0, 0x14, 0x78, 0x61, 0xd, 0xb9, 0xd5, 0x6, 0x6a, 0xde, 0xb2, 0xab, 0xc7, 0x73, 0x1f, 0x41, 0x2d, 0x99, 0xf5, 0xec, 0x80, 0x34, 0x58, 0x88, 0xe4, 0x50, 0x3c, 0x25, 0x49, 0xfd, 0x91, 0xcf, 0xa3, 0x17, 0x7b, 0x62, 0xe, 0xba, 0xd6, 0x7, 0x6b, 0xdf, 0xb3, 0xaa, 0xc6, 0x72, 0x1e, 0x40, 0x2c, 0x98, 0xf4, 0xed, 0x81, 0x35, 0x59, 0x89, 0xe5, 0x51, 0x3d, 0x24, 0x48, 0xfc, 0x90, 0xce, 0xa2, 0x16, 0x7a, 0x63, 0xf, 0xbb, 0xd7}, {0x0, 0x6d, 0xda, 0xb7, 0xa9, 0xc4, 0x73, 0x1e, 0x4f, 0x22, 0x95, 0xf8, 0xe6, 0x8b, 0x3c, 0x51, 0x9e, 0xf3, 0x44, 0x29, 0x37, 0x5a, 0xed, 0x80, 0xd1, 0xbc, 0xb, 0x66, 0x78, 0x15, 0xa2, 0xcf, 0x21, 0x4c, 0xfb, 0x96, 0x88, 0xe5, 0x52, 0x3f, 0x6e, 0x3, 0xb4, 0xd9, 0xc7, 0xaa, 0x1d, 0x70, 0xbf, 0xd2, 0x65, 0x8, 0x16, 0x7b, 0xcc, 0xa1, 0xf0, 0x9d, 0x2a, 0x47, 0x59, 0x34, 0x83, 0xee, 0x42, 0x2f, 0x98, 0xf5, 0xeb, 0x86, 0x31, 0x5c, 0xd, 0x60, 0xd7, 0xba, 0xa4, 0xc9, 0x7e, 0x13, 0xdc, 0xb1, 0x6, 0x6b, 0x75, 0x18, 0xaf, 0xc2, 0x93, 0xfe, 0x49, 0x24, 0x3a, 0x57, 0xe0, 0x8d, 0x63, 0xe, 0xb9, 0xd4, 0xca, 0xa7, 0x10, 0x7d, 0x2c, 0x41, 0xf6, 0x9b, 0x85, 0xe8, 0x5f, 0x32, 0xfd, 0x90, 0x27, 0x4a, 0x54, 0x39, 0x8e, 0xe3, 0xb2, 0xdf, 0x68, 0x5, 0x1b, 0x76, 0xc1, 0xac, 0x84, 0xe9, 0x5e, 0x33, 0x2d, 0x40, 0xf7, 0x9a, 0xcb, 0xa6, 0x11, 0x7c, 0x62, 0xf, 0xb8, 0xd5, 0x1a, 0x77, 0xc0, 0xad, 0xb3, 0xde, 0x69, 0x4, 0x55, 0x38, 0x8f, 0xe2, 0xfc, 0x91, 0x26, 0x4b, 0xa5, 0xc8, 0x7f, 0x12, 0xc, 0x61, 0xd6, 0xbb, 0xea, 0x87, 0x30, 0x5d, 0x43, 0x2e, 0x99, 0xf4, 0x3b, 0x56, 0xe1, 0x8c, 0x92, 0xff, 0x48, 0x25, 0x74, 0x19, 0xae, 0xc3, 0xdd, 0xb0, 0x7, 0x6a, 0xc6, 0xab, 0x1c, 0x71, 0x6f, 0x2, 0xb5, 0xd8, 0x89, 0xe4, 0x53, 0x3e, 0x20, 0x4d, 0xfa, 0x97, 0x58, 0x35, 0x82, 0xef, 0xf1, 0x9c, 0x2b, 0x46, 0x17, 0x7a, 0xcd, 0xa0, 0xbe, 0xd3, 0x64, 0x9, 0xe7, 0x8a, 0x3d, 0x50, 0x4e, 0x23, 0x94, 0xf9, 0xa8, 0xc5, 0x72, 0x1f, 0x1, 0x6c, 0xdb, 0xb6, 0x79, 0x14, 0xa3, 0xce, 0xd0, 0xbd, 0xa, 0x67, 0x36, 0x5b, 0xec, 0x81, 0x9f, 0xf2, 0x45, 0x28}, {0x0, 0x6e, 0xdc, 0xb2, 0xa5, 0xcb, 0x79, 0x17, 0x57, 0x39, 0x8b, 0xe5, 0xf2, 0x9c, 0x2e, 0x40, 0xae, 0xc0, 0x72, 0x1c, 0xb, 0x65, 0xd7, 0xb9, 0xf9, 0x97, 0x25, 0x4b, 0x5c, 0x32, 0x80, 0xee, 0x41, 0x2f, 0x9d, 0xf3, 0xe4, 0x8a, 0x38, 0x56, 0x16, 0x78, 0xca, 0xa4, 0xb3, 0xdd, 0x6f, 0x1, 0xef, 0x81, 0x33, 0x5d, 0x4a, 0x24, 0x96, 0xf8, 0xb8, 0xd6, 0x64, 0xa, 0x1d, 0x73, 0xc1, 0xaf, 0x82, 0xec, 0x5e, 0x30, 0x27, 0x49, 0xfb, 0x95, 0xd5, 0xbb, 0x9, 0x67, 0x70, 0x1e, 0xac, 0xc2, 0x2c, 0x42, 0xf0, 0x9e, 0x89, 0xe7, 0x55, 0x3b, 0x7b, 0x15, 0xa7, 0xc9, 0xde, 0xb0, 0x2, 0x6c, 0xc3, 0xad, 0x1f, 0x71, 0x66, 0x8, 0xba, 0xd4, 0x94, 0xfa, 0x48, 0x26, 0x31, 0x5f, 0xed, 0x83, 0x6d, 
0x3, 0xb1, 0xdf, 0xc8, 0xa6, 0x14, 0x7a, 0x3a, 0x54, 0xe6, 0x88, 0x9f, 0xf1, 0x43, 0x2d, 0x19, 0x77, 0xc5, 0xab, 0xbc, 0xd2, 0x60, 0xe, 0x4e, 0x20, 0x92, 0xfc, 0xeb, 0x85, 0x37, 0x59, 0xb7, 0xd9, 0x6b, 0x5, 0x12, 0x7c, 0xce, 0xa0, 0xe0, 0x8e, 0x3c, 0x52, 0x45, 0x2b, 0x99, 0xf7, 0x58, 0x36, 0x84, 0xea, 0xfd, 0x93, 0x21, 0x4f, 0xf, 0x61, 0xd3, 0xbd, 0xaa, 0xc4, 0x76, 0x18, 0xf6, 0x98, 0x2a, 0x44, 0x53, 0x3d, 0x8f, 0xe1, 0xa1, 0xcf, 0x7d, 0x13, 0x4, 0x6a, 0xd8, 0xb6, 0x9b, 0xf5, 0x47, 0x29, 0x3e, 0x50, 0xe2, 0x8c, 0xcc, 0xa2, 0x10, 0x7e, 0x69, 0x7, 0xb5, 0xdb, 0x35, 0x5b, 0xe9, 0x87, 0x90, 0xfe, 0x4c, 0x22, 0x62, 0xc, 0xbe, 0xd0, 0xc7, 0xa9, 0x1b, 0x75, 0xda, 0xb4, 0x6, 0x68, 0x7f, 0x11, 0xa3, 0xcd, 0x8d, 0xe3, 0x51, 0x3f, 0x28, 0x46, 0xf4, 0x9a, 0x74, 0x1a, 0xa8, 0xc6, 0xd1, 0xbf, 0xd, 0x63, 0x23, 0x4d, 0xff, 0x91, 0x86, 0xe8, 0x5a, 0x34}, {0x0, 0x6f, 0xde, 0xb1, 0xa1, 0xce, 0x7f, 0x10, 0x5f, 0x30, 0x81, 0xee, 0xfe, 0x91, 0x20, 0x4f, 0xbe, 0xd1, 0x60, 0xf, 0x1f, 0x70, 0xc1, 0xae, 0xe1, 0x8e, 0x3f, 0x50, 0x40, 0x2f, 0x9e, 0xf1, 0x61, 0xe, 0xbf, 0xd0, 0xc0, 0xaf, 0x1e, 0x71, 0x3e, 0x51, 0xe0, 0x8f, 0x9f, 0xf0, 0x41, 0x2e, 0xdf, 0xb0, 0x1, 0x6e, 0x7e, 0x11, 0xa0, 0xcf, 0x80, 0xef, 0x5e, 0x31, 0x21, 0x4e, 0xff, 0x90, 0xc2, 0xad, 0x1c, 0x73, 0x63, 0xc, 0xbd, 0xd2, 0x9d, 0xf2, 0x43, 0x2c, 0x3c, 0x53, 0xe2, 0x8d, 0x7c, 0x13, 0xa2, 0xcd, 0xdd, 0xb2, 0x3, 0x6c, 0x23, 0x4c, 0xfd, 0x92, 0x82, 0xed, 0x5c, 0x33, 0xa3, 0xcc, 0x7d, 0x12, 0x2, 0x6d, 0xdc, 0xb3, 0xfc, 0x93, 0x22, 0x4d, 0x5d, 0x32, 0x83, 0xec, 0x1d, 0x72, 0xc3, 0xac, 0xbc, 0xd3, 0x62, 0xd, 0x42, 0x2d, 0x9c, 0xf3, 0xe3, 0x8c, 0x3d, 0x52, 0x99, 0xf6, 0x47, 0x28, 0x38, 0x57, 0xe6, 0x89, 0xc6, 0xa9, 0x18, 0x77, 0x67, 0x8, 0xb9, 0xd6, 0x27, 0x48, 0xf9, 0x96, 0x86, 0xe9, 0x58, 0x37, 0x78, 0x17, 0xa6, 0xc9, 0xd9, 0xb6, 0x7, 0x68, 0xf8, 0x97, 0x26, 0x49, 0x59, 0x36, 0x87, 0xe8, 0xa7, 0xc8, 0x79, 0x16, 0x6, 0x69, 0xd8, 0xb7, 0x46, 0x29, 0x98, 0xf7, 0xe7, 0x88, 0x39, 0x56, 0x19, 0x76, 0xc7, 0xa8, 0xb8, 0xd7, 0x66, 0x9, 0x5b, 0x34, 0x85, 0xea, 0xfa, 0x95, 0x24, 0x4b, 0x4, 0x6b, 0xda, 0xb5, 0xa5, 0xca, 0x7b, 0x14, 0xe5, 0x8a, 0x3b, 0x54, 0x44, 0x2b, 0x9a, 0xf5, 0xba, 0xd5, 0x64, 0xb, 0x1b, 0x74, 0xc5, 0xaa, 0x3a, 0x55, 0xe4, 0x8b, 0x9b, 0xf4, 0x45, 0x2a, 0x65, 0xa, 0xbb, 0xd4, 0xc4, 0xab, 0x1a, 0x75, 0x84, 0xeb, 0x5a, 0x35, 0x25, 0x4a, 0xfb, 0x94, 0xdb, 0xb4, 0x5, 0x6a, 0x7a, 0x15, 0xa4, 0xcb}, {0x0, 0x70, 0xe0, 0x90, 0xdd, 0xad, 0x3d, 0x4d, 0xa7, 0xd7, 0x47, 0x37, 0x7a, 0xa, 0x9a, 0xea, 0x53, 0x23, 0xb3, 0xc3, 0x8e, 0xfe, 0x6e, 0x1e, 0xf4, 0x84, 0x14, 0x64, 0x29, 0x59, 0xc9, 0xb9, 0xa6, 0xd6, 0x46, 0x36, 0x7b, 0xb, 0x9b, 0xeb, 0x1, 0x71, 0xe1, 0x91, 0xdc, 0xac, 0x3c, 0x4c, 0xf5, 0x85, 0x15, 0x65, 0x28, 0x58, 0xc8, 0xb8, 0x52, 0x22, 0xb2, 0xc2, 0x8f, 0xff, 0x6f, 0x1f, 0x51, 0x21, 0xb1, 0xc1, 0x8c, 0xfc, 0x6c, 0x1c, 0xf6, 0x86, 0x16, 0x66, 0x2b, 0x5b, 0xcb, 0xbb, 0x2, 0x72, 0xe2, 0x92, 0xdf, 0xaf, 0x3f, 0x4f, 0xa5, 0xd5, 0x45, 0x35, 0x78, 0x8, 0x98, 0xe8, 0xf7, 0x87, 0x17, 0x67, 0x2a, 0x5a, 0xca, 0xba, 0x50, 0x20, 0xb0, 0xc0, 0x8d, 0xfd, 0x6d, 0x1d, 0xa4, 0xd4, 0x44, 0x34, 0x79, 0x9, 0x99, 0xe9, 0x3, 0x73, 0xe3, 0x93, 0xde, 0xae, 0x3e, 0x4e, 0xa2, 0xd2, 0x42, 0x32, 0x7f, 0xf, 0x9f, 0xef, 0x5, 0x75, 0xe5, 0x95, 0xd8, 0xa8, 0x38, 0x48, 0xf1, 0x81, 0x11, 0x61, 0x2c, 0x5c, 0xcc, 0xbc, 0x56, 0x26, 0xb6, 0xc6, 0x8b, 0xfb, 0x6b, 0x1b, 0x4, 0x74, 0xe4, 0x94, 0xd9, 0xa9, 0x39, 0x49, 0xa3, 0xd3, 0x43, 0x33, 0x7e, 0xe, 0x9e, 0xee, 0x57, 0x27, 0xb7, 0xc7, 0x8a, 0xfa, 0x6a, 0x1a, 0xf0, 0x80, 0x10, 0x60, 0x2d, 0x5d, 0xcd, 0xbd, 0xf3, 0x83, 0x13, 0x63, 0x2e, 0x5e, 0xce, 
0xbe, 0x54, 0x24, 0xb4, 0xc4, 0x89, 0xf9, 0x69, 0x19, 0xa0, 0xd0, 0x40, 0x30, 0x7d, 0xd, 0x9d, 0xed, 0x7, 0x77, 0xe7, 0x97, 0xda, 0xaa, 0x3a, 0x4a, 0x55, 0x25, 0xb5, 0xc5, 0x88, 0xf8, 0x68, 0x18, 0xf2, 0x82, 0x12, 0x62, 0x2f, 0x5f, 0xcf, 0xbf, 0x6, 0x76, 0xe6, 0x96, 0xdb, 0xab, 0x3b, 0x4b, 0xa1, 0xd1, 0x41, 0x31, 0x7c, 0xc, 0x9c, 0xec}, {0x0, 0x71, 0xe2, 0x93, 0xd9, 0xa8, 0x3b, 0x4a, 0xaf, 0xde, 0x4d, 0x3c, 0x76, 0x7, 0x94, 0xe5, 0x43, 0x32, 0xa1, 0xd0, 0x9a, 0xeb, 0x78, 0x9, 0xec, 0x9d, 0xe, 0x7f, 0x35, 0x44, 0xd7, 0xa6, 0x86, 0xf7, 0x64, 0x15, 0x5f, 0x2e, 0xbd, 0xcc, 0x29, 0x58, 0xcb, 0xba, 0xf0, 0x81, 0x12, 0x63, 0xc5, 0xb4, 0x27, 0x56, 0x1c, 0x6d, 0xfe, 0x8f, 0x6a, 0x1b, 0x88, 0xf9, 0xb3, 0xc2, 0x51, 0x20, 0x11, 0x60, 0xf3, 0x82, 0xc8, 0xb9, 0x2a, 0x5b, 0xbe, 0xcf, 0x5c, 0x2d, 0x67, 0x16, 0x85, 0xf4, 0x52, 0x23, 0xb0, 0xc1, 0x8b, 0xfa, 0x69, 0x18, 0xfd, 0x8c, 0x1f, 0x6e, 0x24, 0x55, 0xc6, 0xb7, 0x97, 0xe6, 0x75, 0x4, 0x4e, 0x3f, 0xac, 0xdd, 0x38, 0x49, 0xda, 0xab, 0xe1, 0x90, 0x3, 0x72, 0xd4, 0xa5, 0x36, 0x47, 0xd, 0x7c, 0xef, 0x9e, 0x7b, 0xa, 0x99, 0xe8, 0xa2, 0xd3, 0x40, 0x31, 0x22, 0x53, 0xc0, 0xb1, 0xfb, 0x8a, 0x19, 0x68, 0x8d, 0xfc, 0x6f, 0x1e, 0x54, 0x25, 0xb6, 0xc7, 0x61, 0x10, 0x83, 0xf2, 0xb8, 0xc9, 0x5a, 0x2b, 0xce, 0xbf, 0x2c, 0x5d, 0x17, 0x66, 0xf5, 0x84, 0xa4, 0xd5, 0x46, 0x37, 0x7d, 0xc, 0x9f, 0xee, 0xb, 0x7a, 0xe9, 0x98, 0xd2, 0xa3, 0x30, 0x41, 0xe7, 0x96, 0x5, 0x74, 0x3e, 0x4f, 0xdc, 0xad, 0x48, 0x39, 0xaa, 0xdb, 0x91, 0xe0, 0x73, 0x2, 0x33, 0x42, 0xd1, 0xa0, 0xea, 0x9b, 0x8, 0x79, 0x9c, 0xed, 0x7e, 0xf, 0x45, 0x34, 0xa7, 0xd6, 0x70, 0x1, 0x92, 0xe3, 0xa9, 0xd8, 0x4b, 0x3a, 0xdf, 0xae, 0x3d, 0x4c, 0x6, 0x77, 0xe4, 0x95, 0xb5, 0xc4, 0x57, 0x26, 0x6c, 0x1d, 0x8e, 0xff, 0x1a, 0x6b, 0xf8, 0x89, 0xc3, 0xb2, 0x21, 0x50, 0xf6, 0x87, 0x14, 0x65, 0x2f, 0x5e, 0xcd, 0xbc, 0x59, 0x28, 0xbb, 0xca, 0x80, 0xf1, 0x62, 0x13}, {0x0, 0x72, 0xe4, 0x96, 0xd5, 0xa7, 0x31, 0x43, 0xb7, 0xc5, 0x53, 0x21, 0x62, 0x10, 0x86, 0xf4, 0x73, 0x1, 0x97, 0xe5, 0xa6, 0xd4, 0x42, 0x30, 0xc4, 0xb6, 0x20, 0x52, 0x11, 0x63, 0xf5, 0x87, 0xe6, 0x94, 0x2, 0x70, 0x33, 0x41, 0xd7, 0xa5, 0x51, 0x23, 0xb5, 0xc7, 0x84, 0xf6, 0x60, 0x12, 0x95, 0xe7, 0x71, 0x3, 0x40, 0x32, 0xa4, 0xd6, 0x22, 0x50, 0xc6, 0xb4, 0xf7, 0x85, 0x13, 0x61, 0xd1, 0xa3, 0x35, 0x47, 0x4, 0x76, 0xe0, 0x92, 0x66, 0x14, 0x82, 0xf0, 0xb3, 0xc1, 0x57, 0x25, 0xa2, 0xd0, 0x46, 0x34, 0x77, 0x5, 0x93, 0xe1, 0x15, 0x67, 0xf1, 0x83, 0xc0, 0xb2, 0x24, 0x56, 0x37, 0x45, 0xd3, 0xa1, 0xe2, 0x90, 0x6, 0x74, 0x80, 0xf2, 0x64, 0x16, 0x55, 0x27, 0xb1, 0xc3, 0x44, 0x36, 0xa0, 0xd2, 0x91, 0xe3, 0x75, 0x7, 0xf3, 0x81, 0x17, 0x65, 0x26, 0x54, 0xc2, 0xb0, 0xbf, 0xcd, 0x5b, 0x29, 0x6a, 0x18, 0x8e, 0xfc, 0x8, 0x7a, 0xec, 0x9e, 0xdd, 0xaf, 0x39, 0x4b, 0xcc, 0xbe, 0x28, 0x5a, 0x19, 0x6b, 0xfd, 0x8f, 0x7b, 0x9, 0x9f, 0xed, 0xae, 0xdc, 0x4a, 0x38, 0x59, 0x2b, 0xbd, 0xcf, 0x8c, 0xfe, 0x68, 0x1a, 0xee, 0x9c, 0xa, 0x78, 0x3b, 0x49, 0xdf, 0xad, 0x2a, 0x58, 0xce, 0xbc, 0xff, 0x8d, 0x1b, 0x69, 0x9d, 0xef, 0x79, 0xb, 0x48, 0x3a, 0xac, 0xde, 0x6e, 0x1c, 0x8a, 0xf8, 0xbb, 0xc9, 0x5f, 0x2d, 0xd9, 0xab, 0x3d, 0x4f, 0xc, 0x7e, 0xe8, 0x9a, 0x1d, 0x6f, 0xf9, 0x8b, 0xc8, 0xba, 0x2c, 0x5e, 0xaa, 0xd8, 0x4e, 0x3c, 0x7f, 0xd, 0x9b, 0xe9, 0x88, 0xfa, 0x6c, 0x1e, 0x5d, 0x2f, 0xb9, 0xcb, 0x3f, 0x4d, 0xdb, 0xa9, 0xea, 0x98, 0xe, 0x7c, 0xfb, 0x89, 0x1f, 0x6d, 0x2e, 0x5c, 0xca, 0xb8, 0x4c, 0x3e, 0xa8, 0xda, 0x99, 0xeb, 0x7d, 0xf}, {0x0, 0x73, 0xe6, 0x95, 0xd1, 0xa2, 0x37, 0x44, 0xbf, 0xcc, 0x59, 0x2a, 0x6e, 0x1d, 0x88, 0xfb, 0x63, 0x10, 0x85, 0xf6, 0xb2, 0xc1, 0x54, 0x27, 0xdc, 0xaf, 0x3a, 0x49, 
0xd, 0x7e, 0xeb, 0x98, 0xc6, 0xb5, 0x20, 0x53, 0x17, 0x64, 0xf1, 0x82, 0x79, 0xa, 0x9f, 0xec, 0xa8, 0xdb, 0x4e, 0x3d, 0xa5, 0xd6, 0x43, 0x30, 0x74, 0x7, 0x92, 0xe1, 0x1a, 0x69, 0xfc, 0x8f, 0xcb, 0xb8, 0x2d, 0x5e, 0x91, 0xe2, 0x77, 0x4, 0x40, 0x33, 0xa6, 0xd5, 0x2e, 0x5d, 0xc8, 0xbb, 0xff, 0x8c, 0x19, 0x6a, 0xf2, 0x81, 0x14, 0x67, 0x23, 0x50, 0xc5, 0xb6, 0x4d, 0x3e, 0xab, 0xd8, 0x9c, 0xef, 0x7a, 0x9, 0x57, 0x24, 0xb1, 0xc2, 0x86, 0xf5, 0x60, 0x13, 0xe8, 0x9b, 0xe, 0x7d, 0x39, 0x4a, 0xdf, 0xac, 0x34, 0x47, 0xd2, 0xa1, 0xe5, 0x96, 0x3, 0x70, 0x8b, 0xf8, 0x6d, 0x1e, 0x5a, 0x29, 0xbc, 0xcf, 0x3f, 0x4c, 0xd9, 0xaa, 0xee, 0x9d, 0x8, 0x7b, 0x80, 0xf3, 0x66, 0x15, 0x51, 0x22, 0xb7, 0xc4, 0x5c, 0x2f, 0xba, 0xc9, 0x8d, 0xfe, 0x6b, 0x18, 0xe3, 0x90, 0x5, 0x76, 0x32, 0x41, 0xd4, 0xa7, 0xf9, 0x8a, 0x1f, 0x6c, 0x28, 0x5b, 0xce, 0xbd, 0x46, 0x35, 0xa0, 0xd3, 0x97, 0xe4, 0x71, 0x2, 0x9a, 0xe9, 0x7c, 0xf, 0x4b, 0x38, 0xad, 0xde, 0x25, 0x56, 0xc3, 0xb0, 0xf4, 0x87, 0x12, 0x61, 0xae, 0xdd, 0x48, 0x3b, 0x7f, 0xc, 0x99, 0xea, 0x11, 0x62, 0xf7, 0x84, 0xc0, 0xb3, 0x26, 0x55, 0xcd, 0xbe, 0x2b, 0x58, 0x1c, 0x6f, 0xfa, 0x89, 0x72, 0x1, 0x94, 0xe7, 0xa3, 0xd0, 0x45, 0x36, 0x68, 0x1b, 0x8e, 0xfd, 0xb9, 0xca, 0x5f, 0x2c, 0xd7, 0xa4, 0x31, 0x42, 0x6, 0x75, 0xe0, 0x93, 0xb, 0x78, 0xed, 0x9e, 0xda, 0xa9, 0x3c, 0x4f, 0xb4, 0xc7, 0x52, 0x21, 0x65, 0x16, 0x83, 0xf0}, {0x0, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6, 0x13, 0x67, 0xfb, 0x8f, 0xde, 0xaa, 0x36, 0x42, 0x94, 0xe0, 0x7c, 0x8, 0x59, 0x2d, 0xb1, 0xc5, 0x26, 0x52, 0xce, 0xba, 0xeb, 0x9f, 0x3, 0x77, 0xa1, 0xd5, 0x49, 0x3d, 0x6c, 0x18, 0x84, 0xf0, 0x35, 0x41, 0xdd, 0xa9, 0xf8, 0x8c, 0x10, 0x64, 0xb2, 0xc6, 0x5a, 0x2e, 0x7f, 0xb, 0x97, 0xe3, 0x4c, 0x38, 0xa4, 0xd0, 0x81, 0xf5, 0x69, 0x1d, 0xcb, 0xbf, 0x23, 0x57, 0x6, 0x72, 0xee, 0x9a, 0x5f, 0x2b, 0xb7, 0xc3, 0x92, 0xe6, 0x7a, 0xe, 0xd8, 0xac, 0x30, 0x44, 0x15, 0x61, 0xfd, 0x89, 0x6a, 0x1e, 0x82, 0xf6, 0xa7, 0xd3, 0x4f, 0x3b, 0xed, 0x99, 0x5, 0x71, 0x20, 0x54, 0xc8, 0xbc, 0x79, 0xd, 0x91, 0xe5, 0xb4, 0xc0, 0x5c, 0x28, 0xfe, 0x8a, 0x16, 0x62, 0x33, 0x47, 0xdb, 0xaf, 0x98, 0xec, 0x70, 0x4, 0x55, 0x21, 0xbd, 0xc9, 0x1f, 0x6b, 0xf7, 0x83, 0xd2, 0xa6, 0x3a, 0x4e, 0x8b, 0xff, 0x63, 0x17, 0x46, 0x32, 0xae, 0xda, 0xc, 0x78, 0xe4, 0x90, 0xc1, 0xb5, 0x29, 0x5d, 0xbe, 0xca, 0x56, 0x22, 0x73, 0x7, 0x9b, 0xef, 0x39, 0x4d, 0xd1, 0xa5, 0xf4, 0x80, 0x1c, 0x68, 0xad, 0xd9, 0x45, 0x31, 0x60, 0x14, 0x88, 0xfc, 0x2a, 0x5e, 0xc2, 0xb6, 0xe7, 0x93, 0xf, 0x7b, 0xd4, 0xa0, 0x3c, 0x48, 0x19, 0x6d, 0xf1, 0x85, 0x53, 0x27, 0xbb, 0xcf, 0x9e, 0xea, 0x76, 0x2, 0xc7, 0xb3, 0x2f, 0x5b, 0xa, 0x7e, 0xe2, 0x96, 0x40, 0x34, 0xa8, 0xdc, 0x8d, 0xf9, 0x65, 0x11, 0xf2, 0x86, 0x1a, 0x6e, 0x3f, 0x4b, 0xd7, 0xa3, 0x75, 0x1, 0x9d, 0xe9, 0xb8, 0xcc, 0x50, 0x24, 0xe1, 0x95, 0x9, 0x7d, 0x2c, 0x58, 0xc4, 0xb0, 0x66, 0x12, 0x8e, 0xfa, 0xab, 0xdf, 0x43, 0x37}, {0x0, 0x75, 0xea, 0x9f, 0xc9, 0xbc, 0x23, 0x56, 0x8f, 0xfa, 0x65, 0x10, 0x46, 0x33, 0xac, 0xd9, 0x3, 0x76, 0xe9, 0x9c, 0xca, 0xbf, 0x20, 0x55, 0x8c, 0xf9, 0x66, 0x13, 0x45, 0x30, 0xaf, 0xda, 0x6, 0x73, 0xec, 0x99, 0xcf, 0xba, 0x25, 0x50, 0x89, 0xfc, 0x63, 0x16, 0x40, 0x35, 0xaa, 0xdf, 0x5, 0x70, 0xef, 0x9a, 0xcc, 0xb9, 0x26, 0x53, 0x8a, 0xff, 0x60, 0x15, 0x43, 0x36, 0xa9, 0xdc, 0xc, 0x79, 0xe6, 0x93, 0xc5, 0xb0, 0x2f, 0x5a, 0x83, 0xf6, 0x69, 0x1c, 0x4a, 0x3f, 0xa0, 0xd5, 0xf, 0x7a, 0xe5, 0x90, 0xc6, 0xb3, 0x2c, 0x59, 0x80, 0xf5, 0x6a, 0x1f, 0x49, 0x3c, 0xa3, 0xd6, 0xa, 0x7f, 0xe0, 0x95, 0xc3, 0xb6, 0x29, 0x5c, 0x85, 0xf0, 0x6f, 0x1a, 0x4c, 0x39, 0xa6, 0xd3, 0x9, 0x7c, 
0xe3, 0x96, 0xc0, 0xb5, 0x2a, 0x5f, 0x86, 0xf3, 0x6c, 0x19, 0x4f, 0x3a, 0xa5, 0xd0, 0x18, 0x6d, 0xf2, 0x87, 0xd1, 0xa4, 0x3b, 0x4e, 0x97, 0xe2, 0x7d, 0x8, 0x5e, 0x2b, 0xb4, 0xc1, 0x1b, 0x6e, 0xf1, 0x84, 0xd2, 0xa7, 0x38, 0x4d, 0x94, 0xe1, 0x7e, 0xb, 0x5d, 0x28, 0xb7, 0xc2, 0x1e, 0x6b, 0xf4, 0x81, 0xd7, 0xa2, 0x3d, 0x48, 0x91, 0xe4, 0x7b, 0xe, 0x58, 0x2d, 0xb2, 0xc7, 0x1d, 0x68, 0xf7, 0x82, 0xd4, 0xa1, 0x3e, 0x4b, 0x92, 0xe7, 0x78, 0xd, 0x5b, 0x2e, 0xb1, 0xc4, 0x14, 0x61, 0xfe, 0x8b, 0xdd, 0xa8, 0x37, 0x42, 0x9b, 0xee, 0x71, 0x4, 0x52, 0x27, 0xb8, 0xcd, 0x17, 0x62, 0xfd, 0x88, 0xde, 0xab, 0x34, 0x41, 0x98, 0xed, 0x72, 0x7, 0x51, 0x24, 0xbb, 0xce, 0x12, 0x67, 0xf8, 0x8d, 0xdb, 0xae, 0x31, 0x44, 0x9d, 0xe8, 0x77, 0x2, 0x54, 0x21, 0xbe, 0xcb, 0x11, 0x64, 0xfb, 0x8e, 0xd8, 0xad, 0x32, 0x47, 0x9e, 0xeb, 0x74, 0x1, 0x57, 0x22, 0xbd, 0xc8}, {0x0, 0x76, 0xec, 0x9a, 0xc5, 0xb3, 0x29, 0x5f, 0x97, 0xe1, 0x7b, 0xd, 0x52, 0x24, 0xbe, 0xc8, 0x33, 0x45, 0xdf, 0xa9, 0xf6, 0x80, 0x1a, 0x6c, 0xa4, 0xd2, 0x48, 0x3e, 0x61, 0x17, 0x8d, 0xfb, 0x66, 0x10, 0x8a, 0xfc, 0xa3, 0xd5, 0x4f, 0x39, 0xf1, 0x87, 0x1d, 0x6b, 0x34, 0x42, 0xd8, 0xae, 0x55, 0x23, 0xb9, 0xcf, 0x90, 0xe6, 0x7c, 0xa, 0xc2, 0xb4, 0x2e, 0x58, 0x7, 0x71, 0xeb, 0x9d, 0xcc, 0xba, 0x20, 0x56, 0x9, 0x7f, 0xe5, 0x93, 0x5b, 0x2d, 0xb7, 0xc1, 0x9e, 0xe8, 0x72, 0x4, 0xff, 0x89, 0x13, 0x65, 0x3a, 0x4c, 0xd6, 0xa0, 0x68, 0x1e, 0x84, 0xf2, 0xad, 0xdb, 0x41, 0x37, 0xaa, 0xdc, 0x46, 0x30, 0x6f, 0x19, 0x83, 0xf5, 0x3d, 0x4b, 0xd1, 0xa7, 0xf8, 0x8e, 0x14, 0x62, 0x99, 0xef, 0x75, 0x3, 0x5c, 0x2a, 0xb0, 0xc6, 0xe, 0x78, 0xe2, 0x94, 0xcb, 0xbd, 0x27, 0x51, 0x85, 0xf3, 0x69, 0x1f, 0x40, 0x36, 0xac, 0xda, 0x12, 0x64, 0xfe, 0x88, 0xd7, 0xa1, 0x3b, 0x4d, 0xb6, 0xc0, 0x5a, 0x2c, 0x73, 0x5, 0x9f, 0xe9, 0x21, 0x57, 0xcd, 0xbb, 0xe4, 0x92, 0x8, 0x7e, 0xe3, 0x95, 0xf, 0x79, 0x26, 0x50, 0xca, 0xbc, 0x74, 0x2, 0x98, 0xee, 0xb1, 0xc7, 0x5d, 0x2b, 0xd0, 0xa6, 0x3c, 0x4a, 0x15, 0x63, 0xf9, 0x8f, 0x47, 0x31, 0xab, 0xdd, 0x82, 0xf4, 0x6e, 0x18, 0x49, 0x3f, 0xa5, 0xd3, 0x8c, 0xfa, 0x60, 0x16, 0xde, 0xa8, 0x32, 0x44, 0x1b, 0x6d, 0xf7, 0x81, 0x7a, 0xc, 0x96, 0xe0, 0xbf, 0xc9, 0x53, 0x25, 0xed, 0x9b, 0x1, 0x77, 0x28, 0x5e, 0xc4, 0xb2, 0x2f, 0x59, 0xc3, 0xb5, 0xea, 0x9c, 0x6, 0x70, 0xb8, 0xce, 0x54, 0x22, 0x7d, 0xb, 0x91, 0xe7, 0x1c, 0x6a, 0xf0, 0x86, 0xd9, 0xaf, 0x35, 0x43, 0x8b, 0xfd, 0x67, 0x11, 0x4e, 0x38, 0xa2, 0xd4}, {0x0, 0x77, 0xee, 0x99, 0xc1, 0xb6, 0x2f, 0x58, 0x9f, 0xe8, 0x71, 0x6, 0x5e, 0x29, 0xb0, 0xc7, 0x23, 0x54, 0xcd, 0xba, 0xe2, 0x95, 0xc, 0x7b, 0xbc, 0xcb, 0x52, 0x25, 0x7d, 0xa, 0x93, 0xe4, 0x46, 0x31, 0xa8, 0xdf, 0x87, 0xf0, 0x69, 0x1e, 0xd9, 0xae, 0x37, 0x40, 0x18, 0x6f, 0xf6, 0x81, 0x65, 0x12, 0x8b, 0xfc, 0xa4, 0xd3, 0x4a, 0x3d, 0xfa, 0x8d, 0x14, 0x63, 0x3b, 0x4c, 0xd5, 0xa2, 0x8c, 0xfb, 0x62, 0x15, 0x4d, 0x3a, 0xa3, 0xd4, 0x13, 0x64, 0xfd, 0x8a, 0xd2, 0xa5, 0x3c, 0x4b, 0xaf, 0xd8, 0x41, 0x36, 0x6e, 0x19, 0x80, 0xf7, 0x30, 0x47, 0xde, 0xa9, 0xf1, 0x86, 0x1f, 0x68, 0xca, 0xbd, 0x24, 0x53, 0xb, 0x7c, 0xe5, 0x92, 0x55, 0x22, 0xbb, 0xcc, 0x94, 0xe3, 0x7a, 0xd, 0xe9, 0x9e, 0x7, 0x70, 0x28, 0x5f, 0xc6, 0xb1, 0x76, 0x1, 0x98, 0xef, 0xb7, 0xc0, 0x59, 0x2e, 0x5, 0x72, 0xeb, 0x9c, 0xc4, 0xb3, 0x2a, 0x5d, 0x9a, 0xed, 0x74, 0x3, 0x5b, 0x2c, 0xb5, 0xc2, 0x26, 0x51, 0xc8, 0xbf, 0xe7, 0x90, 0x9, 0x7e, 0xb9, 0xce, 0x57, 0x20, 0x78, 0xf, 0x96, 0xe1, 0x43, 0x34, 0xad, 0xda, 0x82, 0xf5, 0x6c, 0x1b, 0xdc, 0xab, 0x32, 0x45, 0x1d, 0x6a, 0xf3, 0x84, 0x60, 0x17, 0x8e, 0xf9, 0xa1, 0xd6, 0x4f, 0x38, 0xff, 0x88, 0x11, 0x66, 0x3e, 0x49, 0xd0, 0xa7, 0x89, 0xfe, 0x67, 0x10, 0x48, 0x3f, 0xa6, 
0xd1, 0x16, 0x61, 0xf8, 0x8f, 0xd7, 0xa0, 0x39, 0x4e, 0xaa, 0xdd, 0x44, 0x33, 0x6b, 0x1c, 0x85, 0xf2, 0x35, 0x42, 0xdb, 0xac, 0xf4, 0x83, 0x1a, 0x6d, 0xcf, 0xb8, 0x21, 0x56, 0xe, 0x79, 0xe0, 0x97, 0x50, 0x27, 0xbe, 0xc9, 0x91, 0xe6, 0x7f, 0x8, 0xec, 0x9b, 0x2, 0x75, 0x2d, 0x5a, 0xc3, 0xb4, 0x73, 0x4, 0x9d, 0xea, 0xb2, 0xc5, 0x5c, 0x2b}, {0x0, 0x78, 0xf0, 0x88, 0xfd, 0x85, 0xd, 0x75, 0xe7, 0x9f, 0x17, 0x6f, 0x1a, 0x62, 0xea, 0x92, 0xd3, 0xab, 0x23, 0x5b, 0x2e, 0x56, 0xde, 0xa6, 0x34, 0x4c, 0xc4, 0xbc, 0xc9, 0xb1, 0x39, 0x41, 0xbb, 0xc3, 0x4b, 0x33, 0x46, 0x3e, 0xb6, 0xce, 0x5c, 0x24, 0xac, 0xd4, 0xa1, 0xd9, 0x51, 0x29, 0x68, 0x10, 0x98, 0xe0, 0x95, 0xed, 0x65, 0x1d, 0x8f, 0xf7, 0x7f, 0x7, 0x72, 0xa, 0x82, 0xfa, 0x6b, 0x13, 0x9b, 0xe3, 0x96, 0xee, 0x66, 0x1e, 0x8c, 0xf4, 0x7c, 0x4, 0x71, 0x9, 0x81, 0xf9, 0xb8, 0xc0, 0x48, 0x30, 0x45, 0x3d, 0xb5, 0xcd, 0x5f, 0x27, 0xaf, 0xd7, 0xa2, 0xda, 0x52, 0x2a, 0xd0, 0xa8, 0x20, 0x58, 0x2d, 0x55, 0xdd, 0xa5, 0x37, 0x4f, 0xc7, 0xbf, 0xca, 0xb2, 0x3a, 0x42, 0x3, 0x7b, 0xf3, 0x8b, 0xfe, 0x86, 0xe, 0x76, 0xe4, 0x9c, 0x14, 0x6c, 0x19, 0x61, 0xe9, 0x91, 0xd6, 0xae, 0x26, 0x5e, 0x2b, 0x53, 0xdb, 0xa3, 0x31, 0x49, 0xc1, 0xb9, 0xcc, 0xb4, 0x3c, 0x44, 0x5, 0x7d, 0xf5, 0x8d, 0xf8, 0x80, 0x8, 0x70, 0xe2, 0x9a, 0x12, 0x6a, 0x1f, 0x67, 0xef, 0x97, 0x6d, 0x15, 0x9d, 0xe5, 0x90, 0xe8, 0x60, 0x18, 0x8a, 0xf2, 0x7a, 0x2, 0x77, 0xf, 0x87, 0xff, 0xbe, 0xc6, 0x4e, 0x36, 0x43, 0x3b, 0xb3, 0xcb, 0x59, 0x21, 0xa9, 0xd1, 0xa4, 0xdc, 0x54, 0x2c, 0xbd, 0xc5, 0x4d, 0x35, 0x40, 0x38, 0xb0, 0xc8, 0x5a, 0x22, 0xaa, 0xd2, 0xa7, 0xdf, 0x57, 0x2f, 0x6e, 0x16, 0x9e, 0xe6, 0x93, 0xeb, 0x63, 0x1b, 0x89, 0xf1, 0x79, 0x1, 0x74, 0xc, 0x84, 0xfc, 0x6, 0x7e, 0xf6, 0x8e, 0xfb, 0x83, 0xb, 0x73, 0xe1, 0x99, 0x11, 0x69, 0x1c, 0x64, 0xec, 0x94, 0xd5, 0xad, 0x25, 0x5d, 0x28, 0x50, 0xd8, 0xa0, 0x32, 0x4a, 0xc2, 0xba, 0xcf, 0xb7, 0x3f, 0x47}, {0x0, 0x79, 0xf2, 0x8b, 0xf9, 0x80, 0xb, 0x72, 0xef, 0x96, 0x1d, 0x64, 0x16, 0x6f, 0xe4, 0x9d, 0xc3, 0xba, 0x31, 0x48, 0x3a, 0x43, 0xc8, 0xb1, 0x2c, 0x55, 0xde, 0xa7, 0xd5, 0xac, 0x27, 0x5e, 0x9b, 0xe2, 0x69, 0x10, 0x62, 0x1b, 0x90, 0xe9, 0x74, 0xd, 0x86, 0xff, 0x8d, 0xf4, 0x7f, 0x6, 0x58, 0x21, 0xaa, 0xd3, 0xa1, 0xd8, 0x53, 0x2a, 0xb7, 0xce, 0x45, 0x3c, 0x4e, 0x37, 0xbc, 0xc5, 0x2b, 0x52, 0xd9, 0xa0, 0xd2, 0xab, 0x20, 0x59, 0xc4, 0xbd, 0x36, 0x4f, 0x3d, 0x44, 0xcf, 0xb6, 0xe8, 0x91, 0x1a, 0x63, 0x11, 0x68, 0xe3, 0x9a, 0x7, 0x7e, 0xf5, 0x8c, 0xfe, 0x87, 0xc, 0x75, 0xb0, 0xc9, 0x42, 0x3b, 0x49, 0x30, 0xbb, 0xc2, 0x5f, 0x26, 0xad, 0xd4, 0xa6, 0xdf, 0x54, 0x2d, 0x73, 0xa, 0x81, 0xf8, 0x8a, 0xf3, 0x78, 0x1, 0x9c, 0xe5, 0x6e, 0x17, 0x65, 0x1c, 0x97, 0xee, 0x56, 0x2f, 0xa4, 0xdd, 0xaf, 0xd6, 0x5d, 0x24, 0xb9, 0xc0, 0x4b, 0x32, 0x40, 0x39, 0xb2, 0xcb, 0x95, 0xec, 0x67, 0x1e, 0x6c, 0x15, 0x9e, 0xe7, 0x7a, 0x3, 0x88, 0xf1, 0x83, 0xfa, 0x71, 0x8, 0xcd, 0xb4, 0x3f, 0x46, 0x34, 0x4d, 0xc6, 0xbf, 0x22, 0x5b, 0xd0, 0xa9, 0xdb, 0xa2, 0x29, 0x50, 0xe, 0x77, 0xfc, 0x85, 0xf7, 0x8e, 0x5, 0x7c, 0xe1, 0x98, 0x13, 0x6a, 0x18, 0x61, 0xea, 0x93, 0x7d, 0x4, 0x8f, 0xf6, 0x84, 0xfd, 0x76, 0xf, 0x92, 0xeb, 0x60, 0x19, 0x6b, 0x12, 0x99, 0xe0, 0xbe, 0xc7, 0x4c, 0x35, 0x47, 0x3e, 0xb5, 0xcc, 0x51, 0x28, 0xa3, 0xda, 0xa8, 0xd1, 0x5a, 0x23, 0xe6, 0x9f, 0x14, 0x6d, 0x1f, 0x66, 0xed, 0x94, 0x9, 0x70, 0xfb, 0x82, 0xf0, 0x89, 0x2, 0x7b, 0x25, 0x5c, 0xd7, 0xae, 0xdc, 0xa5, 0x2e, 0x57, 0xca, 0xb3, 0x38, 0x41, 0x33, 0x4a, 0xc1, 0xb8}, {0x0, 0x7a, 0xf4, 0x8e, 0xf5, 0x8f, 0x1, 0x7b, 0xf7, 0x8d, 0x3, 0x79, 0x2, 0x78, 0xf6, 0x8c, 0xf3, 0x89, 0x7, 0x7d, 0x6, 0x7c, 0xf2, 0x88, 0x4, 0x7e, 0xf0, 0x8a, 0xf1, 
0x8b, 0x5, 0x7f, 0xfb, 0x81, 0xf, 0x75, 0xe, 0x74, 0xfa, 0x80, 0xc, 0x76, 0xf8, 0x82, 0xf9, 0x83, 0xd, 0x77, 0x8, 0x72, 0xfc, 0x86, 0xfd, 0x87, 0x9, 0x73, 0xff, 0x85, 0xb, 0x71, 0xa, 0x70, 0xfe, 0x84, 0xeb, 0x91, 0x1f, 0x65, 0x1e, 0x64, 0xea, 0x90, 0x1c, 0x66, 0xe8, 0x92, 0xe9, 0x93, 0x1d, 0x67, 0x18, 0x62, 0xec, 0x96, 0xed, 0x97, 0x19, 0x63, 0xef, 0x95, 0x1b, 0x61, 0x1a, 0x60, 0xee, 0x94, 0x10, 0x6a, 0xe4, 0x9e, 0xe5, 0x9f, 0x11, 0x6b, 0xe7, 0x9d, 0x13, 0x69, 0x12, 0x68, 0xe6, 0x9c, 0xe3, 0x99, 0x17, 0x6d, 0x16, 0x6c, 0xe2, 0x98, 0x14, 0x6e, 0xe0, 0x9a, 0xe1, 0x9b, 0x15, 0x6f, 0xcb, 0xb1, 0x3f, 0x45, 0x3e, 0x44, 0xca, 0xb0, 0x3c, 0x46, 0xc8, 0xb2, 0xc9, 0xb3, 0x3d, 0x47, 0x38, 0x42, 0xcc, 0xb6, 0xcd, 0xb7, 0x39, 0x43, 0xcf, 0xb5, 0x3b, 0x41, 0x3a, 0x40, 0xce, 0xb4, 0x30, 0x4a, 0xc4, 0xbe, 0xc5, 0xbf, 0x31, 0x4b, 0xc7, 0xbd, 0x33, 0x49, 0x32, 0x48, 0xc6, 0xbc, 0xc3, 0xb9, 0x37, 0x4d, 0x36, 0x4c, 0xc2, 0xb8, 0x34, 0x4e, 0xc0, 0xba, 0xc1, 0xbb, 0x35, 0x4f, 0x20, 0x5a, 0xd4, 0xae, 0xd5, 0xaf, 0x21, 0x5b, 0xd7, 0xad, 0x23, 0x59, 0x22, 0x58, 0xd6, 0xac, 0xd3, 0xa9, 0x27, 0x5d, 0x26, 0x5c, 0xd2, 0xa8, 0x24, 0x5e, 0xd0, 0xaa, 0xd1, 0xab, 0x25, 0x5f, 0xdb, 0xa1, 0x2f, 0x55, 0x2e, 0x54, 0xda, 0xa0, 0x2c, 0x56, 0xd8, 0xa2, 0xd9, 0xa3, 0x2d, 0x57, 0x28, 0x52, 0xdc, 0xa6, 0xdd, 0xa7, 0x29, 0x53, 0xdf, 0xa5, 0x2b, 0x51, 0x2a, 0x50, 0xde, 0xa4}, {0x0, 0x7b, 0xf6, 0x8d, 0xf1, 0x8a, 0x7, 0x7c, 0xff, 0x84, 0x9, 0x72, 0xe, 0x75, 0xf8, 0x83, 0xe3, 0x98, 0x15, 0x6e, 0x12, 0x69, 0xe4, 0x9f, 0x1c, 0x67, 0xea, 0x91, 0xed, 0x96, 0x1b, 0x60, 0xdb, 0xa0, 0x2d, 0x56, 0x2a, 0x51, 0xdc, 0xa7, 0x24, 0x5f, 0xd2, 0xa9, 0xd5, 0xae, 0x23, 0x58, 0x38, 0x43, 0xce, 0xb5, 0xc9, 0xb2, 0x3f, 0x44, 0xc7, 0xbc, 0x31, 0x4a, 0x36, 0x4d, 0xc0, 0xbb, 0xab, 0xd0, 0x5d, 0x26, 0x5a, 0x21, 0xac, 0xd7, 0x54, 0x2f, 0xa2, 0xd9, 0xa5, 0xde, 0x53, 0x28, 0x48, 0x33, 0xbe, 0xc5, 0xb9, 0xc2, 0x4f, 0x34, 0xb7, 0xcc, 0x41, 0x3a, 0x46, 0x3d, 0xb0, 0xcb, 0x70, 0xb, 0x86, 0xfd, 0x81, 0xfa, 0x77, 0xc, 0x8f, 0xf4, 0x79, 0x2, 0x7e, 0x5, 0x88, 0xf3, 0x93, 0xe8, 0x65, 0x1e, 0x62, 0x19, 0x94, 0xef, 0x6c, 0x17, 0x9a, 0xe1, 0x9d, 0xe6, 0x6b, 0x10, 0x4b, 0x30, 0xbd, 0xc6, 0xba, 0xc1, 0x4c, 0x37, 0xb4, 0xcf, 0x42, 0x39, 0x45, 0x3e, 0xb3, 0xc8, 0xa8, 0xd3, 0x5e, 0x25, 0x59, 0x22, 0xaf, 0xd4, 0x57, 0x2c, 0xa1, 0xda, 0xa6, 0xdd, 0x50, 0x2b, 0x90, 0xeb, 0x66, 0x1d, 0x61, 0x1a, 0x97, 0xec, 0x6f, 0x14, 0x99, 0xe2, 0x9e, 0xe5, 0x68, 0x13, 0x73, 0x8, 0x85, 0xfe, 0x82, 0xf9, 0x74, 0xf, 0x8c, 0xf7, 0x7a, 0x1, 0x7d, 0x6, 0x8b, 0xf0, 0xe0, 0x9b, 0x16, 0x6d, 0x11, 0x6a, 0xe7, 0x9c, 0x1f, 0x64, 0xe9, 0x92, 0xee, 0x95, 0x18, 0x63, 0x3, 0x78, 0xf5, 0x8e, 0xf2, 0x89, 0x4, 0x7f, 0xfc, 0x87, 0xa, 0x71, 0xd, 0x76, 0xfb, 0x80, 0x3b, 0x40, 0xcd, 0xb6, 0xca, 0xb1, 0x3c, 0x47, 0xc4, 0xbf, 0x32, 0x49, 0x35, 0x4e, 0xc3, 0xb8, 0xd8, 0xa3, 0x2e, 0x55, 0x29, 0x52, 0xdf, 0xa4, 0x27, 0x5c, 0xd1, 0xaa, 0xd6, 0xad, 0x20, 0x5b}, {0x0, 0x7c, 0xf8, 0x84, 0xed, 0x91, 0x15, 0x69, 0xc7, 0xbb, 0x3f, 0x43, 0x2a, 0x56, 0xd2, 0xae, 0x93, 0xef, 0x6b, 0x17, 0x7e, 0x2, 0x86, 0xfa, 0x54, 0x28, 0xac, 0xd0, 0xb9, 0xc5, 0x41, 0x3d, 0x3b, 0x47, 0xc3, 0xbf, 0xd6, 0xaa, 0x2e, 0x52, 0xfc, 0x80, 0x4, 0x78, 0x11, 0x6d, 0xe9, 0x95, 0xa8, 0xd4, 0x50, 0x2c, 0x45, 0x39, 0xbd, 0xc1, 0x6f, 0x13, 0x97, 0xeb, 0x82, 0xfe, 0x7a, 0x6, 0x76, 0xa, 0x8e, 0xf2, 0x9b, 0xe7, 0x63, 0x1f, 0xb1, 0xcd, 0x49, 0x35, 0x5c, 0x20, 0xa4, 0xd8, 0xe5, 0x99, 0x1d, 0x61, 0x8, 0x74, 0xf0, 0x8c, 0x22, 0x5e, 0xda, 0xa6, 0xcf, 0xb3, 0x37, 0x4b, 0x4d, 0x31, 0xb5, 0xc9, 0xa0, 0xdc, 0x58, 0x24, 0x8a, 0xf6, 0x72, 0xe, 0x67, 0x1b, 0x9f, 0xe3, 0xde, 0xa2, 
0x26, 0x5a, 0x33, 0x4f, 0xcb, 0xb7, 0x19, 0x65, 0xe1, 0x9d, 0xf4, 0x88, 0xc, 0x70, 0xec, 0x90, 0x14, 0x68, 0x1, 0x7d, 0xf9, 0x85, 0x2b, 0x57, 0xd3, 0xaf, 0xc6, 0xba, 0x3e, 0x42, 0x7f, 0x3, 0x87, 0xfb, 0x92, 0xee, 0x6a, 0x16, 0xb8, 0xc4, 0x40, 0x3c, 0x55, 0x29, 0xad, 0xd1, 0xd7, 0xab, 0x2f, 0x53, 0x3a, 0x46, 0xc2, 0xbe, 0x10, 0x6c, 0xe8, 0x94, 0xfd, 0x81, 0x5, 0x79, 0x44, 0x38, 0xbc, 0xc0, 0xa9, 0xd5, 0x51, 0x2d, 0x83, 0xff, 0x7b, 0x7, 0x6e, 0x12, 0x96, 0xea, 0x9a, 0xe6, 0x62, 0x1e, 0x77, 0xb, 0x8f, 0xf3, 0x5d, 0x21, 0xa5, 0xd9, 0xb0, 0xcc, 0x48, 0x34, 0x9, 0x75, 0xf1, 0x8d, 0xe4, 0x98, 0x1c, 0x60, 0xce, 0xb2, 0x36, 0x4a, 0x23, 0x5f, 0xdb, 0xa7, 0xa1, 0xdd, 0x59, 0x25, 0x4c, 0x30, 0xb4, 0xc8, 0x66, 0x1a, 0x9e, 0xe2, 0x8b, 0xf7, 0x73, 0xf, 0x32, 0x4e, 0xca, 0xb6, 0xdf, 0xa3, 0x27, 0x5b, 0xf5, 0x89, 0xd, 0x71, 0x18, 0x64, 0xe0, 0x9c}, {0x0, 0x7d, 0xfa, 0x87, 0xe9, 0x94, 0x13, 0x6e, 0xcf, 0xb2, 0x35, 0x48, 0x26, 0x5b, 0xdc, 0xa1, 0x83, 0xfe, 0x79, 0x4, 0x6a, 0x17, 0x90, 0xed, 0x4c, 0x31, 0xb6, 0xcb, 0xa5, 0xd8, 0x5f, 0x22, 0x1b, 0x66, 0xe1, 0x9c, 0xf2, 0x8f, 0x8, 0x75, 0xd4, 0xa9, 0x2e, 0x53, 0x3d, 0x40, 0xc7, 0xba, 0x98, 0xe5, 0x62, 0x1f, 0x71, 0xc, 0x8b, 0xf6, 0x57, 0x2a, 0xad, 0xd0, 0xbe, 0xc3, 0x44, 0x39, 0x36, 0x4b, 0xcc, 0xb1, 0xdf, 0xa2, 0x25, 0x58, 0xf9, 0x84, 0x3, 0x7e, 0x10, 0x6d, 0xea, 0x97, 0xb5, 0xc8, 0x4f, 0x32, 0x5c, 0x21, 0xa6, 0xdb, 0x7a, 0x7, 0x80, 0xfd, 0x93, 0xee, 0x69, 0x14, 0x2d, 0x50, 0xd7, 0xaa, 0xc4, 0xb9, 0x3e, 0x43, 0xe2, 0x9f, 0x18, 0x65, 0xb, 0x76, 0xf1, 0x8c, 0xae, 0xd3, 0x54, 0x29, 0x47, 0x3a, 0xbd, 0xc0, 0x61, 0x1c, 0x9b, 0xe6, 0x88, 0xf5, 0x72, 0xf, 0x6c, 0x11, 0x96, 0xeb, 0x85, 0xf8, 0x7f, 0x2, 0xa3, 0xde, 0x59, 0x24, 0x4a, 0x37, 0xb0, 0xcd, 0xef, 0x92, 0x15, 0x68, 0x6, 0x7b, 0xfc, 0x81, 0x20, 0x5d, 0xda, 0xa7, 0xc9, 0xb4, 0x33, 0x4e, 0x77, 0xa, 0x8d, 0xf0, 0x9e, 0xe3, 0x64, 0x19, 0xb8, 0xc5, 0x42, 0x3f, 0x51, 0x2c, 0xab, 0xd6, 0xf4, 0x89, 0xe, 0x73, 0x1d, 0x60, 0xe7, 0x9a, 0x3b, 0x46, 0xc1, 0xbc, 0xd2, 0xaf, 0x28, 0x55, 0x5a, 0x27, 0xa0, 0xdd, 0xb3, 0xce, 0x49, 0x34, 0x95, 0xe8, 0x6f, 0x12, 0x7c, 0x1, 0x86, 0xfb, 0xd9, 0xa4, 0x23, 0x5e, 0x30, 0x4d, 0xca, 0xb7, 0x16, 0x6b, 0xec, 0x91, 0xff, 0x82, 0x5, 0x78, 0x41, 0x3c, 0xbb, 0xc6, 0xa8, 0xd5, 0x52, 0x2f, 0x8e, 0xf3, 0x74, 0x9, 0x67, 0x1a, 0x9d, 0xe0, 0xc2, 0xbf, 0x38, 0x45, 0x2b, 0x56, 0xd1, 0xac, 0xd, 0x70, 0xf7, 0x8a, 0xe4, 0x99, 0x1e, 0x63}, {0x0, 0x7e, 0xfc, 0x82, 0xe5, 0x9b, 0x19, 0x67, 0xd7, 0xa9, 0x2b, 0x55, 0x32, 0x4c, 0xce, 0xb0, 0xb3, 0xcd, 0x4f, 0x31, 0x56, 0x28, 0xaa, 0xd4, 0x64, 0x1a, 0x98, 0xe6, 0x81, 0xff, 0x7d, 0x3, 0x7b, 0x5, 0x87, 0xf9, 0x9e, 0xe0, 0x62, 0x1c, 0xac, 0xd2, 0x50, 0x2e, 0x49, 0x37, 0xb5, 0xcb, 0xc8, 0xb6, 0x34, 0x4a, 0x2d, 0x53, 0xd1, 0xaf, 0x1f, 0x61, 0xe3, 0x9d, 0xfa, 0x84, 0x6, 0x78, 0xf6, 0x88, 0xa, 0x74, 0x13, 0x6d, 0xef, 0x91, 0x21, 0x5f, 0xdd, 0xa3, 0xc4, 0xba, 0x38, 0x46, 0x45, 0x3b, 0xb9, 0xc7, 0xa0, 0xde, 0x5c, 0x22, 0x92, 0xec, 0x6e, 0x10, 0x77, 0x9, 0x8b, 0xf5, 0x8d, 0xf3, 0x71, 0xf, 0x68, 0x16, 0x94, 0xea, 0x5a, 0x24, 0xa6, 0xd8, 0xbf, 0xc1, 0x43, 0x3d, 0x3e, 0x40, 0xc2, 0xbc, 0xdb, 0xa5, 0x27, 0x59, 0xe9, 0x97, 0x15, 0x6b, 0xc, 0x72, 0xf0, 0x8e, 0xf1, 0x8f, 0xd, 0x73, 0x14, 0x6a, 0xe8, 0x96, 0x26, 0x58, 0xda, 0xa4, 0xc3, 0xbd, 0x3f, 0x41, 0x42, 0x3c, 0xbe, 0xc0, 0xa7, 0xd9, 0x5b, 0x25, 0x95, 0xeb, 0x69, 0x17, 0x70, 0xe, 0x8c, 0xf2, 0x8a, 0xf4, 0x76, 0x8, 0x6f, 0x11, 0x93, 0xed, 0x5d, 0x23, 0xa1, 0xdf, 0xb8, 0xc6, 0x44, 0x3a, 0x39, 0x47, 0xc5, 0xbb, 0xdc, 0xa2, 0x20, 0x5e, 0xee, 0x90, 0x12, 0x6c, 0xb, 0x75, 0xf7, 0x89, 0x7, 0x79, 0xfb, 0x85, 0xe2, 0x9c, 0x1e, 0x60, 
0xd0, 0xae, 0x2c, 0x52, 0x35, 0x4b, 0xc9, 0xb7, 0xb4, 0xca, 0x48, 0x36, 0x51, 0x2f, 0xad, 0xd3, 0x63, 0x1d, 0x9f, 0xe1, 0x86, 0xf8, 0x7a, 0x4, 0x7c, 0x2, 0x80, 0xfe, 0x99, 0xe7, 0x65, 0x1b, 0xab, 0xd5, 0x57, 0x29, 0x4e, 0x30, 0xb2, 0xcc, 0xcf, 0xb1, 0x33, 0x4d, 0x2a, 0x54, 0xd6, 0xa8, 0x18, 0x66, 0xe4, 0x9a, 0xfd, 0x83, 0x1, 0x7f}, {0x0, 0x7f, 0xfe, 0x81, 0xe1, 0x9e, 0x1f, 0x60, 0xdf, 0xa0, 0x21, 0x5e, 0x3e, 0x41, 0xc0, 0xbf, 0xa3, 0xdc, 0x5d, 0x22, 0x42, 0x3d, 0xbc, 0xc3, 0x7c, 0x3, 0x82, 0xfd, 0x9d, 0xe2, 0x63, 0x1c, 0x5b, 0x24, 0xa5, 0xda, 0xba, 0xc5, 0x44, 0x3b, 0x84, 0xfb, 0x7a, 0x5, 0x65, 0x1a, 0x9b, 0xe4, 0xf8, 0x87, 0x6, 0x79, 0x19, 0x66, 0xe7, 0x98, 0x27, 0x58, 0xd9, 0xa6, 0xc6, 0xb9, 0x38, 0x47, 0xb6, 0xc9, 0x48, 0x37, 0x57, 0x28, 0xa9, 0xd6, 0x69, 0x16, 0x97, 0xe8, 0x88, 0xf7, 0x76, 0x9, 0x15, 0x6a, 0xeb, 0x94, 0xf4, 0x8b, 0xa, 0x75, 0xca, 0xb5, 0x34, 0x4b, 0x2b, 0x54, 0xd5, 0xaa, 0xed, 0x92, 0x13, 0x6c, 0xc, 0x73, 0xf2, 0x8d, 0x32, 0x4d, 0xcc, 0xb3, 0xd3, 0xac, 0x2d, 0x52, 0x4e, 0x31, 0xb0, 0xcf, 0xaf, 0xd0, 0x51, 0x2e, 0x91, 0xee, 0x6f, 0x10, 0x70, 0xf, 0x8e, 0xf1, 0x71, 0xe, 0x8f, 0xf0, 0x90, 0xef, 0x6e, 0x11, 0xae, 0xd1, 0x50, 0x2f, 0x4f, 0x30, 0xb1, 0xce, 0xd2, 0xad, 0x2c, 0x53, 0x33, 0x4c, 0xcd, 0xb2, 0xd, 0x72, 0xf3, 0x8c, 0xec, 0x93, 0x12, 0x6d, 0x2a, 0x55, 0xd4, 0xab, 0xcb, 0xb4, 0x35, 0x4a, 0xf5, 0x8a, 0xb, 0x74, 0x14, 0x6b, 0xea, 0x95, 0x89, 0xf6, 0x77, 0x8, 0x68, 0x17, 0x96, 0xe9, 0x56, 0x29, 0xa8, 0xd7, 0xb7, 0xc8, 0x49, 0x36, 0xc7, 0xb8, 0x39, 0x46, 0x26, 0x59, 0xd8, 0xa7, 0x18, 0x67, 0xe6, 0x99, 0xf9, 0x86, 0x7, 0x78, 0x64, 0x1b, 0x9a, 0xe5, 0x85, 0xfa, 0x7b, 0x4, 0xbb, 0xc4, 0x45, 0x3a, 0x5a, 0x25, 0xa4, 0xdb, 0x9c, 0xe3, 0x62, 0x1d, 0x7d, 0x2, 0x83, 0xfc, 0x43, 0x3c, 0xbd, 0xc2, 0xa2, 0xdd, 0x5c, 0x23, 0x3f, 0x40, 0xc1, 0xbe, 0xde, 0xa1, 0x20, 0x5f, 0xe0, 0x9f, 0x1e, 0x61, 0x1, 0x7e, 0xff, 0x80}, {0x0, 0x80, 0x1d, 0x9d, 0x3a, 0xba, 0x27, 0xa7, 0x74, 0xf4, 0x69, 0xe9, 0x4e, 0xce, 0x53, 0xd3, 0xe8, 0x68, 0xf5, 0x75, 0xd2, 0x52, 0xcf, 0x4f, 0x9c, 0x1c, 0x81, 0x1, 0xa6, 0x26, 0xbb, 0x3b, 0xcd, 0x4d, 0xd0, 0x50, 0xf7, 0x77, 0xea, 0x6a, 0xb9, 0x39, 0xa4, 0x24, 0x83, 0x3, 0x9e, 0x1e, 0x25, 0xa5, 0x38, 0xb8, 0x1f, 0x9f, 0x2, 0x82, 0x51, 0xd1, 0x4c, 0xcc, 0x6b, 0xeb, 0x76, 0xf6, 0x87, 0x7, 0x9a, 0x1a, 0xbd, 0x3d, 0xa0, 0x20, 0xf3, 0x73, 0xee, 0x6e, 0xc9, 0x49, 0xd4, 0x54, 0x6f, 0xef, 0x72, 0xf2, 0x55, 0xd5, 0x48, 0xc8, 0x1b, 0x9b, 0x6, 0x86, 0x21, 0xa1, 0x3c, 0xbc, 0x4a, 0xca, 0x57, 0xd7, 0x70, 0xf0, 0x6d, 0xed, 0x3e, 0xbe, 0x23, 0xa3, 0x4, 0x84, 0x19, 0x99, 0xa2, 0x22, 0xbf, 0x3f, 0x98, 0x18, 0x85, 0x5, 0xd6, 0x56, 0xcb, 0x4b, 0xec, 0x6c, 0xf1, 0x71, 0x13, 0x93, 0xe, 0x8e, 0x29, 0xa9, 0x34, 0xb4, 0x67, 0xe7, 0x7a, 0xfa, 0x5d, 0xdd, 0x40, 0xc0, 0xfb, 0x7b, 0xe6, 0x66, 0xc1, 0x41, 0xdc, 0x5c, 0x8f, 0xf, 0x92, 0x12, 0xb5, 0x35, 0xa8, 0x28, 0xde, 0x5e, 0xc3, 0x43, 0xe4, 0x64, 0xf9, 0x79, 0xaa, 0x2a, 0xb7, 0x37, 0x90, 0x10, 0x8d, 0xd, 0x36, 0xb6, 0x2b, 0xab, 0xc, 0x8c, 0x11, 0x91, 0x42, 0xc2, 0x5f, 0xdf, 0x78, 0xf8, 0x65, 0xe5, 0x94, 0x14, 0x89, 0x9, 0xae, 0x2e, 0xb3, 0x33, 0xe0, 0x60, 0xfd, 0x7d, 0xda, 0x5a, 0xc7, 0x47, 0x7c, 0xfc, 0x61, 0xe1, 0x46, 0xc6, 0x5b, 0xdb, 0x8, 0x88, 0x15, 0x95, 0x32, 0xb2, 0x2f, 0xaf, 0x59, 0xd9, 0x44, 0xc4, 0x63, 0xe3, 0x7e, 0xfe, 0x2d, 0xad, 0x30, 0xb0, 0x17, 0x97, 0xa, 0x8a, 0xb1, 0x31, 0xac, 0x2c, 0x8b, 0xb, 0x96, 0x16, 0xc5, 0x45, 0xd8, 0x58, 0xff, 0x7f, 0xe2, 0x62}, {0x0, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc, 0xf8, 0x79, 0xe7, 0x66, 0xc6, 0x47, 0xd9, 0x58, 0x84, 0x5, 0x9b, 0x1a, 0xba, 
0x3b, 0xa5, 0x24, 0xed, 0x6c, 0xf2, 0x73, 0xd3, 0x52, 0xcc, 0x4d, 0x91, 0x10, 0x8e, 0xf, 0xaf, 0x2e, 0xb0, 0x31, 0x15, 0x94, 0xa, 0x8b, 0x2b, 0xaa, 0x34, 0xb5, 0x69, 0xe8, 0x76, 0xf7, 0x57, 0xd6, 0x48, 0xc9, 0xc7, 0x46, 0xd8, 0x59, 0xf9, 0x78, 0xe6, 0x67, 0xbb, 0x3a, 0xa4, 0x25, 0x85, 0x4, 0x9a, 0x1b, 0x3f, 0xbe, 0x20, 0xa1, 0x1, 0x80, 0x1e, 0x9f, 0x43, 0xc2, 0x5c, 0xdd, 0x7d, 0xfc, 0x62, 0xe3, 0x2a, 0xab, 0x35, 0xb4, 0x14, 0x95, 0xb, 0x8a, 0x56, 0xd7, 0x49, 0xc8, 0x68, 0xe9, 0x77, 0xf6, 0xd2, 0x53, 0xcd, 0x4c, 0xec, 0x6d, 0xf3, 0x72, 0xae, 0x2f, 0xb1, 0x30, 0x90, 0x11, 0x8f, 0xe, 0x93, 0x12, 0x8c, 0xd, 0xad, 0x2c, 0xb2, 0x33, 0xef, 0x6e, 0xf0, 0x71, 0xd1, 0x50, 0xce, 0x4f, 0x6b, 0xea, 0x74, 0xf5, 0x55, 0xd4, 0x4a, 0xcb, 0x17, 0x96, 0x8, 0x89, 0x29, 0xa8, 0x36, 0xb7, 0x7e, 0xff, 0x61, 0xe0, 0x40, 0xc1, 0x5f, 0xde, 0x2, 0x83, 0x1d, 0x9c, 0x3c, 0xbd, 0x23, 0xa2, 0x86, 0x7, 0x99, 0x18, 0xb8, 0x39, 0xa7, 0x26, 0xfa, 0x7b, 0xe5, 0x64, 0xc4, 0x45, 0xdb, 0x5a, 0x54, 0xd5, 0x4b, 0xca, 0x6a, 0xeb, 0x75, 0xf4, 0x28, 0xa9, 0x37, 0xb6, 0x16, 0x97, 0x9, 0x88, 0xac, 0x2d, 0xb3, 0x32, 0x92, 0x13, 0x8d, 0xc, 0xd0, 0x51, 0xcf, 0x4e, 0xee, 0x6f, 0xf1, 0x70, 0xb9, 0x38, 0xa6, 0x27, 0x87, 0x6, 0x98, 0x19, 0xc5, 0x44, 0xda, 0x5b, 0xfb, 0x7a, 0xe4, 0x65, 0x41, 0xc0, 0x5e, 0xdf, 0x7f, 0xfe, 0x60, 0xe1, 0x3d, 0xbc, 0x22, 0xa3, 0x3, 0x82, 0x1c, 0x9d}, {0x0, 0x82, 0x19, 0x9b, 0x32, 0xb0, 0x2b, 0xa9, 0x64, 0xe6, 0x7d, 0xff, 0x56, 0xd4, 0x4f, 0xcd, 0xc8, 0x4a, 0xd1, 0x53, 0xfa, 0x78, 0xe3, 0x61, 0xac, 0x2e, 0xb5, 0x37, 0x9e, 0x1c, 0x87, 0x5, 0x8d, 0xf, 0x94, 0x16, 0xbf, 0x3d, 0xa6, 0x24, 0xe9, 0x6b, 0xf0, 0x72, 0xdb, 0x59, 0xc2, 0x40, 0x45, 0xc7, 0x5c, 0xde, 0x77, 0xf5, 0x6e, 0xec, 0x21, 0xa3, 0x38, 0xba, 0x13, 0x91, 0xa, 0x88, 0x7, 0x85, 0x1e, 0x9c, 0x35, 0xb7, 0x2c, 0xae, 0x63, 0xe1, 0x7a, 0xf8, 0x51, 0xd3, 0x48, 0xca, 0xcf, 0x4d, 0xd6, 0x54, 0xfd, 0x7f, 0xe4, 0x66, 0xab, 0x29, 0xb2, 0x30, 0x99, 0x1b, 0x80, 0x2, 0x8a, 0x8, 0x93, 0x11, 0xb8, 0x3a, 0xa1, 0x23, 0xee, 0x6c, 0xf7, 0x75, 0xdc, 0x5e, 0xc5, 0x47, 0x42, 0xc0, 0x5b, 0xd9, 0x70, 0xf2, 0x69, 0xeb, 0x26, 0xa4, 0x3f, 0xbd, 0x14, 0x96, 0xd, 0x8f, 0xe, 0x8c, 0x17, 0x95, 0x3c, 0xbe, 0x25, 0xa7, 0x6a, 0xe8, 0x73, 0xf1, 0x58, 0xda, 0x41, 0xc3, 0xc6, 0x44, 0xdf, 0x5d, 0xf4, 0x76, 0xed, 0x6f, 0xa2, 0x20, 0xbb, 0x39, 0x90, 0x12, 0x89, 0xb, 0x83, 0x1, 0x9a, 0x18, 0xb1, 0x33, 0xa8, 0x2a, 0xe7, 0x65, 0xfe, 0x7c, 0xd5, 0x57, 0xcc, 0x4e, 0x4b, 0xc9, 0x52, 0xd0, 0x79, 0xfb, 0x60, 0xe2, 0x2f, 0xad, 0x36, 0xb4, 0x1d, 0x9f, 0x4, 0x86, 0x9, 0x8b, 0x10, 0x92, 0x3b, 0xb9, 0x22, 0xa0, 0x6d, 0xef, 0x74, 0xf6, 0x5f, 0xdd, 0x46, 0xc4, 0xc1, 0x43, 0xd8, 0x5a, 0xf3, 0x71, 0xea, 0x68, 0xa5, 0x27, 0xbc, 0x3e, 0x97, 0x15, 0x8e, 0xc, 0x84, 0x6, 0x9d, 0x1f, 0xb6, 0x34, 0xaf, 0x2d, 0xe0, 0x62, 0xf9, 0x7b, 0xd2, 0x50, 0xcb, 0x49, 0x4c, 0xce, 0x55, 0xd7, 0x7e, 0xfc, 0x67, 0xe5, 0x28, 0xaa, 0x31, 0xb3, 0x1a, 0x98, 0x3, 0x81}, {0x0, 0x83, 0x1b, 0x98, 0x36, 0xb5, 0x2d, 0xae, 0x6c, 0xef, 0x77, 0xf4, 0x5a, 0xd9, 0x41, 0xc2, 0xd8, 0x5b, 0xc3, 0x40, 0xee, 0x6d, 0xf5, 0x76, 0xb4, 0x37, 0xaf, 0x2c, 0x82, 0x1, 0x99, 0x1a, 0xad, 0x2e, 0xb6, 0x35, 0x9b, 0x18, 0x80, 0x3, 0xc1, 0x42, 0xda, 0x59, 0xf7, 0x74, 0xec, 0x6f, 0x75, 0xf6, 0x6e, 0xed, 0x43, 0xc0, 0x58, 0xdb, 0x19, 0x9a, 0x2, 0x81, 0x2f, 0xac, 0x34, 0xb7, 0x47, 0xc4, 0x5c, 0xdf, 0x71, 0xf2, 0x6a, 0xe9, 0x2b, 0xa8, 0x30, 0xb3, 0x1d, 0x9e, 0x6, 0x85, 0x9f, 0x1c, 0x84, 0x7, 0xa9, 0x2a, 0xb2, 0x31, 0xf3, 0x70, 0xe8, 0x6b, 0xc5, 0x46, 0xde, 0x5d, 0xea, 0x69, 0xf1, 0x72, 0xdc, 0x5f, 0xc7, 0x44, 0x86, 0x5, 0x9d, 0x1e, 0xb0, 0x33, 0xab, 0x28, 0x32, 0xb1, 0x29, 
0xaa, 0x4, 0x87, 0x1f, 0x9c, 0x5e, 0xdd, 0x45, 0xc6, 0x68, 0xeb, 0x73, 0xf0, 0x8e, 0xd, 0x95, 0x16, 0xb8, 0x3b, 0xa3, 0x20, 0xe2, 0x61, 0xf9, 0x7a, 0xd4, 0x57, 0xcf, 0x4c, 0x56, 0xd5, 0x4d, 0xce, 0x60, 0xe3, 0x7b, 0xf8, 0x3a, 0xb9, 0x21, 0xa2, 0xc, 0x8f, 0x17, 0x94, 0x23, 0xa0, 0x38, 0xbb, 0x15, 0x96, 0xe, 0x8d, 0x4f, 0xcc, 0x54, 0xd7, 0x79, 0xfa, 0x62, 0xe1, 0xfb, 0x78, 0xe0, 0x63, 0xcd, 0x4e, 0xd6, 0x55, 0x97, 0x14, 0x8c, 0xf, 0xa1, 0x22, 0xba, 0x39, 0xc9, 0x4a, 0xd2, 0x51, 0xff, 0x7c, 0xe4, 0x67, 0xa5, 0x26, 0xbe, 0x3d, 0x93, 0x10, 0x88, 0xb, 0x11, 0x92, 0xa, 0x89, 0x27, 0xa4, 0x3c, 0xbf, 0x7d, 0xfe, 0x66, 0xe5, 0x4b, 0xc8, 0x50, 0xd3, 0x64, 0xe7, 0x7f, 0xfc, 0x52, 0xd1, 0x49, 0xca, 0x8, 0x8b, 0x13, 0x90, 0x3e, 0xbd, 0x25, 0xa6, 0xbc, 0x3f, 0xa7, 0x24, 0x8a, 0x9, 0x91, 0x12, 0xd0, 0x53, 0xcb, 0x48, 0xe6, 0x65, 0xfd, 0x7e}, {0x0, 0x84, 0x15, 0x91, 0x2a, 0xae, 0x3f, 0xbb, 0x54, 0xd0, 0x41, 0xc5, 0x7e, 0xfa, 0x6b, 0xef, 0xa8, 0x2c, 0xbd, 0x39, 0x82, 0x6, 0x97, 0x13, 0xfc, 0x78, 0xe9, 0x6d, 0xd6, 0x52, 0xc3, 0x47, 0x4d, 0xc9, 0x58, 0xdc, 0x67, 0xe3, 0x72, 0xf6, 0x19, 0x9d, 0xc, 0x88, 0x33, 0xb7, 0x26, 0xa2, 0xe5, 0x61, 0xf0, 0x74, 0xcf, 0x4b, 0xda, 0x5e, 0xb1, 0x35, 0xa4, 0x20, 0x9b, 0x1f, 0x8e, 0xa, 0x9a, 0x1e, 0x8f, 0xb, 0xb0, 0x34, 0xa5, 0x21, 0xce, 0x4a, 0xdb, 0x5f, 0xe4, 0x60, 0xf1, 0x75, 0x32, 0xb6, 0x27, 0xa3, 0x18, 0x9c, 0xd, 0x89, 0x66, 0xe2, 0x73, 0xf7, 0x4c, 0xc8, 0x59, 0xdd, 0xd7, 0x53, 0xc2, 0x46, 0xfd, 0x79, 0xe8, 0x6c, 0x83, 0x7, 0x96, 0x12, 0xa9, 0x2d, 0xbc, 0x38, 0x7f, 0xfb, 0x6a, 0xee, 0x55, 0xd1, 0x40, 0xc4, 0x2b, 0xaf, 0x3e, 0xba, 0x1, 0x85, 0x14, 0x90, 0x29, 0xad, 0x3c, 0xb8, 0x3, 0x87, 0x16, 0x92, 0x7d, 0xf9, 0x68, 0xec, 0x57, 0xd3, 0x42, 0xc6, 0x81, 0x5, 0x94, 0x10, 0xab, 0x2f, 0xbe, 0x3a, 0xd5, 0x51, 0xc0, 0x44, 0xff, 0x7b, 0xea, 0x6e, 0x64, 0xe0, 0x71, 0xf5, 0x4e, 0xca, 0x5b, 0xdf, 0x30, 0xb4, 0x25, 0xa1, 0x1a, 0x9e, 0xf, 0x8b, 0xcc, 0x48, 0xd9, 0x5d, 0xe6, 0x62, 0xf3, 0x77, 0x98, 0x1c, 0x8d, 0x9, 0xb2, 0x36, 0xa7, 0x23, 0xb3, 0x37, 0xa6, 0x22, 0x99, 0x1d, 0x8c, 0x8, 0xe7, 0x63, 0xf2, 0x76, 0xcd, 0x49, 0xd8, 0x5c, 0x1b, 0x9f, 0xe, 0x8a, 0x31, 0xb5, 0x24, 0xa0, 0x4f, 0xcb, 0x5a, 0xde, 0x65, 0xe1, 0x70, 0xf4, 0xfe, 0x7a, 0xeb, 0x6f, 0xd4, 0x50, 0xc1, 0x45, 0xaa, 0x2e, 0xbf, 0x3b, 0x80, 0x4, 0x95, 0x11, 0x56, 0xd2, 0x43, 0xc7, 0x7c, 0xf8, 0x69, 0xed, 0x2, 0x86, 0x17, 0x93, 0x28, 0xac, 0x3d, 0xb9}, {0x0, 0x85, 0x17, 0x92, 0x2e, 0xab, 0x39, 0xbc, 0x5c, 0xd9, 0x4b, 0xce, 0x72, 0xf7, 0x65, 0xe0, 0xb8, 0x3d, 0xaf, 0x2a, 0x96, 0x13, 0x81, 0x4, 0xe4, 0x61, 0xf3, 0x76, 0xca, 0x4f, 0xdd, 0x58, 0x6d, 0xe8, 0x7a, 0xff, 0x43, 0xc6, 0x54, 0xd1, 0x31, 0xb4, 0x26, 0xa3, 0x1f, 0x9a, 0x8, 0x8d, 0xd5, 0x50, 0xc2, 0x47, 0xfb, 0x7e, 0xec, 0x69, 0x89, 0xc, 0x9e, 0x1b, 0xa7, 0x22, 0xb0, 0x35, 0xda, 0x5f, 0xcd, 0x48, 0xf4, 0x71, 0xe3, 0x66, 0x86, 0x3, 0x91, 0x14, 0xa8, 0x2d, 0xbf, 0x3a, 0x62, 0xe7, 0x75, 0xf0, 0x4c, 0xc9, 0x5b, 0xde, 0x3e, 0xbb, 0x29, 0xac, 0x10, 0x95, 0x7, 0x82, 0xb7, 0x32, 0xa0, 0x25, 0x99, 0x1c, 0x8e, 0xb, 0xeb, 0x6e, 0xfc, 0x79, 0xc5, 0x40, 0xd2, 0x57, 0xf, 0x8a, 0x18, 0x9d, 0x21, 0xa4, 0x36, 0xb3, 0x53, 0xd6, 0x44, 0xc1, 0x7d, 0xf8, 0x6a, 0xef, 0xa9, 0x2c, 0xbe, 0x3b, 0x87, 0x2, 0x90, 0x15, 0xf5, 0x70, 0xe2, 0x67, 0xdb, 0x5e, 0xcc, 0x49, 0x11, 0x94, 0x6, 0x83, 0x3f, 0xba, 0x28, 0xad, 0x4d, 0xc8, 0x5a, 0xdf, 0x63, 0xe6, 0x74, 0xf1, 0xc4, 0x41, 0xd3, 0x56, 0xea, 0x6f, 0xfd, 0x78, 0x98, 0x1d, 0x8f, 0xa, 0xb6, 0x33, 0xa1, 0x24, 0x7c, 0xf9, 0x6b, 0xee, 0x52, 0xd7, 0x45, 0xc0, 0x20, 0xa5, 0x37, 0xb2, 0xe, 0x8b, 0x19, 0x9c, 0x73, 0xf6, 0x64, 0xe1, 0x5d, 0xd8, 0x4a, 0xcf, 0x2f, 
0xaa, 0x38, 0xbd, 0x1, 0x84, 0x16, 0x93, 0xcb, 0x4e, 0xdc, 0x59, 0xe5, 0x60, 0xf2, 0x77, 0x97, 0x12, 0x80, 0x5, 0xb9, 0x3c, 0xae, 0x2b, 0x1e, 0x9b, 0x9, 0x8c, 0x30, 0xb5, 0x27, 0xa2, 0x42, 0xc7, 0x55, 0xd0, 0x6c, 0xe9, 0x7b, 0xfe, 0xa6, 0x23, 0xb1, 0x34, 0x88, 0xd, 0x9f, 0x1a, 0xfa, 0x7f, 0xed, 0x68, 0xd4, 0x51, 0xc3, 0x46}, {0x0, 0x86, 0x11, 0x97, 0x22, 0xa4, 0x33, 0xb5, 0x44, 0xc2, 0x55, 0xd3, 0x66, 0xe0, 0x77, 0xf1, 0x88, 0xe, 0x99, 0x1f, 0xaa, 0x2c, 0xbb, 0x3d, 0xcc, 0x4a, 0xdd, 0x5b, 0xee, 0x68, 0xff, 0x79, 0xd, 0x8b, 0x1c, 0x9a, 0x2f, 0xa9, 0x3e, 0xb8, 0x49, 0xcf, 0x58, 0xde, 0x6b, 0xed, 0x7a, 0xfc, 0x85, 0x3, 0x94, 0x12, 0xa7, 0x21, 0xb6, 0x30, 0xc1, 0x47, 0xd0, 0x56, 0xe3, 0x65, 0xf2, 0x74, 0x1a, 0x9c, 0xb, 0x8d, 0x38, 0xbe, 0x29, 0xaf, 0x5e, 0xd8, 0x4f, 0xc9, 0x7c, 0xfa, 0x6d, 0xeb, 0x92, 0x14, 0x83, 0x5, 0xb0, 0x36, 0xa1, 0x27, 0xd6, 0x50, 0xc7, 0x41, 0xf4, 0x72, 0xe5, 0x63, 0x17, 0x91, 0x6, 0x80, 0x35, 0xb3, 0x24, 0xa2, 0x53, 0xd5, 0x42, 0xc4, 0x71, 0xf7, 0x60, 0xe6, 0x9f, 0x19, 0x8e, 0x8, 0xbd, 0x3b, 0xac, 0x2a, 0xdb, 0x5d, 0xca, 0x4c, 0xf9, 0x7f, 0xe8, 0x6e, 0x34, 0xb2, 0x25, 0xa3, 0x16, 0x90, 0x7, 0x81, 0x70, 0xf6, 0x61, 0xe7, 0x52, 0xd4, 0x43, 0xc5, 0xbc, 0x3a, 0xad, 0x2b, 0x9e, 0x18, 0x8f, 0x9, 0xf8, 0x7e, 0xe9, 0x6f, 0xda, 0x5c, 0xcb, 0x4d, 0x39, 0xbf, 0x28, 0xae, 0x1b, 0x9d, 0xa, 0x8c, 0x7d, 0xfb, 0x6c, 0xea, 0x5f, 0xd9, 0x4e, 0xc8, 0xb1, 0x37, 0xa0, 0x26, 0x93, 0x15, 0x82, 0x4, 0xf5, 0x73, 0xe4, 0x62, 0xd7, 0x51, 0xc6, 0x40, 0x2e, 0xa8, 0x3f, 0xb9, 0xc, 0x8a, 0x1d, 0x9b, 0x6a, 0xec, 0x7b, 0xfd, 0x48, 0xce, 0x59, 0xdf, 0xa6, 0x20, 0xb7, 0x31, 0x84, 0x2, 0x95, 0x13, 0xe2, 0x64, 0xf3, 0x75, 0xc0, 0x46, 0xd1, 0x57, 0x23, 0xa5, 0x32, 0xb4, 0x1, 0x87, 0x10, 0x96, 0x67, 0xe1, 0x76, 0xf0, 0x45, 0xc3, 0x54, 0xd2, 0xab, 0x2d, 0xba, 0x3c, 0x89, 0xf, 0x98, 0x1e, 0xef, 0x69, 0xfe, 0x78, 0xcd, 0x4b, 0xdc, 0x5a}, {0x0, 0x87, 0x13, 0x94, 0x26, 0xa1, 0x35, 0xb2, 0x4c, 0xcb, 0x5f, 0xd8, 0x6a, 0xed, 0x79, 0xfe, 0x98, 0x1f, 0x8b, 0xc, 0xbe, 0x39, 0xad, 0x2a, 0xd4, 0x53, 0xc7, 0x40, 0xf2, 0x75, 0xe1, 0x66, 0x2d, 0xaa, 0x3e, 0xb9, 0xb, 0x8c, 0x18, 0x9f, 0x61, 0xe6, 0x72, 0xf5, 0x47, 0xc0, 0x54, 0xd3, 0xb5, 0x32, 0xa6, 0x21, 0x93, 0x14, 0x80, 0x7, 0xf9, 0x7e, 0xea, 0x6d, 0xdf, 0x58, 0xcc, 0x4b, 0x5a, 0xdd, 0x49, 0xce, 0x7c, 0xfb, 0x6f, 0xe8, 0x16, 0x91, 0x5, 0x82, 0x30, 0xb7, 0x23, 0xa4, 0xc2, 0x45, 0xd1, 0x56, 0xe4, 0x63, 0xf7, 0x70, 0x8e, 0x9, 0x9d, 0x1a, 0xa8, 0x2f, 0xbb, 0x3c, 0x77, 0xf0, 0x64, 0xe3, 0x51, 0xd6, 0x42, 0xc5, 0x3b, 0xbc, 0x28, 0xaf, 0x1d, 0x9a, 0xe, 0x89, 0xef, 0x68, 0xfc, 0x7b, 0xc9, 0x4e, 0xda, 0x5d, 0xa3, 0x24, 0xb0, 0x37, 0x85, 0x2, 0x96, 0x11, 0xb4, 0x33, 0xa7, 0x20, 0x92, 0x15, 0x81, 0x6, 0xf8, 0x7f, 0xeb, 0x6c, 0xde, 0x59, 0xcd, 0x4a, 0x2c, 0xab, 0x3f, 0xb8, 0xa, 0x8d, 0x19, 0x9e, 0x60, 0xe7, 0x73, 0xf4, 0x46, 0xc1, 0x55, 0xd2, 0x99, 0x1e, 0x8a, 0xd, 0xbf, 0x38, 0xac, 0x2b, 0xd5, 0x52, 0xc6, 0x41, 0xf3, 0x74, 0xe0, 0x67, 0x1, 0x86, 0x12, 0x95, 0x27, 0xa0, 0x34, 0xb3, 0x4d, 0xca, 0x5e, 0xd9, 0x6b, 0xec, 0x78, 0xff, 0xee, 0x69, 0xfd, 0x7a, 0xc8, 0x4f, 0xdb, 0x5c, 0xa2, 0x25, 0xb1, 0x36, 0x84, 0x3, 0x97, 0x10, 0x76, 0xf1, 0x65, 0xe2, 0x50, 0xd7, 0x43, 0xc4, 0x3a, 0xbd, 0x29, 0xae, 0x1c, 0x9b, 0xf, 0x88, 0xc3, 0x44, 0xd0, 0x57, 0xe5, 0x62, 0xf6, 0x71, 0x8f, 0x8, 0x9c, 0x1b, 0xa9, 0x2e, 0xba, 0x3d, 0x5b, 0xdc, 0x48, 0xcf, 0x7d, 0xfa, 0x6e, 0xe9, 0x17, 0x90, 0x4, 0x83, 0x31, 0xb6, 0x22, 0xa5}, {0x0, 0x88, 0xd, 0x85, 0x1a, 0x92, 0x17, 0x9f, 0x34, 0xbc, 0x39, 0xb1, 0x2e, 0xa6, 0x23, 0xab, 0x68, 0xe0, 0x65, 0xed, 0x72, 0xfa, 0x7f, 0xf7, 0x5c, 0xd4, 0x51, 0xd9, 0x46, 0xce, 
0x4b, 0xc3, 0xd0, 0x58, 0xdd, 0x55, 0xca, 0x42, 0xc7, 0x4f, 0xe4, 0x6c, 0xe9, 0x61, 0xfe, 0x76, 0xf3, 0x7b, 0xb8, 0x30, 0xb5, 0x3d, 0xa2, 0x2a, 0xaf, 0x27, 0x8c, 0x4, 0x81, 0x9, 0x96, 0x1e, 0x9b, 0x13, 0xbd, 0x35, 0xb0, 0x38, 0xa7, 0x2f, 0xaa, 0x22, 0x89, 0x1, 0x84, 0xc, 0x93, 0x1b, 0x9e, 0x16, 0xd5, 0x5d, 0xd8, 0x50, 0xcf, 0x47, 0xc2, 0x4a, 0xe1, 0x69, 0xec, 0x64, 0xfb, 0x73, 0xf6, 0x7e, 0x6d, 0xe5, 0x60, 0xe8, 0x77, 0xff, 0x7a, 0xf2, 0x59, 0xd1, 0x54, 0xdc, 0x43, 0xcb, 0x4e, 0xc6, 0x5, 0x8d, 0x8, 0x80, 0x1f, 0x97, 0x12, 0x9a, 0x31, 0xb9, 0x3c, 0xb4, 0x2b, 0xa3, 0x26, 0xae, 0x67, 0xef, 0x6a, 0xe2, 0x7d, 0xf5, 0x70, 0xf8, 0x53, 0xdb, 0x5e, 0xd6, 0x49, 0xc1, 0x44, 0xcc, 0xf, 0x87, 0x2, 0x8a, 0x15, 0x9d, 0x18, 0x90, 0x3b, 0xb3, 0x36, 0xbe, 0x21, 0xa9, 0x2c, 0xa4, 0xb7, 0x3f, 0xba, 0x32, 0xad, 0x25, 0xa0, 0x28, 0x83, 0xb, 0x8e, 0x6, 0x99, 0x11, 0x94, 0x1c, 0xdf, 0x57, 0xd2, 0x5a, 0xc5, 0x4d, 0xc8, 0x40, 0xeb, 0x63, 0xe6, 0x6e, 0xf1, 0x79, 0xfc, 0x74, 0xda, 0x52, 0xd7, 0x5f, 0xc0, 0x48, 0xcd, 0x45, 0xee, 0x66, 0xe3, 0x6b, 0xf4, 0x7c, 0xf9, 0x71, 0xb2, 0x3a, 0xbf, 0x37, 0xa8, 0x20, 0xa5, 0x2d, 0x86, 0xe, 0x8b, 0x3, 0x9c, 0x14, 0x91, 0x19, 0xa, 0x82, 0x7, 0x8f, 0x10, 0x98, 0x1d, 0x95, 0x3e, 0xb6, 0x33, 0xbb, 0x24, 0xac, 0x29, 0xa1, 0x62, 0xea, 0x6f, 0xe7, 0x78, 0xf0, 0x75, 0xfd, 0x56, 0xde, 0x5b, 0xd3, 0x4c, 0xc4, 0x41, 0xc9}, {0x0, 0x89, 0xf, 0x86, 0x1e, 0x97, 0x11, 0x98, 0x3c, 0xb5, 0x33, 0xba, 0x22, 0xab, 0x2d, 0xa4, 0x78, 0xf1, 0x77, 0xfe, 0x66, 0xef, 0x69, 0xe0, 0x44, 0xcd, 0x4b, 0xc2, 0x5a, 0xd3, 0x55, 0xdc, 0xf0, 0x79, 0xff, 0x76, 0xee, 0x67, 0xe1, 0x68, 0xcc, 0x45, 0xc3, 0x4a, 0xd2, 0x5b, 0xdd, 0x54, 0x88, 0x1, 0x87, 0xe, 0x96, 0x1f, 0x99, 0x10, 0xb4, 0x3d, 0xbb, 0x32, 0xaa, 0x23, 0xa5, 0x2c, 0xfd, 0x74, 0xf2, 0x7b, 0xe3, 0x6a, 0xec, 0x65, 0xc1, 0x48, 0xce, 0x47, 0xdf, 0x56, 0xd0, 0x59, 0x85, 0xc, 0x8a, 0x3, 0x9b, 0x12, 0x94, 0x1d, 0xb9, 0x30, 0xb6, 0x3f, 0xa7, 0x2e, 0xa8, 0x21, 0xd, 0x84, 0x2, 0x8b, 0x13, 0x9a, 0x1c, 0x95, 0x31, 0xb8, 0x3e, 0xb7, 0x2f, 0xa6, 0x20, 0xa9, 0x75, 0xfc, 0x7a, 0xf3, 0x6b, 0xe2, 0x64, 0xed, 0x49, 0xc0, 0x46, 0xcf, 0x57, 0xde, 0x58, 0xd1, 0xe7, 0x6e, 0xe8, 0x61, 0xf9, 0x70, 0xf6, 0x7f, 0xdb, 0x52, 0xd4, 0x5d, 0xc5, 0x4c, 0xca, 0x43, 0x9f, 0x16, 0x90, 0x19, 0x81, 0x8, 0x8e, 0x7, 0xa3, 0x2a, 0xac, 0x25, 0xbd, 0x34, 0xb2, 0x3b, 0x17, 0x9e, 0x18, 0x91, 0x9, 0x80, 0x6, 0x8f, 0x2b, 0xa2, 0x24, 0xad, 0x35, 0xbc, 0x3a, 0xb3, 0x6f, 0xe6, 0x60, 0xe9, 0x71, 0xf8, 0x7e, 0xf7, 0x53, 0xda, 0x5c, 0xd5, 0x4d, 0xc4, 0x42, 0xcb, 0x1a, 0x93, 0x15, 0x9c, 0x4, 0x8d, 0xb, 0x82, 0x26, 0xaf, 0x29, 0xa0, 0x38, 0xb1, 0x37, 0xbe, 0x62, 0xeb, 0x6d, 0xe4, 0x7c, 0xf5, 0x73, 0xfa, 0x5e, 0xd7, 0x51, 0xd8, 0x40, 0xc9, 0x4f, 0xc6, 0xea, 0x63, 0xe5, 0x6c, 0xf4, 0x7d, 0xfb, 0x72, 0xd6, 0x5f, 0xd9, 0x50, 0xc8, 0x41, 0xc7, 0x4e, 0x92, 0x1b, 0x9d, 0x14, 0x8c, 0x5, 0x83, 0xa, 0xae, 0x27, 0xa1, 0x28, 0xb0, 0x39, 0xbf, 0x36}, {0x0, 0x8a, 0x9, 0x83, 0x12, 0x98, 0x1b, 0x91, 0x24, 0xae, 0x2d, 0xa7, 0x36, 0xbc, 0x3f, 0xb5, 0x48, 0xc2, 0x41, 0xcb, 0x5a, 0xd0, 0x53, 0xd9, 0x6c, 0xe6, 0x65, 0xef, 0x7e, 0xf4, 0x77, 0xfd, 0x90, 0x1a, 0x99, 0x13, 0x82, 0x8, 0x8b, 0x1, 0xb4, 0x3e, 0xbd, 0x37, 0xa6, 0x2c, 0xaf, 0x25, 0xd8, 0x52, 0xd1, 0x5b, 0xca, 0x40, 0xc3, 0x49, 0xfc, 0x76, 0xf5, 0x7f, 0xee, 0x64, 0xe7, 0x6d, 0x3d, 0xb7, 0x34, 0xbe, 0x2f, 0xa5, 0x26, 0xac, 0x19, 0x93, 0x10, 0x9a, 0xb, 0x81, 0x2, 0x88, 0x75, 0xff, 0x7c, 0xf6, 0x67, 0xed, 0x6e, 0xe4, 0x51, 0xdb, 0x58, 0xd2, 0x43, 0xc9, 0x4a, 0xc0, 0xad, 0x27, 0xa4, 0x2e, 0xbf, 0x35, 0xb6, 0x3c, 0x89, 0x3, 0x80, 0xa, 0x9b, 0x11, 0x92, 0x18, 0xe5, 0x6f, 0xec, 0x66, 
0xf7, 0x7d, 0xfe, 0x74, 0xc1, 0x4b, 0xc8, 0x42, 0xd3, 0x59, 0xda, 0x50, 0x7a, 0xf0, 0x73, 0xf9, 0x68, 0xe2, 0x61, 0xeb, 0x5e, 0xd4, 0x57, 0xdd, 0x4c, 0xc6, 0x45, 0xcf, 0x32, 0xb8, 0x3b, 0xb1, 0x20, 0xaa, 0x29, 0xa3, 0x16, 0x9c, 0x1f, 0x95, 0x4, 0x8e, 0xd, 0x87, 0xea, 0x60, 0xe3, 0x69, 0xf8, 0x72, 0xf1, 0x7b, 0xce, 0x44, 0xc7, 0x4d, 0xdc, 0x56, 0xd5, 0x5f, 0xa2, 0x28, 0xab, 0x21, 0xb0, 0x3a, 0xb9, 0x33, 0x86, 0xc, 0x8f, 0x5, 0x94, 0x1e, 0x9d, 0x17, 0x47, 0xcd, 0x4e, 0xc4, 0x55, 0xdf, 0x5c, 0xd6, 0x63, 0xe9, 0x6a, 0xe0, 0x71, 0xfb, 0x78, 0xf2, 0xf, 0x85, 0x6, 0x8c, 0x1d, 0x97, 0x14, 0x9e, 0x2b, 0xa1, 0x22, 0xa8, 0x39, 0xb3, 0x30, 0xba, 0xd7, 0x5d, 0xde, 0x54, 0xc5, 0x4f, 0xcc, 0x46, 0xf3, 0x79, 0xfa, 0x70, 0xe1, 0x6b, 0xe8, 0x62, 0x9f, 0x15, 0x96, 0x1c, 0x8d, 0x7, 0x84, 0xe, 0xbb, 0x31, 0xb2, 0x38, 0xa9, 0x23, 0xa0, 0x2a}, {0x0, 0x8b, 0xb, 0x80, 0x16, 0x9d, 0x1d, 0x96, 0x2c, 0xa7, 0x27, 0xac, 0x3a, 0xb1, 0x31, 0xba, 0x58, 0xd3, 0x53, 0xd8, 0x4e, 0xc5, 0x45, 0xce, 0x74, 0xff, 0x7f, 0xf4, 0x62, 0xe9, 0x69, 0xe2, 0xb0, 0x3b, 0xbb, 0x30, 0xa6, 0x2d, 0xad, 0x26, 0x9c, 0x17, 0x97, 0x1c, 0x8a, 0x1, 0x81, 0xa, 0xe8, 0x63, 0xe3, 0x68, 0xfe, 0x75, 0xf5, 0x7e, 0xc4, 0x4f, 0xcf, 0x44, 0xd2, 0x59, 0xd9, 0x52, 0x7d, 0xf6, 0x76, 0xfd, 0x6b, 0xe0, 0x60, 0xeb, 0x51, 0xda, 0x5a, 0xd1, 0x47, 0xcc, 0x4c, 0xc7, 0x25, 0xae, 0x2e, 0xa5, 0x33, 0xb8, 0x38, 0xb3, 0x9, 0x82, 0x2, 0x89, 0x1f, 0x94, 0x14, 0x9f, 0xcd, 0x46, 0xc6, 0x4d, 0xdb, 0x50, 0xd0, 0x5b, 0xe1, 0x6a, 0xea, 0x61, 0xf7, 0x7c, 0xfc, 0x77, 0x95, 0x1e, 0x9e, 0x15, 0x83, 0x8, 0x88, 0x3, 0xb9, 0x32, 0xb2, 0x39, 0xaf, 0x24, 0xa4, 0x2f, 0xfa, 0x71, 0xf1, 0x7a, 0xec, 0x67, 0xe7, 0x6c, 0xd6, 0x5d, 0xdd, 0x56, 0xc0, 0x4b, 0xcb, 0x40, 0xa2, 0x29, 0xa9, 0x22, 0xb4, 0x3f, 0xbf, 0x34, 0x8e, 0x5, 0x85, 0xe, 0x98, 0x13, 0x93, 0x18, 0x4a, 0xc1, 0x41, 0xca, 0x5c, 0xd7, 0x57, 0xdc, 0x66, 0xed, 0x6d, 0xe6, 0x70, 0xfb, 0x7b, 0xf0, 0x12, 0x99, 0x19, 0x92, 0x4, 0x8f, 0xf, 0x84, 0x3e, 0xb5, 0x35, 0xbe, 0x28, 0xa3, 0x23, 0xa8, 0x87, 0xc, 0x8c, 0x7, 0x91, 0x1a, 0x9a, 0x11, 0xab, 0x20, 0xa0, 0x2b, 0xbd, 0x36, 0xb6, 0x3d, 0xdf, 0x54, 0xd4, 0x5f, 0xc9, 0x42, 0xc2, 0x49, 0xf3, 0x78, 0xf8, 0x73, 0xe5, 0x6e, 0xee, 0x65, 0x37, 0xbc, 0x3c, 0xb7, 0x21, 0xaa, 0x2a, 0xa1, 0x1b, 0x90, 0x10, 0x9b, 0xd, 0x86, 0x6, 0x8d, 0x6f, 0xe4, 0x64, 0xef, 0x79, 0xf2, 0x72, 0xf9, 0x43, 0xc8, 0x48, 0xc3, 0x55, 0xde, 0x5e, 0xd5}, {0x0, 0x8c, 0x5, 0x89, 0xa, 0x86, 0xf, 0x83, 0x14, 0x98, 0x11, 0x9d, 0x1e, 0x92, 0x1b, 0x97, 0x28, 0xa4, 0x2d, 0xa1, 0x22, 0xae, 0x27, 0xab, 0x3c, 0xb0, 0x39, 0xb5, 0x36, 0xba, 0x33, 0xbf, 0x50, 0xdc, 0x55, 0xd9, 0x5a, 0xd6, 0x5f, 0xd3, 0x44, 0xc8, 0x41, 0xcd, 0x4e, 0xc2, 0x4b, 0xc7, 0x78, 0xf4, 0x7d, 0xf1, 0x72, 0xfe, 0x77, 0xfb, 0x6c, 0xe0, 0x69, 0xe5, 0x66, 0xea, 0x63, 0xef, 0xa0, 0x2c, 0xa5, 0x29, 0xaa, 0x26, 0xaf, 0x23, 0xb4, 0x38, 0xb1, 0x3d, 0xbe, 0x32, 0xbb, 0x37, 0x88, 0x4, 0x8d, 0x1, 0x82, 0xe, 0x87, 0xb, 0x9c, 0x10, 0x99, 0x15, 0x96, 0x1a, 0x93, 0x1f, 0xf0, 0x7c, 0xf5, 0x79, 0xfa, 0x76, 0xff, 0x73, 0xe4, 0x68, 0xe1, 0x6d, 0xee, 0x62, 0xeb, 0x67, 0xd8, 0x54, 0xdd, 0x51, 0xd2, 0x5e, 0xd7, 0x5b, 0xcc, 0x40, 0xc9, 0x45, 0xc6, 0x4a, 0xc3, 0x4f, 0x5d, 0xd1, 0x58, 0xd4, 0x57, 0xdb, 0x52, 0xde, 0x49, 0xc5, 0x4c, 0xc0, 0x43, 0xcf, 0x46, 0xca, 0x75, 0xf9, 0x70, 0xfc, 0x7f, 0xf3, 0x7a, 0xf6, 0x61, 0xed, 0x64, 0xe8, 0x6b, 0xe7, 0x6e, 0xe2, 0xd, 0x81, 0x8, 0x84, 0x7, 0x8b, 0x2, 0x8e, 0x19, 0x95, 0x1c, 0x90, 0x13, 0x9f, 0x16, 0x9a, 0x25, 0xa9, 0x20, 0xac, 0x2f, 0xa3, 0x2a, 0xa6, 0x31, 0xbd, 0x34, 0xb8, 0x3b, 0xb7, 0x3e, 0xb2, 0xfd, 0x71, 0xf8, 0x74, 0xf7, 0x7b, 0xf2, 0x7e, 0xe9, 
0x65, 0xec, 0x60, 0xe3, 0x6f, 0xe6, 0x6a, 0xd5, 0x59, 0xd0, 0x5c, 0xdf, 0x53, 0xda, 0x56, 0xc1, 0x4d, 0xc4, 0x48, 0xcb, 0x47, 0xce, 0x42, 0xad, 0x21, 0xa8, 0x24, 0xa7, 0x2b, 0xa2, 0x2e, 0xb9, 0x35, 0xbc, 0x30, 0xb3, 0x3f, 0xb6, 0x3a, 0x85, 0x9, 0x80, 0xc, 0x8f, 0x3, 0x8a, 0x6, 0x91, 0x1d, 0x94, 0x18, 0x9b, 0x17, 0x9e, 0x12}, {0x0, 0x8d, 0x7, 0x8a, 0xe, 0x83, 0x9, 0x84, 0x1c, 0x91, 0x1b, 0x96, 0x12, 0x9f, 0x15, 0x98, 0x38, 0xb5, 0x3f, 0xb2, 0x36, 0xbb, 0x31, 0xbc, 0x24, 0xa9, 0x23, 0xae, 0x2a, 0xa7, 0x2d, 0xa0, 0x70, 0xfd, 0x77, 0xfa, 0x7e, 0xf3, 0x79, 0xf4, 0x6c, 0xe1, 0x6b, 0xe6, 0x62, 0xef, 0x65, 0xe8, 0x48, 0xc5, 0x4f, 0xc2, 0x46, 0xcb, 0x41, 0xcc, 0x54, 0xd9, 0x53, 0xde, 0x5a, 0xd7, 0x5d, 0xd0, 0xe0, 0x6d, 0xe7, 0x6a, 0xee, 0x63, 0xe9, 0x64, 0xfc, 0x71, 0xfb, 0x76, 0xf2, 0x7f, 0xf5, 0x78, 0xd8, 0x55, 0xdf, 0x52, 0xd6, 0x5b, 0xd1, 0x5c, 0xc4, 0x49, 0xc3, 0x4e, 0xca, 0x47, 0xcd, 0x40, 0x90, 0x1d, 0x97, 0x1a, 0x9e, 0x13, 0x99, 0x14, 0x8c, 0x1, 0x8b, 0x6, 0x82, 0xf, 0x85, 0x8, 0xa8, 0x25, 0xaf, 0x22, 0xa6, 0x2b, 0xa1, 0x2c, 0xb4, 0x39, 0xb3, 0x3e, 0xba, 0x37, 0xbd, 0x30, 0xdd, 0x50, 0xda, 0x57, 0xd3, 0x5e, 0xd4, 0x59, 0xc1, 0x4c, 0xc6, 0x4b, 0xcf, 0x42, 0xc8, 0x45, 0xe5, 0x68, 0xe2, 0x6f, 0xeb, 0x66, 0xec, 0x61, 0xf9, 0x74, 0xfe, 0x73, 0xf7, 0x7a, 0xf0, 0x7d, 0xad, 0x20, 0xaa, 0x27, 0xa3, 0x2e, 0xa4, 0x29, 0xb1, 0x3c, 0xb6, 0x3b, 0xbf, 0x32, 0xb8, 0x35, 0x95, 0x18, 0x92, 0x1f, 0x9b, 0x16, 0x9c, 0x11, 0x89, 0x4, 0x8e, 0x3, 0x87, 0xa, 0x80, 0xd, 0x3d, 0xb0, 0x3a, 0xb7, 0x33, 0xbe, 0x34, 0xb9, 0x21, 0xac, 0x26, 0xab, 0x2f, 0xa2, 0x28, 0xa5, 0x5, 0x88, 0x2, 0x8f, 0xb, 0x86, 0xc, 0x81, 0x19, 0x94, 0x1e, 0x93, 0x17, 0x9a, 0x10, 0x9d, 0x4d, 0xc0, 0x4a, 0xc7, 0x43, 0xce, 0x44, 0xc9, 0x51, 0xdc, 0x56, 0xdb, 0x5f, 0xd2, 0x58, 0xd5, 0x75, 0xf8, 0x72, 0xff, 0x7b, 0xf6, 0x7c, 0xf1, 0x69, 0xe4, 0x6e, 0xe3, 0x67, 0xea, 0x60, 0xed}, {0x0, 0x8e, 0x1, 0x8f, 0x2, 0x8c, 0x3, 0x8d, 0x4, 0x8a, 0x5, 0x8b, 0x6, 0x88, 0x7, 0x89, 0x8, 0x86, 0x9, 0x87, 0xa, 0x84, 0xb, 0x85, 0xc, 0x82, 0xd, 0x83, 0xe, 0x80, 0xf, 0x81, 0x10, 0x9e, 0x11, 0x9f, 0x12, 0x9c, 0x13, 0x9d, 0x14, 0x9a, 0x15, 0x9b, 0x16, 0x98, 0x17, 0x99, 0x18, 0x96, 0x19, 0x97, 0x1a, 0x94, 0x1b, 0x95, 0x1c, 0x92, 0x1d, 0x93, 0x1e, 0x90, 0x1f, 0x91, 0x20, 0xae, 0x21, 0xaf, 0x22, 0xac, 0x23, 0xad, 0x24, 0xaa, 0x25, 0xab, 0x26, 0xa8, 0x27, 0xa9, 0x28, 0xa6, 0x29, 0xa7, 0x2a, 0xa4, 0x2b, 0xa5, 0x2c, 0xa2, 0x2d, 0xa3, 0x2e, 0xa0, 0x2f, 0xa1, 0x30, 0xbe, 0x31, 0xbf, 0x32, 0xbc, 0x33, 0xbd, 0x34, 0xba, 0x35, 0xbb, 0x36, 0xb8, 0x37, 0xb9, 0x38, 0xb6, 0x39, 0xb7, 0x3a, 0xb4, 0x3b, 0xb5, 0x3c, 0xb2, 0x3d, 0xb3, 0x3e, 0xb0, 0x3f, 0xb1, 0x40, 0xce, 0x41, 0xcf, 0x42, 0xcc, 0x43, 0xcd, 0x44, 0xca, 0x45, 0xcb, 0x46, 0xc8, 0x47, 0xc9, 0x48, 0xc6, 0x49, 0xc7, 0x4a, 0xc4, 0x4b, 0xc5, 0x4c, 0xc2, 0x4d, 0xc3, 0x4e, 0xc0, 0x4f, 0xc1, 0x50, 0xde, 0x51, 0xdf, 0x52, 0xdc, 0x53, 0xdd, 0x54, 0xda, 0x55, 0xdb, 0x56, 0xd8, 0x57, 0xd9, 0x58, 0xd6, 0x59, 0xd7, 0x5a, 0xd4, 0x5b, 0xd5, 0x5c, 0xd2, 0x5d, 0xd3, 0x5e, 0xd0, 0x5f, 0xd1, 0x60, 0xee, 0x61, 0xef, 0x62, 0xec, 0x63, 0xed, 0x64, 0xea, 0x65, 0xeb, 0x66, 0xe8, 0x67, 0xe9, 0x68, 0xe6, 0x69, 0xe7, 0x6a, 0xe4, 0x6b, 0xe5, 0x6c, 0xe2, 0x6d, 0xe3, 0x6e, 0xe0, 0x6f, 0xe1, 0x70, 0xfe, 0x71, 0xff, 0x72, 0xfc, 0x73, 0xfd, 0x74, 0xfa, 0x75, 0xfb, 0x76, 0xf8, 0x77, 0xf9, 0x78, 0xf6, 0x79, 0xf7, 0x7a, 0xf4, 0x7b, 0xf5, 0x7c, 0xf2, 0x7d, 0xf3, 0x7e, 0xf0, 0x7f, 0xf1}, {0x0, 0x8f, 0x3, 0x8c, 0x6, 0x89, 0x5, 0x8a, 0xc, 0x83, 0xf, 0x80, 0xa, 0x85, 0x9, 0x86, 0x18, 0x97, 0x1b, 0x94, 0x1e, 0x91, 0x1d, 0x92, 0x14, 0x9b, 0x17, 0x98, 0x12, 0x9d, 0x11, 
0x9e, 0x30, 0xbf, 0x33, 0xbc, 0x36, 0xb9, 0x35, 0xba, 0x3c, 0xb3, 0x3f, 0xb0, 0x3a, 0xb5, 0x39, 0xb6, 0x28, 0xa7, 0x2b, 0xa4, 0x2e, 0xa1, 0x2d, 0xa2, 0x24, 0xab, 0x27, 0xa8, 0x22, 0xad, 0x21, 0xae, 0x60, 0xef, 0x63, 0xec, 0x66, 0xe9, 0x65, 0xea, 0x6c, 0xe3, 0x6f, 0xe0, 0x6a, 0xe5, 0x69, 0xe6, 0x78, 0xf7, 0x7b, 0xf4, 0x7e, 0xf1, 0x7d, 0xf2, 0x74, 0xfb, 0x77, 0xf8, 0x72, 0xfd, 0x71, 0xfe, 0x50, 0xdf, 0x53, 0xdc, 0x56, 0xd9, 0x55, 0xda, 0x5c, 0xd3, 0x5f, 0xd0, 0x5a, 0xd5, 0x59, 0xd6, 0x48, 0xc7, 0x4b, 0xc4, 0x4e, 0xc1, 0x4d, 0xc2, 0x44, 0xcb, 0x47, 0xc8, 0x42, 0xcd, 0x41, 0xce, 0xc0, 0x4f, 0xc3, 0x4c, 0xc6, 0x49, 0xc5, 0x4a, 0xcc, 0x43, 0xcf, 0x40, 0xca, 0x45, 0xc9, 0x46, 0xd8, 0x57, 0xdb, 0x54, 0xde, 0x51, 0xdd, 0x52, 0xd4, 0x5b, 0xd7, 0x58, 0xd2, 0x5d, 0xd1, 0x5e, 0xf0, 0x7f, 0xf3, 0x7c, 0xf6, 0x79, 0xf5, 0x7a, 0xfc, 0x73, 0xff, 0x70, 0xfa, 0x75, 0xf9, 0x76, 0xe8, 0x67, 0xeb, 0x64, 0xee, 0x61, 0xed, 0x62, 0xe4, 0x6b, 0xe7, 0x68, 0xe2, 0x6d, 0xe1, 0x6e, 0xa0, 0x2f, 0xa3, 0x2c, 0xa6, 0x29, 0xa5, 0x2a, 0xac, 0x23, 0xaf, 0x20, 0xaa, 0x25, 0xa9, 0x26, 0xb8, 0x37, 0xbb, 0x34, 0xbe, 0x31, 0xbd, 0x32, 0xb4, 0x3b, 0xb7, 0x38, 0xb2, 0x3d, 0xb1, 0x3e, 0x90, 0x1f, 0x93, 0x1c, 0x96, 0x19, 0x95, 0x1a, 0x9c, 0x13, 0x9f, 0x10, 0x9a, 0x15, 0x99, 0x16, 0x88, 0x7, 0x8b, 0x4, 0x8e, 0x1, 0x8d, 0x2, 0x84, 0xb, 0x87, 0x8, 0x82, 0xd, 0x81, 0xe}, {0x0, 0x90, 0x3d, 0xad, 0x7a, 0xea, 0x47, 0xd7, 0xf4, 0x64, 0xc9, 0x59, 0x8e, 0x1e, 0xb3, 0x23, 0xf5, 0x65, 0xc8, 0x58, 0x8f, 0x1f, 0xb2, 0x22, 0x1, 0x91, 0x3c, 0xac, 0x7b, 0xeb, 0x46, 0xd6, 0xf7, 0x67, 0xca, 0x5a, 0x8d, 0x1d, 0xb0, 0x20, 0x3, 0x93, 0x3e, 0xae, 0x79, 0xe9, 0x44, 0xd4, 0x2, 0x92, 0x3f, 0xaf, 0x78, 0xe8, 0x45, 0xd5, 0xf6, 0x66, 0xcb, 0x5b, 0x8c, 0x1c, 0xb1, 0x21, 0xf3, 0x63, 0xce, 0x5e, 0x89, 0x19, 0xb4, 0x24, 0x7, 0x97, 0x3a, 0xaa, 0x7d, 0xed, 0x40, 0xd0, 0x6, 0x96, 0x3b, 0xab, 0x7c, 0xec, 0x41, 0xd1, 0xf2, 0x62, 0xcf, 0x5f, 0x88, 0x18, 0xb5, 0x25, 0x4, 0x94, 0x39, 0xa9, 0x7e, 0xee, 0x43, 0xd3, 0xf0, 0x60, 0xcd, 0x5d, 0x8a, 0x1a, 0xb7, 0x27, 0xf1, 0x61, 0xcc, 0x5c, 0x8b, 0x1b, 0xb6, 0x26, 0x5, 0x95, 0x38, 0xa8, 0x7f, 0xef, 0x42, 0xd2, 0xfb, 0x6b, 0xc6, 0x56, 0x81, 0x11, 0xbc, 0x2c, 0xf, 0x9f, 0x32, 0xa2, 0x75, 0xe5, 0x48, 0xd8, 0xe, 0x9e, 0x33, 0xa3, 0x74, 0xe4, 0x49, 0xd9, 0xfa, 0x6a, 0xc7, 0x57, 0x80, 0x10, 0xbd, 0x2d, 0xc, 0x9c, 0x31, 0xa1, 0x76, 0xe6, 0x4b, 0xdb, 0xf8, 0x68, 0xc5, 0x55, 0x82, 0x12, 0xbf, 0x2f, 0xf9, 0x69, 0xc4, 0x54, 0x83, 0x13, 0xbe, 0x2e, 0xd, 0x9d, 0x30, 0xa0, 0x77, 0xe7, 0x4a, 0xda, 0x8, 0x98, 0x35, 0xa5, 0x72, 0xe2, 0x4f, 0xdf, 0xfc, 0x6c, 0xc1, 0x51, 0x86, 0x16, 0xbb, 0x2b, 0xfd, 0x6d, 0xc0, 0x50, 0x87, 0x17, 0xba, 0x2a, 0x9, 0x99, 0x34, 0xa4, 0x73, 0xe3, 0x4e, 0xde, 0xff, 0x6f, 0xc2, 0x52, 0x85, 0x15, 0xb8, 0x28, 0xb, 0x9b, 0x36, 0xa6, 0x71, 0xe1, 0x4c, 0xdc, 0xa, 0x9a, 0x37, 0xa7, 0x70, 0xe0, 0x4d, 0xdd, 0xfe, 0x6e, 0xc3, 0x53, 0x84, 0x14, 0xb9, 0x29}, {0x0, 0x91, 0x3f, 0xae, 0x7e, 0xef, 0x41, 0xd0, 0xfc, 0x6d, 0xc3, 0x52, 0x82, 0x13, 0xbd, 0x2c, 0xe5, 0x74, 0xda, 0x4b, 0x9b, 0xa, 0xa4, 0x35, 0x19, 0x88, 0x26, 0xb7, 0x67, 0xf6, 0x58, 0xc9, 0xd7, 0x46, 0xe8, 0x79, 0xa9, 0x38, 0x96, 0x7, 0x2b, 0xba, 0x14, 0x85, 0x55, 0xc4, 0x6a, 0xfb, 0x32, 0xa3, 0xd, 0x9c, 0x4c, 0xdd, 0x73, 0xe2, 0xce, 0x5f, 0xf1, 0x60, 0xb0, 0x21, 0x8f, 0x1e, 0xb3, 0x22, 0x8c, 0x1d, 0xcd, 0x5c, 0xf2, 0x63, 0x4f, 0xde, 0x70, 0xe1, 0x31, 0xa0, 0xe, 0x9f, 0x56, 0xc7, 0x69, 0xf8, 0x28, 0xb9, 0x17, 0x86, 0xaa, 0x3b, 0x95, 0x4, 0xd4, 0x45, 0xeb, 0x7a, 0x64, 0xf5, 0x5b, 0xca, 0x1a, 0x8b, 0x25, 0xb4, 0x98, 0x9, 0xa7, 0x36, 0xe6, 0x77, 0xd9, 0x48, 0x81, 0x10, 0xbe, 0x2f, 
0xff, 0x6e, 0xc0, 0x51, 0x7d, 0xec, 0x42, 0xd3, 0x3, 0x92, 0x3c, 0xad, 0x7b, 0xea, 0x44, 0xd5, 0x5, 0x94, 0x3a, 0xab, 0x87, 0x16, 0xb8, 0x29, 0xf9, 0x68, 0xc6, 0x57, 0x9e, 0xf, 0xa1, 0x30, 0xe0, 0x71, 0xdf, 0x4e, 0x62, 0xf3, 0x5d, 0xcc, 0x1c, 0x8d, 0x23, 0xb2, 0xac, 0x3d, 0x93, 0x2, 0xd2, 0x43, 0xed, 0x7c, 0x50, 0xc1, 0x6f, 0xfe, 0x2e, 0xbf, 0x11, 0x80, 0x49, 0xd8, 0x76, 0xe7, 0x37, 0xa6, 0x8, 0x99, 0xb5, 0x24, 0x8a, 0x1b, 0xcb, 0x5a, 0xf4, 0x65, 0xc8, 0x59, 0xf7, 0x66, 0xb6, 0x27, 0x89, 0x18, 0x34, 0xa5, 0xb, 0x9a, 0x4a, 0xdb, 0x75, 0xe4, 0x2d, 0xbc, 0x12, 0x83, 0x53, 0xc2, 0x6c, 0xfd, 0xd1, 0x40, 0xee, 0x7f, 0xaf, 0x3e, 0x90, 0x1, 0x1f, 0x8e, 0x20, 0xb1, 0x61, 0xf0, 0x5e, 0xcf, 0xe3, 0x72, 0xdc, 0x4d, 0x9d, 0xc, 0xa2, 0x33, 0xfa, 0x6b, 0xc5, 0x54, 0x84, 0x15, 0xbb, 0x2a, 0x6, 0x97, 0x39, 0xa8, 0x78, 0xe9, 0x47, 0xd6}, {0x0, 0x92, 0x39, 0xab, 0x72, 0xe0, 0x4b, 0xd9, 0xe4, 0x76, 0xdd, 0x4f, 0x96, 0x4, 0xaf, 0x3d, 0xd5, 0x47, 0xec, 0x7e, 0xa7, 0x35, 0x9e, 0xc, 0x31, 0xa3, 0x8, 0x9a, 0x43, 0xd1, 0x7a, 0xe8, 0xb7, 0x25, 0x8e, 0x1c, 0xc5, 0x57, 0xfc, 0x6e, 0x53, 0xc1, 0x6a, 0xf8, 0x21, 0xb3, 0x18, 0x8a, 0x62, 0xf0, 0x5b, 0xc9, 0x10, 0x82, 0x29, 0xbb, 0x86, 0x14, 0xbf, 0x2d, 0xf4, 0x66, 0xcd, 0x5f, 0x73, 0xe1, 0x4a, 0xd8, 0x1, 0x93, 0x38, 0xaa, 0x97, 0x5, 0xae, 0x3c, 0xe5, 0x77, 0xdc, 0x4e, 0xa6, 0x34, 0x9f, 0xd, 0xd4, 0x46, 0xed, 0x7f, 0x42, 0xd0, 0x7b, 0xe9, 0x30, 0xa2, 0x9, 0x9b, 0xc4, 0x56, 0xfd, 0x6f, 0xb6, 0x24, 0x8f, 0x1d, 0x20, 0xb2, 0x19, 0x8b, 0x52, 0xc0, 0x6b, 0xf9, 0x11, 0x83, 0x28, 0xba, 0x63, 0xf1, 0x5a, 0xc8, 0xf5, 0x67, 0xcc, 0x5e, 0x87, 0x15, 0xbe, 0x2c, 0xe6, 0x74, 0xdf, 0x4d, 0x94, 0x6, 0xad, 0x3f, 0x2, 0x90, 0x3b, 0xa9, 0x70, 0xe2, 0x49, 0xdb, 0x33, 0xa1, 0xa, 0x98, 0x41, 0xd3, 0x78, 0xea, 0xd7, 0x45, 0xee, 0x7c, 0xa5, 0x37, 0x9c, 0xe, 0x51, 0xc3, 0x68, 0xfa, 0x23, 0xb1, 0x1a, 0x88, 0xb5, 0x27, 0x8c, 0x1e, 0xc7, 0x55, 0xfe, 0x6c, 0x84, 0x16, 0xbd, 0x2f, 0xf6, 0x64, 0xcf, 0x5d, 0x60, 0xf2, 0x59, 0xcb, 0x12, 0x80, 0x2b, 0xb9, 0x95, 0x7, 0xac, 0x3e, 0xe7, 0x75, 0xde, 0x4c, 0x71, 0xe3, 0x48, 0xda, 0x3, 0x91, 0x3a, 0xa8, 0x40, 0xd2, 0x79, 0xeb, 0x32, 0xa0, 0xb, 0x99, 0xa4, 0x36, 0x9d, 0xf, 0xd6, 0x44, 0xef, 0x7d, 0x22, 0xb0, 0x1b, 0x89, 0x50, 0xc2, 0x69, 0xfb, 0xc6, 0x54, 0xff, 0x6d, 0xb4, 0x26, 0x8d, 0x1f, 0xf7, 0x65, 0xce, 0x5c, 0x85, 0x17, 0xbc, 0x2e, 0x13, 0x81, 0x2a, 0xb8, 0x61, 0xf3, 0x58, 0xca}, {0x0, 0x93, 0x3b, 0xa8, 0x76, 0xe5, 0x4d, 0xde, 0xec, 0x7f, 0xd7, 0x44, 0x9a, 0x9, 0xa1, 0x32, 0xc5, 0x56, 0xfe, 0x6d, 0xb3, 0x20, 0x88, 0x1b, 0x29, 0xba, 0x12, 0x81, 0x5f, 0xcc, 0x64, 0xf7, 0x97, 0x4, 0xac, 0x3f, 0xe1, 0x72, 0xda, 0x49, 0x7b, 0xe8, 0x40, 0xd3, 0xd, 0x9e, 0x36, 0xa5, 0x52, 0xc1, 0x69, 0xfa, 0x24, 0xb7, 0x1f, 0x8c, 0xbe, 0x2d, 0x85, 0x16, 0xc8, 0x5b, 0xf3, 0x60, 0x33, 0xa0, 0x8, 0x9b, 0x45, 0xd6, 0x7e, 0xed, 0xdf, 0x4c, 0xe4, 0x77, 0xa9, 0x3a, 0x92, 0x1, 0xf6, 0x65, 0xcd, 0x5e, 0x80, 0x13, 0xbb, 0x28, 0x1a, 0x89, 0x21, 0xb2, 0x6c, 0xff, 0x57, 0xc4, 0xa4, 0x37, 0x9f, 0xc, 0xd2, 0x41, 0xe9, 0x7a, 0x48, 0xdb, 0x73, 0xe0, 0x3e, 0xad, 0x5, 0x96, 0x61, 0xf2, 0x5a, 0xc9, 0x17, 0x84, 0x2c, 0xbf, 0x8d, 0x1e, 0xb6, 0x25, 0xfb, 0x68, 0xc0, 0x53, 0x66, 0xf5, 0x5d, 0xce, 0x10, 0x83, 0x2b, 0xb8, 0x8a, 0x19, 0xb1, 0x22, 0xfc, 0x6f, 0xc7, 0x54, 0xa3, 0x30, 0x98, 0xb, 0xd5, 0x46, 0xee, 0x7d, 0x4f, 0xdc, 0x74, 0xe7, 0x39, 0xaa, 0x2, 0x91, 0xf1, 0x62, 0xca, 0x59, 0x87, 0x14, 0xbc, 0x2f, 0x1d, 0x8e, 0x26, 0xb5, 0x6b, 0xf8, 0x50, 0xc3, 0x34, 0xa7, 0xf, 0x9c, 0x42, 0xd1, 0x79, 0xea, 0xd8, 0x4b, 0xe3, 0x70, 0xae, 0x3d, 0x95, 0x6, 0x55, 0xc6, 0x6e, 0xfd, 0x23, 0xb0, 0x18, 0x8b, 0xb9, 0x2a, 
0x82, 0x11, 0xcf, 0x5c, 0xf4, 0x67, 0x90, 0x3, 0xab, 0x38, 0xe6, 0x75, 0xdd, 0x4e, 0x7c, 0xef, 0x47, 0xd4, 0xa, 0x99, 0x31, 0xa2, 0xc2, 0x51, 0xf9, 0x6a, 0xb4, 0x27, 0x8f, 0x1c, 0x2e, 0xbd, 0x15, 0x86, 0x58, 0xcb, 0x63, 0xf0, 0x7, 0x94, 0x3c, 0xaf, 0x71, 0xe2, 0x4a, 0xd9, 0xeb, 0x78, 0xd0, 0x43, 0x9d, 0xe, 0xa6, 0x35}, {0x0, 0x94, 0x35, 0xa1, 0x6a, 0xfe, 0x5f, 0xcb, 0xd4, 0x40, 0xe1, 0x75, 0xbe, 0x2a, 0x8b, 0x1f, 0xb5, 0x21, 0x80, 0x14, 0xdf, 0x4b, 0xea, 0x7e, 0x61, 0xf5, 0x54, 0xc0, 0xb, 0x9f, 0x3e, 0xaa, 0x77, 0xe3, 0x42, 0xd6, 0x1d, 0x89, 0x28, 0xbc, 0xa3, 0x37, 0x96, 0x2, 0xc9, 0x5d, 0xfc, 0x68, 0xc2, 0x56, 0xf7, 0x63, 0xa8, 0x3c, 0x9d, 0x9, 0x16, 0x82, 0x23, 0xb7, 0x7c, 0xe8, 0x49, 0xdd, 0xee, 0x7a, 0xdb, 0x4f, 0x84, 0x10, 0xb1, 0x25, 0x3a, 0xae, 0xf, 0x9b, 0x50, 0xc4, 0x65, 0xf1, 0x5b, 0xcf, 0x6e, 0xfa, 0x31, 0xa5, 0x4, 0x90, 0x8f, 0x1b, 0xba, 0x2e, 0xe5, 0x71, 0xd0, 0x44, 0x99, 0xd, 0xac, 0x38, 0xf3, 0x67, 0xc6, 0x52, 0x4d, 0xd9, 0x78, 0xec, 0x27, 0xb3, 0x12, 0x86, 0x2c, 0xb8, 0x19, 0x8d, 0x46, 0xd2, 0x73, 0xe7, 0xf8, 0x6c, 0xcd, 0x59, 0x92, 0x6, 0xa7, 0x33, 0xc1, 0x55, 0xf4, 0x60, 0xab, 0x3f, 0x9e, 0xa, 0x15, 0x81, 0x20, 0xb4, 0x7f, 0xeb, 0x4a, 0xde, 0x74, 0xe0, 0x41, 0xd5, 0x1e, 0x8a, 0x2b, 0xbf, 0xa0, 0x34, 0x95, 0x1, 0xca, 0x5e, 0xff, 0x6b, 0xb6, 0x22, 0x83, 0x17, 0xdc, 0x48, 0xe9, 0x7d, 0x62, 0xf6, 0x57, 0xc3, 0x8, 0x9c, 0x3d, 0xa9, 0x3, 0x97, 0x36, 0xa2, 0x69, 0xfd, 0x5c, 0xc8, 0xd7, 0x43, 0xe2, 0x76, 0xbd, 0x29, 0x88, 0x1c, 0x2f, 0xbb, 0x1a, 0x8e, 0x45, 0xd1, 0x70, 0xe4, 0xfb, 0x6f, 0xce, 0x5a, 0x91, 0x5, 0xa4, 0x30, 0x9a, 0xe, 0xaf, 0x3b, 0xf0, 0x64, 0xc5, 0x51, 0x4e, 0xda, 0x7b, 0xef, 0x24, 0xb0, 0x11, 0x85, 0x58, 0xcc, 0x6d, 0xf9, 0x32, 0xa6, 0x7, 0x93, 0x8c, 0x18, 0xb9, 0x2d, 0xe6, 0x72, 0xd3, 0x47, 0xed, 0x79, 0xd8, 0x4c, 0x87, 0x13, 0xb2, 0x26, 0x39, 0xad, 0xc, 0x98, 0x53, 0xc7, 0x66, 0xf2}, {0x0, 0x95, 0x37, 0xa2, 0x6e, 0xfb, 0x59, 0xcc, 0xdc, 0x49, 0xeb, 0x7e, 0xb2, 0x27, 0x85, 0x10, 0xa5, 0x30, 0x92, 0x7, 0xcb, 0x5e, 0xfc, 0x69, 0x79, 0xec, 0x4e, 0xdb, 0x17, 0x82, 0x20, 0xb5, 0x57, 0xc2, 0x60, 0xf5, 0x39, 0xac, 0xe, 0x9b, 0x8b, 0x1e, 0xbc, 0x29, 0xe5, 0x70, 0xd2, 0x47, 0xf2, 0x67, 0xc5, 0x50, 0x9c, 0x9, 0xab, 0x3e, 0x2e, 0xbb, 0x19, 0x8c, 0x40, 0xd5, 0x77, 0xe2, 0xae, 0x3b, 0x99, 0xc, 0xc0, 0x55, 0xf7, 0x62, 0x72, 0xe7, 0x45, 0xd0, 0x1c, 0x89, 0x2b, 0xbe, 0xb, 0x9e, 0x3c, 0xa9, 0x65, 0xf0, 0x52, 0xc7, 0xd7, 0x42, 0xe0, 0x75, 0xb9, 0x2c, 0x8e, 0x1b, 0xf9, 0x6c, 0xce, 0x5b, 0x97, 0x2, 0xa0, 0x35, 0x25, 0xb0, 0x12, 0x87, 0x4b, 0xde, 0x7c, 0xe9, 0x5c, 0xc9, 0x6b, 0xfe, 0x32, 0xa7, 0x5, 0x90, 0x80, 0x15, 0xb7, 0x22, 0xee, 0x7b, 0xd9, 0x4c, 0x41, 0xd4, 0x76, 0xe3, 0x2f, 0xba, 0x18, 0x8d, 0x9d, 0x8, 0xaa, 0x3f, 0xf3, 0x66, 0xc4, 0x51, 0xe4, 0x71, 0xd3, 0x46, 0x8a, 0x1f, 0xbd, 0x28, 0x38, 0xad, 0xf, 0x9a, 0x56, 0xc3, 0x61, 0xf4, 0x16, 0x83, 0x21, 0xb4, 0x78, 0xed, 0x4f, 0xda, 0xca, 0x5f, 0xfd, 0x68, 0xa4, 0x31, 0x93, 0x6, 0xb3, 0x26, 0x84, 0x11, 0xdd, 0x48, 0xea, 0x7f, 0x6f, 0xfa, 0x58, 0xcd, 0x1, 0x94, 0x36, 0xa3, 0xef, 0x7a, 0xd8, 0x4d, 0x81, 0x14, 0xb6, 0x23, 0x33, 0xa6, 0x4, 0x91, 0x5d, 0xc8, 0x6a, 0xff, 0x4a, 0xdf, 0x7d, 0xe8, 0x24, 0xb1, 0x13, 0x86, 0x96, 0x3, 0xa1, 0x34, 0xf8, 0x6d, 0xcf, 0x5a, 0xb8, 0x2d, 0x8f, 0x1a, 0xd6, 0x43, 0xe1, 0x74, 0x64, 0xf1, 0x53, 0xc6, 0xa, 0x9f, 0x3d, 0xa8, 0x1d, 0x88, 0x2a, 0xbf, 0x73, 0xe6, 0x44, 0xd1, 0xc1, 0x54, 0xf6, 0x63, 0xaf, 0x3a, 0x98, 0xd}, {0x0, 0x96, 0x31, 0xa7, 0x62, 0xf4, 0x53, 0xc5, 0xc4, 0x52, 0xf5, 0x63, 0xa6, 0x30, 0x97, 0x1, 0x95, 0x3, 0xa4, 0x32, 0xf7, 0x61, 0xc6, 0x50, 0x51, 0xc7, 0x60, 0xf6, 0x33, 0xa5, 0x2, 0x94, 
0x37, 0xa1, 0x6, 0x90, 0x55, 0xc3, 0x64, 0xf2, 0xf3, 0x65, 0xc2, 0x54, 0x91, 0x7, 0xa0, 0x36, 0xa2, 0x34, 0x93, 0x5, 0xc0, 0x56, 0xf1, 0x67, 0x66, 0xf0, 0x57, 0xc1, 0x4, 0x92, 0x35, 0xa3, 0x6e, 0xf8, 0x5f, 0xc9, 0xc, 0x9a, 0x3d, 0xab, 0xaa, 0x3c, 0x9b, 0xd, 0xc8, 0x5e, 0xf9, 0x6f, 0xfb, 0x6d, 0xca, 0x5c, 0x99, 0xf, 0xa8, 0x3e, 0x3f, 0xa9, 0xe, 0x98, 0x5d, 0xcb, 0x6c, 0xfa, 0x59, 0xcf, 0x68, 0xfe, 0x3b, 0xad, 0xa, 0x9c, 0x9d, 0xb, 0xac, 0x3a, 0xff, 0x69, 0xce, 0x58, 0xcc, 0x5a, 0xfd, 0x6b, 0xae, 0x38, 0x9f, 0x9, 0x8, 0x9e, 0x39, 0xaf, 0x6a, 0xfc, 0x5b, 0xcd, 0xdc, 0x4a, 0xed, 0x7b, 0xbe, 0x28, 0x8f, 0x19, 0x18, 0x8e, 0x29, 0xbf, 0x7a, 0xec, 0x4b, 0xdd, 0x49, 0xdf, 0x78, 0xee, 0x2b, 0xbd, 0x1a, 0x8c, 0x8d, 0x1b, 0xbc, 0x2a, 0xef, 0x79, 0xde, 0x48, 0xeb, 0x7d, 0xda, 0x4c, 0x89, 0x1f, 0xb8, 0x2e, 0x2f, 0xb9, 0x1e, 0x88, 0x4d, 0xdb, 0x7c, 0xea, 0x7e, 0xe8, 0x4f, 0xd9, 0x1c, 0x8a, 0x2d, 0xbb, 0xba, 0x2c, 0x8b, 0x1d, 0xd8, 0x4e, 0xe9, 0x7f, 0xb2, 0x24, 0x83, 0x15, 0xd0, 0x46, 0xe1, 0x77, 0x76, 0xe0, 0x47, 0xd1, 0x14, 0x82, 0x25, 0xb3, 0x27, 0xb1, 0x16, 0x80, 0x45, 0xd3, 0x74, 0xe2, 0xe3, 0x75, 0xd2, 0x44, 0x81, 0x17, 0xb0, 0x26, 0x85, 0x13, 0xb4, 0x22, 0xe7, 0x71, 0xd6, 0x40, 0x41, 0xd7, 0x70, 0xe6, 0x23, 0xb5, 0x12, 0x84, 0x10, 0x86, 0x21, 0xb7, 0x72, 0xe4, 0x43, 0xd5, 0xd4, 0x42, 0xe5, 0x73, 0xb6, 0x20, 0x87, 0x11}, {0x0, 0x97, 0x33, 0xa4, 0x66, 0xf1, 0x55, 0xc2, 0xcc, 0x5b, 0xff, 0x68, 0xaa, 0x3d, 0x99, 0xe, 0x85, 0x12, 0xb6, 0x21, 0xe3, 0x74, 0xd0, 0x47, 0x49, 0xde, 0x7a, 0xed, 0x2f, 0xb8, 0x1c, 0x8b, 0x17, 0x80, 0x24, 0xb3, 0x71, 0xe6, 0x42, 0xd5, 0xdb, 0x4c, 0xe8, 0x7f, 0xbd, 0x2a, 0x8e, 0x19, 0x92, 0x5, 0xa1, 0x36, 0xf4, 0x63, 0xc7, 0x50, 0x5e, 0xc9, 0x6d, 0xfa, 0x38, 0xaf, 0xb, 0x9c, 0x2e, 0xb9, 0x1d, 0x8a, 0x48, 0xdf, 0x7b, 0xec, 0xe2, 0x75, 0xd1, 0x46, 0x84, 0x13, 0xb7, 0x20, 0xab, 0x3c, 0x98, 0xf, 0xcd, 0x5a, 0xfe, 0x69, 0x67, 0xf0, 0x54, 0xc3, 0x1, 0x96, 0x32, 0xa5, 0x39, 0xae, 0xa, 0x9d, 0x5f, 0xc8, 0x6c, 0xfb, 0xf5, 0x62, 0xc6, 0x51, 0x93, 0x4, 0xa0, 0x37, 0xbc, 0x2b, 0x8f, 0x18, 0xda, 0x4d, 0xe9, 0x7e, 0x70, 0xe7, 0x43, 0xd4, 0x16, 0x81, 0x25, 0xb2, 0x5c, 0xcb, 0x6f, 0xf8, 0x3a, 0xad, 0x9, 0x9e, 0x90, 0x7, 0xa3, 0x34, 0xf6, 0x61, 0xc5, 0x52, 0xd9, 0x4e, 0xea, 0x7d, 0xbf, 0x28, 0x8c, 0x1b, 0x15, 0x82, 0x26, 0xb1, 0x73, 0xe4, 0x40, 0xd7, 0x4b, 0xdc, 0x78, 0xef, 0x2d, 0xba, 0x1e, 0x89, 0x87, 0x10, 0xb4, 0x23, 0xe1, 0x76, 0xd2, 0x45, 0xce, 0x59, 0xfd, 0x6a, 0xa8, 0x3f, 0x9b, 0xc, 0x2, 0x95, 0x31, 0xa6, 0x64, 0xf3, 0x57, 0xc0, 0x72, 0xe5, 0x41, 0xd6, 0x14, 0x83, 0x27, 0xb0, 0xbe, 0x29, 0x8d, 0x1a, 0xd8, 0x4f, 0xeb, 0x7c, 0xf7, 0x60, 0xc4, 0x53, 0x91, 0x6, 0xa2, 0x35, 0x3b, 0xac, 0x8, 0x9f, 0x5d, 0xca, 0x6e, 0xf9, 0x65, 0xf2, 0x56, 0xc1, 0x3, 0x94, 0x30, 0xa7, 0xa9, 0x3e, 0x9a, 0xd, 0xcf, 0x58, 0xfc, 0x6b, 0xe0, 0x77, 0xd3, 0x44, 0x86, 0x11, 0xb5, 0x22, 0x2c, 0xbb, 0x1f, 0x88, 0x4a, 0xdd, 0x79, 0xee}, {0x0, 0x98, 0x2d, 0xb5, 0x5a, 0xc2, 0x77, 0xef, 0xb4, 0x2c, 0x99, 0x1, 0xee, 0x76, 0xc3, 0x5b, 0x75, 0xed, 0x58, 0xc0, 0x2f, 0xb7, 0x2, 0x9a, 0xc1, 0x59, 0xec, 0x74, 0x9b, 0x3, 0xb6, 0x2e, 0xea, 0x72, 0xc7, 0x5f, 0xb0, 0x28, 0x9d, 0x5, 0x5e, 0xc6, 0x73, 0xeb, 0x4, 0x9c, 0x29, 0xb1, 0x9f, 0x7, 0xb2, 0x2a, 0xc5, 0x5d, 0xe8, 0x70, 0x2b, 0xb3, 0x6, 0x9e, 0x71, 0xe9, 0x5c, 0xc4, 0xc9, 0x51, 0xe4, 0x7c, 0x93, 0xb, 0xbe, 0x26, 0x7d, 0xe5, 0x50, 0xc8, 0x27, 0xbf, 0xa, 0x92, 0xbc, 0x24, 0x91, 0x9, 0xe6, 0x7e, 0xcb, 0x53, 0x8, 0x90, 0x25, 0xbd, 0x52, 0xca, 0x7f, 0xe7, 0x23, 0xbb, 0xe, 0x96, 0x79, 0xe1, 0x54, 0xcc, 0x97, 0xf, 0xba, 0x22, 0xcd, 0x55, 0xe0, 0x78, 0x56, 0xce, 0x7b, 0xe3, 0xc, 0x94, 0x21, 
0xb9, 0xe2, 0x7a, 0xcf, 0x57, 0xb8, 0x20, 0x95, 0xd, 0x8f, 0x17, 0xa2, 0x3a, 0xd5, 0x4d, 0xf8, 0x60, 0x3b, 0xa3, 0x16, 0x8e, 0x61, 0xf9, 0x4c, 0xd4, 0xfa, 0x62, 0xd7, 0x4f, 0xa0, 0x38, 0x8d, 0x15, 0x4e, 0xd6, 0x63, 0xfb, 0x14, 0x8c, 0x39, 0xa1, 0x65, 0xfd, 0x48, 0xd0, 0x3f, 0xa7, 0x12, 0x8a, 0xd1, 0x49, 0xfc, 0x64, 0x8b, 0x13, 0xa6, 0x3e, 0x10, 0x88, 0x3d, 0xa5, 0x4a, 0xd2, 0x67, 0xff, 0xa4, 0x3c, 0x89, 0x11, 0xfe, 0x66, 0xd3, 0x4b, 0x46, 0xde, 0x6b, 0xf3, 0x1c, 0x84, 0x31, 0xa9, 0xf2, 0x6a, 0xdf, 0x47, 0xa8, 0x30, 0x85, 0x1d, 0x33, 0xab, 0x1e, 0x86, 0x69, 0xf1, 0x44, 0xdc, 0x87, 0x1f, 0xaa, 0x32, 0xdd, 0x45, 0xf0, 0x68, 0xac, 0x34, 0x81, 0x19, 0xf6, 0x6e, 0xdb, 0x43, 0x18, 0x80, 0x35, 0xad, 0x42, 0xda, 0x6f, 0xf7, 0xd9, 0x41, 0xf4, 0x6c, 0x83, 0x1b, 0xae, 0x36, 0x6d, 0xf5, 0x40, 0xd8, 0x37, 0xaf, 0x1a, 0x82}, {0x0, 0x99, 0x2f, 0xb6, 0x5e, 0xc7, 0x71, 0xe8, 0xbc, 0x25, 0x93, 0xa, 0xe2, 0x7b, 0xcd, 0x54, 0x65, 0xfc, 0x4a, 0xd3, 0x3b, 0xa2, 0x14, 0x8d, 0xd9, 0x40, 0xf6, 0x6f, 0x87, 0x1e, 0xa8, 0x31, 0xca, 0x53, 0xe5, 0x7c, 0x94, 0xd, 0xbb, 0x22, 0x76, 0xef, 0x59, 0xc0, 0x28, 0xb1, 0x7, 0x9e, 0xaf, 0x36, 0x80, 0x19, 0xf1, 0x68, 0xde, 0x47, 0x13, 0x8a, 0x3c, 0xa5, 0x4d, 0xd4, 0x62, 0xfb, 0x89, 0x10, 0xa6, 0x3f, 0xd7, 0x4e, 0xf8, 0x61, 0x35, 0xac, 0x1a, 0x83, 0x6b, 0xf2, 0x44, 0xdd, 0xec, 0x75, 0xc3, 0x5a, 0xb2, 0x2b, 0x9d, 0x4, 0x50, 0xc9, 0x7f, 0xe6, 0xe, 0x97, 0x21, 0xb8, 0x43, 0xda, 0x6c, 0xf5, 0x1d, 0x84, 0x32, 0xab, 0xff, 0x66, 0xd0, 0x49, 0xa1, 0x38, 0x8e, 0x17, 0x26, 0xbf, 0x9, 0x90, 0x78, 0xe1, 0x57, 0xce, 0x9a, 0x3, 0xb5, 0x2c, 0xc4, 0x5d, 0xeb, 0x72, 0xf, 0x96, 0x20, 0xb9, 0x51, 0xc8, 0x7e, 0xe7, 0xb3, 0x2a, 0x9c, 0x5, 0xed, 0x74, 0xc2, 0x5b, 0x6a, 0xf3, 0x45, 0xdc, 0x34, 0xad, 0x1b, 0x82, 0xd6, 0x4f, 0xf9, 0x60, 0x88, 0x11, 0xa7, 0x3e, 0xc5, 0x5c, 0xea, 0x73, 0x9b, 0x2, 0xb4, 0x2d, 0x79, 0xe0, 0x56, 0xcf, 0x27, 0xbe, 0x8, 0x91, 0xa0, 0x39, 0x8f, 0x16, 0xfe, 0x67, 0xd1, 0x48, 0x1c, 0x85, 0x33, 0xaa, 0x42, 0xdb, 0x6d, 0xf4, 0x86, 0x1f, 0xa9, 0x30, 0xd8, 0x41, 0xf7, 0x6e, 0x3a, 0xa3, 0x15, 0x8c, 0x64, 0xfd, 0x4b, 0xd2, 0xe3, 0x7a, 0xcc, 0x55, 0xbd, 0x24, 0x92, 0xb, 0x5f, 0xc6, 0x70, 0xe9, 0x1, 0x98, 0x2e, 0xb7, 0x4c, 0xd5, 0x63, 0xfa, 0x12, 0x8b, 0x3d, 0xa4, 0xf0, 0x69, 0xdf, 0x46, 0xae, 0x37, 0x81, 0x18, 0x29, 0xb0, 0x6, 0x9f, 0x77, 0xee, 0x58, 0xc1, 0x95, 0xc, 0xba, 0x23, 0xcb, 0x52, 0xe4, 0x7d}, {0x0, 0x9a, 0x29, 0xb3, 0x52, 0xc8, 0x7b, 0xe1, 0xa4, 0x3e, 0x8d, 0x17, 0xf6, 0x6c, 0xdf, 0x45, 0x55, 0xcf, 0x7c, 0xe6, 0x7, 0x9d, 0x2e, 0xb4, 0xf1, 0x6b, 0xd8, 0x42, 0xa3, 0x39, 0x8a, 0x10, 0xaa, 0x30, 0x83, 0x19, 0xf8, 0x62, 0xd1, 0x4b, 0xe, 0x94, 0x27, 0xbd, 0x5c, 0xc6, 0x75, 0xef, 0xff, 0x65, 0xd6, 0x4c, 0xad, 0x37, 0x84, 0x1e, 0x5b, 0xc1, 0x72, 0xe8, 0x9, 0x93, 0x20, 0xba, 0x49, 0xd3, 0x60, 0xfa, 0x1b, 0x81, 0x32, 0xa8, 0xed, 0x77, 0xc4, 0x5e, 0xbf, 0x25, 0x96, 0xc, 0x1c, 0x86, 0x35, 0xaf, 0x4e, 0xd4, 0x67, 0xfd, 0xb8, 0x22, 0x91, 0xb, 0xea, 0x70, 0xc3, 0x59, 0xe3, 0x79, 0xca, 0x50, 0xb1, 0x2b, 0x98, 0x2, 0x47, 0xdd, 0x6e, 0xf4, 0x15, 0x8f, 0x3c, 0xa6, 0xb6, 0x2c, 0x9f, 0x5, 0xe4, 0x7e, 0xcd, 0x57, 0x12, 0x88, 0x3b, 0xa1, 0x40, 0xda, 0x69, 0xf3, 0x92, 0x8, 0xbb, 0x21, 0xc0, 0x5a, 0xe9, 0x73, 0x36, 0xac, 0x1f, 0x85, 0x64, 0xfe, 0x4d, 0xd7, 0xc7, 0x5d, 0xee, 0x74, 0x95, 0xf, 0xbc, 0x26, 0x63, 0xf9, 0x4a, 0xd0, 0x31, 0xab, 0x18, 0x82, 0x38, 0xa2, 0x11, 0x8b, 0x6a, 0xf0, 0x43, 0xd9, 0x9c, 0x6, 0xb5, 0x2f, 0xce, 0x54, 0xe7, 0x7d, 0x6d, 0xf7, 0x44, 0xde, 0x3f, 0xa5, 0x16, 0x8c, 0xc9, 0x53, 0xe0, 0x7a, 0x9b, 0x1, 0xb2, 0x28, 0xdb, 0x41, 0xf2, 0x68, 0x89, 0x13, 0xa0, 0x3a, 0x7f, 0xe5, 0x56, 
0xcc, 0x2d, 0xb7, 0x4, 0x9e, 0x8e, 0x14, 0xa7, 0x3d, 0xdc, 0x46, 0xf5, 0x6f, 0x2a, 0xb0, 0x3, 0x99, 0x78, 0xe2, 0x51, 0xcb, 0x71, 0xeb, 0x58, 0xc2, 0x23, 0xb9, 0xa, 0x90, 0xd5, 0x4f, 0xfc, 0x66, 0x87, 0x1d, 0xae, 0x34, 0x24, 0xbe, 0xd, 0x97, 0x76, 0xec, 0x5f, 0xc5, 0x80, 0x1a, 0xa9, 0x33, 0xd2, 0x48, 0xfb, 0x61}, {0x0, 0x9b, 0x2b, 0xb0, 0x56, 0xcd, 0x7d, 0xe6, 0xac, 0x37, 0x87, 0x1c, 0xfa, 0x61, 0xd1, 0x4a, 0x45, 0xde, 0x6e, 0xf5, 0x13, 0x88, 0x38, 0xa3, 0xe9, 0x72, 0xc2, 0x59, 0xbf, 0x24, 0x94, 0xf, 0x8a, 0x11, 0xa1, 0x3a, 0xdc, 0x47, 0xf7, 0x6c, 0x26, 0xbd, 0xd, 0x96, 0x70, 0xeb, 0x5b, 0xc0, 0xcf, 0x54, 0xe4, 0x7f, 0x99, 0x2, 0xb2, 0x29, 0x63, 0xf8, 0x48, 0xd3, 0x35, 0xae, 0x1e, 0x85, 0x9, 0x92, 0x22, 0xb9, 0x5f, 0xc4, 0x74, 0xef, 0xa5, 0x3e, 0x8e, 0x15, 0xf3, 0x68, 0xd8, 0x43, 0x4c, 0xd7, 0x67, 0xfc, 0x1a, 0x81, 0x31, 0xaa, 0xe0, 0x7b, 0xcb, 0x50, 0xb6, 0x2d, 0x9d, 0x6, 0x83, 0x18, 0xa8, 0x33, 0xd5, 0x4e, 0xfe, 0x65, 0x2f, 0xb4, 0x4, 0x9f, 0x79, 0xe2, 0x52, 0xc9, 0xc6, 0x5d, 0xed, 0x76, 0x90, 0xb, 0xbb, 0x20, 0x6a, 0xf1, 0x41, 0xda, 0x3c, 0xa7, 0x17, 0x8c, 0x12, 0x89, 0x39, 0xa2, 0x44, 0xdf, 0x6f, 0xf4, 0xbe, 0x25, 0x95, 0xe, 0xe8, 0x73, 0xc3, 0x58, 0x57, 0xcc, 0x7c, 0xe7, 0x1, 0x9a, 0x2a, 0xb1, 0xfb, 0x60, 0xd0, 0x4b, 0xad, 0x36, 0x86, 0x1d, 0x98, 0x3, 0xb3, 0x28, 0xce, 0x55, 0xe5, 0x7e, 0x34, 0xaf, 0x1f, 0x84, 0x62, 0xf9, 0x49, 0xd2, 0xdd, 0x46, 0xf6, 0x6d, 0x8b, 0x10, 0xa0, 0x3b, 0x71, 0xea, 0x5a, 0xc1, 0x27, 0xbc, 0xc, 0x97, 0x1b, 0x80, 0x30, 0xab, 0x4d, 0xd6, 0x66, 0xfd, 0xb7, 0x2c, 0x9c, 0x7, 0xe1, 0x7a, 0xca, 0x51, 0x5e, 0xc5, 0x75, 0xee, 0x8, 0x93, 0x23, 0xb8, 0xf2, 0x69, 0xd9, 0x42, 0xa4, 0x3f, 0x8f, 0x14, 0x91, 0xa, 0xba, 0x21, 0xc7, 0x5c, 0xec, 0x77, 0x3d, 0xa6, 0x16, 0x8d, 0x6b, 0xf0, 0x40, 0xdb, 0xd4, 0x4f, 0xff, 0x64, 0x82, 0x19, 0xa9, 0x32, 0x78, 0xe3, 0x53, 0xc8, 0x2e, 0xb5, 0x5, 0x9e}, {0x0, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, 0x94, 0x8, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67, 0x35, 0xa9, 0x10, 0x8c, 0x7f, 0xe3, 0x5a, 0xc6, 0xa1, 0x3d, 0x84, 0x18, 0xeb, 0x77, 0xce, 0x52, 0x6a, 0xf6, 0x4f, 0xd3, 0x20, 0xbc, 0x5, 0x99, 0xfe, 0x62, 0xdb, 0x47, 0xb4, 0x28, 0x91, 0xd, 0x5f, 0xc3, 0x7a, 0xe6, 0x15, 0x89, 0x30, 0xac, 0xcb, 0x57, 0xee, 0x72, 0x81, 0x1d, 0xa4, 0x38, 0xd4, 0x48, 0xf1, 0x6d, 0x9e, 0x2, 0xbb, 0x27, 0x40, 0xdc, 0x65, 0xf9, 0xa, 0x96, 0x2f, 0xb3, 0xe1, 0x7d, 0xc4, 0x58, 0xab, 0x37, 0x8e, 0x12, 0x75, 0xe9, 0x50, 0xcc, 0x3f, 0xa3, 0x1a, 0x86, 0xbe, 0x22, 0x9b, 0x7, 0xf4, 0x68, 0xd1, 0x4d, 0x2a, 0xb6, 0xf, 0x93, 0x60, 0xfc, 0x45, 0xd9, 0x8b, 0x17, 0xae, 0x32, 0xc1, 0x5d, 0xe4, 0x78, 0x1f, 0x83, 0x3a, 0xa6, 0x55, 0xc9, 0x70, 0xec, 0xb5, 0x29, 0x90, 0xc, 0xff, 0x63, 0xda, 0x46, 0x21, 0xbd, 0x4, 0x98, 0x6b, 0xf7, 0x4e, 0xd2, 0x80, 0x1c, 0xa5, 0x39, 0xca, 0x56, 0xef, 0x73, 0x14, 0x88, 0x31, 0xad, 0x5e, 0xc2, 0x7b, 0xe7, 0xdf, 0x43, 0xfa, 0x66, 0x95, 0x9, 0xb0, 0x2c, 0x4b, 0xd7, 0x6e, 0xf2, 0x1, 0x9d, 0x24, 0xb8, 0xea, 0x76, 0xcf, 0x53, 0xa0, 0x3c, 0x85, 0x19, 0x7e, 0xe2, 0x5b, 0xc7, 0x34, 0xa8, 0x11, 0x8d, 0x61, 0xfd, 0x44, 0xd8, 0x2b, 0xb7, 0xe, 0x92, 0xf5, 0x69, 0xd0, 0x4c, 0xbf, 0x23, 0x9a, 0x6, 0x54, 0xc8, 0x71, 0xed, 0x1e, 0x82, 0x3b, 0xa7, 0xc0, 0x5c, 0xe5, 0x79, 0x8a, 0x16, 0xaf, 0x33, 0xb, 0x97, 0x2e, 0xb2, 0x41, 0xdd, 0x64, 0xf8, 0x9f, 0x3, 0xba, 0x26, 0xd5, 0x49, 0xf0, 0x6c, 0x3e, 0xa2, 0x1b, 0x87, 0x74, 0xe8, 0x51, 0xcd, 0xaa, 0x36, 0x8f, 0x13, 0xe0, 0x7c, 0xc5, 0x59}, {0x0, 0x9d, 0x27, 0xba, 0x4e, 0xd3, 0x69, 0xf4, 0x9c, 0x1, 0xbb, 0x26, 0xd2, 0x4f, 0xf5, 0x68, 0x25, 0xb8, 0x2, 0x9f, 0x6b, 0xf6, 0x4c, 0xd1, 0xb9, 0x24, 0x9e, 0x3, 0xf7, 0x6a, 0xd0, 0x4d, 0x4a, 
0xd7, 0x6d, 0xf0, 0x4, 0x99, 0x23, 0xbe, 0xd6, 0x4b, 0xf1, 0x6c, 0x98, 0x5, 0xbf, 0x22, 0x6f, 0xf2, 0x48, 0xd5, 0x21, 0xbc, 0x6, 0x9b, 0xf3, 0x6e, 0xd4, 0x49, 0xbd, 0x20, 0x9a, 0x7, 0x94, 0x9, 0xb3, 0x2e, 0xda, 0x47, 0xfd, 0x60, 0x8, 0x95, 0x2f, 0xb2, 0x46, 0xdb, 0x61, 0xfc, 0xb1, 0x2c, 0x96, 0xb, 0xff, 0x62, 0xd8, 0x45, 0x2d, 0xb0, 0xa, 0x97, 0x63, 0xfe, 0x44, 0xd9, 0xde, 0x43, 0xf9, 0x64, 0x90, 0xd, 0xb7, 0x2a, 0x42, 0xdf, 0x65, 0xf8, 0xc, 0x91, 0x2b, 0xb6, 0xfb, 0x66, 0xdc, 0x41, 0xb5, 0x28, 0x92, 0xf, 0x67, 0xfa, 0x40, 0xdd, 0x29, 0xb4, 0xe, 0x93, 0x35, 0xa8, 0x12, 0x8f, 0x7b, 0xe6, 0x5c, 0xc1, 0xa9, 0x34, 0x8e, 0x13, 0xe7, 0x7a, 0xc0, 0x5d, 0x10, 0x8d, 0x37, 0xaa, 0x5e, 0xc3, 0x79, 0xe4, 0x8c, 0x11, 0xab, 0x36, 0xc2, 0x5f, 0xe5, 0x78, 0x7f, 0xe2, 0x58, 0xc5, 0x31, 0xac, 0x16, 0x8b, 0xe3, 0x7e, 0xc4, 0x59, 0xad, 0x30, 0x8a, 0x17, 0x5a, 0xc7, 0x7d, 0xe0, 0x14, 0x89, 0x33, 0xae, 0xc6, 0x5b, 0xe1, 0x7c, 0x88, 0x15, 0xaf, 0x32, 0xa1, 0x3c, 0x86, 0x1b, 0xef, 0x72, 0xc8, 0x55, 0x3d, 0xa0, 0x1a, 0x87, 0x73, 0xee, 0x54, 0xc9, 0x84, 0x19, 0xa3, 0x3e, 0xca, 0x57, 0xed, 0x70, 0x18, 0x85, 0x3f, 0xa2, 0x56, 0xcb, 0x71, 0xec, 0xeb, 0x76, 0xcc, 0x51, 0xa5, 0x38, 0x82, 0x1f, 0x77, 0xea, 0x50, 0xcd, 0x39, 0xa4, 0x1e, 0x83, 0xce, 0x53, 0xe9, 0x74, 0x80, 0x1d, 0xa7, 0x3a, 0x52, 0xcf, 0x75, 0xe8, 0x1c, 0x81, 0x3b, 0xa6}, {0x0, 0x9e, 0x21, 0xbf, 0x42, 0xdc, 0x63, 0xfd, 0x84, 0x1a, 0xa5, 0x3b, 0xc6, 0x58, 0xe7, 0x79, 0x15, 0x8b, 0x34, 0xaa, 0x57, 0xc9, 0x76, 0xe8, 0x91, 0xf, 0xb0, 0x2e, 0xd3, 0x4d, 0xf2, 0x6c, 0x2a, 0xb4, 0xb, 0x95, 0x68, 0xf6, 0x49, 0xd7, 0xae, 0x30, 0x8f, 0x11, 0xec, 0x72, 0xcd, 0x53, 0x3f, 0xa1, 0x1e, 0x80, 0x7d, 0xe3, 0x5c, 0xc2, 0xbb, 0x25, 0x9a, 0x4, 0xf9, 0x67, 0xd8, 0x46, 0x54, 0xca, 0x75, 0xeb, 0x16, 0x88, 0x37, 0xa9, 0xd0, 0x4e, 0xf1, 0x6f, 0x92, 0xc, 0xb3, 0x2d, 0x41, 0xdf, 0x60, 0xfe, 0x3, 0x9d, 0x22, 0xbc, 0xc5, 0x5b, 0xe4, 0x7a, 0x87, 0x19, 0xa6, 0x38, 0x7e, 0xe0, 0x5f, 0xc1, 0x3c, 0xa2, 0x1d, 0x83, 0xfa, 0x64, 0xdb, 0x45, 0xb8, 0x26, 0x99, 0x7, 0x6b, 0xf5, 0x4a, 0xd4, 0x29, 0xb7, 0x8, 0x96, 0xef, 0x71, 0xce, 0x50, 0xad, 0x33, 0x8c, 0x12, 0xa8, 0x36, 0x89, 0x17, 0xea, 0x74, 0xcb, 0x55, 0x2c, 0xb2, 0xd, 0x93, 0x6e, 0xf0, 0x4f, 0xd1, 0xbd, 0x23, 0x9c, 0x2, 0xff, 0x61, 0xde, 0x40, 0x39, 0xa7, 0x18, 0x86, 0x7b, 0xe5, 0x5a, 0xc4, 0x82, 0x1c, 0xa3, 0x3d, 0xc0, 0x5e, 0xe1, 0x7f, 0x6, 0x98, 0x27, 0xb9, 0x44, 0xda, 0x65, 0xfb, 0x97, 0x9, 0xb6, 0x28, 0xd5, 0x4b, 0xf4, 0x6a, 0x13, 0x8d, 0x32, 0xac, 0x51, 0xcf, 0x70, 0xee, 0xfc, 0x62, 0xdd, 0x43, 0xbe, 0x20, 0x9f, 0x1, 0x78, 0xe6, 0x59, 0xc7, 0x3a, 0xa4, 0x1b, 0x85, 0xe9, 0x77, 0xc8, 0x56, 0xab, 0x35, 0x8a, 0x14, 0x6d, 0xf3, 0x4c, 0xd2, 0x2f, 0xb1, 0xe, 0x90, 0xd6, 0x48, 0xf7, 0x69, 0x94, 0xa, 0xb5, 0x2b, 0x52, 0xcc, 0x73, 0xed, 0x10, 0x8e, 0x31, 0xaf, 0xc3, 0x5d, 0xe2, 0x7c, 0x81, 0x1f, 0xa0, 0x3e, 0x47, 0xd9, 0x66, 0xf8, 0x5, 0x9b, 0x24, 0xba}, {0x0, 0x9f, 0x23, 0xbc, 0x46, 0xd9, 0x65, 0xfa, 0x8c, 0x13, 0xaf, 0x30, 0xca, 0x55, 0xe9, 0x76, 0x5, 0x9a, 0x26, 0xb9, 0x43, 0xdc, 0x60, 0xff, 0x89, 0x16, 0xaa, 0x35, 0xcf, 0x50, 0xec, 0x73, 0xa, 0x95, 0x29, 0xb6, 0x4c, 0xd3, 0x6f, 0xf0, 0x86, 0x19, 0xa5, 0x3a, 0xc0, 0x5f, 0xe3, 0x7c, 0xf, 0x90, 0x2c, 0xb3, 0x49, 0xd6, 0x6a, 0xf5, 0x83, 0x1c, 0xa0, 0x3f, 0xc5, 0x5a, 0xe6, 0x79, 0x14, 0x8b, 0x37, 0xa8, 0x52, 0xcd, 0x71, 0xee, 0x98, 0x7, 0xbb, 0x24, 0xde, 0x41, 0xfd, 0x62, 0x11, 0x8e, 0x32, 0xad, 0x57, 0xc8, 0x74, 0xeb, 0x9d, 0x2, 0xbe, 0x21, 0xdb, 0x44, 0xf8, 0x67, 0x1e, 0x81, 0x3d, 0xa2, 0x58, 0xc7, 0x7b, 0xe4, 0x92, 0xd, 0xb1, 0x2e, 0xd4, 0x4b, 0xf7, 0x68, 0x1b, 0x84, 0x38, 0xa7, 0x5d, 0xc2, 
0x7e, 0xe1, 0x97, 0x8, 0xb4, 0x2b, 0xd1, 0x4e, 0xf2, 0x6d, 0x28, 0xb7, 0xb, 0x94, 0x6e, 0xf1, 0x4d, 0xd2, 0xa4, 0x3b, 0x87, 0x18, 0xe2, 0x7d, 0xc1, 0x5e, 0x2d, 0xb2, 0xe, 0x91, 0x6b, 0xf4, 0x48, 0xd7, 0xa1, 0x3e, 0x82, 0x1d, 0xe7, 0x78, 0xc4, 0x5b, 0x22, 0xbd, 0x1, 0x9e, 0x64, 0xfb, 0x47, 0xd8, 0xae, 0x31, 0x8d, 0x12, 0xe8, 0x77, 0xcb, 0x54, 0x27, 0xb8, 0x4, 0x9b, 0x61, 0xfe, 0x42, 0xdd, 0xab, 0x34, 0x88, 0x17, 0xed, 0x72, 0xce, 0x51, 0x3c, 0xa3, 0x1f, 0x80, 0x7a, 0xe5, 0x59, 0xc6, 0xb0, 0x2f, 0x93, 0xc, 0xf6, 0x69, 0xd5, 0x4a, 0x39, 0xa6, 0x1a, 0x85, 0x7f, 0xe0, 0x5c, 0xc3, 0xb5, 0x2a, 0x96, 0x9, 0xf3, 0x6c, 0xd0, 0x4f, 0x36, 0xa9, 0x15, 0x8a, 0x70, 0xef, 0x53, 0xcc, 0xba, 0x25, 0x99, 0x6, 0xfc, 0x63, 0xdf, 0x40, 0x33, 0xac, 0x10, 0x8f, 0x75, 0xea, 0x56, 0xc9, 0xbf, 0x20, 0x9c, 0x3, 0xf9, 0x66, 0xda, 0x45}, {0x0, 0xa0, 0x5d, 0xfd, 0xba, 0x1a, 0xe7, 0x47, 0x69, 0xc9, 0x34, 0x94, 0xd3, 0x73, 0x8e, 0x2e, 0xd2, 0x72, 0x8f, 0x2f, 0x68, 0xc8, 0x35, 0x95, 0xbb, 0x1b, 0xe6, 0x46, 0x1, 0xa1, 0x5c, 0xfc, 0xb9, 0x19, 0xe4, 0x44, 0x3, 0xa3, 0x5e, 0xfe, 0xd0, 0x70, 0x8d, 0x2d, 0x6a, 0xca, 0x37, 0x97, 0x6b, 0xcb, 0x36, 0x96, 0xd1, 0x71, 0x8c, 0x2c, 0x2, 0xa2, 0x5f, 0xff, 0xb8, 0x18, 0xe5, 0x45, 0x6f, 0xcf, 0x32, 0x92, 0xd5, 0x75, 0x88, 0x28, 0x6, 0xa6, 0x5b, 0xfb, 0xbc, 0x1c, 0xe1, 0x41, 0xbd, 0x1d, 0xe0, 0x40, 0x7, 0xa7, 0x5a, 0xfa, 0xd4, 0x74, 0x89, 0x29, 0x6e, 0xce, 0x33, 0x93, 0xd6, 0x76, 0x8b, 0x2b, 0x6c, 0xcc, 0x31, 0x91, 0xbf, 0x1f, 0xe2, 0x42, 0x5, 0xa5, 0x58, 0xf8, 0x4, 0xa4, 0x59, 0xf9, 0xbe, 0x1e, 0xe3, 0x43, 0x6d, 0xcd, 0x30, 0x90, 0xd7, 0x77, 0x8a, 0x2a, 0xde, 0x7e, 0x83, 0x23, 0x64, 0xc4, 0x39, 0x99, 0xb7, 0x17, 0xea, 0x4a, 0xd, 0xad, 0x50, 0xf0, 0xc, 0xac, 0x51, 0xf1, 0xb6, 0x16, 0xeb, 0x4b, 0x65, 0xc5, 0x38, 0x98, 0xdf, 0x7f, 0x82, 0x22, 0x67, 0xc7, 0x3a, 0x9a, 0xdd, 0x7d, 0x80, 0x20, 0xe, 0xae, 0x53, 0xf3, 0xb4, 0x14, 0xe9, 0x49, 0xb5, 0x15, 0xe8, 0x48, 0xf, 0xaf, 0x52, 0xf2, 0xdc, 0x7c, 0x81, 0x21, 0x66, 0xc6, 0x3b, 0x9b, 0xb1, 0x11, 0xec, 0x4c, 0xb, 0xab, 0x56, 0xf6, 0xd8, 0x78, 0x85, 0x25, 0x62, 0xc2, 0x3f, 0x9f, 0x63, 0xc3, 0x3e, 0x9e, 0xd9, 0x79, 0x84, 0x24, 0xa, 0xaa, 0x57, 0xf7, 0xb0, 0x10, 0xed, 0x4d, 0x8, 0xa8, 0x55, 0xf5, 0xb2, 0x12, 0xef, 0x4f, 0x61, 0xc1, 0x3c, 0x9c, 0xdb, 0x7b, 0x86, 0x26, 0xda, 0x7a, 0x87, 0x27, 0x60, 0xc0, 0x3d, 0x9d, 0xb3, 0x13, 0xee, 0x4e, 0x9, 0xa9, 0x54, 0xf4}, {0x0, 0xa1, 0x5f, 0xfe, 0xbe, 0x1f, 0xe1, 0x40, 0x61, 0xc0, 0x3e, 0x9f, 0xdf, 0x7e, 0x80, 0x21, 0xc2, 0x63, 0x9d, 0x3c, 0x7c, 0xdd, 0x23, 0x82, 0xa3, 0x2, 0xfc, 0x5d, 0x1d, 0xbc, 0x42, 0xe3, 0x99, 0x38, 0xc6, 0x67, 0x27, 0x86, 0x78, 0xd9, 0xf8, 0x59, 0xa7, 0x6, 0x46, 0xe7, 0x19, 0xb8, 0x5b, 0xfa, 0x4, 0xa5, 0xe5, 0x44, 0xba, 0x1b, 0x3a, 0x9b, 0x65, 0xc4, 0x84, 0x25, 0xdb, 0x7a, 0x2f, 0x8e, 0x70, 0xd1, 0x91, 0x30, 0xce, 0x6f, 0x4e, 0xef, 0x11, 0xb0, 0xf0, 0x51, 0xaf, 0xe, 0xed, 0x4c, 0xb2, 0x13, 0x53, 0xf2, 0xc, 0xad, 0x8c, 0x2d, 0xd3, 0x72, 0x32, 0x93, 0x6d, 0xcc, 0xb6, 0x17, 0xe9, 0x48, 0x8, 0xa9, 0x57, 0xf6, 0xd7, 0x76, 0x88, 0x29, 0x69, 0xc8, 0x36, 0x97, 0x74, 0xd5, 0x2b, 0x8a, 0xca, 0x6b, 0x95, 0x34, 0x15, 0xb4, 0x4a, 0xeb, 0xab, 0xa, 0xf4, 0x55, 0x5e, 0xff, 0x1, 0xa0, 0xe0, 0x41, 0xbf, 0x1e, 0x3f, 0x9e, 0x60, 0xc1, 0x81, 0x20, 0xde, 0x7f, 0x9c, 0x3d, 0xc3, 0x62, 0x22, 0x83, 0x7d, 0xdc, 0xfd, 0x5c, 0xa2, 0x3, 0x43, 0xe2, 0x1c, 0xbd, 0xc7, 0x66, 0x98, 0x39, 0x79, 0xd8, 0x26, 0x87, 0xa6, 0x7, 0xf9, 0x58, 0x18, 0xb9, 0x47, 0xe6, 0x5, 0xa4, 0x5a, 0xfb, 0xbb, 0x1a, 0xe4, 0x45, 0x64, 0xc5, 0x3b, 0x9a, 0xda, 0x7b, 0x85, 0x24, 0x71, 0xd0, 0x2e, 0x8f, 0xcf, 0x6e, 0x90, 0x31, 0x10, 0xb1, 0x4f, 0xee, 
0xae, 0xf, 0xf1, 0x50, 0xb3, 0x12, 0xec, 0x4d, 0xd, 0xac, 0x52, 0xf3, 0xd2, 0x73, 0x8d, 0x2c, 0x6c, 0xcd, 0x33, 0x92, 0xe8, 0x49, 0xb7, 0x16, 0x56, 0xf7, 0x9, 0xa8, 0x89, 0x28, 0xd6, 0x77, 0x37, 0x96, 0x68, 0xc9, 0x2a, 0x8b, 0x75, 0xd4, 0x94, 0x35, 0xcb, 0x6a, 0x4b, 0xea, 0x14, 0xb5, 0xf5, 0x54, 0xaa, 0xb}, {0x0, 0xa2, 0x59, 0xfb, 0xb2, 0x10, 0xeb, 0x49, 0x79, 0xdb, 0x20, 0x82, 0xcb, 0x69, 0x92, 0x30, 0xf2, 0x50, 0xab, 0x9, 0x40, 0xe2, 0x19, 0xbb, 0x8b, 0x29, 0xd2, 0x70, 0x39, 0x9b, 0x60, 0xc2, 0xf9, 0x5b, 0xa0, 0x2, 0x4b, 0xe9, 0x12, 0xb0, 0x80, 0x22, 0xd9, 0x7b, 0x32, 0x90, 0x6b, 0xc9, 0xb, 0xa9, 0x52, 0xf0, 0xb9, 0x1b, 0xe0, 0x42, 0x72, 0xd0, 0x2b, 0x89, 0xc0, 0x62, 0x99, 0x3b, 0xef, 0x4d, 0xb6, 0x14, 0x5d, 0xff, 0x4, 0xa6, 0x96, 0x34, 0xcf, 0x6d, 0x24, 0x86, 0x7d, 0xdf, 0x1d, 0xbf, 0x44, 0xe6, 0xaf, 0xd, 0xf6, 0x54, 0x64, 0xc6, 0x3d, 0x9f, 0xd6, 0x74, 0x8f, 0x2d, 0x16, 0xb4, 0x4f, 0xed, 0xa4, 0x6, 0xfd, 0x5f, 0x6f, 0xcd, 0x36, 0x94, 0xdd, 0x7f, 0x84, 0x26, 0xe4, 0x46, 0xbd, 0x1f, 0x56, 0xf4, 0xf, 0xad, 0x9d, 0x3f, 0xc4, 0x66, 0x2f, 0x8d, 0x76, 0xd4, 0xc3, 0x61, 0x9a, 0x38, 0x71, 0xd3, 0x28, 0x8a, 0xba, 0x18, 0xe3, 0x41, 0x8, 0xaa, 0x51, 0xf3, 0x31, 0x93, 0x68, 0xca, 0x83, 0x21, 0xda, 0x78, 0x48, 0xea, 0x11, 0xb3, 0xfa, 0x58, 0xa3, 0x1, 0x3a, 0x98, 0x63, 0xc1, 0x88, 0x2a, 0xd1, 0x73, 0x43, 0xe1, 0x1a, 0xb8, 0xf1, 0x53, 0xa8, 0xa, 0xc8, 0x6a, 0x91, 0x33, 0x7a, 0xd8, 0x23, 0x81, 0xb1, 0x13, 0xe8, 0x4a, 0x3, 0xa1, 0x5a, 0xf8, 0x2c, 0x8e, 0x75, 0xd7, 0x9e, 0x3c, 0xc7, 0x65, 0x55, 0xf7, 0xc, 0xae, 0xe7, 0x45, 0xbe, 0x1c, 0xde, 0x7c, 0x87, 0x25, 0x6c, 0xce, 0x35, 0x97, 0xa7, 0x5, 0xfe, 0x5c, 0x15, 0xb7, 0x4c, 0xee, 0xd5, 0x77, 0x8c, 0x2e, 0x67, 0xc5, 0x3e, 0x9c, 0xac, 0xe, 0xf5, 0x57, 0x1e, 0xbc, 0x47, 0xe5, 0x27, 0x85, 0x7e, 0xdc, 0x95, 0x37, 0xcc, 0x6e, 0x5e, 0xfc, 0x7, 0xa5, 0xec, 0x4e, 0xb5, 0x17}, {0x0, 0xa3, 0x5b, 0xf8, 0xb6, 0x15, 0xed, 0x4e, 0x71, 0xd2, 0x2a, 0x89, 0xc7, 0x64, 0x9c, 0x3f, 0xe2, 0x41, 0xb9, 0x1a, 0x54, 0xf7, 0xf, 0xac, 0x93, 0x30, 0xc8, 0x6b, 0x25, 0x86, 0x7e, 0xdd, 0xd9, 0x7a, 0x82, 0x21, 0x6f, 0xcc, 0x34, 0x97, 0xa8, 0xb, 0xf3, 0x50, 0x1e, 0xbd, 0x45, 0xe6, 0x3b, 0x98, 0x60, 0xc3, 0x8d, 0x2e, 0xd6, 0x75, 0x4a, 0xe9, 0x11, 0xb2, 0xfc, 0x5f, 0xa7, 0x4, 0xaf, 0xc, 0xf4, 0x57, 0x19, 0xba, 0x42, 0xe1, 0xde, 0x7d, 0x85, 0x26, 0x68, 0xcb, 0x33, 0x90, 0x4d, 0xee, 0x16, 0xb5, 0xfb, 0x58, 0xa0, 0x3, 0x3c, 0x9f, 0x67, 0xc4, 0x8a, 0x29, 0xd1, 0x72, 0x76, 0xd5, 0x2d, 0x8e, 0xc0, 0x63, 0x9b, 0x38, 0x7, 0xa4, 0x5c, 0xff, 0xb1, 0x12, 0xea, 0x49, 0x94, 0x37, 0xcf, 0x6c, 0x22, 0x81, 0x79, 0xda, 0xe5, 0x46, 0xbe, 0x1d, 0x53, 0xf0, 0x8, 0xab, 0x43, 0xe0, 0x18, 0xbb, 0xf5, 0x56, 0xae, 0xd, 0x32, 0x91, 0x69, 0xca, 0x84, 0x27, 0xdf, 0x7c, 0xa1, 0x2, 0xfa, 0x59, 0x17, 0xb4, 0x4c, 0xef, 0xd0, 0x73, 0x8b, 0x28, 0x66, 0xc5, 0x3d, 0x9e, 0x9a, 0x39, 0xc1, 0x62, 0x2c, 0x8f, 0x77, 0xd4, 0xeb, 0x48, 0xb0, 0x13, 0x5d, 0xfe, 0x6, 0xa5, 0x78, 0xdb, 0x23, 0x80, 0xce, 0x6d, 0x95, 0x36, 0x9, 0xaa, 0x52, 0xf1, 0xbf, 0x1c, 0xe4, 0x47, 0xec, 0x4f, 0xb7, 0x14, 0x5a, 0xf9, 0x1, 0xa2, 0x9d, 0x3e, 0xc6, 0x65, 0x2b, 0x88, 0x70, 0xd3, 0xe, 0xad, 0x55, 0xf6, 0xb8, 0x1b, 0xe3, 0x40, 0x7f, 0xdc, 0x24, 0x87, 0xc9, 0x6a, 0x92, 0x31, 0x35, 0x96, 0x6e, 0xcd, 0x83, 0x20, 0xd8, 0x7b, 0x44, 0xe7, 0x1f, 0xbc, 0xf2, 0x51, 0xa9, 0xa, 0xd7, 0x74, 0x8c, 0x2f, 0x61, 0xc2, 0x3a, 0x99, 0xa6, 0x5, 0xfd, 0x5e, 0x10, 0xb3, 0x4b, 0xe8}, {0x0, 0xa4, 0x55, 0xf1, 0xaa, 0xe, 0xff, 0x5b, 0x49, 0xed, 0x1c, 0xb8, 0xe3, 0x47, 0xb6, 0x12, 0x92, 0x36, 0xc7, 0x63, 0x38, 0x9c, 0x6d, 0xc9, 0xdb, 0x7f, 0x8e, 0x2a, 0x71, 0xd5, 0x24, 0x80, 0x39, 
0x9d, 0x6c, 0xc8, 0x93, 0x37, 0xc6, 0x62, 0x70, 0xd4, 0x25, 0x81, 0xda, 0x7e, 0x8f, 0x2b, 0xab, 0xf, 0xfe, 0x5a, 0x1, 0xa5, 0x54, 0xf0, 0xe2, 0x46, 0xb7, 0x13, 0x48, 0xec, 0x1d, 0xb9, 0x72, 0xd6, 0x27, 0x83, 0xd8, 0x7c, 0x8d, 0x29, 0x3b, 0x9f, 0x6e, 0xca, 0x91, 0x35, 0xc4, 0x60, 0xe0, 0x44, 0xb5, 0x11, 0x4a, 0xee, 0x1f, 0xbb, 0xa9, 0xd, 0xfc, 0x58, 0x3, 0xa7, 0x56, 0xf2, 0x4b, 0xef, 0x1e, 0xba, 0xe1, 0x45, 0xb4, 0x10, 0x2, 0xa6, 0x57, 0xf3, 0xa8, 0xc, 0xfd, 0x59, 0xd9, 0x7d, 0x8c, 0x28, 0x73, 0xd7, 0x26, 0x82, 0x90, 0x34, 0xc5, 0x61, 0x3a, 0x9e, 0x6f, 0xcb, 0xe4, 0x40, 0xb1, 0x15, 0x4e, 0xea, 0x1b, 0xbf, 0xad, 0x9, 0xf8, 0x5c, 0x7, 0xa3, 0x52, 0xf6, 0x76, 0xd2, 0x23, 0x87, 0xdc, 0x78, 0x89, 0x2d, 0x3f, 0x9b, 0x6a, 0xce, 0x95, 0x31, 0xc0, 0x64, 0xdd, 0x79, 0x88, 0x2c, 0x77, 0xd3, 0x22, 0x86, 0x94, 0x30, 0xc1, 0x65, 0x3e, 0x9a, 0x6b, 0xcf, 0x4f, 0xeb, 0x1a, 0xbe, 0xe5, 0x41, 0xb0, 0x14, 0x6, 0xa2, 0x53, 0xf7, 0xac, 0x8, 0xf9, 0x5d, 0x96, 0x32, 0xc3, 0x67, 0x3c, 0x98, 0x69, 0xcd, 0xdf, 0x7b, 0x8a, 0x2e, 0x75, 0xd1, 0x20, 0x84, 0x4, 0xa0, 0x51, 0xf5, 0xae, 0xa, 0xfb, 0x5f, 0x4d, 0xe9, 0x18, 0xbc, 0xe7, 0x43, 0xb2, 0x16, 0xaf, 0xb, 0xfa, 0x5e, 0x5, 0xa1, 0x50, 0xf4, 0xe6, 0x42, 0xb3, 0x17, 0x4c, 0xe8, 0x19, 0xbd, 0x3d, 0x99, 0x68, 0xcc, 0x97, 0x33, 0xc2, 0x66, 0x74, 0xd0, 0x21, 0x85, 0xde, 0x7a, 0x8b, 0x2f}, {0x0, 0xa5, 0x57, 0xf2, 0xae, 0xb, 0xf9, 0x5c, 0x41, 0xe4, 0x16, 0xb3, 0xef, 0x4a, 0xb8, 0x1d, 0x82, 0x27, 0xd5, 0x70, 0x2c, 0x89, 0x7b, 0xde, 0xc3, 0x66, 0x94, 0x31, 0x6d, 0xc8, 0x3a, 0x9f, 0x19, 0xbc, 0x4e, 0xeb, 0xb7, 0x12, 0xe0, 0x45, 0x58, 0xfd, 0xf, 0xaa, 0xf6, 0x53, 0xa1, 0x4, 0x9b, 0x3e, 0xcc, 0x69, 0x35, 0x90, 0x62, 0xc7, 0xda, 0x7f, 0x8d, 0x28, 0x74, 0xd1, 0x23, 0x86, 0x32, 0x97, 0x65, 0xc0, 0x9c, 0x39, 0xcb, 0x6e, 0x73, 0xd6, 0x24, 0x81, 0xdd, 0x78, 0x8a, 0x2f, 0xb0, 0x15, 0xe7, 0x42, 0x1e, 0xbb, 0x49, 0xec, 0xf1, 0x54, 0xa6, 0x3, 0x5f, 0xfa, 0x8, 0xad, 0x2b, 0x8e, 0x7c, 0xd9, 0x85, 0x20, 0xd2, 0x77, 0x6a, 0xcf, 0x3d, 0x98, 0xc4, 0x61, 0x93, 0x36, 0xa9, 0xc, 0xfe, 0x5b, 0x7, 0xa2, 0x50, 0xf5, 0xe8, 0x4d, 0xbf, 0x1a, 0x46, 0xe3, 0x11, 0xb4, 0x64, 0xc1, 0x33, 0x96, 0xca, 0x6f, 0x9d, 0x38, 0x25, 0x80, 0x72, 0xd7, 0x8b, 0x2e, 0xdc, 0x79, 0xe6, 0x43, 0xb1, 0x14, 0x48, 0xed, 0x1f, 0xba, 0xa7, 0x2, 0xf0, 0x55, 0x9, 0xac, 0x5e, 0xfb, 0x7d, 0xd8, 0x2a, 0x8f, 0xd3, 0x76, 0x84, 0x21, 0x3c, 0x99, 0x6b, 0xce, 0x92, 0x37, 0xc5, 0x60, 0xff, 0x5a, 0xa8, 0xd, 0x51, 0xf4, 0x6, 0xa3, 0xbe, 0x1b, 0xe9, 0x4c, 0x10, 0xb5, 0x47, 0xe2, 0x56, 0xf3, 0x1, 0xa4, 0xf8, 0x5d, 0xaf, 0xa, 0x17, 0xb2, 0x40, 0xe5, 0xb9, 0x1c, 0xee, 0x4b, 0xd4, 0x71, 0x83, 0x26, 0x7a, 0xdf, 0x2d, 0x88, 0x95, 0x30, 0xc2, 0x67, 0x3b, 0x9e, 0x6c, 0xc9, 0x4f, 0xea, 0x18, 0xbd, 0xe1, 0x44, 0xb6, 0x13, 0xe, 0xab, 0x59, 0xfc, 0xa0, 0x5, 0xf7, 0x52, 0xcd, 0x68, 0x9a, 0x3f, 0x63, 0xc6, 0x34, 0x91, 0x8c, 0x29, 0xdb, 0x7e, 0x22, 0x87, 0x75, 0xd0}, {0x0, 0xa6, 0x51, 0xf7, 0xa2, 0x4, 0xf3, 0x55, 0x59, 0xff, 0x8, 0xae, 0xfb, 0x5d, 0xaa, 0xc, 0xb2, 0x14, 0xe3, 0x45, 0x10, 0xb6, 0x41, 0xe7, 0xeb, 0x4d, 0xba, 0x1c, 0x49, 0xef, 0x18, 0xbe, 0x79, 0xdf, 0x28, 0x8e, 0xdb, 0x7d, 0x8a, 0x2c, 0x20, 0x86, 0x71, 0xd7, 0x82, 0x24, 0xd3, 0x75, 0xcb, 0x6d, 0x9a, 0x3c, 0x69, 0xcf, 0x38, 0x9e, 0x92, 0x34, 0xc3, 0x65, 0x30, 0x96, 0x61, 0xc7, 0xf2, 0x54, 0xa3, 0x5, 0x50, 0xf6, 0x1, 0xa7, 0xab, 0xd, 0xfa, 0x5c, 0x9, 0xaf, 0x58, 0xfe, 0x40, 0xe6, 0x11, 0xb7, 0xe2, 0x44, 0xb3, 0x15, 0x19, 0xbf, 0x48, 0xee, 0xbb, 0x1d, 0xea, 0x4c, 0x8b, 0x2d, 0xda, 0x7c, 0x29, 0x8f, 0x78, 0xde, 0xd2, 0x74, 0x83, 0x25, 0x70, 0xd6, 0x21, 0x87, 0x39, 0x9f, 0x68, 0xce, 0x9b, 0x3d, 0xca, 
0x6c, 0x60, 0xc6, 0x31, 0x97, 0xc2, 0x64, 0x93, 0x35, 0xf9, 0x5f, 0xa8, 0xe, 0x5b, 0xfd, 0xa, 0xac, 0xa0, 0x6, 0xf1, 0x57, 0x2, 0xa4, 0x53, 0xf5, 0x4b, 0xed, 0x1a, 0xbc, 0xe9, 0x4f, 0xb8, 0x1e, 0x12, 0xb4, 0x43, 0xe5, 0xb0, 0x16, 0xe1, 0x47, 0x80, 0x26, 0xd1, 0x77, 0x22, 0x84, 0x73, 0xd5, 0xd9, 0x7f, 0x88, 0x2e, 0x7b, 0xdd, 0x2a, 0x8c, 0x32, 0x94, 0x63, 0xc5, 0x90, 0x36, 0xc1, 0x67, 0x6b, 0xcd, 0x3a, 0x9c, 0xc9, 0x6f, 0x98, 0x3e, 0xb, 0xad, 0x5a, 0xfc, 0xa9, 0xf, 0xf8, 0x5e, 0x52, 0xf4, 0x3, 0xa5, 0xf0, 0x56, 0xa1, 0x7, 0xb9, 0x1f, 0xe8, 0x4e, 0x1b, 0xbd, 0x4a, 0xec, 0xe0, 0x46, 0xb1, 0x17, 0x42, 0xe4, 0x13, 0xb5, 0x72, 0xd4, 0x23, 0x85, 0xd0, 0x76, 0x81, 0x27, 0x2b, 0x8d, 0x7a, 0xdc, 0x89, 0x2f, 0xd8, 0x7e, 0xc0, 0x66, 0x91, 0x37, 0x62, 0xc4, 0x33, 0x95, 0x99, 0x3f, 0xc8, 0x6e, 0x3b, 0x9d, 0x6a, 0xcc}, {0x0, 0xa7, 0x53, 0xf4, 0xa6, 0x1, 0xf5, 0x52, 0x51, 0xf6, 0x2, 0xa5, 0xf7, 0x50, 0xa4, 0x3, 0xa2, 0x5, 0xf1, 0x56, 0x4, 0xa3, 0x57, 0xf0, 0xf3, 0x54, 0xa0, 0x7, 0x55, 0xf2, 0x6, 0xa1, 0x59, 0xfe, 0xa, 0xad, 0xff, 0x58, 0xac, 0xb, 0x8, 0xaf, 0x5b, 0xfc, 0xae, 0x9, 0xfd, 0x5a, 0xfb, 0x5c, 0xa8, 0xf, 0x5d, 0xfa, 0xe, 0xa9, 0xaa, 0xd, 0xf9, 0x5e, 0xc, 0xab, 0x5f, 0xf8, 0xb2, 0x15, 0xe1, 0x46, 0x14, 0xb3, 0x47, 0xe0, 0xe3, 0x44, 0xb0, 0x17, 0x45, 0xe2, 0x16, 0xb1, 0x10, 0xb7, 0x43, 0xe4, 0xb6, 0x11, 0xe5, 0x42, 0x41, 0xe6, 0x12, 0xb5, 0xe7, 0x40, 0xb4, 0x13, 0xeb, 0x4c, 0xb8, 0x1f, 0x4d, 0xea, 0x1e, 0xb9, 0xba, 0x1d, 0xe9, 0x4e, 0x1c, 0xbb, 0x4f, 0xe8, 0x49, 0xee, 0x1a, 0xbd, 0xef, 0x48, 0xbc, 0x1b, 0x18, 0xbf, 0x4b, 0xec, 0xbe, 0x19, 0xed, 0x4a, 0x79, 0xde, 0x2a, 0x8d, 0xdf, 0x78, 0x8c, 0x2b, 0x28, 0x8f, 0x7b, 0xdc, 0x8e, 0x29, 0xdd, 0x7a, 0xdb, 0x7c, 0x88, 0x2f, 0x7d, 0xda, 0x2e, 0x89, 0x8a, 0x2d, 0xd9, 0x7e, 0x2c, 0x8b, 0x7f, 0xd8, 0x20, 0x87, 0x73, 0xd4, 0x86, 0x21, 0xd5, 0x72, 0x71, 0xd6, 0x22, 0x85, 0xd7, 0x70, 0x84, 0x23, 0x82, 0x25, 0xd1, 0x76, 0x24, 0x83, 0x77, 0xd0, 0xd3, 0x74, 0x80, 0x27, 0x75, 0xd2, 0x26, 0x81, 0xcb, 0x6c, 0x98, 0x3f, 0x6d, 0xca, 0x3e, 0x99, 0x9a, 0x3d, 0xc9, 0x6e, 0x3c, 0x9b, 0x6f, 0xc8, 0x69, 0xce, 0x3a, 0x9d, 0xcf, 0x68, 0x9c, 0x3b, 0x38, 0x9f, 0x6b, 0xcc, 0x9e, 0x39, 0xcd, 0x6a, 0x92, 0x35, 0xc1, 0x66, 0x34, 0x93, 0x67, 0xc0, 0xc3, 0x64, 0x90, 0x37, 0x65, 0xc2, 0x36, 0x91, 0x30, 0x97, 0x63, 0xc4, 0x96, 0x31, 0xc5, 0x62, 0x61, 0xc6, 0x32, 0x95, 0xc7, 0x60, 0x94, 0x33}, {0x0, 0xa8, 0x4d, 0xe5, 0x9a, 0x32, 0xd7, 0x7f, 0x29, 0x81, 0x64, 0xcc, 0xb3, 0x1b, 0xfe, 0x56, 0x52, 0xfa, 0x1f, 0xb7, 0xc8, 0x60, 0x85, 0x2d, 0x7b, 0xd3, 0x36, 0x9e, 0xe1, 0x49, 0xac, 0x4, 0xa4, 0xc, 0xe9, 0x41, 0x3e, 0x96, 0x73, 0xdb, 0x8d, 0x25, 0xc0, 0x68, 0x17, 0xbf, 0x5a, 0xf2, 0xf6, 0x5e, 0xbb, 0x13, 0x6c, 0xc4, 0x21, 0x89, 0xdf, 0x77, 0x92, 0x3a, 0x45, 0xed, 0x8, 0xa0, 0x55, 0xfd, 0x18, 0xb0, 0xcf, 0x67, 0x82, 0x2a, 0x7c, 0xd4, 0x31, 0x99, 0xe6, 0x4e, 0xab, 0x3, 0x7, 0xaf, 0x4a, 0xe2, 0x9d, 0x35, 0xd0, 0x78, 0x2e, 0x86, 0x63, 0xcb, 0xb4, 0x1c, 0xf9, 0x51, 0xf1, 0x59, 0xbc, 0x14, 0x6b, 0xc3, 0x26, 0x8e, 0xd8, 0x70, 0x95, 0x3d, 0x42, 0xea, 0xf, 0xa7, 0xa3, 0xb, 0xee, 0x46, 0x39, 0x91, 0x74, 0xdc, 0x8a, 0x22, 0xc7, 0x6f, 0x10, 0xb8, 0x5d, 0xf5, 0xaa, 0x2, 0xe7, 0x4f, 0x30, 0x98, 0x7d, 0xd5, 0x83, 0x2b, 0xce, 0x66, 0x19, 0xb1, 0x54, 0xfc, 0xf8, 0x50, 0xb5, 0x1d, 0x62, 0xca, 0x2f, 0x87, 0xd1, 0x79, 0x9c, 0x34, 0x4b, 0xe3, 0x6, 0xae, 0xe, 0xa6, 0x43, 0xeb, 0x94, 0x3c, 0xd9, 0x71, 0x27, 0x8f, 0x6a, 0xc2, 0xbd, 0x15, 0xf0, 0x58, 0x5c, 0xf4, 0x11, 0xb9, 0xc6, 0x6e, 0x8b, 0x23, 0x75, 0xdd, 0x38, 0x90, 0xef, 0x47, 0xa2, 0xa, 0xff, 0x57, 0xb2, 0x1a, 0x65, 0xcd, 0x28, 0x80, 0xd6, 0x7e, 0x9b, 0x33, 
0x4c, 0xe4, 0x1, 0xa9, 0xad, 0x5, 0xe0, 0x48, 0x37, 0x9f, 0x7a, 0xd2, 0x84, 0x2c, 0xc9, 0x61, 0x1e, 0xb6, 0x53, 0xfb, 0x5b, 0xf3, 0x16, 0xbe, 0xc1, 0x69, 0x8c, 0x24, 0x72, 0xda, 0x3f, 0x97, 0xe8, 0x40, 0xa5, 0xd, 0x9, 0xa1, 0x44, 0xec, 0x93, 0x3b, 0xde, 0x76, 0x20, 0x88, 0x6d, 0xc5, 0xba, 0x12, 0xf7, 0x5f}, {0x0, 0xa9, 0x4f, 0xe6, 0x9e, 0x37, 0xd1, 0x78, 0x21, 0x88, 0x6e, 0xc7, 0xbf, 0x16, 0xf0, 0x59, 0x42, 0xeb, 0xd, 0xa4, 0xdc, 0x75, 0x93, 0x3a, 0x63, 0xca, 0x2c, 0x85, 0xfd, 0x54, 0xb2, 0x1b, 0x84, 0x2d, 0xcb, 0x62, 0x1a, 0xb3, 0x55, 0xfc, 0xa5, 0xc, 0xea, 0x43, 0x3b, 0x92, 0x74, 0xdd, 0xc6, 0x6f, 0x89, 0x20, 0x58, 0xf1, 0x17, 0xbe, 0xe7, 0x4e, 0xa8, 0x1, 0x79, 0xd0, 0x36, 0x9f, 0x15, 0xbc, 0x5a, 0xf3, 0x8b, 0x22, 0xc4, 0x6d, 0x34, 0x9d, 0x7b, 0xd2, 0xaa, 0x3, 0xe5, 0x4c, 0x57, 0xfe, 0x18, 0xb1, 0xc9, 0x60, 0x86, 0x2f, 0x76, 0xdf, 0x39, 0x90, 0xe8, 0x41, 0xa7, 0xe, 0x91, 0x38, 0xde, 0x77, 0xf, 0xa6, 0x40, 0xe9, 0xb0, 0x19, 0xff, 0x56, 0x2e, 0x87, 0x61, 0xc8, 0xd3, 0x7a, 0x9c, 0x35, 0x4d, 0xe4, 0x2, 0xab, 0xf2, 0x5b, 0xbd, 0x14, 0x6c, 0xc5, 0x23, 0x8a, 0x2a, 0x83, 0x65, 0xcc, 0xb4, 0x1d, 0xfb, 0x52, 0xb, 0xa2, 0x44, 0xed, 0x95, 0x3c, 0xda, 0x73, 0x68, 0xc1, 0x27, 0x8e, 0xf6, 0x5f, 0xb9, 0x10, 0x49, 0xe0, 0x6, 0xaf, 0xd7, 0x7e, 0x98, 0x31, 0xae, 0x7, 0xe1, 0x48, 0x30, 0x99, 0x7f, 0xd6, 0x8f, 0x26, 0xc0, 0x69, 0x11, 0xb8, 0x5e, 0xf7, 0xec, 0x45, 0xa3, 0xa, 0x72, 0xdb, 0x3d, 0x94, 0xcd, 0x64, 0x82, 0x2b, 0x53, 0xfa, 0x1c, 0xb5, 0x3f, 0x96, 0x70, 0xd9, 0xa1, 0x8, 0xee, 0x47, 0x1e, 0xb7, 0x51, 0xf8, 0x80, 0x29, 0xcf, 0x66, 0x7d, 0xd4, 0x32, 0x9b, 0xe3, 0x4a, 0xac, 0x5, 0x5c, 0xf5, 0x13, 0xba, 0xc2, 0x6b, 0x8d, 0x24, 0xbb, 0x12, 0xf4, 0x5d, 0x25, 0x8c, 0x6a, 0xc3, 0x9a, 0x33, 0xd5, 0x7c, 0x4, 0xad, 0x4b, 0xe2, 0xf9, 0x50, 0xb6, 0x1f, 0x67, 0xce, 0x28, 0x81, 0xd8, 0x71, 0x97, 0x3e, 0x46, 0xef, 0x9, 0xa0}, {0x0, 0xaa, 0x49, 0xe3, 0x92, 0x38, 0xdb, 0x71, 0x39, 0x93, 0x70, 0xda, 0xab, 0x1, 0xe2, 0x48, 0x72, 0xd8, 0x3b, 0x91, 0xe0, 0x4a, 0xa9, 0x3, 0x4b, 0xe1, 0x2, 0xa8, 0xd9, 0x73, 0x90, 0x3a, 0xe4, 0x4e, 0xad, 0x7, 0x76, 0xdc, 0x3f, 0x95, 0xdd, 0x77, 0x94, 0x3e, 0x4f, 0xe5, 0x6, 0xac, 0x96, 0x3c, 0xdf, 0x75, 0x4, 0xae, 0x4d, 0xe7, 0xaf, 0x5, 0xe6, 0x4c, 0x3d, 0x97, 0x74, 0xde, 0xd5, 0x7f, 0x9c, 0x36, 0x47, 0xed, 0xe, 0xa4, 0xec, 0x46, 0xa5, 0xf, 0x7e, 0xd4, 0x37, 0x9d, 0xa7, 0xd, 0xee, 0x44, 0x35, 0x9f, 0x7c, 0xd6, 0x9e, 0x34, 0xd7, 0x7d, 0xc, 0xa6, 0x45, 0xef, 0x31, 0x9b, 0x78, 0xd2, 0xa3, 0x9, 0xea, 0x40, 0x8, 0xa2, 0x41, 0xeb, 0x9a, 0x30, 0xd3, 0x79, 0x43, 0xe9, 0xa, 0xa0, 0xd1, 0x7b, 0x98, 0x32, 0x7a, 0xd0, 0x33, 0x99, 0xe8, 0x42, 0xa1, 0xb, 0xb7, 0x1d, 0xfe, 0x54, 0x25, 0x8f, 0x6c, 0xc6, 0x8e, 0x24, 0xc7, 0x6d, 0x1c, 0xb6, 0x55, 0xff, 0xc5, 0x6f, 0x8c, 0x26, 0x57, 0xfd, 0x1e, 0xb4, 0xfc, 0x56, 0xb5, 0x1f, 0x6e, 0xc4, 0x27, 0x8d, 0x53, 0xf9, 0x1a, 0xb0, 0xc1, 0x6b, 0x88, 0x22, 0x6a, 0xc0, 0x23, 0x89, 0xf8, 0x52, 0xb1, 0x1b, 0x21, 0x8b, 0x68, 0xc2, 0xb3, 0x19, 0xfa, 0x50, 0x18, 0xb2, 0x51, 0xfb, 0x8a, 0x20, 0xc3, 0x69, 0x62, 0xc8, 0x2b, 0x81, 0xf0, 0x5a, 0xb9, 0x13, 0x5b, 0xf1, 0x12, 0xb8, 0xc9, 0x63, 0x80, 0x2a, 0x10, 0xba, 0x59, 0xf3, 0x82, 0x28, 0xcb, 0x61, 0x29, 0x83, 0x60, 0xca, 0xbb, 0x11, 0xf2, 0x58, 0x86, 0x2c, 0xcf, 0x65, 0x14, 0xbe, 0x5d, 0xf7, 0xbf, 0x15, 0xf6, 0x5c, 0x2d, 0x87, 0x64, 0xce, 0xf4, 0x5e, 0xbd, 0x17, 0x66, 0xcc, 0x2f, 0x85, 0xcd, 0x67, 0x84, 0x2e, 0x5f, 0xf5, 0x16, 0xbc}, {0x0, 0xab, 0x4b, 0xe0, 0x96, 0x3d, 0xdd, 0x76, 0x31, 0x9a, 0x7a, 0xd1, 0xa7, 0xc, 0xec, 0x47, 0x62, 0xc9, 0x29, 0x82, 0xf4, 0x5f, 0xbf, 0x14, 0x53, 0xf8, 0x18, 0xb3, 0xc5, 0x6e, 0x8e, 0x25, 0xc4, 
0x6f, 0x8f, 0x24, 0x52, 0xf9, 0x19, 0xb2, 0xf5, 0x5e, 0xbe, 0x15, 0x63, 0xc8, 0x28, 0x83, 0xa6, 0xd, 0xed, 0x46, 0x30, 0x9b, 0x7b, 0xd0, 0x97, 0x3c, 0xdc, 0x77, 0x1, 0xaa, 0x4a, 0xe1, 0x95, 0x3e, 0xde, 0x75, 0x3, 0xa8, 0x48, 0xe3, 0xa4, 0xf, 0xef, 0x44, 0x32, 0x99, 0x79, 0xd2, 0xf7, 0x5c, 0xbc, 0x17, 0x61, 0xca, 0x2a, 0x81, 0xc6, 0x6d, 0x8d, 0x26, 0x50, 0xfb, 0x1b, 0xb0, 0x51, 0xfa, 0x1a, 0xb1, 0xc7, 0x6c, 0x8c, 0x27, 0x60, 0xcb, 0x2b, 0x80, 0xf6, 0x5d, 0xbd, 0x16, 0x33, 0x98, 0x78, 0xd3, 0xa5, 0xe, 0xee, 0x45, 0x2, 0xa9, 0x49, 0xe2, 0x94, 0x3f, 0xdf, 0x74, 0x37, 0x9c, 0x7c, 0xd7, 0xa1, 0xa, 0xea, 0x41, 0x6, 0xad, 0x4d, 0xe6, 0x90, 0x3b, 0xdb, 0x70, 0x55, 0xfe, 0x1e, 0xb5, 0xc3, 0x68, 0x88, 0x23, 0x64, 0xcf, 0x2f, 0x84, 0xf2, 0x59, 0xb9, 0x12, 0xf3, 0x58, 0xb8, 0x13, 0x65, 0xce, 0x2e, 0x85, 0xc2, 0x69, 0x89, 0x22, 0x54, 0xff, 0x1f, 0xb4, 0x91, 0x3a, 0xda, 0x71, 0x7, 0xac, 0x4c, 0xe7, 0xa0, 0xb, 0xeb, 0x40, 0x36, 0x9d, 0x7d, 0xd6, 0xa2, 0x9, 0xe9, 0x42, 0x34, 0x9f, 0x7f, 0xd4, 0x93, 0x38, 0xd8, 0x73, 0x5, 0xae, 0x4e, 0xe5, 0xc0, 0x6b, 0x8b, 0x20, 0x56, 0xfd, 0x1d, 0xb6, 0xf1, 0x5a, 0xba, 0x11, 0x67, 0xcc, 0x2c, 0x87, 0x66, 0xcd, 0x2d, 0x86, 0xf0, 0x5b, 0xbb, 0x10, 0x57, 0xfc, 0x1c, 0xb7, 0xc1, 0x6a, 0x8a, 0x21, 0x4, 0xaf, 0x4f, 0xe4, 0x92, 0x39, 0xd9, 0x72, 0x35, 0x9e, 0x7e, 0xd5, 0xa3, 0x8, 0xe8, 0x43}, {0x0, 0xac, 0x45, 0xe9, 0x8a, 0x26, 0xcf, 0x63, 0x9, 0xa5, 0x4c, 0xe0, 0x83, 0x2f, 0xc6, 0x6a, 0x12, 0xbe, 0x57, 0xfb, 0x98, 0x34, 0xdd, 0x71, 0x1b, 0xb7, 0x5e, 0xf2, 0x91, 0x3d, 0xd4, 0x78, 0x24, 0x88, 0x61, 0xcd, 0xae, 0x2, 0xeb, 0x47, 0x2d, 0x81, 0x68, 0xc4, 0xa7, 0xb, 0xe2, 0x4e, 0x36, 0x9a, 0x73, 0xdf, 0xbc, 0x10, 0xf9, 0x55, 0x3f, 0x93, 0x7a, 0xd6, 0xb5, 0x19, 0xf0, 0x5c, 0x48, 0xe4, 0xd, 0xa1, 0xc2, 0x6e, 0x87, 0x2b, 0x41, 0xed, 0x4, 0xa8, 0xcb, 0x67, 0x8e, 0x22, 0x5a, 0xf6, 0x1f, 0xb3, 0xd0, 0x7c, 0x95, 0x39, 0x53, 0xff, 0x16, 0xba, 0xd9, 0x75, 0x9c, 0x30, 0x6c, 0xc0, 0x29, 0x85, 0xe6, 0x4a, 0xa3, 0xf, 0x65, 0xc9, 0x20, 0x8c, 0xef, 0x43, 0xaa, 0x6, 0x7e, 0xd2, 0x3b, 0x97, 0xf4, 0x58, 0xb1, 0x1d, 0x77, 0xdb, 0x32, 0x9e, 0xfd, 0x51, 0xb8, 0x14, 0x90, 0x3c, 0xd5, 0x79, 0x1a, 0xb6, 0x5f, 0xf3, 0x99, 0x35, 0xdc, 0x70, 0x13, 0xbf, 0x56, 0xfa, 0x82, 0x2e, 0xc7, 0x6b, 0x8, 0xa4, 0x4d, 0xe1, 0x8b, 0x27, 0xce, 0x62, 0x1, 0xad, 0x44, 0xe8, 0xb4, 0x18, 0xf1, 0x5d, 0x3e, 0x92, 0x7b, 0xd7, 0xbd, 0x11, 0xf8, 0x54, 0x37, 0x9b, 0x72, 0xde, 0xa6, 0xa, 0xe3, 0x4f, 0x2c, 0x80, 0x69, 0xc5, 0xaf, 0x3, 0xea, 0x46, 0x25, 0x89, 0x60, 0xcc, 0xd8, 0x74, 0x9d, 0x31, 0x52, 0xfe, 0x17, 0xbb, 0xd1, 0x7d, 0x94, 0x38, 0x5b, 0xf7, 0x1e, 0xb2, 0xca, 0x66, 0x8f, 0x23, 0x40, 0xec, 0x5, 0xa9, 0xc3, 0x6f, 0x86, 0x2a, 0x49, 0xe5, 0xc, 0xa0, 0xfc, 0x50, 0xb9, 0x15, 0x76, 0xda, 0x33, 0x9f, 0xf5, 0x59, 0xb0, 0x1c, 0x7f, 0xd3, 0x3a, 0x96, 0xee, 0x42, 0xab, 0x7, 0x64, 0xc8, 0x21, 0x8d, 0xe7, 0x4b, 0xa2, 0xe, 0x6d, 0xc1, 0x28, 0x84}, {0x0, 0xad, 0x47, 0xea, 0x8e, 0x23, 0xc9, 0x64, 0x1, 0xac, 0x46, 0xeb, 0x8f, 0x22, 0xc8, 0x65, 0x2, 0xaf, 0x45, 0xe8, 0x8c, 0x21, 0xcb, 0x66, 0x3, 0xae, 0x44, 0xe9, 0x8d, 0x20, 0xca, 0x67, 0x4, 0xa9, 0x43, 0xee, 0x8a, 0x27, 0xcd, 0x60, 0x5, 0xa8, 0x42, 0xef, 0x8b, 0x26, 0xcc, 0x61, 0x6, 0xab, 0x41, 0xec, 0x88, 0x25, 0xcf, 0x62, 0x7, 0xaa, 0x40, 0xed, 0x89, 0x24, 0xce, 0x63, 0x8, 0xa5, 0x4f, 0xe2, 0x86, 0x2b, 0xc1, 0x6c, 0x9, 0xa4, 0x4e, 0xe3, 0x87, 0x2a, 0xc0, 0x6d, 0xa, 0xa7, 0x4d, 0xe0, 0x84, 0x29, 0xc3, 0x6e, 0xb, 0xa6, 0x4c, 0xe1, 0x85, 0x28, 0xc2, 0x6f, 0xc, 0xa1, 0x4b, 0xe6, 0x82, 0x2f, 0xc5, 0x68, 0xd, 0xa0, 0x4a, 0xe7, 0x83, 0x2e, 0xc4, 0x69, 0xe, 0xa3, 0x49, 0xe4, 0x80, 0x2d, 0xc7, 0x6a, 
0xf, 0xa2, 0x48, 0xe5, 0x81, 0x2c, 0xc6, 0x6b, 0x10, 0xbd, 0x57, 0xfa, 0x9e, 0x33, 0xd9, 0x74, 0x11, 0xbc, 0x56, 0xfb, 0x9f, 0x32, 0xd8, 0x75, 0x12, 0xbf, 0x55, 0xf8, 0x9c, 0x31, 0xdb, 0x76, 0x13, 0xbe, 0x54, 0xf9, 0x9d, 0x30, 0xda, 0x77, 0x14, 0xb9, 0x53, 0xfe, 0x9a, 0x37, 0xdd, 0x70, 0x15, 0xb8, 0x52, 0xff, 0x9b, 0x36, 0xdc, 0x71, 0x16, 0xbb, 0x51, 0xfc, 0x98, 0x35, 0xdf, 0x72, 0x17, 0xba, 0x50, 0xfd, 0x99, 0x34, 0xde, 0x73, 0x18, 0xb5, 0x5f, 0xf2, 0x96, 0x3b, 0xd1, 0x7c, 0x19, 0xb4, 0x5e, 0xf3, 0x97, 0x3a, 0xd0, 0x7d, 0x1a, 0xb7, 0x5d, 0xf0, 0x94, 0x39, 0xd3, 0x7e, 0x1b, 0xb6, 0x5c, 0xf1, 0x95, 0x38, 0xd2, 0x7f, 0x1c, 0xb1, 0x5b, 0xf6, 0x92, 0x3f, 0xd5, 0x78, 0x1d, 0xb0, 0x5a, 0xf7, 0x93, 0x3e, 0xd4, 0x79, 0x1e, 0xb3, 0x59, 0xf4, 0x90, 0x3d, 0xd7, 0x7a, 0x1f, 0xb2, 0x58, 0xf5, 0x91, 0x3c, 0xd6, 0x7b}, {0x0, 0xae, 0x41, 0xef, 0x82, 0x2c, 0xc3, 0x6d, 0x19, 0xb7, 0x58, 0xf6, 0x9b, 0x35, 0xda, 0x74, 0x32, 0x9c, 0x73, 0xdd, 0xb0, 0x1e, 0xf1, 0x5f, 0x2b, 0x85, 0x6a, 0xc4, 0xa9, 0x7, 0xe8, 0x46, 0x64, 0xca, 0x25, 0x8b, 0xe6, 0x48, 0xa7, 0x9, 0x7d, 0xd3, 0x3c, 0x92, 0xff, 0x51, 0xbe, 0x10, 0x56, 0xf8, 0x17, 0xb9, 0xd4, 0x7a, 0x95, 0x3b, 0x4f, 0xe1, 0xe, 0xa0, 0xcd, 0x63, 0x8c, 0x22, 0xc8, 0x66, 0x89, 0x27, 0x4a, 0xe4, 0xb, 0xa5, 0xd1, 0x7f, 0x90, 0x3e, 0x53, 0xfd, 0x12, 0xbc, 0xfa, 0x54, 0xbb, 0x15, 0x78, 0xd6, 0x39, 0x97, 0xe3, 0x4d, 0xa2, 0xc, 0x61, 0xcf, 0x20, 0x8e, 0xac, 0x2, 0xed, 0x43, 0x2e, 0x80, 0x6f, 0xc1, 0xb5, 0x1b, 0xf4, 0x5a, 0x37, 0x99, 0x76, 0xd8, 0x9e, 0x30, 0xdf, 0x71, 0x1c, 0xb2, 0x5d, 0xf3, 0x87, 0x29, 0xc6, 0x68, 0x5, 0xab, 0x44, 0xea, 0x8d, 0x23, 0xcc, 0x62, 0xf, 0xa1, 0x4e, 0xe0, 0x94, 0x3a, 0xd5, 0x7b, 0x16, 0xb8, 0x57, 0xf9, 0xbf, 0x11, 0xfe, 0x50, 0x3d, 0x93, 0x7c, 0xd2, 0xa6, 0x8, 0xe7, 0x49, 0x24, 0x8a, 0x65, 0xcb, 0xe9, 0x47, 0xa8, 0x6, 0x6b, 0xc5, 0x2a, 0x84, 0xf0, 0x5e, 0xb1, 0x1f, 0x72, 0xdc, 0x33, 0x9d, 0xdb, 0x75, 0x9a, 0x34, 0x59, 0xf7, 0x18, 0xb6, 0xc2, 0x6c, 0x83, 0x2d, 0x40, 0xee, 0x1, 0xaf, 0x45, 0xeb, 0x4, 0xaa, 0xc7, 0x69, 0x86, 0x28, 0x5c, 0xf2, 0x1d, 0xb3, 0xde, 0x70, 0x9f, 0x31, 0x77, 0xd9, 0x36, 0x98, 0xf5, 0x5b, 0xb4, 0x1a, 0x6e, 0xc0, 0x2f, 0x81, 0xec, 0x42, 0xad, 0x3, 0x21, 0x8f, 0x60, 0xce, 0xa3, 0xd, 0xe2, 0x4c, 0x38, 0x96, 0x79, 0xd7, 0xba, 0x14, 0xfb, 0x55, 0x13, 0xbd, 0x52, 0xfc, 0x91, 0x3f, 0xd0, 0x7e, 0xa, 0xa4, 0x4b, 0xe5, 0x88, 0x26, 0xc9, 0x67}, {0x0, 0xaf, 0x43, 0xec, 0x86, 0x29, 0xc5, 0x6a, 0x11, 0xbe, 0x52, 0xfd, 0x97, 0x38, 0xd4, 0x7b, 0x22, 0x8d, 0x61, 0xce, 0xa4, 0xb, 0xe7, 0x48, 0x33, 0x9c, 0x70, 0xdf, 0xb5, 0x1a, 0xf6, 0x59, 0x44, 0xeb, 0x7, 0xa8, 0xc2, 0x6d, 0x81, 0x2e, 0x55, 0xfa, 0x16, 0xb9, 0xd3, 0x7c, 0x90, 0x3f, 0x66, 0xc9, 0x25, 0x8a, 0xe0, 0x4f, 0xa3, 0xc, 0x77, 0xd8, 0x34, 0x9b, 0xf1, 0x5e, 0xb2, 0x1d, 0x88, 0x27, 0xcb, 0x64, 0xe, 0xa1, 0x4d, 0xe2, 0x99, 0x36, 0xda, 0x75, 0x1f, 0xb0, 0x5c, 0xf3, 0xaa, 0x5, 0xe9, 0x46, 0x2c, 0x83, 0x6f, 0xc0, 0xbb, 0x14, 0xf8, 0x57, 0x3d, 0x92, 0x7e, 0xd1, 0xcc, 0x63, 0x8f, 0x20, 0x4a, 0xe5, 0x9, 0xa6, 0xdd, 0x72, 0x9e, 0x31, 0x5b, 0xf4, 0x18, 0xb7, 0xee, 0x41, 0xad, 0x2, 0x68, 0xc7, 0x2b, 0x84, 0xff, 0x50, 0xbc, 0x13, 0x79, 0xd6, 0x3a, 0x95, 0xd, 0xa2, 0x4e, 0xe1, 0x8b, 0x24, 0xc8, 0x67, 0x1c, 0xb3, 0x5f, 0xf0, 0x9a, 0x35, 0xd9, 0x76, 0x2f, 0x80, 0x6c, 0xc3, 0xa9, 0x6, 0xea, 0x45, 0x3e, 0x91, 0x7d, 0xd2, 0xb8, 0x17, 0xfb, 0x54, 0x49, 0xe6, 0xa, 0xa5, 0xcf, 0x60, 0x8c, 0x23, 0x58, 0xf7, 0x1b, 0xb4, 0xde, 0x71, 0x9d, 0x32, 0x6b, 0xc4, 0x28, 0x87, 0xed, 0x42, 0xae, 0x1, 0x7a, 0xd5, 0x39, 0x96, 0xfc, 0x53, 0xbf, 0x10, 0x85, 0x2a, 0xc6, 0x69, 0x3, 0xac, 0x40, 0xef, 0x94, 0x3b, 0xd7, 0x78, 
0x12, 0xbd, 0x51, 0xfe, 0xa7, 0x8, 0xe4, 0x4b, 0x21, 0x8e, 0x62, 0xcd, 0xb6, 0x19, 0xf5, 0x5a, 0x30, 0x9f, 0x73, 0xdc, 0xc1, 0x6e, 0x82, 0x2d, 0x47, 0xe8, 0x4, 0xab, 0xd0, 0x7f, 0x93, 0x3c, 0x56, 0xf9, 0x15, 0xba, 0xe3, 0x4c, 0xa0, 0xf, 0x65, 0xca, 0x26, 0x89, 0xf2, 0x5d, 0xb1, 0x1e, 0x74, 0xdb, 0x37, 0x98}, {0x0, 0xb0, 0x7d, 0xcd, 0xfa, 0x4a, 0x87, 0x37, 0xe9, 0x59, 0x94, 0x24, 0x13, 0xa3, 0x6e, 0xde, 0xcf, 0x7f, 0xb2, 0x2, 0x35, 0x85, 0x48, 0xf8, 0x26, 0x96, 0x5b, 0xeb, 0xdc, 0x6c, 0xa1, 0x11, 0x83, 0x33, 0xfe, 0x4e, 0x79, 0xc9, 0x4, 0xb4, 0x6a, 0xda, 0x17, 0xa7, 0x90, 0x20, 0xed, 0x5d, 0x4c, 0xfc, 0x31, 0x81, 0xb6, 0x6, 0xcb, 0x7b, 0xa5, 0x15, 0xd8, 0x68, 0x5f, 0xef, 0x22, 0x92, 0x1b, 0xab, 0x66, 0xd6, 0xe1, 0x51, 0x9c, 0x2c, 0xf2, 0x42, 0x8f, 0x3f, 0x8, 0xb8, 0x75, 0xc5, 0xd4, 0x64, 0xa9, 0x19, 0x2e, 0x9e, 0x53, 0xe3, 0x3d, 0x8d, 0x40, 0xf0, 0xc7, 0x77, 0xba, 0xa, 0x98, 0x28, 0xe5, 0x55, 0x62, 0xd2, 0x1f, 0xaf, 0x71, 0xc1, 0xc, 0xbc, 0x8b, 0x3b, 0xf6, 0x46, 0x57, 0xe7, 0x2a, 0x9a, 0xad, 0x1d, 0xd0, 0x60, 0xbe, 0xe, 0xc3, 0x73, 0x44, 0xf4, 0x39, 0x89, 0x36, 0x86, 0x4b, 0xfb, 0xcc, 0x7c, 0xb1, 0x1, 0xdf, 0x6f, 0xa2, 0x12, 0x25, 0x95, 0x58, 0xe8, 0xf9, 0x49, 0x84, 0x34, 0x3, 0xb3, 0x7e, 0xce, 0x10, 0xa0, 0x6d, 0xdd, 0xea, 0x5a, 0x97, 0x27, 0xb5, 0x5, 0xc8, 0x78, 0x4f, 0xff, 0x32, 0x82, 0x5c, 0xec, 0x21, 0x91, 0xa6, 0x16, 0xdb, 0x6b, 0x7a, 0xca, 0x7, 0xb7, 0x80, 0x30, 0xfd, 0x4d, 0x93, 0x23, 0xee, 0x5e, 0x69, 0xd9, 0x14, 0xa4, 0x2d, 0x9d, 0x50, 0xe0, 0xd7, 0x67, 0xaa, 0x1a, 0xc4, 0x74, 0xb9, 0x9, 0x3e, 0x8e, 0x43, 0xf3, 0xe2, 0x52, 0x9f, 0x2f, 0x18, 0xa8, 0x65, 0xd5, 0xb, 0xbb, 0x76, 0xc6, 0xf1, 0x41, 0x8c, 0x3c, 0xae, 0x1e, 0xd3, 0x63, 0x54, 0xe4, 0x29, 0x99, 0x47, 0xf7, 0x3a, 0x8a, 0xbd, 0xd, 0xc0, 0x70, 0x61, 0xd1, 0x1c, 0xac, 0x9b, 0x2b, 0xe6, 0x56, 0x88, 0x38, 0xf5, 0x45, 0x72, 0xc2, 0xf, 0xbf}, {0x0, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x30, 0xe1, 0x50, 0x9e, 0x2f, 0x1f, 0xae, 0x60, 0xd1, 0xdf, 0x6e, 0xa0, 0x11, 0x21, 0x90, 0x5e, 0xef, 0x3e, 0x8f, 0x41, 0xf0, 0xc0, 0x71, 0xbf, 0xe, 0xa3, 0x12, 0xdc, 0x6d, 0x5d, 0xec, 0x22, 0x93, 0x42, 0xf3, 0x3d, 0x8c, 0xbc, 0xd, 0xc3, 0x72, 0x7c, 0xcd, 0x3, 0xb2, 0x82, 0x33, 0xfd, 0x4c, 0x9d, 0x2c, 0xe2, 0x53, 0x63, 0xd2, 0x1c, 0xad, 0x5b, 0xea, 0x24, 0x95, 0xa5, 0x14, 0xda, 0x6b, 0xba, 0xb, 0xc5, 0x74, 0x44, 0xf5, 0x3b, 0x8a, 0x84, 0x35, 0xfb, 0x4a, 0x7a, 0xcb, 0x5, 0xb4, 0x65, 0xd4, 0x1a, 0xab, 0x9b, 0x2a, 0xe4, 0x55, 0xf8, 0x49, 0x87, 0x36, 0x6, 0xb7, 0x79, 0xc8, 0x19, 0xa8, 0x66, 0xd7, 0xe7, 0x56, 0x98, 0x29, 0x27, 0x96, 0x58, 0xe9, 0xd9, 0x68, 0xa6, 0x17, 0xc6, 0x77, 0xb9, 0x8, 0x38, 0x89, 0x47, 0xf6, 0xb6, 0x7, 0xc9, 0x78, 0x48, 0xf9, 0x37, 0x86, 0x57, 0xe6, 0x28, 0x99, 0xa9, 0x18, 0xd6, 0x67, 0x69, 0xd8, 0x16, 0xa7, 0x97, 0x26, 0xe8, 0x59, 0x88, 0x39, 0xf7, 0x46, 0x76, 0xc7, 0x9, 0xb8, 0x15, 0xa4, 0x6a, 0xdb, 0xeb, 0x5a, 0x94, 0x25, 0xf4, 0x45, 0x8b, 0x3a, 0xa, 0xbb, 0x75, 0xc4, 0xca, 0x7b, 0xb5, 0x4, 0x34, 0x85, 0x4b, 0xfa, 0x2b, 0x9a, 0x54, 0xe5, 0xd5, 0x64, 0xaa, 0x1b, 0xed, 0x5c, 0x92, 0x23, 0x13, 0xa2, 0x6c, 0xdd, 0xc, 0xbd, 0x73, 0xc2, 0xf2, 0x43, 0x8d, 0x3c, 0x32, 0x83, 0x4d, 0xfc, 0xcc, 0x7d, 0xb3, 0x2, 0xd3, 0x62, 0xac, 0x1d, 0x2d, 0x9c, 0x52, 0xe3, 0x4e, 0xff, 0x31, 0x80, 0xb0, 0x1, 0xcf, 0x7e, 0xaf, 0x1e, 0xd0, 0x61, 0x51, 0xe0, 0x2e, 0x9f, 0x91, 0x20, 0xee, 0x5f, 0x6f, 0xde, 0x10, 0xa1, 0x70, 0xc1, 0xf, 0xbe, 0x8e, 0x3f, 0xf1, 0x40}, {0x0, 0xb2, 0x79, 0xcb, 0xf2, 0x40, 0x8b, 0x39, 0xf9, 0x4b, 0x80, 0x32, 0xb, 0xb9, 0x72, 0xc0, 0xef, 0x5d, 0x96, 0x24, 0x1d, 0xaf, 0x64, 0xd6, 0x16, 0xa4, 0x6f, 0xdd, 0xe4, 0x56, 0x9d, 0x2f, 0xc3, 
0x71, 0xba, 0x8, 0x31, 0x83, 0x48, 0xfa, 0x3a, 0x88, 0x43, 0xf1, 0xc8, 0x7a, 0xb1, 0x3, 0x2c, 0x9e, 0x55, 0xe7, 0xde, 0x6c, 0xa7, 0x15, 0xd5, 0x67, 0xac, 0x1e, 0x27, 0x95, 0x5e, 0xec, 0x9b, 0x29, 0xe2, 0x50, 0x69, 0xdb, 0x10, 0xa2, 0x62, 0xd0, 0x1b, 0xa9, 0x90, 0x22, 0xe9, 0x5b, 0x74, 0xc6, 0xd, 0xbf, 0x86, 0x34, 0xff, 0x4d, 0x8d, 0x3f, 0xf4, 0x46, 0x7f, 0xcd, 0x6, 0xb4, 0x58, 0xea, 0x21, 0x93, 0xaa, 0x18, 0xd3, 0x61, 0xa1, 0x13, 0xd8, 0x6a, 0x53, 0xe1, 0x2a, 0x98, 0xb7, 0x5, 0xce, 0x7c, 0x45, 0xf7, 0x3c, 0x8e, 0x4e, 0xfc, 0x37, 0x85, 0xbc, 0xe, 0xc5, 0x77, 0x2b, 0x99, 0x52, 0xe0, 0xd9, 0x6b, 0xa0, 0x12, 0xd2, 0x60, 0xab, 0x19, 0x20, 0x92, 0x59, 0xeb, 0xc4, 0x76, 0xbd, 0xf, 0x36, 0x84, 0x4f, 0xfd, 0x3d, 0x8f, 0x44, 0xf6, 0xcf, 0x7d, 0xb6, 0x4, 0xe8, 0x5a, 0x91, 0x23, 0x1a, 0xa8, 0x63, 0xd1, 0x11, 0xa3, 0x68, 0xda, 0xe3, 0x51, 0x9a, 0x28, 0x7, 0xb5, 0x7e, 0xcc, 0xf5, 0x47, 0x8c, 0x3e, 0xfe, 0x4c, 0x87, 0x35, 0xc, 0xbe, 0x75, 0xc7, 0xb0, 0x2, 0xc9, 0x7b, 0x42, 0xf0, 0x3b, 0x89, 0x49, 0xfb, 0x30, 0x82, 0xbb, 0x9, 0xc2, 0x70, 0x5f, 0xed, 0x26, 0x94, 0xad, 0x1f, 0xd4, 0x66, 0xa6, 0x14, 0xdf, 0x6d, 0x54, 0xe6, 0x2d, 0x9f, 0x73, 0xc1, 0xa, 0xb8, 0x81, 0x33, 0xf8, 0x4a, 0x8a, 0x38, 0xf3, 0x41, 0x78, 0xca, 0x1, 0xb3, 0x9c, 0x2e, 0xe5, 0x57, 0x6e, 0xdc, 0x17, 0xa5, 0x65, 0xd7, 0x1c, 0xae, 0x97, 0x25, 0xee, 0x5c}, {0x0, 0xb3, 0x7b, 0xc8, 0xf6, 0x45, 0x8d, 0x3e, 0xf1, 0x42, 0x8a, 0x39, 0x7, 0xb4, 0x7c, 0xcf, 0xff, 0x4c, 0x84, 0x37, 0x9, 0xba, 0x72, 0xc1, 0xe, 0xbd, 0x75, 0xc6, 0xf8, 0x4b, 0x83, 0x30, 0xe3, 0x50, 0x98, 0x2b, 0x15, 0xa6, 0x6e, 0xdd, 0x12, 0xa1, 0x69, 0xda, 0xe4, 0x57, 0x9f, 0x2c, 0x1c, 0xaf, 0x67, 0xd4, 0xea, 0x59, 0x91, 0x22, 0xed, 0x5e, 0x96, 0x25, 0x1b, 0xa8, 0x60, 0xd3, 0xdb, 0x68, 0xa0, 0x13, 0x2d, 0x9e, 0x56, 0xe5, 0x2a, 0x99, 0x51, 0xe2, 0xdc, 0x6f, 0xa7, 0x14, 0x24, 0x97, 0x5f, 0xec, 0xd2, 0x61, 0xa9, 0x1a, 0xd5, 0x66, 0xae, 0x1d, 0x23, 0x90, 0x58, 0xeb, 0x38, 0x8b, 0x43, 0xf0, 0xce, 0x7d, 0xb5, 0x6, 0xc9, 0x7a, 0xb2, 0x1, 0x3f, 0x8c, 0x44, 0xf7, 0xc7, 0x74, 0xbc, 0xf, 0x31, 0x82, 0x4a, 0xf9, 0x36, 0x85, 0x4d, 0xfe, 0xc0, 0x73, 0xbb, 0x8, 0xab, 0x18, 0xd0, 0x63, 0x5d, 0xee, 0x26, 0x95, 0x5a, 0xe9, 0x21, 0x92, 0xac, 0x1f, 0xd7, 0x64, 0x54, 0xe7, 0x2f, 0x9c, 0xa2, 0x11, 0xd9, 0x6a, 0xa5, 0x16, 0xde, 0x6d, 0x53, 0xe0, 0x28, 0x9b, 0x48, 0xfb, 0x33, 0x80, 0xbe, 0xd, 0xc5, 0x76, 0xb9, 0xa, 0xc2, 0x71, 0x4f, 0xfc, 0x34, 0x87, 0xb7, 0x4, 0xcc, 0x7f, 0x41, 0xf2, 0x3a, 0x89, 0x46, 0xf5, 0x3d, 0x8e, 0xb0, 0x3, 0xcb, 0x78, 0x70, 0xc3, 0xb, 0xb8, 0x86, 0x35, 0xfd, 0x4e, 0x81, 0x32, 0xfa, 0x49, 0x77, 0xc4, 0xc, 0xbf, 0x8f, 0x3c, 0xf4, 0x47, 0x79, 0xca, 0x2, 0xb1, 0x7e, 0xcd, 0x5, 0xb6, 0x88, 0x3b, 0xf3, 0x40, 0x93, 0x20, 0xe8, 0x5b, 0x65, 0xd6, 0x1e, 0xad, 0x62, 0xd1, 0x19, 0xaa, 0x94, 0x27, 0xef, 0x5c, 0x6c, 0xdf, 0x17, 0xa4, 0x9a, 0x29, 0xe1, 0x52, 0x9d, 0x2e, 0xe6, 0x55, 0x6b, 0xd8, 0x10, 0xa3}, {0x0, 0xb4, 0x75, 0xc1, 0xea, 0x5e, 0x9f, 0x2b, 0xc9, 0x7d, 0xbc, 0x8, 0x23, 0x97, 0x56, 0xe2, 0x8f, 0x3b, 0xfa, 0x4e, 0x65, 0xd1, 0x10, 0xa4, 0x46, 0xf2, 0x33, 0x87, 0xac, 0x18, 0xd9, 0x6d, 0x3, 0xb7, 0x76, 0xc2, 0xe9, 0x5d, 0x9c, 0x28, 0xca, 0x7e, 0xbf, 0xb, 0x20, 0x94, 0x55, 0xe1, 0x8c, 0x38, 0xf9, 0x4d, 0x66, 0xd2, 0x13, 0xa7, 0x45, 0xf1, 0x30, 0x84, 0xaf, 0x1b, 0xda, 0x6e, 0x6, 0xb2, 0x73, 0xc7, 0xec, 0x58, 0x99, 0x2d, 0xcf, 0x7b, 0xba, 0xe, 0x25, 0x91, 0x50, 0xe4, 0x89, 0x3d, 0xfc, 0x48, 0x63, 0xd7, 0x16, 0xa2, 0x40, 0xf4, 0x35, 0x81, 0xaa, 0x1e, 0xdf, 0x6b, 0x5, 0xb1, 0x70, 0xc4, 0xef, 0x5b, 0x9a, 0x2e, 0xcc, 0x78, 0xb9, 0xd, 0x26, 0x92, 0x53, 0xe7, 0x8a, 0x3e, 0xff, 0x4b, 0x60, 0xd4, 0x15, 
0xa1, 0x43, 0xf7, 0x36, 0x82, 0xa9, 0x1d, 0xdc, 0x68, 0xc, 0xb8, 0x79, 0xcd, 0xe6, 0x52, 0x93, 0x27, 0xc5, 0x71, 0xb0, 0x4, 0x2f, 0x9b, 0x5a, 0xee, 0x83, 0x37, 0xf6, 0x42, 0x69, 0xdd, 0x1c, 0xa8, 0x4a, 0xfe, 0x3f, 0x8b, 0xa0, 0x14, 0xd5, 0x61, 0xf, 0xbb, 0x7a, 0xce, 0xe5, 0x51, 0x90, 0x24, 0xc6, 0x72, 0xb3, 0x7, 0x2c, 0x98, 0x59, 0xed, 0x80, 0x34, 0xf5, 0x41, 0x6a, 0xde, 0x1f, 0xab, 0x49, 0xfd, 0x3c, 0x88, 0xa3, 0x17, 0xd6, 0x62, 0xa, 0xbe, 0x7f, 0xcb, 0xe0, 0x54, 0x95, 0x21, 0xc3, 0x77, 0xb6, 0x2, 0x29, 0x9d, 0x5c, 0xe8, 0x85, 0x31, 0xf0, 0x44, 0x6f, 0xdb, 0x1a, 0xae, 0x4c, 0xf8, 0x39, 0x8d, 0xa6, 0x12, 0xd3, 0x67, 0x9, 0xbd, 0x7c, 0xc8, 0xe3, 0x57, 0x96, 0x22, 0xc0, 0x74, 0xb5, 0x1, 0x2a, 0x9e, 0x5f, 0xeb, 0x86, 0x32, 0xf3, 0x47, 0x6c, 0xd8, 0x19, 0xad, 0x4f, 0xfb, 0x3a, 0x8e, 0xa5, 0x11, 0xd0, 0x64}, {0x0, 0xb5, 0x77, 0xc2, 0xee, 0x5b, 0x99, 0x2c, 0xc1, 0x74, 0xb6, 0x3, 0x2f, 0x9a, 0x58, 0xed, 0x9f, 0x2a, 0xe8, 0x5d, 0x71, 0xc4, 0x6, 0xb3, 0x5e, 0xeb, 0x29, 0x9c, 0xb0, 0x5, 0xc7, 0x72, 0x23, 0x96, 0x54, 0xe1, 0xcd, 0x78, 0xba, 0xf, 0xe2, 0x57, 0x95, 0x20, 0xc, 0xb9, 0x7b, 0xce, 0xbc, 0x9, 0xcb, 0x7e, 0x52, 0xe7, 0x25, 0x90, 0x7d, 0xc8, 0xa, 0xbf, 0x93, 0x26, 0xe4, 0x51, 0x46, 0xf3, 0x31, 0x84, 0xa8, 0x1d, 0xdf, 0x6a, 0x87, 0x32, 0xf0, 0x45, 0x69, 0xdc, 0x1e, 0xab, 0xd9, 0x6c, 0xae, 0x1b, 0x37, 0x82, 0x40, 0xf5, 0x18, 0xad, 0x6f, 0xda, 0xf6, 0x43, 0x81, 0x34, 0x65, 0xd0, 0x12, 0xa7, 0x8b, 0x3e, 0xfc, 0x49, 0xa4, 0x11, 0xd3, 0x66, 0x4a, 0xff, 0x3d, 0x88, 0xfa, 0x4f, 0x8d, 0x38, 0x14, 0xa1, 0x63, 0xd6, 0x3b, 0x8e, 0x4c, 0xf9, 0xd5, 0x60, 0xa2, 0x17, 0x8c, 0x39, 0xfb, 0x4e, 0x62, 0xd7, 0x15, 0xa0, 0x4d, 0xf8, 0x3a, 0x8f, 0xa3, 0x16, 0xd4, 0x61, 0x13, 0xa6, 0x64, 0xd1, 0xfd, 0x48, 0x8a, 0x3f, 0xd2, 0x67, 0xa5, 0x10, 0x3c, 0x89, 0x4b, 0xfe, 0xaf, 0x1a, 0xd8, 0x6d, 0x41, 0xf4, 0x36, 0x83, 0x6e, 0xdb, 0x19, 0xac, 0x80, 0x35, 0xf7, 0x42, 0x30, 0x85, 0x47, 0xf2, 0xde, 0x6b, 0xa9, 0x1c, 0xf1, 0x44, 0x86, 0x33, 0x1f, 0xaa, 0x68, 0xdd, 0xca, 0x7f, 0xbd, 0x8, 0x24, 0x91, 0x53, 0xe6, 0xb, 0xbe, 0x7c, 0xc9, 0xe5, 0x50, 0x92, 0x27, 0x55, 0xe0, 0x22, 0x97, 0xbb, 0xe, 0xcc, 0x79, 0x94, 0x21, 0xe3, 0x56, 0x7a, 0xcf, 0xd, 0xb8, 0xe9, 0x5c, 0x9e, 0x2b, 0x7, 0xb2, 0x70, 0xc5, 0x28, 0x9d, 0x5f, 0xea, 0xc6, 0x73, 0xb1, 0x4, 0x76, 0xc3, 0x1, 0xb4, 0x98, 0x2d, 0xef, 0x5a, 0xb7, 0x2, 0xc0, 0x75, 0x59, 0xec, 0x2e, 0x9b}, {0x0, 0xb6, 0x71, 0xc7, 0xe2, 0x54, 0x93, 0x25, 0xd9, 0x6f, 0xa8, 0x1e, 0x3b, 0x8d, 0x4a, 0xfc, 0xaf, 0x19, 0xde, 0x68, 0x4d, 0xfb, 0x3c, 0x8a, 0x76, 0xc0, 0x7, 0xb1, 0x94, 0x22, 0xe5, 0x53, 0x43, 0xf5, 0x32, 0x84, 0xa1, 0x17, 0xd0, 0x66, 0x9a, 0x2c, 0xeb, 0x5d, 0x78, 0xce, 0x9, 0xbf, 0xec, 0x5a, 0x9d, 0x2b, 0xe, 0xb8, 0x7f, 0xc9, 0x35, 0x83, 0x44, 0xf2, 0xd7, 0x61, 0xa6, 0x10, 0x86, 0x30, 0xf7, 0x41, 0x64, 0xd2, 0x15, 0xa3, 0x5f, 0xe9, 0x2e, 0x98, 0xbd, 0xb, 0xcc, 0x7a, 0x29, 0x9f, 0x58, 0xee, 0xcb, 0x7d, 0xba, 0xc, 0xf0, 0x46, 0x81, 0x37, 0x12, 0xa4, 0x63, 0xd5, 0xc5, 0x73, 0xb4, 0x2, 0x27, 0x91, 0x56, 0xe0, 0x1c, 0xaa, 0x6d, 0xdb, 0xfe, 0x48, 0x8f, 0x39, 0x6a, 0xdc, 0x1b, 0xad, 0x88, 0x3e, 0xf9, 0x4f, 0xb3, 0x5, 0xc2, 0x74, 0x51, 0xe7, 0x20, 0x96, 0x11, 0xa7, 0x60, 0xd6, 0xf3, 0x45, 0x82, 0x34, 0xc8, 0x7e, 0xb9, 0xf, 0x2a, 0x9c, 0x5b, 0xed, 0xbe, 0x8, 0xcf, 0x79, 0x5c, 0xea, 0x2d, 0x9b, 0x67, 0xd1, 0x16, 0xa0, 0x85, 0x33, 0xf4, 0x42, 0x52, 0xe4, 0x23, 0x95, 0xb0, 0x6, 0xc1, 0x77, 0x8b, 0x3d, 0xfa, 0x4c, 0x69, 0xdf, 0x18, 0xae, 0xfd, 0x4b, 0x8c, 0x3a, 0x1f, 0xa9, 0x6e, 0xd8, 0x24, 0x92, 0x55, 0xe3, 0xc6, 0x70, 0xb7, 0x1, 0x97, 0x21, 0xe6, 0x50, 0x75, 0xc3, 0x4, 0xb2, 0x4e, 0xf8, 0x3f, 0x89, 0xac, 
0x1a, 0xdd, 0x6b, 0x38, 0x8e, 0x49, 0xff, 0xda, 0x6c, 0xab, 0x1d, 0xe1, 0x57, 0x90, 0x26, 0x3, 0xb5, 0x72, 0xc4, 0xd4, 0x62, 0xa5, 0x13, 0x36, 0x80, 0x47, 0xf1, 0xd, 0xbb, 0x7c, 0xca, 0xef, 0x59, 0x9e, 0x28, 0x7b, 0xcd, 0xa, 0xbc, 0x99, 0x2f, 0xe8, 0x5e, 0xa2, 0x14, 0xd3, 0x65, 0x40, 0xf6, 0x31, 0x87}, {0x0, 0xb7, 0x73, 0xc4, 0xe6, 0x51, 0x95, 0x22, 0xd1, 0x66, 0xa2, 0x15, 0x37, 0x80, 0x44, 0xf3, 0xbf, 0x8, 0xcc, 0x7b, 0x59, 0xee, 0x2a, 0x9d, 0x6e, 0xd9, 0x1d, 0xaa, 0x88, 0x3f, 0xfb, 0x4c, 0x63, 0xd4, 0x10, 0xa7, 0x85, 0x32, 0xf6, 0x41, 0xb2, 0x5, 0xc1, 0x76, 0x54, 0xe3, 0x27, 0x90, 0xdc, 0x6b, 0xaf, 0x18, 0x3a, 0x8d, 0x49, 0xfe, 0xd, 0xba, 0x7e, 0xc9, 0xeb, 0x5c, 0x98, 0x2f, 0xc6, 0x71, 0xb5, 0x2, 0x20, 0x97, 0x53, 0xe4, 0x17, 0xa0, 0x64, 0xd3, 0xf1, 0x46, 0x82, 0x35, 0x79, 0xce, 0xa, 0xbd, 0x9f, 0x28, 0xec, 0x5b, 0xa8, 0x1f, 0xdb, 0x6c, 0x4e, 0xf9, 0x3d, 0x8a, 0xa5, 0x12, 0xd6, 0x61, 0x43, 0xf4, 0x30, 0x87, 0x74, 0xc3, 0x7, 0xb0, 0x92, 0x25, 0xe1, 0x56, 0x1a, 0xad, 0x69, 0xde, 0xfc, 0x4b, 0x8f, 0x38, 0xcb, 0x7c, 0xb8, 0xf, 0x2d, 0x9a, 0x5e, 0xe9, 0x91, 0x26, 0xe2, 0x55, 0x77, 0xc0, 0x4, 0xb3, 0x40, 0xf7, 0x33, 0x84, 0xa6, 0x11, 0xd5, 0x62, 0x2e, 0x99, 0x5d, 0xea, 0xc8, 0x7f, 0xbb, 0xc, 0xff, 0x48, 0x8c, 0x3b, 0x19, 0xae, 0x6a, 0xdd, 0xf2, 0x45, 0x81, 0x36, 0x14, 0xa3, 0x67, 0xd0, 0x23, 0x94, 0x50, 0xe7, 0xc5, 0x72, 0xb6, 0x1, 0x4d, 0xfa, 0x3e, 0x89, 0xab, 0x1c, 0xd8, 0x6f, 0x9c, 0x2b, 0xef, 0x58, 0x7a, 0xcd, 0x9, 0xbe, 0x57, 0xe0, 0x24, 0x93, 0xb1, 0x6, 0xc2, 0x75, 0x86, 0x31, 0xf5, 0x42, 0x60, 0xd7, 0x13, 0xa4, 0xe8, 0x5f, 0x9b, 0x2c, 0xe, 0xb9, 0x7d, 0xca, 0x39, 0x8e, 0x4a, 0xfd, 0xdf, 0x68, 0xac, 0x1b, 0x34, 0x83, 0x47, 0xf0, 0xd2, 0x65, 0xa1, 0x16, 0xe5, 0x52, 0x96, 0x21, 0x3, 0xb4, 0x70, 0xc7, 0x8b, 0x3c, 0xf8, 0x4f, 0x6d, 0xda, 0x1e, 0xa9, 0x5a, 0xed, 0x29, 0x9e, 0xbc, 0xb, 0xcf, 0x78}, {0x0, 0xb8, 0x6d, 0xd5, 0xda, 0x62, 0xb7, 0xf, 0xa9, 0x11, 0xc4, 0x7c, 0x73, 0xcb, 0x1e, 0xa6, 0x4f, 0xf7, 0x22, 0x9a, 0x95, 0x2d, 0xf8, 0x40, 0xe6, 0x5e, 0x8b, 0x33, 0x3c, 0x84, 0x51, 0xe9, 0x9e, 0x26, 0xf3, 0x4b, 0x44, 0xfc, 0x29, 0x91, 0x37, 0x8f, 0x5a, 0xe2, 0xed, 0x55, 0x80, 0x38, 0xd1, 0x69, 0xbc, 0x4, 0xb, 0xb3, 0x66, 0xde, 0x78, 0xc0, 0x15, 0xad, 0xa2, 0x1a, 0xcf, 0x77, 0x21, 0x99, 0x4c, 0xf4, 0xfb, 0x43, 0x96, 0x2e, 0x88, 0x30, 0xe5, 0x5d, 0x52, 0xea, 0x3f, 0x87, 0x6e, 0xd6, 0x3, 0xbb, 0xb4, 0xc, 0xd9, 0x61, 0xc7, 0x7f, 0xaa, 0x12, 0x1d, 0xa5, 0x70, 0xc8, 0xbf, 0x7, 0xd2, 0x6a, 0x65, 0xdd, 0x8, 0xb0, 0x16, 0xae, 0x7b, 0xc3, 0xcc, 0x74, 0xa1, 0x19, 0xf0, 0x48, 0x9d, 0x25, 0x2a, 0x92, 0x47, 0xff, 0x59, 0xe1, 0x34, 0x8c, 0x83, 0x3b, 0xee, 0x56, 0x42, 0xfa, 0x2f, 0x97, 0x98, 0x20, 0xf5, 0x4d, 0xeb, 0x53, 0x86, 0x3e, 0x31, 0x89, 0x5c, 0xe4, 0xd, 0xb5, 0x60, 0xd8, 0xd7, 0x6f, 0xba, 0x2, 0xa4, 0x1c, 0xc9, 0x71, 0x7e, 0xc6, 0x13, 0xab, 0xdc, 0x64, 0xb1, 0x9, 0x6, 0xbe, 0x6b, 0xd3, 0x75, 0xcd, 0x18, 0xa0, 0xaf, 0x17, 0xc2, 0x7a, 0x93, 0x2b, 0xfe, 0x46, 0x49, 0xf1, 0x24, 0x9c, 0x3a, 0x82, 0x57, 0xef, 0xe0, 0x58, 0x8d, 0x35, 0x63, 0xdb, 0xe, 0xb6, 0xb9, 0x1, 0xd4, 0x6c, 0xca, 0x72, 0xa7, 0x1f, 0x10, 0xa8, 0x7d, 0xc5, 0x2c, 0x94, 0x41, 0xf9, 0xf6, 0x4e, 0x9b, 0x23, 0x85, 0x3d, 0xe8, 0x50, 0x5f, 0xe7, 0x32, 0x8a, 0xfd, 0x45, 0x90, 0x28, 0x27, 0x9f, 0x4a, 0xf2, 0x54, 0xec, 0x39, 0x81, 0x8e, 0x36, 0xe3, 0x5b, 0xb2, 0xa, 0xdf, 0x67, 0x68, 0xd0, 0x5, 0xbd, 0x1b, 0xa3, 0x76, 0xce, 0xc1, 0x79, 0xac, 0x14}, {0x0, 0xb9, 0x6f, 0xd6, 0xde, 0x67, 0xb1, 0x8, 0xa1, 0x18, 0xce, 0x77, 0x7f, 0xc6, 0x10, 0xa9, 0x5f, 0xe6, 0x30, 0x89, 0x81, 0x38, 0xee, 0x57, 0xfe, 0x47, 0x91, 0x28, 0x20, 0x99, 0x4f, 0xf6, 0xbe, 0x7, 
0xd1, 0x68, 0x60, 0xd9, 0xf, 0xb6, 0x1f, 0xa6, 0x70, 0xc9, 0xc1, 0x78, 0xae, 0x17, 0xe1, 0x58, 0x8e, 0x37, 0x3f, 0x86, 0x50, 0xe9, 0x40, 0xf9, 0x2f, 0x96, 0x9e, 0x27, 0xf1, 0x48, 0x61, 0xd8, 0xe, 0xb7, 0xbf, 0x6, 0xd0, 0x69, 0xc0, 0x79, 0xaf, 0x16, 0x1e, 0xa7, 0x71, 0xc8, 0x3e, 0x87, 0x51, 0xe8, 0xe0, 0x59, 0x8f, 0x36, 0x9f, 0x26, 0xf0, 0x49, 0x41, 0xf8, 0x2e, 0x97, 0xdf, 0x66, 0xb0, 0x9, 0x1, 0xb8, 0x6e, 0xd7, 0x7e, 0xc7, 0x11, 0xa8, 0xa0, 0x19, 0xcf, 0x76, 0x80, 0x39, 0xef, 0x56, 0x5e, 0xe7, 0x31, 0x88, 0x21, 0x98, 0x4e, 0xf7, 0xff, 0x46, 0x90, 0x29, 0xc2, 0x7b, 0xad, 0x14, 0x1c, 0xa5, 0x73, 0xca, 0x63, 0xda, 0xc, 0xb5, 0xbd, 0x4, 0xd2, 0x6b, 0x9d, 0x24, 0xf2, 0x4b, 0x43, 0xfa, 0x2c, 0x95, 0x3c, 0x85, 0x53, 0xea, 0xe2, 0x5b, 0x8d, 0x34, 0x7c, 0xc5, 0x13, 0xaa, 0xa2, 0x1b, 0xcd, 0x74, 0xdd, 0x64, 0xb2, 0xb, 0x3, 0xba, 0x6c, 0xd5, 0x23, 0x9a, 0x4c, 0xf5, 0xfd, 0x44, 0x92, 0x2b, 0x82, 0x3b, 0xed, 0x54, 0x5c, 0xe5, 0x33, 0x8a, 0xa3, 0x1a, 0xcc, 0x75, 0x7d, 0xc4, 0x12, 0xab, 0x2, 0xbb, 0x6d, 0xd4, 0xdc, 0x65, 0xb3, 0xa, 0xfc, 0x45, 0x93, 0x2a, 0x22, 0x9b, 0x4d, 0xf4, 0x5d, 0xe4, 0x32, 0x8b, 0x83, 0x3a, 0xec, 0x55, 0x1d, 0xa4, 0x72, 0xcb, 0xc3, 0x7a, 0xac, 0x15, 0xbc, 0x5, 0xd3, 0x6a, 0x62, 0xdb, 0xd, 0xb4, 0x42, 0xfb, 0x2d, 0x94, 0x9c, 0x25, 0xf3, 0x4a, 0xe3, 0x5a, 0x8c, 0x35, 0x3d, 0x84, 0x52, 0xeb}, {0x0, 0xba, 0x69, 0xd3, 0xd2, 0x68, 0xbb, 0x1, 0xb9, 0x3, 0xd0, 0x6a, 0x6b, 0xd1, 0x2, 0xb8, 0x6f, 0xd5, 0x6, 0xbc, 0xbd, 0x7, 0xd4, 0x6e, 0xd6, 0x6c, 0xbf, 0x5, 0x4, 0xbe, 0x6d, 0xd7, 0xde, 0x64, 0xb7, 0xd, 0xc, 0xb6, 0x65, 0xdf, 0x67, 0xdd, 0xe, 0xb4, 0xb5, 0xf, 0xdc, 0x66, 0xb1, 0xb, 0xd8, 0x62, 0x63, 0xd9, 0xa, 0xb0, 0x8, 0xb2, 0x61, 0xdb, 0xda, 0x60, 0xb3, 0x9, 0xa1, 0x1b, 0xc8, 0x72, 0x73, 0xc9, 0x1a, 0xa0, 0x18, 0xa2, 0x71, 0xcb, 0xca, 0x70, 0xa3, 0x19, 0xce, 0x74, 0xa7, 0x1d, 0x1c, 0xa6, 0x75, 0xcf, 0x77, 0xcd, 0x1e, 0xa4, 0xa5, 0x1f, 0xcc, 0x76, 0x7f, 0xc5, 0x16, 0xac, 0xad, 0x17, 0xc4, 0x7e, 0xc6, 0x7c, 0xaf, 0x15, 0x14, 0xae, 0x7d, 0xc7, 0x10, 0xaa, 0x79, 0xc3, 0xc2, 0x78, 0xab, 0x11, 0xa9, 0x13, 0xc0, 0x7a, 0x7b, 0xc1, 0x12, 0xa8, 0x5f, 0xe5, 0x36, 0x8c, 0x8d, 0x37, 0xe4, 0x5e, 0xe6, 0x5c, 0x8f, 0x35, 0x34, 0x8e, 0x5d, 0xe7, 0x30, 0x8a, 0x59, 0xe3, 0xe2, 0x58, 0x8b, 0x31, 0x89, 0x33, 0xe0, 0x5a, 0x5b, 0xe1, 0x32, 0x88, 0x81, 0x3b, 0xe8, 0x52, 0x53, 0xe9, 0x3a, 0x80, 0x38, 0x82, 0x51, 0xeb, 0xea, 0x50, 0x83, 0x39, 0xee, 0x54, 0x87, 0x3d, 0x3c, 0x86, 0x55, 0xef, 0x57, 0xed, 0x3e, 0x84, 0x85, 0x3f, 0xec, 0x56, 0xfe, 0x44, 0x97, 0x2d, 0x2c, 0x96, 0x45, 0xff, 0x47, 0xfd, 0x2e, 0x94, 0x95, 0x2f, 0xfc, 0x46, 0x91, 0x2b, 0xf8, 0x42, 0x43, 0xf9, 0x2a, 0x90, 0x28, 0x92, 0x41, 0xfb, 0xfa, 0x40, 0x93, 0x29, 0x20, 0x9a, 0x49, 0xf3, 0xf2, 0x48, 0x9b, 0x21, 0x99, 0x23, 0xf0, 0x4a, 0x4b, 0xf1, 0x22, 0x98, 0x4f, 0xf5, 0x26, 0x9c, 0x9d, 0x27, 0xf4, 0x4e, 0xf6, 0x4c, 0x9f, 0x25, 0x24, 0x9e, 0x4d, 0xf7}, {0x0, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x6, 0xb1, 0xa, 0xda, 0x61, 0x67, 0xdc, 0xc, 0xb7, 0x7f, 0xc4, 0x14, 0xaf, 0xa9, 0x12, 0xc2, 0x79, 0xce, 0x75, 0xa5, 0x1e, 0x18, 0xa3, 0x73, 0xc8, 0xfe, 0x45, 0x95, 0x2e, 0x28, 0x93, 0x43, 0xf8, 0x4f, 0xf4, 0x24, 0x9f, 0x99, 0x22, 0xf2, 0x49, 0x81, 0x3a, 0xea, 0x51, 0x57, 0xec, 0x3c, 0x87, 0x30, 0x8b, 0x5b, 0xe0, 0xe6, 0x5d, 0x8d, 0x36, 0xe1, 0x5a, 0x8a, 0x31, 0x37, 0x8c, 0x5c, 0xe7, 0x50, 0xeb, 0x3b, 0x80, 0x86, 0x3d, 0xed, 0x56, 0x9e, 0x25, 0xf5, 0x4e, 0x48, 0xf3, 0x23, 0x98, 0x2f, 0x94, 0x44, 0xff, 0xf9, 0x42, 0x92, 0x29, 0x1f, 0xa4, 0x74, 0xcf, 0xc9, 0x72, 0xa2, 0x19, 0xae, 0x15, 0xc5, 0x7e, 0x78, 0xc3, 0x13, 0xa8, 0x60, 0xdb, 0xb, 0xb0, 0xb6, 0xd, 0xdd, 
0x66, 0xd1, 0x6a, 0xba, 0x1, 0x7, 0xbc, 0x6c, 0xd7, 0xdf, 0x64, 0xb4, 0xf, 0x9, 0xb2, 0x62, 0xd9, 0x6e, 0xd5, 0x5, 0xbe, 0xb8, 0x3, 0xd3, 0x68, 0xa0, 0x1b, 0xcb, 0x70, 0x76, 0xcd, 0x1d, 0xa6, 0x11, 0xaa, 0x7a, 0xc1, 0xc7, 0x7c, 0xac, 0x17, 0x21, 0x9a, 0x4a, 0xf1, 0xf7, 0x4c, 0x9c, 0x27, 0x90, 0x2b, 0xfb, 0x40, 0x46, 0xfd, 0x2d, 0x96, 0x5e, 0xe5, 0x35, 0x8e, 0x88, 0x33, 0xe3, 0x58, 0xef, 0x54, 0x84, 0x3f, 0x39, 0x82, 0x52, 0xe9, 0x3e, 0x85, 0x55, 0xee, 0xe8, 0x53, 0x83, 0x38, 0x8f, 0x34, 0xe4, 0x5f, 0x59, 0xe2, 0x32, 0x89, 0x41, 0xfa, 0x2a, 0x91, 0x97, 0x2c, 0xfc, 0x47, 0xf0, 0x4b, 0x9b, 0x20, 0x26, 0x9d, 0x4d, 0xf6, 0xc0, 0x7b, 0xab, 0x10, 0x16, 0xad, 0x7d, 0xc6, 0x71, 0xca, 0x1a, 0xa1, 0xa7, 0x1c, 0xcc, 0x77, 0xbf, 0x4, 0xd4, 0x6f, 0x69, 0xd2, 0x2, 0xb9, 0xe, 0xb5, 0x65, 0xde, 0xd8, 0x63, 0xb3, 0x8}, {0x0, 0xbc, 0x65, 0xd9, 0xca, 0x76, 0xaf, 0x13, 0x89, 0x35, 0xec, 0x50, 0x43, 0xff, 0x26, 0x9a, 0xf, 0xb3, 0x6a, 0xd6, 0xc5, 0x79, 0xa0, 0x1c, 0x86, 0x3a, 0xe3, 0x5f, 0x4c, 0xf0, 0x29, 0x95, 0x1e, 0xa2, 0x7b, 0xc7, 0xd4, 0x68, 0xb1, 0xd, 0x97, 0x2b, 0xf2, 0x4e, 0x5d, 0xe1, 0x38, 0x84, 0x11, 0xad, 0x74, 0xc8, 0xdb, 0x67, 0xbe, 0x2, 0x98, 0x24, 0xfd, 0x41, 0x52, 0xee, 0x37, 0x8b, 0x3c, 0x80, 0x59, 0xe5, 0xf6, 0x4a, 0x93, 0x2f, 0xb5, 0x9, 0xd0, 0x6c, 0x7f, 0xc3, 0x1a, 0xa6, 0x33, 0x8f, 0x56, 0xea, 0xf9, 0x45, 0x9c, 0x20, 0xba, 0x6, 0xdf, 0x63, 0x70, 0xcc, 0x15, 0xa9, 0x22, 0x9e, 0x47, 0xfb, 0xe8, 0x54, 0x8d, 0x31, 0xab, 0x17, 0xce, 0x72, 0x61, 0xdd, 0x4, 0xb8, 0x2d, 0x91, 0x48, 0xf4, 0xe7, 0x5b, 0x82, 0x3e, 0xa4, 0x18, 0xc1, 0x7d, 0x6e, 0xd2, 0xb, 0xb7, 0x78, 0xc4, 0x1d, 0xa1, 0xb2, 0xe, 0xd7, 0x6b, 0xf1, 0x4d, 0x94, 0x28, 0x3b, 0x87, 0x5e, 0xe2, 0x77, 0xcb, 0x12, 0xae, 0xbd, 0x1, 0xd8, 0x64, 0xfe, 0x42, 0x9b, 0x27, 0x34, 0x88, 0x51, 0xed, 0x66, 0xda, 0x3, 0xbf, 0xac, 0x10, 0xc9, 0x75, 0xef, 0x53, 0x8a, 0x36, 0x25, 0x99, 0x40, 0xfc, 0x69, 0xd5, 0xc, 0xb0, 0xa3, 0x1f, 0xc6, 0x7a, 0xe0, 0x5c, 0x85, 0x39, 0x2a, 0x96, 0x4f, 0xf3, 0x44, 0xf8, 0x21, 0x9d, 0x8e, 0x32, 0xeb, 0x57, 0xcd, 0x71, 0xa8, 0x14, 0x7, 0xbb, 0x62, 0xde, 0x4b, 0xf7, 0x2e, 0x92, 0x81, 0x3d, 0xe4, 0x58, 0xc2, 0x7e, 0xa7, 0x1b, 0x8, 0xb4, 0x6d, 0xd1, 0x5a, 0xe6, 0x3f, 0x83, 0x90, 0x2c, 0xf5, 0x49, 0xd3, 0x6f, 0xb6, 0xa, 0x19, 0xa5, 0x7c, 0xc0, 0x55, 0xe9, 0x30, 0x8c, 0x9f, 0x23, 0xfa, 0x46, 0xdc, 0x60, 0xb9, 0x5, 0x16, 0xaa, 0x73, 0xcf}, {0x0, 0xbd, 0x67, 0xda, 0xce, 0x73, 0xa9, 0x14, 0x81, 0x3c, 0xe6, 0x5b, 0x4f, 0xf2, 0x28, 0x95, 0x1f, 0xa2, 0x78, 0xc5, 0xd1, 0x6c, 0xb6, 0xb, 0x9e, 0x23, 0xf9, 0x44, 0x50, 0xed, 0x37, 0x8a, 0x3e, 0x83, 0x59, 0xe4, 0xf0, 0x4d, 0x97, 0x2a, 0xbf, 0x2, 0xd8, 0x65, 0x71, 0xcc, 0x16, 0xab, 0x21, 0x9c, 0x46, 0xfb, 0xef, 0x52, 0x88, 0x35, 0xa0, 0x1d, 0xc7, 0x7a, 0x6e, 0xd3, 0x9, 0xb4, 0x7c, 0xc1, 0x1b, 0xa6, 0xb2, 0xf, 0xd5, 0x68, 0xfd, 0x40, 0x9a, 0x27, 0x33, 0x8e, 0x54, 0xe9, 0x63, 0xde, 0x4, 0xb9, 0xad, 0x10, 0xca, 0x77, 0xe2, 0x5f, 0x85, 0x38, 0x2c, 0x91, 0x4b, 0xf6, 0x42, 0xff, 0x25, 0x98, 0x8c, 0x31, 0xeb, 0x56, 0xc3, 0x7e, 0xa4, 0x19, 0xd, 0xb0, 0x6a, 0xd7, 0x5d, 0xe0, 0x3a, 0x87, 0x93, 0x2e, 0xf4, 0x49, 0xdc, 0x61, 0xbb, 0x6, 0x12, 0xaf, 0x75, 0xc8, 0xf8, 0x45, 0x9f, 0x22, 0x36, 0x8b, 0x51, 0xec, 0x79, 0xc4, 0x1e, 0xa3, 0xb7, 0xa, 0xd0, 0x6d, 0xe7, 0x5a, 0x80, 0x3d, 0x29, 0x94, 0x4e, 0xf3, 0x66, 0xdb, 0x1, 0xbc, 0xa8, 0x15, 0xcf, 0x72, 0xc6, 0x7b, 0xa1, 0x1c, 0x8, 0xb5, 0x6f, 0xd2, 0x47, 0xfa, 0x20, 0x9d, 0x89, 0x34, 0xee, 0x53, 0xd9, 0x64, 0xbe, 0x3, 0x17, 0xaa, 0x70, 0xcd, 0x58, 0xe5, 0x3f, 0x82, 0x96, 0x2b, 0xf1, 0x4c, 0x84, 0x39, 0xe3, 0x5e, 0x4a, 0xf7, 0x2d, 0x90, 0x5, 0xb8, 0x62, 0xdf, 0xcb, 
0x76, 0xac, 0x11, 0x9b, 0x26, 0xfc, 0x41, 0x55, 0xe8, 0x32, 0x8f, 0x1a, 0xa7, 0x7d, 0xc0, 0xd4, 0x69, 0xb3, 0xe, 0xba, 0x7, 0xdd, 0x60, 0x74, 0xc9, 0x13, 0xae, 0x3b, 0x86, 0x5c, 0xe1, 0xf5, 0x48, 0x92, 0x2f, 0xa5, 0x18, 0xc2, 0x7f, 0x6b, 0xd6, 0xc, 0xb1, 0x24, 0x99, 0x43, 0xfe, 0xea, 0x57, 0x8d, 0x30}, {0x0, 0xbe, 0x61, 0xdf, 0xc2, 0x7c, 0xa3, 0x1d, 0x99, 0x27, 0xf8, 0x46, 0x5b, 0xe5, 0x3a, 0x84, 0x2f, 0x91, 0x4e, 0xf0, 0xed, 0x53, 0x8c, 0x32, 0xb6, 0x8, 0xd7, 0x69, 0x74, 0xca, 0x15, 0xab, 0x5e, 0xe0, 0x3f, 0x81, 0x9c, 0x22, 0xfd, 0x43, 0xc7, 0x79, 0xa6, 0x18, 0x5, 0xbb, 0x64, 0xda, 0x71, 0xcf, 0x10, 0xae, 0xb3, 0xd, 0xd2, 0x6c, 0xe8, 0x56, 0x89, 0x37, 0x2a, 0x94, 0x4b, 0xf5, 0xbc, 0x2, 0xdd, 0x63, 0x7e, 0xc0, 0x1f, 0xa1, 0x25, 0x9b, 0x44, 0xfa, 0xe7, 0x59, 0x86, 0x38, 0x93, 0x2d, 0xf2, 0x4c, 0x51, 0xef, 0x30, 0x8e, 0xa, 0xb4, 0x6b, 0xd5, 0xc8, 0x76, 0xa9, 0x17, 0xe2, 0x5c, 0x83, 0x3d, 0x20, 0x9e, 0x41, 0xff, 0x7b, 0xc5, 0x1a, 0xa4, 0xb9, 0x7, 0xd8, 0x66, 0xcd, 0x73, 0xac, 0x12, 0xf, 0xb1, 0x6e, 0xd0, 0x54, 0xea, 0x35, 0x8b, 0x96, 0x28, 0xf7, 0x49, 0x65, 0xdb, 0x4, 0xba, 0xa7, 0x19, 0xc6, 0x78, 0xfc, 0x42, 0x9d, 0x23, 0x3e, 0x80, 0x5f, 0xe1, 0x4a, 0xf4, 0x2b, 0x95, 0x88, 0x36, 0xe9, 0x57, 0xd3, 0x6d, 0xb2, 0xc, 0x11, 0xaf, 0x70, 0xce, 0x3b, 0x85, 0x5a, 0xe4, 0xf9, 0x47, 0x98, 0x26, 0xa2, 0x1c, 0xc3, 0x7d, 0x60, 0xde, 0x1, 0xbf, 0x14, 0xaa, 0x75, 0xcb, 0xd6, 0x68, 0xb7, 0x9, 0x8d, 0x33, 0xec, 0x52, 0x4f, 0xf1, 0x2e, 0x90, 0xd9, 0x67, 0xb8, 0x6, 0x1b, 0xa5, 0x7a, 0xc4, 0x40, 0xfe, 0x21, 0x9f, 0x82, 0x3c, 0xe3, 0x5d, 0xf6, 0x48, 0x97, 0x29, 0x34, 0x8a, 0x55, 0xeb, 0x6f, 0xd1, 0xe, 0xb0, 0xad, 0x13, 0xcc, 0x72, 0x87, 0x39, 0xe6, 0x58, 0x45, 0xfb, 0x24, 0x9a, 0x1e, 0xa0, 0x7f, 0xc1, 0xdc, 0x62, 0xbd, 0x3, 0xa8, 0x16, 0xc9, 0x77, 0x6a, 0xd4, 0xb, 0xb5, 0x31, 0x8f, 0x50, 0xee, 0xf3, 0x4d, 0x92, 0x2c}, {0x0, 0xbf, 0x63, 0xdc, 0xc6, 0x79, 0xa5, 0x1a, 0x91, 0x2e, 0xf2, 0x4d, 0x57, 0xe8, 0x34, 0x8b, 0x3f, 0x80, 0x5c, 0xe3, 0xf9, 0x46, 0x9a, 0x25, 0xae, 0x11, 0xcd, 0x72, 0x68, 0xd7, 0xb, 0xb4, 0x7e, 0xc1, 0x1d, 0xa2, 0xb8, 0x7, 0xdb, 0x64, 0xef, 0x50, 0x8c, 0x33, 0x29, 0x96, 0x4a, 0xf5, 0x41, 0xfe, 0x22, 0x9d, 0x87, 0x38, 0xe4, 0x5b, 0xd0, 0x6f, 0xb3, 0xc, 0x16, 0xa9, 0x75, 0xca, 0xfc, 0x43, 0x9f, 0x20, 0x3a, 0x85, 0x59, 0xe6, 0x6d, 0xd2, 0xe, 0xb1, 0xab, 0x14, 0xc8, 0x77, 0xc3, 0x7c, 0xa0, 0x1f, 0x5, 0xba, 0x66, 0xd9, 0x52, 0xed, 0x31, 0x8e, 0x94, 0x2b, 0xf7, 0x48, 0x82, 0x3d, 0xe1, 0x5e, 0x44, 0xfb, 0x27, 0x98, 0x13, 0xac, 0x70, 0xcf, 0xd5, 0x6a, 0xb6, 0x9, 0xbd, 0x2, 0xde, 0x61, 0x7b, 0xc4, 0x18, 0xa7, 0x2c, 0x93, 0x4f, 0xf0, 0xea, 0x55, 0x89, 0x36, 0xe5, 0x5a, 0x86, 0x39, 0x23, 0x9c, 0x40, 0xff, 0x74, 0xcb, 0x17, 0xa8, 0xb2, 0xd, 0xd1, 0x6e, 0xda, 0x65, 0xb9, 0x6, 0x1c, 0xa3, 0x7f, 0xc0, 0x4b, 0xf4, 0x28, 0x97, 0x8d, 0x32, 0xee, 0x51, 0x9b, 0x24, 0xf8, 0x47, 0x5d, 0xe2, 0x3e, 0x81, 0xa, 0xb5, 0x69, 0xd6, 0xcc, 0x73, 0xaf, 0x10, 0xa4, 0x1b, 0xc7, 0x78, 0x62, 0xdd, 0x1, 0xbe, 0x35, 0x8a, 0x56, 0xe9, 0xf3, 0x4c, 0x90, 0x2f, 0x19, 0xa6, 0x7a, 0xc5, 0xdf, 0x60, 0xbc, 0x3, 0x88, 0x37, 0xeb, 0x54, 0x4e, 0xf1, 0x2d, 0x92, 0x26, 0x99, 0x45, 0xfa, 0xe0, 0x5f, 0x83, 0x3c, 0xb7, 0x8, 0xd4, 0x6b, 0x71, 0xce, 0x12, 0xad, 0x67, 0xd8, 0x4, 0xbb, 0xa1, 0x1e, 0xc2, 0x7d, 0xf6, 0x49, 0x95, 0x2a, 0x30, 0x8f, 0x53, 0xec, 0x58, 0xe7, 0x3b, 0x84, 0x9e, 0x21, 0xfd, 0x42, 0xc9, 0x76, 0xaa, 0x15, 0xf, 0xb0, 0x6c, 0xd3}, {0x0, 0xc0, 0x9d, 0x5d, 0x27, 0xe7, 0xba, 0x7a, 0x4e, 0x8e, 0xd3, 0x13, 0x69, 0xa9, 0xf4, 0x34, 0x9c, 0x5c, 0x1, 0xc1, 0xbb, 0x7b, 0x26, 0xe6, 0xd2, 0x12, 0x4f, 0x8f, 0xf5, 0x35, 0x68, 0xa8, 0x25, 0xe5, 
0xb8, 0x78, 0x2, 0xc2, 0x9f, 0x5f, 0x6b, 0xab, 0xf6, 0x36, 0x4c, 0x8c, 0xd1, 0x11, 0xb9, 0x79, 0x24, 0xe4, 0x9e, 0x5e, 0x3, 0xc3, 0xf7, 0x37, 0x6a, 0xaa, 0xd0, 0x10, 0x4d, 0x8d, 0x4a, 0x8a, 0xd7, 0x17, 0x6d, 0xad, 0xf0, 0x30, 0x4, 0xc4, 0x99, 0x59, 0x23, 0xe3, 0xbe, 0x7e, 0xd6, 0x16, 0x4b, 0x8b, 0xf1, 0x31, 0x6c, 0xac, 0x98, 0x58, 0x5, 0xc5, 0xbf, 0x7f, 0x22, 0xe2, 0x6f, 0xaf, 0xf2, 0x32, 0x48, 0x88, 0xd5, 0x15, 0x21, 0xe1, 0xbc, 0x7c, 0x6, 0xc6, 0x9b, 0x5b, 0xf3, 0x33, 0x6e, 0xae, 0xd4, 0x14, 0x49, 0x89, 0xbd, 0x7d, 0x20, 0xe0, 0x9a, 0x5a, 0x7, 0xc7, 0x94, 0x54, 0x9, 0xc9, 0xb3, 0x73, 0x2e, 0xee, 0xda, 0x1a, 0x47, 0x87, 0xfd, 0x3d, 0x60, 0xa0, 0x8, 0xc8, 0x95, 0x55, 0x2f, 0xef, 0xb2, 0x72, 0x46, 0x86, 0xdb, 0x1b, 0x61, 0xa1, 0xfc, 0x3c, 0xb1, 0x71, 0x2c, 0xec, 0x96, 0x56, 0xb, 0xcb, 0xff, 0x3f, 0x62, 0xa2, 0xd8, 0x18, 0x45, 0x85, 0x2d, 0xed, 0xb0, 0x70, 0xa, 0xca, 0x97, 0x57, 0x63, 0xa3, 0xfe, 0x3e, 0x44, 0x84, 0xd9, 0x19, 0xde, 0x1e, 0x43, 0x83, 0xf9, 0x39, 0x64, 0xa4, 0x90, 0x50, 0xd, 0xcd, 0xb7, 0x77, 0x2a, 0xea, 0x42, 0x82, 0xdf, 0x1f, 0x65, 0xa5, 0xf8, 0x38, 0xc, 0xcc, 0x91, 0x51, 0x2b, 0xeb, 0xb6, 0x76, 0xfb, 0x3b, 0x66, 0xa6, 0xdc, 0x1c, 0x41, 0x81, 0xb5, 0x75, 0x28, 0xe8, 0x92, 0x52, 0xf, 0xcf, 0x67, 0xa7, 0xfa, 0x3a, 0x40, 0x80, 0xdd, 0x1d, 0x29, 0xe9, 0xb4, 0x74, 0xe, 0xce, 0x93, 0x53}, {0x0, 0xc1, 0x9f, 0x5e, 0x23, 0xe2, 0xbc, 0x7d, 0x46, 0x87, 0xd9, 0x18, 0x65, 0xa4, 0xfa, 0x3b, 0x8c, 0x4d, 0x13, 0xd2, 0xaf, 0x6e, 0x30, 0xf1, 0xca, 0xb, 0x55, 0x94, 0xe9, 0x28, 0x76, 0xb7, 0x5, 0xc4, 0x9a, 0x5b, 0x26, 0xe7, 0xb9, 0x78, 0x43, 0x82, 0xdc, 0x1d, 0x60, 0xa1, 0xff, 0x3e, 0x89, 0x48, 0x16, 0xd7, 0xaa, 0x6b, 0x35, 0xf4, 0xcf, 0xe, 0x50, 0x91, 0xec, 0x2d, 0x73, 0xb2, 0xa, 0xcb, 0x95, 0x54, 0x29, 0xe8, 0xb6, 0x77, 0x4c, 0x8d, 0xd3, 0x12, 0x6f, 0xae, 0xf0, 0x31, 0x86, 0x47, 0x19, 0xd8, 0xa5, 0x64, 0x3a, 0xfb, 0xc0, 0x1, 0x5f, 0x9e, 0xe3, 0x22, 0x7c, 0xbd, 0xf, 0xce, 0x90, 0x51, 0x2c, 0xed, 0xb3, 0x72, 0x49, 0x88, 0xd6, 0x17, 0x6a, 0xab, 0xf5, 0x34, 0x83, 0x42, 0x1c, 0xdd, 0xa0, 0x61, 0x3f, 0xfe, 0xc5, 0x4, 0x5a, 0x9b, 0xe6, 0x27, 0x79, 0xb8, 0x14, 0xd5, 0x8b, 0x4a, 0x37, 0xf6, 0xa8, 0x69, 0x52, 0x93, 0xcd, 0xc, 0x71, 0xb0, 0xee, 0x2f, 0x98, 0x59, 0x7, 0xc6, 0xbb, 0x7a, 0x24, 0xe5, 0xde, 0x1f, 0x41, 0x80, 0xfd, 0x3c, 0x62, 0xa3, 0x11, 0xd0, 0x8e, 0x4f, 0x32, 0xf3, 0xad, 0x6c, 0x57, 0x96, 0xc8, 0x9, 0x74, 0xb5, 0xeb, 0x2a, 0x9d, 0x5c, 0x2, 0xc3, 0xbe, 0x7f, 0x21, 0xe0, 0xdb, 0x1a, 0x44, 0x85, 0xf8, 0x39, 0x67, 0xa6, 0x1e, 0xdf, 0x81, 0x40, 0x3d, 0xfc, 0xa2, 0x63, 0x58, 0x99, 0xc7, 0x6, 0x7b, 0xba, 0xe4, 0x25, 0x92, 0x53, 0xd, 0xcc, 0xb1, 0x70, 0x2e, 0xef, 0xd4, 0x15, 0x4b, 0x8a, 0xf7, 0x36, 0x68, 0xa9, 0x1b, 0xda, 0x84, 0x45, 0x38, 0xf9, 0xa7, 0x66, 0x5d, 0x9c, 0xc2, 0x3, 0x7e, 0xbf, 0xe1, 0x20, 0x97, 0x56, 0x8, 0xc9, 0xb4, 0x75, 0x2b, 0xea, 0xd1, 0x10, 0x4e, 0x8f, 0xf2, 0x33, 0x6d, 0xac}, {0x0, 0xc2, 0x99, 0x5b, 0x2f, 0xed, 0xb6, 0x74, 0x5e, 0x9c, 0xc7, 0x5, 0x71, 0xb3, 0xe8, 0x2a, 0xbc, 0x7e, 0x25, 0xe7, 0x93, 0x51, 0xa, 0xc8, 0xe2, 0x20, 0x7b, 0xb9, 0xcd, 0xf, 0x54, 0x96, 0x65, 0xa7, 0xfc, 0x3e, 0x4a, 0x88, 0xd3, 0x11, 0x3b, 0xf9, 0xa2, 0x60, 0x14, 0xd6, 0x8d, 0x4f, 0xd9, 0x1b, 0x40, 0x82, 0xf6, 0x34, 0x6f, 0xad, 0x87, 0x45, 0x1e, 0xdc, 0xa8, 0x6a, 0x31, 0xf3, 0xca, 0x8, 0x53, 0x91, 0xe5, 0x27, 0x7c, 0xbe, 0x94, 0x56, 0xd, 0xcf, 0xbb, 0x79, 0x22, 0xe0, 0x76, 0xb4, 0xef, 0x2d, 0x59, 0x9b, 0xc0, 0x2, 0x28, 0xea, 0xb1, 0x73, 0x7, 0xc5, 0x9e, 0x5c, 0xaf, 0x6d, 0x36, 0xf4, 0x80, 0x42, 0x19, 0xdb, 0xf1, 0x33, 0x68, 0xaa, 0xde, 0x1c, 0x47, 0x85, 0x13, 0xd1, 0x8a, 0x48, 0x3c, 0xfe, 0xa5, 0x67, 
0x4d, 0x8f, 0xd4, 0x16, 0x62, 0xa0, 0xfb, 0x39, 0x89, 0x4b, 0x10, 0xd2, 0xa6, 0x64, 0x3f, 0xfd, 0xd7, 0x15, 0x4e, 0x8c, 0xf8, 0x3a, 0x61, 0xa3, 0x35, 0xf7, 0xac, 0x6e, 0x1a, 0xd8, 0x83, 0x41, 0x6b, 0xa9, 0xf2, 0x30, 0x44, 0x86, 0xdd, 0x1f, 0xec, 0x2e, 0x75, 0xb7, 0xc3, 0x1, 0x5a, 0x98, 0xb2, 0x70, 0x2b, 0xe9, 0x9d, 0x5f, 0x4, 0xc6, 0x50, 0x92, 0xc9, 0xb, 0x7f, 0xbd, 0xe6, 0x24, 0xe, 0xcc, 0x97, 0x55, 0x21, 0xe3, 0xb8, 0x7a, 0x43, 0x81, 0xda, 0x18, 0x6c, 0xae, 0xf5, 0x37, 0x1d, 0xdf, 0x84, 0x46, 0x32, 0xf0, 0xab, 0x69, 0xff, 0x3d, 0x66, 0xa4, 0xd0, 0x12, 0x49, 0x8b, 0xa1, 0x63, 0x38, 0xfa, 0x8e, 0x4c, 0x17, 0xd5, 0x26, 0xe4, 0xbf, 0x7d, 0x9, 0xcb, 0x90, 0x52, 0x78, 0xba, 0xe1, 0x23, 0x57, 0x95, 0xce, 0xc, 0x9a, 0x58, 0x3, 0xc1, 0xb5, 0x77, 0x2c, 0xee, 0xc4, 0x6, 0x5d, 0x9f, 0xeb, 0x29, 0x72, 0xb0}, {0x0, 0xc3, 0x9b, 0x58, 0x2b, 0xe8, 0xb0, 0x73, 0x56, 0x95, 0xcd, 0xe, 0x7d, 0xbe, 0xe6, 0x25, 0xac, 0x6f, 0x37, 0xf4, 0x87, 0x44, 0x1c, 0xdf, 0xfa, 0x39, 0x61, 0xa2, 0xd1, 0x12, 0x4a, 0x89, 0x45, 0x86, 0xde, 0x1d, 0x6e, 0xad, 0xf5, 0x36, 0x13, 0xd0, 0x88, 0x4b, 0x38, 0xfb, 0xa3, 0x60, 0xe9, 0x2a, 0x72, 0xb1, 0xc2, 0x1, 0x59, 0x9a, 0xbf, 0x7c, 0x24, 0xe7, 0x94, 0x57, 0xf, 0xcc, 0x8a, 0x49, 0x11, 0xd2, 0xa1, 0x62, 0x3a, 0xf9, 0xdc, 0x1f, 0x47, 0x84, 0xf7, 0x34, 0x6c, 0xaf, 0x26, 0xe5, 0xbd, 0x7e, 0xd, 0xce, 0x96, 0x55, 0x70, 0xb3, 0xeb, 0x28, 0x5b, 0x98, 0xc0, 0x3, 0xcf, 0xc, 0x54, 0x97, 0xe4, 0x27, 0x7f, 0xbc, 0x99, 0x5a, 0x2, 0xc1, 0xb2, 0x71, 0x29, 0xea, 0x63, 0xa0, 0xf8, 0x3b, 0x48, 0x8b, 0xd3, 0x10, 0x35, 0xf6, 0xae, 0x6d, 0x1e, 0xdd, 0x85, 0x46, 0x9, 0xca, 0x92, 0x51, 0x22, 0xe1, 0xb9, 0x7a, 0x5f, 0x9c, 0xc4, 0x7, 0x74, 0xb7, 0xef, 0x2c, 0xa5, 0x66, 0x3e, 0xfd, 0x8e, 0x4d, 0x15, 0xd6, 0xf3, 0x30, 0x68, 0xab, 0xd8, 0x1b, 0x43, 0x80, 0x4c, 0x8f, 0xd7, 0x14, 0x67, 0xa4, 0xfc, 0x3f, 0x1a, 0xd9, 0x81, 0x42, 0x31, 0xf2, 0xaa, 0x69, 0xe0, 0x23, 0x7b, 0xb8, 0xcb, 0x8, 0x50, 0x93, 0xb6, 0x75, 0x2d, 0xee, 0x9d, 0x5e, 0x6, 0xc5, 0x83, 0x40, 0x18, 0xdb, 0xa8, 0x6b, 0x33, 0xf0, 0xd5, 0x16, 0x4e, 0x8d, 0xfe, 0x3d, 0x65, 0xa6, 0x2f, 0xec, 0xb4, 0x77, 0x4, 0xc7, 0x9f, 0x5c, 0x79, 0xba, 0xe2, 0x21, 0x52, 0x91, 0xc9, 0xa, 0xc6, 0x5, 0x5d, 0x9e, 0xed, 0x2e, 0x76, 0xb5, 0x90, 0x53, 0xb, 0xc8, 0xbb, 0x78, 0x20, 0xe3, 0x6a, 0xa9, 0xf1, 0x32, 0x41, 0x82, 0xda, 0x19, 0x3c, 0xff, 0xa7, 0x64, 0x17, 0xd4, 0x8c, 0x4f}, {0x0, 0xc4, 0x95, 0x51, 0x37, 0xf3, 0xa2, 0x66, 0x6e, 0xaa, 0xfb, 0x3f, 0x59, 0x9d, 0xcc, 0x8, 0xdc, 0x18, 0x49, 0x8d, 0xeb, 0x2f, 0x7e, 0xba, 0xb2, 0x76, 0x27, 0xe3, 0x85, 0x41, 0x10, 0xd4, 0xa5, 0x61, 0x30, 0xf4, 0x92, 0x56, 0x7, 0xc3, 0xcb, 0xf, 0x5e, 0x9a, 0xfc, 0x38, 0x69, 0xad, 0x79, 0xbd, 0xec, 0x28, 0x4e, 0x8a, 0xdb, 0x1f, 0x17, 0xd3, 0x82, 0x46, 0x20, 0xe4, 0xb5, 0x71, 0x57, 0x93, 0xc2, 0x6, 0x60, 0xa4, 0xf5, 0x31, 0x39, 0xfd, 0xac, 0x68, 0xe, 0xca, 0x9b, 0x5f, 0x8b, 0x4f, 0x1e, 0xda, 0xbc, 0x78, 0x29, 0xed, 0xe5, 0x21, 0x70, 0xb4, 0xd2, 0x16, 0x47, 0x83, 0xf2, 0x36, 0x67, 0xa3, 0xc5, 0x1, 0x50, 0x94, 0x9c, 0x58, 0x9, 0xcd, 0xab, 0x6f, 0x3e, 0xfa, 0x2e, 0xea, 0xbb, 0x7f, 0x19, 0xdd, 0x8c, 0x48, 0x40, 0x84, 0xd5, 0x11, 0x77, 0xb3, 0xe2, 0x26, 0xae, 0x6a, 0x3b, 0xff, 0x99, 0x5d, 0xc, 0xc8, 0xc0, 0x4, 0x55, 0x91, 0xf7, 0x33, 0x62, 0xa6, 0x72, 0xb6, 0xe7, 0x23, 0x45, 0x81, 0xd0, 0x14, 0x1c, 0xd8, 0x89, 0x4d, 0x2b, 0xef, 0xbe, 0x7a, 0xb, 0xcf, 0x9e, 0x5a, 0x3c, 0xf8, 0xa9, 0x6d, 0x65, 0xa1, 0xf0, 0x34, 0x52, 0x96, 0xc7, 0x3, 0xd7, 0x13, 0x42, 0x86, 0xe0, 0x24, 0x75, 0xb1, 0xb9, 0x7d, 0x2c, 0xe8, 0x8e, 0x4a, 0x1b, 0xdf, 0xf9, 0x3d, 0x6c, 0xa8, 0xce, 0xa, 0x5b, 0x9f, 0x97, 0x53, 0x2, 0xc6, 0xa0, 0x64, 
0x35, 0xf1, 0x25, 0xe1, 0xb0, 0x74, 0x12, 0xd6, 0x87, 0x43, 0x4b, 0x8f, 0xde, 0x1a, 0x7c, 0xb8, 0xe9, 0x2d, 0x5c, 0x98, 0xc9, 0xd, 0x6b, 0xaf, 0xfe, 0x3a, 0x32, 0xf6, 0xa7, 0x63, 0x5, 0xc1, 0x90, 0x54, 0x80, 0x44, 0x15, 0xd1, 0xb7, 0x73, 0x22, 0xe6, 0xee, 0x2a, 0x7b, 0xbf, 0xd9, 0x1d, 0x4c, 0x88}, {0x0, 0xc5, 0x97, 0x52, 0x33, 0xf6, 0xa4, 0x61, 0x66, 0xa3, 0xf1, 0x34, 0x55, 0x90, 0xc2, 0x7, 0xcc, 0x9, 0x5b, 0x9e, 0xff, 0x3a, 0x68, 0xad, 0xaa, 0x6f, 0x3d, 0xf8, 0x99, 0x5c, 0xe, 0xcb, 0x85, 0x40, 0x12, 0xd7, 0xb6, 0x73, 0x21, 0xe4, 0xe3, 0x26, 0x74, 0xb1, 0xd0, 0x15, 0x47, 0x82, 0x49, 0x8c, 0xde, 0x1b, 0x7a, 0xbf, 0xed, 0x28, 0x2f, 0xea, 0xb8, 0x7d, 0x1c, 0xd9, 0x8b, 0x4e, 0x17, 0xd2, 0x80, 0x45, 0x24, 0xe1, 0xb3, 0x76, 0x71, 0xb4, 0xe6, 0x23, 0x42, 0x87, 0xd5, 0x10, 0xdb, 0x1e, 0x4c, 0x89, 0xe8, 0x2d, 0x7f, 0xba, 0xbd, 0x78, 0x2a, 0xef, 0x8e, 0x4b, 0x19, 0xdc, 0x92, 0x57, 0x5, 0xc0, 0xa1, 0x64, 0x36, 0xf3, 0xf4, 0x31, 0x63, 0xa6, 0xc7, 0x2, 0x50, 0x95, 0x5e, 0x9b, 0xc9, 0xc, 0x6d, 0xa8, 0xfa, 0x3f, 0x38, 0xfd, 0xaf, 0x6a, 0xb, 0xce, 0x9c, 0x59, 0x2e, 0xeb, 0xb9, 0x7c, 0x1d, 0xd8, 0x8a, 0x4f, 0x48, 0x8d, 0xdf, 0x1a, 0x7b, 0xbe, 0xec, 0x29, 0xe2, 0x27, 0x75, 0xb0, 0xd1, 0x14, 0x46, 0x83, 0x84, 0x41, 0x13, 0xd6, 0xb7, 0x72, 0x20, 0xe5, 0xab, 0x6e, 0x3c, 0xf9, 0x98, 0x5d, 0xf, 0xca, 0xcd, 0x8, 0x5a, 0x9f, 0xfe, 0x3b, 0x69, 0xac, 0x67, 0xa2, 0xf0, 0x35, 0x54, 0x91, 0xc3, 0x6, 0x1, 0xc4, 0x96, 0x53, 0x32, 0xf7, 0xa5, 0x60, 0x39, 0xfc, 0xae, 0x6b, 0xa, 0xcf, 0x9d, 0x58, 0x5f, 0x9a, 0xc8, 0xd, 0x6c, 0xa9, 0xfb, 0x3e, 0xf5, 0x30, 0x62, 0xa7, 0xc6, 0x3, 0x51, 0x94, 0x93, 0x56, 0x4, 0xc1, 0xa0, 0x65, 0x37, 0xf2, 0xbc, 0x79, 0x2b, 0xee, 0x8f, 0x4a, 0x18, 0xdd, 0xda, 0x1f, 0x4d, 0x88, 0xe9, 0x2c, 0x7e, 0xbb, 0x70, 0xb5, 0xe7, 0x22, 0x43, 0x86, 0xd4, 0x11, 0x16, 0xd3, 0x81, 0x44, 0x25, 0xe0, 0xb2, 0x77}, {0x0, 0xc6, 0x91, 0x57, 0x3f, 0xf9, 0xae, 0x68, 0x7e, 0xb8, 0xef, 0x29, 0x41, 0x87, 0xd0, 0x16, 0xfc, 0x3a, 0x6d, 0xab, 0xc3, 0x5, 0x52, 0x94, 0x82, 0x44, 0x13, 0xd5, 0xbd, 0x7b, 0x2c, 0xea, 0xe5, 0x23, 0x74, 0xb2, 0xda, 0x1c, 0x4b, 0x8d, 0x9b, 0x5d, 0xa, 0xcc, 0xa4, 0x62, 0x35, 0xf3, 0x19, 0xdf, 0x88, 0x4e, 0x26, 0xe0, 0xb7, 0x71, 0x67, 0xa1, 0xf6, 0x30, 0x58, 0x9e, 0xc9, 0xf, 0xd7, 0x11, 0x46, 0x80, 0xe8, 0x2e, 0x79, 0xbf, 0xa9, 0x6f, 0x38, 0xfe, 0x96, 0x50, 0x7, 0xc1, 0x2b, 0xed, 0xba, 0x7c, 0x14, 0xd2, 0x85, 0x43, 0x55, 0x93, 0xc4, 0x2, 0x6a, 0xac, 0xfb, 0x3d, 0x32, 0xf4, 0xa3, 0x65, 0xd, 0xcb, 0x9c, 0x5a, 0x4c, 0x8a, 0xdd, 0x1b, 0x73, 0xb5, 0xe2, 0x24, 0xce, 0x8, 0x5f, 0x99, 0xf1, 0x37, 0x60, 0xa6, 0xb0, 0x76, 0x21, 0xe7, 0x8f, 0x49, 0x1e, 0xd8, 0xb3, 0x75, 0x22, 0xe4, 0x8c, 0x4a, 0x1d, 0xdb, 0xcd, 0xb, 0x5c, 0x9a, 0xf2, 0x34, 0x63, 0xa5, 0x4f, 0x89, 0xde, 0x18, 0x70, 0xb6, 0xe1, 0x27, 0x31, 0xf7, 0xa0, 0x66, 0xe, 0xc8, 0x9f, 0x59, 0x56, 0x90, 0xc7, 0x1, 0x69, 0xaf, 0xf8, 0x3e, 0x28, 0xee, 0xb9, 0x7f, 0x17, 0xd1, 0x86, 0x40, 0xaa, 0x6c, 0x3b, 0xfd, 0x95, 0x53, 0x4, 0xc2, 0xd4, 0x12, 0x45, 0x83, 0xeb, 0x2d, 0x7a, 0xbc, 0x64, 0xa2, 0xf5, 0x33, 0x5b, 0x9d, 0xca, 0xc, 0x1a, 0xdc, 0x8b, 0x4d, 0x25, 0xe3, 0xb4, 0x72, 0x98, 0x5e, 0x9, 0xcf, 0xa7, 0x61, 0x36, 0xf0, 0xe6, 0x20, 0x77, 0xb1, 0xd9, 0x1f, 0x48, 0x8e, 0x81, 0x47, 0x10, 0xd6, 0xbe, 0x78, 0x2f, 0xe9, 0xff, 0x39, 0x6e, 0xa8, 0xc0, 0x6, 0x51, 0x97, 0x7d, 0xbb, 0xec, 0x2a, 0x42, 0x84, 0xd3, 0x15, 0x3, 0xc5, 0x92, 0x54, 0x3c, 0xfa, 0xad, 0x6b}, {0x0, 0xc7, 0x93, 0x54, 0x3b, 0xfc, 0xa8, 0x6f, 0x76, 0xb1, 0xe5, 0x22, 0x4d, 0x8a, 0xde, 0x19, 0xec, 0x2b, 0x7f, 0xb8, 0xd7, 0x10, 0x44, 0x83, 0x9a, 0x5d, 0x9, 0xce, 0xa1, 0x66, 0x32, 0xf5, 0xc5, 0x2, 0x56, 
0x91, 0xfe, 0x39, 0x6d, 0xaa, 0xb3, 0x74, 0x20, 0xe7, 0x88, 0x4f, 0x1b, 0xdc, 0x29, 0xee, 0xba, 0x7d, 0x12, 0xd5, 0x81, 0x46, 0x5f, 0x98, 0xcc, 0xb, 0x64, 0xa3, 0xf7, 0x30, 0x97, 0x50, 0x4, 0xc3, 0xac, 0x6b, 0x3f, 0xf8, 0xe1, 0x26, 0x72, 0xb5, 0xda, 0x1d, 0x49, 0x8e, 0x7b, 0xbc, 0xe8, 0x2f, 0x40, 0x87, 0xd3, 0x14, 0xd, 0xca, 0x9e, 0x59, 0x36, 0xf1, 0xa5, 0x62, 0x52, 0x95, 0xc1, 0x6, 0x69, 0xae, 0xfa, 0x3d, 0x24, 0xe3, 0xb7, 0x70, 0x1f, 0xd8, 0x8c, 0x4b, 0xbe, 0x79, 0x2d, 0xea, 0x85, 0x42, 0x16, 0xd1, 0xc8, 0xf, 0x5b, 0x9c, 0xf3, 0x34, 0x60, 0xa7, 0x33, 0xf4, 0xa0, 0x67, 0x8, 0xcf, 0x9b, 0x5c, 0x45, 0x82, 0xd6, 0x11, 0x7e, 0xb9, 0xed, 0x2a, 0xdf, 0x18, 0x4c, 0x8b, 0xe4, 0x23, 0x77, 0xb0, 0xa9, 0x6e, 0x3a, 0xfd, 0x92, 0x55, 0x1, 0xc6, 0xf6, 0x31, 0x65, 0xa2, 0xcd, 0xa, 0x5e, 0x99, 0x80, 0x47, 0x13, 0xd4, 0xbb, 0x7c, 0x28, 0xef, 0x1a, 0xdd, 0x89, 0x4e, 0x21, 0xe6, 0xb2, 0x75, 0x6c, 0xab, 0xff, 0x38, 0x57, 0x90, 0xc4, 0x3, 0xa4, 0x63, 0x37, 0xf0, 0x9f, 0x58, 0xc, 0xcb, 0xd2, 0x15, 0x41, 0x86, 0xe9, 0x2e, 0x7a, 0xbd, 0x48, 0x8f, 0xdb, 0x1c, 0x73, 0xb4, 0xe0, 0x27, 0x3e, 0xf9, 0xad, 0x6a, 0x5, 0xc2, 0x96, 0x51, 0x61, 0xa6, 0xf2, 0x35, 0x5a, 0x9d, 0xc9, 0xe, 0x17, 0xd0, 0x84, 0x43, 0x2c, 0xeb, 0xbf, 0x78, 0x8d, 0x4a, 0x1e, 0xd9, 0xb6, 0x71, 0x25, 0xe2, 0xfb, 0x3c, 0x68, 0xaf, 0xc0, 0x7, 0x53, 0x94}, {0x0, 0xc8, 0x8d, 0x45, 0x7, 0xcf, 0x8a, 0x42, 0xe, 0xc6, 0x83, 0x4b, 0x9, 0xc1, 0x84, 0x4c, 0x1c, 0xd4, 0x91, 0x59, 0x1b, 0xd3, 0x96, 0x5e, 0x12, 0xda, 0x9f, 0x57, 0x15, 0xdd, 0x98, 0x50, 0x38, 0xf0, 0xb5, 0x7d, 0x3f, 0xf7, 0xb2, 0x7a, 0x36, 0xfe, 0xbb, 0x73, 0x31, 0xf9, 0xbc, 0x74, 0x24, 0xec, 0xa9, 0x61, 0x23, 0xeb, 0xae, 0x66, 0x2a, 0xe2, 0xa7, 0x6f, 0x2d, 0xe5, 0xa0, 0x68, 0x70, 0xb8, 0xfd, 0x35, 0x77, 0xbf, 0xfa, 0x32, 0x7e, 0xb6, 0xf3, 0x3b, 0x79, 0xb1, 0xf4, 0x3c, 0x6c, 0xa4, 0xe1, 0x29, 0x6b, 0xa3, 0xe6, 0x2e, 0x62, 0xaa, 0xef, 0x27, 0x65, 0xad, 0xe8, 0x20, 0x48, 0x80, 0xc5, 0xd, 0x4f, 0x87, 0xc2, 0xa, 0x46, 0x8e, 0xcb, 0x3, 0x41, 0x89, 0xcc, 0x4, 0x54, 0x9c, 0xd9, 0x11, 0x53, 0x9b, 0xde, 0x16, 0x5a, 0x92, 0xd7, 0x1f, 0x5d, 0x95, 0xd0, 0x18, 0xe0, 0x28, 0x6d, 0xa5, 0xe7, 0x2f, 0x6a, 0xa2, 0xee, 0x26, 0x63, 0xab, 0xe9, 0x21, 0x64, 0xac, 0xfc, 0x34, 0x71, 0xb9, 0xfb, 0x33, 0x76, 0xbe, 0xf2, 0x3a, 0x7f, 0xb7, 0xf5, 0x3d, 0x78, 0xb0, 0xd8, 0x10, 0x55, 0x9d, 0xdf, 0x17, 0x52, 0x9a, 0xd6, 0x1e, 0x5b, 0x93, 0xd1, 0x19, 0x5c, 0x94, 0xc4, 0xc, 0x49, 0x81, 0xc3, 0xb, 0x4e, 0x86, 0xca, 0x2, 0x47, 0x8f, 0xcd, 0x5, 0x40, 0x88, 0x90, 0x58, 0x1d, 0xd5, 0x97, 0x5f, 0x1a, 0xd2, 0x9e, 0x56, 0x13, 0xdb, 0x99, 0x51, 0x14, 0xdc, 0x8c, 0x44, 0x1, 0xc9, 0x8b, 0x43, 0x6, 0xce, 0x82, 0x4a, 0xf, 0xc7, 0x85, 0x4d, 0x8, 0xc0, 0xa8, 0x60, 0x25, 0xed, 0xaf, 0x67, 0x22, 0xea, 0xa6, 0x6e, 0x2b, 0xe3, 0xa1, 0x69, 0x2c, 0xe4, 0xb4, 0x7c, 0x39, 0xf1, 0xb3, 0x7b, 0x3e, 0xf6, 0xba, 0x72, 0x37, 0xff, 0xbd, 0x75, 0x30, 0xf8}, {0x0, 0xc9, 0x8f, 0x46, 0x3, 0xca, 0x8c, 0x45, 0x6, 0xcf, 0x89, 0x40, 0x5, 0xcc, 0x8a, 0x43, 0xc, 0xc5, 0x83, 0x4a, 0xf, 0xc6, 0x80, 0x49, 0xa, 0xc3, 0x85, 0x4c, 0x9, 0xc0, 0x86, 0x4f, 0x18, 0xd1, 0x97, 0x5e, 0x1b, 0xd2, 0x94, 0x5d, 0x1e, 0xd7, 0x91, 0x58, 0x1d, 0xd4, 0x92, 0x5b, 0x14, 0xdd, 0x9b, 0x52, 0x17, 0xde, 0x98, 0x51, 0x12, 0xdb, 0x9d, 0x54, 0x11, 0xd8, 0x9e, 0x57, 0x30, 0xf9, 0xbf, 0x76, 0x33, 0xfa, 0xbc, 0x75, 0x36, 0xff, 0xb9, 0x70, 0x35, 0xfc, 0xba, 0x73, 0x3c, 0xf5, 0xb3, 0x7a, 0x3f, 0xf6, 0xb0, 0x79, 0x3a, 0xf3, 0xb5, 0x7c, 0x39, 0xf0, 0xb6, 0x7f, 0x28, 0xe1, 0xa7, 0x6e, 0x2b, 0xe2, 0xa4, 0x6d, 0x2e, 0xe7, 0xa1, 0x68, 0x2d, 0xe4, 0xa2, 0x6b, 0x24, 0xed, 0xab, 0x62, 0x27, 0xee, 0xa8, 0x61, 0x22, 
0xeb, 0xad, 0x64, 0x21, 0xe8, 0xae, 0x67, 0x60, 0xa9, 0xef, 0x26, 0x63, 0xaa, 0xec, 0x25, 0x66, 0xaf, 0xe9, 0x20, 0x65, 0xac, 0xea, 0x23, 0x6c, 0xa5, 0xe3, 0x2a, 0x6f, 0xa6, 0xe0, 0x29, 0x6a, 0xa3, 0xe5, 0x2c, 0x69, 0xa0, 0xe6, 0x2f, 0x78, 0xb1, 0xf7, 0x3e, 0x7b, 0xb2, 0xf4, 0x3d, 0x7e, 0xb7, 0xf1, 0x38, 0x7d, 0xb4, 0xf2, 0x3b, 0x74, 0xbd, 0xfb, 0x32, 0x77, 0xbe, 0xf8, 0x31, 0x72, 0xbb, 0xfd, 0x34, 0x71, 0xb8, 0xfe, 0x37, 0x50, 0x99, 0xdf, 0x16, 0x53, 0x9a, 0xdc, 0x15, 0x56, 0x9f, 0xd9, 0x10, 0x55, 0x9c, 0xda, 0x13, 0x5c, 0x95, 0xd3, 0x1a, 0x5f, 0x96, 0xd0, 0x19, 0x5a, 0x93, 0xd5, 0x1c, 0x59, 0x90, 0xd6, 0x1f, 0x48, 0x81, 0xc7, 0xe, 0x4b, 0x82, 0xc4, 0xd, 0x4e, 0x87, 0xc1, 0x8, 0x4d, 0x84, 0xc2, 0xb, 0x44, 0x8d, 0xcb, 0x2, 0x47, 0x8e, 0xc8, 0x1, 0x42, 0x8b, 0xcd, 0x4, 0x41, 0x88, 0xce, 0x7}, {0x0, 0xca, 0x89, 0x43, 0xf, 0xc5, 0x86, 0x4c, 0x1e, 0xd4, 0x97, 0x5d, 0x11, 0xdb, 0x98, 0x52, 0x3c, 0xf6, 0xb5, 0x7f, 0x33, 0xf9, 0xba, 0x70, 0x22, 0xe8, 0xab, 0x61, 0x2d, 0xe7, 0xa4, 0x6e, 0x78, 0xb2, 0xf1, 0x3b, 0x77, 0xbd, 0xfe, 0x34, 0x66, 0xac, 0xef, 0x25, 0x69, 0xa3, 0xe0, 0x2a, 0x44, 0x8e, 0xcd, 0x7, 0x4b, 0x81, 0xc2, 0x8, 0x5a, 0x90, 0xd3, 0x19, 0x55, 0x9f, 0xdc, 0x16, 0xf0, 0x3a, 0x79, 0xb3, 0xff, 0x35, 0x76, 0xbc, 0xee, 0x24, 0x67, 0xad, 0xe1, 0x2b, 0x68, 0xa2, 0xcc, 0x6, 0x45, 0x8f, 0xc3, 0x9, 0x4a, 0x80, 0xd2, 0x18, 0x5b, 0x91, 0xdd, 0x17, 0x54, 0x9e, 0x88, 0x42, 0x1, 0xcb, 0x87, 0x4d, 0xe, 0xc4, 0x96, 0x5c, 0x1f, 0xd5, 0x99, 0x53, 0x10, 0xda, 0xb4, 0x7e, 0x3d, 0xf7, 0xbb, 0x71, 0x32, 0xf8, 0xaa, 0x60, 0x23, 0xe9, 0xa5, 0x6f, 0x2c, 0xe6, 0xfd, 0x37, 0x74, 0xbe, 0xf2, 0x38, 0x7b, 0xb1, 0xe3, 0x29, 0x6a, 0xa0, 0xec, 0x26, 0x65, 0xaf, 0xc1, 0xb, 0x48, 0x82, 0xce, 0x4, 0x47, 0x8d, 0xdf, 0x15, 0x56, 0x9c, 0xd0, 0x1a, 0x59, 0x93, 0x85, 0x4f, 0xc, 0xc6, 0x8a, 0x40, 0x3, 0xc9, 0x9b, 0x51, 0x12, 0xd8, 0x94, 0x5e, 0x1d, 0xd7, 0xb9, 0x73, 0x30, 0xfa, 0xb6, 0x7c, 0x3f, 0xf5, 0xa7, 0x6d, 0x2e, 0xe4, 0xa8, 0x62, 0x21, 0xeb, 0xd, 0xc7, 0x84, 0x4e, 0x2, 0xc8, 0x8b, 0x41, 0x13, 0xd9, 0x9a, 0x50, 0x1c, 0xd6, 0x95, 0x5f, 0x31, 0xfb, 0xb8, 0x72, 0x3e, 0xf4, 0xb7, 0x7d, 0x2f, 0xe5, 0xa6, 0x6c, 0x20, 0xea, 0xa9, 0x63, 0x75, 0xbf, 0xfc, 0x36, 0x7a, 0xb0, 0xf3, 0x39, 0x6b, 0xa1, 0xe2, 0x28, 0x64, 0xae, 0xed, 0x27, 0x49, 0x83, 0xc0, 0xa, 0x46, 0x8c, 0xcf, 0x5, 0x57, 0x9d, 0xde, 0x14, 0x58, 0x92, 0xd1, 0x1b}, {0x0, 0xcb, 0x8b, 0x40, 0xb, 0xc0, 0x80, 0x4b, 0x16, 0xdd, 0x9d, 0x56, 0x1d, 0xd6, 0x96, 0x5d, 0x2c, 0xe7, 0xa7, 0x6c, 0x27, 0xec, 0xac, 0x67, 0x3a, 0xf1, 0xb1, 0x7a, 0x31, 0xfa, 0xba, 0x71, 0x58, 0x93, 0xd3, 0x18, 0x53, 0x98, 0xd8, 0x13, 0x4e, 0x85, 0xc5, 0xe, 0x45, 0x8e, 0xce, 0x5, 0x74, 0xbf, 0xff, 0x34, 0x7f, 0xb4, 0xf4, 0x3f, 0x62, 0xa9, 0xe9, 0x22, 0x69, 0xa2, 0xe2, 0x29, 0xb0, 0x7b, 0x3b, 0xf0, 0xbb, 0x70, 0x30, 0xfb, 0xa6, 0x6d, 0x2d, 0xe6, 0xad, 0x66, 0x26, 0xed, 0x9c, 0x57, 0x17, 0xdc, 0x97, 0x5c, 0x1c, 0xd7, 0x8a, 0x41, 0x1, 0xca, 0x81, 0x4a, 0xa, 0xc1, 0xe8, 0x23, 0x63, 0xa8, 0xe3, 0x28, 0x68, 0xa3, 0xfe, 0x35, 0x75, 0xbe, 0xf5, 0x3e, 0x7e, 0xb5, 0xc4, 0xf, 0x4f, 0x84, 0xcf, 0x4, 0x44, 0x8f, 0xd2, 0x19, 0x59, 0x92, 0xd9, 0x12, 0x52, 0x99, 0x7d, 0xb6, 0xf6, 0x3d, 0x76, 0xbd, 0xfd, 0x36, 0x6b, 0xa0, 0xe0, 0x2b, 0x60, 0xab, 0xeb, 0x20, 0x51, 0x9a, 0xda, 0x11, 0x5a, 0x91, 0xd1, 0x1a, 0x47, 0x8c, 0xcc, 0x7, 0x4c, 0x87, 0xc7, 0xc, 0x25, 0xee, 0xae, 0x65, 0x2e, 0xe5, 0xa5, 0x6e, 0x33, 0xf8, 0xb8, 0x73, 0x38, 0xf3, 0xb3, 0x78, 0x9, 0xc2, 0x82, 0x49, 0x2, 0xc9, 0x89, 0x42, 0x1f, 0xd4, 0x94, 0x5f, 0x14, 0xdf, 0x9f, 0x54, 0xcd, 0x6, 0x46, 0x8d, 0xc6, 0xd, 0x4d, 0x86, 0xdb, 0x10, 0x50, 0x9b, 0xd0, 0x1b, 0x5b, 
0x90, 0xe1, 0x2a, 0x6a, 0xa1, 0xea, 0x21, 0x61, 0xaa, 0xf7, 0x3c, 0x7c, 0xb7, 0xfc, 0x37, 0x77, 0xbc, 0x95, 0x5e, 0x1e, 0xd5, 0x9e, 0x55, 0x15, 0xde, 0x83, 0x48, 0x8, 0xc3, 0x88, 0x43, 0x3, 0xc8, 0xb9, 0x72, 0x32, 0xf9, 0xb2, 0x79, 0x39, 0xf2, 0xaf, 0x64, 0x24, 0xef, 0xa4, 0x6f, 0x2f, 0xe4}, {0x0, 0xcc, 0x85, 0x49, 0x17, 0xdb, 0x92, 0x5e, 0x2e, 0xe2, 0xab, 0x67, 0x39, 0xf5, 0xbc, 0x70, 0x5c, 0x90, 0xd9, 0x15, 0x4b, 0x87, 0xce, 0x2, 0x72, 0xbe, 0xf7, 0x3b, 0x65, 0xa9, 0xe0, 0x2c, 0xb8, 0x74, 0x3d, 0xf1, 0xaf, 0x63, 0x2a, 0xe6, 0x96, 0x5a, 0x13, 0xdf, 0x81, 0x4d, 0x4, 0xc8, 0xe4, 0x28, 0x61, 0xad, 0xf3, 0x3f, 0x76, 0xba, 0xca, 0x6, 0x4f, 0x83, 0xdd, 0x11, 0x58, 0x94, 0x6d, 0xa1, 0xe8, 0x24, 0x7a, 0xb6, 0xff, 0x33, 0x43, 0x8f, 0xc6, 0xa, 0x54, 0x98, 0xd1, 0x1d, 0x31, 0xfd, 0xb4, 0x78, 0x26, 0xea, 0xa3, 0x6f, 0x1f, 0xd3, 0x9a, 0x56, 0x8, 0xc4, 0x8d, 0x41, 0xd5, 0x19, 0x50, 0x9c, 0xc2, 0xe, 0x47, 0x8b, 0xfb, 0x37, 0x7e, 0xb2, 0xec, 0x20, 0x69, 0xa5, 0x89, 0x45, 0xc, 0xc0, 0x9e, 0x52, 0x1b, 0xd7, 0xa7, 0x6b, 0x22, 0xee, 0xb0, 0x7c, 0x35, 0xf9, 0xda, 0x16, 0x5f, 0x93, 0xcd, 0x1, 0x48, 0x84, 0xf4, 0x38, 0x71, 0xbd, 0xe3, 0x2f, 0x66, 0xaa, 0x86, 0x4a, 0x3, 0xcf, 0x91, 0x5d, 0x14, 0xd8, 0xa8, 0x64, 0x2d, 0xe1, 0xbf, 0x73, 0x3a, 0xf6, 0x62, 0xae, 0xe7, 0x2b, 0x75, 0xb9, 0xf0, 0x3c, 0x4c, 0x80, 0xc9, 0x5, 0x5b, 0x97, 0xde, 0x12, 0x3e, 0xf2, 0xbb, 0x77, 0x29, 0xe5, 0xac, 0x60, 0x10, 0xdc, 0x95, 0x59, 0x7, 0xcb, 0x82, 0x4e, 0xb7, 0x7b, 0x32, 0xfe, 0xa0, 0x6c, 0x25, 0xe9, 0x99, 0x55, 0x1c, 0xd0, 0x8e, 0x42, 0xb, 0xc7, 0xeb, 0x27, 0x6e, 0xa2, 0xfc, 0x30, 0x79, 0xb5, 0xc5, 0x9, 0x40, 0x8c, 0xd2, 0x1e, 0x57, 0x9b, 0xf, 0xc3, 0x8a, 0x46, 0x18, 0xd4, 0x9d, 0x51, 0x21, 0xed, 0xa4, 0x68, 0x36, 0xfa, 0xb3, 0x7f, 0x53, 0x9f, 0xd6, 0x1a, 0x44, 0x88, 0xc1, 0xd, 0x7d, 0xb1, 0xf8, 0x34, 0x6a, 0xa6, 0xef, 0x23}, {0x0, 0xcd, 0x87, 0x4a, 0x13, 0xde, 0x94, 0x59, 0x26, 0xeb, 0xa1, 0x6c, 0x35, 0xf8, 0xb2, 0x7f, 0x4c, 0x81, 0xcb, 0x6, 0x5f, 0x92, 0xd8, 0x15, 0x6a, 0xa7, 0xed, 0x20, 0x79, 0xb4, 0xfe, 0x33, 0x98, 0x55, 0x1f, 0xd2, 0x8b, 0x46, 0xc, 0xc1, 0xbe, 0x73, 0x39, 0xf4, 0xad, 0x60, 0x2a, 0xe7, 0xd4, 0x19, 0x53, 0x9e, 0xc7, 0xa, 0x40, 0x8d, 0xf2, 0x3f, 0x75, 0xb8, 0xe1, 0x2c, 0x66, 0xab, 0x2d, 0xe0, 0xaa, 0x67, 0x3e, 0xf3, 0xb9, 0x74, 0xb, 0xc6, 0x8c, 0x41, 0x18, 0xd5, 0x9f, 0x52, 0x61, 0xac, 0xe6, 0x2b, 0x72, 0xbf, 0xf5, 0x38, 0x47, 0x8a, 0xc0, 0xd, 0x54, 0x99, 0xd3, 0x1e, 0xb5, 0x78, 0x32, 0xff, 0xa6, 0x6b, 0x21, 0xec, 0x93, 0x5e, 0x14, 0xd9, 0x80, 0x4d, 0x7, 0xca, 0xf9, 0x34, 0x7e, 0xb3, 0xea, 0x27, 0x6d, 0xa0, 0xdf, 0x12, 0x58, 0x95, 0xcc, 0x1, 0x4b, 0x86, 0x5a, 0x97, 0xdd, 0x10, 0x49, 0x84, 0xce, 0x3, 0x7c, 0xb1, 0xfb, 0x36, 0x6f, 0xa2, 0xe8, 0x25, 0x16, 0xdb, 0x91, 0x5c, 0x5, 0xc8, 0x82, 0x4f, 0x30, 0xfd, 0xb7, 0x7a, 0x23, 0xee, 0xa4, 0x69, 0xc2, 0xf, 0x45, 0x88, 0xd1, 0x1c, 0x56, 0x9b, 0xe4, 0x29, 0x63, 0xae, 0xf7, 0x3a, 0x70, 0xbd, 0x8e, 0x43, 0x9, 0xc4, 0x9d, 0x50, 0x1a, 0xd7, 0xa8, 0x65, 0x2f, 0xe2, 0xbb, 0x76, 0x3c, 0xf1, 0x77, 0xba, 0xf0, 0x3d, 0x64, 0xa9, 0xe3, 0x2e, 0x51, 0x9c, 0xd6, 0x1b, 0x42, 0x8f, 0xc5, 0x8, 0x3b, 0xf6, 0xbc, 0x71, 0x28, 0xe5, 0xaf, 0x62, 0x1d, 0xd0, 0x9a, 0x57, 0xe, 0xc3, 0x89, 0x44, 0xef, 0x22, 0x68, 0xa5, 0xfc, 0x31, 0x7b, 0xb6, 0xc9, 0x4, 0x4e, 0x83, 0xda, 0x17, 0x5d, 0x90, 0xa3, 0x6e, 0x24, 0xe9, 0xb0, 0x7d, 0x37, 0xfa, 0x85, 0x48, 0x2, 0xcf, 0x96, 0x5b, 0x11, 0xdc}, {0x0, 0xce, 0x81, 0x4f, 0x1f, 0xd1, 0x9e, 0x50, 0x3e, 0xf0, 0xbf, 0x71, 0x21, 0xef, 0xa0, 0x6e, 0x7c, 0xb2, 0xfd, 0x33, 0x63, 0xad, 0xe2, 0x2c, 0x42, 0x8c, 0xc3, 0xd, 0x5d, 0x93, 0xdc, 0x12, 0xf8, 0x36, 0x79, 0xb7, 
0xe7, 0x29, 0x66, 0xa8, 0xc6, 0x8, 0x47, 0x89, 0xd9, 0x17, 0x58, 0x96, 0x84, 0x4a, 0x5, 0xcb, 0x9b, 0x55, 0x1a, 0xd4, 0xba, 0x74, 0x3b, 0xf5, 0xa5, 0x6b, 0x24, 0xea, 0xed, 0x23, 0x6c, 0xa2, 0xf2, 0x3c, 0x73, 0xbd, 0xd3, 0x1d, 0x52, 0x9c, 0xcc, 0x2, 0x4d, 0x83, 0x91, 0x5f, 0x10, 0xde, 0x8e, 0x40, 0xf, 0xc1, 0xaf, 0x61, 0x2e, 0xe0, 0xb0, 0x7e, 0x31, 0xff, 0x15, 0xdb, 0x94, 0x5a, 0xa, 0xc4, 0x8b, 0x45, 0x2b, 0xe5, 0xaa, 0x64, 0x34, 0xfa, 0xb5, 0x7b, 0x69, 0xa7, 0xe8, 0x26, 0x76, 0xb8, 0xf7, 0x39, 0x57, 0x99, 0xd6, 0x18, 0x48, 0x86, 0xc9, 0x7, 0xc7, 0x9, 0x46, 0x88, 0xd8, 0x16, 0x59, 0x97, 0xf9, 0x37, 0x78, 0xb6, 0xe6, 0x28, 0x67, 0xa9, 0xbb, 0x75, 0x3a, 0xf4, 0xa4, 0x6a, 0x25, 0xeb, 0x85, 0x4b, 0x4, 0xca, 0x9a, 0x54, 0x1b, 0xd5, 0x3f, 0xf1, 0xbe, 0x70, 0x20, 0xee, 0xa1, 0x6f, 0x1, 0xcf, 0x80, 0x4e, 0x1e, 0xd0, 0x9f, 0x51, 0x43, 0x8d, 0xc2, 0xc, 0x5c, 0x92, 0xdd, 0x13, 0x7d, 0xb3, 0xfc, 0x32, 0x62, 0xac, 0xe3, 0x2d, 0x2a, 0xe4, 0xab, 0x65, 0x35, 0xfb, 0xb4, 0x7a, 0x14, 0xda, 0x95, 0x5b, 0xb, 0xc5, 0x8a, 0x44, 0x56, 0x98, 0xd7, 0x19, 0x49, 0x87, 0xc8, 0x6, 0x68, 0xa6, 0xe9, 0x27, 0x77, 0xb9, 0xf6, 0x38, 0xd2, 0x1c, 0x53, 0x9d, 0xcd, 0x3, 0x4c, 0x82, 0xec, 0x22, 0x6d, 0xa3, 0xf3, 0x3d, 0x72, 0xbc, 0xae, 0x60, 0x2f, 0xe1, 0xb1, 0x7f, 0x30, 0xfe, 0x90, 0x5e, 0x11, 0xdf, 0x8f, 0x41, 0xe, 0xc0}, {0x0, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61, 0x6c, 0xa3, 0xef, 0x20, 0x77, 0xb8, 0xf4, 0x3b, 0x5a, 0x95, 0xd9, 0x16, 0x41, 0x8e, 0xc2, 0xd, 0xd8, 0x17, 0x5b, 0x94, 0xc3, 0xc, 0x40, 0x8f, 0xee, 0x21, 0x6d, 0xa2, 0xf5, 0x3a, 0x76, 0xb9, 0xb4, 0x7b, 0x37, 0xf8, 0xaf, 0x60, 0x2c, 0xe3, 0x82, 0x4d, 0x1, 0xce, 0x99, 0x56, 0x1a, 0xd5, 0xad, 0x62, 0x2e, 0xe1, 0xb6, 0x79, 0x35, 0xfa, 0x9b, 0x54, 0x18, 0xd7, 0x80, 0x4f, 0x3, 0xcc, 0xc1, 0xe, 0x42, 0x8d, 0xda, 0x15, 0x59, 0x96, 0xf7, 0x38, 0x74, 0xbb, 0xec, 0x23, 0x6f, 0xa0, 0x75, 0xba, 0xf6, 0x39, 0x6e, 0xa1, 0xed, 0x22, 0x43, 0x8c, 0xc0, 0xf, 0x58, 0x97, 0xdb, 0x14, 0x19, 0xd6, 0x9a, 0x55, 0x2, 0xcd, 0x81, 0x4e, 0x2f, 0xe0, 0xac, 0x63, 0x34, 0xfb, 0xb7, 0x78, 0x47, 0x88, 0xc4, 0xb, 0x5c, 0x93, 0xdf, 0x10, 0x71, 0xbe, 0xf2, 0x3d, 0x6a, 0xa5, 0xe9, 0x26, 0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c, 0x1d, 0xd2, 0x9e, 0x51, 0x6, 0xc9, 0x85, 0x4a, 0x9f, 0x50, 0x1c, 0xd3, 0x84, 0x4b, 0x7, 0xc8, 0xa9, 0x66, 0x2a, 0xe5, 0xb2, 0x7d, 0x31, 0xfe, 0xf3, 0x3c, 0x70, 0xbf, 0xe8, 0x27, 0x6b, 0xa4, 0xc5, 0xa, 0x46, 0x89, 0xde, 0x11, 0x5d, 0x92, 0xea, 0x25, 0x69, 0xa6, 0xf1, 0x3e, 0x72, 0xbd, 0xdc, 0x13, 0x5f, 0x90, 0xc7, 0x8, 0x44, 0x8b, 0x86, 0x49, 0x5, 0xca, 0x9d, 0x52, 0x1e, 0xd1, 0xb0, 0x7f, 0x33, 0xfc, 0xab, 0x64, 0x28, 0xe7, 0x32, 0xfd, 0xb1, 0x7e, 0x29, 0xe6, 0xaa, 0x65, 0x4, 0xcb, 0x87, 0x48, 0x1f, 0xd0, 0x9c, 0x53, 0x5e, 0x91, 0xdd, 0x12, 0x45, 0x8a, 0xc6, 0x9, 0x68, 0xa7, 0xeb, 0x24, 0x73, 0xbc, 0xf0, 0x3f}, {0x0, 0xd0, 0xbd, 0x6d, 0x67, 0xb7, 0xda, 0xa, 0xce, 0x1e, 0x73, 0xa3, 0xa9, 0x79, 0x14, 0xc4, 0x81, 0x51, 0x3c, 0xec, 0xe6, 0x36, 0x5b, 0x8b, 0x4f, 0x9f, 0xf2, 0x22, 0x28, 0xf8, 0x95, 0x45, 0x1f, 0xcf, 0xa2, 0x72, 0x78, 0xa8, 0xc5, 0x15, 0xd1, 0x1, 0x6c, 0xbc, 0xb6, 0x66, 0xb, 0xdb, 0x9e, 0x4e, 0x23, 0xf3, 0xf9, 0x29, 0x44, 0x94, 0x50, 0x80, 0xed, 0x3d, 0x37, 0xe7, 0x8a, 0x5a, 0x3e, 0xee, 0x83, 0x53, 0x59, 0x89, 0xe4, 0x34, 0xf0, 0x20, 0x4d, 0x9d, 0x97, 0x47, 0x2a, 0xfa, 0xbf, 0x6f, 0x2, 0xd2, 0xd8, 0x8, 0x65, 0xb5, 0x71, 0xa1, 0xcc, 0x1c, 0x16, 0xc6, 0xab, 0x7b, 0x21, 0xf1, 0x9c, 0x4c, 0x46, 0x96, 0xfb, 0x2b, 0xef, 0x3f, 0x52, 0x82, 0x88, 0x58, 0x35, 0xe5, 0xa0, 0x70, 0x1d, 0xcd, 0xc7, 0x17, 0x7a, 0xaa, 0x6e, 
0xbe, 0xd3, 0x3, 0x9, 0xd9, 0xb4, 0x64, 0x7c, 0xac, 0xc1, 0x11, 0x1b, 0xcb, 0xa6, 0x76, 0xb2, 0x62, 0xf, 0xdf, 0xd5, 0x5, 0x68, 0xb8, 0xfd, 0x2d, 0x40, 0x90, 0x9a, 0x4a, 0x27, 0xf7, 0x33, 0xe3, 0x8e, 0x5e, 0x54, 0x84, 0xe9, 0x39, 0x63, 0xb3, 0xde, 0xe, 0x4, 0xd4, 0xb9, 0x69, 0xad, 0x7d, 0x10, 0xc0, 0xca, 0x1a, 0x77, 0xa7, 0xe2, 0x32, 0x5f, 0x8f, 0x85, 0x55, 0x38, 0xe8, 0x2c, 0xfc, 0x91, 0x41, 0x4b, 0x9b, 0xf6, 0x26, 0x42, 0x92, 0xff, 0x2f, 0x25, 0xf5, 0x98, 0x48, 0x8c, 0x5c, 0x31, 0xe1, 0xeb, 0x3b, 0x56, 0x86, 0xc3, 0x13, 0x7e, 0xae, 0xa4, 0x74, 0x19, 0xc9, 0xd, 0xdd, 0xb0, 0x60, 0x6a, 0xba, 0xd7, 0x7, 0x5d, 0x8d, 0xe0, 0x30, 0x3a, 0xea, 0x87, 0x57, 0x93, 0x43, 0x2e, 0xfe, 0xf4, 0x24, 0x49, 0x99, 0xdc, 0xc, 0x61, 0xb1, 0xbb, 0x6b, 0x6, 0xd6, 0x12, 0xc2, 0xaf, 0x7f, 0x75, 0xa5, 0xc8, 0x18}, {0x0, 0xd1, 0xbf, 0x6e, 0x63, 0xb2, 0xdc, 0xd, 0xc6, 0x17, 0x79, 0xa8, 0xa5, 0x74, 0x1a, 0xcb, 0x91, 0x40, 0x2e, 0xff, 0xf2, 0x23, 0x4d, 0x9c, 0x57, 0x86, 0xe8, 0x39, 0x34, 0xe5, 0x8b, 0x5a, 0x3f, 0xee, 0x80, 0x51, 0x5c, 0x8d, 0xe3, 0x32, 0xf9, 0x28, 0x46, 0x97, 0x9a, 0x4b, 0x25, 0xf4, 0xae, 0x7f, 0x11, 0xc0, 0xcd, 0x1c, 0x72, 0xa3, 0x68, 0xb9, 0xd7, 0x6, 0xb, 0xda, 0xb4, 0x65, 0x7e, 0xaf, 0xc1, 0x10, 0x1d, 0xcc, 0xa2, 0x73, 0xb8, 0x69, 0x7, 0xd6, 0xdb, 0xa, 0x64, 0xb5, 0xef, 0x3e, 0x50, 0x81, 0x8c, 0x5d, 0x33, 0xe2, 0x29, 0xf8, 0x96, 0x47, 0x4a, 0x9b, 0xf5, 0x24, 0x41, 0x90, 0xfe, 0x2f, 0x22, 0xf3, 0x9d, 0x4c, 0x87, 0x56, 0x38, 0xe9, 0xe4, 0x35, 0x5b, 0x8a, 0xd0, 0x1, 0x6f, 0xbe, 0xb3, 0x62, 0xc, 0xdd, 0x16, 0xc7, 0xa9, 0x78, 0x75, 0xa4, 0xca, 0x1b, 0xfc, 0x2d, 0x43, 0x92, 0x9f, 0x4e, 0x20, 0xf1, 0x3a, 0xeb, 0x85, 0x54, 0x59, 0x88, 0xe6, 0x37, 0x6d, 0xbc, 0xd2, 0x3, 0xe, 0xdf, 0xb1, 0x60, 0xab, 0x7a, 0x14, 0xc5, 0xc8, 0x19, 0x77, 0xa6, 0xc3, 0x12, 0x7c, 0xad, 0xa0, 0x71, 0x1f, 0xce, 0x5, 0xd4, 0xba, 0x6b, 0x66, 0xb7, 0xd9, 0x8, 0x52, 0x83, 0xed, 0x3c, 0x31, 0xe0, 0x8e, 0x5f, 0x94, 0x45, 0x2b, 0xfa, 0xf7, 0x26, 0x48, 0x99, 0x82, 0x53, 0x3d, 0xec, 0xe1, 0x30, 0x5e, 0x8f, 0x44, 0x95, 0xfb, 0x2a, 0x27, 0xf6, 0x98, 0x49, 0x13, 0xc2, 0xac, 0x7d, 0x70, 0xa1, 0xcf, 0x1e, 0xd5, 0x4, 0x6a, 0xbb, 0xb6, 0x67, 0x9, 0xd8, 0xbd, 0x6c, 0x2, 0xd3, 0xde, 0xf, 0x61, 0xb0, 0x7b, 0xaa, 0xc4, 0x15, 0x18, 0xc9, 0xa7, 0x76, 0x2c, 0xfd, 0x93, 0x42, 0x4f, 0x9e, 0xf0, 0x21, 0xea, 0x3b, 0x55, 0x84, 0x89, 0x58, 0x36, 0xe7}, {0x0, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x4, 0xde, 0xc, 0x67, 0xb5, 0xb1, 0x63, 0x8, 0xda, 0xa1, 0x73, 0x18, 0xca, 0xce, 0x1c, 0x77, 0xa5, 0x7f, 0xad, 0xc6, 0x14, 0x10, 0xc2, 0xa9, 0x7b, 0x5f, 0x8d, 0xe6, 0x34, 0x30, 0xe2, 0x89, 0x5b, 0x81, 0x53, 0x38, 0xea, 0xee, 0x3c, 0x57, 0x85, 0xfe, 0x2c, 0x47, 0x95, 0x91, 0x43, 0x28, 0xfa, 0x20, 0xf2, 0x99, 0x4b, 0x4f, 0x9d, 0xf6, 0x24, 0xbe, 0x6c, 0x7, 0xd5, 0xd1, 0x3, 0x68, 0xba, 0x60, 0xb2, 0xd9, 0xb, 0xf, 0xdd, 0xb6, 0x64, 0x1f, 0xcd, 0xa6, 0x74, 0x70, 0xa2, 0xc9, 0x1b, 0xc1, 0x13, 0x78, 0xaa, 0xae, 0x7c, 0x17, 0xc5, 0xe1, 0x33, 0x58, 0x8a, 0x8e, 0x5c, 0x37, 0xe5, 0x3f, 0xed, 0x86, 0x54, 0x50, 0x82, 0xe9, 0x3b, 0x40, 0x92, 0xf9, 0x2b, 0x2f, 0xfd, 0x96, 0x44, 0x9e, 0x4c, 0x27, 0xf5, 0xf1, 0x23, 0x48, 0x9a, 0x61, 0xb3, 0xd8, 0xa, 0xe, 0xdc, 0xb7, 0x65, 0xbf, 0x6d, 0x6, 0xd4, 0xd0, 0x2, 0x69, 0xbb, 0xc0, 0x12, 0x79, 0xab, 0xaf, 0x7d, 0x16, 0xc4, 0x1e, 0xcc, 0xa7, 0x75, 0x71, 0xa3, 0xc8, 0x1a, 0x3e, 0xec, 0x87, 0x55, 0x51, 0x83, 0xe8, 0x3a, 0xe0, 0x32, 0x59, 0x8b, 0x8f, 0x5d, 0x36, 0xe4, 0x9f, 0x4d, 0x26, 0xf4, 0xf0, 0x22, 0x49, 0x9b, 0x41, 0x93, 0xf8, 0x2a, 0x2e, 0xfc, 0x97, 0x45, 0xdf, 0xd, 0x66, 0xb4, 0xb0, 0x62, 0x9, 0xdb, 0x1, 0xd3, 0xb8, 0x6a, 0x6e, 0xbc, 0xd7, 
0x5, 0x7e, 0xac, 0xc7, 0x15, 0x11, 0xc3, 0xa8, 0x7a, 0xa0, 0x72, 0x19, 0xcb, 0xcf, 0x1d, 0x76, 0xa4, 0x80, 0x52, 0x39, 0xeb, 0xef, 0x3d, 0x56, 0x84, 0x5e, 0x8c, 0xe7, 0x35, 0x31, 0xe3, 0x88, 0x5a, 0x21, 0xf3, 0x98, 0x4a, 0x4e, 0x9c, 0xf7, 0x25, 0xff, 0x2d, 0x46, 0x94, 0x90, 0x42, 0x29, 0xfb}, {0x0, 0xd3, 0xbb, 0x68, 0x6b, 0xb8, 0xd0, 0x3, 0xd6, 0x5, 0x6d, 0xbe, 0xbd, 0x6e, 0x6, 0xd5, 0xb1, 0x62, 0xa, 0xd9, 0xda, 0x9, 0x61, 0xb2, 0x67, 0xb4, 0xdc, 0xf, 0xc, 0xdf, 0xb7, 0x64, 0x7f, 0xac, 0xc4, 0x17, 0x14, 0xc7, 0xaf, 0x7c, 0xa9, 0x7a, 0x12, 0xc1, 0xc2, 0x11, 0x79, 0xaa, 0xce, 0x1d, 0x75, 0xa6, 0xa5, 0x76, 0x1e, 0xcd, 0x18, 0xcb, 0xa3, 0x70, 0x73, 0xa0, 0xc8, 0x1b, 0xfe, 0x2d, 0x45, 0x96, 0x95, 0x46, 0x2e, 0xfd, 0x28, 0xfb, 0x93, 0x40, 0x43, 0x90, 0xf8, 0x2b, 0x4f, 0x9c, 0xf4, 0x27, 0x24, 0xf7, 0x9f, 0x4c, 0x99, 0x4a, 0x22, 0xf1, 0xf2, 0x21, 0x49, 0x9a, 0x81, 0x52, 0x3a, 0xe9, 0xea, 0x39, 0x51, 0x82, 0x57, 0x84, 0xec, 0x3f, 0x3c, 0xef, 0x87, 0x54, 0x30, 0xe3, 0x8b, 0x58, 0x5b, 0x88, 0xe0, 0x33, 0xe6, 0x35, 0x5d, 0x8e, 0x8d, 0x5e, 0x36, 0xe5, 0xe1, 0x32, 0x5a, 0x89, 0x8a, 0x59, 0x31, 0xe2, 0x37, 0xe4, 0x8c, 0x5f, 0x5c, 0x8f, 0xe7, 0x34, 0x50, 0x83, 0xeb, 0x38, 0x3b, 0xe8, 0x80, 0x53, 0x86, 0x55, 0x3d, 0xee, 0xed, 0x3e, 0x56, 0x85, 0x9e, 0x4d, 0x25, 0xf6, 0xf5, 0x26, 0x4e, 0x9d, 0x48, 0x9b, 0xf3, 0x20, 0x23, 0xf0, 0x98, 0x4b, 0x2f, 0xfc, 0x94, 0x47, 0x44, 0x97, 0xff, 0x2c, 0xf9, 0x2a, 0x42, 0x91, 0x92, 0x41, 0x29, 0xfa, 0x1f, 0xcc, 0xa4, 0x77, 0x74, 0xa7, 0xcf, 0x1c, 0xc9, 0x1a, 0x72, 0xa1, 0xa2, 0x71, 0x19, 0xca, 0xae, 0x7d, 0x15, 0xc6, 0xc5, 0x16, 0x7e, 0xad, 0x78, 0xab, 0xc3, 0x10, 0x13, 0xc0, 0xa8, 0x7b, 0x60, 0xb3, 0xdb, 0x8, 0xb, 0xd8, 0xb0, 0x63, 0xb6, 0x65, 0xd, 0xde, 0xdd, 0xe, 0x66, 0xb5, 0xd1, 0x2, 0x6a, 0xb9, 0xba, 0x69, 0x1, 0xd2, 0x7, 0xd4, 0xbc, 0x6f, 0x6c, 0xbf, 0xd7, 0x4}, {0x0, 0xd4, 0xb5, 0x61, 0x77, 0xa3, 0xc2, 0x16, 0xee, 0x3a, 0x5b, 0x8f, 0x99, 0x4d, 0x2c, 0xf8, 0xc1, 0x15, 0x74, 0xa0, 0xb6, 0x62, 0x3, 0xd7, 0x2f, 0xfb, 0x9a, 0x4e, 0x58, 0x8c, 0xed, 0x39, 0x9f, 0x4b, 0x2a, 0xfe, 0xe8, 0x3c, 0x5d, 0x89, 0x71, 0xa5, 0xc4, 0x10, 0x6, 0xd2, 0xb3, 0x67, 0x5e, 0x8a, 0xeb, 0x3f, 0x29, 0xfd, 0x9c, 0x48, 0xb0, 0x64, 0x5, 0xd1, 0xc7, 0x13, 0x72, 0xa6, 0x23, 0xf7, 0x96, 0x42, 0x54, 0x80, 0xe1, 0x35, 0xcd, 0x19, 0x78, 0xac, 0xba, 0x6e, 0xf, 0xdb, 0xe2, 0x36, 0x57, 0x83, 0x95, 0x41, 0x20, 0xf4, 0xc, 0xd8, 0xb9, 0x6d, 0x7b, 0xaf, 0xce, 0x1a, 0xbc, 0x68, 0x9, 0xdd, 0xcb, 0x1f, 0x7e, 0xaa, 0x52, 0x86, 0xe7, 0x33, 0x25, 0xf1, 0x90, 0x44, 0x7d, 0xa9, 0xc8, 0x1c, 0xa, 0xde, 0xbf, 0x6b, 0x93, 0x47, 0x26, 0xf2, 0xe4, 0x30, 0x51, 0x85, 0x46, 0x92, 0xf3, 0x27, 0x31, 0xe5, 0x84, 0x50, 0xa8, 0x7c, 0x1d, 0xc9, 0xdf, 0xb, 0x6a, 0xbe, 0x87, 0x53, 0x32, 0xe6, 0xf0, 0x24, 0x45, 0x91, 0x69, 0xbd, 0xdc, 0x8, 0x1e, 0xca, 0xab, 0x7f, 0xd9, 0xd, 0x6c, 0xb8, 0xae, 0x7a, 0x1b, 0xcf, 0x37, 0xe3, 0x82, 0x56, 0x40, 0x94, 0xf5, 0x21, 0x18, 0xcc, 0xad, 0x79, 0x6f, 0xbb, 0xda, 0xe, 0xf6, 0x22, 0x43, 0x97, 0x81, 0x55, 0x34, 0xe0, 0x65, 0xb1, 0xd0, 0x4, 0x12, 0xc6, 0xa7, 0x73, 0x8b, 0x5f, 0x3e, 0xea, 0xfc, 0x28, 0x49, 0x9d, 0xa4, 0x70, 0x11, 0xc5, 0xd3, 0x7, 0x66, 0xb2, 0x4a, 0x9e, 0xff, 0x2b, 0x3d, 0xe9, 0x88, 0x5c, 0xfa, 0x2e, 0x4f, 0x9b, 0x8d, 0x59, 0x38, 0xec, 0x14, 0xc0, 0xa1, 0x75, 0x63, 0xb7, 0xd6, 0x2, 0x3b, 0xef, 0x8e, 0x5a, 0x4c, 0x98, 0xf9, 0x2d, 0xd5, 0x1, 0x60, 0xb4, 0xa2, 0x76, 0x17, 0xc3}, {0x0, 0xd5, 0xb7, 0x62, 0x73, 0xa6, 0xc4, 0x11, 0xe6, 0x33, 0x51, 0x84, 0x95, 0x40, 0x22, 0xf7, 0xd1, 0x4, 0x66, 0xb3, 0xa2, 0x77, 0x15, 0xc0, 0x37, 0xe2, 0x80, 0x55, 0x44, 0x91, 0xf3, 0x26, 0xbf, 0x6a, 0x8, 0xdd, 
0xcc, 0x19, 0x7b, 0xae, 0x59, 0x8c, 0xee, 0x3b, 0x2a, 0xff, 0x9d, 0x48, 0x6e, 0xbb, 0xd9, 0xc, 0x1d, 0xc8, 0xaa, 0x7f, 0x88, 0x5d, 0x3f, 0xea, 0xfb, 0x2e, 0x4c, 0x99, 0x63, 0xb6, 0xd4, 0x1, 0x10, 0xc5, 0xa7, 0x72, 0x85, 0x50, 0x32, 0xe7, 0xf6, 0x23, 0x41, 0x94, 0xb2, 0x67, 0x5, 0xd0, 0xc1, 0x14, 0x76, 0xa3, 0x54, 0x81, 0xe3, 0x36, 0x27, 0xf2, 0x90, 0x45, 0xdc, 0x9, 0x6b, 0xbe, 0xaf, 0x7a, 0x18, 0xcd, 0x3a, 0xef, 0x8d, 0x58, 0x49, 0x9c, 0xfe, 0x2b, 0xd, 0xd8, 0xba, 0x6f, 0x7e, 0xab, 0xc9, 0x1c, 0xeb, 0x3e, 0x5c, 0x89, 0x98, 0x4d, 0x2f, 0xfa, 0xc6, 0x13, 0x71, 0xa4, 0xb5, 0x60, 0x2, 0xd7, 0x20, 0xf5, 0x97, 0x42, 0x53, 0x86, 0xe4, 0x31, 0x17, 0xc2, 0xa0, 0x75, 0x64, 0xb1, 0xd3, 0x6, 0xf1, 0x24, 0x46, 0x93, 0x82, 0x57, 0x35, 0xe0, 0x79, 0xac, 0xce, 0x1b, 0xa, 0xdf, 0xbd, 0x68, 0x9f, 0x4a, 0x28, 0xfd, 0xec, 0x39, 0x5b, 0x8e, 0xa8, 0x7d, 0x1f, 0xca, 0xdb, 0xe, 0x6c, 0xb9, 0x4e, 0x9b, 0xf9, 0x2c, 0x3d, 0xe8, 0x8a, 0x5f, 0xa5, 0x70, 0x12, 0xc7, 0xd6, 0x3, 0x61, 0xb4, 0x43, 0x96, 0xf4, 0x21, 0x30, 0xe5, 0x87, 0x52, 0x74, 0xa1, 0xc3, 0x16, 0x7, 0xd2, 0xb0, 0x65, 0x92, 0x47, 0x25, 0xf0, 0xe1, 0x34, 0x56, 0x83, 0x1a, 0xcf, 0xad, 0x78, 0x69, 0xbc, 0xde, 0xb, 0xfc, 0x29, 0x4b, 0x9e, 0x8f, 0x5a, 0x38, 0xed, 0xcb, 0x1e, 0x7c, 0xa9, 0xb8, 0x6d, 0xf, 0xda, 0x2d, 0xf8, 0x9a, 0x4f, 0x5e, 0x8b, 0xe9, 0x3c}, {0x0, 0xd6, 0xb1, 0x67, 0x7f, 0xa9, 0xce, 0x18, 0xfe, 0x28, 0x4f, 0x99, 0x81, 0x57, 0x30, 0xe6, 0xe1, 0x37, 0x50, 0x86, 0x9e, 0x48, 0x2f, 0xf9, 0x1f, 0xc9, 0xae, 0x78, 0x60, 0xb6, 0xd1, 0x7, 0xdf, 0x9, 0x6e, 0xb8, 0xa0, 0x76, 0x11, 0xc7, 0x21, 0xf7, 0x90, 0x46, 0x5e, 0x88, 0xef, 0x39, 0x3e, 0xe8, 0x8f, 0x59, 0x41, 0x97, 0xf0, 0x26, 0xc0, 0x16, 0x71, 0xa7, 0xbf, 0x69, 0xe, 0xd8, 0xa3, 0x75, 0x12, 0xc4, 0xdc, 0xa, 0x6d, 0xbb, 0x5d, 0x8b, 0xec, 0x3a, 0x22, 0xf4, 0x93, 0x45, 0x42, 0x94, 0xf3, 0x25, 0x3d, 0xeb, 0x8c, 0x5a, 0xbc, 0x6a, 0xd, 0xdb, 0xc3, 0x15, 0x72, 0xa4, 0x7c, 0xaa, 0xcd, 0x1b, 0x3, 0xd5, 0xb2, 0x64, 0x82, 0x54, 0x33, 0xe5, 0xfd, 0x2b, 0x4c, 0x9a, 0x9d, 0x4b, 0x2c, 0xfa, 0xe2, 0x34, 0x53, 0x85, 0x63, 0xb5, 0xd2, 0x4, 0x1c, 0xca, 0xad, 0x7b, 0x5b, 0x8d, 0xea, 0x3c, 0x24, 0xf2, 0x95, 0x43, 0xa5, 0x73, 0x14, 0xc2, 0xda, 0xc, 0x6b, 0xbd, 0xba, 0x6c, 0xb, 0xdd, 0xc5, 0x13, 0x74, 0xa2, 0x44, 0x92, 0xf5, 0x23, 0x3b, 0xed, 0x8a, 0x5c, 0x84, 0x52, 0x35, 0xe3, 0xfb, 0x2d, 0x4a, 0x9c, 0x7a, 0xac, 0xcb, 0x1d, 0x5, 0xd3, 0xb4, 0x62, 0x65, 0xb3, 0xd4, 0x2, 0x1a, 0xcc, 0xab, 0x7d, 0x9b, 0x4d, 0x2a, 0xfc, 0xe4, 0x32, 0x55, 0x83, 0xf8, 0x2e, 0x49, 0x9f, 0x87, 0x51, 0x36, 0xe0, 0x6, 0xd0, 0xb7, 0x61, 0x79, 0xaf, 0xc8, 0x1e, 0x19, 0xcf, 0xa8, 0x7e, 0x66, 0xb0, 0xd7, 0x1, 0xe7, 0x31, 0x56, 0x80, 0x98, 0x4e, 0x29, 0xff, 0x27, 0xf1, 0x96, 0x40, 0x58, 0x8e, 0xe9, 0x3f, 0xd9, 0xf, 0x68, 0xbe, 0xa6, 0x70, 0x17, 0xc1, 0xc6, 0x10, 0x77, 0xa1, 0xb9, 0x6f, 0x8, 0xde, 0x38, 0xee, 0x89, 0x5f, 0x47, 0x91, 0xf6, 0x20}, {0x0, 0xd7, 0xb3, 0x64, 0x7b, 0xac, 0xc8, 0x1f, 0xf6, 0x21, 0x45, 0x92, 0x8d, 0x5a, 0x3e, 0xe9, 0xf1, 0x26, 0x42, 0x95, 0x8a, 0x5d, 0x39, 0xee, 0x7, 0xd0, 0xb4, 0x63, 0x7c, 0xab, 0xcf, 0x18, 0xff, 0x28, 0x4c, 0x9b, 0x84, 0x53, 0x37, 0xe0, 0x9, 0xde, 0xba, 0x6d, 0x72, 0xa5, 0xc1, 0x16, 0xe, 0xd9, 0xbd, 0x6a, 0x75, 0xa2, 0xc6, 0x11, 0xf8, 0x2f, 0x4b, 0x9c, 0x83, 0x54, 0x30, 0xe7, 0xe3, 0x34, 0x50, 0x87, 0x98, 0x4f, 0x2b, 0xfc, 0x15, 0xc2, 0xa6, 0x71, 0x6e, 0xb9, 0xdd, 0xa, 0x12, 0xc5, 0xa1, 0x76, 0x69, 0xbe, 0xda, 0xd, 0xe4, 0x33, 0x57, 0x80, 0x9f, 0x48, 0x2c, 0xfb, 0x1c, 0xcb, 0xaf, 0x78, 0x67, 0xb0, 0xd4, 0x3, 0xea, 0x3d, 0x59, 0x8e, 0x91, 0x46, 0x22, 0xf5, 0xed, 0x3a, 0x5e, 0x89, 0x96, 0x41, 0x25, 0xf2, 0x1b, 
0xcc, 0xa8, 0x7f, 0x60, 0xb7, 0xd3, 0x4, 0xdb, 0xc, 0x68, 0xbf, 0xa0, 0x77, 0x13, 0xc4, 0x2d, 0xfa, 0x9e, 0x49, 0x56, 0x81, 0xe5, 0x32, 0x2a, 0xfd, 0x99, 0x4e, 0x51, 0x86, 0xe2, 0x35, 0xdc, 0xb, 0x6f, 0xb8, 0xa7, 0x70, 0x14, 0xc3, 0x24, 0xf3, 0x97, 0x40, 0x5f, 0x88, 0xec, 0x3b, 0xd2, 0x5, 0x61, 0xb6, 0xa9, 0x7e, 0x1a, 0xcd, 0xd5, 0x2, 0x66, 0xb1, 0xae, 0x79, 0x1d, 0xca, 0x23, 0xf4, 0x90, 0x47, 0x58, 0x8f, 0xeb, 0x3c, 0x38, 0xef, 0x8b, 0x5c, 0x43, 0x94, 0xf0, 0x27, 0xce, 0x19, 0x7d, 0xaa, 0xb5, 0x62, 0x6, 0xd1, 0xc9, 0x1e, 0x7a, 0xad, 0xb2, 0x65, 0x1, 0xd6, 0x3f, 0xe8, 0x8c, 0x5b, 0x44, 0x93, 0xf7, 0x20, 0xc7, 0x10, 0x74, 0xa3, 0xbc, 0x6b, 0xf, 0xd8, 0x31, 0xe6, 0x82, 0x55, 0x4a, 0x9d, 0xf9, 0x2e, 0x36, 0xe1, 0x85, 0x52, 0x4d, 0x9a, 0xfe, 0x29, 0xc0, 0x17, 0x73, 0xa4, 0xbb, 0x6c, 0x8, 0xdf}, {0x0, 0xd8, 0xad, 0x75, 0x47, 0x9f, 0xea, 0x32, 0x8e, 0x56, 0x23, 0xfb, 0xc9, 0x11, 0x64, 0xbc, 0x1, 0xd9, 0xac, 0x74, 0x46, 0x9e, 0xeb, 0x33, 0x8f, 0x57, 0x22, 0xfa, 0xc8, 0x10, 0x65, 0xbd, 0x2, 0xda, 0xaf, 0x77, 0x45, 0x9d, 0xe8, 0x30, 0x8c, 0x54, 0x21, 0xf9, 0xcb, 0x13, 0x66, 0xbe, 0x3, 0xdb, 0xae, 0x76, 0x44, 0x9c, 0xe9, 0x31, 0x8d, 0x55, 0x20, 0xf8, 0xca, 0x12, 0x67, 0xbf, 0x4, 0xdc, 0xa9, 0x71, 0x43, 0x9b, 0xee, 0x36, 0x8a, 0x52, 0x27, 0xff, 0xcd, 0x15, 0x60, 0xb8, 0x5, 0xdd, 0xa8, 0x70, 0x42, 0x9a, 0xef, 0x37, 0x8b, 0x53, 0x26, 0xfe, 0xcc, 0x14, 0x61, 0xb9, 0x6, 0xde, 0xab, 0x73, 0x41, 0x99, 0xec, 0x34, 0x88, 0x50, 0x25, 0xfd, 0xcf, 0x17, 0x62, 0xba, 0x7, 0xdf, 0xaa, 0x72, 0x40, 0x98, 0xed, 0x35, 0x89, 0x51, 0x24, 0xfc, 0xce, 0x16, 0x63, 0xbb, 0x8, 0xd0, 0xa5, 0x7d, 0x4f, 0x97, 0xe2, 0x3a, 0x86, 0x5e, 0x2b, 0xf3, 0xc1, 0x19, 0x6c, 0xb4, 0x9, 0xd1, 0xa4, 0x7c, 0x4e, 0x96, 0xe3, 0x3b, 0x87, 0x5f, 0x2a, 0xf2, 0xc0, 0x18, 0x6d, 0xb5, 0xa, 0xd2, 0xa7, 0x7f, 0x4d, 0x95, 0xe0, 0x38, 0x84, 0x5c, 0x29, 0xf1, 0xc3, 0x1b, 0x6e, 0xb6, 0xb, 0xd3, 0xa6, 0x7e, 0x4c, 0x94, 0xe1, 0x39, 0x85, 0x5d, 0x28, 0xf0, 0xc2, 0x1a, 0x6f, 0xb7, 0xc, 0xd4, 0xa1, 0x79, 0x4b, 0x93, 0xe6, 0x3e, 0x82, 0x5a, 0x2f, 0xf7, 0xc5, 0x1d, 0x68, 0xb0, 0xd, 0xd5, 0xa0, 0x78, 0x4a, 0x92, 0xe7, 0x3f, 0x83, 0x5b, 0x2e, 0xf6, 0xc4, 0x1c, 0x69, 0xb1, 0xe, 0xd6, 0xa3, 0x7b, 0x49, 0x91, 0xe4, 0x3c, 0x80, 0x58, 0x2d, 0xf5, 0xc7, 0x1f, 0x6a, 0xb2, 0xf, 0xd7, 0xa2, 0x7a, 0x48, 0x90, 0xe5, 0x3d, 0x81, 0x59, 0x2c, 0xf4, 0xc6, 0x1e, 0x6b, 0xb3}, {0x0, 0xd9, 0xaf, 0x76, 0x43, 0x9a, 0xec, 0x35, 0x86, 0x5f, 0x29, 0xf0, 0xc5, 0x1c, 0x6a, 0xb3, 0x11, 0xc8, 0xbe, 0x67, 0x52, 0x8b, 0xfd, 0x24, 0x97, 0x4e, 0x38, 0xe1, 0xd4, 0xd, 0x7b, 0xa2, 0x22, 0xfb, 0x8d, 0x54, 0x61, 0xb8, 0xce, 0x17, 0xa4, 0x7d, 0xb, 0xd2, 0xe7, 0x3e, 0x48, 0x91, 0x33, 0xea, 0x9c, 0x45, 0x70, 0xa9, 0xdf, 0x6, 0xb5, 0x6c, 0x1a, 0xc3, 0xf6, 0x2f, 0x59, 0x80, 0x44, 0x9d, 0xeb, 0x32, 0x7, 0xde, 0xa8, 0x71, 0xc2, 0x1b, 0x6d, 0xb4, 0x81, 0x58, 0x2e, 0xf7, 0x55, 0x8c, 0xfa, 0x23, 0x16, 0xcf, 0xb9, 0x60, 0xd3, 0xa, 0x7c, 0xa5, 0x90, 0x49, 0x3f, 0xe6, 0x66, 0xbf, 0xc9, 0x10, 0x25, 0xfc, 0x8a, 0x53, 0xe0, 0x39, 0x4f, 0x96, 0xa3, 0x7a, 0xc, 0xd5, 0x77, 0xae, 0xd8, 0x1, 0x34, 0xed, 0x9b, 0x42, 0xf1, 0x28, 0x5e, 0x87, 0xb2, 0x6b, 0x1d, 0xc4, 0x88, 0x51, 0x27, 0xfe, 0xcb, 0x12, 0x64, 0xbd, 0xe, 0xd7, 0xa1, 0x78, 0x4d, 0x94, 0xe2, 0x3b, 0x99, 0x40, 0x36, 0xef, 0xda, 0x3, 0x75, 0xac, 0x1f, 0xc6, 0xb0, 0x69, 0x5c, 0x85, 0xf3, 0x2a, 0xaa, 0x73, 0x5, 0xdc, 0xe9, 0x30, 0x46, 0x9f, 0x2c, 0xf5, 0x83, 0x5a, 0x6f, 0xb6, 0xc0, 0x19, 0xbb, 0x62, 0x14, 0xcd, 0xf8, 0x21, 0x57, 0x8e, 0x3d, 0xe4, 0x92, 0x4b, 0x7e, 0xa7, 0xd1, 0x8, 0xcc, 0x15, 0x63, 0xba, 0x8f, 0x56, 0x20, 0xf9, 0x4a, 0x93, 0xe5, 0x3c, 0x9, 0xd0, 0xa6, 
0x7f, 0xdd, 0x4, 0x72, 0xab, 0x9e, 0x47, 0x31, 0xe8, 0x5b, 0x82, 0xf4, 0x2d, 0x18, 0xc1, 0xb7, 0x6e, 0xee, 0x37, 0x41, 0x98, 0xad, 0x74, 0x2, 0xdb, 0x68, 0xb1, 0xc7, 0x1e, 0x2b, 0xf2, 0x84, 0x5d, 0xff, 0x26, 0x50, 0x89, 0xbc, 0x65, 0x13, 0xca, 0x79, 0xa0, 0xd6, 0xf, 0x3a, 0xe3, 0x95, 0x4c}, {0x0, 0xda, 0xa9, 0x73, 0x4f, 0x95, 0xe6, 0x3c, 0x9e, 0x44, 0x37, 0xed, 0xd1, 0xb, 0x78, 0xa2, 0x21, 0xfb, 0x88, 0x52, 0x6e, 0xb4, 0xc7, 0x1d, 0xbf, 0x65, 0x16, 0xcc, 0xf0, 0x2a, 0x59, 0x83, 0x42, 0x98, 0xeb, 0x31, 0xd, 0xd7, 0xa4, 0x7e, 0xdc, 0x6, 0x75, 0xaf, 0x93, 0x49, 0x3a, 0xe0, 0x63, 0xb9, 0xca, 0x10, 0x2c, 0xf6, 0x85, 0x5f, 0xfd, 0x27, 0x54, 0x8e, 0xb2, 0x68, 0x1b, 0xc1, 0x84, 0x5e, 0x2d, 0xf7, 0xcb, 0x11, 0x62, 0xb8, 0x1a, 0xc0, 0xb3, 0x69, 0x55, 0x8f, 0xfc, 0x26, 0xa5, 0x7f, 0xc, 0xd6, 0xea, 0x30, 0x43, 0x99, 0x3b, 0xe1, 0x92, 0x48, 0x74, 0xae, 0xdd, 0x7, 0xc6, 0x1c, 0x6f, 0xb5, 0x89, 0x53, 0x20, 0xfa, 0x58, 0x82, 0xf1, 0x2b, 0x17, 0xcd, 0xbe, 0x64, 0xe7, 0x3d, 0x4e, 0x94, 0xa8, 0x72, 0x1, 0xdb, 0x79, 0xa3, 0xd0, 0xa, 0x36, 0xec, 0x9f, 0x45, 0x15, 0xcf, 0xbc, 0x66, 0x5a, 0x80, 0xf3, 0x29, 0x8b, 0x51, 0x22, 0xf8, 0xc4, 0x1e, 0x6d, 0xb7, 0x34, 0xee, 0x9d, 0x47, 0x7b, 0xa1, 0xd2, 0x8, 0xaa, 0x70, 0x3, 0xd9, 0xe5, 0x3f, 0x4c, 0x96, 0x57, 0x8d, 0xfe, 0x24, 0x18, 0xc2, 0xb1, 0x6b, 0xc9, 0x13, 0x60, 0xba, 0x86, 0x5c, 0x2f, 0xf5, 0x76, 0xac, 0xdf, 0x5, 0x39, 0xe3, 0x90, 0x4a, 0xe8, 0x32, 0x41, 0x9b, 0xa7, 0x7d, 0xe, 0xd4, 0x91, 0x4b, 0x38, 0xe2, 0xde, 0x4, 0x77, 0xad, 0xf, 0xd5, 0xa6, 0x7c, 0x40, 0x9a, 0xe9, 0x33, 0xb0, 0x6a, 0x19, 0xc3, 0xff, 0x25, 0x56, 0x8c, 0x2e, 0xf4, 0x87, 0x5d, 0x61, 0xbb, 0xc8, 0x12, 0xd3, 0x9, 0x7a, 0xa0, 0x9c, 0x46, 0x35, 0xef, 0x4d, 0x97, 0xe4, 0x3e, 0x2, 0xd8, 0xab, 0x71, 0xf2, 0x28, 0x5b, 0x81, 0xbd, 0x67, 0x14, 0xce, 0x6c, 0xb6, 0xc5, 0x1f, 0x23, 0xf9, 0x8a, 0x50}, {0x0, 0xdb, 0xab, 0x70, 0x4b, 0x90, 0xe0, 0x3b, 0x96, 0x4d, 0x3d, 0xe6, 0xdd, 0x6, 0x76, 0xad, 0x31, 0xea, 0x9a, 0x41, 0x7a, 0xa1, 0xd1, 0xa, 0xa7, 0x7c, 0xc, 0xd7, 0xec, 0x37, 0x47, 0x9c, 0x62, 0xb9, 0xc9, 0x12, 0x29, 0xf2, 0x82, 0x59, 0xf4, 0x2f, 0x5f, 0x84, 0xbf, 0x64, 0x14, 0xcf, 0x53, 0x88, 0xf8, 0x23, 0x18, 0xc3, 0xb3, 0x68, 0xc5, 0x1e, 0x6e, 0xb5, 0x8e, 0x55, 0x25, 0xfe, 0xc4, 0x1f, 0x6f, 0xb4, 0x8f, 0x54, 0x24, 0xff, 0x52, 0x89, 0xf9, 0x22, 0x19, 0xc2, 0xb2, 0x69, 0xf5, 0x2e, 0x5e, 0x85, 0xbe, 0x65, 0x15, 0xce, 0x63, 0xb8, 0xc8, 0x13, 0x28, 0xf3, 0x83, 0x58, 0xa6, 0x7d, 0xd, 0xd6, 0xed, 0x36, 0x46, 0x9d, 0x30, 0xeb, 0x9b, 0x40, 0x7b, 0xa0, 0xd0, 0xb, 0x97, 0x4c, 0x3c, 0xe7, 0xdc, 0x7, 0x77, 0xac, 0x1, 0xda, 0xaa, 0x71, 0x4a, 0x91, 0xe1, 0x3a, 0x95, 0x4e, 0x3e, 0xe5, 0xde, 0x5, 0x75, 0xae, 0x3, 0xd8, 0xa8, 0x73, 0x48, 0x93, 0xe3, 0x38, 0xa4, 0x7f, 0xf, 0xd4, 0xef, 0x34, 0x44, 0x9f, 0x32, 0xe9, 0x99, 0x42, 0x79, 0xa2, 0xd2, 0x9, 0xf7, 0x2c, 0x5c, 0x87, 0xbc, 0x67, 0x17, 0xcc, 0x61, 0xba, 0xca, 0x11, 0x2a, 0xf1, 0x81, 0x5a, 0xc6, 0x1d, 0x6d, 0xb6, 0x8d, 0x56, 0x26, 0xfd, 0x50, 0x8b, 0xfb, 0x20, 0x1b, 0xc0, 0xb0, 0x6b, 0x51, 0x8a, 0xfa, 0x21, 0x1a, 0xc1, 0xb1, 0x6a, 0xc7, 0x1c, 0x6c, 0xb7, 0x8c, 0x57, 0x27, 0xfc, 0x60, 0xbb, 0xcb, 0x10, 0x2b, 0xf0, 0x80, 0x5b, 0xf6, 0x2d, 0x5d, 0x86, 0xbd, 0x66, 0x16, 0xcd, 0x33, 0xe8, 0x98, 0x43, 0x78, 0xa3, 0xd3, 0x8, 0xa5, 0x7e, 0xe, 0xd5, 0xee, 0x35, 0x45, 0x9e, 0x2, 0xd9, 0xa9, 0x72, 0x49, 0x92, 0xe2, 0x39, 0x94, 0x4f, 0x3f, 0xe4, 0xdf, 0x4, 0x74, 0xaf}, {0x0, 0xdc, 0xa5, 0x79, 0x57, 0x8b, 0xf2, 0x2e, 0xae, 0x72, 0xb, 0xd7, 0xf9, 0x25, 0x5c, 0x80, 0x41, 0x9d, 0xe4, 0x38, 0x16, 0xca, 0xb3, 0x6f, 0xef, 0x33, 0x4a, 0x96, 0xb8, 0x64, 0x1d, 0xc1, 0x82, 0x5e, 0x27, 0xfb, 
0xd5, 0x9, 0x70, 0xac, 0x2c, 0xf0, 0x89, 0x55, 0x7b, 0xa7, 0xde, 0x2, 0xc3, 0x1f, 0x66, 0xba, 0x94, 0x48, 0x31, 0xed, 0x6d, 0xb1, 0xc8, 0x14, 0x3a, 0xe6, 0x9f, 0x43, 0x19, 0xc5, 0xbc, 0x60, 0x4e, 0x92, 0xeb, 0x37, 0xb7, 0x6b, 0x12, 0xce, 0xe0, 0x3c, 0x45, 0x99, 0x58, 0x84, 0xfd, 0x21, 0xf, 0xd3, 0xaa, 0x76, 0xf6, 0x2a, 0x53, 0x8f, 0xa1, 0x7d, 0x4, 0xd8, 0x9b, 0x47, 0x3e, 0xe2, 0xcc, 0x10, 0x69, 0xb5, 0x35, 0xe9, 0x90, 0x4c, 0x62, 0xbe, 0xc7, 0x1b, 0xda, 0x6, 0x7f, 0xa3, 0x8d, 0x51, 0x28, 0xf4, 0x74, 0xa8, 0xd1, 0xd, 0x23, 0xff, 0x86, 0x5a, 0x32, 0xee, 0x97, 0x4b, 0x65, 0xb9, 0xc0, 0x1c, 0x9c, 0x40, 0x39, 0xe5, 0xcb, 0x17, 0x6e, 0xb2, 0x73, 0xaf, 0xd6, 0xa, 0x24, 0xf8, 0x81, 0x5d, 0xdd, 0x1, 0x78, 0xa4, 0x8a, 0x56, 0x2f, 0xf3, 0xb0, 0x6c, 0x15, 0xc9, 0xe7, 0x3b, 0x42, 0x9e, 0x1e, 0xc2, 0xbb, 0x67, 0x49, 0x95, 0xec, 0x30, 0xf1, 0x2d, 0x54, 0x88, 0xa6, 0x7a, 0x3, 0xdf, 0x5f, 0x83, 0xfa, 0x26, 0x8, 0xd4, 0xad, 0x71, 0x2b, 0xf7, 0x8e, 0x52, 0x7c, 0xa0, 0xd9, 0x5, 0x85, 0x59, 0x20, 0xfc, 0xd2, 0xe, 0x77, 0xab, 0x6a, 0xb6, 0xcf, 0x13, 0x3d, 0xe1, 0x98, 0x44, 0xc4, 0x18, 0x61, 0xbd, 0x93, 0x4f, 0x36, 0xea, 0xa9, 0x75, 0xc, 0xd0, 0xfe, 0x22, 0x5b, 0x87, 0x7, 0xdb, 0xa2, 0x7e, 0x50, 0x8c, 0xf5, 0x29, 0xe8, 0x34, 0x4d, 0x91, 0xbf, 0x63, 0x1a, 0xc6, 0x46, 0x9a, 0xe3, 0x3f, 0x11, 0xcd, 0xb4, 0x68}, {0x0, 0xdd, 0xa7, 0x7a, 0x53, 0x8e, 0xf4, 0x29, 0xa6, 0x7b, 0x1, 0xdc, 0xf5, 0x28, 0x52, 0x8f, 0x51, 0x8c, 0xf6, 0x2b, 0x2, 0xdf, 0xa5, 0x78, 0xf7, 0x2a, 0x50, 0x8d, 0xa4, 0x79, 0x3, 0xde, 0xa2, 0x7f, 0x5, 0xd8, 0xf1, 0x2c, 0x56, 0x8b, 0x4, 0xd9, 0xa3, 0x7e, 0x57, 0x8a, 0xf0, 0x2d, 0xf3, 0x2e, 0x54, 0x89, 0xa0, 0x7d, 0x7, 0xda, 0x55, 0x88, 0xf2, 0x2f, 0x6, 0xdb, 0xa1, 0x7c, 0x59, 0x84, 0xfe, 0x23, 0xa, 0xd7, 0xad, 0x70, 0xff, 0x22, 0x58, 0x85, 0xac, 0x71, 0xb, 0xd6, 0x8, 0xd5, 0xaf, 0x72, 0x5b, 0x86, 0xfc, 0x21, 0xae, 0x73, 0x9, 0xd4, 0xfd, 0x20, 0x5a, 0x87, 0xfb, 0x26, 0x5c, 0x81, 0xa8, 0x75, 0xf, 0xd2, 0x5d, 0x80, 0xfa, 0x27, 0xe, 0xd3, 0xa9, 0x74, 0xaa, 0x77, 0xd, 0xd0, 0xf9, 0x24, 0x5e, 0x83, 0xc, 0xd1, 0xab, 0x76, 0x5f, 0x82, 0xf8, 0x25, 0xb2, 0x6f, 0x15, 0xc8, 0xe1, 0x3c, 0x46, 0x9b, 0x14, 0xc9, 0xb3, 0x6e, 0x47, 0x9a, 0xe0, 0x3d, 0xe3, 0x3e, 0x44, 0x99, 0xb0, 0x6d, 0x17, 0xca, 0x45, 0x98, 0xe2, 0x3f, 0x16, 0xcb, 0xb1, 0x6c, 0x10, 0xcd, 0xb7, 0x6a, 0x43, 0x9e, 0xe4, 0x39, 0xb6, 0x6b, 0x11, 0xcc, 0xe5, 0x38, 0x42, 0x9f, 0x41, 0x9c, 0xe6, 0x3b, 0x12, 0xcf, 0xb5, 0x68, 0xe7, 0x3a, 0x40, 0x9d, 0xb4, 0x69, 0x13, 0xce, 0xeb, 0x36, 0x4c, 0x91, 0xb8, 0x65, 0x1f, 0xc2, 0x4d, 0x90, 0xea, 0x37, 0x1e, 0xc3, 0xb9, 0x64, 0xba, 0x67, 0x1d, 0xc0, 0xe9, 0x34, 0x4e, 0x93, 0x1c, 0xc1, 0xbb, 0x66, 0x4f, 0x92, 0xe8, 0x35, 0x49, 0x94, 0xee, 0x33, 0x1a, 0xc7, 0xbd, 0x60, 0xef, 0x32, 0x48, 0x95, 0xbc, 0x61, 0x1b, 0xc6, 0x18, 0xc5, 0xbf, 0x62, 0x4b, 0x96, 0xec, 0x31, 0xbe, 0x63, 0x19, 0xc4, 0xed, 0x30, 0x4a, 0x97}, {0x0, 0xde, 0xa1, 0x7f, 0x5f, 0x81, 0xfe, 0x20, 0xbe, 0x60, 0x1f, 0xc1, 0xe1, 0x3f, 0x40, 0x9e, 0x61, 0xbf, 0xc0, 0x1e, 0x3e, 0xe0, 0x9f, 0x41, 0xdf, 0x1, 0x7e, 0xa0, 0x80, 0x5e, 0x21, 0xff, 0xc2, 0x1c, 0x63, 0xbd, 0x9d, 0x43, 0x3c, 0xe2, 0x7c, 0xa2, 0xdd, 0x3, 0x23, 0xfd, 0x82, 0x5c, 0xa3, 0x7d, 0x2, 0xdc, 0xfc, 0x22, 0x5d, 0x83, 0x1d, 0xc3, 0xbc, 0x62, 0x42, 0x9c, 0xe3, 0x3d, 0x99, 0x47, 0x38, 0xe6, 0xc6, 0x18, 0x67, 0xb9, 0x27, 0xf9, 0x86, 0x58, 0x78, 0xa6, 0xd9, 0x7, 0xf8, 0x26, 0x59, 0x87, 0xa7, 0x79, 0x6, 0xd8, 0x46, 0x98, 0xe7, 0x39, 0x19, 0xc7, 0xb8, 0x66, 0x5b, 0x85, 0xfa, 0x24, 0x4, 0xda, 0xa5, 0x7b, 0xe5, 0x3b, 0x44, 0x9a, 0xba, 0x64, 0x1b, 0xc5, 0x3a, 0xe4, 0x9b, 0x45, 0x65, 0xbb, 0xc4, 0x1a, 0x84, 0x5a, 
0x25, 0xfb, 0xdb, 0x5, 0x7a, 0xa4, 0x2f, 0xf1, 0x8e, 0x50, 0x70, 0xae, 0xd1, 0xf, 0x91, 0x4f, 0x30, 0xee, 0xce, 0x10, 0x6f, 0xb1, 0x4e, 0x90, 0xef, 0x31, 0x11, 0xcf, 0xb0, 0x6e, 0xf0, 0x2e, 0x51, 0x8f, 0xaf, 0x71, 0xe, 0xd0, 0xed, 0x33, 0x4c, 0x92, 0xb2, 0x6c, 0x13, 0xcd, 0x53, 0x8d, 0xf2, 0x2c, 0xc, 0xd2, 0xad, 0x73, 0x8c, 0x52, 0x2d, 0xf3, 0xd3, 0xd, 0x72, 0xac, 0x32, 0xec, 0x93, 0x4d, 0x6d, 0xb3, 0xcc, 0x12, 0xb6, 0x68, 0x17, 0xc9, 0xe9, 0x37, 0x48, 0x96, 0x8, 0xd6, 0xa9, 0x77, 0x57, 0x89, 0xf6, 0x28, 0xd7, 0x9, 0x76, 0xa8, 0x88, 0x56, 0x29, 0xf7, 0x69, 0xb7, 0xc8, 0x16, 0x36, 0xe8, 0x97, 0x49, 0x74, 0xaa, 0xd5, 0xb, 0x2b, 0xf5, 0x8a, 0x54, 0xca, 0x14, 0x6b, 0xb5, 0x95, 0x4b, 0x34, 0xea, 0x15, 0xcb, 0xb4, 0x6a, 0x4a, 0x94, 0xeb, 0x35, 0xab, 0x75, 0xa, 0xd4, 0xf4, 0x2a, 0x55, 0x8b}, {0x0, 0xdf, 0xa3, 0x7c, 0x5b, 0x84, 0xf8, 0x27, 0xb6, 0x69, 0x15, 0xca, 0xed, 0x32, 0x4e, 0x91, 0x71, 0xae, 0xd2, 0xd, 0x2a, 0xf5, 0x89, 0x56, 0xc7, 0x18, 0x64, 0xbb, 0x9c, 0x43, 0x3f, 0xe0, 0xe2, 0x3d, 0x41, 0x9e, 0xb9, 0x66, 0x1a, 0xc5, 0x54, 0x8b, 0xf7, 0x28, 0xf, 0xd0, 0xac, 0x73, 0x93, 0x4c, 0x30, 0xef, 0xc8, 0x17, 0x6b, 0xb4, 0x25, 0xfa, 0x86, 0x59, 0x7e, 0xa1, 0xdd, 0x2, 0xd9, 0x6, 0x7a, 0xa5, 0x82, 0x5d, 0x21, 0xfe, 0x6f, 0xb0, 0xcc, 0x13, 0x34, 0xeb, 0x97, 0x48, 0xa8, 0x77, 0xb, 0xd4, 0xf3, 0x2c, 0x50, 0x8f, 0x1e, 0xc1, 0xbd, 0x62, 0x45, 0x9a, 0xe6, 0x39, 0x3b, 0xe4, 0x98, 0x47, 0x60, 0xbf, 0xc3, 0x1c, 0x8d, 0x52, 0x2e, 0xf1, 0xd6, 0x9, 0x75, 0xaa, 0x4a, 0x95, 0xe9, 0x36, 0x11, 0xce, 0xb2, 0x6d, 0xfc, 0x23, 0x5f, 0x80, 0xa7, 0x78, 0x4, 0xdb, 0xaf, 0x70, 0xc, 0xd3, 0xf4, 0x2b, 0x57, 0x88, 0x19, 0xc6, 0xba, 0x65, 0x42, 0x9d, 0xe1, 0x3e, 0xde, 0x1, 0x7d, 0xa2, 0x85, 0x5a, 0x26, 0xf9, 0x68, 0xb7, 0xcb, 0x14, 0x33, 0xec, 0x90, 0x4f, 0x4d, 0x92, 0xee, 0x31, 0x16, 0xc9, 0xb5, 0x6a, 0xfb, 0x24, 0x58, 0x87, 0xa0, 0x7f, 0x3, 0xdc, 0x3c, 0xe3, 0x9f, 0x40, 0x67, 0xb8, 0xc4, 0x1b, 0x8a, 0x55, 0x29, 0xf6, 0xd1, 0xe, 0x72, 0xad, 0x76, 0xa9, 0xd5, 0xa, 0x2d, 0xf2, 0x8e, 0x51, 0xc0, 0x1f, 0x63, 0xbc, 0x9b, 0x44, 0x38, 0xe7, 0x7, 0xd8, 0xa4, 0x7b, 0x5c, 0x83, 0xff, 0x20, 0xb1, 0x6e, 0x12, 0xcd, 0xea, 0x35, 0x49, 0x96, 0x94, 0x4b, 0x37, 0xe8, 0xcf, 0x10, 0x6c, 0xb3, 0x22, 0xfd, 0x81, 0x5e, 0x79, 0xa6, 0xda, 0x5, 0xe5, 0x3a, 0x46, 0x99, 0xbe, 0x61, 0x1d, 0xc2, 0x53, 0x8c, 0xf0, 0x2f, 0x8, 0xd7, 0xab, 0x74}, {0x0, 0xe0, 0xdd, 0x3d, 0xa7, 0x47, 0x7a, 0x9a, 0x53, 0xb3, 0x8e, 0x6e, 0xf4, 0x14, 0x29, 0xc9, 0xa6, 0x46, 0x7b, 0x9b, 0x1, 0xe1, 0xdc, 0x3c, 0xf5, 0x15, 0x28, 0xc8, 0x52, 0xb2, 0x8f, 0x6f, 0x51, 0xb1, 0x8c, 0x6c, 0xf6, 0x16, 0x2b, 0xcb, 0x2, 0xe2, 0xdf, 0x3f, 0xa5, 0x45, 0x78, 0x98, 0xf7, 0x17, 0x2a, 0xca, 0x50, 0xb0, 0x8d, 0x6d, 0xa4, 0x44, 0x79, 0x99, 0x3, 0xe3, 0xde, 0x3e, 0xa2, 0x42, 0x7f, 0x9f, 0x5, 0xe5, 0xd8, 0x38, 0xf1, 0x11, 0x2c, 0xcc, 0x56, 0xb6, 0x8b, 0x6b, 0x4, 0xe4, 0xd9, 0x39, 0xa3, 0x43, 0x7e, 0x9e, 0x57, 0xb7, 0x8a, 0x6a, 0xf0, 0x10, 0x2d, 0xcd, 0xf3, 0x13, 0x2e, 0xce, 0x54, 0xb4, 0x89, 0x69, 0xa0, 0x40, 0x7d, 0x9d, 0x7, 0xe7, 0xda, 0x3a, 0x55, 0xb5, 0x88, 0x68, 0xf2, 0x12, 0x2f, 0xcf, 0x6, 0xe6, 0xdb, 0x3b, 0xa1, 0x41, 0x7c, 0x9c, 0x59, 0xb9, 0x84, 0x64, 0xfe, 0x1e, 0x23, 0xc3, 0xa, 0xea, 0xd7, 0x37, 0xad, 0x4d, 0x70, 0x90, 0xff, 0x1f, 0x22, 0xc2, 0x58, 0xb8, 0x85, 0x65, 0xac, 0x4c, 0x71, 0x91, 0xb, 0xeb, 0xd6, 0x36, 0x8, 0xe8, 0xd5, 0x35, 0xaf, 0x4f, 0x72, 0x92, 0x5b, 0xbb, 0x86, 0x66, 0xfc, 0x1c, 0x21, 0xc1, 0xae, 0x4e, 0x73, 0x93, 0x9, 0xe9, 0xd4, 0x34, 0xfd, 0x1d, 0x20, 0xc0, 0x5a, 0xba, 0x87, 0x67, 0xfb, 0x1b, 0x26, 0xc6, 0x5c, 0xbc, 0x81, 0x61, 0xa8, 0x48, 0x75, 0x95, 0xf, 0xef, 0xd2, 0x32, 
0x5d, 0xbd, 0x80, 0x60, 0xfa, 0x1a, 0x27, 0xc7, 0xe, 0xee, 0xd3, 0x33, 0xa9, 0x49, 0x74, 0x94, 0xaa, 0x4a, 0x77, 0x97, 0xd, 0xed, 0xd0, 0x30, 0xf9, 0x19, 0x24, 0xc4, 0x5e, 0xbe, 0x83, 0x63, 0xc, 0xec, 0xd1, 0x31, 0xab, 0x4b, 0x76, 0x96, 0x5f, 0xbf, 0x82, 0x62, 0xf8, 0x18, 0x25, 0xc5}, {0x0, 0xe1, 0xdf, 0x3e, 0xa3, 0x42, 0x7c, 0x9d, 0x5b, 0xba, 0x84, 0x65, 0xf8, 0x19, 0x27, 0xc6, 0xb6, 0x57, 0x69, 0x88, 0x15, 0xf4, 0xca, 0x2b, 0xed, 0xc, 0x32, 0xd3, 0x4e, 0xaf, 0x91, 0x70, 0x71, 0x90, 0xae, 0x4f, 0xd2, 0x33, 0xd, 0xec, 0x2a, 0xcb, 0xf5, 0x14, 0x89, 0x68, 0x56, 0xb7, 0xc7, 0x26, 0x18, 0xf9, 0x64, 0x85, 0xbb, 0x5a, 0x9c, 0x7d, 0x43, 0xa2, 0x3f, 0xde, 0xe0, 0x1, 0xe2, 0x3, 0x3d, 0xdc, 0x41, 0xa0, 0x9e, 0x7f, 0xb9, 0x58, 0x66, 0x87, 0x1a, 0xfb, 0xc5, 0x24, 0x54, 0xb5, 0x8b, 0x6a, 0xf7, 0x16, 0x28, 0xc9, 0xf, 0xee, 0xd0, 0x31, 0xac, 0x4d, 0x73, 0x92, 0x93, 0x72, 0x4c, 0xad, 0x30, 0xd1, 0xef, 0xe, 0xc8, 0x29, 0x17, 0xf6, 0x6b, 0x8a, 0xb4, 0x55, 0x25, 0xc4, 0xfa, 0x1b, 0x86, 0x67, 0x59, 0xb8, 0x7e, 0x9f, 0xa1, 0x40, 0xdd, 0x3c, 0x2, 0xe3, 0xd9, 0x38, 0x6, 0xe7, 0x7a, 0x9b, 0xa5, 0x44, 0x82, 0x63, 0x5d, 0xbc, 0x21, 0xc0, 0xfe, 0x1f, 0x6f, 0x8e, 0xb0, 0x51, 0xcc, 0x2d, 0x13, 0xf2, 0x34, 0xd5, 0xeb, 0xa, 0x97, 0x76, 0x48, 0xa9, 0xa8, 0x49, 0x77, 0x96, 0xb, 0xea, 0xd4, 0x35, 0xf3, 0x12, 0x2c, 0xcd, 0x50, 0xb1, 0x8f, 0x6e, 0x1e, 0xff, 0xc1, 0x20, 0xbd, 0x5c, 0x62, 0x83, 0x45, 0xa4, 0x9a, 0x7b, 0xe6, 0x7, 0x39, 0xd8, 0x3b, 0xda, 0xe4, 0x5, 0x98, 0x79, 0x47, 0xa6, 0x60, 0x81, 0xbf, 0x5e, 0xc3, 0x22, 0x1c, 0xfd, 0x8d, 0x6c, 0x52, 0xb3, 0x2e, 0xcf, 0xf1, 0x10, 0xd6, 0x37, 0x9, 0xe8, 0x75, 0x94, 0xaa, 0x4b, 0x4a, 0xab, 0x95, 0x74, 0xe9, 0x8, 0x36, 0xd7, 0x11, 0xf0, 0xce, 0x2f, 0xb2, 0x53, 0x6d, 0x8c, 0xfc, 0x1d, 0x23, 0xc2, 0x5f, 0xbe, 0x80, 0x61, 0xa7, 0x46, 0x78, 0x99, 0x4, 0xe5, 0xdb, 0x3a}, {0x0, 0xe2, 0xd9, 0x3b, 0xaf, 0x4d, 0x76, 0x94, 0x43, 0xa1, 0x9a, 0x78, 0xec, 0xe, 0x35, 0xd7, 0x86, 0x64, 0x5f, 0xbd, 0x29, 0xcb, 0xf0, 0x12, 0xc5, 0x27, 0x1c, 0xfe, 0x6a, 0x88, 0xb3, 0x51, 0x11, 0xf3, 0xc8, 0x2a, 0xbe, 0x5c, 0x67, 0x85, 0x52, 0xb0, 0x8b, 0x69, 0xfd, 0x1f, 0x24, 0xc6, 0x97, 0x75, 0x4e, 0xac, 0x38, 0xda, 0xe1, 0x3, 0xd4, 0x36, 0xd, 0xef, 0x7b, 0x99, 0xa2, 0x40, 0x22, 0xc0, 0xfb, 0x19, 0x8d, 0x6f, 0x54, 0xb6, 0x61, 0x83, 0xb8, 0x5a, 0xce, 0x2c, 0x17, 0xf5, 0xa4, 0x46, 0x7d, 0x9f, 0xb, 0xe9, 0xd2, 0x30, 0xe7, 0x5, 0x3e, 0xdc, 0x48, 0xaa, 0x91, 0x73, 0x33, 0xd1, 0xea, 0x8, 0x9c, 0x7e, 0x45, 0xa7, 0x70, 0x92, 0xa9, 0x4b, 0xdf, 0x3d, 0x6, 0xe4, 0xb5, 0x57, 0x6c, 0x8e, 0x1a, 0xf8, 0xc3, 0x21, 0xf6, 0x14, 0x2f, 0xcd, 0x59, 0xbb, 0x80, 0x62, 0x44, 0xa6, 0x9d, 0x7f, 0xeb, 0x9, 0x32, 0xd0, 0x7, 0xe5, 0xde, 0x3c, 0xa8, 0x4a, 0x71, 0x93, 0xc2, 0x20, 0x1b, 0xf9, 0x6d, 0x8f, 0xb4, 0x56, 0x81, 0x63, 0x58, 0xba, 0x2e, 0xcc, 0xf7, 0x15, 0x55, 0xb7, 0x8c, 0x6e, 0xfa, 0x18, 0x23, 0xc1, 0x16, 0xf4, 0xcf, 0x2d, 0xb9, 0x5b, 0x60, 0x82, 0xd3, 0x31, 0xa, 0xe8, 0x7c, 0x9e, 0xa5, 0x47, 0x90, 0x72, 0x49, 0xab, 0x3f, 0xdd, 0xe6, 0x4, 0x66, 0x84, 0xbf, 0x5d, 0xc9, 0x2b, 0x10, 0xf2, 0x25, 0xc7, 0xfc, 0x1e, 0x8a, 0x68, 0x53, 0xb1, 0xe0, 0x2, 0x39, 0xdb, 0x4f, 0xad, 0x96, 0x74, 0xa3, 0x41, 0x7a, 0x98, 0xc, 0xee, 0xd5, 0x37, 0x77, 0x95, 0xae, 0x4c, 0xd8, 0x3a, 0x1, 0xe3, 0x34, 0xd6, 0xed, 0xf, 0x9b, 0x79, 0x42, 0xa0, 0xf1, 0x13, 0x28, 0xca, 0x5e, 0xbc, 0x87, 0x65, 0xb2, 0x50, 0x6b, 0x89, 0x1d, 0xff, 0xc4, 0x26}, {0x0, 0xe3, 0xdb, 0x38, 0xab, 0x48, 0x70, 0x93, 0x4b, 0xa8, 0x90, 0x73, 0xe0, 0x3, 0x3b, 0xd8, 0x96, 0x75, 0x4d, 0xae, 0x3d, 0xde, 0xe6, 0x5, 0xdd, 0x3e, 0x6, 0xe5, 0x76, 0x95, 0xad, 0x4e, 0x31, 0xd2, 0xea, 0x9, 0x9a, 0x79, 
0x41, 0xa2, 0x7a, 0x99, 0xa1, 0x42, 0xd1, 0x32, 0xa, 0xe9, 0xa7, 0x44, 0x7c, 0x9f, 0xc, 0xef, 0xd7, 0x34, 0xec, 0xf, 0x37, 0xd4, 0x47, 0xa4, 0x9c, 0x7f, 0x62, 0x81, 0xb9, 0x5a, 0xc9, 0x2a, 0x12, 0xf1, 0x29, 0xca, 0xf2, 0x11, 0x82, 0x61, 0x59, 0xba, 0xf4, 0x17, 0x2f, 0xcc, 0x5f, 0xbc, 0x84, 0x67, 0xbf, 0x5c, 0x64, 0x87, 0x14, 0xf7, 0xcf, 0x2c, 0x53, 0xb0, 0x88, 0x6b, 0xf8, 0x1b, 0x23, 0xc0, 0x18, 0xfb, 0xc3, 0x20, 0xb3, 0x50, 0x68, 0x8b, 0xc5, 0x26, 0x1e, 0xfd, 0x6e, 0x8d, 0xb5, 0x56, 0x8e, 0x6d, 0x55, 0xb6, 0x25, 0xc6, 0xfe, 0x1d, 0xc4, 0x27, 0x1f, 0xfc, 0x6f, 0x8c, 0xb4, 0x57, 0x8f, 0x6c, 0x54, 0xb7, 0x24, 0xc7, 0xff, 0x1c, 0x52, 0xb1, 0x89, 0x6a, 0xf9, 0x1a, 0x22, 0xc1, 0x19, 0xfa, 0xc2, 0x21, 0xb2, 0x51, 0x69, 0x8a, 0xf5, 0x16, 0x2e, 0xcd, 0x5e, 0xbd, 0x85, 0x66, 0xbe, 0x5d, 0x65, 0x86, 0x15, 0xf6, 0xce, 0x2d, 0x63, 0x80, 0xb8, 0x5b, 0xc8, 0x2b, 0x13, 0xf0, 0x28, 0xcb, 0xf3, 0x10, 0x83, 0x60, 0x58, 0xbb, 0xa6, 0x45, 0x7d, 0x9e, 0xd, 0xee, 0xd6, 0x35, 0xed, 0xe, 0x36, 0xd5, 0x46, 0xa5, 0x9d, 0x7e, 0x30, 0xd3, 0xeb, 0x8, 0x9b, 0x78, 0x40, 0xa3, 0x7b, 0x98, 0xa0, 0x43, 0xd0, 0x33, 0xb, 0xe8, 0x97, 0x74, 0x4c, 0xaf, 0x3c, 0xdf, 0xe7, 0x4, 0xdc, 0x3f, 0x7, 0xe4, 0x77, 0x94, 0xac, 0x4f, 0x1, 0xe2, 0xda, 0x39, 0xaa, 0x49, 0x71, 0x92, 0x4a, 0xa9, 0x91, 0x72, 0xe1, 0x2, 0x3a, 0xd9}, {0x0, 0xe4, 0xd5, 0x31, 0xb7, 0x53, 0x62, 0x86, 0x73, 0x97, 0xa6, 0x42, 0xc4, 0x20, 0x11, 0xf5, 0xe6, 0x2, 0x33, 0xd7, 0x51, 0xb5, 0x84, 0x60, 0x95, 0x71, 0x40, 0xa4, 0x22, 0xc6, 0xf7, 0x13, 0xd1, 0x35, 0x4, 0xe0, 0x66, 0x82, 0xb3, 0x57, 0xa2, 0x46, 0x77, 0x93, 0x15, 0xf1, 0xc0, 0x24, 0x37, 0xd3, 0xe2, 0x6, 0x80, 0x64, 0x55, 0xb1, 0x44, 0xa0, 0x91, 0x75, 0xf3, 0x17, 0x26, 0xc2, 0xbf, 0x5b, 0x6a, 0x8e, 0x8, 0xec, 0xdd, 0x39, 0xcc, 0x28, 0x19, 0xfd, 0x7b, 0x9f, 0xae, 0x4a, 0x59, 0xbd, 0x8c, 0x68, 0xee, 0xa, 0x3b, 0xdf, 0x2a, 0xce, 0xff, 0x1b, 0x9d, 0x79, 0x48, 0xac, 0x6e, 0x8a, 0xbb, 0x5f, 0xd9, 0x3d, 0xc, 0xe8, 0x1d, 0xf9, 0xc8, 0x2c, 0xaa, 0x4e, 0x7f, 0x9b, 0x88, 0x6c, 0x5d, 0xb9, 0x3f, 0xdb, 0xea, 0xe, 0xfb, 0x1f, 0x2e, 0xca, 0x4c, 0xa8, 0x99, 0x7d, 0x63, 0x87, 0xb6, 0x52, 0xd4, 0x30, 0x1, 0xe5, 0x10, 0xf4, 0xc5, 0x21, 0xa7, 0x43, 0x72, 0x96, 0x85, 0x61, 0x50, 0xb4, 0x32, 0xd6, 0xe7, 0x3, 0xf6, 0x12, 0x23, 0xc7, 0x41, 0xa5, 0x94, 0x70, 0xb2, 0x56, 0x67, 0x83, 0x5, 0xe1, 0xd0, 0x34, 0xc1, 0x25, 0x14, 0xf0, 0x76, 0x92, 0xa3, 0x47, 0x54, 0xb0, 0x81, 0x65, 0xe3, 0x7, 0x36, 0xd2, 0x27, 0xc3, 0xf2, 0x16, 0x90, 0x74, 0x45, 0xa1, 0xdc, 0x38, 0x9, 0xed, 0x6b, 0x8f, 0xbe, 0x5a, 0xaf, 0x4b, 0x7a, 0x9e, 0x18, 0xfc, 0xcd, 0x29, 0x3a, 0xde, 0xef, 0xb, 0x8d, 0x69, 0x58, 0xbc, 0x49, 0xad, 0x9c, 0x78, 0xfe, 0x1a, 0x2b, 0xcf, 0xd, 0xe9, 0xd8, 0x3c, 0xba, 0x5e, 0x6f, 0x8b, 0x7e, 0x9a, 0xab, 0x4f, 0xc9, 0x2d, 0x1c, 0xf8, 0xeb, 0xf, 0x3e, 0xda, 0x5c, 0xb8, 0x89, 0x6d, 0x98, 0x7c, 0x4d, 0xa9, 0x2f, 0xcb, 0xfa, 0x1e}, {0x0, 0xe5, 0xd7, 0x32, 0xb3, 0x56, 0x64, 0x81, 0x7b, 0x9e, 0xac, 0x49, 0xc8, 0x2d, 0x1f, 0xfa, 0xf6, 0x13, 0x21, 0xc4, 0x45, 0xa0, 0x92, 0x77, 0x8d, 0x68, 0x5a, 0xbf, 0x3e, 0xdb, 0xe9, 0xc, 0xf1, 0x14, 0x26, 0xc3, 0x42, 0xa7, 0x95, 0x70, 0x8a, 0x6f, 0x5d, 0xb8, 0x39, 0xdc, 0xee, 0xb, 0x7, 0xe2, 0xd0, 0x35, 0xb4, 0x51, 0x63, 0x86, 0x7c, 0x99, 0xab, 0x4e, 0xcf, 0x2a, 0x18, 0xfd, 0xff, 0x1a, 0x28, 0xcd, 0x4c, 0xa9, 0x9b, 0x7e, 0x84, 0x61, 0x53, 0xb6, 0x37, 0xd2, 0xe0, 0x5, 0x9, 0xec, 0xde, 0x3b, 0xba, 0x5f, 0x6d, 0x88, 0x72, 0x97, 0xa5, 0x40, 0xc1, 0x24, 0x16, 0xf3, 0xe, 0xeb, 0xd9, 0x3c, 0xbd, 0x58, 0x6a, 0x8f, 0x75, 0x90, 0xa2, 0x47, 0xc6, 0x23, 0x11, 0xf4, 0xf8, 0x1d, 0x2f, 0xca, 0x4b, 0xae, 0x9c, 0x79, 0x83, 0x66, 0x54, 
0xb1, 0x30, 0xd5, 0xe7, 0x2, 0xe3, 0x6, 0x34, 0xd1, 0x50, 0xb5, 0x87, 0x62, 0x98, 0x7d, 0x4f, 0xaa, 0x2b, 0xce, 0xfc, 0x19, 0x15, 0xf0, 0xc2, 0x27, 0xa6, 0x43, 0x71, 0x94, 0x6e, 0x8b, 0xb9, 0x5c, 0xdd, 0x38, 0xa, 0xef, 0x12, 0xf7, 0xc5, 0x20, 0xa1, 0x44, 0x76, 0x93, 0x69, 0x8c, 0xbe, 0x5b, 0xda, 0x3f, 0xd, 0xe8, 0xe4, 0x1, 0x33, 0xd6, 0x57, 0xb2, 0x80, 0x65, 0x9f, 0x7a, 0x48, 0xad, 0x2c, 0xc9, 0xfb, 0x1e, 0x1c, 0xf9, 0xcb, 0x2e, 0xaf, 0x4a, 0x78, 0x9d, 0x67, 0x82, 0xb0, 0x55, 0xd4, 0x31, 0x3, 0xe6, 0xea, 0xf, 0x3d, 0xd8, 0x59, 0xbc, 0x8e, 0x6b, 0x91, 0x74, 0x46, 0xa3, 0x22, 0xc7, 0xf5, 0x10, 0xed, 0x8, 0x3a, 0xdf, 0x5e, 0xbb, 0x89, 0x6c, 0x96, 0x73, 0x41, 0xa4, 0x25, 0xc0, 0xf2, 0x17, 0x1b, 0xfe, 0xcc, 0x29, 0xa8, 0x4d, 0x7f, 0x9a, 0x60, 0x85, 0xb7, 0x52, 0xd3, 0x36, 0x4, 0xe1}, {0x0, 0xe6, 0xd1, 0x37, 0xbf, 0x59, 0x6e, 0x88, 0x63, 0x85, 0xb2, 0x54, 0xdc, 0x3a, 0xd, 0xeb, 0xc6, 0x20, 0x17, 0xf1, 0x79, 0x9f, 0xa8, 0x4e, 0xa5, 0x43, 0x74, 0x92, 0x1a, 0xfc, 0xcb, 0x2d, 0x91, 0x77, 0x40, 0xa6, 0x2e, 0xc8, 0xff, 0x19, 0xf2, 0x14, 0x23, 0xc5, 0x4d, 0xab, 0x9c, 0x7a, 0x57, 0xb1, 0x86, 0x60, 0xe8, 0xe, 0x39, 0xdf, 0x34, 0xd2, 0xe5, 0x3, 0x8b, 0x6d, 0x5a, 0xbc, 0x3f, 0xd9, 0xee, 0x8, 0x80, 0x66, 0x51, 0xb7, 0x5c, 0xba, 0x8d, 0x6b, 0xe3, 0x5, 0x32, 0xd4, 0xf9, 0x1f, 0x28, 0xce, 0x46, 0xa0, 0x97, 0x71, 0x9a, 0x7c, 0x4b, 0xad, 0x25, 0xc3, 0xf4, 0x12, 0xae, 0x48, 0x7f, 0x99, 0x11, 0xf7, 0xc0, 0x26, 0xcd, 0x2b, 0x1c, 0xfa, 0x72, 0x94, 0xa3, 0x45, 0x68, 0x8e, 0xb9, 0x5f, 0xd7, 0x31, 0x6, 0xe0, 0xb, 0xed, 0xda, 0x3c, 0xb4, 0x52, 0x65, 0x83, 0x7e, 0x98, 0xaf, 0x49, 0xc1, 0x27, 0x10, 0xf6, 0x1d, 0xfb, 0xcc, 0x2a, 0xa2, 0x44, 0x73, 0x95, 0xb8, 0x5e, 0x69, 0x8f, 0x7, 0xe1, 0xd6, 0x30, 0xdb, 0x3d, 0xa, 0xec, 0x64, 0x82, 0xb5, 0x53, 0xef, 0x9, 0x3e, 0xd8, 0x50, 0xb6, 0x81, 0x67, 0x8c, 0x6a, 0x5d, 0xbb, 0x33, 0xd5, 0xe2, 0x4, 0x29, 0xcf, 0xf8, 0x1e, 0x96, 0x70, 0x47, 0xa1, 0x4a, 0xac, 0x9b, 0x7d, 0xf5, 0x13, 0x24, 0xc2, 0x41, 0xa7, 0x90, 0x76, 0xfe, 0x18, 0x2f, 0xc9, 0x22, 0xc4, 0xf3, 0x15, 0x9d, 0x7b, 0x4c, 0xaa, 0x87, 0x61, 0x56, 0xb0, 0x38, 0xde, 0xe9, 0xf, 0xe4, 0x2, 0x35, 0xd3, 0x5b, 0xbd, 0x8a, 0x6c, 0xd0, 0x36, 0x1, 0xe7, 0x6f, 0x89, 0xbe, 0x58, 0xb3, 0x55, 0x62, 0x84, 0xc, 0xea, 0xdd, 0x3b, 0x16, 0xf0, 0xc7, 0x21, 0xa9, 0x4f, 0x78, 0x9e, 0x75, 0x93, 0xa4, 0x42, 0xca, 0x2c, 0x1b, 0xfd}, {0x0, 0xe7, 0xd3, 0x34, 0xbb, 0x5c, 0x68, 0x8f, 0x6b, 0x8c, 0xb8, 0x5f, 0xd0, 0x37, 0x3, 0xe4, 0xd6, 0x31, 0x5, 0xe2, 0x6d, 0x8a, 0xbe, 0x59, 0xbd, 0x5a, 0x6e, 0x89, 0x6, 0xe1, 0xd5, 0x32, 0xb1, 0x56, 0x62, 0x85, 0xa, 0xed, 0xd9, 0x3e, 0xda, 0x3d, 0x9, 0xee, 0x61, 0x86, 0xb2, 0x55, 0x67, 0x80, 0xb4, 0x53, 0xdc, 0x3b, 0xf, 0xe8, 0xc, 0xeb, 0xdf, 0x38, 0xb7, 0x50, 0x64, 0x83, 0x7f, 0x98, 0xac, 0x4b, 0xc4, 0x23, 0x17, 0xf0, 0x14, 0xf3, 0xc7, 0x20, 0xaf, 0x48, 0x7c, 0x9b, 0xa9, 0x4e, 0x7a, 0x9d, 0x12, 0xf5, 0xc1, 0x26, 0xc2, 0x25, 0x11, 0xf6, 0x79, 0x9e, 0xaa, 0x4d, 0xce, 0x29, 0x1d, 0xfa, 0x75, 0x92, 0xa6, 0x41, 0xa5, 0x42, 0x76, 0x91, 0x1e, 0xf9, 0xcd, 0x2a, 0x18, 0xff, 0xcb, 0x2c, 0xa3, 0x44, 0x70, 0x97, 0x73, 0x94, 0xa0, 0x47, 0xc8, 0x2f, 0x1b, 0xfc, 0xfe, 0x19, 0x2d, 0xca, 0x45, 0xa2, 0x96, 0x71, 0x95, 0x72, 0x46, 0xa1, 0x2e, 0xc9, 0xfd, 0x1a, 0x28, 0xcf, 0xfb, 0x1c, 0x93, 0x74, 0x40, 0xa7, 0x43, 0xa4, 0x90, 0x77, 0xf8, 0x1f, 0x2b, 0xcc, 0x4f, 0xa8, 0x9c, 0x7b, 0xf4, 0x13, 0x27, 0xc0, 0x24, 0xc3, 0xf7, 0x10, 0x9f, 0x78, 0x4c, 0xab, 0x99, 0x7e, 0x4a, 0xad, 0x22, 0xc5, 0xf1, 0x16, 0xf2, 0x15, 0x21, 0xc6, 0x49, 0xae, 0x9a, 0x7d, 0x81, 0x66, 0x52, 0xb5, 0x3a, 0xdd, 0xe9, 0xe, 0xea, 0xd, 0x39, 0xde, 0x51, 0xb6, 0x82, 0x65, 
0x57, 0xb0, 0x84, 0x63, 0xec, 0xb, 0x3f, 0xd8, 0x3c, 0xdb, 0xef, 0x8, 0x87, 0x60, 0x54, 0xb3, 0x30, 0xd7, 0xe3, 0x4, 0x8b, 0x6c, 0x58, 0xbf, 0x5b, 0xbc, 0x88, 0x6f, 0xe0, 0x7, 0x33, 0xd4, 0xe6, 0x1, 0x35, 0xd2, 0x5d, 0xba, 0x8e, 0x69, 0x8d, 0x6a, 0x5e, 0xb9, 0x36, 0xd1, 0xe5, 0x2}, {0x0, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1, 0x26, 0xce, 0xeb, 0x3, 0xa1, 0x49, 0x6c, 0x84, 0x35, 0xdd, 0xf8, 0x10, 0xb2, 0x5a, 0x7f, 0x97, 0x4c, 0xa4, 0x81, 0x69, 0xcb, 0x23, 0x6, 0xee, 0x5f, 0xb7, 0x92, 0x7a, 0xd8, 0x30, 0x15, 0xfd, 0x6a, 0x82, 0xa7, 0x4f, 0xed, 0x5, 0x20, 0xc8, 0x79, 0x91, 0xb4, 0x5c, 0xfe, 0x16, 0x33, 0xdb, 0x98, 0x70, 0x55, 0xbd, 0x1f, 0xf7, 0xd2, 0x3a, 0x8b, 0x63, 0x46, 0xae, 0xc, 0xe4, 0xc1, 0x29, 0xbe, 0x56, 0x73, 0x9b, 0x39, 0xd1, 0xf4, 0x1c, 0xad, 0x45, 0x60, 0x88, 0x2a, 0xc2, 0xe7, 0xf, 0xd4, 0x3c, 0x19, 0xf1, 0x53, 0xbb, 0x9e, 0x76, 0xc7, 0x2f, 0xa, 0xe2, 0x40, 0xa8, 0x8d, 0x65, 0xf2, 0x1a, 0x3f, 0xd7, 0x75, 0x9d, 0xb8, 0x50, 0xe1, 0x9, 0x2c, 0xc4, 0x66, 0x8e, 0xab, 0x43, 0x2d, 0xc5, 0xe0, 0x8, 0xaa, 0x42, 0x67, 0x8f, 0x3e, 0xd6, 0xf3, 0x1b, 0xb9, 0x51, 0x74, 0x9c, 0xb, 0xe3, 0xc6, 0x2e, 0x8c, 0x64, 0x41, 0xa9, 0x18, 0xf0, 0xd5, 0x3d, 0x9f, 0x77, 0x52, 0xba, 0x61, 0x89, 0xac, 0x44, 0xe6, 0xe, 0x2b, 0xc3, 0x72, 0x9a, 0xbf, 0x57, 0xf5, 0x1d, 0x38, 0xd0, 0x47, 0xaf, 0x8a, 0x62, 0xc0, 0x28, 0xd, 0xe5, 0x54, 0xbc, 0x99, 0x71, 0xd3, 0x3b, 0x1e, 0xf6, 0xb5, 0x5d, 0x78, 0x90, 0x32, 0xda, 0xff, 0x17, 0xa6, 0x4e, 0x6b, 0x83, 0x21, 0xc9, 0xec, 0x4, 0x93, 0x7b, 0x5e, 0xb6, 0x14, 0xfc, 0xd9, 0x31, 0x80, 0x68, 0x4d, 0xa5, 0x7, 0xef, 0xca, 0x22, 0xf9, 0x11, 0x34, 0xdc, 0x7e, 0x96, 0xb3, 0x5b, 0xea, 0x2, 0x27, 0xcf, 0x6d, 0x85, 0xa0, 0x48, 0xdf, 0x37, 0x12, 0xfa, 0x58, 0xb0, 0x95, 0x7d, 0xcc, 0x24, 0x1, 0xe9, 0x4b, 0xa3, 0x86, 0x6e}, {0x0, 0xe9, 0xcf, 0x26, 0x83, 0x6a, 0x4c, 0xa5, 0x1b, 0xf2, 0xd4, 0x3d, 0x98, 0x71, 0x57, 0xbe, 0x36, 0xdf, 0xf9, 0x10, 0xb5, 0x5c, 0x7a, 0x93, 0x2d, 0xc4, 0xe2, 0xb, 0xae, 0x47, 0x61, 0x88, 0x6c, 0x85, 0xa3, 0x4a, 0xef, 0x6, 0x20, 0xc9, 0x77, 0x9e, 0xb8, 0x51, 0xf4, 0x1d, 0x3b, 0xd2, 0x5a, 0xb3, 0x95, 0x7c, 0xd9, 0x30, 0x16, 0xff, 0x41, 0xa8, 0x8e, 0x67, 0xc2, 0x2b, 0xd, 0xe4, 0xd8, 0x31, 0x17, 0xfe, 0x5b, 0xb2, 0x94, 0x7d, 0xc3, 0x2a, 0xc, 0xe5, 0x40, 0xa9, 0x8f, 0x66, 0xee, 0x7, 0x21, 0xc8, 0x6d, 0x84, 0xa2, 0x4b, 0xf5, 0x1c, 0x3a, 0xd3, 0x76, 0x9f, 0xb9, 0x50, 0xb4, 0x5d, 0x7b, 0x92, 0x37, 0xde, 0xf8, 0x11, 0xaf, 0x46, 0x60, 0x89, 0x2c, 0xc5, 0xe3, 0xa, 0x82, 0x6b, 0x4d, 0xa4, 0x1, 0xe8, 0xce, 0x27, 0x99, 0x70, 0x56, 0xbf, 0x1a, 0xf3, 0xd5, 0x3c, 0xad, 0x44, 0x62, 0x8b, 0x2e, 0xc7, 0xe1, 0x8, 0xb6, 0x5f, 0x79, 0x90, 0x35, 0xdc, 0xfa, 0x13, 0x9b, 0x72, 0x54, 0xbd, 0x18, 0xf1, 0xd7, 0x3e, 0x80, 0x69, 0x4f, 0xa6, 0x3, 0xea, 0xcc, 0x25, 0xc1, 0x28, 0xe, 0xe7, 0x42, 0xab, 0x8d, 0x64, 0xda, 0x33, 0x15, 0xfc, 0x59, 0xb0, 0x96, 0x7f, 0xf7, 0x1e, 0x38, 0xd1, 0x74, 0x9d, 0xbb, 0x52, 0xec, 0x5, 0x23, 0xca, 0x6f, 0x86, 0xa0, 0x49, 0x75, 0x9c, 0xba, 0x53, 0xf6, 0x1f, 0x39, 0xd0, 0x6e, 0x87, 0xa1, 0x48, 0xed, 0x4, 0x22, 0xcb, 0x43, 0xaa, 0x8c, 0x65, 0xc0, 0x29, 0xf, 0xe6, 0x58, 0xb1, 0x97, 0x7e, 0xdb, 0x32, 0x14, 0xfd, 0x19, 0xf0, 0xd6, 0x3f, 0x9a, 0x73, 0x55, 0xbc, 0x2, 0xeb, 0xcd, 0x24, 0x81, 0x68, 0x4e, 0xa7, 0x2f, 0xc6, 0xe0, 0x9, 0xac, 0x45, 0x63, 0x8a, 0x34, 0xdd, 0xfb, 0x12, 0xb7, 0x5e, 0x78, 0x91}, {0x0, 0xea, 0xc9, 0x23, 0x8f, 0x65, 0x46, 0xac, 0x3, 0xe9, 0xca, 0x20, 0x8c, 0x66, 0x45, 0xaf, 0x6, 0xec, 0xcf, 0x25, 0x89, 0x63, 0x40, 0xaa, 0x5, 0xef, 0xcc, 0x26, 0x8a, 0x60, 0x43, 0xa9, 0xc, 0xe6, 0xc5, 0x2f, 0x83, 0x69, 
0x4a, 0xa0, 0xf, 0xe5, 0xc6, 0x2c, 0x80, 0x6a, 0x49, 0xa3, 0xa, 0xe0, 0xc3, 0x29, 0x85, 0x6f, 0x4c, 0xa6, 0x9, 0xe3, 0xc0, 0x2a, 0x86, 0x6c, 0x4f, 0xa5, 0x18, 0xf2, 0xd1, 0x3b, 0x97, 0x7d, 0x5e, 0xb4, 0x1b, 0xf1, 0xd2, 0x38, 0x94, 0x7e, 0x5d, 0xb7, 0x1e, 0xf4, 0xd7, 0x3d, 0x91, 0x7b, 0x58, 0xb2, 0x1d, 0xf7, 0xd4, 0x3e, 0x92, 0x78, 0x5b, 0xb1, 0x14, 0xfe, 0xdd, 0x37, 0x9b, 0x71, 0x52, 0xb8, 0x17, 0xfd, 0xde, 0x34, 0x98, 0x72, 0x51, 0xbb, 0x12, 0xf8, 0xdb, 0x31, 0x9d, 0x77, 0x54, 0xbe, 0x11, 0xfb, 0xd8, 0x32, 0x9e, 0x74, 0x57, 0xbd, 0x30, 0xda, 0xf9, 0x13, 0xbf, 0x55, 0x76, 0x9c, 0x33, 0xd9, 0xfa, 0x10, 0xbc, 0x56, 0x75, 0x9f, 0x36, 0xdc, 0xff, 0x15, 0xb9, 0x53, 0x70, 0x9a, 0x35, 0xdf, 0xfc, 0x16, 0xba, 0x50, 0x73, 0x99, 0x3c, 0xd6, 0xf5, 0x1f, 0xb3, 0x59, 0x7a, 0x90, 0x3f, 0xd5, 0xf6, 0x1c, 0xb0, 0x5a, 0x79, 0x93, 0x3a, 0xd0, 0xf3, 0x19, 0xb5, 0x5f, 0x7c, 0x96, 0x39, 0xd3, 0xf0, 0x1a, 0xb6, 0x5c, 0x7f, 0x95, 0x28, 0xc2, 0xe1, 0xb, 0xa7, 0x4d, 0x6e, 0x84, 0x2b, 0xc1, 0xe2, 0x8, 0xa4, 0x4e, 0x6d, 0x87, 0x2e, 0xc4, 0xe7, 0xd, 0xa1, 0x4b, 0x68, 0x82, 0x2d, 0xc7, 0xe4, 0xe, 0xa2, 0x48, 0x6b, 0x81, 0x24, 0xce, 0xed, 0x7, 0xab, 0x41, 0x62, 0x88, 0x27, 0xcd, 0xee, 0x4, 0xa8, 0x42, 0x61, 0x8b, 0x22, 0xc8, 0xeb, 0x1, 0xad, 0x47, 0x64, 0x8e, 0x21, 0xcb, 0xe8, 0x2, 0xae, 0x44, 0x67, 0x8d}, {0x0, 0xeb, 0xcb, 0x20, 0x8b, 0x60, 0x40, 0xab, 0xb, 0xe0, 0xc0, 0x2b, 0x80, 0x6b, 0x4b, 0xa0, 0x16, 0xfd, 0xdd, 0x36, 0x9d, 0x76, 0x56, 0xbd, 0x1d, 0xf6, 0xd6, 0x3d, 0x96, 0x7d, 0x5d, 0xb6, 0x2c, 0xc7, 0xe7, 0xc, 0xa7, 0x4c, 0x6c, 0x87, 0x27, 0xcc, 0xec, 0x7, 0xac, 0x47, 0x67, 0x8c, 0x3a, 0xd1, 0xf1, 0x1a, 0xb1, 0x5a, 0x7a, 0x91, 0x31, 0xda, 0xfa, 0x11, 0xba, 0x51, 0x71, 0x9a, 0x58, 0xb3, 0x93, 0x78, 0xd3, 0x38, 0x18, 0xf3, 0x53, 0xb8, 0x98, 0x73, 0xd8, 0x33, 0x13, 0xf8, 0x4e, 0xa5, 0x85, 0x6e, 0xc5, 0x2e, 0xe, 0xe5, 0x45, 0xae, 0x8e, 0x65, 0xce, 0x25, 0x5, 0xee, 0x74, 0x9f, 0xbf, 0x54, 0xff, 0x14, 0x34, 0xdf, 0x7f, 0x94, 0xb4, 0x5f, 0xf4, 0x1f, 0x3f, 0xd4, 0x62, 0x89, 0xa9, 0x42, 0xe9, 0x2, 0x22, 0xc9, 0x69, 0x82, 0xa2, 0x49, 0xe2, 0x9, 0x29, 0xc2, 0xb0, 0x5b, 0x7b, 0x90, 0x3b, 0xd0, 0xf0, 0x1b, 0xbb, 0x50, 0x70, 0x9b, 0x30, 0xdb, 0xfb, 0x10, 0xa6, 0x4d, 0x6d, 0x86, 0x2d, 0xc6, 0xe6, 0xd, 0xad, 0x46, 0x66, 0x8d, 0x26, 0xcd, 0xed, 0x6, 0x9c, 0x77, 0x57, 0xbc, 0x17, 0xfc, 0xdc, 0x37, 0x97, 0x7c, 0x5c, 0xb7, 0x1c, 0xf7, 0xd7, 0x3c, 0x8a, 0x61, 0x41, 0xaa, 0x1, 0xea, 0xca, 0x21, 0x81, 0x6a, 0x4a, 0xa1, 0xa, 0xe1, 0xc1, 0x2a, 0xe8, 0x3, 0x23, 0xc8, 0x63, 0x88, 0xa8, 0x43, 0xe3, 0x8, 0x28, 0xc3, 0x68, 0x83, 0xa3, 0x48, 0xfe, 0x15, 0x35, 0xde, 0x75, 0x9e, 0xbe, 0x55, 0xf5, 0x1e, 0x3e, 0xd5, 0x7e, 0x95, 0xb5, 0x5e, 0xc4, 0x2f, 0xf, 0xe4, 0x4f, 0xa4, 0x84, 0x6f, 0xcf, 0x24, 0x4, 0xef, 0x44, 0xaf, 0x8f, 0x64, 0xd2, 0x39, 0x19, 0xf2, 0x59, 0xb2, 0x92, 0x79, 0xd9, 0x32, 0x12, 0xf9, 0x52, 0xb9, 0x99, 0x72}, {0x0, 0xec, 0xc5, 0x29, 0x97, 0x7b, 0x52, 0xbe, 0x33, 0xdf, 0xf6, 0x1a, 0xa4, 0x48, 0x61, 0x8d, 0x66, 0x8a, 0xa3, 0x4f, 0xf1, 0x1d, 0x34, 0xd8, 0x55, 0xb9, 0x90, 0x7c, 0xc2, 0x2e, 0x7, 0xeb, 0xcc, 0x20, 0x9, 0xe5, 0x5b, 0xb7, 0x9e, 0x72, 0xff, 0x13, 0x3a, 0xd6, 0x68, 0x84, 0xad, 0x41, 0xaa, 0x46, 0x6f, 0x83, 0x3d, 0xd1, 0xf8, 0x14, 0x99, 0x75, 0x5c, 0xb0, 0xe, 0xe2, 0xcb, 0x27, 0x85, 0x69, 0x40, 0xac, 0x12, 0xfe, 0xd7, 0x3b, 0xb6, 0x5a, 0x73, 0x9f, 0x21, 0xcd, 0xe4, 0x8, 0xe3, 0xf, 0x26, 0xca, 0x74, 0x98, 0xb1, 0x5d, 0xd0, 0x3c, 0x15, 0xf9, 0x47, 0xab, 0x82, 0x6e, 0x49, 0xa5, 0x8c, 0x60, 0xde, 0x32, 0x1b, 0xf7, 0x7a, 0x96, 0xbf, 0x53, 0xed, 0x1, 0x28, 0xc4, 0x2f, 0xc3, 0xea, 0x6, 0xb8, 0x54, 0x7d, 0x91, 0x1c, 0xf0, 0xd9, 
0x35, 0x8b, 0x67, 0x4e, 0xa2, 0x17, 0xfb, 0xd2, 0x3e, 0x80, 0x6c, 0x45, 0xa9, 0x24, 0xc8, 0xe1, 0xd, 0xb3, 0x5f, 0x76, 0x9a, 0x71, 0x9d, 0xb4, 0x58, 0xe6, 0xa, 0x23, 0xcf, 0x42, 0xae, 0x87, 0x6b, 0xd5, 0x39, 0x10, 0xfc, 0xdb, 0x37, 0x1e, 0xf2, 0x4c, 0xa0, 0x89, 0x65, 0xe8, 0x4, 0x2d, 0xc1, 0x7f, 0x93, 0xba, 0x56, 0xbd, 0x51, 0x78, 0x94, 0x2a, 0xc6, 0xef, 0x3, 0x8e, 0x62, 0x4b, 0xa7, 0x19, 0xf5, 0xdc, 0x30, 0x92, 0x7e, 0x57, 0xbb, 0x5, 0xe9, 0xc0, 0x2c, 0xa1, 0x4d, 0x64, 0x88, 0x36, 0xda, 0xf3, 0x1f, 0xf4, 0x18, 0x31, 0xdd, 0x63, 0x8f, 0xa6, 0x4a, 0xc7, 0x2b, 0x2, 0xee, 0x50, 0xbc, 0x95, 0x79, 0x5e, 0xb2, 0x9b, 0x77, 0xc9, 0x25, 0xc, 0xe0, 0x6d, 0x81, 0xa8, 0x44, 0xfa, 0x16, 0x3f, 0xd3, 0x38, 0xd4, 0xfd, 0x11, 0xaf, 0x43, 0x6a, 0x86, 0xb, 0xe7, 0xce, 0x22, 0x9c, 0x70, 0x59, 0xb5}, {0x0, 0xed, 0xc7, 0x2a, 0x93, 0x7e, 0x54, 0xb9, 0x3b, 0xd6, 0xfc, 0x11, 0xa8, 0x45, 0x6f, 0x82, 0x76, 0x9b, 0xb1, 0x5c, 0xe5, 0x8, 0x22, 0xcf, 0x4d, 0xa0, 0x8a, 0x67, 0xde, 0x33, 0x19, 0xf4, 0xec, 0x1, 0x2b, 0xc6, 0x7f, 0x92, 0xb8, 0x55, 0xd7, 0x3a, 0x10, 0xfd, 0x44, 0xa9, 0x83, 0x6e, 0x9a, 0x77, 0x5d, 0xb0, 0x9, 0xe4, 0xce, 0x23, 0xa1, 0x4c, 0x66, 0x8b, 0x32, 0xdf, 0xf5, 0x18, 0xc5, 0x28, 0x2, 0xef, 0x56, 0xbb, 0x91, 0x7c, 0xfe, 0x13, 0x39, 0xd4, 0x6d, 0x80, 0xaa, 0x47, 0xb3, 0x5e, 0x74, 0x99, 0x20, 0xcd, 0xe7, 0xa, 0x88, 0x65, 0x4f, 0xa2, 0x1b, 0xf6, 0xdc, 0x31, 0x29, 0xc4, 0xee, 0x3, 0xba, 0x57, 0x7d, 0x90, 0x12, 0xff, 0xd5, 0x38, 0x81, 0x6c, 0x46, 0xab, 0x5f, 0xb2, 0x98, 0x75, 0xcc, 0x21, 0xb, 0xe6, 0x64, 0x89, 0xa3, 0x4e, 0xf7, 0x1a, 0x30, 0xdd, 0x97, 0x7a, 0x50, 0xbd, 0x4, 0xe9, 0xc3, 0x2e, 0xac, 0x41, 0x6b, 0x86, 0x3f, 0xd2, 0xf8, 0x15, 0xe1, 0xc, 0x26, 0xcb, 0x72, 0x9f, 0xb5, 0x58, 0xda, 0x37, 0x1d, 0xf0, 0x49, 0xa4, 0x8e, 0x63, 0x7b, 0x96, 0xbc, 0x51, 0xe8, 0x5, 0x2f, 0xc2, 0x40, 0xad, 0x87, 0x6a, 0xd3, 0x3e, 0x14, 0xf9, 0xd, 0xe0, 0xca, 0x27, 0x9e, 0x73, 0x59, 0xb4, 0x36, 0xdb, 0xf1, 0x1c, 0xa5, 0x48, 0x62, 0x8f, 0x52, 0xbf, 0x95, 0x78, 0xc1, 0x2c, 0x6, 0xeb, 0x69, 0x84, 0xae, 0x43, 0xfa, 0x17, 0x3d, 0xd0, 0x24, 0xc9, 0xe3, 0xe, 0xb7, 0x5a, 0x70, 0x9d, 0x1f, 0xf2, 0xd8, 0x35, 0x8c, 0x61, 0x4b, 0xa6, 0xbe, 0x53, 0x79, 0x94, 0x2d, 0xc0, 0xea, 0x7, 0x85, 0x68, 0x42, 0xaf, 0x16, 0xfb, 0xd1, 0x3c, 0xc8, 0x25, 0xf, 0xe2, 0x5b, 0xb6, 0x9c, 0x71, 0xf3, 0x1e, 0x34, 0xd9, 0x60, 0x8d, 0xa7, 0x4a}, {0x0, 0xee, 0xc1, 0x2f, 0x9f, 0x71, 0x5e, 0xb0, 0x23, 0xcd, 0xe2, 0xc, 0xbc, 0x52, 0x7d, 0x93, 0x46, 0xa8, 0x87, 0x69, 0xd9, 0x37, 0x18, 0xf6, 0x65, 0x8b, 0xa4, 0x4a, 0xfa, 0x14, 0x3b, 0xd5, 0x8c, 0x62, 0x4d, 0xa3, 0x13, 0xfd, 0xd2, 0x3c, 0xaf, 0x41, 0x6e, 0x80, 0x30, 0xde, 0xf1, 0x1f, 0xca, 0x24, 0xb, 0xe5, 0x55, 0xbb, 0x94, 0x7a, 0xe9, 0x7, 0x28, 0xc6, 0x76, 0x98, 0xb7, 0x59, 0x5, 0xeb, 0xc4, 0x2a, 0x9a, 0x74, 0x5b, 0xb5, 0x26, 0xc8, 0xe7, 0x9, 0xb9, 0x57, 0x78, 0x96, 0x43, 0xad, 0x82, 0x6c, 0xdc, 0x32, 0x1d, 0xf3, 0x60, 0x8e, 0xa1, 0x4f, 0xff, 0x11, 0x3e, 0xd0, 0x89, 0x67, 0x48, 0xa6, 0x16, 0xf8, 0xd7, 0x39, 0xaa, 0x44, 0x6b, 0x85, 0x35, 0xdb, 0xf4, 0x1a, 0xcf, 0x21, 0xe, 0xe0, 0x50, 0xbe, 0x91, 0x7f, 0xec, 0x2, 0x2d, 0xc3, 0x73, 0x9d, 0xb2, 0x5c, 0xa, 0xe4, 0xcb, 0x25, 0x95, 0x7b, 0x54, 0xba, 0x29, 0xc7, 0xe8, 0x6, 0xb6, 0x58, 0x77, 0x99, 0x4c, 0xa2, 0x8d, 0x63, 0xd3, 0x3d, 0x12, 0xfc, 0x6f, 0x81, 0xae, 0x40, 0xf0, 0x1e, 0x31, 0xdf, 0x86, 0x68, 0x47, 0xa9, 0x19, 0xf7, 0xd8, 0x36, 0xa5, 0x4b, 0x64, 0x8a, 0x3a, 0xd4, 0xfb, 0x15, 0xc0, 0x2e, 0x1, 0xef, 0x5f, 0xb1, 0x9e, 0x70, 0xe3, 0xd, 0x22, 0xcc, 0x7c, 0x92, 0xbd, 0x53, 0xf, 0xe1, 0xce, 0x20, 0x90, 0x7e, 0x51, 0xbf, 0x2c, 0xc2, 0xed, 0x3, 0xb3, 0x5d, 0x72, 0x9c, 0x49, 
0xa7, 0x88, 0x66, 0xd6, 0x38, 0x17, 0xf9, 0x6a, 0x84, 0xab, 0x45, 0xf5, 0x1b, 0x34, 0xda, 0x83, 0x6d, 0x42, 0xac, 0x1c, 0xf2, 0xdd, 0x33, 0xa0, 0x4e, 0x61, 0x8f, 0x3f, 0xd1, 0xfe, 0x10, 0xc5, 0x2b, 0x4, 0xea, 0x5a, 0xb4, 0x9b, 0x75, 0xe6, 0x8, 0x27, 0xc9, 0x79, 0x97, 0xb8, 0x56}, {0x0, 0xef, 0xc3, 0x2c, 0x9b, 0x74, 0x58, 0xb7, 0x2b, 0xc4, 0xe8, 0x7, 0xb0, 0x5f, 0x73, 0x9c, 0x56, 0xb9, 0x95, 0x7a, 0xcd, 0x22, 0xe, 0xe1, 0x7d, 0x92, 0xbe, 0x51, 0xe6, 0x9, 0x25, 0xca, 0xac, 0x43, 0x6f, 0x80, 0x37, 0xd8, 0xf4, 0x1b, 0x87, 0x68, 0x44, 0xab, 0x1c, 0xf3, 0xdf, 0x30, 0xfa, 0x15, 0x39, 0xd6, 0x61, 0x8e, 0xa2, 0x4d, 0xd1, 0x3e, 0x12, 0xfd, 0x4a, 0xa5, 0x89, 0x66, 0x45, 0xaa, 0x86, 0x69, 0xde, 0x31, 0x1d, 0xf2, 0x6e, 0x81, 0xad, 0x42, 0xf5, 0x1a, 0x36, 0xd9, 0x13, 0xfc, 0xd0, 0x3f, 0x88, 0x67, 0x4b, 0xa4, 0x38, 0xd7, 0xfb, 0x14, 0xa3, 0x4c, 0x60, 0x8f, 0xe9, 0x6, 0x2a, 0xc5, 0x72, 0x9d, 0xb1, 0x5e, 0xc2, 0x2d, 0x1, 0xee, 0x59, 0xb6, 0x9a, 0x75, 0xbf, 0x50, 0x7c, 0x93, 0x24, 0xcb, 0xe7, 0x8, 0x94, 0x7b, 0x57, 0xb8, 0xf, 0xe0, 0xcc, 0x23, 0x8a, 0x65, 0x49, 0xa6, 0x11, 0xfe, 0xd2, 0x3d, 0xa1, 0x4e, 0x62, 0x8d, 0x3a, 0xd5, 0xf9, 0x16, 0xdc, 0x33, 0x1f, 0xf0, 0x47, 0xa8, 0x84, 0x6b, 0xf7, 0x18, 0x34, 0xdb, 0x6c, 0x83, 0xaf, 0x40, 0x26, 0xc9, 0xe5, 0xa, 0xbd, 0x52, 0x7e, 0x91, 0xd, 0xe2, 0xce, 0x21, 0x96, 0x79, 0x55, 0xba, 0x70, 0x9f, 0xb3, 0x5c, 0xeb, 0x4, 0x28, 0xc7, 0x5b, 0xb4, 0x98, 0x77, 0xc0, 0x2f, 0x3, 0xec, 0xcf, 0x20, 0xc, 0xe3, 0x54, 0xbb, 0x97, 0x78, 0xe4, 0xb, 0x27, 0xc8, 0x7f, 0x90, 0xbc, 0x53, 0x99, 0x76, 0x5a, 0xb5, 0x2, 0xed, 0xc1, 0x2e, 0xb2, 0x5d, 0x71, 0x9e, 0x29, 0xc6, 0xea, 0x5, 0x63, 0x8c, 0xa0, 0x4f, 0xf8, 0x17, 0x3b, 0xd4, 0x48, 0xa7, 0x8b, 0x64, 0xd3, 0x3c, 0x10, 0xff, 0x35, 0xda, 0xf6, 0x19, 0xae, 0x41, 0x6d, 0x82, 0x1e, 0xf1, 0xdd, 0x32, 0x85, 0x6a, 0x46, 0xa9}, {0x0, 0xf0, 0xfd, 0xd, 0xe7, 0x17, 0x1a, 0xea, 0xd3, 0x23, 0x2e, 0xde, 0x34, 0xc4, 0xc9, 0x39, 0xbb, 0x4b, 0x46, 0xb6, 0x5c, 0xac, 0xa1, 0x51, 0x68, 0x98, 0x95, 0x65, 0x8f, 0x7f, 0x72, 0x82, 0x6b, 0x9b, 0x96, 0x66, 0x8c, 0x7c, 0x71, 0x81, 0xb8, 0x48, 0x45, 0xb5, 0x5f, 0xaf, 0xa2, 0x52, 0xd0, 0x20, 0x2d, 0xdd, 0x37, 0xc7, 0xca, 0x3a, 0x3, 0xf3, 0xfe, 0xe, 0xe4, 0x14, 0x19, 0xe9, 0xd6, 0x26, 0x2b, 0xdb, 0x31, 0xc1, 0xcc, 0x3c, 0x5, 0xf5, 0xf8, 0x8, 0xe2, 0x12, 0x1f, 0xef, 0x6d, 0x9d, 0x90, 0x60, 0x8a, 0x7a, 0x77, 0x87, 0xbe, 0x4e, 0x43, 0xb3, 0x59, 0xa9, 0xa4, 0x54, 0xbd, 0x4d, 0x40, 0xb0, 0x5a, 0xaa, 0xa7, 0x57, 0x6e, 0x9e, 0x93, 0x63, 0x89, 0x79, 0x74, 0x84, 0x6, 0xf6, 0xfb, 0xb, 0xe1, 0x11, 0x1c, 0xec, 0xd5, 0x25, 0x28, 0xd8, 0x32, 0xc2, 0xcf, 0x3f, 0xb1, 0x41, 0x4c, 0xbc, 0x56, 0xa6, 0xab, 0x5b, 0x62, 0x92, 0x9f, 0x6f, 0x85, 0x75, 0x78, 0x88, 0xa, 0xfa, 0xf7, 0x7, 0xed, 0x1d, 0x10, 0xe0, 0xd9, 0x29, 0x24, 0xd4, 0x3e, 0xce, 0xc3, 0x33, 0xda, 0x2a, 0x27, 0xd7, 0x3d, 0xcd, 0xc0, 0x30, 0x9, 0xf9, 0xf4, 0x4, 0xee, 0x1e, 0x13, 0xe3, 0x61, 0x91, 0x9c, 0x6c, 0x86, 0x76, 0x7b, 0x8b, 0xb2, 0x42, 0x4f, 0xbf, 0x55, 0xa5, 0xa8, 0x58, 0x67, 0x97, 0x9a, 0x6a, 0x80, 0x70, 0x7d, 0x8d, 0xb4, 0x44, 0x49, 0xb9, 0x53, 0xa3, 0xae, 0x5e, 0xdc, 0x2c, 0x21, 0xd1, 0x3b, 0xcb, 0xc6, 0x36, 0xf, 0xff, 0xf2, 0x2, 0xe8, 0x18, 0x15, 0xe5, 0xc, 0xfc, 0xf1, 0x1, 0xeb, 0x1b, 0x16, 0xe6, 0xdf, 0x2f, 0x22, 0xd2, 0x38, 0xc8, 0xc5, 0x35, 0xb7, 0x47, 0x4a, 0xba, 0x50, 0xa0, 0xad, 0x5d, 0x64, 0x94, 0x99, 0x69, 0x83, 0x73, 0x7e, 0x8e}, {0x0, 0xf1, 0xff, 0xe, 0xe3, 0x12, 0x1c, 0xed, 0xdb, 0x2a, 0x24, 0xd5, 0x38, 0xc9, 0xc7, 0x36, 0xab, 0x5a, 0x54, 0xa5, 0x48, 0xb9, 0xb7, 0x46, 0x70, 0x81, 0x8f, 0x7e, 0x93, 0x62, 0x6c, 0x9d, 0x4b, 0xba, 0xb4, 0x45, 0xa8, 0x59, 
0x57, 0xa6, 0x90, 0x61, 0x6f, 0x9e, 0x73, 0x82, 0x8c, 0x7d, 0xe0, 0x11, 0x1f, 0xee, 0x3, 0xf2, 0xfc, 0xd, 0x3b, 0xca, 0xc4, 0x35, 0xd8, 0x29, 0x27, 0xd6, 0x96, 0x67, 0x69, 0x98, 0x75, 0x84, 0x8a, 0x7b, 0x4d, 0xbc, 0xb2, 0x43, 0xae, 0x5f, 0x51, 0xa0, 0x3d, 0xcc, 0xc2, 0x33, 0xde, 0x2f, 0x21, 0xd0, 0xe6, 0x17, 0x19, 0xe8, 0x5, 0xf4, 0xfa, 0xb, 0xdd, 0x2c, 0x22, 0xd3, 0x3e, 0xcf, 0xc1, 0x30, 0x6, 0xf7, 0xf9, 0x8, 0xe5, 0x14, 0x1a, 0xeb, 0x76, 0x87, 0x89, 0x78, 0x95, 0x64, 0x6a, 0x9b, 0xad, 0x5c, 0x52, 0xa3, 0x4e, 0xbf, 0xb1, 0x40, 0x31, 0xc0, 0xce, 0x3f, 0xd2, 0x23, 0x2d, 0xdc, 0xea, 0x1b, 0x15, 0xe4, 0x9, 0xf8, 0xf6, 0x7, 0x9a, 0x6b, 0x65, 0x94, 0x79, 0x88, 0x86, 0x77, 0x41, 0xb0, 0xbe, 0x4f, 0xa2, 0x53, 0x5d, 0xac, 0x7a, 0x8b, 0x85, 0x74, 0x99, 0x68, 0x66, 0x97, 0xa1, 0x50, 0x5e, 0xaf, 0x42, 0xb3, 0xbd, 0x4c, 0xd1, 0x20, 0x2e, 0xdf, 0x32, 0xc3, 0xcd, 0x3c, 0xa, 0xfb, 0xf5, 0x4, 0xe9, 0x18, 0x16, 0xe7, 0xa7, 0x56, 0x58, 0xa9, 0x44, 0xb5, 0xbb, 0x4a, 0x7c, 0x8d, 0x83, 0x72, 0x9f, 0x6e, 0x60, 0x91, 0xc, 0xfd, 0xf3, 0x2, 0xef, 0x1e, 0x10, 0xe1, 0xd7, 0x26, 0x28, 0xd9, 0x34, 0xc5, 0xcb, 0x3a, 0xec, 0x1d, 0x13, 0xe2, 0xf, 0xfe, 0xf0, 0x1, 0x37, 0xc6, 0xc8, 0x39, 0xd4, 0x25, 0x2b, 0xda, 0x47, 0xb6, 0xb8, 0x49, 0xa4, 0x55, 0x5b, 0xaa, 0x9c, 0x6d, 0x63, 0x92, 0x7f, 0x8e, 0x80, 0x71}, {0x0, 0xf2, 0xf9, 0xb, 0xef, 0x1d, 0x16, 0xe4, 0xc3, 0x31, 0x3a, 0xc8, 0x2c, 0xde, 0xd5, 0x27, 0x9b, 0x69, 0x62, 0x90, 0x74, 0x86, 0x8d, 0x7f, 0x58, 0xaa, 0xa1, 0x53, 0xb7, 0x45, 0x4e, 0xbc, 0x2b, 0xd9, 0xd2, 0x20, 0xc4, 0x36, 0x3d, 0xcf, 0xe8, 0x1a, 0x11, 0xe3, 0x7, 0xf5, 0xfe, 0xc, 0xb0, 0x42, 0x49, 0xbb, 0x5f, 0xad, 0xa6, 0x54, 0x73, 0x81, 0x8a, 0x78, 0x9c, 0x6e, 0x65, 0x97, 0x56, 0xa4, 0xaf, 0x5d, 0xb9, 0x4b, 0x40, 0xb2, 0x95, 0x67, 0x6c, 0x9e, 0x7a, 0x88, 0x83, 0x71, 0xcd, 0x3f, 0x34, 0xc6, 0x22, 0xd0, 0xdb, 0x29, 0xe, 0xfc, 0xf7, 0x5, 0xe1, 0x13, 0x18, 0xea, 0x7d, 0x8f, 0x84, 0x76, 0x92, 0x60, 0x6b, 0x99, 0xbe, 0x4c, 0x47, 0xb5, 0x51, 0xa3, 0xa8, 0x5a, 0xe6, 0x14, 0x1f, 0xed, 0x9, 0xfb, 0xf0, 0x2, 0x25, 0xd7, 0xdc, 0x2e, 0xca, 0x38, 0x33, 0xc1, 0xac, 0x5e, 0x55, 0xa7, 0x43, 0xb1, 0xba, 0x48, 0x6f, 0x9d, 0x96, 0x64, 0x80, 0x72, 0x79, 0x8b, 0x37, 0xc5, 0xce, 0x3c, 0xd8, 0x2a, 0x21, 0xd3, 0xf4, 0x6, 0xd, 0xff, 0x1b, 0xe9, 0xe2, 0x10, 0x87, 0x75, 0x7e, 0x8c, 0x68, 0x9a, 0x91, 0x63, 0x44, 0xb6, 0xbd, 0x4f, 0xab, 0x59, 0x52, 0xa0, 0x1c, 0xee, 0xe5, 0x17, 0xf3, 0x1, 0xa, 0xf8, 0xdf, 0x2d, 0x26, 0xd4, 0x30, 0xc2, 0xc9, 0x3b, 0xfa, 0x8, 0x3, 0xf1, 0x15, 0xe7, 0xec, 0x1e, 0x39, 0xcb, 0xc0, 0x32, 0xd6, 0x24, 0x2f, 0xdd, 0x61, 0x93, 0x98, 0x6a, 0x8e, 0x7c, 0x77, 0x85, 0xa2, 0x50, 0x5b, 0xa9, 0x4d, 0xbf, 0xb4, 0x46, 0xd1, 0x23, 0x28, 0xda, 0x3e, 0xcc, 0xc7, 0x35, 0x12, 0xe0, 0xeb, 0x19, 0xfd, 0xf, 0x4, 0xf6, 0x4a, 0xb8, 0xb3, 0x41, 0xa5, 0x57, 0x5c, 0xae, 0x89, 0x7b, 0x70, 0x82, 0x66, 0x94, 0x9f, 0x6d}, {0x0, 0xf3, 0xfb, 0x8, 0xeb, 0x18, 0x10, 0xe3, 0xcb, 0x38, 0x30, 0xc3, 0x20, 0xd3, 0xdb, 0x28, 0x8b, 0x78, 0x70, 0x83, 0x60, 0x93, 0x9b, 0x68, 0x40, 0xb3, 0xbb, 0x48, 0xab, 0x58, 0x50, 0xa3, 0xb, 0xf8, 0xf0, 0x3, 0xe0, 0x13, 0x1b, 0xe8, 0xc0, 0x33, 0x3b, 0xc8, 0x2b, 0xd8, 0xd0, 0x23, 0x80, 0x73, 0x7b, 0x88, 0x6b, 0x98, 0x90, 0x63, 0x4b, 0xb8, 0xb0, 0x43, 0xa0, 0x53, 0x5b, 0xa8, 0x16, 0xe5, 0xed, 0x1e, 0xfd, 0xe, 0x6, 0xf5, 0xdd, 0x2e, 0x26, 0xd5, 0x36, 0xc5, 0xcd, 0x3e, 0x9d, 0x6e, 0x66, 0x95, 0x76, 0x85, 0x8d, 0x7e, 0x56, 0xa5, 0xad, 0x5e, 0xbd, 0x4e, 0x46, 0xb5, 0x1d, 0xee, 0xe6, 0x15, 0xf6, 0x5, 0xd, 0xfe, 0xd6, 0x25, 0x2d, 0xde, 0x3d, 0xce, 0xc6, 0x35, 0x96, 0x65, 0x6d, 0x9e, 0x7d, 0x8e, 0x86, 0x75, 0x5d, 0xae, 0xa6, 0x55, 
0xb6, 0x45, 0x4d, 0xbe, 0x2c, 0xdf, 0xd7, 0x24, 0xc7, 0x34, 0x3c, 0xcf, 0xe7, 0x14, 0x1c, 0xef, 0xc, 0xff, 0xf7, 0x4, 0xa7, 0x54, 0x5c, 0xaf, 0x4c, 0xbf, 0xb7, 0x44, 0x6c, 0x9f, 0x97, 0x64, 0x87, 0x74, 0x7c, 0x8f, 0x27, 0xd4, 0xdc, 0x2f, 0xcc, 0x3f, 0x37, 0xc4, 0xec, 0x1f, 0x17, 0xe4, 0x7, 0xf4, 0xfc, 0xf, 0xac, 0x5f, 0x57, 0xa4, 0x47, 0xb4, 0xbc, 0x4f, 0x67, 0x94, 0x9c, 0x6f, 0x8c, 0x7f, 0x77, 0x84, 0x3a, 0xc9, 0xc1, 0x32, 0xd1, 0x22, 0x2a, 0xd9, 0xf1, 0x2, 0xa, 0xf9, 0x1a, 0xe9, 0xe1, 0x12, 0xb1, 0x42, 0x4a, 0xb9, 0x5a, 0xa9, 0xa1, 0x52, 0x7a, 0x89, 0x81, 0x72, 0x91, 0x62, 0x6a, 0x99, 0x31, 0xc2, 0xca, 0x39, 0xda, 0x29, 0x21, 0xd2, 0xfa, 0x9, 0x1, 0xf2, 0x11, 0xe2, 0xea, 0x19, 0xba, 0x49, 0x41, 0xb2, 0x51, 0xa2, 0xaa, 0x59, 0x71, 0x82, 0x8a, 0x79, 0x9a, 0x69, 0x61, 0x92}, {0x0, 0xf4, 0xf5, 0x1, 0xf7, 0x3, 0x2, 0xf6, 0xf3, 0x7, 0x6, 0xf2, 0x4, 0xf0, 0xf1, 0x5, 0xfb, 0xf, 0xe, 0xfa, 0xc, 0xf8, 0xf9, 0xd, 0x8, 0xfc, 0xfd, 0x9, 0xff, 0xb, 0xa, 0xfe, 0xeb, 0x1f, 0x1e, 0xea, 0x1c, 0xe8, 0xe9, 0x1d, 0x18, 0xec, 0xed, 0x19, 0xef, 0x1b, 0x1a, 0xee, 0x10, 0xe4, 0xe5, 0x11, 0xe7, 0x13, 0x12, 0xe6, 0xe3, 0x17, 0x16, 0xe2, 0x14, 0xe0, 0xe1, 0x15, 0xcb, 0x3f, 0x3e, 0xca, 0x3c, 0xc8, 0xc9, 0x3d, 0x38, 0xcc, 0xcd, 0x39, 0xcf, 0x3b, 0x3a, 0xce, 0x30, 0xc4, 0xc5, 0x31, 0xc7, 0x33, 0x32, 0xc6, 0xc3, 0x37, 0x36, 0xc2, 0x34, 0xc0, 0xc1, 0x35, 0x20, 0xd4, 0xd5, 0x21, 0xd7, 0x23, 0x22, 0xd6, 0xd3, 0x27, 0x26, 0xd2, 0x24, 0xd0, 0xd1, 0x25, 0xdb, 0x2f, 0x2e, 0xda, 0x2c, 0xd8, 0xd9, 0x2d, 0x28, 0xdc, 0xdd, 0x29, 0xdf, 0x2b, 0x2a, 0xde, 0x8b, 0x7f, 0x7e, 0x8a, 0x7c, 0x88, 0x89, 0x7d, 0x78, 0x8c, 0x8d, 0x79, 0x8f, 0x7b, 0x7a, 0x8e, 0x70, 0x84, 0x85, 0x71, 0x87, 0x73, 0x72, 0x86, 0x83, 0x77, 0x76, 0x82, 0x74, 0x80, 0x81, 0x75, 0x60, 0x94, 0x95, 0x61, 0x97, 0x63, 0x62, 0x96, 0x93, 0x67, 0x66, 0x92, 0x64, 0x90, 0x91, 0x65, 0x9b, 0x6f, 0x6e, 0x9a, 0x6c, 0x98, 0x99, 0x6d, 0x68, 0x9c, 0x9d, 0x69, 0x9f, 0x6b, 0x6a, 0x9e, 0x40, 0xb4, 0xb5, 0x41, 0xb7, 0x43, 0x42, 0xb6, 0xb3, 0x47, 0x46, 0xb2, 0x44, 0xb0, 0xb1, 0x45, 0xbb, 0x4f, 0x4e, 0xba, 0x4c, 0xb8, 0xb9, 0x4d, 0x48, 0xbc, 0xbd, 0x49, 0xbf, 0x4b, 0x4a, 0xbe, 0xab, 0x5f, 0x5e, 0xaa, 0x5c, 0xa8, 0xa9, 0x5d, 0x58, 0xac, 0xad, 0x59, 0xaf, 0x5b, 0x5a, 0xae, 0x50, 0xa4, 0xa5, 0x51, 0xa7, 0x53, 0x52, 0xa6, 0xa3, 0x57, 0x56, 0xa2, 0x54, 0xa0, 0xa1, 0x55}, {0x0, 0xf5, 0xf7, 0x2, 0xf3, 0x6, 0x4, 0xf1, 0xfb, 0xe, 0xc, 0xf9, 0x8, 0xfd, 0xff, 0xa, 0xeb, 0x1e, 0x1c, 0xe9, 0x18, 0xed, 0xef, 0x1a, 0x10, 0xe5, 0xe7, 0x12, 0xe3, 0x16, 0x14, 0xe1, 0xcb, 0x3e, 0x3c, 0xc9, 0x38, 0xcd, 0xcf, 0x3a, 0x30, 0xc5, 0xc7, 0x32, 0xc3, 0x36, 0x34, 0xc1, 0x20, 0xd5, 0xd7, 0x22, 0xd3, 0x26, 0x24, 0xd1, 0xdb, 0x2e, 0x2c, 0xd9, 0x28, 0xdd, 0xdf, 0x2a, 0x8b, 0x7e, 0x7c, 0x89, 0x78, 0x8d, 0x8f, 0x7a, 0x70, 0x85, 0x87, 0x72, 0x83, 0x76, 0x74, 0x81, 0x60, 0x95, 0x97, 0x62, 0x93, 0x66, 0x64, 0x91, 0x9b, 0x6e, 0x6c, 0x99, 0x68, 0x9d, 0x9f, 0x6a, 0x40, 0xb5, 0xb7, 0x42, 0xb3, 0x46, 0x44, 0xb1, 0xbb, 0x4e, 0x4c, 0xb9, 0x48, 0xbd, 0xbf, 0x4a, 0xab, 0x5e, 0x5c, 0xa9, 0x58, 0xad, 0xaf, 0x5a, 0x50, 0xa5, 0xa7, 0x52, 0xa3, 0x56, 0x54, 0xa1, 0xb, 0xfe, 0xfc, 0x9, 0xf8, 0xd, 0xf, 0xfa, 0xf0, 0x5, 0x7, 0xf2, 0x3, 0xf6, 0xf4, 0x1, 0xe0, 0x15, 0x17, 0xe2, 0x13, 0xe6, 0xe4, 0x11, 0x1b, 0xee, 0xec, 0x19, 0xe8, 0x1d, 0x1f, 0xea, 0xc0, 0x35, 0x37, 0xc2, 0x33, 0xc6, 0xc4, 0x31, 0x3b, 0xce, 0xcc, 0x39, 0xc8, 0x3d, 0x3f, 0xca, 0x2b, 0xde, 0xdc, 0x29, 0xd8, 0x2d, 0x2f, 0xda, 0xd0, 0x25, 0x27, 0xd2, 0x23, 0xd6, 0xd4, 0x21, 0x80, 0x75, 0x77, 0x82, 0x73, 0x86, 0x84, 0x71, 0x7b, 0x8e, 0x8c, 0x79, 0x88, 0x7d, 0x7f, 0x8a, 0x6b, 0x9e, 
0x9c, 0x69, 0x98, 0x6d, 0x6f, 0x9a, 0x90, 0x65, 0x67, 0x92, 0x63, 0x96, 0x94, 0x61, 0x4b, 0xbe, 0xbc, 0x49, 0xb8, 0x4d, 0x4f, 0xba, 0xb0, 0x45, 0x47, 0xb2, 0x43, 0xb6, 0xb4, 0x41, 0xa0, 0x55, 0x57, 0xa2, 0x53, 0xa6, 0xa4, 0x51, 0x5b, 0xae, 0xac, 0x59, 0xa8, 0x5d, 0x5f, 0xaa}, {0x0, 0xf6, 0xf1, 0x7, 0xff, 0x9, 0xe, 0xf8, 0xe3, 0x15, 0x12, 0xe4, 0x1c, 0xea, 0xed, 0x1b, 0xdb, 0x2d, 0x2a, 0xdc, 0x24, 0xd2, 0xd5, 0x23, 0x38, 0xce, 0xc9, 0x3f, 0xc7, 0x31, 0x36, 0xc0, 0xab, 0x5d, 0x5a, 0xac, 0x54, 0xa2, 0xa5, 0x53, 0x48, 0xbe, 0xb9, 0x4f, 0xb7, 0x41, 0x46, 0xb0, 0x70, 0x86, 0x81, 0x77, 0x8f, 0x79, 0x7e, 0x88, 0x93, 0x65, 0x62, 0x94, 0x6c, 0x9a, 0x9d, 0x6b, 0x4b, 0xbd, 0xba, 0x4c, 0xb4, 0x42, 0x45, 0xb3, 0xa8, 0x5e, 0x59, 0xaf, 0x57, 0xa1, 0xa6, 0x50, 0x90, 0x66, 0x61, 0x97, 0x6f, 0x99, 0x9e, 0x68, 0x73, 0x85, 0x82, 0x74, 0x8c, 0x7a, 0x7d, 0x8b, 0xe0, 0x16, 0x11, 0xe7, 0x1f, 0xe9, 0xee, 0x18, 0x3, 0xf5, 0xf2, 0x4, 0xfc, 0xa, 0xd, 0xfb, 0x3b, 0xcd, 0xca, 0x3c, 0xc4, 0x32, 0x35, 0xc3, 0xd8, 0x2e, 0x29, 0xdf, 0x27, 0xd1, 0xd6, 0x20, 0x96, 0x60, 0x67, 0x91, 0x69, 0x9f, 0x98, 0x6e, 0x75, 0x83, 0x84, 0x72, 0x8a, 0x7c, 0x7b, 0x8d, 0x4d, 0xbb, 0xbc, 0x4a, 0xb2, 0x44, 0x43, 0xb5, 0xae, 0x58, 0x5f, 0xa9, 0x51, 0xa7, 0xa0, 0x56, 0x3d, 0xcb, 0xcc, 0x3a, 0xc2, 0x34, 0x33, 0xc5, 0xde, 0x28, 0x2f, 0xd9, 0x21, 0xd7, 0xd0, 0x26, 0xe6, 0x10, 0x17, 0xe1, 0x19, 0xef, 0xe8, 0x1e, 0x5, 0xf3, 0xf4, 0x2, 0xfa, 0xc, 0xb, 0xfd, 0xdd, 0x2b, 0x2c, 0xda, 0x22, 0xd4, 0xd3, 0x25, 0x3e, 0xc8, 0xcf, 0x39, 0xc1, 0x37, 0x30, 0xc6, 0x6, 0xf0, 0xf7, 0x1, 0xf9, 0xf, 0x8, 0xfe, 0xe5, 0x13, 0x14, 0xe2, 0x1a, 0xec, 0xeb, 0x1d, 0x76, 0x80, 0x87, 0x71, 0x89, 0x7f, 0x78, 0x8e, 0x95, 0x63, 0x64, 0x92, 0x6a, 0x9c, 0x9b, 0x6d, 0xad, 0x5b, 0x5c, 0xaa, 0x52, 0xa4, 0xa3, 0x55, 0x4e, 0xb8, 0xbf, 0x49, 0xb1, 0x47, 0x40, 0xb6}, {0x0, 0xf7, 0xf3, 0x4, 0xfb, 0xc, 0x8, 0xff, 0xeb, 0x1c, 0x18, 0xef, 0x10, 0xe7, 0xe3, 0x14, 0xcb, 0x3c, 0x38, 0xcf, 0x30, 0xc7, 0xc3, 0x34, 0x20, 0xd7, 0xd3, 0x24, 0xdb, 0x2c, 0x28, 0xdf, 0x8b, 0x7c, 0x78, 0x8f, 0x70, 0x87, 0x83, 0x74, 0x60, 0x97, 0x93, 0x64, 0x9b, 0x6c, 0x68, 0x9f, 0x40, 0xb7, 0xb3, 0x44, 0xbb, 0x4c, 0x48, 0xbf, 0xab, 0x5c, 0x58, 0xaf, 0x50, 0xa7, 0xa3, 0x54, 0xb, 0xfc, 0xf8, 0xf, 0xf0, 0x7, 0x3, 0xf4, 0xe0, 0x17, 0x13, 0xe4, 0x1b, 0xec, 0xe8, 0x1f, 0xc0, 0x37, 0x33, 0xc4, 0x3b, 0xcc, 0xc8, 0x3f, 0x2b, 0xdc, 0xd8, 0x2f, 0xd0, 0x27, 0x23, 0xd4, 0x80, 0x77, 0x73, 0x84, 0x7b, 0x8c, 0x88, 0x7f, 0x6b, 0x9c, 0x98, 0x6f, 0x90, 0x67, 0x63, 0x94, 0x4b, 0xbc, 0xb8, 0x4f, 0xb0, 0x47, 0x43, 0xb4, 0xa0, 0x57, 0x53, 0xa4, 0x5b, 0xac, 0xa8, 0x5f, 0x16, 0xe1, 0xe5, 0x12, 0xed, 0x1a, 0x1e, 0xe9, 0xfd, 0xa, 0xe, 0xf9, 0x6, 0xf1, 0xf5, 0x2, 0xdd, 0x2a, 0x2e, 0xd9, 0x26, 0xd1, 0xd5, 0x22, 0x36, 0xc1, 0xc5, 0x32, 0xcd, 0x3a, 0x3e, 0xc9, 0x9d, 0x6a, 0x6e, 0x99, 0x66, 0x91, 0x95, 0x62, 0x76, 0x81, 0x85, 0x72, 0x8d, 0x7a, 0x7e, 0x89, 0x56, 0xa1, 0xa5, 0x52, 0xad, 0x5a, 0x5e, 0xa9, 0xbd, 0x4a, 0x4e, 0xb9, 0x46, 0xb1, 0xb5, 0x42, 0x1d, 0xea, 0xee, 0x19, 0xe6, 0x11, 0x15, 0xe2, 0xf6, 0x1, 0x5, 0xf2, 0xd, 0xfa, 0xfe, 0x9, 0xd6, 0x21, 0x25, 0xd2, 0x2d, 0xda, 0xde, 0x29, 0x3d, 0xca, 0xce, 0x39, 0xc6, 0x31, 0x35, 0xc2, 0x96, 0x61, 0x65, 0x92, 0x6d, 0x9a, 0x9e, 0x69, 0x7d, 0x8a, 0x8e, 0x79, 0x86, 0x71, 0x75, 0x82, 0x5d, 0xaa, 0xae, 0x59, 0xa6, 0x51, 0x55, 0xa2, 0xb6, 0x41, 0x45, 0xb2, 0x4d, 0xba, 0xbe, 0x49}, {0x0, 0xf8, 0xed, 0x15, 0xc7, 0x3f, 0x2a, 0xd2, 0x93, 0x6b, 0x7e, 0x86, 0x54, 0xac, 0xb9, 0x41, 0x3b, 0xc3, 0xd6, 0x2e, 0xfc, 0x4, 0x11, 0xe9, 0xa8, 0x50, 0x45, 0xbd, 0x6f, 0x97, 0x82, 0x7a, 0x76, 0x8e, 0x9b, 0x63, 0xb1, 0x49, 0x5c, 
0xa4, 0xe5, 0x1d, 0x8, 0xf0, 0x22, 0xda, 0xcf, 0x37, 0x4d, 0xb5, 0xa0, 0x58, 0x8a, 0x72, 0x67, 0x9f, 0xde, 0x26, 0x33, 0xcb, 0x19, 0xe1, 0xf4, 0xc, 0xec, 0x14, 0x1, 0xf9, 0x2b, 0xd3, 0xc6, 0x3e, 0x7f, 0x87, 0x92, 0x6a, 0xb8, 0x40, 0x55, 0xad, 0xd7, 0x2f, 0x3a, 0xc2, 0x10, 0xe8, 0xfd, 0x5, 0x44, 0xbc, 0xa9, 0x51, 0x83, 0x7b, 0x6e, 0x96, 0x9a, 0x62, 0x77, 0x8f, 0x5d, 0xa5, 0xb0, 0x48, 0x9, 0xf1, 0xe4, 0x1c, 0xce, 0x36, 0x23, 0xdb, 0xa1, 0x59, 0x4c, 0xb4, 0x66, 0x9e, 0x8b, 0x73, 0x32, 0xca, 0xdf, 0x27, 0xf5, 0xd, 0x18, 0xe0, 0xc5, 0x3d, 0x28, 0xd0, 0x2, 0xfa, 0xef, 0x17, 0x56, 0xae, 0xbb, 0x43, 0x91, 0x69, 0x7c, 0x84, 0xfe, 0x6, 0x13, 0xeb, 0x39, 0xc1, 0xd4, 0x2c, 0x6d, 0x95, 0x80, 0x78, 0xaa, 0x52, 0x47, 0xbf, 0xb3, 0x4b, 0x5e, 0xa6, 0x74, 0x8c, 0x99, 0x61, 0x20, 0xd8, 0xcd, 0x35, 0xe7, 0x1f, 0xa, 0xf2, 0x88, 0x70, 0x65, 0x9d, 0x4f, 0xb7, 0xa2, 0x5a, 0x1b, 0xe3, 0xf6, 0xe, 0xdc, 0x24, 0x31, 0xc9, 0x29, 0xd1, 0xc4, 0x3c, 0xee, 0x16, 0x3, 0xfb, 0xba, 0x42, 0x57, 0xaf, 0x7d, 0x85, 0x90, 0x68, 0x12, 0xea, 0xff, 0x7, 0xd5, 0x2d, 0x38, 0xc0, 0x81, 0x79, 0x6c, 0x94, 0x46, 0xbe, 0xab, 0x53, 0x5f, 0xa7, 0xb2, 0x4a, 0x98, 0x60, 0x75, 0x8d, 0xcc, 0x34, 0x21, 0xd9, 0xb, 0xf3, 0xe6, 0x1e, 0x64, 0x9c, 0x89, 0x71, 0xa3, 0x5b, 0x4e, 0xb6, 0xf7, 0xf, 0x1a, 0xe2, 0x30, 0xc8, 0xdd, 0x25}, {0x0, 0xf9, 0xef, 0x16, 0xc3, 0x3a, 0x2c, 0xd5, 0x9b, 0x62, 0x74, 0x8d, 0x58, 0xa1, 0xb7, 0x4e, 0x2b, 0xd2, 0xc4, 0x3d, 0xe8, 0x11, 0x7, 0xfe, 0xb0, 0x49, 0x5f, 0xa6, 0x73, 0x8a, 0x9c, 0x65, 0x56, 0xaf, 0xb9, 0x40, 0x95, 0x6c, 0x7a, 0x83, 0xcd, 0x34, 0x22, 0xdb, 0xe, 0xf7, 0xe1, 0x18, 0x7d, 0x84, 0x92, 0x6b, 0xbe, 0x47, 0x51, 0xa8, 0xe6, 0x1f, 0x9, 0xf0, 0x25, 0xdc, 0xca, 0x33, 0xac, 0x55, 0x43, 0xba, 0x6f, 0x96, 0x80, 0x79, 0x37, 0xce, 0xd8, 0x21, 0xf4, 0xd, 0x1b, 0xe2, 0x87, 0x7e, 0x68, 0x91, 0x44, 0xbd, 0xab, 0x52, 0x1c, 0xe5, 0xf3, 0xa, 0xdf, 0x26, 0x30, 0xc9, 0xfa, 0x3, 0x15, 0xec, 0x39, 0xc0, 0xd6, 0x2f, 0x61, 0x98, 0x8e, 0x77, 0xa2, 0x5b, 0x4d, 0xb4, 0xd1, 0x28, 0x3e, 0xc7, 0x12, 0xeb, 0xfd, 0x4, 0x4a, 0xb3, 0xa5, 0x5c, 0x89, 0x70, 0x66, 0x9f, 0x45, 0xbc, 0xaa, 0x53, 0x86, 0x7f, 0x69, 0x90, 0xde, 0x27, 0x31, 0xc8, 0x1d, 0xe4, 0xf2, 0xb, 0x6e, 0x97, 0x81, 0x78, 0xad, 0x54, 0x42, 0xbb, 0xf5, 0xc, 0x1a, 0xe3, 0x36, 0xcf, 0xd9, 0x20, 0x13, 0xea, 0xfc, 0x5, 0xd0, 0x29, 0x3f, 0xc6, 0x88, 0x71, 0x67, 0x9e, 0x4b, 0xb2, 0xa4, 0x5d, 0x38, 0xc1, 0xd7, 0x2e, 0xfb, 0x2, 0x14, 0xed, 0xa3, 0x5a, 0x4c, 0xb5, 0x60, 0x99, 0x8f, 0x76, 0xe9, 0x10, 0x6, 0xff, 0x2a, 0xd3, 0xc5, 0x3c, 0x72, 0x8b, 0x9d, 0x64, 0xb1, 0x48, 0x5e, 0xa7, 0xc2, 0x3b, 0x2d, 0xd4, 0x1, 0xf8, 0xee, 0x17, 0x59, 0xa0, 0xb6, 0x4f, 0x9a, 0x63, 0x75, 0x8c, 0xbf, 0x46, 0x50, 0xa9, 0x7c, 0x85, 0x93, 0x6a, 0x24, 0xdd, 0xcb, 0x32, 0xe7, 0x1e, 0x8, 0xf1, 0x94, 0x6d, 0x7b, 0x82, 0x57, 0xae, 0xb8, 0x41, 0xf, 0xf6, 0xe0, 0x19, 0xcc, 0x35, 0x23, 0xda}, {0x0, 0xfa, 0xe9, 0x13, 0xcf, 0x35, 0x26, 0xdc, 0x83, 0x79, 0x6a, 0x90, 0x4c, 0xb6, 0xa5, 0x5f, 0x1b, 0xe1, 0xf2, 0x8, 0xd4, 0x2e, 0x3d, 0xc7, 0x98, 0x62, 0x71, 0x8b, 0x57, 0xad, 0xbe, 0x44, 0x36, 0xcc, 0xdf, 0x25, 0xf9, 0x3, 0x10, 0xea, 0xb5, 0x4f, 0x5c, 0xa6, 0x7a, 0x80, 0x93, 0x69, 0x2d, 0xd7, 0xc4, 0x3e, 0xe2, 0x18, 0xb, 0xf1, 0xae, 0x54, 0x47, 0xbd, 0x61, 0x9b, 0x88, 0x72, 0x6c, 0x96, 0x85, 0x7f, 0xa3, 0x59, 0x4a, 0xb0, 0xef, 0x15, 0x6, 0xfc, 0x20, 0xda, 0xc9, 0x33, 0x77, 0x8d, 0x9e, 0x64, 0xb8, 0x42, 0x51, 0xab, 0xf4, 0xe, 0x1d, 0xe7, 0x3b, 0xc1, 0xd2, 0x28, 0x5a, 0xa0, 0xb3, 0x49, 0x95, 0x6f, 0x7c, 0x86, 0xd9, 0x23, 0x30, 0xca, 0x16, 0xec, 0xff, 0x5, 0x41, 0xbb, 0xa8, 0x52, 0x8e, 0x74, 0x67, 0x9d, 0xc2, 0x38, 0x2b, 0xd1, 0xd, 
0xf7, 0xe4, 0x1e, 0xd8, 0x22, 0x31, 0xcb, 0x17, 0xed, 0xfe, 0x4, 0x5b, 0xa1, 0xb2, 0x48, 0x94, 0x6e, 0x7d, 0x87, 0xc3, 0x39, 0x2a, 0xd0, 0xc, 0xf6, 0xe5, 0x1f, 0x40, 0xba, 0xa9, 0x53, 0x8f, 0x75, 0x66, 0x9c, 0xee, 0x14, 0x7, 0xfd, 0x21, 0xdb, 0xc8, 0x32, 0x6d, 0x97, 0x84, 0x7e, 0xa2, 0x58, 0x4b, 0xb1, 0xf5, 0xf, 0x1c, 0xe6, 0x3a, 0xc0, 0xd3, 0x29, 0x76, 0x8c, 0x9f, 0x65, 0xb9, 0x43, 0x50, 0xaa, 0xb4, 0x4e, 0x5d, 0xa7, 0x7b, 0x81, 0x92, 0x68, 0x37, 0xcd, 0xde, 0x24, 0xf8, 0x2, 0x11, 0xeb, 0xaf, 0x55, 0x46, 0xbc, 0x60, 0x9a, 0x89, 0x73, 0x2c, 0xd6, 0xc5, 0x3f, 0xe3, 0x19, 0xa, 0xf0, 0x82, 0x78, 0x6b, 0x91, 0x4d, 0xb7, 0xa4, 0x5e, 0x1, 0xfb, 0xe8, 0x12, 0xce, 0x34, 0x27, 0xdd, 0x99, 0x63, 0x70, 0x8a, 0x56, 0xac, 0xbf, 0x45, 0x1a, 0xe0, 0xf3, 0x9, 0xd5, 0x2f, 0x3c, 0xc6}, {0x0, 0xfb, 0xeb, 0x10, 0xcb, 0x30, 0x20, 0xdb, 0x8b, 0x70, 0x60, 0x9b, 0x40, 0xbb, 0xab, 0x50, 0xb, 0xf0, 0xe0, 0x1b, 0xc0, 0x3b, 0x2b, 0xd0, 0x80, 0x7b, 0x6b, 0x90, 0x4b, 0xb0, 0xa0, 0x5b, 0x16, 0xed, 0xfd, 0x6, 0xdd, 0x26, 0x36, 0xcd, 0x9d, 0x66, 0x76, 0x8d, 0x56, 0xad, 0xbd, 0x46, 0x1d, 0xe6, 0xf6, 0xd, 0xd6, 0x2d, 0x3d, 0xc6, 0x96, 0x6d, 0x7d, 0x86, 0x5d, 0xa6, 0xb6, 0x4d, 0x2c, 0xd7, 0xc7, 0x3c, 0xe7, 0x1c, 0xc, 0xf7, 0xa7, 0x5c, 0x4c, 0xb7, 0x6c, 0x97, 0x87, 0x7c, 0x27, 0xdc, 0xcc, 0x37, 0xec, 0x17, 0x7, 0xfc, 0xac, 0x57, 0x47, 0xbc, 0x67, 0x9c, 0x8c, 0x77, 0x3a, 0xc1, 0xd1, 0x2a, 0xf1, 0xa, 0x1a, 0xe1, 0xb1, 0x4a, 0x5a, 0xa1, 0x7a, 0x81, 0x91, 0x6a, 0x31, 0xca, 0xda, 0x21, 0xfa, 0x1, 0x11, 0xea, 0xba, 0x41, 0x51, 0xaa, 0x71, 0x8a, 0x9a, 0x61, 0x58, 0xa3, 0xb3, 0x48, 0x93, 0x68, 0x78, 0x83, 0xd3, 0x28, 0x38, 0xc3, 0x18, 0xe3, 0xf3, 0x8, 0x53, 0xa8, 0xb8, 0x43, 0x98, 0x63, 0x73, 0x88, 0xd8, 0x23, 0x33, 0xc8, 0x13, 0xe8, 0xf8, 0x3, 0x4e, 0xb5, 0xa5, 0x5e, 0x85, 0x7e, 0x6e, 0x95, 0xc5, 0x3e, 0x2e, 0xd5, 0xe, 0xf5, 0xe5, 0x1e, 0x45, 0xbe, 0xae, 0x55, 0x8e, 0x75, 0x65, 0x9e, 0xce, 0x35, 0x25, 0xde, 0x5, 0xfe, 0xee, 0x15, 0x74, 0x8f, 0x9f, 0x64, 0xbf, 0x44, 0x54, 0xaf, 0xff, 0x4, 0x14, 0xef, 0x34, 0xcf, 0xdf, 0x24, 0x7f, 0x84, 0x94, 0x6f, 0xb4, 0x4f, 0x5f, 0xa4, 0xf4, 0xf, 0x1f, 0xe4, 0x3f, 0xc4, 0xd4, 0x2f, 0x62, 0x99, 0x89, 0x72, 0xa9, 0x52, 0x42, 0xb9, 0xe9, 0x12, 0x2, 0xf9, 0x22, 0xd9, 0xc9, 0x32, 0x69, 0x92, 0x82, 0x79, 0xa2, 0x59, 0x49, 0xb2, 0xe2, 0x19, 0x9, 0xf2, 0x29, 0xd2, 0xc2, 0x39}, {0x0, 0xfc, 0xe5, 0x19, 0xd7, 0x2b, 0x32, 0xce, 0xb3, 0x4f, 0x56, 0xaa, 0x64, 0x98, 0x81, 0x7d, 0x7b, 0x87, 0x9e, 0x62, 0xac, 0x50, 0x49, 0xb5, 0xc8, 0x34, 0x2d, 0xd1, 0x1f, 0xe3, 0xfa, 0x6, 0xf6, 0xa, 0x13, 0xef, 0x21, 0xdd, 0xc4, 0x38, 0x45, 0xb9, 0xa0, 0x5c, 0x92, 0x6e, 0x77, 0x8b, 0x8d, 0x71, 0x68, 0x94, 0x5a, 0xa6, 0xbf, 0x43, 0x3e, 0xc2, 0xdb, 0x27, 0xe9, 0x15, 0xc, 0xf0, 0xf1, 0xd, 0x14, 0xe8, 0x26, 0xda, 0xc3, 0x3f, 0x42, 0xbe, 0xa7, 0x5b, 0x95, 0x69, 0x70, 0x8c, 0x8a, 0x76, 0x6f, 0x93, 0x5d, 0xa1, 0xb8, 0x44, 0x39, 0xc5, 0xdc, 0x20, 0xee, 0x12, 0xb, 0xf7, 0x7, 0xfb, 0xe2, 0x1e, 0xd0, 0x2c, 0x35, 0xc9, 0xb4, 0x48, 0x51, 0xad, 0x63, 0x9f, 0x86, 0x7a, 0x7c, 0x80, 0x99, 0x65, 0xab, 0x57, 0x4e, 0xb2, 0xcf, 0x33, 0x2a, 0xd6, 0x18, 0xe4, 0xfd, 0x1, 0xff, 0x3, 0x1a, 0xe6, 0x28, 0xd4, 0xcd, 0x31, 0x4c, 0xb0, 0xa9, 0x55, 0x9b, 0x67, 0x7e, 0x82, 0x84, 0x78, 0x61, 0x9d, 0x53, 0xaf, 0xb6, 0x4a, 0x37, 0xcb, 0xd2, 0x2e, 0xe0, 0x1c, 0x5, 0xf9, 0x9, 0xf5, 0xec, 0x10, 0xde, 0x22, 0x3b, 0xc7, 0xba, 0x46, 0x5f, 0xa3, 0x6d, 0x91, 0x88, 0x74, 0x72, 0x8e, 0x97, 0x6b, 0xa5, 0x59, 0x40, 0xbc, 0xc1, 0x3d, 0x24, 0xd8, 0x16, 0xea, 0xf3, 0xf, 0xe, 0xf2, 0xeb, 0x17, 0xd9, 0x25, 0x3c, 0xc0, 0xbd, 0x41, 0x58, 0xa4, 0x6a, 0x96, 0x8f, 0x73, 0x75, 0x89, 0x90, 
0x6c, 0xa2, 0x5e, 0x47, 0xbb, 0xc6, 0x3a, 0x23, 0xdf, 0x11, 0xed, 0xf4, 0x8, 0xf8, 0x4, 0x1d, 0xe1, 0x2f, 0xd3, 0xca, 0x36, 0x4b, 0xb7, 0xae, 0x52, 0x9c, 0x60, 0x79, 0x85, 0x83, 0x7f, 0x66, 0x9a, 0x54, 0xa8, 0xb1, 0x4d, 0x30, 0xcc, 0xd5, 0x29, 0xe7, 0x1b, 0x2, 0xfe}, {0x0, 0xfd, 0xe7, 0x1a, 0xd3, 0x2e, 0x34, 0xc9, 0xbb, 0x46, 0x5c, 0xa1, 0x68, 0x95, 0x8f, 0x72, 0x6b, 0x96, 0x8c, 0x71, 0xb8, 0x45, 0x5f, 0xa2, 0xd0, 0x2d, 0x37, 0xca, 0x3, 0xfe, 0xe4, 0x19, 0xd6, 0x2b, 0x31, 0xcc, 0x5, 0xf8, 0xe2, 0x1f, 0x6d, 0x90, 0x8a, 0x77, 0xbe, 0x43, 0x59, 0xa4, 0xbd, 0x40, 0x5a, 0xa7, 0x6e, 0x93, 0x89, 0x74, 0x6, 0xfb, 0xe1, 0x1c, 0xd5, 0x28, 0x32, 0xcf, 0xb1, 0x4c, 0x56, 0xab, 0x62, 0x9f, 0x85, 0x78, 0xa, 0xf7, 0xed, 0x10, 0xd9, 0x24, 0x3e, 0xc3, 0xda, 0x27, 0x3d, 0xc0, 0x9, 0xf4, 0xee, 0x13, 0x61, 0x9c, 0x86, 0x7b, 0xb2, 0x4f, 0x55, 0xa8, 0x67, 0x9a, 0x80, 0x7d, 0xb4, 0x49, 0x53, 0xae, 0xdc, 0x21, 0x3b, 0xc6, 0xf, 0xf2, 0xe8, 0x15, 0xc, 0xf1, 0xeb, 0x16, 0xdf, 0x22, 0x38, 0xc5, 0xb7, 0x4a, 0x50, 0xad, 0x64, 0x99, 0x83, 0x7e, 0x7f, 0x82, 0x98, 0x65, 0xac, 0x51, 0x4b, 0xb6, 0xc4, 0x39, 0x23, 0xde, 0x17, 0xea, 0xf0, 0xd, 0x14, 0xe9, 0xf3, 0xe, 0xc7, 0x3a, 0x20, 0xdd, 0xaf, 0x52, 0x48, 0xb5, 0x7c, 0x81, 0x9b, 0x66, 0xa9, 0x54, 0x4e, 0xb3, 0x7a, 0x87, 0x9d, 0x60, 0x12, 0xef, 0xf5, 0x8, 0xc1, 0x3c, 0x26, 0xdb, 0xc2, 0x3f, 0x25, 0xd8, 0x11, 0xec, 0xf6, 0xb, 0x79, 0x84, 0x9e, 0x63, 0xaa, 0x57, 0x4d, 0xb0, 0xce, 0x33, 0x29, 0xd4, 0x1d, 0xe0, 0xfa, 0x7, 0x75, 0x88, 0x92, 0x6f, 0xa6, 0x5b, 0x41, 0xbc, 0xa5, 0x58, 0x42, 0xbf, 0x76, 0x8b, 0x91, 0x6c, 0x1e, 0xe3, 0xf9, 0x4, 0xcd, 0x30, 0x2a, 0xd7, 0x18, 0xe5, 0xff, 0x2, 0xcb, 0x36, 0x2c, 0xd1, 0xa3, 0x5e, 0x44, 0xb9, 0x70, 0x8d, 0x97, 0x6a, 0x73, 0x8e, 0x94, 0x69, 0xa0, 0x5d, 0x47, 0xba, 0xc8, 0x35, 0x2f, 0xd2, 0x1b, 0xe6, 0xfc, 0x1}, {0x0, 0xfe, 0xe1, 0x1f, 0xdf, 0x21, 0x3e, 0xc0, 0xa3, 0x5d, 0x42, 0xbc, 0x7c, 0x82, 0x9d, 0x63, 0x5b, 0xa5, 0xba, 0x44, 0x84, 0x7a, 0x65, 0x9b, 0xf8, 0x6, 0x19, 0xe7, 0x27, 0xd9, 0xc6, 0x38, 0xb6, 0x48, 0x57, 0xa9, 0x69, 0x97, 0x88, 0x76, 0x15, 0xeb, 0xf4, 0xa, 0xca, 0x34, 0x2b, 0xd5, 0xed, 0x13, 0xc, 0xf2, 0x32, 0xcc, 0xd3, 0x2d, 0x4e, 0xb0, 0xaf, 0x51, 0x91, 0x6f, 0x70, 0x8e, 0x71, 0x8f, 0x90, 0x6e, 0xae, 0x50, 0x4f, 0xb1, 0xd2, 0x2c, 0x33, 0xcd, 0xd, 0xf3, 0xec, 0x12, 0x2a, 0xd4, 0xcb, 0x35, 0xf5, 0xb, 0x14, 0xea, 0x89, 0x77, 0x68, 0x96, 0x56, 0xa8, 0xb7, 0x49, 0xc7, 0x39, 0x26, 0xd8, 0x18, 0xe6, 0xf9, 0x7, 0x64, 0x9a, 0x85, 0x7b, 0xbb, 0x45, 0x5a, 0xa4, 0x9c, 0x62, 0x7d, 0x83, 0x43, 0xbd, 0xa2, 0x5c, 0x3f, 0xc1, 0xde, 0x20, 0xe0, 0x1e, 0x1, 0xff, 0xe2, 0x1c, 0x3, 0xfd, 0x3d, 0xc3, 0xdc, 0x22, 0x41, 0xbf, 0xa0, 0x5e, 0x9e, 0x60, 0x7f, 0x81, 0xb9, 0x47, 0x58, 0xa6, 0x66, 0x98, 0x87, 0x79, 0x1a, 0xe4, 0xfb, 0x5, 0xc5, 0x3b, 0x24, 0xda, 0x54, 0xaa, 0xb5, 0x4b, 0x8b, 0x75, 0x6a, 0x94, 0xf7, 0x9, 0x16, 0xe8, 0x28, 0xd6, 0xc9, 0x37, 0xf, 0xf1, 0xee, 0x10, 0xd0, 0x2e, 0x31, 0xcf, 0xac, 0x52, 0x4d, 0xb3, 0x73, 0x8d, 0x92, 0x6c, 0x93, 0x6d, 0x72, 0x8c, 0x4c, 0xb2, 0xad, 0x53, 0x30, 0xce, 0xd1, 0x2f, 0xef, 0x11, 0xe, 0xf0, 0xc8, 0x36, 0x29, 0xd7, 0x17, 0xe9, 0xf6, 0x8, 0x6b, 0x95, 0x8a, 0x74, 0xb4, 0x4a, 0x55, 0xab, 0x25, 0xdb, 0xc4, 0x3a, 0xfa, 0x4, 0x1b, 0xe5, 0x86, 0x78, 0x67, 0x99, 0x59, 0xa7, 0xb8, 0x46, 0x7e, 0x80, 0x9f, 0x61, 0xa1, 0x5f, 0x40, 0xbe, 0xdd, 0x23, 0x3c, 0xc2, 0x2, 0xfc, 0xe3, 0x1d}, {0x0, 0xff, 0xe3, 0x1c, 0xdb, 0x24, 0x38, 0xc7, 0xab, 0x54, 0x48, 0xb7, 0x70, 0x8f, 0x93, 0x6c, 0x4b, 0xb4, 0xa8, 0x57, 0x90, 0x6f, 0x73, 0x8c, 0xe0, 0x1f, 0x3, 0xfc, 0x3b, 0xc4, 0xd8, 0x27, 0x96, 0x69, 0x75, 0x8a, 0x4d, 0xb2, 0xae, 0x51, 
0x3d, 0xc2, 0xde, 0x21, 0xe6, 0x19, 0x5, 0xfa, 0xdd, 0x22, 0x3e, 0xc1, 0x6, 0xf9, 0xe5, 0x1a, 0x76, 0x89, 0x95, 0x6a, 0xad, 0x52, 0x4e, 0xb1, 0x31, 0xce, 0xd2, 0x2d, 0xea, 0x15, 0x9, 0xf6, 0x9a, 0x65, 0x79, 0x86, 0x41, 0xbe, 0xa2, 0x5d, 0x7a, 0x85, 0x99, 0x66, 0xa1, 0x5e, 0x42, 0xbd, 0xd1, 0x2e, 0x32, 0xcd, 0xa, 0xf5, 0xe9, 0x16, 0xa7, 0x58, 0x44, 0xbb, 0x7c, 0x83, 0x9f, 0x60, 0xc, 0xf3, 0xef, 0x10, 0xd7, 0x28, 0x34, 0xcb, 0xec, 0x13, 0xf, 0xf0, 0x37, 0xc8, 0xd4, 0x2b, 0x47, 0xb8, 0xa4, 0x5b, 0x9c, 0x63, 0x7f, 0x80, 0x62, 0x9d, 0x81, 0x7e, 0xb9, 0x46, 0x5a, 0xa5, 0xc9, 0x36, 0x2a, 0xd5, 0x12, 0xed, 0xf1, 0xe, 0x29, 0xd6, 0xca, 0x35, 0xf2, 0xd, 0x11, 0xee, 0x82, 0x7d, 0x61, 0x9e, 0x59, 0xa6, 0xba, 0x45, 0xf4, 0xb, 0x17, 0xe8, 0x2f, 0xd0, 0xcc, 0x33, 0x5f, 0xa0, 0xbc, 0x43, 0x84, 0x7b, 0x67, 0x98, 0xbf, 0x40, 0x5c, 0xa3, 0x64, 0x9b, 0x87, 0x78, 0x14, 0xeb, 0xf7, 0x8, 0xcf, 0x30, 0x2c, 0xd3, 0x53, 0xac, 0xb0, 0x4f, 0x88, 0x77, 0x6b, 0x94, 0xf8, 0x7, 0x1b, 0xe4, 0x23, 0xdc, 0xc0, 0x3f, 0x18, 0xe7, 0xfb, 0x4, 0xc3, 0x3c, 0x20, 0xdf, 0xb3, 0x4c, 0x50, 0xaf, 0x68, 0x97, 0x8b, 0x74, 0xc5, 0x3a, 0x26, 0xd9, 0x1e, 0xe1, 0xfd, 0x2, 0x6e, 0x91, 0x8d, 0x72, 0xb5, 0x4a, 0x56, 0xa9, 0x8e, 0x71, 0x6d, 0x92, 0x55, 0xaa, 0xb6, 0x49, 0x25, 0xda, 0xc6, 0x39, 0xfe, 0x1, 0x1d, 0xe2}} var mulTableLow = [256][16]uint8{{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf}, {0x0, 0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e}, {0x0, 0x3, 0x6, 0x5, 0xc, 0xf, 0xa, 0x9, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11}, {0x0, 0x4, 0x8, 0xc, 0x10, 0x14, 0x18, 0x1c, 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c}, {0x0, 0x5, 0xa, 0xf, 0x14, 0x11, 0x1e, 0x1b, 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33}, {0x0, 0x6, 0xc, 0xa, 0x18, 0x1e, 0x14, 0x12, 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22}, {0x0, 0x7, 0xe, 0x9, 0x1c, 0x1b, 0x12, 0x15, 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d}, {0x0, 0x8, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38, 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78}, {0x0, 0x9, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77}, {0x0, 0xa, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36, 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66}, {0x0, 0xb, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69}, {0x0, 0xc, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24, 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44}, {0x0, 0xd, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b}, {0x0, 0xe, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a}, {0x0, 0xf, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d, 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55}, {0x0, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0}, {0x0, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff}, {0x0, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e, 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee}, {0x0, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79, 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1}, {0x0, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c, 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc}, {0x0, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b, 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3}, {0x0, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62, 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2}, {0x0, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65, 0xb8, 0xaf, 0x96, 0x81, 
0xe4, 0xf3, 0xca, 0xdd}, {0x0, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48, 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88}, {0x0, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f, 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87}, {0x0, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46, 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96}, {0x0, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99}, {0x0, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54, 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4}, {0x0, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb}, {0x0, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a, 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa}, {0x0, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d, 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5}, {0x0, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x1d, 0x3d, 0x5d, 0x7d, 0x9d, 0xbd, 0xdd, 0xfd}, {0x0, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7, 0x15, 0x34, 0x57, 0x76, 0x91, 0xb0, 0xd3, 0xf2}, {0x0, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee, 0xd, 0x2f, 0x49, 0x6b, 0x85, 0xa7, 0xc1, 0xe3}, {0x0, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9, 0x5, 0x26, 0x43, 0x60, 0x89, 0xaa, 0xcf, 0xec}, {0x0, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc, 0x3d, 0x19, 0x75, 0x51, 0xad, 0x89, 0xe5, 0xc1}, {0x0, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb, 0x35, 0x10, 0x7f, 0x5a, 0xa1, 0x84, 0xeb, 0xce}, {0x0, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2, 0x2d, 0xb, 0x61, 0x47, 0xb5, 0x93, 0xf9, 0xdf}, {0x0, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x25, 0x2, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0}, {0x0, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8, 0x5d, 0x75, 0xd, 0x25, 0xfd, 0xd5, 0xad, 0x85}, {0x0, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf, 0x55, 0x7c, 0x7, 0x2e, 0xf1, 0xd8, 0xa3, 0x8a}, {0x0, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6, 0x4d, 0x67, 0x19, 0x33, 0xe5, 0xcf, 0xb1, 0x9b}, {0x0, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1, 0x45, 0x6e, 0x13, 0x38, 0xe9, 0xc2, 0xbf, 0x94}, {0x0, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4, 0x7d, 0x51, 0x25, 0x9, 0xcd, 0xe1, 0x95, 0xb9}, {0x0, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3, 0x75, 0x58, 0x2f, 0x2, 0xc1, 0xec, 0x9b, 0xb6}, {0x0, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca, 0x6d, 0x43, 0x31, 0x1f, 0xd5, 0xfb, 0x89, 0xa7}, {0x0, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd, 0x65, 0x4a, 0x3b, 0x14, 0xd9, 0xf6, 0x87, 0xa8}, {0x0, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x9d, 0xad, 0xfd, 0xcd, 0x5d, 0x6d, 0x3d, 0xd}, {0x0, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97, 0x95, 0xa4, 0xf7, 0xc6, 0x51, 0x60, 0x33, 0x2}, {0x0, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e, 0x8d, 0xbf, 0xe9, 0xdb, 0x45, 0x77, 0x21, 0x13}, {0x0, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99, 0x85, 0xb6, 0xe3, 0xd0, 0x49, 0x7a, 0x2f, 0x1c}, {0x0, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c, 0xbd, 0x89, 0xd5, 0xe1, 0x6d, 0x59, 0x5, 0x31}, {0x0, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b, 0xb5, 0x80, 0xdf, 0xea, 0x61, 0x54, 0xb, 0x3e}, {0x0, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82, 0xad, 0x9b, 0xc1, 0xf7, 0x75, 0x43, 0x19, 0x2f}, {0x0, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85, 0xa5, 0x92, 0xcb, 0xfc, 0x79, 0x4e, 0x17, 0x20}, {0x0, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8, 0xdd, 0xe5, 0xad, 0x95, 0x3d, 0x5, 0x4d, 0x75}, {0x0, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf, 0xd5, 0xec, 0xa7, 0x9e, 0x31, 0x8, 0x43, 0x7a}, {0x0, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b}, {0x0, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1, 0xc5, 0xfe, 0xb3, 0x88, 0x29, 0x12, 0x5f, 0x64}, {0x0, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4, 
0xfd, 0xc1, 0x85, 0xb9, 0xd, 0x31, 0x75, 0x49}, {0x0, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3, 0xf5, 0xc8, 0x8f, 0xb2, 0x1, 0x3c, 0x7b, 0x46}, {0x0, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba, 0xed, 0xd3, 0x91, 0xaf, 0x15, 0x2b, 0x69, 0x57}, {0x0, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd, 0xe5, 0xda, 0x9b, 0xa4, 0x19, 0x26, 0x67, 0x58}, {0x0, 0x40, 0x80, 0xc0, 0x1d, 0x5d, 0x9d, 0xdd, 0x3a, 0x7a, 0xba, 0xfa, 0x27, 0x67, 0xa7, 0xe7}, {0x0, 0x41, 0x82, 0xc3, 0x19, 0x58, 0x9b, 0xda, 0x32, 0x73, 0xb0, 0xf1, 0x2b, 0x6a, 0xa9, 0xe8}, {0x0, 0x42, 0x84, 0xc6, 0x15, 0x57, 0x91, 0xd3, 0x2a, 0x68, 0xae, 0xec, 0x3f, 0x7d, 0xbb, 0xf9}, {0x0, 0x43, 0x86, 0xc5, 0x11, 0x52, 0x97, 0xd4, 0x22, 0x61, 0xa4, 0xe7, 0x33, 0x70, 0xb5, 0xf6}, {0x0, 0x44, 0x88, 0xcc, 0xd, 0x49, 0x85, 0xc1, 0x1a, 0x5e, 0x92, 0xd6, 0x17, 0x53, 0x9f, 0xdb}, {0x0, 0x45, 0x8a, 0xcf, 0x9, 0x4c, 0x83, 0xc6, 0x12, 0x57, 0x98, 0xdd, 0x1b, 0x5e, 0x91, 0xd4}, {0x0, 0x46, 0x8c, 0xca, 0x5, 0x43, 0x89, 0xcf, 0xa, 0x4c, 0x86, 0xc0, 0xf, 0x49, 0x83, 0xc5}, {0x0, 0x47, 0x8e, 0xc9, 0x1, 0x46, 0x8f, 0xc8, 0x2, 0x45, 0x8c, 0xcb, 0x3, 0x44, 0x8d, 0xca}, {0x0, 0x48, 0x90, 0xd8, 0x3d, 0x75, 0xad, 0xe5, 0x7a, 0x32, 0xea, 0xa2, 0x47, 0xf, 0xd7, 0x9f}, {0x0, 0x49, 0x92, 0xdb, 0x39, 0x70, 0xab, 0xe2, 0x72, 0x3b, 0xe0, 0xa9, 0x4b, 0x2, 0xd9, 0x90}, {0x0, 0x4a, 0x94, 0xde, 0x35, 0x7f, 0xa1, 0xeb, 0x6a, 0x20, 0xfe, 0xb4, 0x5f, 0x15, 0xcb, 0x81}, {0x0, 0x4b, 0x96, 0xdd, 0x31, 0x7a, 0xa7, 0xec, 0x62, 0x29, 0xf4, 0xbf, 0x53, 0x18, 0xc5, 0x8e}, {0x0, 0x4c, 0x98, 0xd4, 0x2d, 0x61, 0xb5, 0xf9, 0x5a, 0x16, 0xc2, 0x8e, 0x77, 0x3b, 0xef, 0xa3}, {0x0, 0x4d, 0x9a, 0xd7, 0x29, 0x64, 0xb3, 0xfe, 0x52, 0x1f, 0xc8, 0x85, 0x7b, 0x36, 0xe1, 0xac}, {0x0, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, 0x4a, 0x4, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd}, {0x0, 0x4f, 0x9e, 0xd1, 0x21, 0x6e, 0xbf, 0xf0, 0x42, 0xd, 0xdc, 0x93, 0x63, 0x2c, 0xfd, 0xb2}, {0x0, 0x50, 0xa0, 0xf0, 0x5d, 0xd, 0xfd, 0xad, 0xba, 0xea, 0x1a, 0x4a, 0xe7, 0xb7, 0x47, 0x17}, {0x0, 0x51, 0xa2, 0xf3, 0x59, 0x8, 0xfb, 0xaa, 0xb2, 0xe3, 0x10, 0x41, 0xeb, 0xba, 0x49, 0x18}, {0x0, 0x52, 0xa4, 0xf6, 0x55, 0x7, 0xf1, 0xa3, 0xaa, 0xf8, 0xe, 0x5c, 0xff, 0xad, 0x5b, 0x9}, {0x0, 0x53, 0xa6, 0xf5, 0x51, 0x2, 0xf7, 0xa4, 0xa2, 0xf1, 0x4, 0x57, 0xf3, 0xa0, 0x55, 0x6}, {0x0, 0x54, 0xa8, 0xfc, 0x4d, 0x19, 0xe5, 0xb1, 0x9a, 0xce, 0x32, 0x66, 0xd7, 0x83, 0x7f, 0x2b}, {0x0, 0x55, 0xaa, 0xff, 0x49, 0x1c, 0xe3, 0xb6, 0x92, 0xc7, 0x38, 0x6d, 0xdb, 0x8e, 0x71, 0x24}, {0x0, 0x56, 0xac, 0xfa, 0x45, 0x13, 0xe9, 0xbf, 0x8a, 0xdc, 0x26, 0x70, 0xcf, 0x99, 0x63, 0x35}, {0x0, 0x57, 0xae, 0xf9, 0x41, 0x16, 0xef, 0xb8, 0x82, 0xd5, 0x2c, 0x7b, 0xc3, 0x94, 0x6d, 0x3a}, {0x0, 0x58, 0xb0, 0xe8, 0x7d, 0x25, 0xcd, 0x95, 0xfa, 0xa2, 0x4a, 0x12, 0x87, 0xdf, 0x37, 0x6f}, {0x0, 0x59, 0xb2, 0xeb, 0x79, 0x20, 0xcb, 0x92, 0xf2, 0xab, 0x40, 0x19, 0x8b, 0xd2, 0x39, 0x60}, {0x0, 0x5a, 0xb4, 0xee, 0x75, 0x2f, 0xc1, 0x9b, 0xea, 0xb0, 0x5e, 0x4, 0x9f, 0xc5, 0x2b, 0x71}, {0x0, 0x5b, 0xb6, 0xed, 0x71, 0x2a, 0xc7, 0x9c, 0xe2, 0xb9, 0x54, 0xf, 0x93, 0xc8, 0x25, 0x7e}, {0x0, 0x5c, 0xb8, 0xe4, 0x6d, 0x31, 0xd5, 0x89, 0xda, 0x86, 0x62, 0x3e, 0xb7, 0xeb, 0xf, 0x53}, {0x0, 0x5d, 0xba, 0xe7, 0x69, 0x34, 0xd3, 0x8e, 0xd2, 0x8f, 0x68, 0x35, 0xbb, 0xe6, 0x1, 0x5c}, {0x0, 0x5e, 0xbc, 0xe2, 0x65, 0x3b, 0xd9, 0x87, 0xca, 0x94, 0x76, 0x28, 0xaf, 0xf1, 0x13, 0x4d}, {0x0, 0x5f, 0xbe, 0xe1, 0x61, 0x3e, 0xdf, 0x80, 0xc2, 0x9d, 0x7c, 0x23, 0xa3, 0xfc, 0x1d, 0x42}, {0x0, 0x60, 0xc0, 0xa0, 0x9d, 0xfd, 0x5d, 0x3d, 0x27, 0x47, 0xe7, 0x87, 0xba, 0xda, 0x7a, 0x1a}, {0x0, 0x61, 0xc2, 0xa3, 0x99, 0xf8, 
0x5b, 0x3a, 0x2f, 0x4e, 0xed, 0x8c, 0xb6, 0xd7, 0x74, 0x15}, {0x0, 0x62, 0xc4, 0xa6, 0x95, 0xf7, 0x51, 0x33, 0x37, 0x55, 0xf3, 0x91, 0xa2, 0xc0, 0x66, 0x4}, {0x0, 0x63, 0xc6, 0xa5, 0x91, 0xf2, 0x57, 0x34, 0x3f, 0x5c, 0xf9, 0x9a, 0xae, 0xcd, 0x68, 0xb}, {0x0, 0x64, 0xc8, 0xac, 0x8d, 0xe9, 0x45, 0x21, 0x7, 0x63, 0xcf, 0xab, 0x8a, 0xee, 0x42, 0x26}, {0x0, 0x65, 0xca, 0xaf, 0x89, 0xec, 0x43, 0x26, 0xf, 0x6a, 0xc5, 0xa0, 0x86, 0xe3, 0x4c, 0x29}, {0x0, 0x66, 0xcc, 0xaa, 0x85, 0xe3, 0x49, 0x2f, 0x17, 0x71, 0xdb, 0xbd, 0x92, 0xf4, 0x5e, 0x38}, {0x0, 0x67, 0xce, 0xa9, 0x81, 0xe6, 0x4f, 0x28, 0x1f, 0x78, 0xd1, 0xb6, 0x9e, 0xf9, 0x50, 0x37}, {0x0, 0x68, 0xd0, 0xb8, 0xbd, 0xd5, 0x6d, 0x5, 0x67, 0xf, 0xb7, 0xdf, 0xda, 0xb2, 0xa, 0x62}, {0x0, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x2, 0x6f, 0x6, 0xbd, 0xd4, 0xd6, 0xbf, 0x4, 0x6d}, {0x0, 0x6a, 0xd4, 0xbe, 0xb5, 0xdf, 0x61, 0xb, 0x77, 0x1d, 0xa3, 0xc9, 0xc2, 0xa8, 0x16, 0x7c}, {0x0, 0x6b, 0xd6, 0xbd, 0xb1, 0xda, 0x67, 0xc, 0x7f, 0x14, 0xa9, 0xc2, 0xce, 0xa5, 0x18, 0x73}, {0x0, 0x6c, 0xd8, 0xb4, 0xad, 0xc1, 0x75, 0x19, 0x47, 0x2b, 0x9f, 0xf3, 0xea, 0x86, 0x32, 0x5e}, {0x0, 0x6d, 0xda, 0xb7, 0xa9, 0xc4, 0x73, 0x1e, 0x4f, 0x22, 0x95, 0xf8, 0xe6, 0x8b, 0x3c, 0x51}, {0x0, 0x6e, 0xdc, 0xb2, 0xa5, 0xcb, 0x79, 0x17, 0x57, 0x39, 0x8b, 0xe5, 0xf2, 0x9c, 0x2e, 0x40}, {0x0, 0x6f, 0xde, 0xb1, 0xa1, 0xce, 0x7f, 0x10, 0x5f, 0x30, 0x81, 0xee, 0xfe, 0x91, 0x20, 0x4f}, {0x0, 0x70, 0xe0, 0x90, 0xdd, 0xad, 0x3d, 0x4d, 0xa7, 0xd7, 0x47, 0x37, 0x7a, 0xa, 0x9a, 0xea}, {0x0, 0x71, 0xe2, 0x93, 0xd9, 0xa8, 0x3b, 0x4a, 0xaf, 0xde, 0x4d, 0x3c, 0x76, 0x7, 0x94, 0xe5}, {0x0, 0x72, 0xe4, 0x96, 0xd5, 0xa7, 0x31, 0x43, 0xb7, 0xc5, 0x53, 0x21, 0x62, 0x10, 0x86, 0xf4}, {0x0, 0x73, 0xe6, 0x95, 0xd1, 0xa2, 0x37, 0x44, 0xbf, 0xcc, 0x59, 0x2a, 0x6e, 0x1d, 0x88, 0xfb}, {0x0, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6}, {0x0, 0x75, 0xea, 0x9f, 0xc9, 0xbc, 0x23, 0x56, 0x8f, 0xfa, 0x65, 0x10, 0x46, 0x33, 0xac, 0xd9}, {0x0, 0x76, 0xec, 0x9a, 0xc5, 0xb3, 0x29, 0x5f, 0x97, 0xe1, 0x7b, 0xd, 0x52, 0x24, 0xbe, 0xc8}, {0x0, 0x77, 0xee, 0x99, 0xc1, 0xb6, 0x2f, 0x58, 0x9f, 0xe8, 0x71, 0x6, 0x5e, 0x29, 0xb0, 0xc7}, {0x0, 0x78, 0xf0, 0x88, 0xfd, 0x85, 0xd, 0x75, 0xe7, 0x9f, 0x17, 0x6f, 0x1a, 0x62, 0xea, 0x92}, {0x0, 0x79, 0xf2, 0x8b, 0xf9, 0x80, 0xb, 0x72, 0xef, 0x96, 0x1d, 0x64, 0x16, 0x6f, 0xe4, 0x9d}, {0x0, 0x7a, 0xf4, 0x8e, 0xf5, 0x8f, 0x1, 0x7b, 0xf7, 0x8d, 0x3, 0x79, 0x2, 0x78, 0xf6, 0x8c}, {0x0, 0x7b, 0xf6, 0x8d, 0xf1, 0x8a, 0x7, 0x7c, 0xff, 0x84, 0x9, 0x72, 0xe, 0x75, 0xf8, 0x83}, {0x0, 0x7c, 0xf8, 0x84, 0xed, 0x91, 0x15, 0x69, 0xc7, 0xbb, 0x3f, 0x43, 0x2a, 0x56, 0xd2, 0xae}, {0x0, 0x7d, 0xfa, 0x87, 0xe9, 0x94, 0x13, 0x6e, 0xcf, 0xb2, 0x35, 0x48, 0x26, 0x5b, 0xdc, 0xa1}, {0x0, 0x7e, 0xfc, 0x82, 0xe5, 0x9b, 0x19, 0x67, 0xd7, 0xa9, 0x2b, 0x55, 0x32, 0x4c, 0xce, 0xb0}, {0x0, 0x7f, 0xfe, 0x81, 0xe1, 0x9e, 0x1f, 0x60, 0xdf, 0xa0, 0x21, 0x5e, 0x3e, 0x41, 0xc0, 0xbf}, {0x0, 0x80, 0x1d, 0x9d, 0x3a, 0xba, 0x27, 0xa7, 0x74, 0xf4, 0x69, 0xe9, 0x4e, 0xce, 0x53, 0xd3}, {0x0, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc}, {0x0, 0x82, 0x19, 0x9b, 0x32, 0xb0, 0x2b, 0xa9, 0x64, 0xe6, 0x7d, 0xff, 0x56, 0xd4, 0x4f, 0xcd}, {0x0, 0x83, 0x1b, 0x98, 0x36, 0xb5, 0x2d, 0xae, 0x6c, 0xef, 0x77, 0xf4, 0x5a, 0xd9, 0x41, 0xc2}, {0x0, 0x84, 0x15, 0x91, 0x2a, 0xae, 0x3f, 0xbb, 0x54, 0xd0, 0x41, 0xc5, 0x7e, 0xfa, 0x6b, 0xef}, {0x0, 0x85, 0x17, 0x92, 0x2e, 0xab, 0x39, 0xbc, 0x5c, 0xd9, 0x4b, 0xce, 0x72, 0xf7, 0x65, 0xe0}, {0x0, 0x86, 0x11, 0x97, 
0x22, 0xa4, 0x33, 0xb5, 0x44, 0xc2, 0x55, 0xd3, 0x66, 0xe0, 0x77, 0xf1}, {0x0, 0x87, 0x13, 0x94, 0x26, 0xa1, 0x35, 0xb2, 0x4c, 0xcb, 0x5f, 0xd8, 0x6a, 0xed, 0x79, 0xfe}, {0x0, 0x88, 0xd, 0x85, 0x1a, 0x92, 0x17, 0x9f, 0x34, 0xbc, 0x39, 0xb1, 0x2e, 0xa6, 0x23, 0xab}, {0x0, 0x89, 0xf, 0x86, 0x1e, 0x97, 0x11, 0x98, 0x3c, 0xb5, 0x33, 0xba, 0x22, 0xab, 0x2d, 0xa4}, {0x0, 0x8a, 0x9, 0x83, 0x12, 0x98, 0x1b, 0x91, 0x24, 0xae, 0x2d, 0xa7, 0x36, 0xbc, 0x3f, 0xb5}, {0x0, 0x8b, 0xb, 0x80, 0x16, 0x9d, 0x1d, 0x96, 0x2c, 0xa7, 0x27, 0xac, 0x3a, 0xb1, 0x31, 0xba}, {0x0, 0x8c, 0x5, 0x89, 0xa, 0x86, 0xf, 0x83, 0x14, 0x98, 0x11, 0x9d, 0x1e, 0x92, 0x1b, 0x97}, {0x0, 0x8d, 0x7, 0x8a, 0xe, 0x83, 0x9, 0x84, 0x1c, 0x91, 0x1b, 0x96, 0x12, 0x9f, 0x15, 0x98}, {0x0, 0x8e, 0x1, 0x8f, 0x2, 0x8c, 0x3, 0x8d, 0x4, 0x8a, 0x5, 0x8b, 0x6, 0x88, 0x7, 0x89}, {0x0, 0x8f, 0x3, 0x8c, 0x6, 0x89, 0x5, 0x8a, 0xc, 0x83, 0xf, 0x80, 0xa, 0x85, 0x9, 0x86}, {0x0, 0x90, 0x3d, 0xad, 0x7a, 0xea, 0x47, 0xd7, 0xf4, 0x64, 0xc9, 0x59, 0x8e, 0x1e, 0xb3, 0x23}, {0x0, 0x91, 0x3f, 0xae, 0x7e, 0xef, 0x41, 0xd0, 0xfc, 0x6d, 0xc3, 0x52, 0x82, 0x13, 0xbd, 0x2c}, {0x0, 0x92, 0x39, 0xab, 0x72, 0xe0, 0x4b, 0xd9, 0xe4, 0x76, 0xdd, 0x4f, 0x96, 0x4, 0xaf, 0x3d}, {0x0, 0x93, 0x3b, 0xa8, 0x76, 0xe5, 0x4d, 0xde, 0xec, 0x7f, 0xd7, 0x44, 0x9a, 0x9, 0xa1, 0x32}, {0x0, 0x94, 0x35, 0xa1, 0x6a, 0xfe, 0x5f, 0xcb, 0xd4, 0x40, 0xe1, 0x75, 0xbe, 0x2a, 0x8b, 0x1f}, {0x0, 0x95, 0x37, 0xa2, 0x6e, 0xfb, 0x59, 0xcc, 0xdc, 0x49, 0xeb, 0x7e, 0xb2, 0x27, 0x85, 0x10}, {0x0, 0x96, 0x31, 0xa7, 0x62, 0xf4, 0x53, 0xc5, 0xc4, 0x52, 0xf5, 0x63, 0xa6, 0x30, 0x97, 0x1}, {0x0, 0x97, 0x33, 0xa4, 0x66, 0xf1, 0x55, 0xc2, 0xcc, 0x5b, 0xff, 0x68, 0xaa, 0x3d, 0x99, 0xe}, {0x0, 0x98, 0x2d, 0xb5, 0x5a, 0xc2, 0x77, 0xef, 0xb4, 0x2c, 0x99, 0x1, 0xee, 0x76, 0xc3, 0x5b}, {0x0, 0x99, 0x2f, 0xb6, 0x5e, 0xc7, 0x71, 0xe8, 0xbc, 0x25, 0x93, 0xa, 0xe2, 0x7b, 0xcd, 0x54}, {0x0, 0x9a, 0x29, 0xb3, 0x52, 0xc8, 0x7b, 0xe1, 0xa4, 0x3e, 0x8d, 0x17, 0xf6, 0x6c, 0xdf, 0x45}, {0x0, 0x9b, 0x2b, 0xb0, 0x56, 0xcd, 0x7d, 0xe6, 0xac, 0x37, 0x87, 0x1c, 0xfa, 0x61, 0xd1, 0x4a}, {0x0, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, 0x94, 0x8, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67}, {0x0, 0x9d, 0x27, 0xba, 0x4e, 0xd3, 0x69, 0xf4, 0x9c, 0x1, 0xbb, 0x26, 0xd2, 0x4f, 0xf5, 0x68}, {0x0, 0x9e, 0x21, 0xbf, 0x42, 0xdc, 0x63, 0xfd, 0x84, 0x1a, 0xa5, 0x3b, 0xc6, 0x58, 0xe7, 0x79}, {0x0, 0x9f, 0x23, 0xbc, 0x46, 0xd9, 0x65, 0xfa, 0x8c, 0x13, 0xaf, 0x30, 0xca, 0x55, 0xe9, 0x76}, {0x0, 0xa0, 0x5d, 0xfd, 0xba, 0x1a, 0xe7, 0x47, 0x69, 0xc9, 0x34, 0x94, 0xd3, 0x73, 0x8e, 0x2e}, {0x0, 0xa1, 0x5f, 0xfe, 0xbe, 0x1f, 0xe1, 0x40, 0x61, 0xc0, 0x3e, 0x9f, 0xdf, 0x7e, 0x80, 0x21}, {0x0, 0xa2, 0x59, 0xfb, 0xb2, 0x10, 0xeb, 0x49, 0x79, 0xdb, 0x20, 0x82, 0xcb, 0x69, 0x92, 0x30}, {0x0, 0xa3, 0x5b, 0xf8, 0xb6, 0x15, 0xed, 0x4e, 0x71, 0xd2, 0x2a, 0x89, 0xc7, 0x64, 0x9c, 0x3f}, {0x0, 0xa4, 0x55, 0xf1, 0xaa, 0xe, 0xff, 0x5b, 0x49, 0xed, 0x1c, 0xb8, 0xe3, 0x47, 0xb6, 0x12}, {0x0, 0xa5, 0x57, 0xf2, 0xae, 0xb, 0xf9, 0x5c, 0x41, 0xe4, 0x16, 0xb3, 0xef, 0x4a, 0xb8, 0x1d}, {0x0, 0xa6, 0x51, 0xf7, 0xa2, 0x4, 0xf3, 0x55, 0x59, 0xff, 0x8, 0xae, 0xfb, 0x5d, 0xaa, 0xc}, {0x0, 0xa7, 0x53, 0xf4, 0xa6, 0x1, 0xf5, 0x52, 0x51, 0xf6, 0x2, 0xa5, 0xf7, 0x50, 0xa4, 0x3}, {0x0, 0xa8, 0x4d, 0xe5, 0x9a, 0x32, 0xd7, 0x7f, 0x29, 0x81, 0x64, 0xcc, 0xb3, 0x1b, 0xfe, 0x56}, {0x0, 0xa9, 0x4f, 0xe6, 0x9e, 0x37, 0xd1, 0x78, 0x21, 0x88, 0x6e, 0xc7, 0xbf, 0x16, 0xf0, 0x59}, {0x0, 0xaa, 0x49, 0xe3, 0x92, 0x38, 0xdb, 0x71, 0x39, 0x93, 0x70, 0xda, 0xab, 0x1, 0xe2, 0x48}, {0x0, 0xab, 0x4b, 0xe0, 0x96, 
0x3d, 0xdd, 0x76, 0x31, 0x9a, 0x7a, 0xd1, 0xa7, 0xc, 0xec, 0x47}, {0x0, 0xac, 0x45, 0xe9, 0x8a, 0x26, 0xcf, 0x63, 0x9, 0xa5, 0x4c, 0xe0, 0x83, 0x2f, 0xc6, 0x6a}, {0x0, 0xad, 0x47, 0xea, 0x8e, 0x23, 0xc9, 0x64, 0x1, 0xac, 0x46, 0xeb, 0x8f, 0x22, 0xc8, 0x65}, {0x0, 0xae, 0x41, 0xef, 0x82, 0x2c, 0xc3, 0x6d, 0x19, 0xb7, 0x58, 0xf6, 0x9b, 0x35, 0xda, 0x74}, {0x0, 0xaf, 0x43, 0xec, 0x86, 0x29, 0xc5, 0x6a, 0x11, 0xbe, 0x52, 0xfd, 0x97, 0x38, 0xd4, 0x7b}, {0x0, 0xb0, 0x7d, 0xcd, 0xfa, 0x4a, 0x87, 0x37, 0xe9, 0x59, 0x94, 0x24, 0x13, 0xa3, 0x6e, 0xde}, {0x0, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x30, 0xe1, 0x50, 0x9e, 0x2f, 0x1f, 0xae, 0x60, 0xd1}, {0x0, 0xb2, 0x79, 0xcb, 0xf2, 0x40, 0x8b, 0x39, 0xf9, 0x4b, 0x80, 0x32, 0xb, 0xb9, 0x72, 0xc0}, {0x0, 0xb3, 0x7b, 0xc8, 0xf6, 0x45, 0x8d, 0x3e, 0xf1, 0x42, 0x8a, 0x39, 0x7, 0xb4, 0x7c, 0xcf}, {0x0, 0xb4, 0x75, 0xc1, 0xea, 0x5e, 0x9f, 0x2b, 0xc9, 0x7d, 0xbc, 0x8, 0x23, 0x97, 0x56, 0xe2}, {0x0, 0xb5, 0x77, 0xc2, 0xee, 0x5b, 0x99, 0x2c, 0xc1, 0x74, 0xb6, 0x3, 0x2f, 0x9a, 0x58, 0xed}, {0x0, 0xb6, 0x71, 0xc7, 0xe2, 0x54, 0x93, 0x25, 0xd9, 0x6f, 0xa8, 0x1e, 0x3b, 0x8d, 0x4a, 0xfc}, {0x0, 0xb7, 0x73, 0xc4, 0xe6, 0x51, 0x95, 0x22, 0xd1, 0x66, 0xa2, 0x15, 0x37, 0x80, 0x44, 0xf3}, {0x0, 0xb8, 0x6d, 0xd5, 0xda, 0x62, 0xb7, 0xf, 0xa9, 0x11, 0xc4, 0x7c, 0x73, 0xcb, 0x1e, 0xa6}, {0x0, 0xb9, 0x6f, 0xd6, 0xde, 0x67, 0xb1, 0x8, 0xa1, 0x18, 0xce, 0x77, 0x7f, 0xc6, 0x10, 0xa9}, {0x0, 0xba, 0x69, 0xd3, 0xd2, 0x68, 0xbb, 0x1, 0xb9, 0x3, 0xd0, 0x6a, 0x6b, 0xd1, 0x2, 0xb8}, {0x0, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x6, 0xb1, 0xa, 0xda, 0x61, 0x67, 0xdc, 0xc, 0xb7}, {0x0, 0xbc, 0x65, 0xd9, 0xca, 0x76, 0xaf, 0x13, 0x89, 0x35, 0xec, 0x50, 0x43, 0xff, 0x26, 0x9a}, {0x0, 0xbd, 0x67, 0xda, 0xce, 0x73, 0xa9, 0x14, 0x81, 0x3c, 0xe6, 0x5b, 0x4f, 0xf2, 0x28, 0x95}, {0x0, 0xbe, 0x61, 0xdf, 0xc2, 0x7c, 0xa3, 0x1d, 0x99, 0x27, 0xf8, 0x46, 0x5b, 0xe5, 0x3a, 0x84}, {0x0, 0xbf, 0x63, 0xdc, 0xc6, 0x79, 0xa5, 0x1a, 0x91, 0x2e, 0xf2, 0x4d, 0x57, 0xe8, 0x34, 0x8b}, {0x0, 0xc0, 0x9d, 0x5d, 0x27, 0xe7, 0xba, 0x7a, 0x4e, 0x8e, 0xd3, 0x13, 0x69, 0xa9, 0xf4, 0x34}, {0x0, 0xc1, 0x9f, 0x5e, 0x23, 0xe2, 0xbc, 0x7d, 0x46, 0x87, 0xd9, 0x18, 0x65, 0xa4, 0xfa, 0x3b}, {0x0, 0xc2, 0x99, 0x5b, 0x2f, 0xed, 0xb6, 0x74, 0x5e, 0x9c, 0xc7, 0x5, 0x71, 0xb3, 0xe8, 0x2a}, {0x0, 0xc3, 0x9b, 0x58, 0x2b, 0xe8, 0xb0, 0x73, 0x56, 0x95, 0xcd, 0xe, 0x7d, 0xbe, 0xe6, 0x25}, {0x0, 0xc4, 0x95, 0x51, 0x37, 0xf3, 0xa2, 0x66, 0x6e, 0xaa, 0xfb, 0x3f, 0x59, 0x9d, 0xcc, 0x8}, {0x0, 0xc5, 0x97, 0x52, 0x33, 0xf6, 0xa4, 0x61, 0x66, 0xa3, 0xf1, 0x34, 0x55, 0x90, 0xc2, 0x7}, {0x0, 0xc6, 0x91, 0x57, 0x3f, 0xf9, 0xae, 0x68, 0x7e, 0xb8, 0xef, 0x29, 0x41, 0x87, 0xd0, 0x16}, {0x0, 0xc7, 0x93, 0x54, 0x3b, 0xfc, 0xa8, 0x6f, 0x76, 0xb1, 0xe5, 0x22, 0x4d, 0x8a, 0xde, 0x19}, {0x0, 0xc8, 0x8d, 0x45, 0x7, 0xcf, 0x8a, 0x42, 0xe, 0xc6, 0x83, 0x4b, 0x9, 0xc1, 0x84, 0x4c}, {0x0, 0xc9, 0x8f, 0x46, 0x3, 0xca, 0x8c, 0x45, 0x6, 0xcf, 0x89, 0x40, 0x5, 0xcc, 0x8a, 0x43}, {0x0, 0xca, 0x89, 0x43, 0xf, 0xc5, 0x86, 0x4c, 0x1e, 0xd4, 0x97, 0x5d, 0x11, 0xdb, 0x98, 0x52}, {0x0, 0xcb, 0x8b, 0x40, 0xb, 0xc0, 0x80, 0x4b, 0x16, 0xdd, 0x9d, 0x56, 0x1d, 0xd6, 0x96, 0x5d}, {0x0, 0xcc, 0x85, 0x49, 0x17, 0xdb, 0x92, 0x5e, 0x2e, 0xe2, 0xab, 0x67, 0x39, 0xf5, 0xbc, 0x70}, {0x0, 0xcd, 0x87, 0x4a, 0x13, 0xde, 0x94, 0x59, 0x26, 0xeb, 0xa1, 0x6c, 0x35, 0xf8, 0xb2, 0x7f}, {0x0, 0xce, 0x81, 0x4f, 0x1f, 0xd1, 0x9e, 0x50, 0x3e, 0xf0, 0xbf, 0x71, 0x21, 0xef, 0xa0, 0x6e}, {0x0, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61}, {0x0, 0xd0, 0xbd, 
0x6d, 0x67, 0xb7, 0xda, 0xa, 0xce, 0x1e, 0x73, 0xa3, 0xa9, 0x79, 0x14, 0xc4}, {0x0, 0xd1, 0xbf, 0x6e, 0x63, 0xb2, 0xdc, 0xd, 0xc6, 0x17, 0x79, 0xa8, 0xa5, 0x74, 0x1a, 0xcb}, {0x0, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x4, 0xde, 0xc, 0x67, 0xb5, 0xb1, 0x63, 0x8, 0xda}, {0x0, 0xd3, 0xbb, 0x68, 0x6b, 0xb8, 0xd0, 0x3, 0xd6, 0x5, 0x6d, 0xbe, 0xbd, 0x6e, 0x6, 0xd5}, {0x0, 0xd4, 0xb5, 0x61, 0x77, 0xa3, 0xc2, 0x16, 0xee, 0x3a, 0x5b, 0x8f, 0x99, 0x4d, 0x2c, 0xf8}, {0x0, 0xd5, 0xb7, 0x62, 0x73, 0xa6, 0xc4, 0x11, 0xe6, 0x33, 0x51, 0x84, 0x95, 0x40, 0x22, 0xf7}, {0x0, 0xd6, 0xb1, 0x67, 0x7f, 0xa9, 0xce, 0x18, 0xfe, 0x28, 0x4f, 0x99, 0x81, 0x57, 0x30, 0xe6}, {0x0, 0xd7, 0xb3, 0x64, 0x7b, 0xac, 0xc8, 0x1f, 0xf6, 0x21, 0x45, 0x92, 0x8d, 0x5a, 0x3e, 0xe9}, {0x0, 0xd8, 0xad, 0x75, 0x47, 0x9f, 0xea, 0x32, 0x8e, 0x56, 0x23, 0xfb, 0xc9, 0x11, 0x64, 0xbc}, {0x0, 0xd9, 0xaf, 0x76, 0x43, 0x9a, 0xec, 0x35, 0x86, 0x5f, 0x29, 0xf0, 0xc5, 0x1c, 0x6a, 0xb3}, {0x0, 0xda, 0xa9, 0x73, 0x4f, 0x95, 0xe6, 0x3c, 0x9e, 0x44, 0x37, 0xed, 0xd1, 0xb, 0x78, 0xa2}, {0x0, 0xdb, 0xab, 0x70, 0x4b, 0x90, 0xe0, 0x3b, 0x96, 0x4d, 0x3d, 0xe6, 0xdd, 0x6, 0x76, 0xad}, {0x0, 0xdc, 0xa5, 0x79, 0x57, 0x8b, 0xf2, 0x2e, 0xae, 0x72, 0xb, 0xd7, 0xf9, 0x25, 0x5c, 0x80}, {0x0, 0xdd, 0xa7, 0x7a, 0x53, 0x8e, 0xf4, 0x29, 0xa6, 0x7b, 0x1, 0xdc, 0xf5, 0x28, 0x52, 0x8f}, {0x0, 0xde, 0xa1, 0x7f, 0x5f, 0x81, 0xfe, 0x20, 0xbe, 0x60, 0x1f, 0xc1, 0xe1, 0x3f, 0x40, 0x9e}, {0x0, 0xdf, 0xa3, 0x7c, 0x5b, 0x84, 0xf8, 0x27, 0xb6, 0x69, 0x15, 0xca, 0xed, 0x32, 0x4e, 0x91}, {0x0, 0xe0, 0xdd, 0x3d, 0xa7, 0x47, 0x7a, 0x9a, 0x53, 0xb3, 0x8e, 0x6e, 0xf4, 0x14, 0x29, 0xc9}, {0x0, 0xe1, 0xdf, 0x3e, 0xa3, 0x42, 0x7c, 0x9d, 0x5b, 0xba, 0x84, 0x65, 0xf8, 0x19, 0x27, 0xc6}, {0x0, 0xe2, 0xd9, 0x3b, 0xaf, 0x4d, 0x76, 0x94, 0x43, 0xa1, 0x9a, 0x78, 0xec, 0xe, 0x35, 0xd7}, {0x0, 0xe3, 0xdb, 0x38, 0xab, 0x48, 0x70, 0x93, 0x4b, 0xa8, 0x90, 0x73, 0xe0, 0x3, 0x3b, 0xd8}, {0x0, 0xe4, 0xd5, 0x31, 0xb7, 0x53, 0x62, 0x86, 0x73, 0x97, 0xa6, 0x42, 0xc4, 0x20, 0x11, 0xf5}, {0x0, 0xe5, 0xd7, 0x32, 0xb3, 0x56, 0x64, 0x81, 0x7b, 0x9e, 0xac, 0x49, 0xc8, 0x2d, 0x1f, 0xfa}, {0x0, 0xe6, 0xd1, 0x37, 0xbf, 0x59, 0x6e, 0x88, 0x63, 0x85, 0xb2, 0x54, 0xdc, 0x3a, 0xd, 0xeb}, {0x0, 0xe7, 0xd3, 0x34, 0xbb, 0x5c, 0x68, 0x8f, 0x6b, 0x8c, 0xb8, 0x5f, 0xd0, 0x37, 0x3, 0xe4}, {0x0, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1}, {0x0, 0xe9, 0xcf, 0x26, 0x83, 0x6a, 0x4c, 0xa5, 0x1b, 0xf2, 0xd4, 0x3d, 0x98, 0x71, 0x57, 0xbe}, {0x0, 0xea, 0xc9, 0x23, 0x8f, 0x65, 0x46, 0xac, 0x3, 0xe9, 0xca, 0x20, 0x8c, 0x66, 0x45, 0xaf}, {0x0, 0xeb, 0xcb, 0x20, 0x8b, 0x60, 0x40, 0xab, 0xb, 0xe0, 0xc0, 0x2b, 0x80, 0x6b, 0x4b, 0xa0}, {0x0, 0xec, 0xc5, 0x29, 0x97, 0x7b, 0x52, 0xbe, 0x33, 0xdf, 0xf6, 0x1a, 0xa4, 0x48, 0x61, 0x8d}, {0x0, 0xed, 0xc7, 0x2a, 0x93, 0x7e, 0x54, 0xb9, 0x3b, 0xd6, 0xfc, 0x11, 0xa8, 0x45, 0x6f, 0x82}, {0x0, 0xee, 0xc1, 0x2f, 0x9f, 0x71, 0x5e, 0xb0, 0x23, 0xcd, 0xe2, 0xc, 0xbc, 0x52, 0x7d, 0x93}, {0x0, 0xef, 0xc3, 0x2c, 0x9b, 0x74, 0x58, 0xb7, 0x2b, 0xc4, 0xe8, 0x7, 0xb0, 0x5f, 0x73, 0x9c}, {0x0, 0xf0, 0xfd, 0xd, 0xe7, 0x17, 0x1a, 0xea, 0xd3, 0x23, 0x2e, 0xde, 0x34, 0xc4, 0xc9, 0x39}, {0x0, 0xf1, 0xff, 0xe, 0xe3, 0x12, 0x1c, 0xed, 0xdb, 0x2a, 0x24, 0xd5, 0x38, 0xc9, 0xc7, 0x36}, {0x0, 0xf2, 0xf9, 0xb, 0xef, 0x1d, 0x16, 0xe4, 0xc3, 0x31, 0x3a, 0xc8, 0x2c, 0xde, 0xd5, 0x27}, {0x0, 0xf3, 0xfb, 0x8, 0xeb, 0x18, 0x10, 0xe3, 0xcb, 0x38, 0x30, 0xc3, 0x20, 0xd3, 0xdb, 0x28}, {0x0, 0xf4, 0xf5, 0x1, 0xf7, 0x3, 0x2, 0xf6, 0xf3, 0x7, 0x6, 0xf2, 0x4, 0xf0, 0xf1, 0x5}, {0x0, 0xf5, 
0xf7, 0x2, 0xf3, 0x6, 0x4, 0xf1, 0xfb, 0xe, 0xc, 0xf9, 0x8, 0xfd, 0xff, 0xa}, {0x0, 0xf6, 0xf1, 0x7, 0xff, 0x9, 0xe, 0xf8, 0xe3, 0x15, 0x12, 0xe4, 0x1c, 0xea, 0xed, 0x1b}, {0x0, 0xf7, 0xf3, 0x4, 0xfb, 0xc, 0x8, 0xff, 0xeb, 0x1c, 0x18, 0xef, 0x10, 0xe7, 0xe3, 0x14}, {0x0, 0xf8, 0xed, 0x15, 0xc7, 0x3f, 0x2a, 0xd2, 0x93, 0x6b, 0x7e, 0x86, 0x54, 0xac, 0xb9, 0x41}, {0x0, 0xf9, 0xef, 0x16, 0xc3, 0x3a, 0x2c, 0xd5, 0x9b, 0x62, 0x74, 0x8d, 0x58, 0xa1, 0xb7, 0x4e}, {0x0, 0xfa, 0xe9, 0x13, 0xcf, 0x35, 0x26, 0xdc, 0x83, 0x79, 0x6a, 0x90, 0x4c, 0xb6, 0xa5, 0x5f}, {0x0, 0xfb, 0xeb, 0x10, 0xcb, 0x30, 0x20, 0xdb, 0x8b, 0x70, 0x60, 0x9b, 0x40, 0xbb, 0xab, 0x50}, {0x0, 0xfc, 0xe5, 0x19, 0xd7, 0x2b, 0x32, 0xce, 0xb3, 0x4f, 0x56, 0xaa, 0x64, 0x98, 0x81, 0x7d}, {0x0, 0xfd, 0xe7, 0x1a, 0xd3, 0x2e, 0x34, 0xc9, 0xbb, 0x46, 0x5c, 0xa1, 0x68, 0x95, 0x8f, 0x72}, {0x0, 0xfe, 0xe1, 0x1f, 0xdf, 0x21, 0x3e, 0xc0, 0xa3, 0x5d, 0x42, 0xbc, 0x7c, 0x82, 0x9d, 0x63}, {0x0, 0xff, 0xe3, 0x1c, 0xdb, 0x24, 0x38, 0xc7, 0xab, 0x54, 0x48, 0xb7, 0x70, 0x8f, 0x93, 0x6c}} var mulTableHigh = [256][16]uint8{{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, {0x0, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0}, {0x0, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x1d, 0x3d, 0x5d, 0x7d, 0x9d, 0xbd, 0xdd, 0xfd}, {0x0, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x9d, 0xad, 0xfd, 0xcd, 0x5d, 0x6d, 0x3d, 0xd}, {0x0, 0x40, 0x80, 0xc0, 0x1d, 0x5d, 0x9d, 0xdd, 0x3a, 0x7a, 0xba, 0xfa, 0x27, 0x67, 0xa7, 0xe7}, {0x0, 0x50, 0xa0, 0xf0, 0x5d, 0xd, 0xfd, 0xad, 0xba, 0xea, 0x1a, 0x4a, 0xe7, 0xb7, 0x47, 0x17}, {0x0, 0x60, 0xc0, 0xa0, 0x9d, 0xfd, 0x5d, 0x3d, 0x27, 0x47, 0xe7, 0x87, 0xba, 0xda, 0x7a, 0x1a}, {0x0, 0x70, 0xe0, 0x90, 0xdd, 0xad, 0x3d, 0x4d, 0xa7, 0xd7, 0x47, 0x37, 0x7a, 0xa, 0x9a, 0xea}, {0x0, 0x80, 0x1d, 0x9d, 0x3a, 0xba, 0x27, 0xa7, 0x74, 0xf4, 0x69, 0xe9, 0x4e, 0xce, 0x53, 0xd3}, {0x0, 0x90, 0x3d, 0xad, 0x7a, 0xea, 0x47, 0xd7, 0xf4, 0x64, 0xc9, 0x59, 0x8e, 0x1e, 0xb3, 0x23}, {0x0, 0xa0, 0x5d, 0xfd, 0xba, 0x1a, 0xe7, 0x47, 0x69, 0xc9, 0x34, 0x94, 0xd3, 0x73, 0x8e, 0x2e}, {0x0, 0xb0, 0x7d, 0xcd, 0xfa, 0x4a, 0x87, 0x37, 0xe9, 0x59, 0x94, 0x24, 0x13, 0xa3, 0x6e, 0xde}, {0x0, 0xc0, 0x9d, 0x5d, 0x27, 0xe7, 0xba, 0x7a, 0x4e, 0x8e, 0xd3, 0x13, 0x69, 0xa9, 0xf4, 0x34}, {0x0, 0xd0, 0xbd, 0x6d, 0x67, 0xb7, 0xda, 0xa, 0xce, 0x1e, 0x73, 0xa3, 0xa9, 0x79, 0x14, 0xc4}, {0x0, 0xe0, 0xdd, 0x3d, 0xa7, 0x47, 0x7a, 0x9a, 0x53, 0xb3, 0x8e, 0x6e, 0xf4, 0x14, 0x29, 0xc9}, {0x0, 0xf0, 0xfd, 0xd, 0xe7, 0x17, 0x1a, 0xea, 0xd3, 0x23, 0x2e, 0xde, 0x34, 0xc4, 0xc9, 0x39}, {0x0, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb}, {0x0, 0xd, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b}, {0x0, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3, 0xf5, 0xc8, 0x8f, 0xb2, 0x1, 0x3c, 0x7b, 0x46}, {0x0, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3, 0x75, 0x58, 0x2f, 0x2, 0xc1, 0xec, 0x9b, 0xb6}, {0x0, 0x5d, 0xba, 0xe7, 0x69, 0x34, 0xd3, 0x8e, 0xd2, 0x8f, 0x68, 0x35, 0xbb, 0xe6, 0x1, 0x5c}, {0x0, 0x4d, 0x9a, 0xd7, 0x29, 0x64, 0xb3, 0xfe, 0x52, 0x1f, 0xc8, 0x85, 0x7b, 0x36, 0xe1, 0xac}, {0x0, 0x7d, 0xfa, 0x87, 0xe9, 0x94, 0x13, 0x6e, 0xcf, 0xb2, 0x35, 0x48, 0x26, 0x5b, 0xdc, 0xa1}, {0x0, 0x6d, 0xda, 0xb7, 0xa9, 0xc4, 0x73, 0x1e, 0x4f, 0x22, 0x95, 0xf8, 0xe6, 0x8b, 0x3c, 0x51}, {0x0, 0x9d, 0x27, 0xba, 0x4e, 0xd3, 0x69, 0xf4, 0x9c, 0x1, 0xbb, 0x26, 0xd2, 0x4f, 0xf5, 0x68}, {0x0, 0x8d, 0x7, 0x8a, 0xe, 0x83, 0x9, 0x84, 0x1c, 0x91, 0x1b, 0x96, 0x12, 
0x9f, 0x15, 0x98}, {0x0, 0xbd, 0x67, 0xda, 0xce, 0x73, 0xa9, 0x14, 0x81, 0x3c, 0xe6, 0x5b, 0x4f, 0xf2, 0x28, 0x95}, {0x0, 0xad, 0x47, 0xea, 0x8e, 0x23, 0xc9, 0x64, 0x1, 0xac, 0x46, 0xeb, 0x8f, 0x22, 0xc8, 0x65}, {0x0, 0xdd, 0xa7, 0x7a, 0x53, 0x8e, 0xf4, 0x29, 0xa6, 0x7b, 0x1, 0xdc, 0xf5, 0x28, 0x52, 0x8f}, {0x0, 0xcd, 0x87, 0x4a, 0x13, 0xde, 0x94, 0x59, 0x26, 0xeb, 0xa1, 0x6c, 0x35, 0xf8, 0xb2, 0x7f}, {0x0, 0xfd, 0xe7, 0x1a, 0xd3, 0x2e, 0x34, 0xc9, 0xbb, 0x46, 0x5c, 0xa1, 0x68, 0x95, 0x8f, 0x72}, {0x0, 0xed, 0xc7, 0x2a, 0x93, 0x7e, 0x54, 0xb9, 0x3b, 0xd6, 0xfc, 0x11, 0xa8, 0x45, 0x6f, 0x82}, {0x0, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b}, {0x0, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6, 0x4d, 0x67, 0x19, 0x33, 0xe5, 0xcf, 0xb1, 0x9b}, {0x0, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46, 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96}, {0x0, 0xa, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36, 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66}, {0x0, 0x7a, 0xf4, 0x8e, 0xf5, 0x8f, 0x1, 0x7b, 0xf7, 0x8d, 0x3, 0x79, 0x2, 0x78, 0xf6, 0x8c}, {0x0, 0x6a, 0xd4, 0xbe, 0xb5, 0xdf, 0x61, 0xb, 0x77, 0x1d, 0xa3, 0xc9, 0xc2, 0xa8, 0x16, 0x7c}, {0x0, 0x5a, 0xb4, 0xee, 0x75, 0x2f, 0xc1, 0x9b, 0xea, 0xb0, 0x5e, 0x4, 0x9f, 0xc5, 0x2b, 0x71}, {0x0, 0x4a, 0x94, 0xde, 0x35, 0x7f, 0xa1, 0xeb, 0x6a, 0x20, 0xfe, 0xb4, 0x5f, 0x15, 0xcb, 0x81}, {0x0, 0xba, 0x69, 0xd3, 0xd2, 0x68, 0xbb, 0x1, 0xb9, 0x3, 0xd0, 0x6a, 0x6b, 0xd1, 0x2, 0xb8}, {0x0, 0xaa, 0x49, 0xe3, 0x92, 0x38, 0xdb, 0x71, 0x39, 0x93, 0x70, 0xda, 0xab, 0x1, 0xe2, 0x48}, {0x0, 0x9a, 0x29, 0xb3, 0x52, 0xc8, 0x7b, 0xe1, 0xa4, 0x3e, 0x8d, 0x17, 0xf6, 0x6c, 0xdf, 0x45}, {0x0, 0x8a, 0x9, 0x83, 0x12, 0x98, 0x1b, 0x91, 0x24, 0xae, 0x2d, 0xa7, 0x36, 0xbc, 0x3f, 0xb5}, {0x0, 0xfa, 0xe9, 0x13, 0xcf, 0x35, 0x26, 0xdc, 0x83, 0x79, 0x6a, 0x90, 0x4c, 0xb6, 0xa5, 0x5f}, {0x0, 0xea, 0xc9, 0x23, 0x8f, 0x65, 0x46, 0xac, 0x3, 0xe9, 0xca, 0x20, 0x8c, 0x66, 0x45, 0xaf}, {0x0, 0xda, 0xa9, 0x73, 0x4f, 0x95, 0xe6, 0x3c, 0x9e, 0x44, 0x37, 0xed, 0xd1, 0xb, 0x78, 0xa2}, {0x0, 0xca, 0x89, 0x43, 0xf, 0xc5, 0x86, 0x4c, 0x1e, 0xd4, 0x97, 0x5d, 0x11, 0xdb, 0x98, 0x52}, {0x0, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x25, 0x2, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0}, {0x0, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85, 0xa5, 0x92, 0xcb, 0xfc, 0x79, 0x4e, 0x17, 0x20}, {0x0, 0x7, 0xe, 0x9, 0x1c, 0x1b, 0x12, 0x15, 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d}, {0x0, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65, 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd}, {0x0, 0x67, 0xce, 0xa9, 0x81, 0xe6, 0x4f, 0x28, 0x1f, 0x78, 0xd1, 0xb6, 0x9e, 0xf9, 0x50, 0x37}, {0x0, 0x77, 0xee, 0x99, 0xc1, 0xb6, 0x2f, 0x58, 0x9f, 0xe8, 0x71, 0x6, 0x5e, 0x29, 0xb0, 0xc7}, {0x0, 0x47, 0x8e, 0xc9, 0x1, 0x46, 0x8f, 0xc8, 0x2, 0x45, 0x8c, 0xcb, 0x3, 0x44, 0x8d, 0xca}, {0x0, 0x57, 0xae, 0xf9, 0x41, 0x16, 0xef, 0xb8, 0x82, 0xd5, 0x2c, 0x7b, 0xc3, 0x94, 0x6d, 0x3a}, {0x0, 0xa7, 0x53, 0xf4, 0xa6, 0x1, 0xf5, 0x52, 0x51, 0xf6, 0x2, 0xa5, 0xf7, 0x50, 0xa4, 0x3}, {0x0, 0xb7, 0x73, 0xc4, 0xe6, 0x51, 0x95, 0x22, 0xd1, 0x66, 0xa2, 0x15, 0x37, 0x80, 0x44, 0xf3}, {0x0, 0x87, 0x13, 0x94, 0x26, 0xa1, 0x35, 0xb2, 0x4c, 0xcb, 0x5f, 0xd8, 0x6a, 0xed, 0x79, 0xfe}, {0x0, 0x97, 0x33, 0xa4, 0x66, 0xf1, 0x55, 0xc2, 0xcc, 0x5b, 0xff, 0x68, 0xaa, 0x3d, 0x99, 0xe}, {0x0, 0xe7, 0xd3, 0x34, 0xbb, 0x5c, 0x68, 0x8f, 0x6b, 0x8c, 0xb8, 0x5f, 0xd0, 0x37, 0x3, 0xe4}, {0x0, 0xf7, 0xf3, 0x4, 0xfb, 0xc, 0x8, 0xff, 0xeb, 0x1c, 0x18, 0xef, 0x10, 0xe7, 0xe3, 0x14}, {0x0, 0xc7, 0x93, 0x54, 0x3b, 0xfc, 0xa8, 0x6f, 0x76, 0xb1, 0xe5, 0x22, 
0x4d, 0x8a, 0xde, 0x19}, {0x0, 0xd7, 0xb3, 0x64, 0x7b, 0xac, 0xc8, 0x1f, 0xf6, 0x21, 0x45, 0x92, 0x8d, 0x5a, 0x3e, 0xe9}, {0x0, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6}, {0x0, 0x64, 0xc8, 0xac, 0x8d, 0xe9, 0x45, 0x21, 0x7, 0x63, 0xcf, 0xab, 0x8a, 0xee, 0x42, 0x26}, {0x0, 0x54, 0xa8, 0xfc, 0x4d, 0x19, 0xe5, 0xb1, 0x9a, 0xce, 0x32, 0x66, 0xd7, 0x83, 0x7f, 0x2b}, {0x0, 0x44, 0x88, 0xcc, 0xd, 0x49, 0x85, 0xc1, 0x1a, 0x5e, 0x92, 0xd6, 0x17, 0x53, 0x9f, 0xdb}, {0x0, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c, 0xbd, 0x89, 0xd5, 0xe1, 0x6d, 0x59, 0x5, 0x31}, {0x0, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc, 0x3d, 0x19, 0x75, 0x51, 0xad, 0x89, 0xe5, 0xc1}, {0x0, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c, 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc}, {0x0, 0x4, 0x8, 0xc, 0x10, 0x14, 0x18, 0x1c, 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c}, {0x0, 0xf4, 0xf5, 0x1, 0xf7, 0x3, 0x2, 0xf6, 0xf3, 0x7, 0x6, 0xf2, 0x4, 0xf0, 0xf1, 0x5}, {0x0, 0xe4, 0xd5, 0x31, 0xb7, 0x53, 0x62, 0x86, 0x73, 0x97, 0xa6, 0x42, 0xc4, 0x20, 0x11, 0xf5}, {0x0, 0xd4, 0xb5, 0x61, 0x77, 0xa3, 0xc2, 0x16, 0xee, 0x3a, 0x5b, 0x8f, 0x99, 0x4d, 0x2c, 0xf8}, {0x0, 0xc4, 0x95, 0x51, 0x37, 0xf3, 0xa2, 0x66, 0x6e, 0xaa, 0xfb, 0x3f, 0x59, 0x9d, 0xcc, 0x8}, {0x0, 0xb4, 0x75, 0xc1, 0xea, 0x5e, 0x9f, 0x2b, 0xc9, 0x7d, 0xbc, 0x8, 0x23, 0x97, 0x56, 0xe2}, {0x0, 0xa4, 0x55, 0xf1, 0xaa, 0xe, 0xff, 0x5b, 0x49, 0xed, 0x1c, 0xb8, 0xe3, 0x47, 0xb6, 0x12}, {0x0, 0x94, 0x35, 0xa1, 0x6a, 0xfe, 0x5f, 0xcb, 0xd4, 0x40, 0xe1, 0x75, 0xbe, 0x2a, 0x8b, 0x1f}, {0x0, 0x84, 0x15, 0x91, 0x2a, 0xae, 0x3f, 0xbb, 0x54, 0xd0, 0x41, 0xc5, 0x7e, 0xfa, 0x6b, 0xef}, {0x0, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x2, 0x6f, 0x6, 0xbd, 0xd4, 0xd6, 0xbf, 0x4, 0x6d}, {0x0, 0x79, 0xf2, 0x8b, 0xf9, 0x80, 0xb, 0x72, 0xef, 0x96, 0x1d, 0x64, 0x16, 0x6f, 0xe4, 0x9d}, {0x0, 0x49, 0x92, 0xdb, 0x39, 0x70, 0xab, 0xe2, 0x72, 0x3b, 0xe0, 0xa9, 0x4b, 0x2, 0xd9, 0x90}, {0x0, 0x59, 0xb2, 0xeb, 0x79, 0x20, 0xcb, 0x92, 0xf2, 0xab, 0x40, 0x19, 0x8b, 0xd2, 0x39, 0x60}, {0x0, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf, 0x55, 0x7c, 0x7, 0x2e, 0xf1, 0xd8, 0xa3, 0x8a}, {0x0, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf, 0xd5, 0xec, 0xa7, 0x9e, 0x31, 0x8, 0x43, 0x7a}, {0x0, 0x9, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77}, {0x0, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f, 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87}, {0x0, 0xe9, 0xcf, 0x26, 0x83, 0x6a, 0x4c, 0xa5, 0x1b, 0xf2, 0xd4, 0x3d, 0x98, 0x71, 0x57, 0xbe}, {0x0, 0xf9, 0xef, 0x16, 0xc3, 0x3a, 0x2c, 0xd5, 0x9b, 0x62, 0x74, 0x8d, 0x58, 0xa1, 0xb7, 0x4e}, {0x0, 0xc9, 0x8f, 0x46, 0x3, 0xca, 0x8c, 0x45, 0x6, 0xcf, 0x89, 0x40, 0x5, 0xcc, 0x8a, 0x43}, {0x0, 0xd9, 0xaf, 0x76, 0x43, 0x9a, 0xec, 0x35, 0x86, 0x5f, 0x29, 0xf0, 0xc5, 0x1c, 0x6a, 0xb3}, {0x0, 0xa9, 0x4f, 0xe6, 0x9e, 0x37, 0xd1, 0x78, 0x21, 0x88, 0x6e, 0xc7, 0xbf, 0x16, 0xf0, 0x59}, {0x0, 0xb9, 0x6f, 0xd6, 0xde, 0x67, 0xb1, 0x8, 0xa1, 0x18, 0xce, 0x77, 0x7f, 0xc6, 0x10, 0xa9}, {0x0, 0x89, 0xf, 0x86, 0x1e, 0x97, 0x11, 0x98, 0x3c, 0xb5, 0x33, 0xba, 0x22, 0xab, 0x2d, 0xa4}, {0x0, 0x99, 0x2f, 0xb6, 0x5e, 0xc7, 0x71, 0xe8, 0xbc, 0x25, 0x93, 0xa, 0xe2, 0x7b, 0xcd, 0x54}, {0x0, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, 0x4a, 0x4, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd}, {0x0, 0x5e, 0xbc, 0xe2, 0x65, 0x3b, 0xd9, 0x87, 0xca, 0x94, 0x76, 0x28, 0xaf, 0xf1, 0x13, 0x4d}, {0x0, 0x6e, 0xdc, 0xb2, 0xa5, 0xcb, 0x79, 0x17, 0x57, 0x39, 0x8b, 0xe5, 0xf2, 0x9c, 0x2e, 0x40}, {0x0, 0x7e, 0xfc, 0x82, 0xe5, 0x9b, 0x19, 0x67, 0xd7, 0xa9, 0x2b, 
0x55, 0x32, 0x4c, 0xce, 0xb0}, {0x0, 0xe, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a}, {0x0, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a, 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa}, {0x0, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca, 0x6d, 0x43, 0x31, 0x1f, 0xd5, 0xfb, 0x89, 0xa7}, {0x0, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba, 0xed, 0xd3, 0x91, 0xaf, 0x15, 0x2b, 0x69, 0x57}, {0x0, 0xce, 0x81, 0x4f, 0x1f, 0xd1, 0x9e, 0x50, 0x3e, 0xf0, 0xbf, 0x71, 0x21, 0xef, 0xa0, 0x6e}, {0x0, 0xde, 0xa1, 0x7f, 0x5f, 0x81, 0xfe, 0x20, 0xbe, 0x60, 0x1f, 0xc1, 0xe1, 0x3f, 0x40, 0x9e}, {0x0, 0xee, 0xc1, 0x2f, 0x9f, 0x71, 0x5e, 0xb0, 0x23, 0xcd, 0xe2, 0xc, 0xbc, 0x52, 0x7d, 0x93}, {0x0, 0xfe, 0xe1, 0x1f, 0xdf, 0x21, 0x3e, 0xc0, 0xa3, 0x5d, 0x42, 0xbc, 0x7c, 0x82, 0x9d, 0x63}, {0x0, 0x8e, 0x1, 0x8f, 0x2, 0x8c, 0x3, 0x8d, 0x4, 0x8a, 0x5, 0x8b, 0x6, 0x88, 0x7, 0x89}, {0x0, 0x9e, 0x21, 0xbf, 0x42, 0xdc, 0x63, 0xfd, 0x84, 0x1a, 0xa5, 0x3b, 0xc6, 0x58, 0xe7, 0x79}, {0x0, 0xae, 0x41, 0xef, 0x82, 0x2c, 0xc3, 0x6d, 0x19, 0xb7, 0x58, 0xf6, 0x9b, 0x35, 0xda, 0x74}, {0x0, 0xbe, 0x61, 0xdf, 0xc2, 0x7c, 0xa3, 0x1d, 0x99, 0x27, 0xf8, 0x46, 0x5b, 0xe5, 0x3a, 0x84}, {0x0, 0x53, 0xa6, 0xf5, 0x51, 0x2, 0xf7, 0xa4, 0xa2, 0xf1, 0x4, 0x57, 0xf3, 0xa0, 0x55, 0x6}, {0x0, 0x43, 0x86, 0xc5, 0x11, 0x52, 0x97, 0xd4, 0x22, 0x61, 0xa4, 0xe7, 0x33, 0x70, 0xb5, 0xf6}, {0x0, 0x73, 0xe6, 0x95, 0xd1, 0xa2, 0x37, 0x44, 0xbf, 0xcc, 0x59, 0x2a, 0x6e, 0x1d, 0x88, 0xfb}, {0x0, 0x63, 0xc6, 0xa5, 0x91, 0xf2, 0x57, 0x34, 0x3f, 0x5c, 0xf9, 0x9a, 0xae, 0xcd, 0x68, 0xb}, {0x0, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79, 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1}, {0x0, 0x3, 0x6, 0x5, 0xc, 0xf, 0xa, 0x9, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11}, {0x0, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99, 0x85, 0xb6, 0xe3, 0xd0, 0x49, 0x7a, 0x2f, 0x1c}, {0x0, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9, 0x5, 0x26, 0x43, 0x60, 0x89, 0xaa, 0xcf, 0xec}, {0x0, 0xd3, 0xbb, 0x68, 0x6b, 0xb8, 0xd0, 0x3, 0xd6, 0x5, 0x6d, 0xbe, 0xbd, 0x6e, 0x6, 0xd5}, {0x0, 0xc3, 0x9b, 0x58, 0x2b, 0xe8, 0xb0, 0x73, 0x56, 0x95, 0xcd, 0xe, 0x7d, 0xbe, 0xe6, 0x25}, {0x0, 0xf3, 0xfb, 0x8, 0xeb, 0x18, 0x10, 0xe3, 0xcb, 0x38, 0x30, 0xc3, 0x20, 0xd3, 0xdb, 0x28}, {0x0, 0xe3, 0xdb, 0x38, 0xab, 0x48, 0x70, 0x93, 0x4b, 0xa8, 0x90, 0x73, 0xe0, 0x3, 0x3b, 0xd8}, {0x0, 0x93, 0x3b, 0xa8, 0x76, 0xe5, 0x4d, 0xde, 0xec, 0x7f, 0xd7, 0x44, 0x9a, 0x9, 0xa1, 0x32}, {0x0, 0x83, 0x1b, 0x98, 0x36, 0xb5, 0x2d, 0xae, 0x6c, 0xef, 0x77, 0xf4, 0x5a, 0xd9, 0x41, 0xc2}, {0x0, 0xb3, 0x7b, 0xc8, 0xf6, 0x45, 0x8d, 0x3e, 0xf1, 0x42, 0x8a, 0x39, 0x7, 0xb4, 0x7c, 0xcf}, {0x0, 0xa3, 0x5b, 0xf8, 0xb6, 0x15, 0xed, 0x4e, 0x71, 0xd2, 0x2a, 0x89, 0xc7, 0x64, 0x9c, 0x3f}, {0x0, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1}, {0x0, 0xf8, 0xed, 0x15, 0xc7, 0x3f, 0x2a, 0xd2, 0x93, 0x6b, 0x7e, 0x86, 0x54, 0xac, 0xb9, 0x41}, {0x0, 0xc8, 0x8d, 0x45, 0x7, 0xcf, 0x8a, 0x42, 0xe, 0xc6, 0x83, 0x4b, 0x9, 0xc1, 0x84, 0x4c}, {0x0, 0xd8, 0xad, 0x75, 0x47, 0x9f, 0xea, 0x32, 0x8e, 0x56, 0x23, 0xfb, 0xc9, 0x11, 0x64, 0xbc}, {0x0, 0xa8, 0x4d, 0xe5, 0x9a, 0x32, 0xd7, 0x7f, 0x29, 0x81, 0x64, 0xcc, 0xb3, 0x1b, 0xfe, 0x56}, {0x0, 0xb8, 0x6d, 0xd5, 0xda, 0x62, 0xb7, 0xf, 0xa9, 0x11, 0xc4, 0x7c, 0x73, 0xcb, 0x1e, 0xa6}, {0x0, 0x88, 0xd, 0x85, 0x1a, 0x92, 0x17, 0x9f, 0x34, 0xbc, 0x39, 0xb1, 0x2e, 0xa6, 0x23, 0xab}, {0x0, 0x98, 0x2d, 0xb5, 0x5a, 0xc2, 0x77, 0xef, 0xb4, 0x2c, 0x99, 0x1, 0xee, 0x76, 0xc3, 0x5b}, {0x0, 0x68, 0xd0, 0xb8, 0xbd, 0xd5, 0x6d, 0x5, 0x67, 0xf, 0xb7, 
0xdf, 0xda, 0xb2, 0xa, 0x62}, {0x0, 0x78, 0xf0, 0x88, 0xfd, 0x85, 0xd, 0x75, 0xe7, 0x9f, 0x17, 0x6f, 0x1a, 0x62, 0xea, 0x92}, {0x0, 0x48, 0x90, 0xd8, 0x3d, 0x75, 0xad, 0xe5, 0x7a, 0x32, 0xea, 0xa2, 0x47, 0xf, 0xd7, 0x9f}, {0x0, 0x58, 0xb0, 0xe8, 0x7d, 0x25, 0xcd, 0x95, 0xfa, 0xa2, 0x4a, 0x12, 0x87, 0xdf, 0x37, 0x6f}, {0x0, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8, 0x5d, 0x75, 0xd, 0x25, 0xfd, 0xd5, 0xad, 0x85}, {0x0, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8, 0xdd, 0xe5, 0xad, 0x95, 0x3d, 0x5, 0x4d, 0x75}, {0x0, 0x8, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38, 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78}, {0x0, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48, 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88}, {0x0, 0xf5, 0xf7, 0x2, 0xf3, 0x6, 0x4, 0xf1, 0xfb, 0xe, 0xc, 0xf9, 0x8, 0xfd, 0xff, 0xa}, {0x0, 0xe5, 0xd7, 0x32, 0xb3, 0x56, 0x64, 0x81, 0x7b, 0x9e, 0xac, 0x49, 0xc8, 0x2d, 0x1f, 0xfa}, {0x0, 0xd5, 0xb7, 0x62, 0x73, 0xa6, 0xc4, 0x11, 0xe6, 0x33, 0x51, 0x84, 0x95, 0x40, 0x22, 0xf7}, {0x0, 0xc5, 0x97, 0x52, 0x33, 0xf6, 0xa4, 0x61, 0x66, 0xa3, 0xf1, 0x34, 0x55, 0x90, 0xc2, 0x7}, {0x0, 0xb5, 0x77, 0xc2, 0xee, 0x5b, 0x99, 0x2c, 0xc1, 0x74, 0xb6, 0x3, 0x2f, 0x9a, 0x58, 0xed}, {0x0, 0xa5, 0x57, 0xf2, 0xae, 0xb, 0xf9, 0x5c, 0x41, 0xe4, 0x16, 0xb3, 0xef, 0x4a, 0xb8, 0x1d}, {0x0, 0x95, 0x37, 0xa2, 0x6e, 0xfb, 0x59, 0xcc, 0xdc, 0x49, 0xeb, 0x7e, 0xb2, 0x27, 0x85, 0x10}, {0x0, 0x85, 0x17, 0x92, 0x2e, 0xab, 0x39, 0xbc, 0x5c, 0xd9, 0x4b, 0xce, 0x72, 0xf7, 0x65, 0xe0}, {0x0, 0x75, 0xea, 0x9f, 0xc9, 0xbc, 0x23, 0x56, 0x8f, 0xfa, 0x65, 0x10, 0x46, 0x33, 0xac, 0xd9}, {0x0, 0x65, 0xca, 0xaf, 0x89, 0xec, 0x43, 0x26, 0xf, 0x6a, 0xc5, 0xa0, 0x86, 0xe3, 0x4c, 0x29}, {0x0, 0x55, 0xaa, 0xff, 0x49, 0x1c, 0xe3, 0xb6, 0x92, 0xc7, 0x38, 0x6d, 0xdb, 0x8e, 0x71, 0x24}, {0x0, 0x45, 0x8a, 0xcf, 0x9, 0x4c, 0x83, 0xc6, 0x12, 0x57, 0x98, 0xdd, 0x1b, 0x5e, 0x91, 0xd4}, {0x0, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b, 0xb5, 0x80, 0xdf, 0xea, 0x61, 0x54, 0xb, 0x3e}, {0x0, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb, 0x35, 0x10, 0x7f, 0x5a, 0xa1, 0x84, 0xeb, 0xce}, {0x0, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b, 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3}, {0x0, 0x5, 0xa, 0xf, 0x14, 0x11, 0x1e, 0x1b, 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33}, {0x0, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x4, 0xde, 0xc, 0x67, 0xb5, 0xb1, 0x63, 0x8, 0xda}, {0x0, 0xc2, 0x99, 0x5b, 0x2f, 0xed, 0xb6, 0x74, 0x5e, 0x9c, 0xc7, 0x5, 0x71, 0xb3, 0xe8, 0x2a}, {0x0, 0xf2, 0xf9, 0xb, 0xef, 0x1d, 0x16, 0xe4, 0xc3, 0x31, 0x3a, 0xc8, 0x2c, 0xde, 0xd5, 0x27}, {0x0, 0xe2, 0xd9, 0x3b, 0xaf, 0x4d, 0x76, 0x94, 0x43, 0xa1, 0x9a, 0x78, 0xec, 0xe, 0x35, 0xd7}, {0x0, 0x92, 0x39, 0xab, 0x72, 0xe0, 0x4b, 0xd9, 0xe4, 0x76, 0xdd, 0x4f, 0x96, 0x4, 0xaf, 0x3d}, {0x0, 0x82, 0x19, 0x9b, 0x32, 0xb0, 0x2b, 0xa9, 0x64, 0xe6, 0x7d, 0xff, 0x56, 0xd4, 0x4f, 0xcd}, {0x0, 0xb2, 0x79, 0xcb, 0xf2, 0x40, 0x8b, 0x39, 0xf9, 0x4b, 0x80, 0x32, 0xb, 0xb9, 0x72, 0xc0}, {0x0, 0xa2, 0x59, 0xfb, 0xb2, 0x10, 0xeb, 0x49, 0x79, 0xdb, 0x20, 0x82, 0xcb, 0x69, 0x92, 0x30}, {0x0, 0x52, 0xa4, 0xf6, 0x55, 0x7, 0xf1, 0xa3, 0xaa, 0xf8, 0xe, 0x5c, 0xff, 0xad, 0x5b, 0x9}, {0x0, 0x42, 0x84, 0xc6, 0x15, 0x57, 0x91, 0xd3, 0x2a, 0x68, 0xae, 0xec, 0x3f, 0x7d, 0xbb, 0xf9}, {0x0, 0x72, 0xe4, 0x96, 0xd5, 0xa7, 0x31, 0x43, 0xb7, 0xc5, 0x53, 0x21, 0x62, 0x10, 0x86, 0xf4}, {0x0, 0x62, 0xc4, 0xa6, 0x95, 0xf7, 0x51, 0x33, 0x37, 0x55, 0xf3, 0x91, 0xa2, 0xc0, 0x66, 0x4}, {0x0, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e, 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee}, {0x0, 0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, 0x10, 0x12, 0x14, 0x16, 
0x18, 0x1a, 0x1c, 0x1e}, {0x0, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e, 0x8d, 0xbf, 0xe9, 0xdb, 0x45, 0x77, 0x21, 0x13}, {0x0, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee, 0xd, 0x2f, 0x49, 0x6b, 0x85, 0xa7, 0xc1, 0xe3}, {0x0, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61}, {0x0, 0xdf, 0xa3, 0x7c, 0x5b, 0x84, 0xf8, 0x27, 0xb6, 0x69, 0x15, 0xca, 0xed, 0x32, 0x4e, 0x91}, {0x0, 0xef, 0xc3, 0x2c, 0x9b, 0x74, 0x58, 0xb7, 0x2b, 0xc4, 0xe8, 0x7, 0xb0, 0x5f, 0x73, 0x9c}, {0x0, 0xff, 0xe3, 0x1c, 0xdb, 0x24, 0x38, 0xc7, 0xab, 0x54, 0x48, 0xb7, 0x70, 0x8f, 0x93, 0x6c}, {0x0, 0x8f, 0x3, 0x8c, 0x6, 0x89, 0x5, 0x8a, 0xc, 0x83, 0xf, 0x80, 0xa, 0x85, 0x9, 0x86}, {0x0, 0x9f, 0x23, 0xbc, 0x46, 0xd9, 0x65, 0xfa, 0x8c, 0x13, 0xaf, 0x30, 0xca, 0x55, 0xe9, 0x76}, {0x0, 0xaf, 0x43, 0xec, 0x86, 0x29, 0xc5, 0x6a, 0x11, 0xbe, 0x52, 0xfd, 0x97, 0x38, 0xd4, 0x7b}, {0x0, 0xbf, 0x63, 0xdc, 0xc6, 0x79, 0xa5, 0x1a, 0x91, 0x2e, 0xf2, 0x4d, 0x57, 0xe8, 0x34, 0x8b}, {0x0, 0x4f, 0x9e, 0xd1, 0x21, 0x6e, 0xbf, 0xf0, 0x42, 0xd, 0xdc, 0x93, 0x63, 0x2c, 0xfd, 0xb2}, {0x0, 0x5f, 0xbe, 0xe1, 0x61, 0x3e, 0xdf, 0x80, 0xc2, 0x9d, 0x7c, 0x23, 0xa3, 0xfc, 0x1d, 0x42}, {0x0, 0x6f, 0xde, 0xb1, 0xa1, 0xce, 0x7f, 0x10, 0x5f, 0x30, 0x81, 0xee, 0xfe, 0x91, 0x20, 0x4f}, {0x0, 0x7f, 0xfe, 0x81, 0xe1, 0x9e, 0x1f, 0x60, 0xdf, 0xa0, 0x21, 0x5e, 0x3e, 0x41, 0xc0, 0xbf}, {0x0, 0xf, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d, 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55}, {0x0, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d, 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5}, {0x0, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd, 0x65, 0x4a, 0x3b, 0x14, 0xd9, 0xf6, 0x87, 0xa8}, {0x0, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd, 0xe5, 0xda, 0x9b, 0xa4, 0x19, 0x26, 0x67, 0x58}, {0x0, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, 0x94, 0x8, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67}, {0x0, 0x8c, 0x5, 0x89, 0xa, 0x86, 0xf, 0x83, 0x14, 0x98, 0x11, 0x9d, 0x1e, 0x92, 0x1b, 0x97}, {0x0, 0xbc, 0x65, 0xd9, 0xca, 0x76, 0xaf, 0x13, 0x89, 0x35, 0xec, 0x50, 0x43, 0xff, 0x26, 0x9a}, {0x0, 0xac, 0x45, 0xe9, 0x8a, 0x26, 0xcf, 0x63, 0x9, 0xa5, 0x4c, 0xe0, 0x83, 0x2f, 0xc6, 0x6a}, {0x0, 0xdc, 0xa5, 0x79, 0x57, 0x8b, 0xf2, 0x2e, 0xae, 0x72, 0xb, 0xd7, 0xf9, 0x25, 0x5c, 0x80}, {0x0, 0xcc, 0x85, 0x49, 0x17, 0xdb, 0x92, 0x5e, 0x2e, 0xe2, 0xab, 0x67, 0x39, 0xf5, 0xbc, 0x70}, {0x0, 0xfc, 0xe5, 0x19, 0xd7, 0x2b, 0x32, 0xce, 0xb3, 0x4f, 0x56, 0xaa, 0x64, 0x98, 0x81, 0x7d}, {0x0, 0xec, 0xc5, 0x29, 0x97, 0x7b, 0x52, 0xbe, 0x33, 0xdf, 0xf6, 0x1a, 0xa4, 0x48, 0x61, 0x8d}, {0x0, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54, 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4}, {0x0, 0xc, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24, 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44}, {0x0, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4, 0xfd, 0xc1, 0x85, 0xb9, 0xd, 0x31, 0x75, 0x49}, {0x0, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4, 0x7d, 0x51, 0x25, 0x9, 0xcd, 0xe1, 0x95, 0xb9}, {0x0, 0x5c, 0xb8, 0xe4, 0x6d, 0x31, 0xd5, 0x89, 0xda, 0x86, 0x62, 0x3e, 0xb7, 0xeb, 0xf, 0x53}, {0x0, 0x4c, 0x98, 0xd4, 0x2d, 0x61, 0xb5, 0xf9, 0x5a, 0x16, 0xc2, 0x8e, 0x77, 0x3b, 0xef, 0xa3}, {0x0, 0x7c, 0xf8, 0x84, 0xed, 0x91, 0x15, 0x69, 0xc7, 0xbb, 0x3f, 0x43, 0x2a, 0x56, 0xd2, 0xae}, {0x0, 0x6c, 0xd8, 0xb4, 0xad, 0xc1, 0x75, 0x19, 0x47, 0x2b, 0x9f, 0xf3, 0xea, 0x86, 0x32, 0x5e}, {0x0, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc}, {0x0, 0x91, 0x3f, 0xae, 0x7e, 0xef, 0x41, 0xd0, 0xfc, 0x6d, 0xc3, 0x52, 0x82, 0x13, 0xbd, 0x2c}, {0x0, 0xa1, 0x5f, 0xfe, 0xbe, 0x1f, 0xe1, 0x40, 0x61, 
0xc0, 0x3e, 0x9f, 0xdf, 0x7e, 0x80, 0x21}, {0x0, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x30, 0xe1, 0x50, 0x9e, 0x2f, 0x1f, 0xae, 0x60, 0xd1}, {0x0, 0xc1, 0x9f, 0x5e, 0x23, 0xe2, 0xbc, 0x7d, 0x46, 0x87, 0xd9, 0x18, 0x65, 0xa4, 0xfa, 0x3b}, {0x0, 0xd1, 0xbf, 0x6e, 0x63, 0xb2, 0xdc, 0xd, 0xc6, 0x17, 0x79, 0xa8, 0xa5, 0x74, 0x1a, 0xcb}, {0x0, 0xe1, 0xdf, 0x3e, 0xa3, 0x42, 0x7c, 0x9d, 0x5b, 0xba, 0x84, 0x65, 0xf8, 0x19, 0x27, 0xc6}, {0x0, 0xf1, 0xff, 0xe, 0xe3, 0x12, 0x1c, 0xed, 0xdb, 0x2a, 0x24, 0xd5, 0x38, 0xc9, 0xc7, 0x36}, {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf}, {0x0, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff}, {0x0, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7, 0x15, 0x34, 0x57, 0x76, 0x91, 0xb0, 0xd3, 0xf2}, {0x0, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97, 0x95, 0xa4, 0xf7, 0xc6, 0x51, 0x60, 0x33, 0x2}, {0x0, 0x41, 0x82, 0xc3, 0x19, 0x58, 0x9b, 0xda, 0x32, 0x73, 0xb0, 0xf1, 0x2b, 0x6a, 0xa9, 0xe8}, {0x0, 0x51, 0xa2, 0xf3, 0x59, 0x8, 0xfb, 0xaa, 0xb2, 0xe3, 0x10, 0x41, 0xeb, 0xba, 0x49, 0x18}, {0x0, 0x61, 0xc2, 0xa3, 0x99, 0xf8, 0x5b, 0x3a, 0x2f, 0x4e, 0xed, 0x8c, 0xb6, 0xd7, 0x74, 0x15}, {0x0, 0x71, 0xe2, 0x93, 0xd9, 0xa8, 0x3b, 0x4a, 0xaf, 0xde, 0x4d, 0x3c, 0x76, 0x7, 0x94, 0xe5}, {0x0, 0xa6, 0x51, 0xf7, 0xa2, 0x4, 0xf3, 0x55, 0x59, 0xff, 0x8, 0xae, 0xfb, 0x5d, 0xaa, 0xc}, {0x0, 0xb6, 0x71, 0xc7, 0xe2, 0x54, 0x93, 0x25, 0xd9, 0x6f, 0xa8, 0x1e, 0x3b, 0x8d, 0x4a, 0xfc}, {0x0, 0x86, 0x11, 0x97, 0x22, 0xa4, 0x33, 0xb5, 0x44, 0xc2, 0x55, 0xd3, 0x66, 0xe0, 0x77, 0xf1}, {0x0, 0x96, 0x31, 0xa7, 0x62, 0xf4, 0x53, 0xc5, 0xc4, 0x52, 0xf5, 0x63, 0xa6, 0x30, 0x97, 0x1}, {0x0, 0xe6, 0xd1, 0x37, 0xbf, 0x59, 0x6e, 0x88, 0x63, 0x85, 0xb2, 0x54, 0xdc, 0x3a, 0xd, 0xeb}, {0x0, 0xf6, 0xf1, 0x7, 0xff, 0x9, 0xe, 0xf8, 0xe3, 0x15, 0x12, 0xe4, 0x1c, 0xea, 0xed, 0x1b}, {0x0, 0xc6, 0x91, 0x57, 0x3f, 0xf9, 0xae, 0x68, 0x7e, 0xb8, 0xef, 0x29, 0x41, 0x87, 0xd0, 0x16}, {0x0, 0xd6, 0xb1, 0x67, 0x7f, 0xa9, 0xce, 0x18, 0xfe, 0x28, 0x4f, 0x99, 0x81, 0x57, 0x30, 0xe6}, {0x0, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2, 0x2d, 0xb, 0x61, 0x47, 0xb5, 0x93, 0xf9, 0xdf}, {0x0, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82, 0xad, 0x9b, 0xc1, 0xf7, 0x75, 0x43, 0x19, 0x2f}, {0x0, 0x6, 0xc, 0xa, 0x18, 0x1e, 0x14, 0x12, 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22}, {0x0, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62, 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2}, {0x0, 0x66, 0xcc, 0xaa, 0x85, 0xe3, 0x49, 0x2f, 0x17, 0x71, 0xdb, 0xbd, 0x92, 0xf4, 0x5e, 0x38}, {0x0, 0x76, 0xec, 0x9a, 0xc5, 0xb3, 0x29, 0x5f, 0x97, 0xe1, 0x7b, 0xd, 0x52, 0x24, 0xbe, 0xc8}, {0x0, 0x46, 0x8c, 0xca, 0x5, 0x43, 0x89, 0xcf, 0xa, 0x4c, 0x86, 0xc0, 0xf, 0x49, 0x83, 0xc5}, {0x0, 0x56, 0xac, 0xfa, 0x45, 0x13, 0xe9, 0xbf, 0x8a, 0xdc, 0x26, 0x70, 0xcf, 0x99, 0x63, 0x35}, {0x0, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x6, 0xb1, 0xa, 0xda, 0x61, 0x67, 0xdc, 0xc, 0xb7}, {0x0, 0xab, 0x4b, 0xe0, 0x96, 0x3d, 0xdd, 0x76, 0x31, 0x9a, 0x7a, 0xd1, 0xa7, 0xc, 0xec, 0x47}, {0x0, 0x9b, 0x2b, 0xb0, 0x56, 0xcd, 0x7d, 0xe6, 0xac, 0x37, 0x87, 0x1c, 0xfa, 0x61, 0xd1, 0x4a}, {0x0, 0x8b, 0xb, 0x80, 0x16, 0x9d, 0x1d, 0x96, 0x2c, 0xa7, 0x27, 0xac, 0x3a, 0xb1, 0x31, 0xba}, {0x0, 0xfb, 0xeb, 0x10, 0xcb, 0x30, 0x20, 0xdb, 0x8b, 0x70, 0x60, 0x9b, 0x40, 0xbb, 0xab, 0x50}, {0x0, 0xeb, 0xcb, 0x20, 0x8b, 0x60, 0x40, 0xab, 0xb, 0xe0, 0xc0, 0x2b, 0x80, 0x6b, 0x4b, 0xa0}, {0x0, 0xdb, 0xab, 0x70, 0x4b, 0x90, 0xe0, 0x3b, 0x96, 0x4d, 0x3d, 0xe6, 0xdd, 0x6, 0x76, 0xad}, {0x0, 0xcb, 0x8b, 0x40, 0xb, 0xc0, 0x80, 0x4b, 0x16, 0xdd, 
0x9d, 0x56, 0x1d, 0xd6, 0x96, 0x5d}, {0x0, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1, 0xc5, 0xfe, 0xb3, 0x88, 0x29, 0x12, 0x5f, 0x64}, {0x0, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1, 0x45, 0x6e, 0x13, 0x38, 0xe9, 0xc2, 0xbf, 0x94}, {0x0, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99}, {0x0, 0xb, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69}, {0x0, 0x7b, 0xf6, 0x8d, 0xf1, 0x8a, 0x7, 0x7c, 0xff, 0x84, 0x9, 0x72, 0xe, 0x75, 0xf8, 0x83}, {0x0, 0x6b, 0xd6, 0xbd, 0xb1, 0xda, 0x67, 0xc, 0x7f, 0x14, 0xa9, 0xc2, 0xce, 0xa5, 0x18, 0x73}, {0x0, 0x5b, 0xb6, 0xed, 0x71, 0x2a, 0xc7, 0x9c, 0xe2, 0xb9, 0x54, 0xf, 0x93, 0xc8, 0x25, 0x7e}, {0x0, 0x4b, 0x96, 0xdd, 0x31, 0x7a, 0xa7, 0xec, 0x62, 0x29, 0xf4, 0xbf, 0x53, 0x18, 0xc5, 0x8e}}

// galMultiply multiplies two elements of the field.
// Uses a lookup table; ~40% faster than the log/exp version below.
func galMultiply(a, b byte) byte {
	return mulTable[a][b]
}

// Original function:
/*
// galMultiply multiplies two elements of the field.
func galMultiply(a, b byte) byte {
	if a == 0 || b == 0 {
		return 0
	}
	logA := int(logTable[a])
	logB := int(logTable[b])
	return expTable[logA+logB]
}
*/

// galDivide is the inverse of galMultiply.
func galDivide(a, b byte) byte {
	if a == 0 {
		return 0
	}
	if b == 0 {
		panic("Argument 'divisor' is 0")
	}
	logA := int(logTable[a])
	logB := int(logTable[b])
	logResult := logA - logB
	if logResult < 0 {
		logResult += 255
	}
	return expTable[logResult]
}

// galExp computes a**n.
//
// The result will be the same as multiplying a by itself n times.
func galExp(a byte, n int) byte {
	if n == 0 {
		return 1
	}
	if a == 0 {
		return 0
	}
	logA := logTable[a]
	logResult := int(logA) * n
	for logResult >= 255 {
		logResult -= 255
	}
	return expTable[logResult]
}

// genAvx2Matrix packs the lookup tables for each matrix coefficient into dst
// in the layout consumed by the AVX2 code-generation path: for each
// (input, output) pair, the 16-byte low-nibble table twice, then the
// 16-byte high-nibble table twice.
func genAvx2Matrix(matrixRows [][]byte, inputs, outputs int, dst []byte) []byte {
	if !avx2CodeGen {
		panic("codegen not enabled")
	}
	total := inputs * outputs

	// Duplicated in+out
	wantBytes := total * 32 * 2
	if cap(dst) < wantBytes {
		dst = make([]byte, wantBytes)
	} else {
		dst = dst[:wantBytes]
	}
	for i, row := range matrixRows[:outputs] {
		for j, idx := range row[:inputs] {
			dstIdx := (j*outputs + i) * 64
			dstPart := dst[dstIdx:]
			dstPart = dstPart[:64]
			lo := mulTableLow[idx][:]
			hi := mulTableHigh[idx][:]
			copy(dstPart[:16], lo)
			copy(dstPart[16:32], lo)
			copy(dstPart[32:48], hi)
			copy(dstPart[48:64], hi)
		}
	}
	return dst
}
reedsolomon-1.9.13/galoisAvx512_amd64.go000066400000000000000000000255561406411035300176520ustar00rootroot00000000000000//+build !noasm
//+build !appengine
//+build !gccgo

// Copyright 2015, Klaus Post, see LICENSE for details.
// Copyright 2019, Minio, Inc.
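// The AVX512 kernels in this file build on the nibble-split multiply tables
// defined above: because multiplication in GF(2^8) distributes over XOR, a
// product c*x can be assembled from two 16-entry lookups keyed on the low
// and high nibbles of x. A minimal pure-Go sketch of the identity that
// mulTableLow/mulTableHigh satisfy (illustrative only; mulViaNibbles is a
// hypothetical helper, not part of the package):
/*
func mulViaNibbles(c, x byte) byte {
	lo := mulTableLow[c][x&0xf] // c * (x & 0x0f)
	hi := mulTableHigh[c][x>>4] // c * (x & 0xf0)
	return lo ^ hi              // equals mulTable[c][x]
}
*/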
package reedsolomon import ( "sync" ) //go:noescape func _galMulAVX512Parallel81(in, out [][]byte, matrix *[matrixSize81]byte, addTo bool) //go:noescape func _galMulAVX512Parallel82(in, out [][]byte, matrix *[matrixSize82]byte, addTo bool) //go:noescape func _galMulAVX512Parallel84(in, out [][]byte, matrix *[matrixSize84]byte, addTo bool) const ( dimIn = 8 // Number of input rows processed simultaneously dimOut81 = 1 // Number of output rows processed simultaneously for x1 routine dimOut82 = 2 // Number of output rows processed simultaneously for x2 routine dimOut84 = 4 // Number of output rows processed simultaneously for x4 routine matrixSize81 = (16 + 16) * dimIn * dimOut81 // Dimension of slice of matrix coefficient passed into x1 routine matrixSize82 = (16 + 16) * dimIn * dimOut82 // Dimension of slice of matrix coefficient passed into x2 routine matrixSize84 = (16 + 16) * dimIn * dimOut84 // Dimension of slice of matrix coefficient passed into x4 routine ) // Construct block of matrix coefficients for single output row in parallel func setupMatrix81(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize81]byte) { offset := 0 for c := inputOffset; c < inputOffset+dimIn; c++ { for iRow := outputOffset; iRow < outputOffset+dimOut81; iRow++ { if c < len(matrixRows[iRow]) { coeff := matrixRows[iRow][c] copy(matrix[offset*32:], mulTableLow[coeff][:]) copy(matrix[offset*32+16:], mulTableHigh[coeff][:]) } else { // coefficients not used for this input shard (so null out) v := matrix[offset*32 : offset*32+32] for i := range v { v[i] = 0 } } offset += dimIn if offset >= dimIn*dimOut81 { offset -= dimIn*dimOut81 - 1 } } } } // Construct block of matrix coefficients for 2 output rows in parallel func setupMatrix82(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize82]byte) { offset := 0 for c := inputOffset; c < inputOffset+dimIn; c++ { for iRow := outputOffset; iRow < outputOffset+dimOut82; iRow++ { if c < len(matrixRows[iRow]) { coeff := matrixRows[iRow][c] copy(matrix[offset*32:], mulTableLow[coeff][:]) copy(matrix[offset*32+16:], mulTableHigh[coeff][:]) } else { // coefficients not used for this input shard (so null out) v := matrix[offset*32 : offset*32+32] for i := range v { v[i] = 0 } } offset += dimIn if offset >= dimIn*dimOut82 { offset -= dimIn*dimOut82 - 1 } } } } // Construct block of matrix coefficients for 4 output rows in parallel func setupMatrix84(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize84]byte) { offset := 0 for c := inputOffset; c < inputOffset+dimIn; c++ { for iRow := outputOffset; iRow < outputOffset+dimOut84; iRow++ { if c < len(matrixRows[iRow]) { coeff := matrixRows[iRow][c] copy(matrix[offset*32:], mulTableLow[coeff][:]) copy(matrix[offset*32+16:], mulTableHigh[coeff][:]) } else { // coefficients not used for this input shard (so null out) v := matrix[offset*32 : offset*32+32] for i := range v { v[i] = 0 } } offset += dimIn if offset >= dimIn*dimOut84 { offset -= dimIn*dimOut84 - 1 } } } } // Invoke AVX512 routine for single output row in parallel func galMulAVX512Parallel81(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix81 *[matrixSize81]byte) { done := stop - start if done <= 0 { return } inputEnd := inputOffset + dimIn if inputEnd > len(in) { inputEnd = len(in) } outputEnd := outputOffset + dimOut81 if outputEnd > len(out) { outputEnd = len(out) } // We know the max size, alloc temp array. 
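// The assembler routine reads the shard slice headers directly at fixed
// offsets, so stage the active rows in fixed-size arrays of subslices,
// restricted to the [start:stop] byte range this invocation covers.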
var inTmp [dimIn][]byte for i, v := range in[inputOffset:inputEnd] { inTmp[i] = v[start:stop] } var outTmp [dimOut81][]byte for i, v := range out[outputOffset:outputEnd] { outTmp[i] = v[start:stop] } addTo := inputOffset != 0 // Except for the first input column, add to previous results _galMulAVX512Parallel81(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix81, addTo) done = start + ((done >> 6) << 6) if done < stop { galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in) } } // Invoke AVX512 routine for 2 output rows in parallel func galMulAVX512Parallel82(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix82 *[matrixSize82]byte) { done := stop - start if done <= 0 { return } inputEnd := inputOffset + dimIn if inputEnd > len(in) { inputEnd = len(in) } outputEnd := outputOffset + dimOut82 if outputEnd > len(out) { outputEnd = len(out) } // We know the max size, alloc temp array. var inTmp [dimIn][]byte for i, v := range in[inputOffset:inputEnd] { inTmp[i] = v[start:stop] } var outTmp [dimOut82][]byte for i, v := range out[outputOffset:outputEnd] { outTmp[i] = v[start:stop] } addTo := inputOffset != 0 // Except for the first input column, add to previous results _galMulAVX512Parallel82(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix82, addTo) done = start + ((done >> 6) << 6) if done < stop { galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in) } } // Invoke AVX512 routine for 4 output rows in parallel func galMulAVX512Parallel84(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix84 *[matrixSize84]byte) { done := stop - start if done <= 0 { return } inputEnd := inputOffset + dimIn if inputEnd > len(in) { inputEnd = len(in) } outputEnd := outputOffset + dimOut84 if outputEnd > len(out) { outputEnd = len(out) } // We know the max size, alloc temp array. 
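// As in the x1 and x2 variants above, the kernel consumes whole 64-byte
// blocks; galMulAVX512LastInput below finishes any tail bytes with plain
// table lookups.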
var inTmp [dimIn][]byte for i, v := range in[inputOffset:inputEnd] { inTmp[i] = v[start:stop] } var outTmp [dimOut84][]byte for i, v := range out[outputOffset:outputEnd] { outTmp[i] = v[start:stop] } addTo := inputOffset != 0 // Except for the first input column, add to previous results _galMulAVX512Parallel84(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix84, addTo) done = start + ((done >> 6) << 6) if done < stop { galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in) } } func galMulAVX512LastInput(inputOffset int, inputEnd int, outputOffset int, outputEnd int, matrixRows [][]byte, done int, stop int, out [][]byte, in [][]byte) { for c := inputOffset; c < inputEnd; c++ { for iRow := outputOffset; iRow < outputEnd; iRow++ { if c < len(matrixRows[iRow]) { mt := mulTable[matrixRows[iRow][c]][:256] for i := done; i < stop; i++ { if c == 0 { // only set value for first input column out[iRow][i] = mt[in[c][i]] } else { // and add for all others out[iRow][i] ^= mt[in[c][i]] } } } } } } // Perform the same as codeSomeShards, but taking advantage of // AVX512 parallelism for up to 4x faster execution as compared to AVX2 func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { // Process using no goroutines start, end := 0, r.o.perRound if end > byteCount { end = byteCount } for start < byteCount { matrix84 := [matrixSize84]byte{} matrix82 := [matrixSize82]byte{} matrix81 := [matrixSize81]byte{} outputRow := 0 // First process (multiple) batches of 4 output rows in parallel if outputRow+dimOut84 <= outputCount { for ; outputRow+dimOut84 <= outputCount; outputRow += dimOut84 { for inputRow := 0; inputRow < len(inputs); inputRow += dimIn { setupMatrix84(matrixRows, inputRow, outputRow, &matrix84) galMulAVX512Parallel84(inputs, outputs, matrixRows, inputRow, outputRow, start, end, &matrix84) } } } // Then process a (single) batch of 2 output rows in parallel if outputRow+dimOut82 <= outputCount { for inputRow := 0; inputRow < len(inputs); inputRow += dimIn { setupMatrix82(matrixRows, inputRow, outputRow, &matrix82) galMulAVX512Parallel82(inputs, outputs, matrixRows, inputRow, outputRow, start, end, &matrix82) } outputRow += dimOut82 } // Lastly, we may have a single output row left (for uneven parity) if outputRow < outputCount { for inputRow := 0; inputRow < len(inputs); inputRow += dimIn { setupMatrix81(matrixRows, inputRow, outputRow, &matrix81) galMulAVX512Parallel81(inputs, outputs, matrixRows, inputRow, outputRow, start, end, &matrix81) } } start = end end += r.o.perRound if end > byteCount { end = byteCount } } } // Perform the same as codeSomeShards, but taking advantage of // AVX512 parallelism for up to 4x faster execution as compared to AVX2 func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { var wg sync.WaitGroup do := byteCount / r.o.maxGoroutines if do < r.o.minSplitSize { do = r.o.minSplitSize } // Make sizes divisible by 64 do = (do + 63) & (^63) start := 0 for start < byteCount { if start+do > byteCount { do = byteCount - start } wg.Add(1) go func(grStart, grStop int) { start, stop := grStart, grStart+r.o.perRound if stop > grStop { stop = grStop } // Loop for each round. 
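// Each worker owns its coefficient buffers (refilled by the setupMatrix81/82/84
// calls below), so no state is shared between goroutines.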
matrix84 := [matrixSize84]byte{} matrix82 := [matrixSize82]byte{} matrix81 := [matrixSize81]byte{} for start < grStop { outputRow := 0 // First process (multiple) batches of 4 output rows in parallel if outputRow+dimOut84 <= outputCount { // 1K matrix buffer for ; outputRow+dimOut84 <= outputCount; outputRow += dimOut84 { for inputRow := 0; inputRow < len(inputs); inputRow += dimIn { setupMatrix84(matrixRows, inputRow, outputRow, &matrix84) galMulAVX512Parallel84(inputs, outputs, matrixRows, inputRow, outputRow, start, stop, &matrix84) } } } // Then process a (single) batch of 2 output rows in parallel if outputRow+dimOut82 <= outputCount { // 512B matrix buffer for inputRow := 0; inputRow < len(inputs); inputRow += dimIn { setupMatrix82(matrixRows, inputRow, outputRow, &matrix82) galMulAVX512Parallel82(inputs, outputs, matrixRows, inputRow, outputRow, start, stop, &matrix82) } outputRow += dimOut82 } // Lastly, we may have a single output row left (for uneven parity) if outputRow < outputCount { for inputRow := 0; inputRow < len(inputs); inputRow += dimIn { setupMatrix81(matrixRows, inputRow, outputRow, &matrix81) galMulAVX512Parallel81(inputs, outputs, matrixRows, inputRow, outputRow, start, stop, &matrix81) } } start = stop stop += r.o.perRound if stop > grStop { stop = grStop } } wg.Done() }(start, start+do) start += do } wg.Wait() } reedsolomon-1.9.13/galoisAvx512_amd64.s000066400000000000000000000233351406411035300175020ustar00rootroot00000000000000//+build !noasm //+build !appengine //+build !gccgo // Copyright 2015, Klaus Post, see LICENSE for details. // Copyright 2019, Minio, Inc. #define LOAD(OFFSET) \ MOVQ OFFSET(SI), BX \ VMOVDQU64 (BX)(R11*1), Z0 \ VPSRLQ $4, Z0, Z1 \ // high input VPANDQ Z2, Z0, Z0 \ // low input VPANDQ Z2, Z1, Z1 // high input #define GALOIS_MUL(MUL_LO, MUL_HI, LO, HI, OUT) \ VPSHUFB Z0, MUL_LO, LO \ // mul low part VPSHUFB Z1, MUL_HI, HI \ // mul high part VPTERNLOGD $0x96, LO, HI, OUT #define GALOIS(C1, C2, IN, LO, HI, OUT) \ VSHUFI64X2 $C1, IN, IN, LO \ VSHUFI64X2 $C2, IN, IN, HI \ GALOIS_MUL(LO, HI, LO, HI, OUT) // // Process single output row from a total of 8 input rows // // func _galMulAVX512Parallel81(in, out [][]byte, matrix *[matrixSize81]byte, addTo bool) TEXT ·_galMulAVX512Parallel81(SB), 7, $0 MOVQ in+0(FP), SI MOVQ 8(SI), R9 // R9: len(in) SHRQ $6, R9 // len(in) / 64 TESTQ R9, R9 JZ done_avx512_parallel81 MOVQ matrix+48(FP), SI VMOVDQU64 0x000(SI), Z16 VMOVDQU64 0x040(SI), Z17 VMOVDQU64 0x080(SI), Z18 VMOVDQU64 0x0c0(SI), Z19 // Initialize multiplication constants VSHUFI64X2 $0x55, Z16, Z16, Z20 VSHUFI64X2 $0xaa, Z16, Z16, Z24 VSHUFI64X2 $0xff, Z16, Z16, Z28 VSHUFI64X2 $0x00, Z16, Z16, Z16 VSHUFI64X2 $0x55, Z17, Z17, Z21 VSHUFI64X2 $0xaa, Z17, Z17, Z25 VSHUFI64X2 $0xff, Z17, Z17, Z29 VSHUFI64X2 $0x00, Z17, Z17, Z17 VSHUFI64X2 $0x55, Z18, Z18, Z22 VSHUFI64X2 $0xaa, Z18, Z18, Z26 VSHUFI64X2 $0xff, Z18, Z18, Z30 VSHUFI64X2 $0x00, Z18, Z18, Z18 VSHUFI64X2 $0x55, Z19, Z19, Z23 VSHUFI64X2 $0xaa, Z19, Z19, Z27 VSHUFI64X2 $0xff, Z19, Z19, Z31 VSHUFI64X2 $0x00, Z19, Z19, Z19 MOVQ $15, BX VPBROADCASTB BX, Z2 MOVB addTo+56(FP), AX IMULQ $-0x1, AX KMOVQ AX, K1 MOVQ in+0(FP), SI // SI: &in MOVQ in_len+8(FP), AX // number of inputs XORQ R11, R11 MOVQ out+24(FP), DX MOVQ (DX), DX // DX: &out[0][0] loopback_avx512_parallel81: VMOVDQU64.Z (DX), K1, Z4 LOAD(0x00) // &in[0][0] GALOIS_MUL(Z16, Z20, Z14, Z15, Z4) CMPQ AX, $1 JE skip_avx512_parallel81 LOAD(0x18) // &in[1][0] GALOIS_MUL(Z24, Z28, Z14, Z15, Z4) CMPQ AX, $2 JE skip_avx512_parallel81 LOAD(0x30) // 
&in[2][0] GALOIS_MUL(Z17, Z21, Z14, Z15, Z4) CMPQ AX, $3 JE skip_avx512_parallel81 LOAD(0x48) // &in[3][0] GALOIS_MUL(Z25, Z29, Z14, Z15, Z4) CMPQ AX, $4 JE skip_avx512_parallel81 LOAD(0x60) // &in[4][0] GALOIS_MUL(Z18, Z22, Z14, Z15, Z4) CMPQ AX, $5 JE skip_avx512_parallel81 LOAD(0x78) // &in[5][0] GALOIS_MUL(Z26, Z30, Z14, Z15, Z4) CMPQ AX, $6 JE skip_avx512_parallel81 LOAD(0x90) // &in[6][0] GALOIS_MUL(Z19, Z23, Z14, Z15, Z4) CMPQ AX, $7 JE skip_avx512_parallel81 LOAD(0xa8) // &in[7][0] GALOIS_MUL(Z27, Z31, Z14, Z15, Z4) skip_avx512_parallel81: VMOVDQU64 Z4, (DX) ADDQ $64, R11 // in4+=64 ADDQ $64, DX // out+=64 SUBQ $1, R9 JNZ loopback_avx512_parallel81 done_avx512_parallel81: VZEROUPPER RET // // Process 2 output rows in parallel from a total of 8 input rows // // func _galMulAVX512Parallel82(in, out [][]byte, matrix *[matrixSize82]byte, addTo bool) TEXT ·_galMulAVX512Parallel82(SB), 7, $0 MOVQ in+0(FP), SI MOVQ 8(SI), R9 // R9: len(in) SHRQ $6, R9 // len(in) / 64 TESTQ R9, R9 JZ done_avx512_parallel82 MOVQ matrix+48(FP), SI VMOVDQU64 0x000(SI), Z16 VMOVDQU64 0x040(SI), Z17 VMOVDQU64 0x080(SI), Z18 VMOVDQU64 0x0c0(SI), Z19 VMOVDQU64 0x100(SI), Z20 VMOVDQU64 0x140(SI), Z21 VMOVDQU64 0x180(SI), Z22 VMOVDQU64 0x1c0(SI), Z23 // Initialize multiplication constants VSHUFI64X2 $0x55, Z16, Z16, Z24 VSHUFI64X2 $0xaa, Z16, Z16, Z25 VSHUFI64X2 $0xff, Z16, Z16, Z26 VSHUFI64X2 $0x00, Z16, Z16, Z16 VSHUFI64X2 $0x55, Z20, Z20, Z27 VSHUFI64X2 $0xaa, Z20, Z20, Z28 VSHUFI64X2 $0xff, Z20, Z20, Z29 VSHUFI64X2 $0x00, Z20, Z20, Z20 VSHUFI64X2 $0x55, Z17, Z17, Z30 VSHUFI64X2 $0xaa, Z17, Z17, Z31 VSHUFI64X2 $0xff, Z17, Z17, Z11 VSHUFI64X2 $0x00, Z17, Z17, Z17 VSHUFI64X2 $0x55, Z21, Z21, Z8 VSHUFI64X2 $0xaa, Z21, Z21, Z9 VSHUFI64X2 $0xff, Z21, Z21, Z10 VSHUFI64X2 $0x00, Z21, Z21, Z21 MOVQ $15, BX VPBROADCASTB BX, Z2 MOVB addTo+56(FP), AX IMULQ $-0x1, AX KMOVQ AX, K1 MOVQ in+0(FP), SI // SI: &in MOVQ in_len+8(FP), AX // number of inputs XORQ R11, R11 MOVQ out+24(FP), DX MOVQ 24(DX), CX // CX: &out[1][0] MOVQ (DX), DX // DX: &out[0][0] loopback_avx512_parallel82: VMOVDQU64.Z (DX), K1, Z4 VMOVDQU64.Z (CX), K1, Z5 LOAD(0x00) // &in[0][0] GALOIS_MUL(Z16, Z24, Z14, Z15, Z4) GALOIS_MUL(Z20, Z27, Z12, Z13, Z5) CMPQ AX, $1 JE skip_avx512_parallel82 LOAD(0x18) // &in[1][0] GALOIS_MUL(Z25, Z26, Z14, Z15, Z4) GALOIS_MUL(Z28, Z29, Z12, Z13, Z5) CMPQ AX, $2 JE skip_avx512_parallel82 LOAD(0x30) // &in[2][0] GALOIS_MUL(Z17, Z30, Z14, Z15, Z4) GALOIS_MUL(Z21, Z8, Z12, Z13, Z5) CMPQ AX, $3 JE skip_avx512_parallel82 LOAD(0x48) // &in[3][0] GALOIS_MUL(Z31, Z11, Z14, Z15, Z4) GALOIS_MUL(Z9, Z10, Z12, Z13, Z5) CMPQ AX, $4 JE skip_avx512_parallel82 LOAD(0x60) // &in[4][0] GALOIS(0x00, 0x55, Z18, Z14, Z15, Z4) GALOIS(0x00, 0x55, Z22, Z12, Z13, Z5) CMPQ AX, $5 JE skip_avx512_parallel82 LOAD(0x78) // &in[5][0] GALOIS(0xaa, 0xff, Z18, Z14, Z15, Z4) GALOIS(0xaa, 0xff, Z22, Z12, Z13, Z5) CMPQ AX, $6 JE skip_avx512_parallel82 LOAD(0x90) // &in[6][0] GALOIS(0x00, 0x55, Z19, Z14, Z15, Z4) GALOIS(0x00, 0x55, Z23, Z12, Z13, Z5) CMPQ AX, $7 JE skip_avx512_parallel82 LOAD(0xa8) // &in[7][0] GALOIS(0xaa, 0xff, Z19, Z14, Z15, Z4) GALOIS(0xaa, 0xff, Z23, Z12, Z13, Z5) skip_avx512_parallel82: VMOVDQU64 Z4, (DX) VMOVDQU64 Z5, (CX) ADDQ $64, R11 // in4+=64 ADDQ $64, DX // out+=64 ADDQ $64, CX // out2+=64 SUBQ $1, R9 JNZ loopback_avx512_parallel82 done_avx512_parallel82: VZEROUPPER RET // // Process 4 output rows in parallel from a total of 8 input rows // // func _galMulAVX512Parallel84(in, out [][]byte, matrix *[matrixSize84]byte, addTo bool) TEXT 
·_galMulAVX512Parallel84(SB), 7, $0 MOVQ in+0(FP), SI MOVQ 8(SI), R9 // R9: len(in) SHRQ $6, R9 // len(in) / 64 TESTQ R9, R9 JZ done_avx512_parallel84 MOVQ matrix+48(FP), SI VMOVDQU64 0x000(SI), Z16 VMOVDQU64 0x040(SI), Z17 VMOVDQU64 0x080(SI), Z18 VMOVDQU64 0x0c0(SI), Z19 VMOVDQU64 0x100(SI), Z20 VMOVDQU64 0x140(SI), Z21 VMOVDQU64 0x180(SI), Z22 VMOVDQU64 0x1c0(SI), Z23 VMOVDQU64 0x200(SI), Z24 VMOVDQU64 0x240(SI), Z25 VMOVDQU64 0x280(SI), Z26 VMOVDQU64 0x2c0(SI), Z27 VMOVDQU64 0x300(SI), Z28 VMOVDQU64 0x340(SI), Z29 VMOVDQU64 0x380(SI), Z30 VMOVDQU64 0x3c0(SI), Z31 MOVQ $15, BX VPBROADCASTB BX, Z2 MOVB addTo+56(FP), AX IMULQ $-0x1, AX KMOVQ AX, K1 MOVQ in+0(FP), SI // SI: &in MOVQ in_len+8(FP), AX // number of inputs XORQ R11, R11 MOVQ out+24(FP), DX MOVQ 24(DX), CX // CX: &out[1][0] MOVQ 48(DX), R10 // R10: &out[2][0] MOVQ 72(DX), R12 // R12: &out[3][0] MOVQ (DX), DX // DX: &out[0][0] loopback_avx512_parallel84: VMOVDQU64.Z (DX), K1, Z4 VMOVDQU64.Z (CX), K1, Z5 VMOVDQU64.Z (R10), K1, Z6 VMOVDQU64.Z (R12), K1, Z7 LOAD(0x00) // &in[0][0] GALOIS(0x00, 0x55, Z16, Z14, Z15, Z4) GALOIS(0x00, 0x55, Z20, Z12, Z13, Z5) GALOIS(0x00, 0x55, Z24, Z10, Z11, Z6) GALOIS(0x00, 0x55, Z28, Z8, Z9, Z7) CMPQ AX, $1 JE skip_avx512_parallel84 LOAD(0x18) // &in[1][0] GALOIS(0xaa, 0xff, Z16, Z14, Z15, Z4) GALOIS(0xaa, 0xff, Z20, Z12, Z13, Z5) GALOIS(0xaa, 0xff, Z24, Z10, Z11, Z6) GALOIS(0xaa, 0xff, Z28, Z8, Z9, Z7) CMPQ AX, $2 JE skip_avx512_parallel84 LOAD(0x30) // &in[2][0] GALOIS(0x00, 0x55, Z17, Z14, Z15, Z4) GALOIS(0x00, 0x55, Z21, Z12, Z13, Z5) GALOIS(0x00, 0x55, Z25, Z10, Z11, Z6) GALOIS(0x00, 0x55, Z29, Z8, Z9, Z7) CMPQ AX, $3 JE skip_avx512_parallel84 LOAD(0x48) // &in[3][0] GALOIS(0xaa, 0xff, Z17, Z14, Z15, Z4) GALOIS(0xaa, 0xff, Z21, Z12, Z13, Z5) GALOIS(0xaa, 0xff, Z25, Z10, Z11, Z6) GALOIS(0xaa, 0xff, Z29, Z8, Z9, Z7) CMPQ AX, $4 JE skip_avx512_parallel84 LOAD(0x60) // &in[4][0] GALOIS(0x00, 0x55, Z18, Z14, Z15, Z4) GALOIS(0x00, 0x55, Z22, Z12, Z13, Z5) GALOIS(0x00, 0x55, Z26, Z10, Z11, Z6) GALOIS(0x00, 0x55, Z30, Z8, Z9, Z7) CMPQ AX, $5 JE skip_avx512_parallel84 LOAD(0x78) // &in[5][0] GALOIS(0xaa, 0xff, Z18, Z14, Z15, Z4) GALOIS(0xaa, 0xff, Z22, Z12, Z13, Z5) GALOIS(0xaa, 0xff, Z26, Z10, Z11, Z6) GALOIS(0xaa, 0xff, Z30, Z8, Z9, Z7) CMPQ AX, $6 JE skip_avx512_parallel84 LOAD(0x90) // &in[6][0] GALOIS(0x00, 0x55, Z19, Z14, Z15, Z4) GALOIS(0x00, 0x55, Z23, Z12, Z13, Z5) GALOIS(0x00, 0x55, Z27, Z10, Z11, Z6) GALOIS(0x00, 0x55, Z31, Z8, Z9, Z7) CMPQ AX, $7 JE skip_avx512_parallel84 LOAD(0xa8) // &in[7][0] GALOIS(0xaa, 0xff, Z19, Z14, Z15, Z4) GALOIS(0xaa, 0xff, Z23, Z12, Z13, Z5) GALOIS(0xaa, 0xff, Z27, Z10, Z11, Z6) GALOIS(0xaa, 0xff, Z31, Z8, Z9, Z7) skip_avx512_parallel84: VMOVDQU64 Z4, (DX) VMOVDQU64 Z5, (CX) VMOVDQU64 Z6, (R10) VMOVDQU64 Z7, (R12) ADDQ $64, R11 // in4+=64 ADDQ $64, DX // out+=64 ADDQ $64, CX // out2+=64 ADDQ $64, R10 // out3+=64 ADDQ $64, R12 // out4+=64 SUBQ $1, R9 JNZ loopback_avx512_parallel84 done_avx512_parallel84: VZEROUPPER RET reedsolomon-1.9.13/galoisAvx512_amd64_test.go000066400000000000000000000307441406411035300207060ustar00rootroot00000000000000//+build !noasm //+build !appengine //+build !gccgo // Copyright 2015, Klaus Post, see LICENSE for details. // Copyright 2019, Minio, Inc. 
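// The tests below cross-check the AVX512 kernels against the generic
// galMulSlice/galMulSliceXor paths: each output is computed once with the
// assembler routine and once with the scalar reference, then compared
// byte-for-byte, both for a fresh run (addTo=false) and for a second,
// accumulating run (addTo=true).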
package reedsolomon import ( "bytes" "math/rand" "testing" "time" ) func testGaloisAvx512Parallelx1(t *testing.T, inputSize int) { if !defaultOptions.useAVX512 { t.Skip("AVX512 not detected") } rand.Seed(time.Now().UnixNano()) var size = 1024 * 1024 if testing.Short() { size = 4096 } in, out := make([][]byte, inputSize), make([][]byte, dimOut81) for i := range in { in[i] = make([]byte, size) rand.Read(in[i]) } for i := range out { out[i] = make([]byte, size) rand.Read(out[i]) } opts := defaultOptions opts.useSSSE3 = true matrix := [(16 + 16) * dimIn * dimOut81]byte{} coeffs := make([]byte, dimIn*len(out)) for i := 0; i < dimIn*len(out); i++ { coeffs[i] = byte(rand.Int31n(256)) copy(matrix[i*32:], mulTableLow[coeffs[i]][:]) copy(matrix[i*32+16:], mulTableHigh[coeffs[i]][:]) } // Do first run with clearing out any existing results _galMulAVX512Parallel81(in, out, &matrix, false) expect := make([][]byte, len(out)) for i := range expect { expect[i] = make([]byte, size) rand.Read(expect[i]) } for i := range in { if i == 0 { galMulSlice(coeffs[i], in[i], expect[0], &options{}) } else { galMulSliceXor(coeffs[i], in[i], expect[0], &options{}) } } for i := range out { if 0 != bytes.Compare(out[i], expect[i]) { t.Errorf("got [%d]%#v...,\n expected [%d]%#v...", i, out[i][:8], i, expect[i][:8]) } } inToAdd := make([][]byte, len(in)) for i := range inToAdd { inToAdd[i] = make([]byte, size) rand.Read(inToAdd[i]) } for i := 0; i < dimIn*len(out); i++ { coeffs[i] = byte(rand.Int31n(256)) copy(matrix[i*32:], mulTableLow[coeffs[i]][:]) copy(matrix[i*32+16:], mulTableHigh[coeffs[i]][:]) } // Do second run by adding to original run _galMulAVX512Parallel81(inToAdd, out, &matrix, true) for i := range in { galMulSliceXor(coeffs[i], inToAdd[i], expect[0], &options{}) } for i := range out { if 0 != bytes.Compare(out[i], expect[i]) { t.Errorf("got [%d]%#v...,\n expected [%d]%#v...", i, out[i][:8], i, expect[i][:8]) } } } func TestGaloisAvx512Parallel11(t *testing.T) { testGaloisAvx512Parallelx1(t, 1) } func TestGaloisAvx512Parallel21(t *testing.T) { testGaloisAvx512Parallelx1(t, 2) } func TestGaloisAvx512Parallel31(t *testing.T) { testGaloisAvx512Parallelx1(t, 3) } func TestGaloisAvx512Parallel41(t *testing.T) { testGaloisAvx512Parallelx1(t, 4) } func TestGaloisAvx512Parallel51(t *testing.T) { testGaloisAvx512Parallelx1(t, 5) } func TestGaloisAvx512Parallel61(t *testing.T) { testGaloisAvx512Parallelx1(t, 6) } func TestGaloisAvx512Parallel71(t *testing.T) { testGaloisAvx512Parallelx1(t, 7) } func TestGaloisAvx512Parallel81(t *testing.T) { testGaloisAvx512Parallelx1(t, 8) } func testGaloisAvx512Parallelx2(t *testing.T, inputSize int) { if !defaultOptions.useAVX512 { t.Skip("AVX512 not detected") } rand.Seed(time.Now().UnixNano()) var size = 1024 * 1024 if testing.Short() { size = 4096 } in, out := make([][]byte, inputSize), make([][]byte, dimOut82) for i := range in { in[i] = make([]byte, size) rand.Read(in[i]) } for i := range out { out[i] = make([]byte, size) rand.Read(out[i]) } opts := defaultOptions opts.useSSSE3 = true matrix := [(16 + 16) * dimIn * dimOut82]byte{} coeffs := make([]byte, dimIn*len(out)) for i := 0; i < dimIn*len(out); i++ { coeffs[i] = byte(rand.Int31n(256)) copy(matrix[i*32:], mulTableLow[coeffs[i]][:]) copy(matrix[i*32+16:], mulTableHigh[coeffs[i]][:]) } // Do first run with clearing out any existing results _galMulAVX512Parallel82(in, out, &matrix, false) expect := make([][]byte, len(out)) for i := range expect { expect[i] = make([]byte, size) rand.Read(expect[i]) } for i := range in { if i 
== 0 { galMulSlice(coeffs[i], in[i], expect[0], &options{}) galMulSlice(coeffs[dimIn+i], in[i], expect[1], &options{}) } else { galMulSliceXor(coeffs[i], in[i], expect[0], &options{}) galMulSliceXor(coeffs[dimIn+i], in[i], expect[1], &options{}) } } for i := range out { if 0 != bytes.Compare(out[i], expect[i]) { t.Errorf("got [%d]%#v...,\n expected [%d]%#v...", i, out[i][:8], i, expect[i][:8]) } } inToAdd := make([][]byte, len(in)) for i := range inToAdd { inToAdd[i] = make([]byte, size) rand.Read(inToAdd[i]) } for i := 0; i < dimIn*len(out); i++ { coeffs[i] = byte(rand.Int31n(256)) copy(matrix[i*32:], mulTableLow[coeffs[i]][:]) copy(matrix[i*32+16:], mulTableHigh[coeffs[i]][:]) } // Do second run by adding to original run _galMulAVX512Parallel82(inToAdd, out, &matrix, true) for i := range in { galMulSliceXor(coeffs[i], inToAdd[i], expect[0], &options{}) galMulSliceXor(coeffs[dimIn+i], inToAdd[i], expect[1], &options{}) } for i := range out { if 0 != bytes.Compare(out[i], expect[i]) { t.Errorf("got [%d]%#v...,\n expected [%d]%#v...", i, out[i][:8], i, expect[i][:8]) } } } func TestGaloisAvx512Parallel12(t *testing.T) { testGaloisAvx512Parallelx2(t, 1) } func TestGaloisAvx512Parallel22(t *testing.T) { testGaloisAvx512Parallelx2(t, 2) } func TestGaloisAvx512Parallel32(t *testing.T) { testGaloisAvx512Parallelx2(t, 3) } func TestGaloisAvx512Parallel42(t *testing.T) { testGaloisAvx512Parallelx2(t, 4) } func TestGaloisAvx512Parallel52(t *testing.T) { testGaloisAvx512Parallelx2(t, 5) } func TestGaloisAvx512Parallel62(t *testing.T) { testGaloisAvx512Parallelx2(t, 6) } func TestGaloisAvx512Parallel72(t *testing.T) { testGaloisAvx512Parallelx2(t, 7) } func TestGaloisAvx512Parallel82(t *testing.T) { testGaloisAvx512Parallelx2(t, 8) } func testGaloisAvx512Parallelx4(t *testing.T, inputSize int) { if !defaultOptions.useAVX512 { t.Skip("AVX512 not detected") } rand.Seed(time.Now().UnixNano()) var size = 1 << 20 if testing.Short() { size = 4096 } in, out := make([][]byte, inputSize), make([][]byte, dimOut84) for i := range in { in[i] = make([]byte, size) rand.Read(in[i]) } for i := range out { out[i] = make([]byte, size) rand.Read(out[i]) } opts := defaultOptions opts.useSSSE3 = true matrix := [(16 + 16) * dimIn * dimOut84]byte{} coeffs := make([]byte, dimIn*len(out)) for i := 0; i < dimIn*len(out); i++ { coeffs[i] = byte(rand.Int31n(256)) copy(matrix[i*32:], mulTableLow[coeffs[i]][:]) copy(matrix[i*32+16:], mulTableHigh[coeffs[i]][:]) } // Do first run with clearing out any existing results _galMulAVX512Parallel84(in, out, &matrix, false) expect := make([][]byte, 4) for i := range expect { expect[i] = make([]byte, size) rand.Read(expect[i]) } for i := range in { if i == 0 { galMulSlice(coeffs[i], in[i], expect[0], &options{}) galMulSlice(coeffs[dimIn+i], in[i], expect[1], &options{}) galMulSlice(coeffs[dimIn*2+i], in[i], expect[2], &options{}) galMulSlice(coeffs[dimIn*3+i], in[i], expect[3], &options{}) } else { galMulSliceXor(coeffs[i], in[i], expect[0], &options{}) galMulSliceXor(coeffs[dimIn+i], in[i], expect[1], &options{}) galMulSliceXor(coeffs[dimIn*2+i], in[i], expect[2], &options{}) galMulSliceXor(coeffs[dimIn*3+i], in[i], expect[3], &options{}) } } for i := range out { if 0 != bytes.Compare(out[i], expect[i]) { t.Errorf("got [%d]%#v...,\n expected [%d]%#v...", i, out[i][:8], i, expect[i][:8]) } } inToAdd := make([][]byte, len(in)) for i := range inToAdd { inToAdd[i] = make([]byte, size) rand.Read(inToAdd[i]) } for i := 0; i < dimIn*len(out); i++ { coeffs[i] = byte(rand.Int31n(256)) 
copy(matrix[i*32:], mulTableLow[coeffs[i]][:]) copy(matrix[i*32+16:], mulTableHigh[coeffs[i]][:]) } // Do second run by adding to original run _galMulAVX512Parallel84(inToAdd, out, &matrix, true) for i := range in { galMulSliceXor(coeffs[i], inToAdd[i], expect[0], &options{}) galMulSliceXor(coeffs[dimIn+i], inToAdd[i], expect[1], &options{}) galMulSliceXor(coeffs[dimIn*2+i], inToAdd[i], expect[2], &options{}) galMulSliceXor(coeffs[dimIn*3+i], inToAdd[i], expect[3], &options{}) } for i := range out { if 0 != bytes.Compare(out[i], expect[i]) { t.Errorf("got [%d]%#v...,\n expected [%d]%#v...", i, out[i][:8], i, expect[i][:8]) } } } func TestGaloisAvx512Parallel14(t *testing.T) { testGaloisAvx512Parallelx4(t, 1) } func TestGaloisAvx512Parallel24(t *testing.T) { testGaloisAvx512Parallelx4(t, 2) } func TestGaloisAvx512Parallel34(t *testing.T) { testGaloisAvx512Parallelx4(t, 3) } func TestGaloisAvx512Parallel44(t *testing.T) { testGaloisAvx512Parallelx4(t, 4) } func TestGaloisAvx512Parallel54(t *testing.T) { testGaloisAvx512Parallelx4(t, 5) } func TestGaloisAvx512Parallel64(t *testing.T) { testGaloisAvx512Parallelx4(t, 6) } func TestGaloisAvx512Parallel74(t *testing.T) { testGaloisAvx512Parallelx4(t, 7) } func TestGaloisAvx512Parallel84(t *testing.T) { testGaloisAvx512Parallelx4(t, 8) } func testCodeSomeShardsAvx512WithLength(t *testing.T, ds, ps, l int, parallel bool) { if !defaultOptions.useAVX512 { t.Skip("AVX512 not detected") } var data = make([]byte, l) fillRandom(data) enc, _ := New(ds, ps) r := enc.(*reedSolomon) // need to access private methods shards, _ := enc.Split(data) // Fill shards to encode with garbage for i := r.DataShards; i < r.DataShards+r.ParityShards; i++ { rand.Read(shards[i]) } if parallel { r.codeSomeShardsAvx512P(r.parity, shards[:r.DataShards], shards[r.DataShards:], r.ParityShards, len(shards[0])) } else { r.codeSomeShardsAvx512(r.parity, shards[:r.DataShards], shards[r.DataShards:], r.ParityShards, len(shards[0])) } correct, _ := r.Verify(shards) if !correct { t.Errorf("Verification of encoded shards failed") } } func testCodeSomeShardsAvx512(t *testing.T, ds, ps int) { if !defaultOptions.useAVX512 { t.Skip("AVX512 not detected") } step := 1 if testing.Short() { // A prime for variation step += 29 } for l := 1; l <= 8192; l += step { testCodeSomeShardsAvx512WithLength(t, ds, ps, l, false) testCodeSomeShardsAvx512WithLength(t, ds, ps, l, true) } } func TestCodeSomeShardsAvx512_8x2(t *testing.T) { testCodeSomeShardsAvx512(t, 8, 2) } func TestCodeSomeShardsAvx512_1x4(t *testing.T) { testCodeSomeShardsAvx512(t, 1, 4) } func TestCodeSomeShardsAvx512_2x4(t *testing.T) { testCodeSomeShardsAvx512(t, 2, 4) } func TestCodeSomeShardsAvx512_3x4(t *testing.T) { testCodeSomeShardsAvx512(t, 3, 4) } func TestCodeSomeShardsAvx512_4x4(t *testing.T) { testCodeSomeShardsAvx512(t, 4, 4) } func TestCodeSomeShardsAvx512_5x4(t *testing.T) { testCodeSomeShardsAvx512(t, 5, 4) } func TestCodeSomeShardsAvx512_6x4(t *testing.T) { testCodeSomeShardsAvx512(t, 6, 4) } func TestCodeSomeShardsAvx512_7x4(t *testing.T) { testCodeSomeShardsAvx512(t, 7, 4) } func TestCodeSomeShardsAvx512_8x4(t *testing.T) { testCodeSomeShardsAvx512(t, 8, 4) } func TestCodeSomeShardsAvx512_9x4(t *testing.T) { testCodeSomeShardsAvx512(t, 9, 4) } func TestCodeSomeShardsAvx512_10x4(t *testing.T) { testCodeSomeShardsAvx512(t, 10, 4) } func TestCodeSomeShardsAvx512_12x4(t *testing.T) { testCodeSomeShardsAvx512(t, 12, 4) } func TestCodeSomeShardsAvx512_16x4(t *testing.T) { testCodeSomeShardsAvx512(t, 16, 4) } func 
TestCodeSomeShardsAvx512_3x6(t *testing.T) { testCodeSomeShardsAvx512(t, 3, 6) } func TestCodeSomeShardsAvx512_8x6(t *testing.T) { testCodeSomeShardsAvx512(t, 8, 6) } func TestCodeSomeShardsAvx512_8x7(t *testing.T) { testCodeSomeShardsAvx512(t, 8, 7) } func TestCodeSomeShardsAvx512_3x8(t *testing.T) { testCodeSomeShardsAvx512(t, 3, 8) } func TestCodeSomeShardsAvx512_8x8(t *testing.T) { testCodeSomeShardsAvx512(t, 8, 8) } func TestCodeSomeShardsAvx512_5x10(t *testing.T) { testCodeSomeShardsAvx512(t, 5, 10) } func TestCodeSomeShardsAvx512_8x10(t *testing.T) { testCodeSomeShardsAvx512(t, 8, 10) } func TestCodeSomeShardsAvx512_9x10(t *testing.T) { testCodeSomeShardsAvx512(t, 9, 10) } func TestCodeSomeShardsAvx512_Manyx4(t *testing.T) { if !defaultOptions.useAVX512 { return } step := 1 if testing.Short() { step += 7 } for inputs := 1; inputs <= 200; inputs += step { testCodeSomeShardsAvx512WithLength(t, inputs, 4, 1024+33, false) testCodeSomeShardsAvx512WithLength(t, inputs, 4, 1024+33, true) } } func TestCodeSomeShardsAvx512_ManyxMany(t *testing.T) { if !defaultOptions.useAVX512 { return } step := 1 if testing.Short() { step += 5 } for outputs := 1; outputs <= 32; outputs += step { for inputs := 1; inputs <= 32; inputs += step { testCodeSomeShardsAvx512WithLength(t, inputs, outputs, 1024+33, false) testCodeSomeShardsAvx512WithLength(t, inputs, outputs, 1024+33, true) } } } reedsolomon-1.9.13/galois_amd64.go000066400000000000000000000054431406411035300167360ustar00rootroot00000000000000//+build !noasm //+build !appengine //+build !gccgo // Copyright 2015, Klaus Post, see LICENSE for details. package reedsolomon //go:noescape func galMulSSSE3(low, high, in, out []byte) //go:noescape func galMulSSSE3Xor(low, high, in, out []byte) //go:noescape func galMulAVX2Xor(low, high, in, out []byte) //go:noescape func galMulAVX2(low, high, in, out []byte) //go:noescape func sSE2XorSlice(in, out []byte) //go:noescape func galMulAVX2Xor_64(low, high, in, out []byte) //go:noescape func galMulAVX2_64(low, high, in, out []byte) //go:noescape func sSE2XorSlice_64(in, out []byte) // This is what the assembler routines do in blocks of 16 bytes: /* func galMulSSSE3(low, high, in, out []byte) { for n, input := range in { l := input & 0xf h := input >> 4 out[n] = low[l] ^ high[h] } } func galMulSSSE3Xor(low, high, in, out []byte) { for n, input := range in { l := input & 0xf h := input >> 4 out[n] ^= low[l] ^ high[h] } } */ // bigSwitchover is the size where 64 bytes are processed per loop. 
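// Below this threshold the 32-byte (AVX2) and 16-byte (SSSE3) loops are used
// instead, with a scalar table lookup covering any remaining tail bytes.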
const bigSwitchover = 128 func galMulSlice(c byte, in, out []byte, o *options) { if c == 1 { copy(out, in) return } if o.useAVX2 { if len(in) >= bigSwitchover { galMulAVX2_64(mulTableLow[c][:], mulTableHigh[c][:], in, out) done := (len(in) >> 6) << 6 in = in[done:] out = out[done:] } if len(in) > 32 { galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out) done := (len(in) >> 5) << 5 in = in[done:] out = out[done:] } } else if o.useSSSE3 { galMulSSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out) done := (len(in) >> 4) << 4 in = in[done:] out = out[done:] } out = out[:len(in)] mt := mulTable[c][:256] for i := range in { out[i] = mt[in[i]] } } func galMulSliceXor(c byte, in, out []byte, o *options) { if c == 1 { sliceXor(in, out, o) return } if o.useAVX2 { if len(in) >= bigSwitchover { galMulAVX2Xor_64(mulTableLow[c][:], mulTableHigh[c][:], in, out) done := (len(in) >> 6) << 6 in = in[done:] out = out[done:] } if len(in) >= 32 { galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out) done := (len(in) >> 5) << 5 in = in[done:] out = out[done:] } } else if o.useSSSE3 { galMulSSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out) done := (len(in) >> 4) << 4 in = in[done:] out = out[done:] } out = out[:len(in)] mt := mulTable[c][:256] for i := range in { out[i] ^= mt[in[i]] } } // slice galois add func sliceXor(in, out []byte, o *options) { if o.useSSE2 { if len(in) >= bigSwitchover { sSE2XorSlice_64(in, out) done := (len(in) >> 6) << 6 in = in[done:] out = out[done:] } if len(in) >= 16 { sSE2XorSlice(in, out) done := (len(in) >> 4) << 4 in = in[done:] out = out[done:] } } out = out[:len(in)] for i := range in { out[i] ^= in[i] } } reedsolomon-1.9.13/galois_amd64.s000066400000000000000000000240131406411035300165650ustar00rootroot00000000000000//+build !noasm //+build !appengine //+build !gccgo // Copyright 2015, Klaus Post, see LICENSE for details. 
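// The multiply routines below all follow the same pattern: split each input
// byte into its low and high nibbles (PAND/PSRLQ), look both up via PSHUFB
// against the 16-byte multiplication tables, and XOR the two halves into the
// result.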
// Based on http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf // and http://jerasure.org/jerasure/gf-complete/tree/master // func galMulSSSE3Xor(low, high, in, out []byte) TEXT ·galMulSSSE3Xor(SB), 7, $0 MOVQ low+0(FP), SI // SI: &low MOVQ high+24(FP), DX // DX: &high MOVOU (SI), X6 // X6 low MOVOU (DX), X7 // X7: high MOVQ $15, BX // BX: low mask MOVQ BX, X8 PXOR X5, X5 MOVQ in+48(FP), SI // R11: &in MOVQ in_len+56(FP), R9 // R9: len(in) MOVQ out+72(FP), DX // DX: &out PSHUFB X5, X8 // X8: lomask (unpacked) SHRQ $4, R9 // len(in) / 16 MOVQ SI, AX MOVQ DX, BX ANDQ $15, AX ANDQ $15, BX CMPQ R9, $0 JEQ done_xor ORQ AX, BX CMPQ BX, $0 JNZ loopback_xor loopback_xor_aligned: MOVOA (SI), X0 // in[x] MOVOA (DX), X4 // out[x] MOVOA X0, X1 // in[x] MOVOA X6, X2 // low copy MOVOA X7, X3 // high copy PSRLQ $4, X1 // X1: high input PAND X8, X0 // X0: low input PAND X8, X1 // X0: high input PSHUFB X0, X2 // X2: mul low part PSHUFB X1, X3 // X3: mul high part PXOR X2, X3 // X3: Result PXOR X4, X3 // X3: Result xor existing out MOVOA X3, (DX) // Store ADDQ $16, SI // in+=16 ADDQ $16, DX // out+=16 SUBQ $1, R9 JNZ loopback_xor_aligned JMP done_xor loopback_xor: MOVOU (SI), X0 // in[x] MOVOU (DX), X4 // out[x] MOVOU X0, X1 // in[x] MOVOU X6, X2 // low copy MOVOU X7, X3 // high copy PSRLQ $4, X1 // X1: high input PAND X8, X0 // X0: low input PAND X8, X1 // X0: high input PSHUFB X0, X2 // X2: mul low part PSHUFB X1, X3 // X3: mul high part PXOR X2, X3 // X3: Result PXOR X4, X3 // X3: Result xor existing out MOVOU X3, (DX) // Store ADDQ $16, SI // in+=16 ADDQ $16, DX // out+=16 SUBQ $1, R9 JNZ loopback_xor done_xor: RET // func galMulSSSE3(low, high, in, out []byte) TEXT ·galMulSSSE3(SB), 7, $0 MOVQ low+0(FP), SI // SI: &low MOVQ high+24(FP), DX // DX: &high MOVOU (SI), X6 // X6 low MOVOU (DX), X7 // X7: high MOVQ $15, BX // BX: low mask MOVQ BX, X8 PXOR X5, X5 MOVQ in+48(FP), SI // R11: &in MOVQ in_len+56(FP), R9 // R9: len(in) MOVQ out+72(FP), DX // DX: &out PSHUFB X5, X8 // X8: lomask (unpacked) MOVQ SI, AX MOVQ DX, BX SHRQ $4, R9 // len(in) / 16 ANDQ $15, AX ANDQ $15, BX CMPQ R9, $0 JEQ done ORQ AX, BX CMPQ BX, $0 JNZ loopback loopback_aligned: MOVOA (SI), X0 // in[x] MOVOA X0, X1 // in[x] MOVOA X6, X2 // low copy MOVOA X7, X3 // high copy PSRLQ $4, X1 // X1: high input PAND X8, X0 // X0: low input PAND X8, X1 // X0: high input PSHUFB X0, X2 // X2: mul low part PSHUFB X1, X3 // X3: mul high part PXOR X2, X3 // X3: Result MOVOA X3, (DX) // Store ADDQ $16, SI // in+=16 ADDQ $16, DX // out+=16 SUBQ $1, R9 JNZ loopback_aligned JMP done loopback: MOVOU (SI), X0 // in[x] MOVOU X0, X1 // in[x] MOVOA X6, X2 // low copy MOVOA X7, X3 // high copy PSRLQ $4, X1 // X1: high input PAND X8, X0 // X0: low input PAND X8, X1 // X0: high input PSHUFB X0, X2 // X2: mul low part PSHUFB X1, X3 // X3: mul high part PXOR X2, X3 // X3: Result MOVOU X3, (DX) // Store ADDQ $16, SI // in+=16 ADDQ $16, DX // out+=16 SUBQ $1, R9 JNZ loopback done: RET // func galMulAVX2Xor(low, high, in, out []byte) TEXT ·galMulAVX2Xor(SB), 7, $0 MOVQ low+0(FP), SI // SI: &low MOVQ high+24(FP), DX // DX: &high MOVQ $15, BX // BX: low mask MOVQ BX, X5 MOVOU (SI), X6 // X6: low MOVOU (DX), X7 // X7: high MOVQ in_len+56(FP), R9 // R9: len(in) VINSERTI128 $1, X6, Y6, Y6 // low VINSERTI128 $1, X7, Y7, Y7 // high VPBROADCASTB X5, Y8 // Y8: lomask (unpacked) SHRQ $5, R9 // len(in) / 32 MOVQ out+72(FP), DX // DX: &out MOVQ in+48(FP), SI // SI: &in TESTQ R9, 
R9 JZ done_xor_avx2 loopback_xor_avx2: VMOVDQU (SI), Y0 VMOVDQU (DX), Y4 VPSRLQ $4, Y0, Y1 // Y1: high input VPAND Y8, Y0, Y0 // Y0: low input VPAND Y8, Y1, Y1 // Y1: high input VPSHUFB Y0, Y6, Y2 // Y2: mul low part VPSHUFB Y1, Y7, Y3 // Y3: mul high part VPXOR Y3, Y2, Y3 // Y3: Result VPXOR Y4, Y3, Y4 // Y4: Result VMOVDQU Y4, (DX) ADDQ $32, SI // in+=32 ADDQ $32, DX // out+=32 SUBQ $1, R9 JNZ loopback_xor_avx2 done_xor_avx2: VZEROUPPER RET // func galMulAVX2(low, high, in, out []byte) TEXT ·galMulAVX2(SB), 7, $0 MOVQ low+0(FP), SI // SI: &low MOVQ high+24(FP), DX // DX: &high MOVQ $15, BX // BX: low mask MOVQ BX, X5 MOVOU (SI), X6 // X6: low MOVOU (DX), X7 // X7: high MOVQ in_len+56(FP), R9 // R9: len(in) VINSERTI128 $1, X6, Y6, Y6 // low VINSERTI128 $1, X7, Y7, Y7 // high VPBROADCASTB X5, Y8 // Y8: lomask (unpacked) SHRQ $5, R9 // len(in) / 32 MOVQ out+72(FP), DX // DX: &out MOVQ in+48(FP), SI // SI: &in TESTQ R9, R9 JZ done_avx2 loopback_avx2: VMOVDQU (SI), Y0 VPSRLQ $4, Y0, Y1 // Y1: high input VPAND Y8, Y0, Y0 // Y0: low input VPAND Y8, Y1, Y1 // Y1: high input VPSHUFB Y0, Y6, Y2 // Y2: mul low part VPSHUFB Y1, Y7, Y3 // Y3: mul high part VPXOR Y3, Y2, Y4 // Y4: Result VMOVDQU Y4, (DX) ADDQ $32, SI // in+=32 ADDQ $32, DX // out+=32 SUBQ $1, R9 JNZ loopback_avx2 done_avx2: VZEROUPPER RET // func sSE2XorSlice(in, out []byte) TEXT ·sSE2XorSlice(SB), 7, $0 MOVQ in+0(FP), SI // SI: &in MOVQ in_len+8(FP), R9 // R9: len(in) MOVQ out+24(FP), DX // DX: &out SHRQ $4, R9 // len(in) / 16 CMPQ R9, $0 JEQ done_xor_sse2 loopback_xor_sse2: MOVOU (SI), X0 // in[x] MOVOU (DX), X1 // out[x] PXOR X0, X1 MOVOU X1, (DX) ADDQ $16, SI // in+=16 ADDQ $16, DX // out+=16 SUBQ $1, R9 JNZ loopback_xor_sse2 done_xor_sse2: RET // func galMulAVX2Xor_64(low, high, in, out []byte) TEXT ·galMulAVX2Xor_64(SB), 7, $0 MOVQ low+0(FP), SI // SI: &low MOVQ high+24(FP), DX // DX: &high MOVQ $15, BX // BX: low mask MOVQ BX, X5 MOVOU (SI), X6 // X6: low MOVOU (DX), X7 // X7: high MOVQ in_len+56(FP), R9 // R9: len(in) VINSERTI128 $1, X6, Y6, Y6 // low VINSERTI128 $1, X7, Y7, Y7 // high VPBROADCASTB X5, Y8 // Y8: lomask (unpacked) SHRQ $6, R9 // len(in) / 64 MOVQ out+72(FP), DX // DX: &out MOVQ in+48(FP), SI // SI: &in TESTQ R9, R9 JZ done_xor_avx2_64 loopback_xor_avx2_64: VMOVDQU (SI), Y0 VMOVDQU 32(SI), Y10 VMOVDQU (DX), Y4 VMOVDQU 32(DX), Y14 VPSRLQ $4, Y0, Y1 // Y1: high input VPSRLQ $4, Y10, Y11 // Y11: high input 2 VPAND Y8, Y0, Y0 // Y0: low input VPAND Y8, Y10, Y10 // Y10: low input 2 VPAND Y8, Y1, Y1 // Y11: high input VPAND Y8, Y11, Y11 // Y11: high input 2 VPSHUFB Y0, Y6, Y2 // Y2: mul low part VPSHUFB Y10, Y6, Y12 // Y12: mul low part 2 VPSHUFB Y1, Y7, Y3 // Y3: mul high part VPSHUFB Y11, Y7, Y13 // Y13: mul high part 2 VPXOR Y3, Y2, Y3 // Y3: Result VPXOR Y13, Y12, Y13 // Y13: Result 2 VPXOR Y4, Y3, Y4 // Y4: Result VPXOR Y14, Y13, Y14 // Y4: Result 2 VMOVDQU Y4, (DX) VMOVDQU Y14, 32(DX) ADDQ $64, SI // in+=64 ADDQ $64, DX // out+=64 SUBQ $1, R9 JNZ loopback_xor_avx2_64 done_xor_avx2_64: VZEROUPPER RET // func galMulAVX2_64(low, high, in, out []byte) TEXT ·galMulAVX2_64(SB), 7, $0 MOVQ low+0(FP), SI // SI: &low MOVQ high+24(FP), DX // DX: &high MOVQ $15, BX // BX: low mask MOVQ BX, X5 MOVOU (SI), X6 // X6: low MOVOU (DX), X7 // X7: high MOVQ in_len+56(FP), R9 // R9: len(in) VINSERTI128 $1, X6, Y6, Y6 // low VINSERTI128 $1, X7, Y7, Y7 // high VPBROADCASTB X5, Y8 // Y8: lomask (unpacked) SHRQ $6, R9 // len(in) / 64 MOVQ out+72(FP), DX // DX: &out MOVQ in+48(FP), SI // SI: &in TESTQ R9, R9 JZ done_avx2_64 
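// Main 2x-unrolled loop: each iteration multiplies two 32-byte YMM
// pairs (64 input bytes). The Go wrapper only dispatches here for
// slices of at least bigSwitchover (128) bytes and hands any remainder
// to the 32-byte and scalar paths, so no tail handling is needed here.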
loopback_avx2_64:
	VMOVDQU (SI), Y0
	VMOVDQU 32(SI), Y10
	VPSRLQ  $4, Y0, Y1    // Y1: high input
	VPSRLQ  $4, Y10, Y11  // Y11: high input 2
	VPAND   Y8, Y0, Y0    // Y0: low input
	VPAND   Y8, Y10, Y10  // Y10: low input 2
	VPAND   Y8, Y1, Y1    // Y1: high input
	VPAND   Y8, Y11, Y11  // Y11: high input 2
	VPSHUFB Y0, Y6, Y2    // Y2: mul low part
	VPSHUFB Y10, Y6, Y12  // Y12: mul low part 2
	VPSHUFB Y1, Y7, Y3    // Y3: mul high part
	VPSHUFB Y11, Y7, Y13  // Y13: mul high part 2
	VPXOR   Y3, Y2, Y4    // Y4: Result
	VPXOR   Y13, Y12, Y14 // Y14: Result 2
	VMOVDQU Y4, (DX)
	VMOVDQU Y14, 32(DX)
	ADDQ    $64, SI // in+=64
	ADDQ    $64, DX // out+=64
	SUBQ    $1, R9
	JNZ     loopback_avx2_64

done_avx2_64:
	VZEROUPPER
	RET

// func sSE2XorSlice_64(in, out []byte)
TEXT ·sSE2XorSlice_64(SB), 7, $0
	MOVQ in+0(FP), SI     // SI: &in
	MOVQ in_len+8(FP), R9 // R9: len(in)
	MOVQ out+24(FP), DX   // DX: &out
	SHRQ $6, R9           // len(in) / 64
	CMPQ R9, $0
	JEQ  done_xor_sse2_64

loopback_xor_sse2_64:
	MOVOU (SI), X0   // in[x]
	MOVOU 16(SI), X2 // in[x]
	MOVOU 32(SI), X4 // in[x]
	MOVOU 48(SI), X6 // in[x]
	MOVOU (DX), X1   // out[x]
	MOVOU 16(DX), X3 // out[x]
	MOVOU 32(DX), X5 // out[x]
	MOVOU 48(DX), X7 // out[x]
	PXOR  X0, X1
	PXOR  X2, X3
	PXOR  X4, X5
	PXOR  X6, X7
	MOVOU X1, (DX)
	MOVOU X3, 16(DX)
	MOVOU X5, 32(DX)
	MOVOU X7, 48(DX)
	ADDQ  $64, SI // in+=64
	ADDQ  $64, DX // out+=64
	SUBQ  $1, R9
	JNZ   loopback_xor_sse2_64

done_xor_sse2_64:
	RET
reedsolomon-1.9.13/galois_arm64.go000066400000000000000000000023251406411035300167500ustar00rootroot00000000000000//+build !noasm
//+build !appengine
//+build !gccgo

// Copyright 2015, Klaus Post, see LICENSE for details.
// Copyright 2017, Minio, Inc.

package reedsolomon

//go:noescape
func galMulNEON(low, high, in, out []byte)

//go:noescape
func galMulXorNEON(low, high, in, out []byte)

//go:noescape
func galXorNEON(in, out []byte)

func galMulSlice(c byte, in, out []byte, o *options) {
	if c == 1 {
		copy(out, in)
		return
	}
	// Multiply 32-byte blocks with NEON, then finish any tail with the
	// scalar lookup table.
	galMulNEON(mulTableLow[c][:], mulTableHigh[c][:], in, out)
	done := (len(in) >> 5) << 5
	remain := len(in) - done
	if remain > 0 {
		mt := mulTable[c][:256]
		for i := done; i < len(in); i++ {
			out[i] = mt[in[i]]
		}
	}
}

func galMulSliceXor(c byte, in, out []byte, o *options) {
	if c == 1 {
		sliceXor(in, out, o)
		return
	}
	galMulXorNEON(mulTableLow[c][:], mulTableHigh[c][:], in, out)
	done := (len(in) >> 5) << 5
	remain := len(in) - done
	if remain > 0 {
		mt := mulTable[c][:256]
		for i := done; i < len(in); i++ {
			out[i] ^= mt[in[i]]
		}
	}
}

// sliceXor XORs the contents of in onto out; XOR is addition in GF(2^8).
func sliceXor(in, out []byte, o *options) {
	galXorNEON(in, out)
	done := (len(in) >> 5) << 5
	remain := len(in) - done
	if remain > 0 {
		for i := done; i < len(in); i++ {
			out[i] ^= in[i]
		}
	}
}
reedsolomon-1.9.13/galois_arm64.s000066400000000000000000000052731406411035300166100ustar00rootroot00000000000000//+build !noasm
//+build !appengine
//+build !gccgo

// Copyright 2015, Klaus Post, see LICENSE for details.
// Copyright 2017, Minio, Inc.
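// The NEON kernels below follow the same split-table scheme as the
// amd64 code: the LOAD macro splits each input byte into 4-bit halves
// (VUSHR for the high nibble, VAND for the low), and GALOIS_MUL looks
// both halves up with VTBL before combining them with VEOR. Every loop
// iteration consumes 32 bytes; the wrappers in galois_arm64.go run a
// scalar fallback over any tail shorter than 32 bytes.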
#define LOAD(LO1, LO2, HI1, HI2) \ VLD1.P 32(R1), [LO1.B16, LO2.B16] \ \ \ // Get low input and high input VUSHR $4, LO1.B16, HI1.B16 \ VUSHR $4, LO2.B16, HI2.B16 \ VAND V8.B16, LO1.B16, LO1.B16 \ VAND V8.B16, LO2.B16, LO2.B16 #define GALOIS_MUL(MUL_LO, MUL_HI, OUT1, OUT2, TMP1, TMP2) \ \ // Mul low part and mul high part VTBL V0.B16, [MUL_LO.B16], OUT1.B16 \ VTBL V10.B16, [MUL_HI.B16], OUT2.B16 \ VTBL V1.B16, [MUL_LO.B16], TMP1.B16 \ VTBL V11.B16, [MUL_HI.B16], TMP2.B16 \ \ \ // Combine results VEOR OUT2.B16, OUT1.B16, OUT1.B16 \ VEOR TMP2.B16, TMP1.B16, OUT2.B16 // func galMulNEON(low, high, in, out []byte) TEXT ·galMulNEON(SB), 7, $0 MOVD in_base+48(FP), R1 MOVD in_len+56(FP), R2 // length of message MOVD out_base+72(FP), R5 SUBS $32, R2 BMI complete MOVD low+0(FP), R10 // R10: &low MOVD high+24(FP), R11 // R11: &high VLD1 (R10), [V6.B16] VLD1 (R11), [V7.B16] // // Use an extra instruction below since `VDUP R3, V8.B16` generates assembler error // WORD $0x4e010c68 // dup v8.16b, w3 // MOVD $0x0f, R3 VMOV R3, V8.B[0] VDUP V8.B[0], V8.B16 loop: // Main loop LOAD(V0, V1, V10, V11) GALOIS_MUL(V6, V7, V4, V5, V14, V15) // Store result VST1.P [V4.D2, V5.D2], 32(R5) SUBS $32, R2 BPL loop complete: RET // func galMulXorNEON(low, high, in, out []byte) TEXT ·galMulXorNEON(SB), 7, $0 MOVD in_base+48(FP), R1 MOVD in_len+56(FP), R2 // length of message MOVD out_base+72(FP), R5 SUBS $32, R2 BMI completeXor MOVD low+0(FP), R10 // R10: &low MOVD high+24(FP), R11 // R11: &high VLD1 (R10), [V6.B16] VLD1 (R11), [V7.B16] // // Use an extra instruction below since `VDUP R3, V8.B16` generates assembler error // WORD $0x4e010c68 // dup v8.16b, w3 // MOVD $0x0f, R3 VMOV R3, V8.B[0] VDUP V8.B[0], V8.B16 loopXor: // Main loop VLD1 (R5), [V20.B16, V21.B16] LOAD(V0, V1, V10, V11) GALOIS_MUL(V6, V7, V4, V5, V14, V15) VEOR V20.B16, V4.B16, V4.B16 VEOR V21.B16, V5.B16, V5.B16 // Store result VST1.P [V4.D2, V5.D2], 32(R5) SUBS $32, R2 BPL loopXor completeXor: RET // func galXorNEON(in, out []byte) TEXT ·galXorNEON(SB), 7, $0 MOVD in_base+0(FP), R1 MOVD in_len+8(FP), R2 // length of message MOVD out_base+24(FP), R5 SUBS $32, R2 BMI completeXor loopXor: // Main loop VLD1.P 32(R1), [V0.B16, V1.B16] VLD1 (R5), [V20.B16, V21.B16] VEOR V20.B16, V0.B16, V4.B16 VEOR V21.B16, V1.B16, V5.B16 // Store result VST1.P [V4.D2, V5.D2], 32(R5) SUBS $32, R2 BPL loopXor completeXor: RET reedsolomon-1.9.13/galois_gen_amd64.go000066400000000000000000000577111406411035300175740ustar00rootroot00000000000000// Code generated by command: go run gen.go -out ../galois_gen_amd64.s -stubs ../galois_gen_amd64.go -pkg=reedsolomon. DO NOT EDIT. // +build !appengine // +build !noasm // +build !nogen // +build gc package reedsolomon // mulAvxTwo_1x1 takes 1 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_1x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x1_64 takes 1 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_1x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x2 takes 1 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_1x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x2_64 takes 1 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_1x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x3 takes 1 inputs and produces 3 outputs. 
// The output is initialized to 0. //go:noescape func mulAvxTwo_1x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x3_64 takes 1 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_1x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x4 takes 1 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_1x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x5 takes 1 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_1x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x6 takes 1 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_1x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x7 takes 1 inputs and produces 7 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_1x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x8 takes 1 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_1x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x9 takes 1 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_1x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x10 takes 1 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_1x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x1 takes 2 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_2x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x1_64 takes 2 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_2x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x2 takes 2 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_2x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x2_64 takes 2 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_2x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x3 takes 2 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_2x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x3_64 takes 2 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_2x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x4 takes 2 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_2x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x5 takes 2 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_2x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x6 takes 2 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_2x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x7 takes 2 inputs and produces 7 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_2x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x8 takes 2 inputs and produces 8 outputs. 
// The output is initialized to 0. //go:noescape func mulAvxTwo_2x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x9 takes 2 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_2x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x10 takes 2 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_2x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x1 takes 3 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_3x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x1_64 takes 3 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_3x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x2 takes 3 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_3x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x2_64 takes 3 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_3x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x3 takes 3 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_3x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x3_64 takes 3 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_3x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x4 takes 3 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_3x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x5 takes 3 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_3x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x6 takes 3 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_3x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x7 takes 3 inputs and produces 7 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_3x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x8 takes 3 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_3x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x9 takes 3 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_3x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x10 takes 3 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_3x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x1 takes 4 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_4x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x1_64 takes 4 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_4x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x2 takes 4 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_4x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x2_64 takes 4 inputs and produces 2 outputs. 
// The output is initialized to 0. //go:noescape func mulAvxTwo_4x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x3 takes 4 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_4x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x3_64 takes 4 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_4x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x4 takes 4 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_4x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x5 takes 4 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_4x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x6 takes 4 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_4x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x7 takes 4 inputs and produces 7 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_4x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x8 takes 4 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_4x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x9 takes 4 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_4x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x10 takes 4 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_4x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x1 takes 5 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_5x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x1_64 takes 5 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_5x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x2 takes 5 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_5x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x2_64 takes 5 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_5x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x3 takes 5 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_5x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x3_64 takes 5 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_5x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x4 takes 5 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_5x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x5 takes 5 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_5x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x6 takes 5 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_5x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x7 takes 5 inputs and produces 7 outputs. 
// The output is initialized to 0. //go:noescape func mulAvxTwo_5x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x8 takes 5 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_5x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x9 takes 5 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_5x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x10 takes 5 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_5x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x1 takes 6 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_6x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x1_64 takes 6 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_6x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x2 takes 6 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_6x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x2_64 takes 6 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_6x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x3 takes 6 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_6x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x3_64 takes 6 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_6x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x4 takes 6 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_6x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x5 takes 6 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_6x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x6 takes 6 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_6x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x7 takes 6 inputs and produces 7 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_6x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x8 takes 6 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_6x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x9 takes 6 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_6x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x10 takes 6 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_6x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x1 takes 7 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_7x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x1_64 takes 7 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_7x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x2 takes 7 inputs and produces 2 outputs. 
// The output is initialized to 0. //go:noescape func mulAvxTwo_7x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x2_64 takes 7 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_7x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x3 takes 7 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_7x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x3_64 takes 7 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_7x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x4 takes 7 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_7x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x5 takes 7 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_7x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x6 takes 7 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_7x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x7 takes 7 inputs and produces 7 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_7x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x8 takes 7 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_7x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x9 takes 7 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_7x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x10 takes 7 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_7x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x1 takes 8 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_8x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x1_64 takes 8 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_8x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x2 takes 8 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_8x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x2_64 takes 8 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_8x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x3 takes 8 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_8x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x3_64 takes 8 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_8x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x4 takes 8 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_8x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x5 takes 8 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_8x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x6 takes 8 inputs and produces 6 outputs. 
// The output is initialized to 0. //go:noescape func mulAvxTwo_8x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x7 takes 8 inputs and produces 7 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_8x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x8 takes 8 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_8x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x9 takes 8 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_8x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x10 takes 8 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_8x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x1 takes 9 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_9x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x1_64 takes 9 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_9x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x2 takes 9 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_9x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x2_64 takes 9 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_9x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x3 takes 9 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_9x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x3_64 takes 9 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_9x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x4 takes 9 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_9x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x5 takes 9 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_9x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x6 takes 9 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_9x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x7 takes 9 inputs and produces 7 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_9x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x8 takes 9 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_9x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x9 takes 9 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_9x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x10 takes 9 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_9x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x1 takes 10 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_10x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x1_64 takes 10 inputs and produces 1 outputs. 
// The output is initialized to 0. //go:noescape func mulAvxTwo_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x2 takes 10 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_10x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x2_64 takes 10 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x3 takes 10 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_10x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x3_64 takes 10 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x4 takes 10 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x5 takes 10 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x6 takes 10 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x7 takes 10 inputs and produces 7 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x8 takes 10 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x9 takes 10 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x10 takes 10 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) reedsolomon-1.9.13/galois_gen_amd64.s000066400000000000000000027240221406411035300174270ustar00rootroot00000000000000// Code generated by command: go run gen.go -out ../galois_gen_amd64.s -stubs ../galois_gen_amd64.go -pkg=reedsolomon. DO NOT EDIT. 
// +build !appengine // +build !noasm // +build !nogen // +build gc #include "textflag.h" // func mulAvxTwo_1x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x1(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 6 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_1x1_end VMOVDQU (CX), Y0 VMOVDQU 32(CX), Y1 MOVQ in_base+24(FP), CX MOVQ (CX), CX MOVQ out_base+48(FP), DX MOVQ (DX), DX MOVQ start+72(FP), BX // Add start offset to output ADDQ BX, DX // Add start offset to input ADDQ BX, CX MOVQ $0x0000000f, BX MOVQ BX, X3 VPBROADCASTB X3, Y3 mulAvxTwo_1x1_loop: // Clear 1 outputs VPXOR Y2, Y2, Y2 // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (CX), Y4 ADDQ $0x20, CX VPSRLQ $0x04, Y4, Y5 VPAND Y3, Y4, Y4 VPAND Y3, Y5, Y5 VPSHUFB Y4, Y0, Y4 VPSHUFB Y5, Y1, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Store 1 outputs VMOVDQU Y2, (DX) ADDQ $0x20, DX // Prepare for next loop DECQ AX JNZ mulAvxTwo_1x1_loop VZEROUPPER mulAvxTwo_1x1_end: RET // func mulAvxTwo_1x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x1_64(SB), $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 6 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_1x1_64_end MOVQ in_base+24(FP), AX MOVQ (AX), AX MOVQ out_base+48(FP), DX MOVQ out_base+48(FP), DX MOVQ start+72(FP), BX // Add start offset to input ADDQ BX, AX MOVQ $0x0000000f, SI MOVQ SI, X2 VPBROADCASTB X2, Y2 MOVQ n+80(FP), SI SHRQ $0x06, SI mulAvxTwo_1x1_64_loop: // Clear 1 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU (AX), Y6 VMOVDQU 32(AX), Y5 ADDQ $0x40, AX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Store 1 outputs MOVQ (DX), DI VMOVDQU Y0, (DI)(BX*1) VMOVDQU Y1, 32(DI)(BX*1) // Prepare for next loop ADDQ $0x40, BX DECQ SI JNZ mulAvxTwo_1x1_64_loop VZEROUPPER mulAvxTwo_1x1_64_end: RET // func mulAvxTwo_1x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x2(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 11 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_1x2_end VMOVDQU (CX), Y0 VMOVDQU 32(CX), Y1 VMOVDQU 64(CX), Y2 VMOVDQU 96(CX), Y3 MOVQ in_base+24(FP), CX MOVQ (CX), CX MOVQ out_base+48(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ start+72(FP), SI // Add start offset to output ADDQ SI, BX ADDQ SI, DX // Add start offset to input ADDQ SI, CX MOVQ $0x0000000f, SI MOVQ SI, X6 VPBROADCASTB X6, Y6 mulAvxTwo_1x2_loop: // Clear 2 outputs VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (CX), Y9 ADDQ $0x20, CX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VPSHUFB Y9, Y0, Y7 VPSHUFB Y10, Y1, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VPSHUFB Y9, Y2, Y7 VPSHUFB Y10, Y3, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Store 2 outputs VMOVDQU Y4, (BX) ADDQ $0x20, BX VMOVDQU Y5, (DX) ADDQ $0x20, DX // Prepare for next loop DECQ AX JNZ 
mulAvxTwo_1x2_loop VZEROUPPER mulAvxTwo_1x2_end: RET // func mulAvxTwo_1x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x2_64(SB), $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 11 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_1x2_64_end MOVQ in_base+24(FP), AX MOVQ (AX), AX MOVQ out_base+48(FP), DX MOVQ out_base+48(FP), DX MOVQ start+72(FP), BX // Add start offset to input ADDQ BX, AX MOVQ $0x0000000f, SI MOVQ SI, X4 VPBROADCASTB X4, Y4 MOVQ n+80(FP), SI SHRQ $0x06, SI mulAvxTwo_1x2_64_loop: // Clear 2 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU (AX), Y9 VMOVDQU 32(AX), Y11 ADDQ $0x40, AX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Store 2 outputs MOVQ (DX), DI VMOVDQU Y0, (DI)(BX*1) VMOVDQU Y1, 32(DI)(BX*1) MOVQ 24(DX), DI VMOVDQU Y2, (DI)(BX*1) VMOVDQU Y3, 32(DI)(BX*1) // Prepare for next loop ADDQ $0x40, BX DECQ SI JNZ mulAvxTwo_1x2_64_loop VZEROUPPER mulAvxTwo_1x2_64_end: RET // func mulAvxTwo_1x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x3(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 14 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_1x3_end VMOVDQU (CX), Y0 VMOVDQU 32(CX), Y1 VMOVDQU 64(CX), Y2 VMOVDQU 96(CX), Y3 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 MOVQ in_base+24(FP), CX MOVQ (CX), CX MOVQ out_base+48(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ start+72(FP), DI // Add start offset to output ADDQ DI, BX ADDQ DI, SI ADDQ DI, DX // Add start offset to input ADDQ DI, CX MOVQ $0x0000000f, DI MOVQ DI, X9 VPBROADCASTB X9, Y9 mulAvxTwo_1x3_loop: // Clear 3 outputs VPXOR Y6, Y6, Y6 VPXOR Y7, Y7, Y7 VPXOR Y8, Y8, Y8 // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (CX), Y12 ADDQ $0x20, CX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VPSHUFB Y12, Y0, Y10 VPSHUFB Y13, Y1, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VPSHUFB Y12, Y2, Y10 VPSHUFB Y13, Y3, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VPSHUFB Y12, Y4, Y10 VPSHUFB Y13, Y5, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Store 3 outputs VMOVDQU Y6, (BX) ADDQ $0x20, BX VMOVDQU Y7, (SI) ADDQ $0x20, SI VMOVDQU Y8, (DX) ADDQ $0x20, DX // Prepare for next loop DECQ AX JNZ mulAvxTwo_1x3_loop VZEROUPPER mulAvxTwo_1x3_end: RET // func mulAvxTwo_1x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x3_64(SB), $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 14 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_1x3_64_end MOVQ in_base+24(FP), AX MOVQ (AX), AX MOVQ out_base+48(FP), DX MOVQ out_base+48(FP), DX MOVQ start+72(FP), BX // Add start offset to input ADDQ BX, AX MOVQ 
$0x0000000f, SI MOVQ SI, X6 VPBROADCASTB X6, Y6 MOVQ n+80(FP), SI SHRQ $0x06, SI mulAvxTwo_1x3_64_loop: // Clear 3 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU (AX), Y11 VMOVDQU 32(AX), Y13 ADDQ $0x40, AX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Store 3 outputs MOVQ (DX), DI VMOVDQU Y0, (DI)(BX*1) VMOVDQU Y1, 32(DI)(BX*1) MOVQ 24(DX), DI VMOVDQU Y2, (DI)(BX*1) VMOVDQU Y3, 32(DI)(BX*1) MOVQ 48(DX), DI VMOVDQU Y4, (DI)(BX*1) VMOVDQU Y5, 32(DI)(BX*1) // Prepare for next loop ADDQ $0x40, BX DECQ SI JNZ mulAvxTwo_1x3_64_loop VZEROUPPER mulAvxTwo_1x3_64_end: RET // func mulAvxTwo_1x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x4(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 17 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_1x4_end MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), BX MOVQ start+72(FP), R9 // Add start offset to output ADDQ R9, SI ADDQ R9, DI ADDQ R9, R8 ADDQ R9, BX // Add start offset to input ADDQ R9, DX MOVQ $0x0000000f, R9 MOVQ R9, X4 VPBROADCASTB X4, Y4 mulAvxTwo_1x4_loop: // Clear 4 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Store 4 outputs VMOVDQU Y0, (SI) ADDQ $0x20, SI VMOVDQU Y1, (DI) ADDQ $0x20, DI VMOVDQU Y2, (R8) ADDQ $0x20, R8 VMOVDQU Y3, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ mulAvxTwo_1x4_loop VZEROUPPER mulAvxTwo_1x4_end: RET // func mulAvxTwo_1x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x5(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 20 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_1x5_end MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), BX MOVQ start+72(FP), R10 // Add start offset to output ADDQ R10, SI ADDQ R10, DI ADDQ R10, R8 ADDQ R10, R9 ADDQ R10, BX 
// Add start offset to input ADDQ R10, DX MOVQ $0x0000000f, R10 MOVQ R10, X5 VPBROADCASTB X5, Y5 mulAvxTwo_1x5_loop: // Clear 5 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU (CX), Y6 VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Store 5 outputs VMOVDQU Y0, (SI) ADDQ $0x20, SI VMOVDQU Y1, (DI) ADDQ $0x20, DI VMOVDQU Y2, (R8) ADDQ $0x20, R8 VMOVDQU Y3, (R9) ADDQ $0x20, R9 VMOVDQU Y4, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ mulAvxTwo_1x5_loop VZEROUPPER mulAvxTwo_1x5_end: RET // func mulAvxTwo_1x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x6(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 23 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_1x6_end MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), R10 MOVQ 120(BX), BX MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, R10 ADDQ R11, BX // Add start offset to input ADDQ R11, DX MOVQ $0x0000000f, R11 MOVQ R11, X6 VPBROADCASTB X6, Y6 mulAvxTwo_1x6_loop: // Clear 6 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Store 6 outputs VMOVDQU Y0, (SI) ADDQ $0x20, SI VMOVDQU Y1, (DI) ADDQ $0x20, DI VMOVDQU Y2, (R8) ADDQ $0x20, R8 VMOVDQU Y3, (R9) ADDQ $0x20, R9 VMOVDQU Y4, (R10) ADDQ $0x20, R10 VMOVDQU Y5, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ mulAvxTwo_1x6_loop VZEROUPPER mulAvxTwo_1x6_end: RET // func mulAvxTwo_1x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x7(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 26 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_1x7_end MOVQ 
in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), R10 MOVQ 120(BX), R11 MOVQ 144(BX), BX MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, R11 ADDQ R12, BX // Add start offset to input ADDQ R12, DX MOVQ $0x0000000f, R12 MOVQ R12, X7 VPBROADCASTB X7, Y7 mulAvxTwo_1x7_loop: // Clear 7 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Store 7 outputs VMOVDQU Y0, (SI) ADDQ $0x20, SI VMOVDQU Y1, (DI) ADDQ $0x20, DI VMOVDQU Y2, (R8) ADDQ $0x20, R8 VMOVDQU Y3, (R9) ADDQ $0x20, R9 VMOVDQU Y4, (R10) ADDQ $0x20, R10 VMOVDQU Y5, (R11) ADDQ $0x20, R11 VMOVDQU Y6, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ mulAvxTwo_1x7_loop VZEROUPPER mulAvxTwo_1x7_end: RET // func mulAvxTwo_1x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x8(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 29 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_1x8_end MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), R10 MOVQ 120(BX), R11 MOVQ 144(BX), R12 MOVQ 168(BX), BX MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, BX // Add start offset to input ADDQ R13, DX MOVQ $0x0000000f, R13 MOVQ R13, X8 VPBROADCASTB X8, Y8 mulAvxTwo_1x8_loop: // Clear 8 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 VPXOR Y7, Y7, Y7 // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR 
Y9, Y4, Y4 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Store 8 outputs VMOVDQU Y0, (SI) ADDQ $0x20, SI VMOVDQU Y1, (DI) ADDQ $0x20, DI VMOVDQU Y2, (R8) ADDQ $0x20, R8 VMOVDQU Y3, (R9) ADDQ $0x20, R9 VMOVDQU Y4, (R10) ADDQ $0x20, R10 VMOVDQU Y5, (R11) ADDQ $0x20, R11 VMOVDQU Y6, (R12) ADDQ $0x20, R12 VMOVDQU Y7, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ mulAvxTwo_1x8_loop VZEROUPPER mulAvxTwo_1x8_end: RET // func mulAvxTwo_1x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x9(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 32 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_1x9_end MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), R10 MOVQ 120(BX), R11 MOVQ 144(BX), R12 MOVQ 168(BX), R13 MOVQ 192(BX), BX MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, BX // Add start offset to input ADDQ R14, DX MOVQ $0x0000000f, R14 MOVQ R14, X9 VPBROADCASTB X9, Y9 mulAvxTwo_1x9_loop: // Clear 9 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 VPXOR Y7, Y7, Y7 VPXOR Y8, Y8, Y8 // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Store 9 outputs VMOVDQU Y0, (SI) ADDQ $0x20, SI VMOVDQU Y1, (DI) ADDQ $0x20, DI VMOVDQU Y2, (R8) ADDQ $0x20, R8 VMOVDQU Y3, (R9) ADDQ $0x20, R9 VMOVDQU Y4, (R10) ADDQ $0x20, R10 VMOVDQU Y5, (R11) ADDQ $0x20, R11 VMOVDQU Y6, (R12) ADDQ $0x20, R12 VMOVDQU Y7, (R13) ADDQ $0x20, R13 VMOVDQU Y8, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ mulAvxTwo_1x9_loop VZEROUPPER mulAvxTwo_1x9_end: RET // func mulAvxTwo_1x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT 
·mulAvxTwo_1x10(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 35 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_1x10_end MOVQ in_base+24(FP), DX MOVQ (DX), DX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), DI MOVQ 48(BX), R8 MOVQ 72(BX), R9 MOVQ 96(BX), R10 MOVQ 120(BX), R11 MOVQ 144(BX), R12 MOVQ 168(BX), R13 MOVQ 192(BX), R14 MOVQ 216(BX), BX MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, BX // Add start offset to input ADDQ R15, DX MOVQ $0x0000000f, R15 MOVQ R15, X10 VPBROADCASTB X10, Y10 mulAvxTwo_1x10_loop: // Clear 10 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 VPXOR Y7, Y7, Y7 VPXOR Y8, Y8, Y8 VPXOR Y9, Y9, Y9 // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Store 10 outputs VMOVDQU Y0, (SI) ADDQ $0x20, SI VMOVDQU Y1, (DI) ADDQ $0x20, DI VMOVDQU Y2, (R8) ADDQ $0x20, R8 VMOVDQU Y3, (R9) ADDQ $0x20, R9 VMOVDQU Y4, (R10) ADDQ $0x20, R10 VMOVDQU Y5, (R11) ADDQ $0x20, R11 VMOVDQU Y6, (R12) ADDQ $0x20, R12 VMOVDQU Y7, (R13) ADDQ $0x20, R13 VMOVDQU Y8, (R14) ADDQ $0x20, R14 VMOVDQU Y9, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ mulAvxTwo_1x10_loop VZEROUPPER mulAvxTwo_1x10_end: RET // func mulAvxTwo_2x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x1(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 8 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_2x1_end VMOVDQU (CX), Y0 VMOVDQU 32(CX), Y1 VMOVDQU 64(CX), Y2 VMOVDQU 96(CX), Y3 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ (BX), BX MOVQ start+72(FP), SI // Add start offset to output ADDQ SI, BX // Add start offset to input ADDQ SI, DX ADDQ SI, CX MOVQ $0x0000000f, SI MOVQ SI, X5 VPBROADCASTB X5, Y5 mulAvxTwo_2x1_loop: // Clear 1 outputs VPXOR Y4, Y4, Y4 // Load 
and process 32 bytes from input 0 to 1 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX VPSRLQ $0x04, Y6, Y7 VPAND Y5, Y6, Y6 VPAND Y5, Y7, Y7 VPSHUFB Y6, Y0, Y6 VPSHUFB Y7, Y1, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (CX), Y6 ADDQ $0x20, CX VPSRLQ $0x04, Y6, Y7 VPAND Y5, Y6, Y6 VPAND Y5, Y7, Y7 VPSHUFB Y6, Y2, Y6 VPSHUFB Y7, Y3, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Store 1 outputs VMOVDQU Y4, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x1_loop VZEROUPPER mulAvxTwo_2x1_end: RET // func mulAvxTwo_2x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x1_64(SB), $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 8 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_2x1_64_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), AX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ start+72(FP), SI // Add start offset to input ADDQ SI, DX ADDQ SI, AX MOVQ $0x0000000f, DI MOVQ DI, X2 VPBROADCASTB X2, Y2 MOVQ n+80(FP), DI SHRQ $0x06, DI mulAvxTwo_2x1_64_loop: // Clear 1 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (AX), Y6 VMOVDQU 32(AX), Y5 ADDQ $0x40, AX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Store 1 outputs MOVQ (BX), R8 VMOVDQU Y0, (R8)(SI*1) VMOVDQU Y1, 32(R8)(SI*1) // Prepare for next loop ADDQ $0x40, SI DECQ DI JNZ mulAvxTwo_2x1_64_loop VZEROUPPER mulAvxTwo_2x1_64_end: RET // func mulAvxTwo_2x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x2(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 15 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_2x2_end VMOVDQU (CX), Y0 VMOVDQU 32(CX), Y1 VMOVDQU 64(CX), Y2 VMOVDQU 96(CX), Y3 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), CX MOVQ out_base+48(FP), BX MOVQ (BX), SI MOVQ 24(BX), BX MOVQ start+72(FP), DI // Add start offset to output ADDQ DI, SI ADDQ DI, BX // Add start offset to input ADDQ DI, DX ADDQ DI, CX MOVQ $0x0000000f, DI MOVQ DI, X10 VPBROADCASTB X10, Y10 mulAvxTwo_2x2_loop: // Clear 2 outputs VPXOR Y8, Y8, Y8 VPXOR Y9, Y9, Y9 // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VPSHUFB Y13, Y0, Y11 VPSHUFB Y14, Y1, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VPSHUFB Y13, Y2, Y11 VPSHUFB Y14, Y3, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (CX), Y13 ADDQ $0x20, CX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 
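// The VPAND above kept the low nibble of every input byte; the VPAND below
// masks the high nibble (already shifted down by 4 via VPSRLQ). Each nibble
// then indexes a 16-entry table with VPSHUFB, and the two partial products
// are XORed together — this is how these kernels multiply 32 bytes at a time
// by one matrix coefficient in GF(2^8).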
VPAND Y10, Y14, Y14 VPSHUFB Y13, Y4, Y11 VPSHUFB Y14, Y5, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VPSHUFB Y13, Y6, Y11 VPSHUFB Y14, Y7, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Store 2 outputs VMOVDQU Y8, (SI) ADDQ $0x20, SI VMOVDQU Y9, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x2_loop VZEROUPPER mulAvxTwo_2x2_end: RET // func mulAvxTwo_2x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x2_64(SB), $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 15 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_2x2_64_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), AX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ start+72(FP), SI // Add start offset to input ADDQ SI, DX ADDQ SI, AX MOVQ $0x0000000f, DI MOVQ DI, X4 VPBROADCASTB X4, Y4 MOVQ n+80(FP), DI SHRQ $0x06, DI mulAvxTwo_2x2_64_loop: // Clear 2 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (AX), Y9 VMOVDQU 32(AX), Y11 ADDQ $0x40, AX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Store 2 outputs MOVQ (BX), R8 VMOVDQU Y0, (R8)(SI*1) VMOVDQU Y1, 32(R8)(SI*1) MOVQ 24(BX), R8 VMOVDQU Y2, (R8)(SI*1) VMOVDQU Y3, 32(R8)(SI*1) // Prepare for next loop ADDQ $0x40, SI DECQ DI JNZ mulAvxTwo_2x2_64_loop VZEROUPPER mulAvxTwo_2x2_64_end: RET // func mulAvxTwo_2x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x3(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 20 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_2x3_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), SI MOVQ start+72(FP), R9 // Add start offset to output ADDQ R9, DI ADDQ R9, R8 ADDQ R9, SI // Add start offset to input ADDQ R9, BX ADDQ R9, DX MOVQ $0x0000000f, R9 MOVQ R9, X3 VPBROADCASTB X3, Y3 mulAvxTwo_2x3_loop: // Clear 3 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (BX), Y6 ADDQ $0x20, BX VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU (CX), Y4 VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 
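// The constant 0x0f has just been moved into X3; VPBROADCASTB below
// replicates it across all 32 byte lanes of Y3, producing the nibble mask
// that every VPAND in the loop uses.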
64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 192(CX), Y4 VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Store 3 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI VMOVDQU Y1, (R8) ADDQ $0x20, R8 VMOVDQU Y2, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x3_loop VZEROUPPER mulAvxTwo_2x3_end: RET // func mulAvxTwo_2x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x3_64(SB), $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 20 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_2x3_64_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), AX MOVQ out_base+48(FP), BX MOVQ out_base+48(FP), BX MOVQ start+72(FP), SI // Add start offset to input ADDQ SI, DX ADDQ SI, AX MOVQ $0x0000000f, DI MOVQ DI, X6 VPBROADCASTB X6, Y6 MOVQ n+80(FP), DI SHRQ $0x06, DI mulAvxTwo_2x3_64_loop: // Clear 3 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (AX), Y11 VMOVDQU 32(AX), Y13 ADDQ $0x40, AX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Store 3 outputs MOVQ (BX), R8 VMOVDQU Y0, (R8)(SI*1) VMOVDQU Y1, 32(R8)(SI*1) MOVQ 24(BX), R8 VMOVDQU Y2, (R8)(SI*1) VMOVDQU Y3, 32(R8)(SI*1) MOVQ 48(BX), R8 VMOVDQU Y4, (R8)(SI*1) VMOVDQU Y5, 32(R8)(SI*1) // Prepare for next loop ADDQ $0x40, SI DECQ DI JNZ mulAvxTwo_2x3_64_loop VZEROUPPER 
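// VZEROUPPER clears the upper halves of the YMM registers before returning
// to Go code, avoiding AVX-to-SSE transition penalties.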
mulAvxTwo_2x3_64_end: RET // func mulAvxTwo_2x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x4(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 25 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_2x4_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), SI MOVQ start+72(FP), R10 // Add start offset to output ADDQ R10, DI ADDQ R10, R8 ADDQ R10, R9 ADDQ R10, SI // Add start offset to input ADDQ R10, BX ADDQ R10, DX MOVQ $0x0000000f, R10 MOVQ R10, X4 VPBROADCASTB X4, Y4 mulAvxTwo_2x4_loop: // Clear 4 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Store 4 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI VMOVDQU Y1, (R8) ADDQ $0x20, R8 VMOVDQU Y2, (R9) ADDQ $0x20, R9 VMOVDQU Y3, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x4_loop VZEROUPPER mulAvxTwo_2x4_end: RET // func mulAvxTwo_2x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x5(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 30 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_2x5_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), SI MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, R10 ADDQ R11, SI // Add start offset to input ADDQ R11, BX ADDQ R11, DX MOVQ $0x0000000f, R11 MOVQ R11, X5 VPBROADCASTB X5, Y5 mulAvxTwo_2x5_loop: // Clear 5 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU (CX), Y6 VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, 
Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 320(CX), Y6 VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Store 5 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI VMOVDQU Y1, (R8) ADDQ $0x20, R8 VMOVDQU Y2, (R9) ADDQ $0x20, R9 VMOVDQU Y3, (R10) ADDQ $0x20, R10 VMOVDQU Y4, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x5_loop VZEROUPPER mulAvxTwo_2x5_end: RET // func mulAvxTwo_2x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x6(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 35 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_2x6_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), SI MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, R11 ADDQ R12, SI // Add start offset to input ADDQ R12, BX ADDQ R12, DX MOVQ $0x0000000f, R12 MOVQ R12, X6 VPBROADCASTB X6, Y6 mulAvxTwo_2x6_loop: // Clear 6 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 
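// The VPXOR above combined the low- and high-nibble lookups into a single
// product in Y7; the VPXOR below accumulates it into output register Y3,
// since addition in GF(2^8) is plain XOR.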
VPXOR Y7, Y3, Y3 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Store 6 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI VMOVDQU Y1, (R8) ADDQ $0x20, R8 VMOVDQU Y2, (R9) ADDQ $0x20, R9 VMOVDQU Y3, (R10) ADDQ $0x20, R10 VMOVDQU Y4, (R11) ADDQ $0x20, R11 VMOVDQU Y5, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x6_loop VZEROUPPER mulAvxTwo_2x6_end: RET // func mulAvxTwo_2x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x7(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 40 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_2x7_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 MOVQ 144(SI), SI MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, SI // Add start offset to input ADDQ R13, BX ADDQ R13, DX MOVQ $0x0000000f, R13 MOVQ R13, X7 VPBROADCASTB X7, Y7 mulAvxTwo_2x7_loop: // Clear 7 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 448(CX), Y8 VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Store 7 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI VMOVDQU Y1, (R8) ADDQ $0x20, R8 VMOVDQU Y2, (R9) ADDQ $0x20, R9 VMOVDQU Y3, (R10) ADDQ $0x20, R10 
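// The remaining outputs are flushed the same way; each destination pointer
// advances 32 bytes so the next iteration writes the following block.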
VMOVDQU Y4, (R11) ADDQ $0x20, R11 VMOVDQU Y5, (R12) ADDQ $0x20, R12 VMOVDQU Y6, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x7_loop VZEROUPPER mulAvxTwo_2x7_end: RET // func mulAvxTwo_2x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x8(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 45 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_2x8_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 MOVQ 144(SI), R13 MOVQ 168(SI), SI MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, SI // Add start offset to input ADDQ R14, BX ADDQ R14, DX MOVQ $0x0000000f, R14 MOVQ R14, X8 VPBROADCASTB X8, Y8 mulAvxTwo_2x8_loop: // Clear 8 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 VPXOR Y7, Y7, Y7 // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 512(CX), Y9 VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Store 8 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI VMOVDQU Y1, (R8) ADDQ $0x20, R8 VMOVDQU Y2, (R9) ADDQ $0x20, R9 VMOVDQU Y3, (R10) ADDQ $0x20, R10 VMOVDQU Y4, (R11) ADDQ 
$0x20, R11 VMOVDQU Y5, (R12) ADDQ $0x20, R12 VMOVDQU Y6, (R13) ADDQ $0x20, R13 VMOVDQU Y7, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x8_loop VZEROUPPER mulAvxTwo_2x8_end: RET // func mulAvxTwo_2x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x9(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 50 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_2x9_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 MOVQ 144(SI), R13 MOVQ 168(SI), R14 MOVQ 192(SI), SI MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, SI // Add start offset to input ADDQ R15, BX ADDQ R15, DX MOVQ $0x0000000f, R15 MOVQ R15, X9 VPBROADCASTB X9, Y9 mulAvxTwo_2x9_loop: // Clear 9 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 VPXOR Y7, Y7, Y7 VPXOR Y8, Y8, Y8 // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 576(CX), Y10 VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 
1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Store 9 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI VMOVDQU Y1, (R8) ADDQ $0x20, R8 VMOVDQU Y2, (R9) ADDQ $0x20, R9 VMOVDQU Y3, (R10) ADDQ $0x20, R10 VMOVDQU Y4, (R11) ADDQ $0x20, R11 VMOVDQU Y5, (R12) ADDQ $0x20, R12 VMOVDQU Y6, (R13) ADDQ $0x20, R13 VMOVDQU Y7, (R14) ADDQ $0x20, R14 VMOVDQU Y8, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x9_loop VZEROUPPER mulAvxTwo_2x9_end: RET // func mulAvxTwo_2x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x10(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 55 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_2x10_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), DX MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 MOVQ 144(SI), R13 MOVQ 168(SI), R14 MOVQ 192(SI), R15 MOVQ 216(SI), SI MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, SI // Add start offset to input ADDQ BP, BX ADDQ BP, DX MOVQ $0x0000000f, BP MOVQ BP, X10 VPBROADCASTB X10, Y10 mulAvxTwo_2x10_loop: // Clear 10 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 VPXOR Y7, Y7, Y7 VPXOR Y8, Y8, Y8 VPXOR Y9, Y9, Y9 // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 640(CX), Y11 VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 
768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Store 10 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI VMOVDQU Y1, (R8) ADDQ $0x20, R8 VMOVDQU Y2, (R9) ADDQ $0x20, R9 VMOVDQU Y3, (R10) ADDQ $0x20, R10 VMOVDQU Y4, (R11) ADDQ $0x20, R11 VMOVDQU Y5, (R12) ADDQ $0x20, R12 VMOVDQU Y6, (R13) ADDQ $0x20, R13 VMOVDQU Y7, (R14) ADDQ $0x20, R14 VMOVDQU Y8, (R15) ADDQ $0x20, R15 VMOVDQU Y9, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxTwo_2x10_loop VZEROUPPER mulAvxTwo_2x10_end: RET // func mulAvxTwo_3x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x1(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 10 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_3x1_end VMOVDQU (CX), Y0 VMOVDQU 32(CX), Y1 VMOVDQU 64(CX), Y2 VMOVDQU 96(CX), Y3 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), CX MOVQ out_base+48(FP), SI MOVQ (SI), SI MOVQ start+72(FP), DI // Add start offset to output ADDQ DI, SI // Add start offset to input ADDQ DI, DX ADDQ DI, BX ADDQ DI, CX MOVQ $0x0000000f, DI MOVQ DI, X7 VPBROADCASTB X7, Y7 mulAvxTwo_3x1_loop: // Clear 1 outputs VPXOR Y6, Y6, Y6 // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VPSRLQ $0x04, Y8, Y9 VPAND Y7, Y8, Y8 VPAND Y7, Y9, Y9 VPSHUFB Y8, Y0, Y8 VPSHUFB Y9, Y1, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX VPSRLQ $0x04, Y8, Y9 VPAND Y7, Y8, Y8 VPAND Y7, Y9, Y9 VPSHUFB Y8, Y2, Y8 VPSHUFB Y9, Y3, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (CX), Y8 ADDQ $0x20, CX VPSRLQ $0x04, Y8, Y9 VPAND Y7, Y8, Y8 VPAND Y7, Y9, Y9 VPSHUFB Y8, Y4, Y8 VPSHUFB Y9, Y5, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Store 1 outputs VMOVDQU Y6, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX JNZ mulAvxTwo_3x1_loop VZEROUPPER mulAvxTwo_3x1_end: RET // func mulAvxTwo_3x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x1_64(SB), $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 10 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_3x1_64_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), AX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ start+72(FP), DI // Add start offset to input 
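// (Only the inputs are adjusted here: the _64 kernels keep the output
// pointers on the stack and index them at store time with (R9)(DI*1), so DI
// doubles as the running output offset. The doubled
// MOVQ out_base+48(FP), SI above is redundant but harmless — the second load
// overwrites the first with the same value.)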
ADDQ DI, DX ADDQ DI, BX ADDQ DI, AX MOVQ $0x0000000f, R8 MOVQ R8, X2 VPBROADCASTB X2, Y2 MOVQ n+80(FP), R8 SHRQ $0x06, R8 mulAvxTwo_3x1_64_loop: // Clear 1 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (BX), Y6 VMOVDQU 32(BX), Y5 ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (AX), Y6 VMOVDQU 32(AX), Y5 ADDQ $0x40, AX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Store 1 outputs MOVQ (SI), R9 VMOVDQU Y0, (R9)(DI*1) VMOVDQU Y1, 32(R9)(DI*1) // Prepare for next loop ADDQ $0x40, DI DECQ R8 JNZ mulAvxTwo_3x1_64_loop VZEROUPPER mulAvxTwo_3x1_64_end: RET // func mulAvxTwo_3x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x2(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 19 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_3x2_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), DI MOVQ start+72(FP), R9 // Add start offset to output ADDQ R9, R8 ADDQ R9, DI // Add start offset to input ADDQ R9, BX ADDQ R9, SI ADDQ R9, DX MOVQ $0x0000000f, R9 MOVQ R9, X2 VPBROADCASTB X2, Y2 mulAvxTwo_3x2_loop: // Clear 2 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (BX), Y5 ADDQ $0x20, BX VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 ADDQ $0x20, SI VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 256(CX), Y3 VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Store 2 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 VMOVDQU Y1, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ 
mulAvxTwo_3x2_loop VZEROUPPER mulAvxTwo_3x2_end: RET // func mulAvxTwo_3x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x2_64(SB), $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 19 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_3x2_64_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), AX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ start+72(FP), DI // Add start offset to input ADDQ DI, DX ADDQ DI, BX ADDQ DI, AX MOVQ $0x0000000f, R8 MOVQ R8, X4 VPBROADCASTB X4, Y4 MOVQ n+80(FP), R8 SHRQ $0x06, R8 mulAvxTwo_3x2_64_loop: // Clear 2 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (BX), Y9 VMOVDQU 32(BX), Y11 ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (AX), Y9 VMOVDQU 32(AX), Y11 ADDQ $0x40, AX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Store 2 outputs MOVQ (SI), R9 VMOVDQU Y0, (R9)(DI*1) VMOVDQU Y1, 32(R9)(DI*1) MOVQ 24(SI), R9 VMOVDQU Y2, (R9)(DI*1) VMOVDQU Y3, 32(R9)(DI*1) // Prepare for next loop ADDQ $0x40, DI DECQ R8 JNZ mulAvxTwo_3x2_64_loop VZEROUPPER mulAvxTwo_3x2_64_end: RET // func mulAvxTwo_3x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x3(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 26 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_3x3_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), DI MOVQ start+72(FP), R10 // Add start offset to output ADDQ R10, R8 ADDQ R10, R9 ADDQ R10, DI // Add start offset to input ADDQ R10, BX ADDQ R10, SI ADDQ R10, DX MOVQ $0x0000000f, R10 MOVQ R10, X3 
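// The matrix argument packs 64 bytes of lookup tables per (input, output)
// pair: 32 bytes for low nibbles, then 32 for high. With 3 inputs and 3
// outputs the loop below walks offsets 0 through 544(CX) in 64-byte strides,
// i.e. offset = 64*(input*3 + output).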
VPBROADCASTB X3, Y3 mulAvxTwo_3x3_loop: // Clear 3 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (BX), Y6 ADDQ $0x20, BX VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU (CX), Y4 VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 ADDQ $0x20, SI VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 192(CX), Y4 VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 384(CX), Y4 VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Store 3 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 VMOVDQU Y1, (R9) ADDQ $0x20, R9 VMOVDQU Y2, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxTwo_3x3_loop VZEROUPPER mulAvxTwo_3x3_end: RET // func mulAvxTwo_3x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x3_64(SB), $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 26 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_3x3_64_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), AX MOVQ out_base+48(FP), SI MOVQ out_base+48(FP), SI MOVQ start+72(FP), DI // Add start offset to input ADDQ DI, DX ADDQ DI, BX ADDQ DI, AX MOVQ $0x0000000f, R8 MOVQ R8, X6 VPBROADCASTB X6, Y6 MOVQ n+80(FP), R8 SHRQ $0x06, R8 mulAvxTwo_3x3_64_loop: // Clear 3 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (BX), Y11 VMOVDQU 32(BX), Y13 ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 
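// The _64 variants consume two 32-byte vectors per input per iteration
// (paired VMOVDQU loads, ADDQ $0x40), halving loop overhead; matching this,
// n was shifted right by 6 rather than 5 when computing the trip count.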
VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (AX), Y11 VMOVDQU 32(AX), Y13 ADDQ $0x40, AX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Store 3 outputs MOVQ (SI), R9 VMOVDQU Y0, (R9)(DI*1) VMOVDQU Y1, 32(R9)(DI*1) MOVQ 24(SI), R9 VMOVDQU Y2, (R9)(DI*1) VMOVDQU Y3, 32(R9)(DI*1) MOVQ 48(SI), R9 VMOVDQU Y4, (R9)(DI*1) VMOVDQU Y5, 32(R9)(DI*1) // Prepare for next loop ADDQ $0x40, DI DECQ R8 JNZ mulAvxTwo_3x3_64_loop VZEROUPPER mulAvxTwo_3x3_64_end: RET // func mulAvxTwo_3x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x4(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 33 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_3x4_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), DI MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, R10 ADDQ R11, DI // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DX MOVQ $0x0000000f, R11 MOVQ R11, X4 VPBROADCASTB X4, Y4 mulAvxTwo_3x4_loop: // Clear 4 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 
VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Store 4 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 VMOVDQU Y1, (R9) ADDQ $0x20, R9 VMOVDQU Y2, (R10) ADDQ $0x20, R10 VMOVDQU Y3, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxTwo_3x4_loop VZEROUPPER mulAvxTwo_3x4_end: RET // func mulAvxTwo_3x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x5(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 40 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_3x5_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), DI MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, R11 ADDQ R12, DI // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DX MOVQ $0x0000000f, R12 MOVQ R12, X5 VPBROADCASTB X5, Y5 mulAvxTwo_3x5_loop: // Clear 5 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU (CX), Y6 VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 320(CX), Y6 VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 640(CX), Y6 VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 
704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Store 5 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 VMOVDQU Y1, (R9) ADDQ $0x20, R9 VMOVDQU Y2, (R10) ADDQ $0x20, R10 VMOVDQU Y3, (R11) ADDQ $0x20, R11 VMOVDQU Y4, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxTwo_3x5_loop VZEROUPPER mulAvxTwo_3x5_end: RET // func mulAvxTwo_3x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x6(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 47 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_3x6_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), DI MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, DI // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DX MOVQ $0x0000000f, R13 MOVQ R13, X6 VPBROADCASTB X6, Y6 mulAvxTwo_3x6_loop: // Clear 6 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, 
Y0 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Store 6 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 VMOVDQU Y1, (R9) ADDQ $0x20, R9 VMOVDQU Y2, (R10) ADDQ $0x20, R10 VMOVDQU Y3, (R11) ADDQ $0x20, R11 VMOVDQU Y4, (R12) ADDQ $0x20, R12 VMOVDQU Y5, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxTwo_3x6_loop VZEROUPPER mulAvxTwo_3x6_end: RET // func mulAvxTwo_3x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x7(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 54 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_3x7_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), R13 MOVQ 144(DI), DI MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, DI // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DX MOVQ $0x0000000f, R14 MOVQ R14, X7 VPBROADCASTB X7, Y7 mulAvxTwo_3x7_loop: // Clear 7 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 448(CX), Y8 VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 768(CX), Y8 
VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 896(CX), Y8 VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Store 7 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 VMOVDQU Y1, (R9) ADDQ $0x20, R9 VMOVDQU Y2, (R10) ADDQ $0x20, R10 VMOVDQU Y3, (R11) ADDQ $0x20, R11 VMOVDQU Y4, (R12) ADDQ $0x20, R12 VMOVDQU Y5, (R13) ADDQ $0x20, R13 VMOVDQU Y6, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxTwo_3x7_loop VZEROUPPER mulAvxTwo_3x7_end: RET // func mulAvxTwo_3x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x8(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 61 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_3x8_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), R13 MOVQ 144(DI), R14 MOVQ 168(DI), DI MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, DI // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DX MOVQ $0x0000000f, R15 MOVQ R15, X8 VPBROADCASTB X8, Y8 mulAvxTwo_3x8_loop: // Clear 8 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 VPXOR Y7, Y7, Y7 // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 448(CX), Y9 VMOVDQU 
480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 512(CX), Y9 VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1024(CX), Y9 VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Store 8 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 VMOVDQU Y1, (R9) ADDQ $0x20, R9 VMOVDQU Y2, (R10) ADDQ $0x20, R10 VMOVDQU Y3, (R11) ADDQ $0x20, R11 VMOVDQU Y4, (R12) ADDQ $0x20, R12 VMOVDQU Y5, (R13) ADDQ $0x20, R13 VMOVDQU Y6, (R14) ADDQ $0x20, R14 VMOVDQU Y7, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxTwo_3x8_loop VZEROUPPER mulAvxTwo_3x8_end: RET // func mulAvxTwo_3x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x9(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 68 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_3x9_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DX MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), R13 MOVQ 144(DI), R14 MOVQ 168(DI), R15 MOVQ 192(DI), DI MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, DI // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DX MOVQ $0x0000000f, BP MOVQ BP, X9 VPBROADCASTB X9, Y9 
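// The loop below performs GF(2^8) multiply-accumulate with the
// split-nibble PSHUFB technique used throughout this file: each source
// byte is split into its low nibble (VPAND with the broadcast 0x0f mask
// in Y9) and high nibble (VPSRLQ $0x04, then VPAND); each nibble indexes
// a 16-entry table replicated across both YMM lanes, and the two lookups
// are XORed together and into the output accumulator. Roughly, per byte
// b of input i and output j (a scalar sketch with made-up names):
//
//	out[j] ^= tblLo[i][j][b&0x0f] ^ tblHi[i][j][b>>4]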
mulAvxTwo_3x9_loop: // Clear 9 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 VPXOR Y7, Y7, Y7 VPXOR Y8, Y8, Y8 // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 576(CX), Y10 VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1152(CX), Y10 VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB 
Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Store 9 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 VMOVDQU Y1, (R9) ADDQ $0x20, R9 VMOVDQU Y2, (R10) ADDQ $0x20, R10 VMOVDQU Y3, (R11) ADDQ $0x20, R11 VMOVDQU Y4, (R12) ADDQ $0x20, R12 VMOVDQU Y5, (R13) ADDQ $0x20, R13 VMOVDQU Y6, (R14) ADDQ $0x20, R14 VMOVDQU Y7, (R15) ADDQ $0x20, R15 VMOVDQU Y8, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxTwo_3x9_loop VZEROUPPER mulAvxTwo_3x9_end: RET // func mulAvxTwo_3x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x10(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 75 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_3x10_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), AX MOVQ out_base+48(FP), SI MOVQ (SI), DI MOVQ 24(SI), R8 MOVQ 48(SI), R9 MOVQ 72(SI), R10 MOVQ 96(SI), R11 MOVQ 120(SI), R12 MOVQ 144(SI), R13 MOVQ 168(SI), R14 MOVQ 192(SI), R15 MOVQ 216(SI), SI MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, SI // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, AX MOVQ $0x0000000f, BP MOVQ BP, X10 VPBROADCASTB X10, Y10 MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxTwo_3x10_loop: // Clear 10 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 VPXOR Y7, Y7, Y7 VPXOR Y8, Y8, Y8 VPXOR Y9, Y9, Y9 // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, 
Y14, Y14 VMOVDQU 640(CX), Y11 VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (AX), Y13 ADDQ $0x20, AX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1280(CX), Y11 VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Store 10 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI VMOVDQU Y1, (R8) ADDQ $0x20, R8 VMOVDQU Y2, (R9) ADDQ $0x20, R9 VMOVDQU Y3, (R10) ADDQ $0x20, R10 VMOVDQU Y4, (R11) ADDQ $0x20, R11 VMOVDQU Y5, (R12) ADDQ $0x20, R12 VMOVDQU Y6, (R13) ADDQ $0x20, R13 VMOVDQU Y7, (R14) ADDQ $0x20, R14 VMOVDQU Y8, (R15) ADDQ $0x20, R15 VMOVDQU Y9, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ BP JNZ mulAvxTwo_3x10_loop VZEROUPPER mulAvxTwo_3x10_end: RET // func mulAvxTwo_4x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x1(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 12 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_4x1_end VMOVDQU (CX), Y0 VMOVDQU 32(CX), Y1 VMOVDQU 64(CX), Y2 VMOVDQU 96(CX), Y3 VMOVDQU 
128(CX), Y4 VMOVDQU 160(CX), Y5 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 MOVQ in_base+24(FP), CX MOVQ (CX), DX MOVQ 24(CX), BX MOVQ 48(CX), SI MOVQ 72(CX), CX MOVQ out_base+48(FP), DI MOVQ (DI), DI MOVQ start+72(FP), R8 // Add start offset to output ADDQ R8, DI // Add start offset to input ADDQ R8, DX ADDQ R8, BX ADDQ R8, SI ADDQ R8, CX MOVQ $0x0000000f, R8 MOVQ R8, X9 VPBROADCASTB X9, Y9 mulAvxTwo_4x1_loop: // Clear 1 outputs VPXOR Y8, Y8, Y8 // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y9, Y10, Y10 VPAND Y9, Y11, Y11 VPSHUFB Y10, Y0, Y10 VPSHUFB Y11, Y1, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VPSRLQ $0x04, Y10, Y11 VPAND Y9, Y10, Y10 VPAND Y9, Y11, Y11 VPSHUFB Y10, Y2, Y10 VPSHUFB Y11, Y3, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI VPSRLQ $0x04, Y10, Y11 VPAND Y9, Y10, Y10 VPAND Y9, Y11, Y11 VPSHUFB Y10, Y4, Y10 VPSHUFB Y11, Y5, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (CX), Y10 ADDQ $0x20, CX VPSRLQ $0x04, Y10, Y11 VPAND Y9, Y10, Y10 VPAND Y9, Y11, Y11 VPSHUFB Y10, Y6, Y10 VPSHUFB Y11, Y7, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Store 1 outputs VMOVDQU Y8, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX JNZ mulAvxTwo_4x1_loop VZEROUPPER mulAvxTwo_4x1_end: RET // func mulAvxTwo_4x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x1_64(SB), $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 12 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_4x1_64_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), AX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ start+72(FP), R8 // Add start offset to input ADDQ R8, DX ADDQ R8, BX ADDQ R8, SI ADDQ R8, AX MOVQ $0x0000000f, R9 MOVQ R9, X2 VPBROADCASTB X2, Y2 MOVQ n+80(FP), R9 SHRQ $0x06, R9 mulAvxTwo_4x1_64_loop: // Clear 1 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (BX), Y6 VMOVDQU 32(BX), Y5 ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (AX), Y6 VMOVDQU 32(AX), Y5 ADDQ $0x40, AX 
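// This _64 variant processes 64 bytes per iteration: two YMM registers
// are loaded per input (note SHRQ $0x06 on n above, rather than the
// $0x05 of the 32-byte kernels), and the nibble split and table lookups
// below are simply carried out on both halves before storing two
// registers per output.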
VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Store 1 outputs MOVQ (DI), R10 VMOVDQU Y0, (R10)(R8*1) VMOVDQU Y1, 32(R10)(R8*1) // Prepare for next loop ADDQ $0x40, R8 DECQ R9 JNZ mulAvxTwo_4x1_64_loop VZEROUPPER mulAvxTwo_4x1_64_end: RET // func mulAvxTwo_4x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x2(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 23 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_4x2_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R8 MOVQ start+72(FP), R10 // Add start offset to output ADDQ R10, R9 ADDQ R10, R8 // Add start offset to input ADDQ R10, BX ADDQ R10, SI ADDQ R10, DI ADDQ R10, DX MOVQ $0x0000000f, R10 MOVQ R10, X2 VPBROADCASTB X2, Y2 mulAvxTwo_4x2_loop: // Clear 2 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (BX), Y5 ADDQ $0x20, BX VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 ADDQ $0x20, SI VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 256(CX), Y3 VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 384(CX), Y3 VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Store 2 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 VMOVDQU Y1, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxTwo_4x2_loop VZEROUPPER mulAvxTwo_4x2_end: RET // func mulAvxTwo_4x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x2_64(SB), $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 23 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_4x2_64_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), AX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ start+72(FP), R8 // Add start offset to input ADDQ R8, DX ADDQ R8, BX ADDQ R8, SI ADDQ R8, AX MOVQ $0x0000000f, R9 MOVQ R9, X4 VPBROADCASTB X4, Y4 MOVQ n+80(FP), 
R9 SHRQ $0x06, R9 mulAvxTwo_4x2_64_loop: // Clear 2 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (BX), Y9 VMOVDQU 32(BX), Y11 ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (AX), Y9 VMOVDQU 32(AX), Y11 ADDQ $0x40, AX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Store 2 outputs MOVQ (DI), R10 VMOVDQU Y0, (R10)(R8*1) VMOVDQU Y1, 32(R10)(R8*1) MOVQ 24(DI), R10 VMOVDQU Y2, (R10)(R8*1) VMOVDQU Y3, 32(R10)(R8*1) // Prepare for next loop ADDQ $0x40, R8 DECQ R9 JNZ mulAvxTwo_4x2_64_loop VZEROUPPER mulAvxTwo_4x2_64_end: RET // func mulAvxTwo_4x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x3(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 32 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_4x3_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R8 MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, R9 ADDQ R11, R10 ADDQ R11, R8 // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, DX MOVQ $0x0000000f, R11 MOVQ R11, X3 VPBROADCASTB X3, Y3 mulAvxTwo_4x3_loop: // Clear 
3 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (BX), Y6 ADDQ $0x20, BX VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU (CX), Y4 VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 ADDQ $0x20, SI VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 192(CX), Y4 VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 384(CX), Y4 VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 576(CX), Y4 VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Store 3 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 VMOVDQU Y1, (R10) ADDQ $0x20, R10 VMOVDQU Y2, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxTwo_4x3_loop VZEROUPPER mulAvxTwo_4x3_end: RET // func mulAvxTwo_4x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x3_64(SB), $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 32 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_4x3_64_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), AX MOVQ out_base+48(FP), DI MOVQ out_base+48(FP), DI MOVQ start+72(FP), R8 // Add start offset to input ADDQ R8, DX ADDQ R8, BX ADDQ R8, SI ADDQ R8, AX MOVQ $0x0000000f, R9 MOVQ R9, X6 VPBROADCASTB X6, Y6 MOVQ n+80(FP), R9 SHRQ $0x06, R9 mulAvxTwo_4x3_64_loop: // Clear 3 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, 
Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (BX), Y11 VMOVDQU 32(BX), Y13 ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (AX), Y11 VMOVDQU 32(AX), Y13 ADDQ $0x40, AX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Store 3 outputs MOVQ (DI), R10 VMOVDQU Y0, (R10)(R8*1) VMOVDQU Y1, 32(R10)(R8*1) MOVQ 24(DI), R10 VMOVDQU Y2, (R10)(R8*1) VMOVDQU Y3, 32(R10)(R8*1) MOVQ 48(DI), R10 VMOVDQU Y4, (R10)(R8*1) VMOVDQU Y5, 32(R10)(R8*1) // Prepare for next loop ADDQ $0x40, R8 DECQ R9 JNZ mulAvxTwo_4x3_64_loop VZEROUPPER mulAvxTwo_4x3_64_end: RET // func mulAvxTwo_4x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x4(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 41 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_4x4_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R8 MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, R9 
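// start+72(FP) is the byte offset already handled in every shard; it is
// added to each input and output pointer so that a caller can carve the
// shards into independent ranges (for instance, one per goroutine) and
// run this kernel on each range separately.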
ADDQ R12, R10 ADDQ R12, R11 ADDQ R12, R8 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, DX MOVQ $0x0000000f, R12 MOVQ R12, X4 VPBROADCASTB X4, Y4 mulAvxTwo_4x4_loop: // Clear 4 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 768(CX), Y5 VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Store 4 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 VMOVDQU Y1, (R10) ADDQ $0x20, R10 VMOVDQU Y2, (R11) ADDQ $0x20, R11 VMOVDQU Y3, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxTwo_4x4_loop VZEROUPPER mulAvxTwo_4x4_end: RET // func mulAvxTwo_4x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x5(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 50 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_4x5_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R12 MOVQ 96(R8), R8 MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, R8 // Add 
start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, DX MOVQ $0x0000000f, R13 MOVQ R13, X5 VPBROADCASTB X5, Y5 mulAvxTwo_4x5_loop: // Clear 5 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU (CX), Y6 VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 320(CX), Y6 VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 640(CX), Y6 VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 960(CX), Y6 VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Store 5 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 VMOVDQU Y1, (R10) ADDQ $0x20, R10 VMOVDQU Y2, (R11) ADDQ $0x20, R11 VMOVDQU Y3, (R12) ADDQ $0x20, R12 VMOVDQU Y4, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxTwo_4x5_loop VZEROUPPER mulAvxTwo_4x5_end: RET // func mulAvxTwo_4x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x6(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP 
registers // Full registers estimated 59 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_4x6_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R12 MOVQ 96(R8), R13 MOVQ 120(R8), R8 MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, R8 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, DX MOVQ $0x0000000f, R14 MOVQ R14, X6 VPBROADCASTB X6, Y6 mulAvxTwo_4x6_loop: // Clear 6 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 
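// At this point Y7 holds the low-nibble lookup for this (input, output)
// pair; the matching high-nibble VPSHUFB follows, and the VPXOR pair
// merges both halves and accumulates the result into the output register.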
VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Store 6 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 VMOVDQU Y1, (R10) ADDQ $0x20, R10 VMOVDQU Y2, (R11) ADDQ $0x20, R11 VMOVDQU Y3, (R12) ADDQ $0x20, R12 VMOVDQU Y4, (R13) ADDQ $0x20, R13 VMOVDQU Y5, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxTwo_4x6_loop VZEROUPPER mulAvxTwo_4x6_end: RET // func mulAvxTwo_4x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x7(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 68 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_4x7_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R12 MOVQ 96(R8), R13 MOVQ 120(R8), R14 MOVQ 144(R8), R8 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R8 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, DX MOVQ $0x0000000f, R15 MOVQ R15, X7 VPBROADCASTB X7, Y7 mulAvxTwo_4x7_loop: // Clear 7 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 448(CX), Y8 VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 
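// Table layout, visible from the offsets: the matrix argument packs two
// 32-byte tables per (input, output) pair in input-major order, so in
// this 4x7 kernel the pair for input i and output j begins at byte
// offset 64*(i*7+j); the 768/800 pair just loaded is input 1, output 5.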
VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 896(CX), Y8 VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1344(CX), Y8 VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Store 7 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 VMOVDQU Y1, (R10) ADDQ $0x20, R10 VMOVDQU Y2, (R11) ADDQ $0x20, R11 VMOVDQU Y3, (R12) ADDQ $0x20, R12 VMOVDQU Y4, (R13) ADDQ $0x20, R13 VMOVDQU Y5, (R14) ADDQ $0x20, R14 VMOVDQU Y6, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxTwo_4x7_loop VZEROUPPER mulAvxTwo_4x7_end: RET // func mulAvxTwo_4x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x8(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 77 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_4x8_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), DX MOVQ out_base+48(FP), R8 MOVQ (R8), R9 MOVQ 24(R8), R10 MOVQ 48(R8), R11 MOVQ 72(R8), R12 MOVQ 96(R8), R13 MOVQ 120(R8), R14 MOVQ 144(R8), R15 MOVQ 168(R8), R8 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R8 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, DX MOVQ $0x0000000f, BP MOVQ BP, X8 VPBROADCASTB X8, Y8 mulAvxTwo_4x8_loop: // Clear 8 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 VPXOR Y7, Y7, Y7 // Load and process 32 bytes from input 0 to 
8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 512(CX), Y9 VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1024(CX), Y9 VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1536(CX), Y9 VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), 
Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Store 8 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 VMOVDQU Y1, (R10) ADDQ $0x20, R10 VMOVDQU Y2, (R11) ADDQ $0x20, R11 VMOVDQU Y3, (R12) ADDQ $0x20, R12 VMOVDQU Y4, (R13) ADDQ $0x20, R13 VMOVDQU Y5, (R14) ADDQ $0x20, R14 VMOVDQU Y6, (R15) ADDQ $0x20, R15 VMOVDQU Y7, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX JNZ mulAvxTwo_4x8_loop VZEROUPPER mulAvxTwo_4x8_end: RET // func mulAvxTwo_4x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x9(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 86 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_4x9_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), AX MOVQ out_base+48(FP), DI MOVQ (DI), R8 MOVQ 24(DI), R9 MOVQ 48(DI), R10 MOVQ 72(DI), R11 MOVQ 96(DI), R12 MOVQ 120(DI), R13 MOVQ 144(DI), R14 MOVQ 168(DI), R15 MOVQ 192(DI), DI MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, DI // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, AX MOVQ $0x0000000f, BP MOVQ BP, X9 VPBROADCASTB X9, Y9 MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxTwo_4x9_loop: // Clear 9 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 VPXOR Y7, Y7, Y7 VPXOR Y8, Y8, Y8 // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, 
// func mulAvxTwo_4x9(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, SSE2
TEXT ·mulAvxTwo_4x9(SB), NOSPLIT, $8-88
	// Loading no tables to registers
	// Destination kept in GP registers
	// Full registers estimated 86 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxTwo_4x9_end
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), AX
	MOVQ out_base+48(FP), DI
	MOVQ (DI), R8
	MOVQ 24(DI), R9
	MOVQ 48(DI), R10
	MOVQ 72(DI), R11
	MOVQ 96(DI), R12
	MOVQ 120(DI), R13
	MOVQ 144(DI), R14
	MOVQ 168(DI), R15
	MOVQ 192(DI), DI
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R8
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, DI

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, AX
	MOVQ $0x0000000f, BP
	MOVQ BP, X9
	VPBROADCASTB X9, Y9
	MOVQ n+80(FP), BP
	SHRQ $0x05, BP

mulAvxTwo_4x9_loop:
	// Clear 9 outputs
	VPXOR Y0, Y0, Y0
	VPXOR Y1, Y1, Y1
	VPXOR Y2, Y2, Y2
	VPXOR Y3, Y3, Y3
	VPXOR Y4, Y4, Y4
	VPXOR Y5, Y5, Y5
	VPXOR Y6, Y6, Y6
	VPXOR Y7, Y7, Y7
	VPXOR Y8, Y8, Y8

	// Load and process 32 bytes from input 0 to 9 outputs
	VMOVDQU (DX), Y12
	ADDQ $0x20, DX
	VPSRLQ $0x04, Y12, Y13
	VPAND Y9, Y12, Y12
	VPAND Y9, Y13, Y13
	VMOVDQU (CX), Y10
	VMOVDQU 32(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y0, Y0
	VMOVDQU 64(CX), Y10
	VMOVDQU 96(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y1, Y1
	VMOVDQU 128(CX), Y10
	VMOVDQU 160(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y2, Y2
	VMOVDQU 192(CX), Y10
	VMOVDQU 224(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y3, Y3
	VMOVDQU 256(CX), Y10
	VMOVDQU 288(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y4, Y4
	VMOVDQU 320(CX), Y10
	VMOVDQU 352(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y5, Y5
	VMOVDQU 384(CX), Y10
	VMOVDQU 416(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y6, Y6
	VMOVDQU 448(CX), Y10
	VMOVDQU 480(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y7, Y7
	VMOVDQU 512(CX), Y10
	VMOVDQU 544(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y8, Y8

	// Load and process 32 bytes from input 1 to 9 outputs
	VMOVDQU (BX), Y12
	ADDQ $0x20, BX
	VPSRLQ $0x04, Y12, Y13
	VPAND Y9, Y12, Y12
	VPAND Y9, Y13, Y13
	VMOVDQU 576(CX), Y10
	VMOVDQU 608(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y0, Y0
	VMOVDQU 640(CX), Y10
	VMOVDQU 672(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y1, Y1
	VMOVDQU 704(CX), Y10
	VMOVDQU 736(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y2, Y2
	VMOVDQU 768(CX), Y10
	VMOVDQU 800(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y3, Y3
	VMOVDQU 832(CX), Y10
	VMOVDQU 864(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y4, Y4
	VMOVDQU 896(CX), Y10
	VMOVDQU 928(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y5, Y5
	VMOVDQU 960(CX), Y10
	VMOVDQU 992(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y6, Y6
	VMOVDQU 1024(CX), Y10
	VMOVDQU 1056(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y7, Y7
	VMOVDQU 1088(CX), Y10
	VMOVDQU 1120(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y8, Y8

	// Load and process 32 bytes from input 2 to 9 outputs
	VMOVDQU (SI), Y12
	ADDQ $0x20, SI
	VPSRLQ $0x04, Y12, Y13
	VPAND Y9, Y12, Y12
	VPAND Y9, Y13, Y13
	VMOVDQU 1152(CX), Y10
	VMOVDQU 1184(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y0, Y0
	VMOVDQU 1216(CX), Y10
	VMOVDQU 1248(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y1, Y1
	VMOVDQU 1280(CX), Y10
	VMOVDQU 1312(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y2, Y2
	VMOVDQU 1344(CX), Y10
	VMOVDQU 1376(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y3, Y3
	VMOVDQU 1408(CX), Y10
	VMOVDQU 1440(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y4, Y4
	VMOVDQU 1472(CX), Y10
	VMOVDQU 1504(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y5, Y5
	VMOVDQU 1536(CX), Y10
	VMOVDQU 1568(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y6, Y6
	VMOVDQU 1600(CX), Y10
	VMOVDQU 1632(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y7, Y7
	VMOVDQU 1664(CX), Y10
	VMOVDQU 1696(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y8, Y8

	// Load and process 32 bytes from input 3 to 9 outputs
	VMOVDQU (AX), Y12
	ADDQ $0x20, AX
	VPSRLQ $0x04, Y12, Y13
	VPAND Y9, Y12, Y12
	VPAND Y9, Y13, Y13
	VMOVDQU 1728(CX), Y10
	VMOVDQU 1760(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y0, Y0
	VMOVDQU 1792(CX), Y10
	VMOVDQU 1824(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y1, Y1
	VMOVDQU 1856(CX), Y10
	VMOVDQU 1888(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y2, Y2
	VMOVDQU 1920(CX), Y10
	VMOVDQU 1952(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y3, Y3
	VMOVDQU 1984(CX), Y10
	VMOVDQU 2016(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y4, Y4
	VMOVDQU 2048(CX), Y10
	VMOVDQU 2080(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y5, Y5
	VMOVDQU 2112(CX), Y10
	VMOVDQU 2144(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y6, Y6
	VMOVDQU 2176(CX), Y10
	VMOVDQU 2208(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y7, Y7
	VMOVDQU 2240(CX), Y10
	VMOVDQU 2272(CX), Y11
	VPSHUFB Y12, Y10, Y10
	VPSHUFB Y13, Y11, Y11
	VPXOR Y10, Y11, Y10
	VPXOR Y10, Y8, Y8

	// Store 9 outputs
	VMOVDQU Y0, (R8)
	ADDQ $0x20, R8
	VMOVDQU Y1, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y2, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y3, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y4, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y5, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y6, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y7, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y8, (DI)
	ADDQ $0x20, DI

	// Prepare for next loop
	DECQ BP
	JNZ mulAvxTwo_4x9_loop
	VZEROUPPER

mulAvxTwo_4x9_end:
	RET
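// Editor's note: with ten outputs there are not enough general purpose
// registers left to pin every output pointer, so the next kernel keeps the
// destinations "on stack": each loop iteration reloads the output pointers
// from out_base (R8) and stores through the running start offset kept in
// R9, instead of incrementing a dedicated pointer register per output.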
// func mulAvxTwo_4x10(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, SSE2
TEXT ·mulAvxTwo_4x10(SB), NOSPLIT, $0-88
	// Loading no tables to registers
	// Destination kept on stack
	// Full registers estimated 95 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxTwo_4x10_end
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), DX
	MOVQ out_base+48(FP), R8
	MOVQ start+72(FP), R9

	// Add start offset to input
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, DI
	ADDQ R9, DX
	MOVQ $0x0000000f, R10
	MOVQ R10, X10
	VPBROADCASTB X10, Y10

mulAvxTwo_4x10_loop:
	// Clear 10 outputs
	VPXOR Y0, Y0, Y0
	VPXOR Y1, Y1, Y1
	VPXOR Y2, Y2, Y2
	VPXOR Y3, Y3, Y3
	VPXOR Y4, Y4, Y4
	VPXOR Y5, Y5, Y5
	VPXOR Y6, Y6, Y6
	VPXOR Y7, Y7, Y7
	VPXOR Y8, Y8, Y8
	VPXOR Y9, Y9, Y9

	// Load and process 32 bytes from input 0 to 10 outputs
	VMOVDQU (BX), Y13
	ADDQ $0x20, BX
	VPSRLQ $0x04, Y13, Y14
	VPAND Y10, Y13, Y13
	VPAND Y10, Y14, Y14
	VMOVDQU (CX), Y11
	VMOVDQU 32(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y0, Y0
	VMOVDQU 64(CX), Y11
	VMOVDQU 96(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y1, Y1
	VMOVDQU 128(CX), Y11
	VMOVDQU 160(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y2, Y2
	VMOVDQU 192(CX), Y11
	VMOVDQU 224(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y3, Y3
	VMOVDQU 256(CX), Y11
	VMOVDQU 288(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y4, Y4
	VMOVDQU 320(CX), Y11
	VMOVDQU 352(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y5, Y5
	VMOVDQU 384(CX), Y11
	VMOVDQU 416(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y6, Y6
	VMOVDQU 448(CX), Y11
	VMOVDQU 480(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y7, Y7
	VMOVDQU 512(CX), Y11
	VMOVDQU 544(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y8, Y8
	VMOVDQU 576(CX), Y11
	VMOVDQU 608(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y9, Y9

	// Load and process 32 bytes from input 1 to 10 outputs
	VMOVDQU (SI), Y13
	ADDQ $0x20, SI
	VPSRLQ $0x04, Y13, Y14
	VPAND Y10, Y13, Y13
	VPAND Y10, Y14, Y14
	VMOVDQU 640(CX), Y11
	VMOVDQU 672(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y0, Y0
	VMOVDQU 704(CX), Y11
	VMOVDQU 736(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y1, Y1
	VMOVDQU 768(CX), Y11
	VMOVDQU 800(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y2, Y2
	VMOVDQU 832(CX), Y11
	VMOVDQU 864(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y3, Y3
	VMOVDQU 896(CX), Y11
	VMOVDQU 928(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y4, Y4
	VMOVDQU 960(CX), Y11
	VMOVDQU 992(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y5, Y5
	VMOVDQU 1024(CX), Y11
	VMOVDQU 1056(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y6, Y6
	VMOVDQU 1088(CX), Y11
	VMOVDQU 1120(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y7, Y7
	VMOVDQU 1152(CX), Y11
	VMOVDQU 1184(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y8, Y8
	VMOVDQU 1216(CX), Y11
	VMOVDQU 1248(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y9, Y9

	// Load and process 32 bytes from input 2 to 10 outputs
	VMOVDQU (DI), Y13
	ADDQ $0x20, DI
	VPSRLQ $0x04, Y13, Y14
	VPAND Y10, Y13, Y13
	VPAND Y10, Y14, Y14
	VMOVDQU 1280(CX), Y11
	VMOVDQU 1312(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y0, Y0
	VMOVDQU 1344(CX), Y11
	VMOVDQU 1376(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y1, Y1
	VMOVDQU 1408(CX), Y11
	VMOVDQU 1440(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y2, Y2
	VMOVDQU 1472(CX), Y11
	VMOVDQU 1504(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y3, Y3
	VMOVDQU 1536(CX), Y11
	VMOVDQU 1568(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y4, Y4
	VMOVDQU 1600(CX), Y11
	VMOVDQU 1632(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y5, Y5
	VMOVDQU 1664(CX), Y11
	VMOVDQU 1696(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y6, Y6
	VMOVDQU 1728(CX), Y11
	VMOVDQU 1760(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y7, Y7
	VMOVDQU 1792(CX), Y11
	VMOVDQU 1824(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y8, Y8
	VMOVDQU 1856(CX), Y11
	VMOVDQU 1888(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y9, Y9

	// Load and process 32 bytes from input 3 to 10 outputs
	VMOVDQU (DX), Y13
	ADDQ $0x20, DX
	VPSRLQ $0x04, Y13, Y14
	VPAND Y10, Y13, Y13
	VPAND Y10, Y14, Y14
	VMOVDQU 1920(CX), Y11
	VMOVDQU 1952(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y0, Y0
	VMOVDQU 1984(CX), Y11
	VMOVDQU 2016(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y1, Y1
	VMOVDQU 2048(CX), Y11
	VMOVDQU 2080(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y2, Y2
	VMOVDQU 2112(CX), Y11
	VMOVDQU 2144(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y3, Y3
	VMOVDQU 2176(CX), Y11
	VMOVDQU 2208(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y4, Y4
	VMOVDQU 2240(CX), Y11
	VMOVDQU 2272(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y5, Y5
	VMOVDQU 2304(CX), Y11
	VMOVDQU 2336(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y6, Y6
	VMOVDQU 2368(CX), Y11
	VMOVDQU 2400(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y7, Y7
	VMOVDQU 2432(CX), Y11
	VMOVDQU 2464(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y8, Y8
	VMOVDQU 2496(CX), Y11
	VMOVDQU 2528(CX), Y12
	VPSHUFB Y13, Y11, Y11
	VPSHUFB Y14, Y12, Y12
	VPXOR Y11, Y12, Y11
	VPXOR Y11, Y9, Y9

	// Store 10 outputs
	MOVQ (R8), R10
	VMOVDQU Y0, (R10)(R9*1)
	MOVQ 24(R8), R10
	VMOVDQU Y1, (R10)(R9*1)
	MOVQ 48(R8), R10
	VMOVDQU Y2, (R10)(R9*1)
	MOVQ 72(R8), R10
	VMOVDQU Y3, (R10)(R9*1)
	MOVQ 96(R8), R10
	VMOVDQU Y4, (R10)(R9*1)
	MOVQ 120(R8), R10
	VMOVDQU Y5, (R10)(R9*1)
	MOVQ 144(R8), R10
	VMOVDQU Y6, (R10)(R9*1)
	MOVQ 168(R8), R10
	VMOVDQU Y7, (R10)(R9*1)
	MOVQ 192(R8), R10
	VMOVDQU Y8, (R10)(R9*1)
	MOVQ 216(R8), R10
	VMOVDQU Y9, (R10)(R9*1)

	// Prepare for next loop
	ADDQ $0x20, R9
	DECQ AX
	JNZ mulAvxTwo_4x10_loop
	VZEROUPPER

mulAvxTwo_4x10_end:
	RET

// func mulAvxTwo_5x1(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, SSE2
TEXT ·mulAvxTwo_5x1(SB), NOSPLIT, $0-88
	// Loading all tables to registers
	// Destination kept in GP registers
	// Full registers estimated 14 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxTwo_5x1_end
	VMOVDQU (CX), Y0
	VMOVDQU 32(CX), Y1
	VMOVDQU 64(CX), Y2
	VMOVDQU 96(CX), Y3
	VMOVDQU 128(CX), Y4
	VMOVDQU 160(CX), Y5
	VMOVDQU 192(CX), Y6
	VMOVDQU 224(CX), Y7
	VMOVDQU 256(CX), Y8
	VMOVDQU 288(CX), Y9
	MOVQ in_base+24(FP), CX
	MOVQ (CX), DX
	MOVQ 24(CX), BX
	MOVQ 48(CX), SI
	MOVQ 72(CX), DI
	MOVQ 96(CX), CX
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R8
	MOVQ start+72(FP), R9

	// Add start offset to output
	ADDQ R9, R8

	// Add start offset to input
	ADDQ R9, DX
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, DI
	ADDQ R9, CX
	MOVQ $0x0000000f, R9
	MOVQ R9, X11
	VPBROADCASTB X11, Y11

mulAvxTwo_5x1_loop:
	// Clear 1 outputs
	VPXOR Y10, Y10, Y10

	// Load and process 32 bytes from input 0 to 1 outputs
	VMOVDQU (DX), Y12
	ADDQ $0x20, DX
	VPSRLQ $0x04, Y12, Y13
	VPAND Y11, Y12, Y12
	VPAND Y11, Y13, Y13
	VPSHUFB Y12, Y0, Y12
	VPSHUFB Y13, Y1, Y13
	VPXOR Y12, Y13, Y12
	VPXOR Y12, Y10, Y10

	// Load and process 32 bytes from input 1 to 1 outputs
	VMOVDQU (BX), Y12
	ADDQ $0x20, BX
	VPSRLQ $0x04, Y12, Y13
	VPAND Y11, Y12, Y12
	VPAND Y11, Y13, Y13
	VPSHUFB Y12, Y2, Y12
	VPSHUFB Y13, Y3, Y13
	VPXOR Y12, Y13, Y12
	VPXOR Y12, Y10, Y10

	// Load and process 32 bytes from input 2 to 1 outputs
	VMOVDQU (SI), Y12
	ADDQ $0x20, SI
	VPSRLQ $0x04, Y12, Y13
	VPAND Y11, Y12, Y12
	VPAND Y11, Y13, Y13
	VPSHUFB Y12, Y4, Y12
	VPSHUFB Y13, Y5, Y13
	VPXOR Y12, Y13, Y12
	VPXOR Y12, Y10, Y10

	// Load and process 32 bytes from input 3 to 1 outputs
	VMOVDQU (DI), Y12
	ADDQ $0x20, DI
	VPSRLQ $0x04, Y12, Y13
	VPAND Y11, Y12, Y12
	VPAND Y11, Y13, Y13
	VPSHUFB Y12, Y6, Y12
	VPSHUFB Y13, Y7, Y13
	VPXOR Y12, Y13, Y12
	VPXOR Y12, Y10, Y10

	// Load and process 32 bytes from input 4 to 1 outputs
	VMOVDQU (CX), Y12
	ADDQ $0x20, CX
	VPSRLQ $0x04, Y12, Y13
	VPAND Y11, Y12, Y12
	VPAND Y11, Y13, Y13
	VPSHUFB Y12, Y8, Y12
	VPSHUFB Y13, Y9, Y13
	VPXOR Y12, Y13, Y12
	VPXOR Y12, Y10, Y10

	// Store 1 outputs
	VMOVDQU Y10, (R8)
	ADDQ $0x20, R8

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxTwo_5x1_loop
	VZEROUPPER

mulAvxTwo_5x1_end:
	RET
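// Editor's note: the *_64 variants below are the same kernels unrolled to
// consume 64 bytes per input per iteration (two YMM loads and ADDQ $0x40
// per stream). The loop count is computed with SHRQ $0x06 (n/64) instead
// of SHRQ $0x05 (n/32), halving the per-iteration loop overhead for
// shards whose processed length is a multiple of 64 bytes.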
// func mulAvxTwo_5x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, SSE2
TEXT ·mulAvxTwo_5x1_64(SB), $0-88
	// Loading no tables to registers
	// Destination kept on stack
	// Full registers estimated 14 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulAvxTwo_5x1_64_end
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), AX
	MOVQ out_base+48(FP), R8
	MOVQ out_base+48(FP), R8
	MOVQ start+72(FP), R9

	// Add start offset to input
	ADDQ R9, DX
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, DI
	ADDQ R9, AX
	MOVQ $0x0000000f, R10
	MOVQ R10, X2
	VPBROADCASTB X2, Y2
	MOVQ n+80(FP), R10
	SHRQ $0x06, R10

mulAvxTwo_5x1_64_loop:
	// Clear 1 outputs
	VPXOR Y0, Y0, Y0
	VPXOR Y1, Y1, Y1

	// Load and process 64 bytes from input 0 to 1 outputs
	VMOVDQU (DX), Y6
	VMOVDQU 32(DX), Y5
	ADDQ $0x40, DX
	VPSRLQ $0x04, Y6, Y7
	VPSRLQ $0x04, Y5, Y8
	VPAND Y2, Y6, Y6
	VPAND Y2, Y5, Y5
	VPAND Y2, Y7, Y7
	VPAND Y2, Y8, Y8
	VMOVDQU (CX), Y3
	VMOVDQU 32(CX), Y4
	VPSHUFB Y5, Y3, Y5
	VPSHUFB Y6, Y3, Y3
	VPSHUFB Y8, Y4, Y6
	VPSHUFB Y7, Y4, Y4
	VPXOR Y3, Y4, Y3
	VPXOR Y5, Y6, Y5
	VPXOR Y3, Y0, Y0
	VPXOR Y5, Y1, Y1

	// Load and process 64 bytes from input 1 to 1 outputs
	VMOVDQU (BX), Y6
	VMOVDQU 32(BX), Y5
	ADDQ $0x40, BX
	VPSRLQ $0x04, Y6, Y7
	VPSRLQ $0x04, Y5, Y8
	VPAND Y2, Y6, Y6
	VPAND Y2, Y5, Y5
	VPAND Y2, Y7, Y7
	VPAND Y2, Y8, Y8
	VMOVDQU 64(CX), Y3
	VMOVDQU 96(CX), Y4
	VPSHUFB Y5, Y3, Y5
	VPSHUFB Y6, Y3, Y3
	VPSHUFB Y8, Y4, Y6
	VPSHUFB Y7, Y4, Y4
	VPXOR Y3, Y4, Y3
	VPXOR Y5, Y6, Y5
	VPXOR Y3, Y0, Y0
	VPXOR Y5, Y1, Y1

	// Load and process 64 bytes from input 2 to 1 outputs
	VMOVDQU (SI), Y6
	VMOVDQU 32(SI), Y5
	ADDQ $0x40, SI
	VPSRLQ $0x04, Y6, Y7
	VPSRLQ $0x04, Y5, Y8
	VPAND Y2, Y6, Y6
	VPAND Y2, Y5, Y5
	VPAND Y2, Y7, Y7
	VPAND Y2, Y8, Y8
	VMOVDQU 128(CX), Y3
	VMOVDQU 160(CX), Y4
	VPSHUFB Y5, Y3, Y5
	VPSHUFB Y6, Y3, Y3
	VPSHUFB Y8, Y4, Y6
	VPSHUFB Y7, Y4, Y4
	VPXOR Y3, Y4, Y3
	VPXOR Y5, Y6, Y5
	VPXOR Y3, Y0, Y0
	VPXOR Y5, Y1, Y1

	// Load and process 64 bytes from input 3 to 1 outputs
	VMOVDQU (DI), Y6
	VMOVDQU 32(DI), Y5
	ADDQ $0x40, DI
	VPSRLQ $0x04, Y6, Y7
	VPSRLQ $0x04, Y5, Y8
	VPAND Y2, Y6, Y6
	VPAND Y2, Y5, Y5
	VPAND Y2, Y7, Y7
	VPAND Y2, Y8, Y8
	VMOVDQU 192(CX), Y3
	VMOVDQU 224(CX), Y4
	VPSHUFB Y5, Y3, Y5
	VPSHUFB Y6, Y3, Y3
	VPSHUFB Y8, Y4, Y6
	VPSHUFB Y7, Y4, Y4
	VPXOR Y3, Y4, Y3
	VPXOR Y5, Y6, Y5
	VPXOR Y3, Y0, Y0
	VPXOR Y5, Y1, Y1

	// Load and process 64 bytes from input 4 to 1 outputs
	VMOVDQU (AX), Y6
	VMOVDQU 32(AX), Y5
	ADDQ $0x40, AX
	VPSRLQ $0x04, Y6, Y7
	VPSRLQ $0x04, Y5, Y8
	VPAND Y2, Y6, Y6
	VPAND Y2, Y5, Y5
	VPAND Y2, Y7, Y7
	VPAND Y2, Y8, Y8
	VMOVDQU 256(CX), Y3
	VMOVDQU 288(CX), Y4
	VPSHUFB Y5, Y3, Y5
	VPSHUFB Y6, Y3, Y3
	VPSHUFB Y8, Y4, Y6
	VPSHUFB Y7, Y4, Y4
	VPXOR Y3, Y4, Y3
	VPXOR Y5, Y6, Y5
	VPXOR Y3, Y0, Y0
	VPXOR Y5, Y1, Y1

	// Store 1 outputs
	MOVQ (R8), R11
	VMOVDQU Y0, (R11)(R9*1)
	VMOVDQU Y1, 32(R11)(R9*1)

	// Prepare for next loop
	ADDQ $0x40, R9
	DECQ R10
	JNZ mulAvxTwo_5x1_64_loop
	VZEROUPPER

mulAvxTwo_5x1_64_end:
	RET

// func mulAvxTwo_5x2(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, SSE2
TEXT ·mulAvxTwo_5x2(SB), NOSPLIT, $0-88
	// Loading no tables to registers
	// Destination kept in GP registers
	// Full registers estimated 27 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxTwo_5x2_end
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R9
	MOVQ start+72(FP), R11

	// Add start offset to output
	ADDQ R11, R10
	ADDQ R11, R9

	// Add start offset to input
	ADDQ R11, BX
	ADDQ R11, SI
	ADDQ R11, DI
	ADDQ R11, R8
	ADDQ R11, DX
	MOVQ $0x0000000f, R11
	MOVQ R11, X2
	VPBROADCASTB X2, Y2

mulAvxTwo_5x2_loop:
	// Clear 2 outputs
	VPXOR Y0, Y0, Y0
	VPXOR Y1, Y1, Y1

	// Load and process 32 bytes from input 0 to 2 outputs
	VMOVDQU (BX), Y5
	ADDQ $0x20, BX
	VPSRLQ $0x04, Y5, Y6
	VPAND Y2, Y5, Y5
	VPAND Y2, Y6, Y6
	VMOVDQU (CX), Y3
	VMOVDQU 32(CX), Y4
	VPSHUFB Y5, Y3, Y3
	VPSHUFB Y6, Y4, Y4
	VPXOR Y3, Y4, Y3
	VPXOR Y3, Y0, Y0
	VMOVDQU 64(CX), Y3
	VMOVDQU 96(CX), Y4
	VPSHUFB Y5, Y3, Y3
	VPSHUFB Y6, Y4, Y4
	VPXOR Y3, Y4, Y3
	VPXOR Y3, Y1, Y1

	// Load and process 32 bytes from input 1 to 2 outputs
	VMOVDQU (SI), Y5
	ADDQ $0x20, SI
	VPSRLQ $0x04, Y5, Y6
	VPAND Y2, Y5, Y5
	VPAND Y2, Y6, Y6
	VMOVDQU 128(CX), Y3
	VMOVDQU 160(CX), Y4
	VPSHUFB Y5, Y3, Y3
	VPSHUFB Y6, Y4, Y4
	VPXOR Y3, Y4, Y3
	VPXOR Y3, Y0, Y0
	VMOVDQU 192(CX), Y3
	VMOVDQU 224(CX), Y4
	VPSHUFB Y5, Y3, Y3
	VPSHUFB Y6, Y4, Y4
	VPXOR Y3, Y4, Y3
	VPXOR Y3, Y1, Y1

	// Load and process 32 bytes from input 2 to 2 outputs
	VMOVDQU (DI), Y5
	ADDQ $0x20, DI
	VPSRLQ $0x04, Y5, Y6
	VPAND Y2, Y5, Y5
	VPAND Y2, Y6, Y6
	VMOVDQU 256(CX), Y3
	VMOVDQU 288(CX), Y4
	VPSHUFB Y5, Y3, Y3
	VPSHUFB Y6, Y4, Y4
	VPXOR Y3, Y4, Y3
	VPXOR Y3, Y0, Y0
	VMOVDQU 320(CX), Y3
	VMOVDQU 352(CX), Y4
	VPSHUFB Y5, Y3, Y3
	VPSHUFB Y6, Y4, Y4
	VPXOR Y3, Y4, Y3
	VPXOR Y3, Y1, Y1

	// Load and process 32 bytes from input 3 to 2 outputs
	VMOVDQU (R8), Y5
	ADDQ $0x20, R8
	VPSRLQ $0x04, Y5, Y6
	VPAND Y2, Y5, Y5
	VPAND Y2, Y6, Y6
	VMOVDQU 384(CX), Y3
	VMOVDQU 416(CX), Y4
	VPSHUFB Y5, Y3, Y3
	VPSHUFB Y6, Y4, Y4
	VPXOR Y3, Y4, Y3
	VPXOR Y3, Y0, Y0
	VMOVDQU 448(CX), Y3
	VMOVDQU 480(CX), Y4
	VPSHUFB Y5, Y3, Y3
	VPSHUFB Y6, Y4, Y4
	VPXOR Y3, Y4, Y3
	VPXOR Y3, Y1, Y1

	// Load and process 32 bytes from input 4 to 2 outputs
	VMOVDQU (DX), Y5
	ADDQ $0x20, DX
	VPSRLQ $0x04, Y5, Y6
	VPAND Y2, Y5, Y5
	VPAND Y2, Y6, Y6
	VMOVDQU 512(CX), Y3
	VMOVDQU 544(CX), Y4
	VPSHUFB Y5, Y3, Y3
	VPSHUFB Y6, Y4, Y4
	VPXOR Y3, Y4, Y3
	VPXOR Y3, Y0, Y0
	VMOVDQU 576(CX), Y3
	VMOVDQU 608(CX), Y4
	VPSHUFB Y5, Y3, Y3
	VPSHUFB Y6, Y4, Y4
	VPXOR Y3, Y4, Y3
	VPXOR Y3, Y1, Y1

	// Store 2 outputs
	VMOVDQU Y0, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y1, (R9)
	ADDQ $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxTwo_5x2_loop
	VZEROUPPER

mulAvxTwo_5x2_end:
	RET

// func mulAvxTwo_5x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, SSE2
TEXT ·mulAvxTwo_5x2_64(SB), $0-88
	// Loading no tables to registers
	// Destination kept on stack
	// Full registers estimated 27 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulAvxTwo_5x2_64_end
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), AX
	MOVQ out_base+48(FP), R8
	MOVQ out_base+48(FP), R8
	MOVQ start+72(FP), R9

	// Add start offset to input
	ADDQ R9, DX
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, DI
	ADDQ R9, AX
	MOVQ $0x0000000f, R10
	MOVQ R10, X4
	VPBROADCASTB X4, Y4
	MOVQ n+80(FP), R10
	SHRQ $0x06, R10

mulAvxTwo_5x2_64_loop:
	// Clear 2 outputs
	VPXOR Y0, Y0, Y0
	VPXOR Y1, Y1, Y1
	VPXOR Y2, Y2, Y2
	VPXOR Y3, Y3, Y3

	// Load and process 64 bytes from input 0 to 2 outputs
	VMOVDQU (DX), Y9
	VMOVDQU 32(DX), Y11
	ADDQ $0x40, DX
	VPSRLQ $0x04, Y9, Y10
	VPSRLQ $0x04, Y11, Y12
	VPAND Y4, Y9, Y9
	VPAND Y4, Y11, Y11
	VPAND Y4, Y10, Y10
	VPAND Y4, Y12, Y12
	VMOVDQU (CX), Y5
	VMOVDQU 32(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y7, Y8, Y7
	VPXOR Y5, Y0, Y0
	VPXOR Y7, Y1, Y1
	VMOVDQU 64(CX), Y5
	VMOVDQU 96(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y7, Y8, Y7
	VPXOR Y5, Y2, Y2
	VPXOR Y7, Y3, Y3

	// Load and process 64 bytes from input 1 to 2 outputs
	VMOVDQU (BX), Y9
	VMOVDQU 32(BX), Y11
	ADDQ $0x40, BX
	VPSRLQ $0x04, Y9, Y10
	VPSRLQ $0x04, Y11, Y12
	VPAND Y4, Y9, Y9
	VPAND Y4, Y11, Y11
	VPAND Y4, Y10, Y10
	VPAND Y4, Y12, Y12
	VMOVDQU 128(CX), Y5
	VMOVDQU 160(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y7, Y8, Y7
	VPXOR Y5, Y0, Y0
	VPXOR Y7, Y1, Y1
	VMOVDQU 192(CX), Y5
	VMOVDQU 224(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y7, Y8, Y7
	VPXOR Y5, Y2, Y2
	VPXOR Y7, Y3, Y3

	// Load and process 64 bytes from input 2 to 2 outputs
	VMOVDQU (SI), Y9
	VMOVDQU 32(SI), Y11
	ADDQ $0x40, SI
	VPSRLQ $0x04, Y9, Y10
	VPSRLQ $0x04, Y11, Y12
	VPAND Y4, Y9, Y9
	VPAND Y4, Y11, Y11
	VPAND Y4, Y10, Y10
	VPAND Y4, Y12, Y12
	VMOVDQU 256(CX), Y5
	VMOVDQU 288(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y7, Y8, Y7
	VPXOR Y5, Y0, Y0
	VPXOR Y7, Y1, Y1
	VMOVDQU 320(CX), Y5
	VMOVDQU 352(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y7, Y8, Y7
	VPXOR Y5, Y2, Y2
	VPXOR Y7, Y3, Y3

	// Load and process 64 bytes from input 3 to 2 outputs
	VMOVDQU (DI), Y9
	VMOVDQU 32(DI), Y11
	ADDQ $0x40, DI
	VPSRLQ $0x04, Y9, Y10
	VPSRLQ $0x04, Y11, Y12
	VPAND Y4, Y9, Y9
	VPAND Y4, Y11, Y11
	VPAND Y4, Y10, Y10
	VPAND Y4, Y12, Y12
	VMOVDQU 384(CX), Y5
	VMOVDQU 416(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y7, Y8, Y7
	VPXOR Y5, Y0, Y0
	VPXOR Y7, Y1, Y1
	VMOVDQU 448(CX), Y5
	VMOVDQU 480(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y7, Y8, Y7
	VPXOR Y5, Y2, Y2
	VPXOR Y7, Y3, Y3

	// Load and process 64 bytes from input 4 to 2 outputs
	VMOVDQU (AX), Y9
	VMOVDQU 32(AX), Y11
	ADDQ $0x40, AX
	VPSRLQ $0x04, Y9, Y10
	VPSRLQ $0x04, Y11, Y12
	VPAND Y4, Y9, Y9
	VPAND Y4, Y11, Y11
	VPAND Y4, Y10, Y10
	VPAND Y4, Y12, Y12
	VMOVDQU 512(CX), Y5
	VMOVDQU 544(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y7, Y8, Y7
	VPXOR Y5, Y0, Y0
	VPXOR Y7, Y1, Y1
	VMOVDQU 576(CX), Y5
	VMOVDQU 608(CX), Y6
	VPSHUFB Y11, Y5, Y7
	VPSHUFB Y9, Y5, Y5
	VPSHUFB Y12, Y6, Y8
	VPSHUFB Y10, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y7, Y8, Y7
	VPXOR Y5, Y2, Y2
	VPXOR Y7, Y3, Y3

	// Store 2 outputs
	MOVQ (R8), R11
	VMOVDQU Y0, (R11)(R9*1)
	VMOVDQU Y1, 32(R11)(R9*1)
	MOVQ 24(R8), R11
	VMOVDQU Y2, (R11)(R9*1)
	VMOVDQU Y3, 32(R11)(R9*1)

	// Prepare for next loop
	ADDQ $0x40, R9
	DECQ R10
	JNZ mulAvxTwo_5x2_64_loop
	VZEROUPPER

mulAvxTwo_5x2_64_end:
	RET

// func mulAvxTwo_5x3(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, SSE2
TEXT ·mulAvxTwo_5x3(SB), NOSPLIT, $0-88
	// Loading no tables to registers
	// Destination kept in GP registers
	// Full registers estimated 38 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxTwo_5x3_end
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R9
	MOVQ start+72(FP), R12

	// Add start offset to output
	ADDQ R12, R10
	ADDQ R12, R11
	ADDQ R12, R9

	// Add start offset to input
	ADDQ R12, BX
	ADDQ R12, SI
	ADDQ R12, DI
	ADDQ R12, R8
	ADDQ R12, DX
	MOVQ $0x0000000f, R12
	MOVQ R12, X3
	VPBROADCASTB X3, Y3

mulAvxTwo_5x3_loop:
	// Clear 3 outputs
	VPXOR Y0, Y0, Y0
	VPXOR Y1, Y1, Y1
	VPXOR Y2, Y2, Y2

	// Load and process 32 bytes from input 0 to 3 outputs
	VMOVDQU (BX), Y6
	ADDQ $0x20, BX
	VPSRLQ $0x04, Y6, Y7
	VPAND Y3, Y6, Y6
	VPAND Y3, Y7, Y7
	VMOVDQU (CX), Y4
	VMOVDQU 32(CX), Y5
	VPSHUFB Y6, Y4, Y4
	VPSHUFB Y7, Y5, Y5
	VPXOR Y4, Y5, Y4
	VPXOR Y4, Y0, Y0
	VMOVDQU 64(CX), Y4
	VMOVDQU 96(CX), Y5
	VPSHUFB Y6, Y4, Y4
	VPSHUFB Y7, Y5, Y5
	VPXOR Y4, Y5, Y4
	VPXOR Y4, Y1, Y1
	VMOVDQU 128(CX), Y4
	VMOVDQU 160(CX), Y5
	VPSHUFB Y6, Y4, Y4
	VPSHUFB Y7, Y5, Y5
	VPXOR Y4, Y5, Y4
	VPXOR Y4, Y2, Y2

	// Load and process 32 bytes from input 1 to 3 outputs
	VMOVDQU (SI), Y6
	ADDQ $0x20, SI
	VPSRLQ $0x04, Y6, Y7
	VPAND Y3, Y6, Y6
	VPAND Y3, Y7, Y7
	VMOVDQU 192(CX), Y4
	VMOVDQU 224(CX), Y5
	VPSHUFB Y6, Y4, Y4
	VPSHUFB Y7, Y5, Y5
	VPXOR Y4, Y5, Y4
	VPXOR Y4, Y0, Y0
	VMOVDQU 256(CX), Y4
	VMOVDQU 288(CX), Y5
	VPSHUFB Y6, Y4, Y4
	VPSHUFB Y7, Y5, Y5
	VPXOR Y4, Y5, Y4
	VPXOR Y4, Y1, Y1
	VMOVDQU 320(CX), Y4
	VMOVDQU 352(CX), Y5
	VPSHUFB Y6, Y4, Y4
	VPSHUFB Y7, Y5, Y5
	VPXOR Y4, Y5, Y4
	VPXOR Y4, Y2, Y2

	// Load and process 32 bytes from input 2 to 3 outputs
	VMOVDQU (DI), Y6
	ADDQ $0x20, DI
	VPSRLQ $0x04, Y6, Y7
	VPAND Y3, Y6, Y6
	VPAND Y3, Y7, Y7
	VMOVDQU 384(CX), Y4
	VMOVDQU 416(CX), Y5
	VPSHUFB Y6, Y4, Y4
	VPSHUFB Y7, Y5, Y5
	VPXOR Y4, Y5, Y4
	VPXOR Y4, Y0, Y0
	VMOVDQU 448(CX), Y4
	VMOVDQU 480(CX), Y5
	VPSHUFB Y6, Y4, Y4
	VPSHUFB Y7, Y5, Y5
	VPXOR Y4, Y5, Y4
	VPXOR Y4, Y1, Y1
	VMOVDQU 512(CX), Y4
	VMOVDQU 544(CX), Y5
	VPSHUFB Y6, Y4, Y4
	VPSHUFB Y7, Y5, Y5
	VPXOR Y4, Y5, Y4
	VPXOR Y4, Y2, Y2

	// Load and process 32 bytes from input 3 to 3 outputs
	VMOVDQU (R8), Y6
	ADDQ $0x20, R8
	VPSRLQ $0x04, Y6, Y7
	VPAND Y3, Y6, Y6
	VPAND Y3, Y7, Y7
	VMOVDQU 576(CX), Y4
	VMOVDQU 608(CX), Y5
	VPSHUFB Y6, Y4, Y4
	VPSHUFB Y7, Y5, Y5
	VPXOR Y4, Y5, Y4
	VPXOR Y4, Y0, Y0
	VMOVDQU 640(CX), Y4
	VMOVDQU 672(CX), Y5
	VPSHUFB Y6, Y4, Y4
	VPSHUFB Y7, Y5, Y5
	VPXOR Y4, Y5, Y4
	VPXOR Y4, Y1, Y1
	VMOVDQU 704(CX), Y4
	VMOVDQU 736(CX), Y5
	VPSHUFB Y6, Y4, Y4
	VPSHUFB Y7, Y5, Y5
	VPXOR Y4, Y5, Y4
	VPXOR Y4, Y2, Y2

	// Load and process 32 bytes from input 4 to 3 outputs
	VMOVDQU (DX), Y6
	ADDQ $0x20, DX
	VPSRLQ $0x04, Y6, Y7
	VPAND Y3, Y6, Y6
	VPAND Y3, Y7, Y7
	VMOVDQU 768(CX), Y4
	VMOVDQU 800(CX), Y5
	VPSHUFB Y6, Y4, Y4
	VPSHUFB Y7, Y5, Y5
	VPXOR Y4, Y5, Y4
	VPXOR Y4, Y0, Y0
	VMOVDQU 832(CX), Y4
	VMOVDQU 864(CX), Y5
	VPSHUFB Y6, Y4, Y4
	VPSHUFB Y7, Y5, Y5
	VPXOR Y4, Y5, Y4
	VPXOR Y4, Y1, Y1
	VMOVDQU 896(CX), Y4
	VMOVDQU 928(CX), Y5
	VPSHUFB Y6, Y4, Y4
	VPSHUFB Y7, Y5, Y5
	VPXOR Y4, Y5, Y4
	VPXOR Y4, Y2, Y2

	// Store 3 outputs
	VMOVDQU Y0, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y1, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y2, (R9)
	ADDQ $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxTwo_5x3_loop
	VZEROUPPER

mulAvxTwo_5x3_end:
	RET

// func mulAvxTwo_5x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, SSE2
TEXT ·mulAvxTwo_5x3_64(SB), $0-88
	// Loading no tables to registers
	// Destination kept on stack
	// Full registers estimated 38 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x06, AX
	TESTQ AX, AX
	JZ mulAvxTwo_5x3_64_end
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), AX
	MOVQ out_base+48(FP), R8
	MOVQ out_base+48(FP), R8
	MOVQ start+72(FP), R9

	// Add start offset to input
	ADDQ R9, DX
	ADDQ R9, BX
	ADDQ R9, SI
	ADDQ R9, DI
	ADDQ R9, AX
	MOVQ $0x0000000f, R10
	MOVQ R10, X6
	VPBROADCASTB X6, Y6
	MOVQ n+80(FP), R10
	SHRQ $0x06, R10

mulAvxTwo_5x3_64_loop:
	// Clear 3 outputs
	VPXOR Y0, Y0, Y0
	VPXOR Y1, Y1, Y1
	VPXOR Y2, Y2, Y2
	VPXOR Y3, Y3, Y3
	VPXOR Y4, Y4, Y4
	VPXOR Y5, Y5, Y5

	// Load and process 64 bytes from input 0 to 3 outputs
	VMOVDQU (DX), Y11
	VMOVDQU 32(DX), Y13
	ADDQ $0x40, DX
	VPSRLQ $0x04, Y11, Y12
	VPSRLQ $0x04, Y13, Y14
	VPAND Y6, Y11, Y11
	VPAND Y6, Y13, Y13
	VPAND Y6, Y12, Y12
	VPAND Y6, Y14, Y14
	VMOVDQU (CX), Y7
	VMOVDQU 32(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y9, Y10, Y9
	VPXOR Y7, Y0, Y0
	VPXOR Y9, Y1, Y1
	VMOVDQU 64(CX), Y7
	VMOVDQU 96(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y9, Y10, Y9
	VPXOR Y7, Y2, Y2
	VPXOR Y9, Y3, Y3
	VMOVDQU 128(CX), Y7
	VMOVDQU 160(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y9, Y10, Y9
	VPXOR Y7, Y4, Y4
	VPXOR Y9, Y5, Y5

	// Load and process 64 bytes from input 1 to 3 outputs
	VMOVDQU (BX), Y11
	VMOVDQU 32(BX), Y13
	ADDQ $0x40, BX
	VPSRLQ $0x04, Y11, Y12
	VPSRLQ $0x04, Y13, Y14
	VPAND Y6, Y11, Y11
	VPAND Y6, Y13, Y13
	VPAND Y6, Y12, Y12
	VPAND Y6, Y14, Y14
	VMOVDQU 192(CX), Y7
	VMOVDQU 224(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y9, Y10, Y9
	VPXOR Y7, Y0, Y0
	VPXOR Y9, Y1, Y1
	VMOVDQU 256(CX), Y7
	VMOVDQU 288(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y9, Y10, Y9
	VPXOR Y7, Y2, Y2
	VPXOR Y9, Y3, Y3
	VMOVDQU 320(CX), Y7
	VMOVDQU 352(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y9, Y10, Y9
	VPXOR Y7, Y4, Y4
	VPXOR Y9, Y5, Y5

	// Load and process 64 bytes from input 2 to 3 outputs
	VMOVDQU (SI), Y11
	VMOVDQU 32(SI), Y13
	ADDQ $0x40, SI
	VPSRLQ $0x04, Y11, Y12
	VPSRLQ $0x04, Y13, Y14
	VPAND Y6, Y11, Y11
	VPAND Y6, Y13, Y13
	VPAND Y6, Y12, Y12
	VPAND Y6, Y14, Y14
	VMOVDQU 384(CX), Y7
	VMOVDQU 416(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y9, Y10, Y9
	VPXOR Y7, Y0, Y0
	VPXOR Y9, Y1, Y1
	VMOVDQU 448(CX), Y7
	VMOVDQU 480(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y9, Y10, Y9
	VPXOR Y7, Y2, Y2
	VPXOR Y9, Y3, Y3
	VMOVDQU 512(CX), Y7
	VMOVDQU 544(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y9, Y10, Y9
	VPXOR Y7, Y4, Y4
	VPXOR Y9, Y5, Y5

	// Load and process 64 bytes from input 3 to 3 outputs
	VMOVDQU (DI), Y11
	VMOVDQU 32(DI), Y13
	ADDQ $0x40, DI
	VPSRLQ $0x04, Y11, Y12
	VPSRLQ $0x04, Y13, Y14
	VPAND Y6, Y11, Y11
	VPAND Y6, Y13, Y13
	VPAND Y6, Y12, Y12
	VPAND Y6, Y14, Y14
	VMOVDQU 576(CX), Y7
	VMOVDQU 608(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y9, Y10, Y9
	VPXOR Y7, Y0, Y0
	VPXOR Y9, Y1, Y1
	VMOVDQU 640(CX), Y7
	VMOVDQU 672(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y9, Y10, Y9
	VPXOR Y7, Y2, Y2
	VPXOR Y9, Y3, Y3
	VMOVDQU 704(CX), Y7
	VMOVDQU 736(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y9, Y10, Y9
	VPXOR Y7, Y4, Y4
	VPXOR Y9, Y5, Y5

	// Load and process 64 bytes from input 4 to 3 outputs
	VMOVDQU (AX), Y11
	VMOVDQU 32(AX), Y13
	ADDQ $0x40, AX
	VPSRLQ $0x04, Y11, Y12
	VPSRLQ $0x04, Y13, Y14
	VPAND Y6, Y11, Y11
	VPAND Y6, Y13, Y13
	VPAND Y6, Y12, Y12
	VPAND Y6, Y14, Y14
	VMOVDQU 768(CX), Y7
	VMOVDQU 800(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y9, Y10, Y9
	VPXOR Y7, Y0, Y0
	VPXOR Y9, Y1, Y1
	VMOVDQU 832(CX), Y7
	VMOVDQU 864(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y9, Y10, Y9
	VPXOR Y7, Y2, Y2
	VPXOR Y9, Y3, Y3
	VMOVDQU 896(CX), Y7
	VMOVDQU 928(CX), Y8
	VPSHUFB Y13, Y7, Y9
	VPSHUFB Y11, Y7, Y7
	VPSHUFB Y14, Y8, Y10
	VPSHUFB Y12, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y9, Y10, Y9
	VPXOR Y7, Y4, Y4
	VPXOR Y9, Y5, Y5

	// Store 3 outputs
	MOVQ (R8), R11
	VMOVDQU Y0, (R11)(R9*1)
	VMOVDQU Y1, 32(R11)(R9*1)
	MOVQ 24(R8), R11
	VMOVDQU Y2, (R11)(R9*1)
	VMOVDQU Y3, 32(R11)(R9*1)
	MOVQ 48(R8), R11
	VMOVDQU Y4, (R11)(R9*1)
	VMOVDQU Y5, 32(R11)(R9*1)

	// Prepare for next loop
	ADDQ $0x40, R9
	DECQ R10
	JNZ mulAvxTwo_5x3_64_loop
	VZEROUPPER

mulAvxTwo_5x3_64_end:
	RET
// func mulAvxTwo_5x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, SSE2
TEXT ·mulAvxTwo_5x4(SB), NOSPLIT, $0-88
	// Loading no tables to registers
	// Destination kept in GP registers
	// Full registers estimated 49 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxTwo_5x4_end
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R9
	MOVQ start+72(FP), R13

	// Add start offset to output
	ADDQ R13, R10
	ADDQ R13, R11
	ADDQ R13, R12
	ADDQ R13, R9

	// Add start offset to input
	ADDQ R13, BX
	ADDQ R13, SI
	ADDQ R13, DI
	ADDQ R13, R8
	ADDQ R13, DX
	MOVQ $0x0000000f, R13
	MOVQ R13, X4
	VPBROADCASTB X4, Y4

mulAvxTwo_5x4_loop:
	// Clear 4 outputs
	VPXOR Y0, Y0, Y0
	VPXOR Y1, Y1, Y1
	VPXOR Y2, Y2, Y2
	VPXOR Y3, Y3, Y3

	// Load and process 32 bytes from input 0 to 4 outputs
	VMOVDQU (BX), Y7
	ADDQ $0x20, BX
	VPSRLQ $0x04, Y7, Y8
	VPAND Y4, Y7, Y7
	VPAND Y4, Y8, Y8
	VMOVDQU (CX), Y5
	VMOVDQU 32(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y5, Y0, Y0
	VMOVDQU 64(CX), Y5
	VMOVDQU 96(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y5, Y1, Y1
	VMOVDQU 128(CX), Y5
	VMOVDQU 160(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y5, Y2, Y2
	VMOVDQU 192(CX), Y5
	VMOVDQU 224(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y5, Y3, Y3

	// Load and process 32 bytes from input 1 to 4 outputs
	VMOVDQU (SI), Y7
	ADDQ $0x20, SI
	VPSRLQ $0x04, Y7, Y8
	VPAND Y4, Y7, Y7
	VPAND Y4, Y8, Y8
	VMOVDQU 256(CX), Y5
	VMOVDQU 288(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y5, Y0, Y0
	VMOVDQU 320(CX), Y5
	VMOVDQU 352(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y5, Y1, Y1
	VMOVDQU 384(CX), Y5
	VMOVDQU 416(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y5, Y2, Y2
	VMOVDQU 448(CX), Y5
	VMOVDQU 480(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y5, Y3, Y3

	// Load and process 32 bytes from input 2 to 4 outputs
	VMOVDQU (DI), Y7
	ADDQ $0x20, DI
	VPSRLQ $0x04, Y7, Y8
	VPAND Y4, Y7, Y7
	VPAND Y4, Y8, Y8
	VMOVDQU 512(CX), Y5
	VMOVDQU 544(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y5, Y0, Y0
	VMOVDQU 576(CX), Y5
	VMOVDQU 608(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y5, Y1, Y1
	VMOVDQU 640(CX), Y5
	VMOVDQU 672(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y5, Y2, Y2
	VMOVDQU 704(CX), Y5
	VMOVDQU 736(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y5, Y3, Y3

	// Load and process 32 bytes from input 3 to 4 outputs
	VMOVDQU (R8), Y7
	ADDQ $0x20, R8
	VPSRLQ $0x04, Y7, Y8
	VPAND Y4, Y7, Y7
	VPAND Y4, Y8, Y8
	VMOVDQU 768(CX), Y5
	VMOVDQU 800(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y5, Y0, Y0
	VMOVDQU 832(CX), Y5
	VMOVDQU 864(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y5, Y1, Y1
	VMOVDQU 896(CX), Y5
	VMOVDQU 928(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y5, Y2, Y2
	VMOVDQU 960(CX), Y5
	VMOVDQU 992(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y5, Y3, Y3

	// Load and process 32 bytes from input 4 to 4 outputs
	VMOVDQU (DX), Y7
	ADDQ $0x20, DX
	VPSRLQ $0x04, Y7, Y8
	VPAND Y4, Y7, Y7
	VPAND Y4, Y8, Y8
	VMOVDQU 1024(CX), Y5
	VMOVDQU 1056(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y5, Y0, Y0
	VMOVDQU 1088(CX), Y5
	VMOVDQU 1120(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y5, Y1, Y1
	VMOVDQU 1152(CX), Y5
	VMOVDQU 1184(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y5, Y2, Y2
	VMOVDQU 1216(CX), Y5
	VMOVDQU 1248(CX), Y6
	VPSHUFB Y7, Y5, Y5
	VPSHUFB Y8, Y6, Y6
	VPXOR Y5, Y6, Y5
	VPXOR Y5, Y3, Y3

	// Store 4 outputs
	VMOVDQU Y0, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y1, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y2, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y3, (R9)
	ADDQ $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxTwo_5x4_loop
	VZEROUPPER

mulAvxTwo_5x4_end:
	RET

// func mulAvxTwo_5x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, SSE2
TEXT ·mulAvxTwo_5x5(SB), NOSPLIT, $0-88
	// Loading no tables to registers
	// Destination kept in GP registers
	// Full registers estimated 60 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxTwo_5x5_end
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R9
	MOVQ start+72(FP), R14

	// Add start offset to output
	ADDQ R14, R10
	ADDQ R14, R11
	ADDQ R14, R12
	ADDQ R14, R13
	ADDQ R14, R9

	// Add start offset to input
	ADDQ R14, BX
	ADDQ R14, SI
	ADDQ R14, DI
	ADDQ R14, R8
	ADDQ R14, DX
	MOVQ $0x0000000f, R14
	MOVQ R14, X5
	VPBROADCASTB X5, Y5

mulAvxTwo_5x5_loop:
	// Clear 5 outputs
	VPXOR Y0, Y0, Y0
	VPXOR Y1, Y1, Y1
	VPXOR Y2, Y2, Y2
	VPXOR Y3, Y3, Y3
	VPXOR Y4, Y4, Y4

	// Load and process 32 bytes from input 0 to 5 outputs
	VMOVDQU (BX), Y8
	ADDQ $0x20, BX
	VPSRLQ $0x04, Y8, Y9
	VPAND Y5, Y8, Y8
	VPAND Y5, Y9, Y9
	VMOVDQU (CX), Y6
	VMOVDQU 32(CX), Y7
	VPSHUFB Y8, Y6, Y6
	VPSHUFB Y9, Y7, Y7
	VPXOR Y6, Y7, Y6
	VPXOR Y6, Y0, Y0
	VMOVDQU 64(CX), Y6
	VMOVDQU 96(CX), Y7
	VPSHUFB Y8, Y6, Y6
	VPSHUFB Y9, Y7, Y7
	VPXOR Y6, Y7, Y6
	VPXOR Y6, Y1, Y1
	VMOVDQU 128(CX), Y6
	VMOVDQU 160(CX), Y7
	VPSHUFB Y8, Y6, Y6
	VPSHUFB Y9, Y7, Y7
	VPXOR Y6, Y7, Y6
	VPXOR Y6, Y2, Y2
	VMOVDQU 192(CX), Y6
	VMOVDQU 224(CX), Y7
	VPSHUFB Y8, Y6, Y6
	VPSHUFB Y9, Y7, Y7
	VPXOR Y6, Y7, Y6
	VPXOR Y6, Y3, Y3
	VMOVDQU 256(CX), Y6
	VMOVDQU 288(CX), Y7
	VPSHUFB Y8, Y6, Y6
	VPSHUFB Y9, Y7, Y7
	VPXOR Y6, Y7, Y6
	VPXOR Y6, Y4, Y4

	// Load and process 32 bytes from input 1 to 5 outputs
	VMOVDQU (SI), Y8
	ADDQ $0x20, SI
	VPSRLQ $0x04, Y8, Y9
	VPAND Y5, Y8, Y8
	VPAND Y5, Y9, Y9
	VMOVDQU 320(CX), Y6
	VMOVDQU 352(CX), Y7
	VPSHUFB Y8, Y6, Y6
	VPSHUFB Y9, Y7, Y7
	VPXOR Y6, Y7, Y6
	VPXOR Y6, Y0, Y0
	VMOVDQU 384(CX), Y6
	VMOVDQU 416(CX), Y7
	VPSHUFB Y8, Y6, Y6
	VPSHUFB Y9, Y7, Y7
	VPXOR Y6, Y7, Y6
	VPXOR Y6, Y1, Y1
	VMOVDQU 448(CX), Y6
	VMOVDQU 480(CX), Y7
	VPSHUFB Y8, Y6, Y6
	VPSHUFB Y9, Y7, Y7
	VPXOR Y6, Y7, Y6
	VPXOR Y6, Y2, Y2
	VMOVDQU 512(CX), Y6
	VMOVDQU 544(CX), Y7
	VPSHUFB Y8, Y6, Y6
	VPSHUFB Y9, Y7, Y7
	VPXOR Y6, Y7, Y6
	VPXOR Y6, Y3, Y3
	VMOVDQU 576(CX), Y6
	VMOVDQU 608(CX), Y7
	VPSHUFB Y8, Y6, Y6
	VPSHUFB Y9, Y7, Y7
	VPXOR Y6, Y7, Y6
	VPXOR Y6, Y4, Y4

	// Load and process 32 bytes from input 2 to 5 outputs
	VMOVDQU (DI), Y8
	ADDQ $0x20, DI
	VPSRLQ $0x04, Y8, Y9
	VPAND Y5, Y8, Y8
	VPAND Y5, Y9, Y9
	VMOVDQU 640(CX), Y6
	VMOVDQU 672(CX), Y7
	VPSHUFB Y8, Y6, Y6
	VPSHUFB Y9, Y7, Y7
	VPXOR Y6, Y7, Y6
	VPXOR Y6, Y0, Y0
	VMOVDQU 704(CX), Y6
	VMOVDQU 736(CX), Y7
	VPSHUFB Y8, Y6, Y6
	VPSHUFB Y9, Y7, Y7
	VPXOR Y6, Y7, Y6
	VPXOR Y6, Y1, Y1
	VMOVDQU 768(CX), Y6
	VMOVDQU 800(CX), Y7
	VPSHUFB Y8, Y6, Y6
	VPSHUFB Y9, Y7, Y7
	VPXOR Y6, Y7, Y6
	VPXOR Y6, Y2, Y2
	VMOVDQU 832(CX), Y6
	VMOVDQU 864(CX), Y7
	VPSHUFB Y8, Y6, Y6
	VPSHUFB Y9, Y7, Y7
	VPXOR Y6, Y7, Y6
	VPXOR Y6, Y3, Y3
	VMOVDQU 896(CX), Y6
	VMOVDQU 928(CX), Y7
	VPSHUFB Y8, Y6, Y6
	VPSHUFB Y9, Y7, Y7
	VPXOR Y6, Y7, Y6
	VPXOR Y6, Y4, Y4

	// Load and process 32 bytes from input 3 to 5 outputs
	VMOVDQU (R8), Y8
	ADDQ $0x20, R8
	VPSRLQ $0x04, Y8, Y9
	VPAND Y5, Y8, Y8
	VPAND Y5, Y9, Y9
	VMOVDQU 960(CX), Y6
	VMOVDQU 992(CX), Y7
	VPSHUFB Y8, Y6, Y6
	VPSHUFB Y9, Y7, Y7
	VPXOR Y6, Y7, Y6
	VPXOR Y6, Y0, Y0
	VMOVDQU 1024(CX), Y6
	VMOVDQU 1056(CX), Y7
	VPSHUFB Y8, Y6, Y6
	VPSHUFB Y9, Y7, Y7
	VPXOR Y6, Y7, Y6
	VPXOR Y6, Y1, Y1
	VMOVDQU 1088(CX), Y6
	VMOVDQU 1120(CX), Y7
	VPSHUFB Y8, Y6, Y6
	VPSHUFB Y9, Y7, Y7
	VPXOR Y6, Y7, Y6
	VPXOR Y6, Y2, Y2
	VMOVDQU 1152(CX), Y6
	VMOVDQU 1184(CX), Y7
	VPSHUFB Y8, Y6, Y6
	VPSHUFB Y9, Y7, Y7
	VPXOR Y6, Y7, Y6
	VPXOR Y6, Y3, Y3
	VMOVDQU 1216(CX), Y6
	VMOVDQU 1248(CX), Y7
	VPSHUFB Y8, Y6, Y6
	VPSHUFB Y9, Y7, Y7
	VPXOR Y6, Y7, Y6
	VPXOR Y6, Y4, Y4

	// Load and process 32 bytes from input 4 to 5 outputs
	VMOVDQU (DX), Y8
	ADDQ $0x20, DX
	VPSRLQ $0x04, Y8, Y9
	VPAND Y5, Y8, Y8
	VPAND Y5, Y9, Y9
	VMOVDQU 1280(CX), Y6
	VMOVDQU 1312(CX), Y7
	VPSHUFB Y8, Y6, Y6
	VPSHUFB Y9, Y7, Y7
	VPXOR Y6, Y7, Y6
	VPXOR Y6, Y0, Y0
	VMOVDQU 1344(CX), Y6
	VMOVDQU 1376(CX), Y7
	VPSHUFB Y8, Y6, Y6
	VPSHUFB Y9, Y7, Y7
	VPXOR Y6, Y7, Y6
	VPXOR Y6, Y1, Y1
	VMOVDQU 1408(CX), Y6
	VMOVDQU 1440(CX), Y7
	VPSHUFB Y8, Y6, Y6
	VPSHUFB Y9, Y7, Y7
	VPXOR Y6, Y7, Y6
	VPXOR Y6, Y2, Y2
	VMOVDQU 1472(CX), Y6
	VMOVDQU 1504(CX), Y7
	VPSHUFB Y8, Y6, Y6
	VPSHUFB Y9, Y7, Y7
	VPXOR Y6, Y7, Y6
	VPXOR Y6, Y3, Y3
	VMOVDQU 1536(CX), Y6
	VMOVDQU 1568(CX), Y7
	VPSHUFB Y8, Y6, Y6
	VPSHUFB Y9, Y7, Y7
	VPXOR Y6, Y7, Y6
	VPXOR Y6, Y4, Y4

	// Store 5 outputs
	VMOVDQU Y0, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y1, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y2, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y3, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y4, (R9)
	ADDQ $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxTwo_5x5_loop
	VZEROUPPER

mulAvxTwo_5x5_end:
	RET

// func mulAvxTwo_5x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, SSE2
TEXT ·mulAvxTwo_5x6(SB), NOSPLIT, $0-88
	// Loading no tables to registers
	// Destination kept in GP registers
	// Full registers estimated 71 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxTwo_5x6_end
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R14
	MOVQ 120(R9), R9
	MOVQ start+72(FP), R15

	// Add start offset to output
	ADDQ R15, R10
	ADDQ R15, R11
	ADDQ R15, R12
	ADDQ R15, R13
	ADDQ R15, R14
	ADDQ R15, R9

	// Add start offset to input
	ADDQ R15, BX
	ADDQ R15, SI
	ADDQ R15, DI
	ADDQ R15, R8
	ADDQ R15, DX
	MOVQ $0x0000000f, R15
	MOVQ R15, X6
	VPBROADCASTB X6, Y6

mulAvxTwo_5x6_loop:
	// Clear 6 outputs
	VPXOR Y0, Y0, Y0
	VPXOR Y1, Y1, Y1
	VPXOR Y2, Y2, Y2
	VPXOR Y3, Y3, Y3
	VPXOR Y4, Y4, Y4
	VPXOR Y5, Y5, Y5

	// Load and process 32 bytes from input 0 to 6 outputs
	VMOVDQU (BX), Y9
	ADDQ $0x20, BX
	VPSRLQ $0x04, Y9, Y10
	VPAND Y6, Y9, Y9
	VPAND Y6, Y10, Y10
	VMOVDQU (CX), Y7
	VMOVDQU 32(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y0, Y0
	VMOVDQU 64(CX), Y7
	VMOVDQU 96(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y1, Y1
	VMOVDQU 128(CX), Y7
	VMOVDQU 160(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y2, Y2
	VMOVDQU 192(CX), Y7
	VMOVDQU 224(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y3, Y3
	VMOVDQU 256(CX), Y7
	VMOVDQU 288(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y4, Y4
	VMOVDQU 320(CX), Y7
	VMOVDQU 352(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y5, Y5

	// Load and process 32 bytes from input 1 to 6 outputs
	VMOVDQU (SI), Y9
	ADDQ $0x20, SI
	VPSRLQ $0x04, Y9, Y10
	VPAND Y6, Y9, Y9
	VPAND Y6, Y10, Y10
	VMOVDQU 384(CX), Y7
	VMOVDQU 416(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y0, Y0
	VMOVDQU 448(CX), Y7
	VMOVDQU 480(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y1, Y1
	VMOVDQU 512(CX), Y7
	VMOVDQU 544(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y2, Y2
	VMOVDQU 576(CX), Y7
	VMOVDQU 608(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y3, Y3
	VMOVDQU 640(CX), Y7
	VMOVDQU 672(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y4, Y4
	VMOVDQU 704(CX), Y7
	VMOVDQU 736(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y5, Y5

	// Load and process 32 bytes from input 2 to 6 outputs
	VMOVDQU (DI), Y9
	ADDQ $0x20, DI
	VPSRLQ $0x04, Y9, Y10
	VPAND Y6, Y9, Y9
	VPAND Y6, Y10, Y10
	VMOVDQU 768(CX), Y7
	VMOVDQU 800(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y0, Y0
	VMOVDQU 832(CX), Y7
	VMOVDQU 864(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y1, Y1
	VMOVDQU 896(CX), Y7
	VMOVDQU 928(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y2, Y2
	VMOVDQU 960(CX), Y7
	VMOVDQU 992(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y3, Y3
	VMOVDQU 1024(CX), Y7
	VMOVDQU 1056(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y4, Y4
	VMOVDQU 1088(CX), Y7
	VMOVDQU 1120(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y5, Y5

	// Load and process 32 bytes from input 3 to 6 outputs
	VMOVDQU (R8), Y9
	ADDQ $0x20, R8
	VPSRLQ $0x04, Y9, Y10
	VPAND Y6, Y9, Y9
	VPAND Y6, Y10, Y10
	VMOVDQU 1152(CX), Y7
	VMOVDQU 1184(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y0, Y0
	VMOVDQU 1216(CX), Y7
	VMOVDQU 1248(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y1, Y1
	VMOVDQU 1280(CX), Y7
	VMOVDQU 1312(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y2, Y2
	VMOVDQU 1344(CX), Y7
	VMOVDQU 1376(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y3, Y3
	VMOVDQU 1408(CX), Y7
	VMOVDQU 1440(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y4, Y4
	VMOVDQU 1472(CX), Y7
	VMOVDQU 1504(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y5, Y5

	// Load and process 32 bytes from input 4 to 6 outputs
	VMOVDQU (DX), Y9
	ADDQ $0x20, DX
	VPSRLQ $0x04, Y9, Y10
	VPAND Y6, Y9, Y9
	VPAND Y6, Y10, Y10
	VMOVDQU 1536(CX), Y7
	VMOVDQU 1568(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y0, Y0
	VMOVDQU 1600(CX), Y7
	VMOVDQU 1632(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y1, Y1
	VMOVDQU 1664(CX), Y7
	VMOVDQU 1696(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y2, Y2
	VMOVDQU 1728(CX), Y7
	VMOVDQU 1760(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y3, Y3
	VMOVDQU 1792(CX), Y7
	VMOVDQU 1824(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y4, Y4
	VMOVDQU 1856(CX), Y7
	VMOVDQU 1888(CX), Y8
	VPSHUFB Y9, Y7, Y7
	VPSHUFB Y10, Y8, Y8
	VPXOR Y7, Y8, Y7
	VPXOR Y7, Y5, Y5

	// Store 6 outputs
	VMOVDQU Y0, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y1, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y2, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y3, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y4, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y5, (R9)
	ADDQ $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxTwo_5x6_loop
	VZEROUPPER

mulAvxTwo_5x6_end:
	RET

// func mulAvxTwo_5x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, SSE2
TEXT ·mulAvxTwo_5x7(SB), NOSPLIT, $8-88
	// Loading no tables to registers
	// Destination kept in GP registers
	// Full registers estimated 82 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxTwo_5x7_end
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ out_base+48(FP), R9
	MOVQ (R9), R10
	MOVQ 24(R9), R11
	MOVQ 48(R9), R12
	MOVQ 72(R9), R13
	MOVQ 96(R9), R14
	MOVQ 120(R9), R15
	MOVQ 144(R9), R9
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R9

	// Add start offset to input
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, R8
	ADDQ BP, DX
	MOVQ $0x0000000f, BP
	MOVQ BP, X7
	VPBROADCASTB X7, Y7

mulAvxTwo_5x7_loop:
	// Clear 7 outputs
	VPXOR Y0, Y0, Y0
	VPXOR Y1, Y1, Y1
	VPXOR Y2, Y2, Y2
	VPXOR Y3, Y3, Y3
	VPXOR Y4, Y4, Y4
	VPXOR Y5, Y5, Y5
	VPXOR Y6, Y6, Y6

	// Load and process 32 bytes from input 0 to 7 outputs
	VMOVDQU (BX), Y10
	ADDQ $0x20, BX
	VPSRLQ $0x04, Y10, Y11
	VPAND Y7, Y10, Y10
	VPAND Y7, Y11, Y11
	VMOVDQU (CX), Y8
	VMOVDQU 32(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y0, Y0
	VMOVDQU 64(CX), Y8
	VMOVDQU 96(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y1, Y1
	VMOVDQU 128(CX), Y8
	VMOVDQU 160(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y2, Y2
	VMOVDQU 192(CX), Y8
	VMOVDQU 224(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y3, Y3
	VMOVDQU 256(CX), Y8
	VMOVDQU 288(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y4, Y4
	VMOVDQU 320(CX), Y8
	VMOVDQU 352(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y5, Y5
	VMOVDQU 384(CX), Y8
	VMOVDQU 416(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y6, Y6

	// Load and process 32 bytes from input 1 to 7 outputs
	VMOVDQU (SI), Y10
	ADDQ $0x20, SI
	VPSRLQ $0x04, Y10, Y11
	VPAND Y7, Y10, Y10
	VPAND Y7, Y11, Y11
	VMOVDQU 448(CX), Y8
	VMOVDQU 480(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y0, Y0
	VMOVDQU 512(CX), Y8
	VMOVDQU 544(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y1, Y1
	VMOVDQU 576(CX), Y8
	VMOVDQU 608(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y2, Y2
	VMOVDQU 640(CX), Y8
	VMOVDQU 672(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y3, Y3
	VMOVDQU 704(CX), Y8
	VMOVDQU 736(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y4, Y4
	VMOVDQU 768(CX), Y8
	VMOVDQU 800(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y5, Y5
	VMOVDQU 832(CX), Y8
	VMOVDQU 864(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y6, Y6

	// Load and process 32 bytes from input 2 to 7 outputs
	VMOVDQU (DI), Y10
	ADDQ $0x20, DI
	VPSRLQ $0x04, Y10, Y11
	VPAND Y7, Y10, Y10
	VPAND Y7, Y11, Y11
	VMOVDQU 896(CX), Y8
	VMOVDQU 928(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y0, Y0
	VMOVDQU 960(CX), Y8
	VMOVDQU 992(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y1, Y1
	VMOVDQU 1024(CX), Y8
	VMOVDQU 1056(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y2, Y2
	VMOVDQU 1088(CX), Y8
	VMOVDQU 1120(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y3, Y3
	VMOVDQU 1152(CX), Y8
	VMOVDQU 1184(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y4, Y4
	VMOVDQU 1216(CX), Y8
	VMOVDQU 1248(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y5, Y5
	VMOVDQU 1280(CX), Y8
	VMOVDQU 1312(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y6, Y6

	// Load and process 32 bytes from input 3 to 7 outputs
	VMOVDQU (R8), Y10
	ADDQ $0x20, R8
	VPSRLQ $0x04, Y10, Y11
	VPAND Y7, Y10, Y10
	VPAND Y7, Y11, Y11
	VMOVDQU 1344(CX), Y8
	VMOVDQU 1376(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y0, Y0
	VMOVDQU 1408(CX), Y8
	VMOVDQU 1440(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y1, Y1
	VMOVDQU 1472(CX), Y8
	VMOVDQU 1504(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y2, Y2
	VMOVDQU 1536(CX), Y8
	VMOVDQU 1568(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y3, Y3
	VMOVDQU 1600(CX), Y8
	VMOVDQU 1632(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y4, Y4
	VMOVDQU 1664(CX), Y8
	VMOVDQU 1696(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y5, Y5
	VMOVDQU 1728(CX), Y8
	VMOVDQU 1760(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y6, Y6

	// Load and process 32 bytes from input 4 to 7 outputs
	VMOVDQU (DX), Y10
	ADDQ $0x20, DX
	VPSRLQ $0x04, Y10, Y11
	VPAND Y7, Y10, Y10
	VPAND Y7, Y11, Y11
	VMOVDQU 1792(CX), Y8
	VMOVDQU 1824(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y0, Y0
	VMOVDQU 1856(CX), Y8
	VMOVDQU 1888(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y1, Y1
	VMOVDQU 1920(CX), Y8
	VMOVDQU 1952(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y2, Y2
	VMOVDQU 1984(CX), Y8
	VMOVDQU 2016(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y3, Y3
	VMOVDQU 2048(CX), Y8
	VMOVDQU 2080(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y4, Y4
	VMOVDQU 2112(CX), Y8
	VMOVDQU 2144(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y5, Y5
	VMOVDQU 2176(CX), Y8
	VMOVDQU 2208(CX), Y9
	VPSHUFB Y10, Y8, Y8
	VPSHUFB Y11, Y9, Y9
	VPXOR Y8, Y9, Y8
	VPXOR Y8, Y6, Y6

	// Store 7 outputs
	VMOVDQU Y0, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y1, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y2, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y3, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y4, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y5, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y6, (R9)
	ADDQ $0x20, R9

	// Prepare for next loop
	DECQ AX
	JNZ mulAvxTwo_5x7_loop
	VZEROUPPER

mulAvxTwo_5x7_end:
	RET
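// Editor's note: kernels with seven or more register-resident outputs
// exhaust the usual pointer registers, so the generator also presses R15
// and BP into service as extra pointer or counter registers; the $8 stack
// frame on these TEXT directives appears to be reserved so BP can be
// saved and restored around the routine.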
// func mulAvxTwo_5x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, SSE2
TEXT ·mulAvxTwo_5x8(SB), NOSPLIT, $8-88
	// Loading no tables to registers
	// Destination kept in GP registers
	// Full registers estimated 93 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxTwo_5x8_end
	MOVQ in_base+24(FP), AX
	MOVQ (AX), DX
	MOVQ 24(AX), BX
	MOVQ 48(AX), SI
	MOVQ 72(AX), DI
	MOVQ 96(AX), AX
	MOVQ out_base+48(FP), R8
	MOVQ (R8), R9
	MOVQ 24(R8), R10
	MOVQ 48(R8), R11
	MOVQ 72(R8), R12
	MOVQ 96(R8), R13
	MOVQ 120(R8), R14
	MOVQ 144(R8), R15
	MOVQ 168(R8), R8
	MOVQ start+72(FP), BP

	// Add start offset to output
	ADDQ BP, R9
	ADDQ BP, R10
	ADDQ BP, R11
	ADDQ BP, R12
	ADDQ BP, R13
	ADDQ BP, R14
	ADDQ BP, R15
	ADDQ BP, R8

	// Add start offset to input
	ADDQ BP, DX
	ADDQ BP, BX
	ADDQ BP, SI
	ADDQ BP, DI
	ADDQ BP, AX
	MOVQ $0x0000000f, BP
	MOVQ BP, X8
	VPBROADCASTB X8, Y8
	MOVQ n+80(FP), BP
	SHRQ $0x05, BP

mulAvxTwo_5x8_loop:
	// Clear 8 outputs
	VPXOR Y0, Y0, Y0
	VPXOR Y1, Y1, Y1
	VPXOR Y2, Y2, Y2
	VPXOR Y3, Y3, Y3
	VPXOR Y4, Y4, Y4
	VPXOR Y5, Y5, Y5
	VPXOR Y6, Y6, Y6
	VPXOR Y7, Y7, Y7

	// Load and process 32 bytes from input 0 to 8 outputs
	VMOVDQU (DX), Y11
	ADDQ $0x20, DX
	VPSRLQ $0x04, Y11, Y12
	VPAND Y8, Y11, Y11
	VPAND Y8, Y12, Y12
	VMOVDQU (CX), Y9
	VMOVDQU 32(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y0, Y0
	VMOVDQU 64(CX), Y9
	VMOVDQU 96(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y1, Y1
	VMOVDQU 128(CX), Y9
	VMOVDQU 160(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y2, Y2
	VMOVDQU 192(CX), Y9
	VMOVDQU 224(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y3, Y3
	VMOVDQU 256(CX), Y9
	VMOVDQU 288(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y4, Y4
	VMOVDQU 320(CX), Y9
	VMOVDQU 352(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y5, Y5
	VMOVDQU 384(CX), Y9
	VMOVDQU 416(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y6, Y6
	VMOVDQU 448(CX), Y9
	VMOVDQU 480(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y7, Y7

	// Load and process 32 bytes from input 1 to 8 outputs
	VMOVDQU (BX), Y11
	ADDQ $0x20, BX
	VPSRLQ $0x04, Y11, Y12
	VPAND Y8, Y11, Y11
	VPAND Y8, Y12, Y12
	VMOVDQU 512(CX), Y9
	VMOVDQU 544(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y0, Y0
	VMOVDQU 576(CX), Y9
	VMOVDQU 608(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y1, Y1
	VMOVDQU 640(CX), Y9
	VMOVDQU 672(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y2, Y2
	VMOVDQU 704(CX), Y9
	VMOVDQU 736(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y3, Y3
	VMOVDQU 768(CX), Y9
	VMOVDQU 800(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y4, Y4
	VMOVDQU 832(CX), Y9
	VMOVDQU 864(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y5, Y5
	VMOVDQU 896(CX), Y9
	VMOVDQU 928(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y6, Y6
	VMOVDQU 960(CX), Y9
	VMOVDQU 992(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y7, Y7

	// Load and process 32 bytes from input 2 to 8 outputs
	VMOVDQU (SI), Y11
	ADDQ $0x20, SI
	VPSRLQ $0x04, Y11, Y12
	VPAND Y8, Y11, Y11
	VPAND Y8, Y12, Y12
	VMOVDQU 1024(CX), Y9
	VMOVDQU 1056(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y0, Y0
	VMOVDQU 1088(CX), Y9
	VMOVDQU 1120(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y1, Y1
	VMOVDQU 1152(CX), Y9
	VMOVDQU 1184(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y2, Y2
	VMOVDQU 1216(CX), Y9
	VMOVDQU 1248(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y3, Y3
	VMOVDQU 1280(CX), Y9
	VMOVDQU 1312(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y4, Y4
	VMOVDQU 1344(CX), Y9
	VMOVDQU 1376(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y5, Y5
	VMOVDQU 1408(CX), Y9
	VMOVDQU 1440(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y6, Y6
	VMOVDQU 1472(CX), Y9
	VMOVDQU 1504(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y7, Y7
	// Load and process 32 bytes from input 3 to 8 outputs
	VMOVDQU (DI), Y11
	ADDQ $0x20, DI
	VPSRLQ $0x04, Y11, Y12
	VPAND Y8, Y11, Y11
	VPAND Y8, Y12, Y12
	VMOVDQU 1536(CX), Y9
	VMOVDQU 1568(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y0, Y0
	VMOVDQU 1600(CX), Y9
	VMOVDQU 1632(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y1, Y1
	VMOVDQU 1664(CX), Y9
	VMOVDQU 1696(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y2, Y2
	VMOVDQU 1728(CX), Y9
	VMOVDQU 1760(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y3, Y3
	VMOVDQU 1792(CX), Y9
	VMOVDQU 1824(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y4, Y4
	VMOVDQU 1856(CX), Y9
	VMOVDQU 1888(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y5, Y5
	VMOVDQU 1920(CX), Y9
	VMOVDQU 1952(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y6, Y6
	VMOVDQU 1984(CX), Y9
	VMOVDQU 2016(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y7, Y7

	// Load and process 32 bytes from input 4 to 8 outputs
	VMOVDQU (AX), Y11
	ADDQ $0x20, AX
	VPSRLQ $0x04, Y11, Y12
	VPAND Y8, Y11, Y11
	VPAND Y8, Y12, Y12
	VMOVDQU 2048(CX), Y9
	VMOVDQU 2080(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y0, Y0
	VMOVDQU 2112(CX), Y9
	VMOVDQU 2144(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y1, Y1
	VMOVDQU 2176(CX), Y9
	VMOVDQU 2208(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y2, Y2
	VMOVDQU 2240(CX), Y9
	VMOVDQU 2272(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y3, Y3
	VMOVDQU 2304(CX), Y9
	VMOVDQU 2336(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y4, Y4
	VMOVDQU 2368(CX), Y9
	VMOVDQU 2400(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y5, Y5
	VMOVDQU 2432(CX), Y9
	VMOVDQU 2464(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y6, Y6
	VMOVDQU 2496(CX), Y9
	VMOVDQU 2528(CX), Y10
	VPSHUFB Y11, Y9, Y9
	VPSHUFB Y12, Y10, Y10
	VPXOR Y9, Y10, Y9
	VPXOR Y9, Y7, Y7

	// Store 8 outputs
	VMOVDQU Y0, (R9)
	ADDQ $0x20, R9
	VMOVDQU Y1, (R10)
	ADDQ $0x20, R10
	VMOVDQU Y2, (R11)
	ADDQ $0x20, R11
	VMOVDQU Y3, (R12)
	ADDQ $0x20, R12
	VMOVDQU Y4, (R13)
	ADDQ $0x20, R13
	VMOVDQU Y5, (R14)
	ADDQ $0x20, R14
	VMOVDQU Y6, (R15)
	ADDQ $0x20, R15
	VMOVDQU Y7, (R8)
	ADDQ $0x20, R8

	// Prepare for next loop
	DECQ BP
	JNZ mulAvxTwo_5x8_loop
	VZEROUPPER

mulAvxTwo_5x8_end:
	RET

// func mulAvxTwo_5x9(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, SSE2
TEXT ·mulAvxTwo_5x9(SB), NOSPLIT, $0-88
	// Loading no tables to registers
	// Destination kept on stack
	// Full registers estimated 104 YMM used
	MOVQ n+80(FP), AX
	MOVQ matrix_base+0(FP), CX
	SHRQ $0x05, AX
	TESTQ AX, AX
	JZ mulAvxTwo_5x9_end
	MOVQ in_base+24(FP), DX
	MOVQ (DX), BX
	MOVQ 24(DX), SI
	MOVQ 48(DX), DI
	MOVQ 72(DX), R8
	MOVQ 96(DX), DX
	MOVQ out_base+48(FP), R9
	MOVQ start+72(FP), R10

	// Add start offset to input
	ADDQ R10, BX
	ADDQ R10, SI
	ADDQ R10, DI
	ADDQ R10, R8
	ADDQ R10, DX
	MOVQ $0x0000000f, R11
	MOVQ R11, X9
	VPBROADCASTB X9, Y9

mulAvxTwo_5x9_loop:
	// Clear 9 outputs
	VPXOR Y0, Y0, Y0
	VPXOR Y1, Y1, Y1
	VPXOR Y2, Y2, Y2
	VPXOR Y3, Y3, Y3
	VPXOR Y4, Y4, Y4
	VPXOR Y5, Y5, Y5
	VPXOR Y6, Y6, Y6
	VPXOR Y7, Y7, Y7
	VPXOR Y8, Y8, Y8

	// Load and process 32 bytes from input 0 to 9 outputs
	VMOVDQU (BX), Y12
	ADDQ $0x20, BX
	VPSRLQ $0x04, Y12, Y13
Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 576(CX), Y10 VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1152(CX), Y10 VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, 
Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1728(CX), Y10 VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2304(CX), Y10 VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Store 9 outputs MOVQ (R9), R11 VMOVDQU Y0, (R11)(R10*1) MOVQ 24(R9), R11 VMOVDQU Y1, (R11)(R10*1) MOVQ 48(R9), R11 VMOVDQU Y2, (R11)(R10*1) MOVQ 72(R9), R11 VMOVDQU Y3, (R11)(R10*1) MOVQ 96(R9), R11 VMOVDQU Y4, (R11)(R10*1) MOVQ 120(R9), R11 VMOVDQU Y5, (R11)(R10*1) MOVQ 144(R9), R11 VMOVDQU Y6, (R11)(R10*1) MOVQ 168(R9), R11 VMOVDQU Y7, (R11)(R10*1) MOVQ 192(R9), R11 VMOVDQU Y8, (R11)(R10*1) // Prepare for next loop ADDQ $0x20, R10 DECQ AX JNZ mulAvxTwo_5x9_loop VZEROUPPER mulAvxTwo_5x9_end: RET // func mulAvxTwo_5x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x10(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 115 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX 
    JZ mulAvxTwo_5x10_end
    MOVQ in_base+24(FP), DX
    MOVQ (DX), BX
    MOVQ 24(DX), SI
    MOVQ 48(DX), DI
    MOVQ 72(DX), R8
    MOVQ 96(DX), DX
    MOVQ out_base+48(FP), R9
    MOVQ start+72(FP), R10

    // Add start offset to input
    ADDQ R10, BX
    ADDQ R10, SI
    ADDQ R10, DI
    ADDQ R10, R8
    ADDQ R10, DX
    MOVQ $0x0000000f, R11
    MOVQ R11, X10
    VPBROADCASTB X10, Y10

mulAvxTwo_5x10_loop:
    // Clear 10 outputs
    VPXOR Y0, Y0, Y0
    VPXOR Y1, Y1, Y1
    VPXOR Y2, Y2, Y2
    VPXOR Y3, Y3, Y3
    VPXOR Y4, Y4, Y4
    VPXOR Y5, Y5, Y5
    VPXOR Y6, Y6, Y6
    VPXOR Y7, Y7, Y7
    VPXOR Y8, Y8, Y8
    VPXOR Y9, Y9, Y9

    // Load and process 32 bytes from input 0 to 10 outputs
    VMOVDQU (BX), Y13
    ADDQ $0x20, BX
    VPSRLQ $0x04, Y13, Y14
    VPAND Y10, Y13, Y13
    VPAND Y10, Y14, Y14
    VMOVDQU (CX), Y11
    VMOVDQU 32(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y0, Y0
    VMOVDQU 64(CX), Y11
    VMOVDQU 96(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y1, Y1
    VMOVDQU 128(CX), Y11
    VMOVDQU 160(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y2, Y2
    VMOVDQU 192(CX), Y11
    VMOVDQU 224(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y3, Y3
    VMOVDQU 256(CX), Y11
    VMOVDQU 288(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y4, Y4
    VMOVDQU 320(CX), Y11
    VMOVDQU 352(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y5, Y5
    VMOVDQU 384(CX), Y11
    VMOVDQU 416(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y6, Y6
    VMOVDQU 448(CX), Y11
    VMOVDQU 480(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y7, Y7
    VMOVDQU 512(CX), Y11
    VMOVDQU 544(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y8, Y8
    VMOVDQU 576(CX), Y11
    VMOVDQU 608(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y9, Y9

    // Load and process 32 bytes from input 1 to 10 outputs
    VMOVDQU (SI), Y13
    ADDQ $0x20, SI
    VPSRLQ $0x04, Y13, Y14
    VPAND Y10, Y13, Y13
    VPAND Y10, Y14, Y14
    VMOVDQU 640(CX), Y11
    VMOVDQU 672(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y0, Y0
    VMOVDQU 704(CX), Y11
    VMOVDQU 736(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y1, Y1
    VMOVDQU 768(CX), Y11
    VMOVDQU 800(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y2, Y2
    VMOVDQU 832(CX), Y11
    VMOVDQU 864(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y3, Y3
    VMOVDQU 896(CX), Y11
    VMOVDQU 928(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y4, Y4
    VMOVDQU 960(CX), Y11
    VMOVDQU 992(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y5, Y5
    VMOVDQU 1024(CX), Y11
    VMOVDQU 1056(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y6, Y6
    VMOVDQU 1088(CX), Y11
    VMOVDQU 1120(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y7, Y7
    VMOVDQU 1152(CX), Y11
    VMOVDQU 1184(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y8, Y8
    VMOVDQU 1216(CX), Y11
    VMOVDQU 1248(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y9, Y9

    // Load and process 32 bytes from input 2 to 10 outputs
    VMOVDQU (DI), Y13
    ADDQ $0x20, DI
    VPSRLQ $0x04, Y13, Y14
    VPAND Y10, Y13, Y13
    VPAND Y10, Y14, Y14
    VMOVDQU 1280(CX), Y11
    VMOVDQU 1312(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y0, Y0
    VMOVDQU 1344(CX), Y11
    VMOVDQU 1376(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y1, Y1
    VMOVDQU 1408(CX), Y11
    VMOVDQU 1440(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y2, Y2
    VMOVDQU 1472(CX), Y11
    VMOVDQU 1504(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y3, Y3
    VMOVDQU 1536(CX), Y11
    VMOVDQU 1568(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y4, Y4
    VMOVDQU 1600(CX), Y11
    VMOVDQU 1632(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y5, Y5
    VMOVDQU 1664(CX), Y11
    VMOVDQU 1696(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y6, Y6
    VMOVDQU 1728(CX), Y11
    VMOVDQU 1760(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y7, Y7
    VMOVDQU 1792(CX), Y11
    VMOVDQU 1824(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y8, Y8
    VMOVDQU 1856(CX), Y11
    VMOVDQU 1888(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y9, Y9

    // Load and process 32 bytes from input 3 to 10 outputs
    VMOVDQU (R8), Y13
    ADDQ $0x20, R8
    VPSRLQ $0x04, Y13, Y14
    VPAND Y10, Y13, Y13
    VPAND Y10, Y14, Y14
    VMOVDQU 1920(CX), Y11
    VMOVDQU 1952(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y0, Y0
    VMOVDQU 1984(CX), Y11
    VMOVDQU 2016(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y1, Y1
    VMOVDQU 2048(CX), Y11
    VMOVDQU 2080(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y2, Y2
    VMOVDQU 2112(CX), Y11
    VMOVDQU 2144(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y3, Y3
    VMOVDQU 2176(CX), Y11
    VMOVDQU 2208(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y4, Y4
    VMOVDQU 2240(CX), Y11
    VMOVDQU 2272(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y5, Y5
    VMOVDQU 2304(CX), Y11
    VMOVDQU 2336(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y6, Y6
    VMOVDQU 2368(CX), Y11
    VMOVDQU 2400(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y7, Y7
    VMOVDQU 2432(CX), Y11
    VMOVDQU 2464(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y8, Y8
    VMOVDQU 2496(CX), Y11
    VMOVDQU 2528(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y9, Y9

    // Load and process 32 bytes from input 4 to 10 outputs
    VMOVDQU (DX), Y13
    ADDQ $0x20, DX
    VPSRLQ $0x04, Y13, Y14
    VPAND Y10, Y13, Y13
    VPAND Y10, Y14, Y14
    VMOVDQU 2560(CX), Y11
    VMOVDQU 2592(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y0, Y0
    VMOVDQU 2624(CX), Y11
    VMOVDQU 2656(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y1, Y1
    VMOVDQU 2688(CX), Y11
    VMOVDQU 2720(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y2, Y2
    VMOVDQU 2752(CX), Y11
    VMOVDQU 2784(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y3, Y3
    VMOVDQU 2816(CX), Y11
    VMOVDQU 2848(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y4, Y4
    VMOVDQU 2880(CX), Y11
    VMOVDQU 2912(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y5, Y5
    VMOVDQU 2944(CX), Y11
    VMOVDQU 2976(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y6, Y6
    VMOVDQU 3008(CX), Y11
    VMOVDQU 3040(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y7, Y7
    VMOVDQU 3072(CX), Y11
    VMOVDQU 3104(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y8, Y8
    VMOVDQU 3136(CX), Y11
    VMOVDQU 3168(CX), Y12
    VPSHUFB Y13, Y11, Y11
    VPSHUFB Y14, Y12, Y12
    VPXOR Y11, Y12, Y11
    VPXOR Y11, Y9, Y9

    // Store 10 outputs
    MOVQ (R9), R11
    VMOVDQU Y0, (R11)(R10*1)
    MOVQ 24(R9), R11
    VMOVDQU Y1, (R11)(R10*1)
    MOVQ 48(R9), R11
    VMOVDQU Y2, (R11)(R10*1)
    MOVQ 72(R9), R11
    VMOVDQU Y3, (R11)(R10*1)
    MOVQ 96(R9), R11
    VMOVDQU Y4, (R11)(R10*1)
    MOVQ 120(R9), R11
    VMOVDQU Y5, (R11)(R10*1)
    MOVQ 144(R9), R11
    VMOVDQU Y6, (R11)(R10*1)
    MOVQ 168(R9), R11
    VMOVDQU Y7, (R11)(R10*1)
    MOVQ 192(R9), R11
    VMOVDQU Y8, (R11)(R10*1)
    MOVQ 216(R9), R11
    VMOVDQU Y9, (R11)(R10*1)

    // Prepare for next loop
    ADDQ $0x20, R10
    DECQ AX
    JNZ mulAvxTwo_5x10_loop
    VZEROUPPER

mulAvxTwo_5x10_end:
    RET

// func mulAvxTwo_6x1(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, SSE2
TEXT ·mulAvxTwo_6x1(SB), NOSPLIT, $0-88
    // Loading all tables to registers
    // Destination kept in GP registers
    // Full registers estimated 16 YMM used
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x05, AX
    TESTQ AX, AX
    JZ mulAvxTwo_6x1_end
    VMOVDQU (CX), Y0
    VMOVDQU 32(CX), Y1
    VMOVDQU 64(CX), Y2
    VMOVDQU 96(CX), Y3
    VMOVDQU 128(CX), Y4
    VMOVDQU 160(CX), Y5
    VMOVDQU 192(CX), Y6
    VMOVDQU 224(CX), Y7
    VMOVDQU 256(CX), Y8
    VMOVDQU 288(CX), Y9
    VMOVDQU 320(CX), Y10
    VMOVDQU 352(CX), Y11
    MOVQ in_base+24(FP), CX
    MOVQ (CX), DX
    MOVQ 24(CX), BX
    MOVQ 48(CX), SI
    MOVQ 72(CX), DI
    MOVQ 96(CX), R8
    MOVQ 120(CX), CX
    MOVQ out_base+48(FP), R9
    MOVQ (R9), R9
    MOVQ start+72(FP), R10

    // Add start offset to output
    ADDQ R10, R9

    // Add start offset to input
    ADDQ R10, DX
    ADDQ R10, BX
    ADDQ R10, SI
    ADDQ R10, DI
    ADDQ R10, R8
    ADDQ R10, CX
    MOVQ $0x0000000f, R10
    MOVQ R10, X13
    VPBROADCASTB X13, Y13

mulAvxTwo_6x1_loop:
    // Clear 1 outputs
    VPXOR Y12, Y12, Y12

    // Load and process 32 bytes from input 0 to 1 outputs
    VMOVDQU (DX), Y14
    ADDQ $0x20, DX
    VPSRLQ $0x04, Y14, Y15
    VPAND Y13, Y14, Y14
    VPAND Y13, Y15, Y15
    VPSHUFB Y14, Y0, Y14
    VPSHUFB Y15, Y1, Y15
    VPXOR Y14, Y15, Y14
    VPXOR Y14, Y12, Y12

    // Load and process 32 bytes from input 1 to 1 outputs
    VMOVDQU (BX), Y14
    ADDQ $0x20, BX
    VPSRLQ $0x04, Y14, Y15
    VPAND Y13, Y14, Y14
    VPAND Y13, Y15, Y15
    VPSHUFB Y14, Y2, Y14
    VPSHUFB Y15, Y3, Y15
    VPXOR Y14, Y15, Y14
    VPXOR Y14, Y12, Y12

    // Load and process 32 bytes from input 2 to 1 outputs
    VMOVDQU (SI), Y14
    ADDQ $0x20, SI
    VPSRLQ $0x04, Y14, Y15
    VPAND Y13, Y14, Y14
    VPAND Y13, Y15, Y15
    VPSHUFB Y14, Y4, Y14
    VPSHUFB Y15, Y5, Y15
    VPXOR Y14, Y15, Y14
    VPXOR Y14, Y12, Y12

    // Load and process 32 bytes from input 3 to 1 outputs
    VMOVDQU (DI), Y14
    ADDQ $0x20, DI
    VPSRLQ $0x04, Y14, Y15
    VPAND Y13, Y14, Y14
    VPAND Y13, Y15, Y15
    VPSHUFB Y14, Y6, Y14
    VPSHUFB Y15, Y7, Y15
    VPXOR Y14, Y15, Y14
    VPXOR Y14, Y12, Y12

    // Load and process 32 bytes from input 4 to 1 outputs
    VMOVDQU (R8), Y14
    ADDQ $0x20, R8
    VPSRLQ $0x04, Y14, Y15
    VPAND Y13, Y14, Y14
    VPAND Y13, Y15, Y15
    VPSHUFB Y14, Y8, Y14
    VPSHUFB Y15, Y9, Y15
    VPXOR Y14, Y15, Y14
    VPXOR Y14, Y12, Y12

    // Load and process 32 bytes from input 5 to 1 outputs
    VMOVDQU (CX), Y14
    ADDQ $0x20, CX
    VPSRLQ $0x04, Y14, Y15
    VPAND Y13, Y14, Y14
    VPAND Y13, Y15, Y15
    VPSHUFB Y14, Y10, Y14
    VPSHUFB Y15, Y11, Y15
    VPXOR Y14, Y15, Y14
    VPXOR Y14, Y12, Y12

    // Store 1 outputs
    VMOVDQU Y12, (R9)
    ADDQ $0x20, R9

    // Prepare for next loop
    DECQ AX
    JNZ mulAvxTwo_6x1_loop
    VZEROUPPER

mulAvxTwo_6x1_end:
    RET

// func mulAvxTwo_6x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, SSE2
TEXT ·mulAvxTwo_6x1_64(SB), $0-88
    // Loading no tables to registers
    // Destination kept on stack
    // Full registers estimated 16 YMM used
    MOVQ n+80(FP), AX
    MOVQ matrix_base+0(FP), CX
    SHRQ $0x06, AX
    TESTQ AX, AX
    JZ mulAvxTwo_6x1_64_end
    MOVQ in_base+24(FP), AX
    MOVQ (AX), DX
    MOVQ 24(AX), BX
    MOVQ 48(AX), SI
    MOVQ 72(AX), DI
    MOVQ 96(AX), R8
    MOVQ 120(AX), AX
    MOVQ out_base+48(FP), R9
    MOVQ out_base+48(FP), R9
    MOVQ start+72(FP), R10

    // Add start offset to input
    ADDQ R10, DX
    ADDQ R10, BX
    ADDQ R10, SI
    ADDQ R10, DI
    ADDQ R10, R8
    ADDQ R10, AX
    MOVQ $0x0000000f, R11
    MOVQ R11, X2
    VPBROADCASTB X2, Y2
    MOVQ n+80(FP), R11
    SHRQ $0x06, R11

mulAvxTwo_6x1_64_loop:
    // Clear 1 outputs
    VPXOR Y0, Y0, Y0
    VPXOR Y1, Y1, Y1

    // Load and process 64 bytes from input 0 to 1 outputs
    VMOVDQU (DX), Y6
    VMOVDQU 32(DX), Y5
    ADDQ $0x40, DX
    VPSRLQ $0x04, Y6, Y7
    VPSRLQ $0x04, Y5, Y8
    VPAND Y2, Y6, Y6
    VPAND Y2, Y5, Y5
    VPAND Y2, Y7, Y7
    VPAND Y2, Y8, Y8
    VMOVDQU (CX), Y3
    VMOVDQU 32(CX), Y4
    VPSHUFB Y5, Y3, Y5
    VPSHUFB Y6, Y3, Y3
    VPSHUFB Y8, Y4, Y6
    VPSHUFB Y7, Y4, Y4
    VPXOR Y3, Y4, Y3
    VPXOR Y5, Y6, Y5
    VPXOR Y3, Y0, Y0
    VPXOR Y5, Y1, Y1

    // Load and process 64 bytes from input 1 to 1 outputs
    VMOVDQU (BX), Y6
    VMOVDQU 32(BX), Y5
    ADDQ $0x40, BX
    VPSRLQ $0x04, Y6, Y7
    VPSRLQ $0x04, Y5, Y8
    VPAND Y2, Y6, Y6
    VPAND Y2, Y5, Y5
    VPAND Y2, Y7, Y7
    VPAND Y2, Y8, Y8
    VMOVDQU 64(CX), Y3
    VMOVDQU 96(CX), Y4
    VPSHUFB Y5, Y3, Y5
    VPSHUFB Y6, Y3, Y3
    VPSHUFB Y8, Y4, Y6
    VPSHUFB Y7, Y4, Y4
    VPXOR Y3, Y4, Y3
    VPXOR Y5, Y6, Y5
    VPXOR Y3, Y0, Y0
    VPXOR Y5, Y1, Y1

    // Load and process 64 bytes from input 2 to 1 outputs
    VMOVDQU (SI), Y6
    VMOVDQU 32(SI), Y5
    ADDQ $0x40, SI
    VPSRLQ $0x04, Y6, Y7
    VPSRLQ $0x04, Y5, Y8
    VPAND Y2, Y6, Y6
    VPAND Y2, Y5, Y5
    VPAND Y2, Y7, Y7
    VPAND Y2, Y8, Y8
    VMOVDQU 128(CX), Y3
    VMOVDQU 160(CX), Y4
    VPSHUFB Y5, Y3, Y5
    VPSHUFB Y6, Y3, Y3
    VPSHUFB Y8, Y4, Y6
    VPSHUFB Y7, Y4, Y4
    VPXOR Y3, Y4, Y3
    VPXOR Y5, Y6, Y5
    VPXOR Y3, Y0, Y0
    VPXOR Y5, Y1, Y1

    // Load and process 64 bytes from input 3 to 1 outputs
    VMOVDQU (DI), Y6
    VMOVDQU 32(DI), Y5
    ADDQ $0x40, DI
    VPSRLQ $0x04, Y6, Y7
    VPSRLQ $0x04, Y5, Y8
    VPAND Y2, Y6, Y6
    VPAND Y2, Y5, Y5
    VPAND Y2, Y7, Y7
    VPAND Y2, Y8, Y8
    VMOVDQU 192(CX), Y3
    VMOVDQU 224(CX), Y4
    VPSHUFB Y5, Y3, Y5
    VPSHUFB Y6, Y3, Y3
    VPSHUFB Y8, Y4, Y6
    VPSHUFB Y7, Y4, Y4
    VPXOR Y3, Y4, Y3
    VPXOR Y5, Y6, Y5
    VPXOR Y3, Y0, Y0
    VPXOR Y5, Y1, Y1

    // Load and process 64 bytes from input 4 to 1 outputs
    VMOVDQU (R8), Y6
    VMOVDQU 32(R8), Y5
    ADDQ $0x40, R8
    VPSRLQ $0x04, Y6, Y7
    VPSRLQ $0x04, Y5, Y8
    VPAND Y2, Y6, Y6
    VPAND Y2, Y5, Y5
    VPAND Y2, Y7, Y7
    VPAND Y2, Y8, Y8
    VMOVDQU 256(CX), Y3
    VMOVDQU 288(CX), Y4
    VPSHUFB Y5, Y3, Y5
    VPSHUFB Y6, Y3, Y3
    VPSHUFB Y8, Y4, Y6
    VPSHUFB Y7, Y4, Y4
    VPXOR Y3, Y4, Y3
    VPXOR Y5, Y6, Y5
    VPXOR Y3, Y0, Y0
    VPXOR Y5, Y1, Y1

    // Load and process 64 bytes from input 5 to 1 outputs
    VMOVDQU (AX), Y6
    VMOVDQU 32(AX), Y5
    ADDQ $0x40, AX
    VPSRLQ $0x04, Y6, Y7
    VPSRLQ $0x04, Y5, Y8
    VPAND Y2, Y6, Y6
    VPAND Y2, Y5, Y5
    VPAND Y2, Y7, Y7
    VPAND Y2, Y8, Y8
    VMOVDQU 320(CX), Y3
    VMOVDQU 352(CX), Y4
    VPSHUFB Y5, Y3, Y5
    VPSHUFB Y6, Y3, Y3
    VPSHUFB Y8, Y4, Y6
    VPSHUFB Y7, Y4, Y4
    VPXOR Y3, Y4, Y3
    VPXOR Y5, Y6, Y5
    VPXOR Y3, Y0, Y0
    VPXOR Y5, Y1, Y1

    // Store 1 outputs
    MOVQ (R9), R12
    VMOVDQU Y0, (R12)(R10*1)
    VMOVDQU Y1, 32(R12)(R10*1)

    // Prepare for next loop
    ADDQ $0x40, R10
    DECQ R11
    JNZ mulAvxTwo_6x1_64_loop
    VZEROUPPER

mulAvxTwo_6x1_64_end:
    RET

// func mulAvxTwo_6x2(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// Requires: AVX, AVX2, SSE2
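// All mulAvxTwo_* kernels in this file share one GF(2^8) multiply
// pattern: each matrix coefficient is stored as two 32-byte VPSHUFB
// tables, one holding the products of all possible low nibbles and one
// of all possible high nibbles. A broadcast 0x0f mask plus VPSRLQ
// $0x04 split every input byte into its two nibbles, each nibble
// selects its partial product via VPSHUFB, and the two halves are
// XORed together and into the accumulator. In Go-style pseudocode
// (illustrative names, not identifiers from this package):
//
//	lo := in & 0x0f
//	hi := (in >> 4) & 0x0f
//	out ^= tblLo[lo] ^ tblHi[hi]
//
// The variants differ only in bookkeeping: "Loading all tables to
// registers" keeps every table resident in YMM registers, "Destination
// kept on stack" reloads each output pointer from out_base on every
// iteration, and the _64 variants process two 32-byte lanes per round,
// which is why they pre-shift n by 6 instead of 5.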
TEXT ·mulAvxTwo_6x2(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 31 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_6x2_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R10 MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, R11 ADDQ R12, R10 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, DX MOVQ $0x0000000f, R12 MOVQ R12, X2 VPBROADCASTB X2, Y2 mulAvxTwo_6x2_loop: // Clear 2 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (BX), Y5 ADDQ $0x20, BX VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 ADDQ $0x20, SI VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 256(CX), Y3 VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 ADDQ $0x20, R8 VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 384(CX), Y3 VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 ADDQ $0x20, R9 VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 512(CX), Y3 VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 640(CX), Y3 VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Store 2 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 VMOVDQU Y1, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ AX JNZ mulAvxTwo_6x2_loop VZEROUPPER mulAvxTwo_6x2_end: RET // func mulAvxTwo_6x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x2_64(SB), $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 31 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_6x2_64_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), 
R8 MOVQ 120(AX), AX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ start+72(FP), R10 // Add start offset to input ADDQ R10, DX ADDQ R10, BX ADDQ R10, SI ADDQ R10, DI ADDQ R10, R8 ADDQ R10, AX MOVQ $0x0000000f, R11 MOVQ R11, X4 VPBROADCASTB X4, Y4 MOVQ n+80(FP), R11 SHRQ $0x06, R11 mulAvxTwo_6x2_64_loop: // Clear 2 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (BX), Y9 VMOVDQU 32(BX), Y11 ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 ADDQ $0x40, DI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 ADDQ $0x40, R8 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (AX), Y9 VMOVDQU 32(AX), Y11 ADDQ $0x40, AX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND 
Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Store 2 outputs MOVQ (R9), R12 VMOVDQU Y0, (R12)(R10*1) VMOVDQU Y1, 32(R12)(R10*1) MOVQ 24(R9), R12 VMOVDQU Y2, (R12)(R10*1) VMOVDQU Y3, 32(R12)(R10*1) // Prepare for next loop ADDQ $0x40, R10 DECQ R11 JNZ mulAvxTwo_6x2_64_loop VZEROUPPER mulAvxTwo_6x2_64_end: RET // func mulAvxTwo_6x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x3(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 44 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_6x3_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R10 MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R11 ADDQ R13, R12 ADDQ R13, R10 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, DX MOVQ $0x0000000f, R13 MOVQ R13, X3 VPBROADCASTB X3, Y3 mulAvxTwo_6x3_loop: // Clear 3 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (BX), Y6 ADDQ $0x20, BX VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU (CX), Y4 VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 ADDQ $0x20, SI VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 192(CX), Y4 VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 384(CX), Y4 VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 576(CX), Y4 VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 
4 to 3 outputs VMOVDQU (R9), Y6 ADDQ $0x20, R9 VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 768(CX), Y4 VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 960(CX), Y4 VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Store 3 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 VMOVDQU Y1, (R12) ADDQ $0x20, R12 VMOVDQU Y2, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ AX JNZ mulAvxTwo_6x3_loop VZEROUPPER mulAvxTwo_6x3_end: RET // func mulAvxTwo_6x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x3_64(SB), $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 44 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_6x3_64_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), AX MOVQ out_base+48(FP), R9 MOVQ out_base+48(FP), R9 MOVQ start+72(FP), R10 // Add start offset to input ADDQ R10, DX ADDQ R10, BX ADDQ R10, SI ADDQ R10, DI ADDQ R10, R8 ADDQ R10, AX MOVQ $0x0000000f, R11 MOVQ R11, X6 VPBROADCASTB X6, Y6 MOVQ n+80(FP), R11 SHRQ $0x06, R11 mulAvxTwo_6x3_64_loop: // Clear 3 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (BX), Y11 VMOVDQU 32(BX), Y13 ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 
VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 ADDQ $0x40, DI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 ADDQ $0x40, R8 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (AX), Y11 VMOVDQU 32(AX), Y13 ADDQ $0x40, AX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Store 3 outputs MOVQ (R9), R12 VMOVDQU Y0, (R12)(R10*1) VMOVDQU Y1, 32(R12)(R10*1) MOVQ 24(R9), R12 VMOVDQU Y2, (R12)(R10*1) VMOVDQU Y3, 32(R12)(R10*1) MOVQ 48(R9), R12 VMOVDQU Y4, (R12)(R10*1) VMOVDQU Y5, 32(R12)(R10*1) // Prepare for next loop ADDQ $0x40, R10 DECQ R11 JNZ 
mulAvxTwo_6x3_64_loop VZEROUPPER mulAvxTwo_6x3_64_end: RET // func mulAvxTwo_6x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x4(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 57 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_6x4_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R13 MOVQ 72(R10), R10 MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, R10 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, DX MOVQ $0x0000000f, R14 MOVQ R14, X4 VPBROADCASTB X4, Y4 mulAvxTwo_6x4_loop: // Clear 4 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 768(CX), Y5 VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1024(CX), Y5 VMOVDQU 1056(CX), Y6 VPSHUFB 
Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1280(CX), Y5 VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Store 4 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 VMOVDQU Y1, (R12) ADDQ $0x20, R12 VMOVDQU Y2, (R13) ADDQ $0x20, R13 VMOVDQU Y3, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ AX JNZ mulAvxTwo_6x4_loop VZEROUPPER mulAvxTwo_6x4_end: RET // func mulAvxTwo_6x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x5(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 70 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_6x5_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R13 MOVQ 72(R10), R14 MOVQ 96(R10), R10 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R10 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, DX MOVQ $0x0000000f, R15 MOVQ R15, X5 VPBROADCASTB X5, Y5 mulAvxTwo_6x5_loop: // Clear 5 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU (CX), Y6 VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 320(CX), Y6 VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 576(CX), Y6 
VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 640(CX), Y6 VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 960(CX), Y6 VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1280(CX), Y6 VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1600(CX), Y6 VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Store 5 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 VMOVDQU Y1, (R12) ADDQ $0x20, R12 VMOVDQU Y2, (R13) ADDQ $0x20, R13 VMOVDQU Y3, (R14) ADDQ $0x20, R14 VMOVDQU Y4, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ AX JNZ mulAvxTwo_6x5_loop VZEROUPPER mulAvxTwo_6x5_end: RET // func mulAvxTwo_6x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x6(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 83 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ 
mulAvxTwo_6x6_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R13 MOVQ 72(R10), R14 MOVQ 96(R10), R15 MOVQ 120(R10), R10 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R10 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, DX MOVQ $0x0000000f, BP MOVQ BP, X6 VPBROADCASTB X6, Y6 mulAvxTwo_6x6_loop: // Clear 6 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 1280(CX), 
Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1536(CX), Y7 VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1920(CX), Y7 VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Store 6 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 VMOVDQU Y1, (R12) ADDQ $0x20, R12 VMOVDQU Y2, (R13) ADDQ $0x20, R13 VMOVDQU Y3, (R14) ADDQ $0x20, R14 VMOVDQU Y4, (R15) ADDQ $0x20, R15 VMOVDQU Y5, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ AX JNZ mulAvxTwo_6x6_loop VZEROUPPER mulAvxTwo_6x6_end: RET // func mulAvxTwo_6x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x7(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 96 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_6x7_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), AX MOVQ out_base+48(FP), R9 MOVQ (R9), R10 MOVQ 24(R9), R11 MOVQ 48(R9), R12 MOVQ 72(R9), R13 MOVQ 96(R9), R14 MOVQ 120(R9), R15 MOVQ 144(R9), R9 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R9 // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, AX MOVQ $0x0000000f, BP MOVQ BP, X7 VPBROADCASTB X7, Y7 MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxTwo_6x7_loop: // Clear 7 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 // Load and process 32 
bytes from input 0 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 448(CX), Y8 VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 896(CX), Y8 VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1344(CX), Y8 VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 
VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1792(CX), Y8 VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (AX), Y10 ADDQ $0x20, AX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 2240(CX), Y8 VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Store 7 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 VMOVDQU Y1, (R11) ADDQ $0x20, R11 VMOVDQU Y2, (R12) ADDQ $0x20, R12 VMOVDQU Y3, (R13) ADDQ $0x20, R13 VMOVDQU Y4, (R14) ADDQ $0x20, R14 VMOVDQU Y5, (R15) ADDQ $0x20, R15 VMOVDQU Y6, (R9) ADDQ $0x20, R9 // Prepare for next loop DECQ BP JNZ mulAvxTwo_6x7_loop VZEROUPPER mulAvxTwo_6x7_end: RET // func mulAvxTwo_6x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x8(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 109 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_6x8_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ start+72(FP), R11 // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, DX MOVQ $0x0000000f, R12 MOVQ R12, X8 VPBROADCASTB X8, Y8 mulAvxTwo_6x8_loop: // Clear 8 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 VPXOR Y7, Y7, Y7 // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, 
Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 512(CX), Y9 VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1024(CX), Y9 VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1536(CX), Y9 VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, 
Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2048(CX), Y9 VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2560(CX), Y9 VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Store 8 outputs MOVQ (R10), R12 VMOVDQU Y0, (R12)(R11*1) MOVQ 24(R10), R12 VMOVDQU Y1, (R12)(R11*1) MOVQ 48(R10), R12 VMOVDQU Y2, (R12)(R11*1) MOVQ 72(R10), R12 VMOVDQU Y3, (R12)(R11*1) MOVQ 96(R10), R12 VMOVDQU Y4, (R12)(R11*1) MOVQ 120(R10), R12 VMOVDQU Y5, (R12)(R11*1) MOVQ 144(R10), R12 VMOVDQU Y6, (R12)(R11*1) MOVQ 168(R10), R12 VMOVDQU Y7, (R12)(R11*1) // Prepare for next loop ADDQ $0x20, R11 DECQ AX JNZ mulAvxTwo_6x8_loop VZEROUPPER mulAvxTwo_6x8_end: RET // func mulAvxTwo_6x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x9(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full 
registers estimated 122 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_6x9_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ start+72(FP), R11 // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, DX MOVQ $0x0000000f, R12 MOVQ R12, X9 VPBROADCASTB X9, Y9 mulAvxTwo_6x9_loop: // Clear 9 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 VPXOR Y7, Y7, Y7 VPXOR Y8, Y8, Y8 // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 576(CX), Y10 VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1152(CX), Y10 VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, 
Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1728(CX), Y10 VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2304(CX), Y10 VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (DX), Y12 ADDQ 
$0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2880(CX), Y10 VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Store 9 outputs MOVQ (R10), R12 VMOVDQU Y0, (R12)(R11*1) MOVQ 24(R10), R12 VMOVDQU Y1, (R12)(R11*1) MOVQ 48(R10), R12 VMOVDQU Y2, (R12)(R11*1) MOVQ 72(R10), R12 VMOVDQU Y3, (R12)(R11*1) MOVQ 96(R10), R12 VMOVDQU Y4, (R12)(R11*1) MOVQ 120(R10), R12 VMOVDQU Y5, (R12)(R11*1) MOVQ 144(R10), R12 VMOVDQU Y6, (R12)(R11*1) MOVQ 168(R10), R12 VMOVDQU Y7, (R12)(R11*1) MOVQ 192(R10), R12 VMOVDQU Y8, (R12)(R11*1) // Prepare for next loop ADDQ $0x20, R11 DECQ AX JNZ mulAvxTwo_6x9_loop VZEROUPPER mulAvxTwo_6x9_end: RET // func mulAvxTwo_6x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x10(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 135 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_6x10_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), DX MOVQ out_base+48(FP), R10 MOVQ start+72(FP), R11 // Add start offset to input ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, DX MOVQ $0x0000000f, R12 MOVQ R12, X10 VPBROADCASTB X10, Y10 mulAvxTwo_6x10_loop: // Clear 10 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 VPXOR Y7, Y7, Y7 VPXOR Y8, Y8, Y8 VPXOR Y9, Y9, Y9 // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB 
Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 640(CX), Y11 VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1280(CX), Y11 VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1920(CX), Y11 VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 1984(CX), 
Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 2560(CX), Y11 VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 3200(CX), Y11 VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, 
Y6, Y6 VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Store 10 outputs MOVQ (R10), R12 VMOVDQU Y0, (R12)(R11*1) MOVQ 24(R10), R12 VMOVDQU Y1, (R12)(R11*1) MOVQ 48(R10), R12 VMOVDQU Y2, (R12)(R11*1) MOVQ 72(R10), R12 VMOVDQU Y3, (R12)(R11*1) MOVQ 96(R10), R12 VMOVDQU Y4, (R12)(R11*1) MOVQ 120(R10), R12 VMOVDQU Y5, (R12)(R11*1) MOVQ 144(R10), R12 VMOVDQU Y6, (R12)(R11*1) MOVQ 168(R10), R12 VMOVDQU Y7, (R12)(R11*1) MOVQ 192(R10), R12 VMOVDQU Y8, (R12)(R11*1) MOVQ 216(R10), R12 VMOVDQU Y9, (R12)(R11*1) // Prepare for next loop ADDQ $0x20, R11 DECQ AX JNZ mulAvxTwo_6x10_loop VZEROUPPER mulAvxTwo_6x10_end: RET // func mulAvxTwo_7x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x1(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 18 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_7x1_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ (R11), R11 MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, R11 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX MOVQ $0x0000000f, R12 MOVQ R12, X1 VPBROADCASTB X1, Y1 mulAvxTwo_7x1_loop: // Clear 1 outputs VPXOR Y0, Y0, Y0 // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (BX), Y4 ADDQ $0x20, BX VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU (CX), Y2 VMOVDQU 32(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (SI), Y4 ADDQ $0x20, SI VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 64(CX), Y2 VMOVDQU 96(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (DI), Y4 ADDQ $0x20, DI VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 128(CX), Y2 VMOVDQU 160(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (R8), Y4 ADDQ $0x20, R8 VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 192(CX), Y2 VMOVDQU 224(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R9), Y4 ADDQ $0x20, R9 VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 256(CX), Y2 VMOVDQU 288(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R10), Y4 ADDQ $0x20, R10 VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 320(CX), Y2 VMOVDQU 352(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (DX), Y4 ADDQ $0x20, DX VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 384(CX), Y2 VMOVDQU 416(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 
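	// Each pass of mulAvxTwo_7x1_loop above multiplies 32 bytes from all
	// seven inputs by their GF(2^8) matrix coefficients and XORs the
	// products into the single accumulator Y0: VPAND with the broadcast
	// 0x0f mask in Y1 isolates the low nibble of every byte, VPSRLQ $0x04
	// plus the same mask isolates the high nibble, and VPSHUFB uses each
	// nibble vector as indices into a pair of 32-byte lookup tables of
	// precomputed partial products (16-byte tables duplicated into both
	// 128-bit lanes, since VPSHUFB shuffles within each lane).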
// Store 1 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 // Prepare for next loop DECQ AX JNZ mulAvxTwo_7x1_loop VZEROUPPER mulAvxTwo_7x1_end: RET // func mulAvxTwo_7x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x1_64(SB), $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 18 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_7x1_64_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), AX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ start+72(FP), R11 // Add start offset to input ADDQ R11, DX ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, AX MOVQ $0x0000000f, R12 MOVQ R12, X2 VPBROADCASTB X2, Y2 MOVQ n+80(FP), R12 SHRQ $0x06, R12 mulAvxTwo_7x1_64_loop: // Clear 1 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (BX), Y6 VMOVDQU 32(BX), Y5 ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 ADDQ $0x40, DI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 ADDQ $0x40, R8 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 256(CX), Y3 VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 ADDQ $0x40, R9 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (AX), Y6 VMOVDQU 32(AX), Y5 ADDQ $0x40, AX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND 
Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 384(CX), Y3 VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Store 1 outputs MOVQ (R10), R13 VMOVDQU Y0, (R13)(R11*1) VMOVDQU Y1, 32(R13)(R11*1) // Prepare for next loop ADDQ $0x40, R11 DECQ R12 JNZ mulAvxTwo_7x1_64_loop VZEROUPPER mulAvxTwo_7x1_64_end: RET // func mulAvxTwo_7x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x2(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 35 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_7x2_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R11 MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R12 ADDQ R13, R11 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, DX MOVQ $0x0000000f, R13 MOVQ R13, X2 VPBROADCASTB X2, Y2 mulAvxTwo_7x2_loop: // Clear 2 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (BX), Y5 ADDQ $0x20, BX VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 ADDQ $0x20, SI VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 256(CX), Y3 VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 ADDQ $0x20, R8 VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 384(CX), Y3 VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 ADDQ $0x20, R9 VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 512(CX), Y3 VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y5 ADDQ $0x20, R10 VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 640(CX), Y3 VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 6 to 2 
outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 768(CX), Y3 VMOVDQU 800(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 832(CX), Y3 VMOVDQU 864(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Store 2 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 VMOVDQU Y1, (R11) ADDQ $0x20, R11 // Prepare for next loop DECQ AX JNZ mulAvxTwo_7x2_loop VZEROUPPER mulAvxTwo_7x2_end: RET // func mulAvxTwo_7x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x2_64(SB), $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 35 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_7x2_64_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), AX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ start+72(FP), R11 // Add start offset to input ADDQ R11, DX ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, AX MOVQ $0x0000000f, R12 MOVQ R12, X4 VPBROADCASTB X4, Y4 MOVQ n+80(FP), R12 SHRQ $0x06, R12 mulAvxTwo_7x2_64_loop: // Clear 2 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (BX), Y9 VMOVDQU 32(BX), Y11 ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 ADDQ $0x40, DI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR 
Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 ADDQ $0x40, R8 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 ADDQ $0x40, R9 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (AX), Y9 VMOVDQU 32(AX), Y11 ADDQ $0x40, AX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 768(CX), Y5 VMOVDQU 800(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Store 2 outputs MOVQ (R10), R13 VMOVDQU Y0, (R13)(R11*1) VMOVDQU Y1, 32(R13)(R11*1) MOVQ 24(R10), R13 VMOVDQU Y2, (R13)(R11*1) VMOVDQU Y3, 32(R13)(R11*1) // Prepare for next loop ADDQ $0x40, R11 DECQ R12 JNZ mulAvxTwo_7x2_64_loop VZEROUPPER mulAvxTwo_7x2_64_end: RET // func mulAvxTwo_7x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x3(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 50 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_7x3_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R13 MOVQ 48(R11), R11 MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R12 ADDQ R14, R13 ADDQ R14, R11 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, DX MOVQ $0x0000000f, R14 MOVQ R14, X3 VPBROADCASTB X3, Y3 mulAvxTwo_7x3_loop: // Clear 3 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (BX), Y6 ADDQ $0x20, BX VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU (CX), Y4 VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, 
Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 ADDQ $0x20, SI VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 192(CX), Y4 VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 384(CX), Y4 VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 576(CX), Y4 VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y6 ADDQ $0x20, R9 VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 768(CX), Y4 VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R10), Y6 ADDQ $0x20, R10 VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 960(CX), Y4 VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 1152(CX), Y4 VMOVDQU 1184(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 1216(CX), Y4 VMOVDQU 1248(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 1280(CX), Y4 VMOVDQU 1312(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Store 3 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 VMOVDQU Y1, (R13) ADDQ $0x20, R13 VMOVDQU Y2, (R11) ADDQ $0x20, R11 // Prepare for next loop DECQ AX JNZ mulAvxTwo_7x3_loop VZEROUPPER mulAvxTwo_7x3_end: RET // func mulAvxTwo_7x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x3_64(SB), $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 50 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ 
mulAvxTwo_7x3_64_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), AX MOVQ out_base+48(FP), R10 MOVQ out_base+48(FP), R10 MOVQ start+72(FP), R11 // Add start offset to input ADDQ R11, DX ADDQ R11, BX ADDQ R11, SI ADDQ R11, DI ADDQ R11, R8 ADDQ R11, R9 ADDQ R11, AX MOVQ $0x0000000f, R12 MOVQ R12, X6 VPBROADCASTB X6, Y6 MOVQ n+80(FP), R12 SHRQ $0x06, R12 mulAvxTwo_7x3_64_loop: // Clear 3 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (BX), Y11 VMOVDQU 32(BX), Y13 ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 ADDQ $0x40, DI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, 
Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 ADDQ $0x40, R8 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 ADDQ $0x40, R9 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (AX), Y11 VMOVDQU 32(AX), Y13 ADDQ $0x40, AX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Store 3 outputs MOVQ (R10), R13 VMOVDQU Y0, (R13)(R11*1) VMOVDQU Y1, 32(R13)(R11*1) MOVQ 24(R10), R13 VMOVDQU Y2, (R13)(R11*1) VMOVDQU Y3, 32(R13)(R11*1) MOVQ 48(R10), R13 VMOVDQU Y4, (R13)(R11*1) VMOVDQU Y5, 32(R13)(R11*1) // Prepare for next loop ADDQ $0x40, R11 DECQ R12 JNZ mulAvxTwo_7x3_64_loop VZEROUPPER mulAvxTwo_7x3_64_end: RET // func mulAvxTwo_7x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x4(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 65 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_7x4_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R13 MOVQ 48(R11), R14 MOVQ 
72(R11), R11 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R11 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, DX MOVQ $0x0000000f, R15 MOVQ R15, X4 VPBROADCASTB X4, Y4 mulAvxTwo_7x4_loop: // Clear 4 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 768(CX), Y5 VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1024(CX), Y5 VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y7 ADDQ $0x20, R10 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1280(CX), Y5 VMOVDQU 1312(CX), Y6 VPSHUFB Y7, 
Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1536(CX), Y5 VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Store 4 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 VMOVDQU Y1, (R13) ADDQ $0x20, R13 VMOVDQU Y2, (R14) ADDQ $0x20, R14 VMOVDQU Y3, (R11) ADDQ $0x20, R11 // Prepare for next loop DECQ AX JNZ mulAvxTwo_7x4_loop VZEROUPPER mulAvxTwo_7x4_end: RET // func mulAvxTwo_7x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x5(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 80 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_7x5_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R13 MOVQ 48(R11), R14 MOVQ 72(R11), R15 MOVQ 96(R11), R11 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R11 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, DX MOVQ $0x0000000f, BP MOVQ BP, X5 VPBROADCASTB X5, Y5 mulAvxTwo_7x5_loop: // Clear 5 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU (CX), Y6 VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 320(CX), Y6 VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 
576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 640(CX), Y6 VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 960(CX), Y6 VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1280(CX), Y6 VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y8 ADDQ $0x20, R10 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1600(CX), Y6 VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1920(CX), Y6 VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 
2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Store 5 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 VMOVDQU Y1, (R13) ADDQ $0x20, R13 VMOVDQU Y2, (R14) ADDQ $0x20, R14 VMOVDQU Y3, (R15) ADDQ $0x20, R15 VMOVDQU Y4, (R11) ADDQ $0x20, R11 // Prepare for next loop DECQ AX JNZ mulAvxTwo_7x5_loop VZEROUPPER mulAvxTwo_7x5_end: RET // func mulAvxTwo_7x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x6(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 95 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_7x6_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), AX MOVQ out_base+48(FP), R10 MOVQ (R10), R11 MOVQ 24(R10), R12 MOVQ 48(R10), R13 MOVQ 72(R10), R14 MOVQ 96(R10), R15 MOVQ 120(R10), R10 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R10 // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, AX MOVQ $0x0000000f, BP MOVQ BP, X6 VPBROADCASTB X6, Y6 MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxTwo_7x6_loop: // Clear 6 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 
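// Note: the sequence above is one GF(2^8) multiply-accumulate step using the
// SSSE3/AVX2 nibble-table method: VPAND with the broadcast 0x0f mask and
// VPSRLQ $0x04 split each input byte into its low and high 4-bit halves,
// VPSHUFB uses each half to index a 32-byte table of partial products, and
// the two lookups are XORed together and then XORed into the running output
// register.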
VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1536(CX), Y7 VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1920(CX), Y7 VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (AX), Y9 ADDQ $0x20, AX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 2304(CX), Y7 VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 
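// Note: the lookup tables loaded above are laid out as consecutive 64-byte
// pairs (32 bytes of low-nibble products followed by 32 bytes of high-nibble
// products), one pair per input/output combination, so the pair for input i
// and output o of an N-output kernel starts at (i*N+o)*64 from the matrix
// base in CX (here input 6, output 4 of the 6-output kernel: (6*6+4)*64 =
// 2560).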
VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Store 6 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 VMOVDQU Y1, (R12) ADDQ $0x20, R12 VMOVDQU Y2, (R13) ADDQ $0x20, R13 VMOVDQU Y3, (R14) ADDQ $0x20, R14 VMOVDQU Y4, (R15) ADDQ $0x20, R15 VMOVDQU Y5, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ BP JNZ mulAvxTwo_7x6_loop VZEROUPPER mulAvxTwo_7x6_end: RET // func mulAvxTwo_7x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x7(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 110 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_7x7_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX MOVQ $0x0000000f, R13 MOVQ R13, X7 VPBROADCASTB X7, Y7 mulAvxTwo_7x7_loop: // Clear 7 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 448(CX), Y8 VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 896(CX), Y8 VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 960(CX), Y8 VMOVDQU 
992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1344(CX), Y8 VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1792(CX), Y8 VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 2240(CX), Y8 VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // 
Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 2688(CX), Y8 VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Store 7 outputs MOVQ (R11), R13 VMOVDQU Y0, (R13)(R12*1) MOVQ 24(R11), R13 VMOVDQU Y1, (R13)(R12*1) MOVQ 48(R11), R13 VMOVDQU Y2, (R13)(R12*1) MOVQ 72(R11), R13 VMOVDQU Y3, (R13)(R12*1) MOVQ 96(R11), R13 VMOVDQU Y4, (R13)(R12*1) MOVQ 120(R11), R13 VMOVDQU Y5, (R13)(R12*1) MOVQ 144(R11), R13 VMOVDQU Y6, (R13)(R12*1) // Prepare for next loop ADDQ $0x20, R12 DECQ AX JNZ mulAvxTwo_7x7_loop VZEROUPPER mulAvxTwo_7x7_end: RET // func mulAvxTwo_7x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x8(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 125 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_7x8_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX MOVQ $0x0000000f, R13 MOVQ R13, X8 VPBROADCASTB X8, Y8 mulAvxTwo_7x8_loop: // Clear 8 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 VPXOR Y7, Y7, Y7 // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 512(CX), Y9 VMOVDQU 544(CX), Y10 VPSHUFB 
Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1024(CX), Y9 VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1536(CX), Y9 VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2048(CX), Y9 VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB 
Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 ADDQ $0x20, R10 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2560(CX), Y9 VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 3072(CX), Y9 VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Store 8 outputs MOVQ (R11), R13 VMOVDQU Y0, (R13)(R12*1) MOVQ 24(R11), R13 VMOVDQU Y1, (R13)(R12*1) MOVQ 48(R11), R13 VMOVDQU Y2, (R13)(R12*1) MOVQ 72(R11), R13 VMOVDQU Y3, (R13)(R12*1) MOVQ 96(R11), R13 VMOVDQU Y4, (R13)(R12*1) MOVQ 120(R11), R13 VMOVDQU Y5, (R13)(R12*1) MOVQ 144(R11), R13 VMOVDQU Y6, (R13)(R12*1) MOVQ 168(R11), R13 VMOVDQU Y7, (R13)(R12*1) // Prepare for next loop ADDQ $0x20, R12 DECQ AX JNZ mulAvxTwo_7x8_loop VZEROUPPER mulAvxTwo_7x8_end: RET // func mulAvxTwo_7x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x9(SB), NOSPLIT, $0-88 // Loading no tables to registers // 
Destination kept on stack // Full registers estimated 140 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_7x9_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), DX MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX MOVQ $0x0000000f, R13 MOVQ R13, X9 VPBROADCASTB X9, Y9 mulAvxTwo_7x9_loop: // Clear 9 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 VPXOR Y7, Y7, Y7 VPXOR Y8, Y8, Y8 // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 576(CX), Y10 VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1152(CX), Y10 VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 1216(CX), Y10 VMOVDQU 
1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1728(CX), Y10 VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2304(CX), Y10 VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load 
and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 ADDQ $0x20, R10 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2880(CX), Y10 VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 3456(CX), Y10 VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Store 9 outputs MOVQ (R11), R13 VMOVDQU Y0, (R13)(R12*1) MOVQ 24(R11), R13 VMOVDQU Y1, (R13)(R12*1) MOVQ 48(R11), R13 VMOVDQU Y2, (R13)(R12*1) MOVQ 72(R11), R13 VMOVDQU Y3, (R13)(R12*1) MOVQ 96(R11), R13 VMOVDQU Y4, (R13)(R12*1) MOVQ 120(R11), R13 VMOVDQU Y5, (R13)(R12*1) MOVQ 144(R11), R13 VMOVDQU Y6, (R13)(R12*1) MOVQ 168(R11), R13 VMOVDQU Y7, (R13)(R12*1) MOVQ 192(R11), R13 VMOVDQU Y8, (R13)(R12*1) // Prepare for next loop ADDQ $0x20, R12 DECQ AX JNZ mulAvxTwo_7x9_loop VZEROUPPER mulAvxTwo_7x9_end: RET // func mulAvxTwo_7x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x10(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 155 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_7x10_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 
144(DX), DX MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, DX MOVQ $0x0000000f, R13 MOVQ R13, X10 VPBROADCASTB X10, Y10 mulAvxTwo_7x10_loop: // Clear 10 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 VPXOR Y7, Y7, Y7 VPXOR Y8, Y8, Y8 VPXOR Y9, Y9, Y9 // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 640(CX), Y11 VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1280(CX), Y11 VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), 
Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1920(CX), Y11 VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 2560(CX), Y11 VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 
3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 ADDQ $0x20, R10 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 3200(CX), Y11 VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 3840(CX), Y11 VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Store 10 outputs MOVQ (R11), R13 VMOVDQU Y0, (R13)(R12*1) MOVQ 24(R11), R13 VMOVDQU Y1, (R13)(R12*1) MOVQ 48(R11), R13 VMOVDQU Y2, (R13)(R12*1) MOVQ 72(R11), R13 VMOVDQU Y3, (R13)(R12*1) MOVQ 96(R11), R13 VMOVDQU Y4, (R13)(R12*1) MOVQ 120(R11), R13 VMOVDQU Y5, (R13)(R12*1) MOVQ 144(R11), R13 VMOVDQU Y6, (R13)(R12*1) MOVQ 168(R11), R13 VMOVDQU Y7, 
(R13)(R12*1) MOVQ 192(R11), R13 VMOVDQU Y8, (R13)(R12*1) MOVQ 216(R11), R13 VMOVDQU Y9, (R13)(R12*1) // Prepare for next loop ADDQ $0x20, R12 DECQ AX JNZ mulAvxTwo_7x10_loop VZEROUPPER mulAvxTwo_7x10_end: RET // func mulAvxTwo_8x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x1(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 20 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_8x1_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ (R12), R12 MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R12 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX MOVQ $0x0000000f, R13 MOVQ R13, X1 VPBROADCASTB X1, Y1 mulAvxTwo_8x1_loop: // Clear 1 outputs VPXOR Y0, Y0, Y0 // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (BX), Y4 ADDQ $0x20, BX VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU (CX), Y2 VMOVDQU 32(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (SI), Y4 ADDQ $0x20, SI VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 64(CX), Y2 VMOVDQU 96(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (DI), Y4 ADDQ $0x20, DI VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 128(CX), Y2 VMOVDQU 160(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (R8), Y4 ADDQ $0x20, R8 VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 192(CX), Y2 VMOVDQU 224(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R9), Y4 ADDQ $0x20, R9 VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 256(CX), Y2 VMOVDQU 288(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R10), Y4 ADDQ $0x20, R10 VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 320(CX), Y2 VMOVDQU 352(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (R11), Y4 ADDQ $0x20, R11 VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 384(CX), Y2 VMOVDQU 416(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 7 to 1 outputs VMOVDQU (DX), Y4 ADDQ $0x20, DX VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 448(CX), Y2 VMOVDQU 480(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Store 1 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 // Prepare for next loop DECQ AX JNZ mulAvxTwo_8x1_loop VZEROUPPER mulAvxTwo_8x1_end: RET // func mulAvxTwo_8x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x1_64(SB), $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 20 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, 
AX TESTQ AX, AX JZ mulAvxTwo_8x1_64_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), R10 MOVQ 168(AX), AX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, DX ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, AX MOVQ $0x0000000f, R13 MOVQ R13, X2 VPBROADCASTB X2, Y2 MOVQ n+80(FP), R13 SHRQ $0x06, R13 mulAvxTwo_8x1_64_loop: // Clear 1 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (BX), Y6 VMOVDQU 32(BX), Y5 ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 ADDQ $0x40, DI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 ADDQ $0x40, R8 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 256(CX), Y3 VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 ADDQ $0x40, R9 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (R10), Y6 VMOVDQU 32(R10), Y5 ADDQ $0x40, R10 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 384(CX), Y3 VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU (AX), Y6 VMOVDQU 32(AX), Y5 ADDQ $0x40, AX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND 
Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Store 1 outputs MOVQ (R11), R14 VMOVDQU Y0, (R14)(R12*1) VMOVDQU Y1, 32(R14)(R12*1) // Prepare for next loop ADDQ $0x40, R12 DECQ R13 JNZ mulAvxTwo_8x1_64_loop VZEROUPPER mulAvxTwo_8x1_64_end: RET // func mulAvxTwo_8x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x2(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 39 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_8x2_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R12 MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R13 ADDQ R14, R12 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, DX MOVQ $0x0000000f, R14 MOVQ R14, X2 VPBROADCASTB X2, Y2 mulAvxTwo_8x2_loop: // Clear 2 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (BX), Y5 ADDQ $0x20, BX VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 ADDQ $0x20, SI VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 256(CX), Y3 VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 ADDQ $0x20, R8 VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 384(CX), Y3 VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 ADDQ $0x20, R9 VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 512(CX), Y3 VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y5 ADDQ $0x20, R10 VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 640(CX), Y3 VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from 
input 6 to 2 outputs VMOVDQU (R11), Y5 ADDQ $0x20, R11 VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 768(CX), Y3 VMOVDQU 800(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 832(CX), Y3 VMOVDQU 864(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 7 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 896(CX), Y3 VMOVDQU 928(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 960(CX), Y3 VMOVDQU 992(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Store 2 outputs VMOVDQU Y0, (R13) ADDQ $0x20, R13 VMOVDQU Y1, (R12) ADDQ $0x20, R12 // Prepare for next loop DECQ AX JNZ mulAvxTwo_8x2_loop VZEROUPPER mulAvxTwo_8x2_end: RET // func mulAvxTwo_8x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x2_64(SB), $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 39 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_8x2_64_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), R10 MOVQ 168(AX), AX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, DX ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, AX MOVQ $0x0000000f, R13 MOVQ R13, X4 VPBROADCASTB X4, Y4 MOVQ n+80(FP), R13 SHRQ $0x06, R13 mulAvxTwo_8x2_64_loop: // Clear 2 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (BX), Y9 VMOVDQU 32(BX), Y11 ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR 
Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 ADDQ $0x40, DI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 ADDQ $0x40, R8 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 ADDQ $0x40, R9 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (R10), Y9 VMOVDQU 32(R10), Y11 ADDQ $0x40, R10 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 768(CX), Y5 VMOVDQU 800(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU (AX), Y9 VMOVDQU 32(AX), Y11 ADDQ $0x40, AX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Store 2 outputs MOVQ (R11), R14 VMOVDQU Y0, (R14)(R12*1) VMOVDQU Y1, 32(R14)(R12*1) MOVQ 24(R11), R14 VMOVDQU Y2, (R14)(R12*1) VMOVDQU Y3, 32(R14)(R12*1) // Prepare for next loop ADDQ $0x40, R12 DECQ R13 JNZ mulAvxTwo_8x2_64_loop VZEROUPPER mulAvxTwo_8x2_64_end: RET // func mulAvxTwo_8x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x3(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 56 
YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_8x3_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R14 MOVQ 48(R12), R12 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R13 ADDQ R15, R14 ADDQ R15, R12 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, DX MOVQ $0x0000000f, R15 MOVQ R15, X3 VPBROADCASTB X3, Y3 mulAvxTwo_8x3_loop: // Clear 3 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (BX), Y6 ADDQ $0x20, BX VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU (CX), Y4 VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 ADDQ $0x20, SI VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 192(CX), Y4 VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 384(CX), Y4 VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 576(CX), Y4 VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y6 ADDQ $0x20, R9 VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 768(CX), Y4 VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R10), Y6 ADDQ $0x20, R10 VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 960(CX), Y4 VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, 
Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (R11), Y6 ADDQ $0x20, R11 VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 1152(CX), Y4 VMOVDQU 1184(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 1216(CX), Y4 VMOVDQU 1248(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 1280(CX), Y4 VMOVDQU 1312(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 7 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 1344(CX), Y4 VMOVDQU 1376(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 1408(CX), Y4 VMOVDQU 1440(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 1472(CX), Y4 VMOVDQU 1504(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Store 3 outputs VMOVDQU Y0, (R13) ADDQ $0x20, R13 VMOVDQU Y1, (R14) ADDQ $0x20, R14 VMOVDQU Y2, (R12) ADDQ $0x20, R12 // Prepare for next loop DECQ AX JNZ mulAvxTwo_8x3_loop VZEROUPPER mulAvxTwo_8x3_end: RET // func mulAvxTwo_8x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x3_64(SB), $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 56 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_8x3_64_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), R10 MOVQ 168(AX), AX MOVQ out_base+48(FP), R11 MOVQ out_base+48(FP), R11 MOVQ start+72(FP), R12 // Add start offset to input ADDQ R12, DX ADDQ R12, BX ADDQ R12, SI ADDQ R12, DI ADDQ R12, R8 ADDQ R12, R9 ADDQ R12, R10 ADDQ R12, AX MOVQ $0x0000000f, R13 MOVQ R13, X6 VPBROADCASTB X6, Y6 MOVQ n+80(FP), R13 SHRQ $0x06, R13 mulAvxTwo_8x3_64_loop: // Clear 3 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (BX), Y11 VMOVDQU 32(BX), Y13 ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 
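// Note: in the _64 kernels each logical output occupies two YMM accumulators,
// one per 32-byte half of the 64-byte block (Y0/Y1, Y2/Y3 and Y4/Y5 for the
// three outputs of this 8x3_64 kernel; the pair just updated, Y2/Y3, is
// output 1). This is also why the loop counter is derived with SHRQ $0x06
// (n/64) instead of the SHRQ $0x05 (n/32) used by the 32-byte kernels.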
VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 ADDQ $0x40, DI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 ADDQ $0x40, R8 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 ADDQ $0x40, R9 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (R10), Y11 VMOVDQU 32(R10), Y13 ADDQ $0x40, R10 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 
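// Note: the VPSRLQ $0x04 shifts above move each byte's high nibble into its
// low four bits; the VPAND masks below (Y6 holds the broadcast 0x0f byte)
// then isolate the low and high nibbles so each can index a 16-entry lookup
// table replicated across both 16-byte lanes of a YMM register.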
VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU (AX), Y11 VMOVDQU 32(AX), Y13 ADDQ $0x40, AX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Store 3 outputs MOVQ (R11), R14 VMOVDQU Y0, (R14)(R12*1) VMOVDQU Y1, 32(R14)(R12*1) MOVQ 24(R11), R14 VMOVDQU Y2, (R14)(R12*1) VMOVDQU Y3, 32(R14)(R12*1) MOVQ 48(R11), R14 VMOVDQU Y4, (R14)(R12*1) VMOVDQU Y5, 32(R14)(R12*1) // Prepare for next loop ADDQ $0x40, R12 DECQ R13 JNZ mulAvxTwo_8x3_64_loop VZEROUPPER mulAvxTwo_8x3_64_end: RET // func mulAvxTwo_8x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x4(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 73 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_8x4_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R14 MOVQ 48(R12), R15 MOVQ 72(R12), R12 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R12 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, DX MOVQ $0x0000000f, BP MOVQ BP, X4 VPBROADCASTB X4, Y4 mulAvxTwo_8x4_loop: // Clear 4 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 256(CX), Y5 VMOVDQU 
288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 768(CX), Y5 VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1024(CX), Y5 VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y7 ADDQ $0x20, R10 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1280(CX), Y5 VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R11), Y7 ADDQ $0x20, R11 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1536(CX), Y5 VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 
1792(CX), Y5 VMOVDQU 1824(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 1856(CX), Y5 VMOVDQU 1888(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 1920(CX), Y5 VMOVDQU 1952(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 1984(CX), Y5 VMOVDQU 2016(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Store 4 outputs VMOVDQU Y0, (R13) ADDQ $0x20, R13 VMOVDQU Y1, (R14) ADDQ $0x20, R14 VMOVDQU Y2, (R15) ADDQ $0x20, R15 VMOVDQU Y3, (R12) ADDQ $0x20, R12 // Prepare for next loop DECQ AX JNZ mulAvxTwo_8x4_loop VZEROUPPER mulAvxTwo_8x4_end: RET // func mulAvxTwo_8x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x5(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 90 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_8x5_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), R10 MOVQ 168(AX), AX MOVQ out_base+48(FP), R11 MOVQ (R11), R12 MOVQ 24(R11), R13 MOVQ 48(R11), R14 MOVQ 72(R11), R15 MOVQ 96(R11), R11 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R11 // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, AX MOVQ $0x0000000f, BP MOVQ BP, X5 VPBROADCASTB X5, Y5 MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxTwo_8x5_loop: // Clear 5 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU (CX), Y6 VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 320(CX), Y6 VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 640(CX), Y6 VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 768(CX), Y6 
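// Note: lookup tables are stored in the matrix slice as adjacent 32-byte
// pairs, one pair per matrix coefficient: the block just loaded into Y6
// (768(CX)) maps low nibbles, and the following block (800(CX)) maps high
// nibbles of the same GF(2^8) multiplication.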
VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 960(CX), Y6 VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1280(CX), Y6 VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1600(CX), Y6 VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R10), Y8 ADDQ $0x20, R10 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1920(CX), Y6 VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (AX), Y8 ADDQ $0x20, AX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 2240(CX), Y6 VMOVDQU 2272(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 2304(CX), Y6 VMOVDQU 2336(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 2368(CX), Y6 
VMOVDQU 2400(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 2432(CX), Y6 VMOVDQU 2464(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 2496(CX), Y6 VMOVDQU 2528(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Store 5 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 VMOVDQU Y1, (R13) ADDQ $0x20, R13 VMOVDQU Y2, (R14) ADDQ $0x20, R14 VMOVDQU Y3, (R15) ADDQ $0x20, R15 VMOVDQU Y4, (R11) ADDQ $0x20, R11 // Prepare for next loop DECQ BP JNZ mulAvxTwo_8x5_loop VZEROUPPER mulAvxTwo_8x5_end: RET // func mulAvxTwo_8x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x6(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 107 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_8x6_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX MOVQ $0x0000000f, R14 MOVQ R14, X6 VPBROADCASTB X6, Y6 mulAvxTwo_8x6_loop: // Clear 6 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 
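// Note: each VPSHUFB performs 32 parallel 4-bit table lookups; XORing the
// low- and high-nibble results reconstructs the full GF(2^8) product, which
// the trailing VPXOR folds into the output accumulator (Y2 for the table
// pair in flight here).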
VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1536(CX), Y7 VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y9 ADDQ $0x20, R10 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1920(CX), Y7 VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y9 ADDQ $0x20, R11 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 2304(CX), Y7 VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 
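// Note: table addressing follows offset = 64*(input*outputs + output); the
// 2560(CX)/2592(CX) pair just loaded corresponds to input 6, output 4 of
// this 8x6 kernel.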
VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 2688(CX), Y7 VMOVDQU 2720(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 2752(CX), Y7 VMOVDQU 2784(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 2816(CX), Y7 VMOVDQU 2848(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 2880(CX), Y7 VMOVDQU 2912(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 2944(CX), Y7 VMOVDQU 2976(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 3008(CX), Y7 VMOVDQU 3040(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Store 6 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU Y1, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU Y2, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU Y3, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU Y4, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU Y5, (R14)(R13*1) // Prepare for next loop ADDQ $0x20, R13 DECQ AX JNZ mulAvxTwo_8x6_loop VZEROUPPER mulAvxTwo_8x6_end: RET // func mulAvxTwo_8x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x7(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 124 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_8x7_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX MOVQ $0x0000000f, R14 MOVQ R14, X7 VPBROADCASTB X7, Y7 mulAvxTwo_8x7_loop: // Clear 7 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 448(CX), Y8 VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 
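// Editor's sketch: in scalar Go, one (input, output) step of these kernels
// computes roughly the loop below. lowTbl and highTbl stand for the two
// 16-entry nibble tables described above; the names are illustrative, not
// identifiers from this package.
//
//	for i, v := range in {
//		// GF(2^8) product of v and the matrix coefficient,
//		// accumulated into the output shard with XOR.
//		out[i] ^= lowTbl[v&0x0f] ^ highTbl[v>>4]
//	}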
VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 896(CX), Y8 VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1344(CX), Y8 VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1792(CX), Y8 VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes 
from input 5 to 7 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 2240(CX), Y8 VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y10 ADDQ $0x20, R11 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 2688(CX), Y8 VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 3136(CX), Y8 VMOVDQU 3168(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 3200(CX), Y8 VMOVDQU 3232(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 3264(CX), Y8 VMOVDQU 3296(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 3328(CX), Y8 VMOVDQU 3360(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 3392(CX), Y8 VMOVDQU 3424(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 3456(CX), Y8 VMOVDQU 3488(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 3520(CX), Y8 VMOVDQU 3552(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Store 7 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU Y1, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU Y2, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU Y3, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU Y4, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU Y5, (R14)(R13*1) MOVQ 144(R12), R14 VMOVDQU Y6, (R14)(R13*1) // Prepare for next loop ADDQ $0x20, R13 DECQ AX JNZ mulAvxTwo_8x7_loop VZEROUPPER mulAvxTwo_8x7_end: RET // func mulAvxTwo_8x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x8(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 141 YMM used MOVQ n+80(FP), 
AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_8x8_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX MOVQ $0x0000000f, R14 MOVQ R14, X8 VPBROADCASTB X8, Y8 mulAvxTwo_8x8_loop: // Clear 8 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 VPXOR Y7, Y7, Y7 // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 512(CX), Y9 VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1024(CX), Y9 VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, 
Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1536(CX), Y9 VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2048(CX), Y9 VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 ADDQ $0x20, R10 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2560(CX), Y9 VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, 
Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y11 ADDQ $0x20, R11 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 3072(CX), Y9 VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 3584(CX), Y9 VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 3648(CX), Y9 VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 3712(CX), Y9 VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 3776(CX), Y9 VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 3840(CX), Y9 VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 3904(CX), Y9 VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 3968(CX), Y9 VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 4032(CX), Y9 VMOVDQU 4064(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Store 8 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU Y1, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU Y2, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU Y3, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU Y4, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU Y5, (R14)(R13*1) MOVQ 144(R12), R14 VMOVDQU Y6, (R14)(R13*1) MOVQ 168(R12), R14 VMOVDQU Y7, (R14)(R13*1) // Prepare for next loop ADDQ $0x20, R13 DECQ AX JNZ mulAvxTwo_8x8_loop VZEROUPPER mulAvxTwo_8x8_end: RET // func mulAvxTwo_8x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x9(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 158 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_8x9_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX MOVQ $0x0000000f, 
R14 MOVQ R14, X9 VPBROADCASTB X9, Y9 mulAvxTwo_8x9_loop: // Clear 9 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 VPXOR Y7, Y7, Y7 VPXOR Y8, Y8, Y8 // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 576(CX), Y10 VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1152(CX), Y10 VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 1472(CX), 
Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1728(CX), Y10 VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2304(CX), Y10 VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 ADDQ $0x20, R10 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2880(CX), Y10 VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, 
Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y12 ADDQ $0x20, R11 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 3456(CX), Y10 VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 4032(CX), Y10 VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 4096(CX), Y10 VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 4160(CX), Y10 VMOVDQU 4192(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 4224(CX), Y10 VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 4288(CX), Y10 VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 4352(CX), Y10 VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 4416(CX), Y10 VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 4480(CX), Y10 VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 4544(CX), Y10 VMOVDQU 4576(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Store 9 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU Y1, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU Y2, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU Y3, (R14)(R13*1) MOVQ 96(R12), 
R14 VMOVDQU Y4, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU Y5, (R14)(R13*1) MOVQ 144(R12), R14 VMOVDQU Y6, (R14)(R13*1) MOVQ 168(R12), R14 VMOVDQU Y7, (R14)(R13*1) MOVQ 192(R12), R14 VMOVDQU Y8, (R14)(R13*1) // Prepare for next loop ADDQ $0x20, R13 DECQ AX JNZ mulAvxTwo_8x9_loop VZEROUPPER mulAvxTwo_8x9_end: RET // func mulAvxTwo_8x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x10(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 175 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_8x10_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), DX MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, DX MOVQ $0x0000000f, R14 MOVQ R14, X10 VPBROADCASTB X10, Y10 mulAvxTwo_8x10_loop: // Clear 10 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 VPXOR Y7, Y7, Y7 VPXOR Y8, Y8, Y8 VPXOR Y9, Y9, Y9 // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 640(CX), Y11 VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 
VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1280(CX), Y11 VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1920(CX), Y11 VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 2560(CX), Y11 VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB 
Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 ADDQ $0x20, R10 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 3200(CX), Y11 VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y13 ADDQ $0x20, R11 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 3840(CX), Y11 VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 
VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 4480(CX), Y11 VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 4544(CX), Y11 VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 4608(CX), Y11 VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 4672(CX), Y11 VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 4736(CX), Y11 VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 4800(CX), Y11 VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 4864(CX), Y11 VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 4928(CX), Y11 VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 4992(CX), Y11 VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 5056(CX), Y11 VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Store 10 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) MOVQ 24(R12), R14 VMOVDQU Y1, (R14)(R13*1) MOVQ 48(R12), R14 VMOVDQU Y2, (R14)(R13*1) MOVQ 72(R12), R14 VMOVDQU Y3, (R14)(R13*1) MOVQ 96(R12), R14 VMOVDQU Y4, (R14)(R13*1) MOVQ 120(R12), R14 VMOVDQU Y5, (R14)(R13*1) MOVQ 144(R12), R14 VMOVDQU Y6, (R14)(R13*1) MOVQ 168(R12), R14 VMOVDQU Y7, (R14)(R13*1) MOVQ 192(R12), R14 VMOVDQU Y8, (R14)(R13*1) MOVQ 216(R12), R14 VMOVDQU Y9, (R14)(R13*1) // Prepare for next loop ADDQ $0x20, R13 DECQ AX JNZ mulAvxTwo_8x10_loop VZEROUPPER mulAvxTwo_8x10_end: RET // func mulAvxTwo_9x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x1(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 22 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_9x1_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ (R13), R13 MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R13 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX MOVQ $0x0000000f, R14 MOVQ R14, X1 VPBROADCASTB X1, Y1 mulAvxTwo_9x1_loop: // Clear 1 outputs VPXOR Y0, Y0, Y0 // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (BX), Y4 ADDQ $0x20, BX VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU (CX), Y2 VMOVDQU 32(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR 
Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (SI), Y4 ADDQ $0x20, SI VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 64(CX), Y2 VMOVDQU 96(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (DI), Y4 ADDQ $0x20, DI VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 128(CX), Y2 VMOVDQU 160(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (R8), Y4 ADDQ $0x20, R8 VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 192(CX), Y2 VMOVDQU 224(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R9), Y4 ADDQ $0x20, R9 VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 256(CX), Y2 VMOVDQU 288(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R10), Y4 ADDQ $0x20, R10 VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 320(CX), Y2 VMOVDQU 352(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (R11), Y4 ADDQ $0x20, R11 VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 384(CX), Y2 VMOVDQU 416(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 7 to 1 outputs VMOVDQU (R12), Y4 ADDQ $0x20, R12 VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 448(CX), Y2 VMOVDQU 480(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 8 to 1 outputs VMOVDQU (DX), Y4 ADDQ $0x20, DX VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 512(CX), Y2 VMOVDQU 544(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Store 1 outputs VMOVDQU Y0, (R13) ADDQ $0x20, R13 // Prepare for next loop DECQ AX JNZ mulAvxTwo_9x1_loop VZEROUPPER mulAvxTwo_9x1_end: RET // func mulAvxTwo_9x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x1_64(SB), $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 22 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_9x1_64_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), R10 MOVQ 168(AX), R11 MOVQ 192(AX), AX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, DX ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, AX MOVQ $0x0000000f, R14 MOVQ R14, X2 VPBROADCASTB X2, Y2 MOVQ n+80(FP), R14 SHRQ $0x06, R14 mulAvxTwo_9x1_64_loop: // Clear 1 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 1 
to 1 outputs VMOVDQU (BX), Y6 VMOVDQU 32(BX), Y5 ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 ADDQ $0x40, DI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 ADDQ $0x40, R8 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 256(CX), Y3 VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 ADDQ $0x40, R9 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (R10), Y6 VMOVDQU 32(R10), Y5 ADDQ $0x40, R10 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 384(CX), Y3 VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU (R11), Y6 VMOVDQU 32(R11), Y5 ADDQ $0x40, R11 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 8 to 1 outputs VMOVDQU (AX), Y6 VMOVDQU 32(AX), Y5 ADDQ $0x40, AX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 512(CX), Y3 VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Store 1 outputs MOVQ (R12), R15 VMOVDQU Y0, (R15)(R13*1) VMOVDQU Y1, 32(R15)(R13*1) // Prepare for next loop ADDQ $0x40, R13 DECQ R14 JNZ mulAvxTwo_9x1_64_loop VZEROUPPER mulAvxTwo_9x1_64_end: RET // func mulAvxTwo_9x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x2(SB), NOSPLIT, $0-88 // Loading no tables to registers // 
Destination kept in GP registers // Full registers estimated 43 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_9x2_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R13 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R14 ADDQ R15, R13 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, DX MOVQ $0x0000000f, R15 MOVQ R15, X2 VPBROADCASTB X2, Y2 mulAvxTwo_9x2_loop: // Clear 2 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (BX), Y5 ADDQ $0x20, BX VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 ADDQ $0x20, SI VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 256(CX), Y3 VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 ADDQ $0x20, R8 VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 384(CX), Y3 VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 ADDQ $0x20, R9 VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 512(CX), Y3 VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y5 ADDQ $0x20, R10 VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 640(CX), Y3 VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (R11), Y5 ADDQ $0x20, R11 VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 768(CX), Y3 VMOVDQU 800(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 832(CX), Y3 VMOVDQU 864(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 7 to 2 outputs VMOVDQU (R12), Y5 ADDQ $0x20, R12 VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 896(CX), Y3 VMOVDQU 928(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, 
Y3 VPXOR Y3, Y0, Y0 VMOVDQU 960(CX), Y3 VMOVDQU 992(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 8 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 1024(CX), Y3 VMOVDQU 1056(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 1088(CX), Y3 VMOVDQU 1120(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Store 2 outputs VMOVDQU Y0, (R14) ADDQ $0x20, R14 VMOVDQU Y1, (R13) ADDQ $0x20, R13 // Prepare for next loop DECQ AX JNZ mulAvxTwo_9x2_loop VZEROUPPER mulAvxTwo_9x2_end: RET // func mulAvxTwo_9x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x2_64(SB), $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 43 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_9x2_64_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), R10 MOVQ 168(AX), R11 MOVQ 192(AX), AX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, DX ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, AX MOVQ $0x0000000f, R14 MOVQ R14, X4 VPBROADCASTB X4, Y4 MOVQ n+80(FP), R14 SHRQ $0x06, R14 mulAvxTwo_9x2_64_loop: // Clear 2 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (BX), Y9 VMOVDQU 32(BX), Y11 ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 ADDQ $0x40, DI VPSRLQ $0x04, Y9, Y10 VPSRLQ 
$0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 ADDQ $0x40, R8 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 ADDQ $0x40, R9 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (R10), Y9 VMOVDQU 32(R10), Y11 ADDQ $0x40, R10 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 768(CX), Y5 VMOVDQU 800(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU (R11), Y9 VMOVDQU 32(R11), Y11 ADDQ $0x40, R11 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 8 to 2 outputs VMOVDQU (AX), Y9 VMOVDQU 32(AX), Y11 ADDQ $0x40, AX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 1024(CX), Y5 VMOVDQU 1056(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Store 2 outputs MOVQ (R12), R15 VMOVDQU Y0, (R15)(R13*1) VMOVDQU Y1, 
32(R15)(R13*1) MOVQ 24(R12), R15 VMOVDQU Y2, (R15)(R13*1) VMOVDQU Y3, 32(R15)(R13*1) // Prepare for next loop ADDQ $0x40, R13 DECQ R14 JNZ mulAvxTwo_9x2_64_loop VZEROUPPER mulAvxTwo_9x2_64_end: RET // func mulAvxTwo_9x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x3(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 62 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_9x3_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R15 MOVQ 48(R13), R13 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R13 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, DX MOVQ $0x0000000f, BP MOVQ BP, X3 VPBROADCASTB X3, Y3 mulAvxTwo_9x3_loop: // Clear 3 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (BX), Y6 ADDQ $0x20, BX VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU (CX), Y4 VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 ADDQ $0x20, SI VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 192(CX), Y4 VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 384(CX), Y4 VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 576(CX), Y4 VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y6 ADDQ $0x20, R9 VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 768(CX), Y4 VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, 
Y2, Y2 // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R10), Y6 ADDQ $0x20, R10 VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 960(CX), Y4 VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (R11), Y6 ADDQ $0x20, R11 VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 1152(CX), Y4 VMOVDQU 1184(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 1216(CX), Y4 VMOVDQU 1248(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 1280(CX), Y4 VMOVDQU 1312(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 7 to 3 outputs VMOVDQU (R12), Y6 ADDQ $0x20, R12 VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 1344(CX), Y4 VMOVDQU 1376(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 1408(CX), Y4 VMOVDQU 1440(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 1472(CX), Y4 VMOVDQU 1504(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 8 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 1536(CX), Y4 VMOVDQU 1568(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 1600(CX), Y4 VMOVDQU 1632(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 1664(CX), Y4 VMOVDQU 1696(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Store 3 outputs VMOVDQU Y0, (R14) ADDQ $0x20, R14 VMOVDQU Y1, (R15) ADDQ $0x20, R15 VMOVDQU Y2, (R13) ADDQ $0x20, R13 // Prepare for next loop DECQ AX JNZ mulAvxTwo_9x3_loop VZEROUPPER mulAvxTwo_9x3_end: RET // func mulAvxTwo_9x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x3_64(SB), $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 62 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_9x3_64_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), R10 MOVQ 168(AX), R11 MOVQ 192(AX), AX MOVQ out_base+48(FP), R12 MOVQ out_base+48(FP), R12 MOVQ start+72(FP), R13 // Add start offset to input ADDQ R13, DX ADDQ R13, BX ADDQ R13, SI ADDQ R13, DI ADDQ R13, R8 ADDQ R13, R9 ADDQ R13, R10 ADDQ R13, R11 ADDQ R13, AX MOVQ $0x0000000f, R14 MOVQ R14, X6 VPBROADCASTB X6, Y6 MOVQ n+80(FP), R14 SHRQ $0x06, R14 mulAvxTwo_9x3_64_loop: // Clear 3 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 
VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (BX), Y11 VMOVDQU 32(BX), Y13 ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 ADDQ $0x40, DI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 ADDQ $0x40, R8 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, 
Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 ADDQ $0x40, R9 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (R10), Y11 VMOVDQU 32(R10), Y13 ADDQ $0x40, R10 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU (R11), Y11 VMOVDQU 32(R11), Y13 ADDQ $0x40, R11 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 8 to 3 outputs VMOVDQU (AX), Y11 VMOVDQU 32(AX), Y13 ADDQ $0x40, AX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 1536(CX), Y7 VMOVDQU 1568(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Store 3 outputs MOVQ (R12), R15 VMOVDQU Y0, (R15)(R13*1) VMOVDQU Y1, 32(R15)(R13*1) MOVQ 24(R12), R15 VMOVDQU Y2, (R15)(R13*1) VMOVDQU Y3, 32(R15)(R13*1) MOVQ 48(R12), R15 VMOVDQU Y4, (R15)(R13*1) VMOVDQU Y5, 32(R15)(R13*1) // Prepare for next loop ADDQ $0x40, R13 DECQ R14 JNZ mulAvxTwo_9x3_64_loop 
VZEROUPPER mulAvxTwo_9x3_64_end: RET // func mulAvxTwo_9x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x4(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 81 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_9x4_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), R10 MOVQ 168(AX), R11 MOVQ 192(AX), AX MOVQ out_base+48(FP), R12 MOVQ (R12), R13 MOVQ 24(R12), R14 MOVQ 48(R12), R15 MOVQ 72(R12), R12 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R13 ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R12 // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, AX MOVQ $0x0000000f, BP MOVQ BP, X4 VPBROADCASTB X4, Y4 MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxTwo_9x4_loop: // Clear 4 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 768(CX), Y5 VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 VPSRLQ $0x04, Y7, 
Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1024(CX), Y5 VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1280(CX), Y5 VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R10), Y7 ADDQ $0x20, R10 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1536(CX), Y5 VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (R11), Y7 ADDQ $0x20, R11 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1792(CX), Y5 VMOVDQU 1824(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 1856(CX), Y5 VMOVDQU 1888(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 1920(CX), Y5 VMOVDQU 1952(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 1984(CX), Y5 VMOVDQU 2016(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 8 to 4 outputs VMOVDQU (AX), Y7 ADDQ $0x20, AX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 2048(CX), Y5 VMOVDQU 2080(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 2112(CX), Y5 VMOVDQU 2144(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 2176(CX), Y5 VMOVDQU 2208(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 2240(CX), Y5 VMOVDQU 2272(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Store 4 outputs VMOVDQU Y0, (R13) ADDQ $0x20, R13 VMOVDQU Y1, (R14) ADDQ $0x20, R14 VMOVDQU Y2, (R15) ADDQ $0x20, R15 VMOVDQU Y3, (R12) ADDQ $0x20, R12 // Prepare for next loop DECQ BP JNZ mulAvxTwo_9x4_loop VZEROUPPER mulAvxTwo_9x4_end: RET // func mulAvxTwo_9x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x5(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 100 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_9x5_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI 
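// The MOVQ chain here walks the in [][]byte header array: on amd64 a
// slice header is 24 bytes (data pointer, len, cap), so the data
// pointer of in[k] is loaded from byte offset 24*k off in_base. The
// lengths are never consulted; the caller's start and n arguments
// define the range, and start is added to every input pointer before
// the loop begins. Roughly, in Go terms: ptr[k] = &in[k][start].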
MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX MOVQ $0x0000000f, R15 MOVQ R15, X5 VPBROADCASTB X5, Y5 mulAvxTwo_9x5_loop: // Clear 5 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU (CX), Y6 VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 320(CX), Y6 VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 640(CX), Y6 VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 960(CX), Y6 VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1280(CX), Y6 VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 
1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y8 ADDQ $0x20, R10 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1600(CX), Y6 VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R11), Y8 ADDQ $0x20, R11 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1920(CX), Y6 VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (R12), Y8 ADDQ $0x20, R12 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 2240(CX), Y6 VMOVDQU 2272(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 2304(CX), Y6 VMOVDQU 2336(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 2368(CX), Y6 VMOVDQU 2400(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 2432(CX), Y6 VMOVDQU 2464(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 2496(CX), Y6 VMOVDQU 2528(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 8 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 2560(CX), Y6 VMOVDQU 2592(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 2624(CX), Y6 VMOVDQU 2656(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 2688(CX), Y6 VMOVDQU 2720(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 2752(CX), Y6 VMOVDQU 2784(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 2816(CX), Y6 VMOVDQU 2848(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Store 5 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU Y1, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU Y2, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU Y3, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU Y4, (R15)(R14*1) // Prepare for next 
loop ADDQ $0x20, R14 DECQ AX JNZ mulAvxTwo_9x5_loop VZEROUPPER mulAvxTwo_9x5_end: RET // func mulAvxTwo_9x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x6(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 119 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_9x6_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX MOVQ $0x0000000f, R15 MOVQ R15, X6 VPBROADCASTB X6, Y6 mulAvxTwo_9x6_loop: // Clear 6 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, 
R8 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1536(CX), Y7 VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y9 ADDQ $0x20, R10 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1920(CX), Y7 VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y9 ADDQ $0x20, R11 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 2304(CX), Y7 VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (R12), Y9 ADDQ $0x20, R12 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 2688(CX), Y7 VMOVDQU 2720(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 2752(CX), Y7 VMOVDQU 
2784(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 2816(CX), Y7 VMOVDQU 2848(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 2880(CX), Y7 VMOVDQU 2912(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 2944(CX), Y7 VMOVDQU 2976(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 3008(CX), Y7 VMOVDQU 3040(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 8 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 3072(CX), Y7 VMOVDQU 3104(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 3136(CX), Y7 VMOVDQU 3168(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 3200(CX), Y7 VMOVDQU 3232(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 3264(CX), Y7 VMOVDQU 3296(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 3328(CX), Y7 VMOVDQU 3360(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 3392(CX), Y7 VMOVDQU 3424(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Store 6 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU Y1, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU Y2, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU Y3, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU Y4, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU Y5, (R15)(R14*1) // Prepare for next loop ADDQ $0x20, R14 DECQ AX JNZ mulAvxTwo_9x6_loop VZEROUPPER mulAvxTwo_9x6_end: RET // func mulAvxTwo_9x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x7(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 138 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_9x7_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX MOVQ $0x0000000f, R15 MOVQ R15, X7 VPBROADCASTB X7, Y7 mulAvxTwo_9x7_loop: // Clear 7 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 
416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 448(CX), Y8 VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 896(CX), Y8 VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1344(CX), Y8 VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1792(CX), Y8 VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR 
Y8, Y3, Y3 VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 2240(CX), Y8 VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y10 ADDQ $0x20, R11 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 2688(CX), Y8 VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (R12), Y10 ADDQ $0x20, R12 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 3136(CX), Y8 VMOVDQU 3168(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 3200(CX), Y8 VMOVDQU 3232(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 3264(CX), Y8 VMOVDQU 3296(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 3328(CX), Y8 VMOVDQU 3360(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 3392(CX), Y8 VMOVDQU 3424(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 3456(CX), Y8 VMOVDQU 3488(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 3520(CX), Y8 VMOVDQU 3552(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 8 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 3584(CX), Y8 VMOVDQU 3616(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 3648(CX), Y8 VMOVDQU 3680(CX), Y9 
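// Table addressing: the flattened matrix at CX holds 64 bytes per
// (input, output) coefficient, 32 bytes of low-nibble products
// followed by 32 bytes of high-nibble products:
//
//	off := 64 * (input*outputs + output) // low table at off, high at off+32
//
// For this 9x7 kernel, input 8 therefore starts at 64*8*7 = 3584(CX),
// which is exactly where the loads above resume.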
VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 3712(CX), Y8 VMOVDQU 3744(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 3776(CX), Y8 VMOVDQU 3808(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 3840(CX), Y8 VMOVDQU 3872(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 3904(CX), Y8 VMOVDQU 3936(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 3968(CX), Y8 VMOVDQU 4000(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Store 7 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU Y1, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU Y2, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU Y3, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU Y4, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU Y5, (R15)(R14*1) MOVQ 144(R13), R15 VMOVDQU Y6, (R15)(R14*1) // Prepare for next loop ADDQ $0x20, R14 DECQ AX JNZ mulAvxTwo_9x7_loop VZEROUPPER mulAvxTwo_9x7_end: RET // func mulAvxTwo_9x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x8(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 157 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_9x8_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX MOVQ $0x0000000f, R15 MOVQ R15, X8 VPBROADCASTB X8, Y8 mulAvxTwo_9x8_loop: // Clear 8 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 VPXOR Y7, Y7, Y7 // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 512(CX), Y9 VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB 
Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1024(CX), Y9 VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1536(CX), Y9 VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2048(CX), Y9 VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 
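// Register budget in this 9x8 kernel: Y0-Y7 accumulate the eight
// outputs, Y8 holds the broadcast 0x0f mask, Y9/Y10 receive the table
// halves and Y11/Y12 the input nibbles, for 13 of the 16 architectural
// YMM registers. The "157 YMM used" estimate in the header appears to
// count the code generator's virtual registers before allocation, not
// architectural ones.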
VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 ADDQ $0x20, R10 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2560(CX), Y9 VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y11 ADDQ $0x20, R11 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 3072(CX), Y9 VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (R12), Y11 ADDQ $0x20, R12 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 3584(CX), Y9 VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 3648(CX), Y9 VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 3712(CX), Y9 VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 3776(CX), Y9 VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 3840(CX), Y9 VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 3904(CX), Y9 VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 3968(CX), Y9 
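// With nine input pointers pinned in GP registers there is no room
// left for eight output pointers, hence "Destination kept on stack":
// at store time the kernel reloads each destination from the out
// [][]byte header array (MOVQ (R13), R15) and indexes it with the
// running offset in R14, rather than advancing a per-output pointer
// the way the smaller kernels such as mulAvxTwo_9x4 do.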
VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 4032(CX), Y9 VMOVDQU 4064(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 8 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 4096(CX), Y9 VMOVDQU 4128(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 4160(CX), Y9 VMOVDQU 4192(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 4224(CX), Y9 VMOVDQU 4256(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 4288(CX), Y9 VMOVDQU 4320(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 4352(CX), Y9 VMOVDQU 4384(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 4416(CX), Y9 VMOVDQU 4448(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 4480(CX), Y9 VMOVDQU 4512(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 4544(CX), Y9 VMOVDQU 4576(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Store 8 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU Y1, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU Y2, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU Y3, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU Y4, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU Y5, (R15)(R14*1) MOVQ 144(R13), R15 VMOVDQU Y6, (R15)(R14*1) MOVQ 168(R13), R15 VMOVDQU Y7, (R15)(R14*1) // Prepare for next loop ADDQ $0x20, R14 DECQ AX JNZ mulAvxTwo_9x8_loop VZEROUPPER mulAvxTwo_9x8_end: RET // func mulAvxTwo_9x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x9(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 176 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_9x9_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX MOVQ $0x0000000f, R15 MOVQ R15, X9 VPBROADCASTB X9, Y9 mulAvxTwo_9x9_loop: // Clear 9 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 VPXOR Y7, Y7, Y7 VPXOR Y8, Y8, Y8 // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 320(CX), Y10 
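// The trip count is n>>5 (the SHRQ $0x05 in the prologue): every pass
// of the loop consumes exactly 32 bytes from each input and writes 32
// bytes to each output. A tail shorter than 32 bytes is presumably
// left to the generic non-assembly path, since the kernel never
// executes a partial block.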
VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 576(CX), Y10 VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1152(CX), Y10 VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1728(CX), Y10 VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 
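// Taken as a whole, each pass of this loop is one 9-input, 9-output
// matrix-vector product over GF(2^8) applied to 32 byte-columns. A
// hedged scalar sketch for a single byte offset j (galMul is a
// hypothetical scalar GF(2^8) multiply, coef the 9x9 coefficient
// block):
//
//	for o := range out {
//		var acc byte
//		for i := range in {
//			acc ^= galMul(coef[i][o], in[i][j])
//		}
//		out[o][j] = acc // accumulators are cleared, then stored
//	}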
VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2304(CX), Y10 VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 ADDQ $0x20, R10 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2880(CX), Y10 VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y12 ADDQ $0x20, R11 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 3456(CX), Y10 VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, 
Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (R12), Y12 ADDQ $0x20, R12 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 4032(CX), Y10 VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 4096(CX), Y10 VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 4160(CX), Y10 VMOVDQU 4192(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 4224(CX), Y10 VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 4288(CX), Y10 VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 4352(CX), Y10 VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 4416(CX), Y10 VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 4480(CX), Y10 VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 4544(CX), Y10 VMOVDQU 4576(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 8 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 4608(CX), Y10 VMOVDQU 4640(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 4672(CX), Y10 VMOVDQU 4704(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 4736(CX), Y10 VMOVDQU 4768(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 4800(CX), Y10 VMOVDQU 4832(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 4864(CX), Y10 VMOVDQU 4896(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 4928(CX), Y10 VMOVDQU 4960(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 4992(CX), Y10 VMOVDQU 5024(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 5056(CX), Y10 VMOVDQU 5088(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 5120(CX), Y10 VMOVDQU 5152(CX), Y11 VPSHUFB Y12, 
Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Store 9 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU Y1, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU Y2, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU Y3, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU Y4, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU Y5, (R15)(R14*1) MOVQ 144(R13), R15 VMOVDQU Y6, (R15)(R14*1) MOVQ 168(R13), R15 VMOVDQU Y7, (R15)(R14*1) MOVQ 192(R13), R15 VMOVDQU Y8, (R15)(R14*1) // Prepare for next loop ADDQ $0x20, R14 DECQ AX JNZ mulAvxTwo_9x9_loop VZEROUPPER mulAvxTwo_9x9_end: RET // func mulAvxTwo_9x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x10(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 195 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_9x10_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), DX MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, DX MOVQ $0x0000000f, R15 MOVQ R15, X10 VPBROADCASTB X10, Y10 mulAvxTwo_9x10_loop: // Clear 10 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 VPXOR Y7, Y7, Y7 VPXOR Y8, Y8, Y8 VPXOR Y9, Y9, Y9 // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 640(CX), Y11 VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 
VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1280(CX), Y11 VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1920(CX), Y11 VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 
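// Note: every "Load and process 32 bytes from input i to N outputs" block
// opens the same way: VMOVDQU pulls 32 input bytes, the input pointer is
// advanced by 0x20, VPSRLQ $0x04 produces the high nibbles, and two VPANDs
// against the broadcast 0x0f mask isolate the low and high 4-bit indexes
// that drive the VPSHUFB table lookups that follow.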
VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 2560(CX), Y11 VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 ADDQ $0x20, R10 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 3200(CX), Y11 VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y13 ADDQ $0x20, R11 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 3840(CX), Y11 VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR 
Y11, Y3, Y3 VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (R12), Y13 ADDQ $0x20, R12 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 4480(CX), Y11 VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 4544(CX), Y11 VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 4608(CX), Y11 VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 4672(CX), Y11 VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 4736(CX), Y11 VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 4800(CX), Y11 VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 4864(CX), Y11 VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 4928(CX), Y11 VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 4992(CX), Y11 VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 5056(CX), Y11 VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 8 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 5120(CX), Y11 VMOVDQU 5152(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 5184(CX), Y11 VMOVDQU 5216(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 5248(CX), Y11 VMOVDQU 5280(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 5312(CX), Y11 VMOVDQU 5344(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 5376(CX), Y11 VMOVDQU 5408(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 5440(CX), Y11 VMOVDQU 5472(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 5504(CX), Y11 VMOVDQU 5536(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 5568(CX), Y11 VMOVDQU 5600(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 5632(CX), Y11 VMOVDQU 5664(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 5696(CX), Y11 VMOVDQU 5728(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, 
Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Store 10 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) MOVQ 24(R13), R15 VMOVDQU Y1, (R15)(R14*1) MOVQ 48(R13), R15 VMOVDQU Y2, (R15)(R14*1) MOVQ 72(R13), R15 VMOVDQU Y3, (R15)(R14*1) MOVQ 96(R13), R15 VMOVDQU Y4, (R15)(R14*1) MOVQ 120(R13), R15 VMOVDQU Y5, (R15)(R14*1) MOVQ 144(R13), R15 VMOVDQU Y6, (R15)(R14*1) MOVQ 168(R13), R15 VMOVDQU Y7, (R15)(R14*1) MOVQ 192(R13), R15 VMOVDQU Y8, (R15)(R14*1) MOVQ 216(R13), R15 VMOVDQU Y9, (R15)(R14*1) // Prepare for next loop ADDQ $0x20, R14 DECQ AX JNZ mulAvxTwo_9x10_loop VZEROUPPER mulAvxTwo_9x10_end: RET // func mulAvxTwo_10x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x1(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 24 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_10x1_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ (R14), R14 MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R14 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX MOVQ $0x0000000f, R15 MOVQ R15, X1 VPBROADCASTB X1, Y1 mulAvxTwo_10x1_loop: // Clear 1 outputs VPXOR Y0, Y0, Y0 // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (BX), Y4 ADDQ $0x20, BX VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU (CX), Y2 VMOVDQU 32(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (SI), Y4 ADDQ $0x20, SI VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 64(CX), Y2 VMOVDQU 96(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (DI), Y4 ADDQ $0x20, DI VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 128(CX), Y2 VMOVDQU 160(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (R8), Y4 ADDQ $0x20, R8 VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 192(CX), Y2 VMOVDQU 224(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R9), Y4 ADDQ $0x20, R9 VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 256(CX), Y2 VMOVDQU 288(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R10), Y4 ADDQ $0x20, R10 VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 320(CX), Y2 VMOVDQU 352(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (R11), Y4 ADDQ $0x20, R11 VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 384(CX), Y2 VMOVDQU 416(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 7 to 1 outputs VMOVDQU (R12), Y4 ADDQ $0x20, R12 VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 448(CX), Y2 VMOVDQU 480(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 
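// Note: mulAvxTwo_10x1 is marked "Destination kept in GP registers": with a
// single output shard, the accumulator (Y0) and the destination pointer
// (R14) stay resident and the store is a direct VMOVDQU plus ADDQ. The wide
// kernels above (9x9, 9x10) are instead "Destination kept on stack",
// reloading each output pointer from the out slice on every store,
// presumably because 9 or 10 outputs exhaust the general purpose registers.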
VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 8 to 1 outputs VMOVDQU (R13), Y4 ADDQ $0x20, R13 VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 512(CX), Y2 VMOVDQU 544(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 9 to 1 outputs VMOVDQU (DX), Y4 ADDQ $0x20, DX VPSRLQ $0x04, Y4, Y5 VPAND Y1, Y4, Y4 VPAND Y1, Y5, Y5 VMOVDQU 576(CX), Y2 VMOVDQU 608(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 VPXOR Y2, Y3, Y2 VPXOR Y2, Y0, Y0 // Store 1 outputs VMOVDQU Y0, (R14) ADDQ $0x20, R14 // Prepare for next loop DECQ AX JNZ mulAvxTwo_10x1_loop VZEROUPPER mulAvxTwo_10x1_end: RET // func mulAvxTwo_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x1_64(SB), $8-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 24 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_10x1_64_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), R10 MOVQ 168(AX), R11 MOVQ 192(AX), R12 MOVQ 216(AX), AX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, DX ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, AX MOVQ $0x0000000f, R15 MOVQ R15, X2 VPBROADCASTB X2, Y2 MOVQ n+80(FP), R15 SHRQ $0x06, R15 mulAvxTwo_10x1_64_loop: // Clear 1 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (BX), Y6 VMOVDQU 32(BX), Y5 ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 ADDQ $0x40, DI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 ADDQ $0x40, R8 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 256(CX), Y3 VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, 
Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 ADDQ $0x40, R9 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (R10), Y6 VMOVDQU 32(R10), Y5 ADDQ $0x40, R10 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 384(CX), Y3 VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU (R11), Y6 VMOVDQU 32(R11), Y5 ADDQ $0x40, R11 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 8 to 1 outputs VMOVDQU (R12), Y6 VMOVDQU 32(R12), Y5 ADDQ $0x40, R12 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 512(CX), Y3 VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 9 to 1 outputs VMOVDQU (AX), Y6 VMOVDQU 32(AX), Y5 ADDQ $0x40, AX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y7, Y7 VPAND Y2, Y8, Y8 VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y5 VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y5, Y6, Y5 VPXOR Y3, Y0, Y0 VPXOR Y5, Y1, Y1 // Store 1 outputs MOVQ (R13), BP VMOVDQU Y0, (BP)(R14*1) VMOVDQU Y1, 32(BP)(R14*1) // Prepare for next loop ADDQ $0x40, R14 DECQ R15 JNZ mulAvxTwo_10x1_64_loop VZEROUPPER mulAvxTwo_10x1_64_end: RET // func mulAvxTwo_10x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x2(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 47 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_10x2_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ (R14), R15 MOVQ 24(R14), R14 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R15 ADDQ BP, R14 // Add start offset to input ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, R13 ADDQ BP, DX MOVQ $0x0000000f, BP MOVQ BP, X2 VPBROADCASTB X2, Y2 mulAvxTwo_10x2_loop: // Clear 2 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (BX), Y5 ADDQ $0x20, BX VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU (CX), Y3 VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, 
Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 ADDQ $0x20, SI VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 128(CX), Y3 VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 256(CX), Y3 VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 ADDQ $0x20, R8 VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 384(CX), Y3 VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 ADDQ $0x20, R9 VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 512(CX), Y3 VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y5 ADDQ $0x20, R10 VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 640(CX), Y3 VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (R11), Y5 ADDQ $0x20, R11 VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 768(CX), Y3 VMOVDQU 800(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 832(CX), Y3 VMOVDQU 864(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 7 to 2 outputs VMOVDQU (R12), Y5 ADDQ $0x20, R12 VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 896(CX), Y3 VMOVDQU 928(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 960(CX), Y3 VMOVDQU 992(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 8 to 2 outputs VMOVDQU (R13), Y5 ADDQ $0x20, R13 VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 1024(CX), Y3 VMOVDQU 1056(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 1088(CX), Y3 VMOVDQU 1120(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 9 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX VPSRLQ $0x04, Y5, Y6 VPAND Y2, Y5, Y5 VPAND Y2, Y6, Y6 VMOVDQU 1152(CX), Y3 VMOVDQU 1184(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y0, Y0 VMOVDQU 1216(CX), Y3 VMOVDQU 1248(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 VPXOR Y3, Y4, Y3 VPXOR Y3, Y1, Y1 // Store 2 outputs VMOVDQU Y0, (R15) ADDQ $0x20, R15 VMOVDQU Y1, (R14) ADDQ $0x20, R14 // Prepare for next loop DECQ AX JNZ mulAvxTwo_10x2_loop VZEROUPPER mulAvxTwo_10x2_end: RET // func 
mulAvxTwo_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x2_64(SB), $8-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 47 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_10x2_64_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), R10 MOVQ 168(AX), R11 MOVQ 192(AX), R12 MOVQ 216(AX), AX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, DX ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, AX MOVQ $0x0000000f, R15 MOVQ R15, X4 VPBROADCASTB X4, Y4 MOVQ n+80(FP), R15 SHRQ $0x06, R15 mulAvxTwo_10x2_64_loop: // Clear 2 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 // Load and process 64 bytes from input 0 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (BX), Y9 VMOVDQU 32(BX), Y11 ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 ADDQ $0x40, DI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 ADDQ $0x40, R8 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 
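// Note: the _64 variants unroll the same kernel over 64 bytes per
// iteration. Two 32-byte halves share each table load (Y5/Y6 in this
// function), every output keeps a pair of accumulators (Y0/Y1 for output 0,
// Y2/Y3 for output 1), and the loop counter is n>>6 (SHRQ $0x06) instead of
// the n>>5 used by the 32-byte kernels.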
VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 ADDQ $0x40, R9 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (R10), Y9 VMOVDQU 32(R10), Y11 ADDQ $0x40, R10 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 768(CX), Y5 VMOVDQU 800(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU (R11), Y9 VMOVDQU 32(R11), Y11 ADDQ $0x40, R11 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 8 to 2 outputs VMOVDQU (R12), Y9 VMOVDQU 32(R12), Y11 ADDQ $0x40, R12 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 1024(CX), Y5 VMOVDQU 1056(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 9 to 2 outputs VMOVDQU (AX), Y9 VMOVDQU 32(AX), Y11 ADDQ $0x40, AX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 VPAND Y4, Y11, Y11 VPAND Y4, Y10, Y10 VPAND Y4, Y12, Y12 VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y0, Y0 VPXOR Y7, Y1, Y1 VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y7, Y8, Y7 VPXOR Y5, Y2, Y2 VPXOR Y7, Y3, Y3 // Store 2 outputs MOVQ (R13), BP VMOVDQU Y0, (BP)(R14*1) VMOVDQU Y1, 32(BP)(R14*1) MOVQ 24(R13), BP VMOVDQU Y2, (BP)(R14*1) 
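// Note: with the destination kept on stack, each store first executes
// MOVQ i*24(R13) to fetch output shard i's data pointer from the out
// [][]byte header array (24 bytes per Go slice header on amd64), then
// VMOVDQU writes the two 32-byte halves at the current offset held in R14.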
VMOVDQU Y3, 32(BP)(R14*1) // Prepare for next loop ADDQ $0x40, R14 DECQ R15 JNZ mulAvxTwo_10x2_64_loop VZEROUPPER mulAvxTwo_10x2_64_end: RET // func mulAvxTwo_10x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x3(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers // Full registers estimated 68 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_10x3_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), R10 MOVQ 168(AX), R11 MOVQ 192(AX), R12 MOVQ 216(AX), AX MOVQ out_base+48(FP), R13 MOVQ (R13), R14 MOVQ 24(R13), R15 MOVQ 48(R13), R13 MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R14 ADDQ BP, R15 ADDQ BP, R13 // Add start offset to input ADDQ BP, DX ADDQ BP, BX ADDQ BP, SI ADDQ BP, DI ADDQ BP, R8 ADDQ BP, R9 ADDQ BP, R10 ADDQ BP, R11 ADDQ BP, R12 ADDQ BP, AX MOVQ $0x0000000f, BP MOVQ BP, X3 VPBROADCASTB X3, Y3 MOVQ n+80(FP), BP SHRQ $0x05, BP mulAvxTwo_10x3_loop: // Clear 3 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU (CX), Y4 VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (BX), Y6 ADDQ $0x20, BX VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 192(CX), Y4 VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (SI), Y6 ADDQ $0x20, SI VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 384(CX), Y4 VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 576(CX), Y4 VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 768(CX), Y4 VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 
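// Note: as a reference, a minimal pure-Go sketch of what one mulAvxTwo_RxC
// kernel computes per byte position (identifiers here are illustrative
// only, not this package's API):
//
//	for o := 0; o < C; o++ {
//		var acc byte
//		for i := 0; i < R; i++ {
//			v := in[i][pos]
//			acc ^= lowTbl[o*R+i][v&0xf] ^ highTbl[o*R+i][v>>4]
//		}
//		out[o][pos] = acc
//	}
//
// where lowTbl/highTbl hold the 16 nibble products of each matrix
// coefficient, exploiting that c*v = c*(v&0xf) ^ c*(v&0xf0) in GF(2^8).
// The assembly evaluates 32 (or 64, for the _64 variants) positions of
// this loop per iteration.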
VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R9), Y6 ADDQ $0x20, R9 VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 960(CX), Y4 VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (R10), Y6 ADDQ $0x20, R10 VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 1152(CX), Y4 VMOVDQU 1184(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 1216(CX), Y4 VMOVDQU 1248(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 1280(CX), Y4 VMOVDQU 1312(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 7 to 3 outputs VMOVDQU (R11), Y6 ADDQ $0x20, R11 VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 1344(CX), Y4 VMOVDQU 1376(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 1408(CX), Y4 VMOVDQU 1440(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 1472(CX), Y4 VMOVDQU 1504(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 8 to 3 outputs VMOVDQU (R12), Y6 ADDQ $0x20, R12 VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 1536(CX), Y4 VMOVDQU 1568(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 1600(CX), Y4 VMOVDQU 1632(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 1664(CX), Y4 VMOVDQU 1696(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 9 to 3 outputs VMOVDQU (AX), Y6 ADDQ $0x20, AX VPSRLQ $0x04, Y6, Y7 VPAND Y3, Y6, Y6 VPAND Y3, Y7, Y7 VMOVDQU 1728(CX), Y4 VMOVDQU 1760(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y0, Y0 VMOVDQU 1792(CX), Y4 VMOVDQU 1824(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y1, Y1 VMOVDQU 1856(CX), Y4 VMOVDQU 1888(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 VPXOR Y4, Y5, Y4 VPXOR Y4, Y2, Y2 // Store 3 outputs VMOVDQU Y0, (R14) ADDQ $0x20, R14 VMOVDQU Y1, (R15) ADDQ $0x20, R15 VMOVDQU Y2, (R13) ADDQ $0x20, R13 // Prepare for next loop DECQ BP JNZ mulAvxTwo_10x3_loop VZEROUPPER mulAvxTwo_10x3_end: RET // func mulAvxTwo_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x3_64(SB), $8-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 68 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_10x3_64_end MOVQ in_base+24(FP), AX MOVQ (AX), DX MOVQ 24(AX), BX MOVQ 48(AX), SI MOVQ 72(AX), DI MOVQ 96(AX), R8 MOVQ 120(AX), R9 MOVQ 144(AX), R10 MOVQ 168(AX), R11 MOVQ 192(AX), R12 MOVQ 216(AX), AX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 MOVQ start+72(FP), R14 // Add start offset to input ADDQ R14, DX ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 ADDQ R14, R9 ADDQ R14, R10 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, AX MOVQ $0x0000000f, R15 MOVQ R15, X6 VPBROADCASTB X6, Y6 MOVQ n+80(FP), R15 SHRQ $0x06, R15 mulAvxTwo_10x3_64_loop: // Clear 
3 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (BX), Y11 VMOVDQU 32(BX), Y13 ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 ADDQ $0x40, DI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 ADDQ $0x40, R8 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 768(CX), 
Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 ADDQ $0x40, R9 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (R10), Y11 VMOVDQU 32(R10), Y13 ADDQ $0x40, R10 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU (R11), Y11 VMOVDQU 32(R11), Y13 ADDQ $0x40, R11 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 8 to 3 outputs VMOVDQU (R12), Y11 VMOVDQU 32(R12), Y13 ADDQ $0x40, R12 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 1536(CX), Y7 VMOVDQU 1568(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, 
Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 9 to 3 outputs VMOVDQU (AX), Y11 VMOVDQU 32(AX), Y13 ADDQ $0x40, AX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 VPAND Y6, Y13, Y13 VPAND Y6, Y12, Y12 VPAND Y6, Y14, Y14 VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y0, Y0 VPXOR Y9, Y1, Y1 VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y2, Y2 VPXOR Y9, Y3, Y3 VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y9, Y10, Y9 VPXOR Y7, Y4, Y4 VPXOR Y9, Y5, Y5 // Store 3 outputs MOVQ (R13), BP VMOVDQU Y0, (BP)(R14*1) VMOVDQU Y1, 32(BP)(R14*1) MOVQ 24(R13), BP VMOVDQU Y2, (BP)(R14*1) VMOVDQU Y3, 32(BP)(R14*1) MOVQ 48(R13), BP VMOVDQU Y4, (BP)(R14*1) VMOVDQU Y5, 32(BP)(R14*1) // Prepare for next loop ADDQ $0x40, R14 DECQ R15 JNZ mulAvxTwo_10x3_64_loop VZEROUPPER mulAvxTwo_10x3_64_end: RET // func mulAvxTwo_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x4(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 89 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_10x4_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX MOVQ $0x0000000f, BP MOVQ BP, X4 VPBROADCASTB X4, Y4 mulAvxTwo_10x4_loop: // Clear 4 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 256(CX), Y5 VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI VPSRLQ 
$0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 512(CX), Y5 VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 768(CX), Y5 VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1024(CX), Y5 VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y7 ADDQ $0x20, R10 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1280(CX), Y5 VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R11), Y7 ADDQ $0x20, R11 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1536(CX), Y5 VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (R12), Y7 ADDQ $0x20, R12 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 1792(CX), Y5 VMOVDQU 1824(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 1856(CX), Y5 VMOVDQU 1888(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 1920(CX), Y5 VMOVDQU 1952(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 1984(CX), Y5 VMOVDQU 2016(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 8 to 4 outputs VMOVDQU (R13), 
Y7 ADDQ $0x20, R13 VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 2048(CX), Y5 VMOVDQU 2080(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 2112(CX), Y5 VMOVDQU 2144(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 2176(CX), Y5 VMOVDQU 2208(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 2240(CX), Y5 VMOVDQU 2272(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 9 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 VMOVDQU 2304(CX), Y5 VMOVDQU 2336(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 VMOVDQU 2368(CX), Y5 VMOVDQU 2400(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 VMOVDQU 2432(CX), Y5 VMOVDQU 2464(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 VMOVDQU 2496(CX), Y5 VMOVDQU 2528(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y3, Y3 // Store 4 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y1, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y2, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y3, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxTwo_10x4_loop VZEROUPPER mulAvxTwo_10x4_end: RET // func mulAvxTwo_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x5(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 110 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_10x5_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX MOVQ $0x0000000f, BP MOVQ BP, X5 VPBROADCASTB X5, Y5 mulAvxTwo_10x5_loop: // Clear 5 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU (CX), Y6 VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 320(CX), Y6 VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 512(CX), Y6 VMOVDQU 
544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 640(CX), Y6 VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 960(CX), Y6 VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1280(CX), Y6 VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y8 ADDQ $0x20, R10 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1600(CX), Y6 VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R11), Y8 ADDQ $0x20, R11 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 1920(CX), Y6 VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 2112(CX), Y6 VMOVDQU 
2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (R12), Y8 ADDQ $0x20, R12 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 2240(CX), Y6 VMOVDQU 2272(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 2304(CX), Y6 VMOVDQU 2336(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 2368(CX), Y6 VMOVDQU 2400(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 2432(CX), Y6 VMOVDQU 2464(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 2496(CX), Y6 VMOVDQU 2528(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 8 to 5 outputs VMOVDQU (R13), Y8 ADDQ $0x20, R13 VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 2560(CX), Y6 VMOVDQU 2592(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 2624(CX), Y6 VMOVDQU 2656(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 2688(CX), Y6 VMOVDQU 2720(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 2752(CX), Y6 VMOVDQU 2784(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 2816(CX), Y6 VMOVDQU 2848(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 9 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 VMOVDQU 2880(CX), Y6 VMOVDQU 2912(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 VMOVDQU 2944(CX), Y6 VMOVDQU 2976(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 VMOVDQU 3008(CX), Y6 VMOVDQU 3040(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 VMOVDQU 3072(CX), Y6 VMOVDQU 3104(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 VMOVDQU 3136(CX), Y6 VMOVDQU 3168(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y4, Y4 // Store 5 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y1, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y2, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y3, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y4, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxTwo_10x5_loop VZEROUPPER mulAvxTwo_10x5_end: RET // func mulAvxTwo_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x6(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 131 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_10x6_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX MOVQ $0x0000000f, BP MOVQ BP, X6 VPBROADCASTB X6, Y6 mulAvxTwo_10x6_loop: // Clear 6 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 
VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 384(CX), Y7 VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 768(CX), Y7 VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1152(CX), Y7 VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1536(CX), Y7 VMOVDQU 1568(CX), 
Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y9 ADDQ $0x20, R10 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 1920(CX), Y7 VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y9 ADDQ $0x20, R11 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 2304(CX), Y7 VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (R12), Y9 ADDQ $0x20, R12 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 2688(CX), Y7 VMOVDQU 2720(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 2752(CX), Y7 VMOVDQU 2784(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 2816(CX), Y7 VMOVDQU 2848(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 2880(CX), Y7 VMOVDQU 2912(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 2944(CX), Y7 VMOVDQU 2976(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 3008(CX), Y7 VMOVDQU 3040(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 8 to 6 outputs VMOVDQU (R13), Y9 ADDQ $0x20, R13 VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 3072(CX), Y7 VMOVDQU 3104(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 3136(CX), Y7 VMOVDQU 3168(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 
3200(CX), Y7 VMOVDQU 3232(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 3264(CX), Y7 VMOVDQU 3296(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 3328(CX), Y7 VMOVDQU 3360(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 3392(CX), Y7 VMOVDQU 3424(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 9 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 VMOVDQU 3456(CX), Y7 VMOVDQU 3488(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 VMOVDQU 3520(CX), Y7 VMOVDQU 3552(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 VMOVDQU 3584(CX), Y7 VMOVDQU 3616(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 VMOVDQU 3648(CX), Y7 VMOVDQU 3680(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 VMOVDQU 3712(CX), Y7 VMOVDQU 3744(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 VMOVDQU 3776(CX), Y7 VMOVDQU 3808(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y5, Y5 // Store 6 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y1, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y2, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y3, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y4, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU Y5, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxTwo_10x6_loop VZEROUPPER mulAvxTwo_10x6_end: RET // func mulAvxTwo_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x7(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 152 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_10x7_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX MOVQ $0x0000000f, BP MOVQ BP, X7 VPBROADCASTB X7, Y7 mulAvxTwo_10x7_loop: // Clear 7 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 
VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 448(CX), Y8 VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 896(CX), Y8 VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1344(CX), Y8 VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 1792(CX), Y8 VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, 
Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 2240(CX), Y8 VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y10 ADDQ $0x20, R11 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 2688(CX), Y8 VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (R12), Y10 ADDQ $0x20, R12 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 3136(CX), Y8 VMOVDQU 3168(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 3200(CX), Y8 VMOVDQU 3232(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 3264(CX), Y8 VMOVDQU 3296(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 3328(CX), Y8 VMOVDQU 3360(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 3392(CX), Y8 VMOVDQU 3424(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 3456(CX), Y8 VMOVDQU 3488(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 3520(CX), Y8 VMOVDQU 3552(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 8 to 7 outputs VMOVDQU (R13), Y10 ADDQ $0x20, R13 VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 3584(CX), Y8 VMOVDQU 3616(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 3648(CX), Y8 VMOVDQU 3680(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, 
Y1, Y1 VMOVDQU 3712(CX), Y8 VMOVDQU 3744(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 3776(CX), Y8 VMOVDQU 3808(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 3840(CX), Y8 VMOVDQU 3872(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 3904(CX), Y8 VMOVDQU 3936(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 3968(CX), Y8 VMOVDQU 4000(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 9 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 VMOVDQU 4032(CX), Y8 VMOVDQU 4064(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 VMOVDQU 4096(CX), Y8 VMOVDQU 4128(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 VMOVDQU 4160(CX), Y8 VMOVDQU 4192(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 VMOVDQU 4224(CX), Y8 VMOVDQU 4256(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 VMOVDQU 4288(CX), Y8 VMOVDQU 4320(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 VMOVDQU 4352(CX), Y8 VMOVDQU 4384(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 VMOVDQU 4416(CX), Y8 VMOVDQU 4448(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y6, Y6 // Store 7 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y1, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y2, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y3, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y4, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU Y5, (BP)(R15*1) MOVQ 144(R14), BP VMOVDQU Y6, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxTwo_10x7_loop VZEROUPPER mulAvxTwo_10x7_end: RET // func mulAvxTwo_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x8(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 173 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_10x8_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX MOVQ $0x0000000f, BP MOVQ BP, X8 VPBROADCASTB X8, Y8 mulAvxTwo_10x8_loop: // Clear 8 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 VPXOR Y7, Y7, Y7 // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, 
Y3, Y3 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 512(CX), Y9 VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1024(CX), Y9 VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 1536(CX), Y9 VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 
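// Editor's note, not generator output: a sketch of the pattern the generated
// code follows, inferred from the surrounding instructions. Each repeated
// group is one GF(2^8) multiply-accumulate step: VPSRLQ $0x04 and the two
// VPAND ops split every input byte into its low and high nibble (the 0x0f
// mask is broadcast into Y8 for this function), the two VPSHUFB ops look the
// nibbles up in the 16-byte product tables (repeated per 128-bit lane)
// loaded from the matrix at CX, and the closing VPXOR pair combines both
// halves and XOR-adds the product into one of the output accumulators Y0-Y7.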
VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2048(CX), Y9 VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 ADDQ $0x20, R10 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 2560(CX), Y9 VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y11 ADDQ $0x20, R11 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 3072(CX), Y9 VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 
VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (R12), Y11 ADDQ $0x20, R12 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 3584(CX), Y9 VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 3648(CX), Y9 VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 3712(CX), Y9 VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 3776(CX), Y9 VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 3840(CX), Y9 VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 3904(CX), Y9 VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 3968(CX), Y9 VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 4032(CX), Y9 VMOVDQU 4064(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 8 to 8 outputs VMOVDQU (R13), Y11 ADDQ $0x20, R13 VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 4096(CX), Y9 VMOVDQU 4128(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 4160(CX), Y9 VMOVDQU 4192(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 4224(CX), Y9 VMOVDQU 4256(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 4288(CX), Y9 VMOVDQU 4320(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 4352(CX), Y9 VMOVDQU 4384(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 4416(CX), Y9 VMOVDQU 4448(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 4480(CX), Y9 VMOVDQU 4512(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 4544(CX), Y9 VMOVDQU 4576(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 9 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 VMOVDQU 4608(CX), Y9 VMOVDQU 4640(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 VMOVDQU 4672(CX), Y9 VMOVDQU 4704(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 VMOVDQU 4736(CX), Y9 VMOVDQU 4768(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 VMOVDQU 4800(CX), Y9 VMOVDQU 4832(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 VMOVDQU 4864(CX), Y9 VMOVDQU 4896(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 VMOVDQU 4928(CX), Y9 VMOVDQU 4960(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 VMOVDQU 4992(CX), Y9 VMOVDQU 5024(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 VMOVDQU 5056(CX), Y9 VMOVDQU 5088(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y7, Y7 // Store 8 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y1, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y2, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y3, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU 
Y4, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU Y5, (BP)(R15*1) MOVQ 144(R14), BP VMOVDQU Y6, (BP)(R15*1) MOVQ 168(R14), BP VMOVDQU Y7, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxTwo_10x8_loop VZEROUPPER mulAvxTwo_10x8_end: RET // func mulAvxTwo_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x9(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 194 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_10x9_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX MOVQ $0x0000000f, BP MOVQ BP, X9 VPBROADCASTB X9, Y9 mulAvxTwo_10x9_loop: // Clear 9 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 VPXOR Y7, Y7, Y7 VPXOR Y8, Y8, Y8 // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 576(CX), Y10 VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 1024(CX), Y10 
VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1152(CX), Y10 VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 1728(CX), Y10 VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2304(CX), Y10 VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, 
Y10 VPXOR Y10, Y4, Y4 VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 ADDQ $0x20, R10 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 2880(CX), Y10 VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y12 ADDQ $0x20, R11 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 3456(CX), Y10 VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (R12), Y12 ADDQ $0x20, R12 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 4032(CX), Y10 VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 4096(CX), Y10 VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 4160(CX), Y10 VMOVDQU 4192(CX), Y11 VPSHUFB 
Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 4224(CX), Y10 VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 4288(CX), Y10 VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 4352(CX), Y10 VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 4416(CX), Y10 VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 4480(CX), Y10 VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 4544(CX), Y10 VMOVDQU 4576(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 8 to 9 outputs VMOVDQU (R13), Y12 ADDQ $0x20, R13 VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 4608(CX), Y10 VMOVDQU 4640(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 4672(CX), Y10 VMOVDQU 4704(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 4736(CX), Y10 VMOVDQU 4768(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 4800(CX), Y10 VMOVDQU 4832(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 4864(CX), Y10 VMOVDQU 4896(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 4928(CX), Y10 VMOVDQU 4960(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 4992(CX), Y10 VMOVDQU 5024(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 5056(CX), Y10 VMOVDQU 5088(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 5120(CX), Y10 VMOVDQU 5152(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 9 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 VMOVDQU 5184(CX), Y10 VMOVDQU 5216(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 VMOVDQU 5248(CX), Y10 VMOVDQU 5280(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 VMOVDQU 5312(CX), Y10 VMOVDQU 5344(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 VMOVDQU 5376(CX), Y10 VMOVDQU 5408(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 VMOVDQU 5440(CX), Y10 VMOVDQU 5472(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 VMOVDQU 5504(CX), Y10 VMOVDQU 5536(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 VMOVDQU 5568(CX), Y10 VMOVDQU 5600(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 VMOVDQU 5632(CX), Y10 VMOVDQU 5664(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 VMOVDQU 5696(CX), Y10 VMOVDQU 5728(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y8, Y8 // Store 9 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y1, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y2, (BP)(R15*1) MOVQ 72(R14), BP 
VMOVDQU Y3, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y4, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU Y5, (BP)(R15*1) MOVQ 144(R14), BP VMOVDQU Y6, (BP)(R15*1) MOVQ 168(R14), BP VMOVDQU Y7, (BP)(R15*1) MOVQ 192(R14), BP VMOVDQU Y8, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxTwo_10x9_loop VZEROUPPER mulAvxTwo_10x9_end: RET // func mulAvxTwo_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x10(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack // Full registers estimated 215 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x05, AX TESTQ AX, AX JZ mulAvxTwo_10x10_end MOVQ in_base+24(FP), DX MOVQ (DX), BX MOVQ 24(DX), SI MOVQ 48(DX), DI MOVQ 72(DX), R8 MOVQ 96(DX), R9 MOVQ 120(DX), R10 MOVQ 144(DX), R11 MOVQ 168(DX), R12 MOVQ 192(DX), R13 MOVQ 216(DX), DX MOVQ out_base+48(FP), R14 MOVQ start+72(FP), R15 // Add start offset to input ADDQ R15, BX ADDQ R15, SI ADDQ R15, DI ADDQ R15, R8 ADDQ R15, R9 ADDQ R15, R10 ADDQ R15, R11 ADDQ R15, R12 ADDQ R15, R13 ADDQ R15, DX MOVQ $0x0000000f, BP MOVQ BP, X10 VPBROADCASTB X10, Y10 mulAvxTwo_10x10_loop: // Clear 10 outputs VPXOR Y0, Y0, Y0 VPXOR Y1, Y1, Y1 VPXOR Y2, Y2, Y2 VPXOR Y3, Y3, Y3 VPXOR Y4, Y4, Y4 VPXOR Y5, Y5, Y5 VPXOR Y6, Y6, Y6 VPXOR Y7, Y7, Y7 VPXOR Y8, Y8, Y8 VPXOR Y9, Y9, Y9 // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 640(CX), Y11 VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 960(CX), Y11 VMOVDQU 
992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1280(CX), Y11 VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 1920(CX), Y11 VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 
VPAND Y10, Y14, Y14 VMOVDQU 2560(CX), Y11 VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 ADDQ $0x20, R10 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 3200(CX), Y11 VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y13 ADDQ $0x20, R11 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 3840(CX), Y11 VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB 
Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (R12), Y13 ADDQ $0x20, R12 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 4480(CX), Y11 VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 4544(CX), Y11 VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 4608(CX), Y11 VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 4672(CX), Y11 VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 4736(CX), Y11 VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 4800(CX), Y11 VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 4864(CX), Y11 VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 4928(CX), Y11 VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 4992(CX), Y11 VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 5056(CX), Y11 VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 8 to 10 outputs VMOVDQU (R13), Y13 ADDQ $0x20, R13 VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 5120(CX), Y11 VMOVDQU 5152(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 5184(CX), Y11 VMOVDQU 5216(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 5248(CX), Y11 VMOVDQU 5280(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 5312(CX), Y11 VMOVDQU 5344(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 5376(CX), Y11 VMOVDQU 5408(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 5440(CX), Y11 VMOVDQU 5472(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 5504(CX), Y11 VMOVDQU 5536(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 5568(CX), Y11 VMOVDQU 5600(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 5632(CX), Y11 VMOVDQU 5664(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 5696(CX), Y11 VMOVDQU 5728(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 9 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 VMOVDQU 5760(CX), 
Y11 VMOVDQU 5792(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 VMOVDQU 5824(CX), Y11 VMOVDQU 5856(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 VMOVDQU 5888(CX), Y11 VMOVDQU 5920(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 VMOVDQU 5952(CX), Y11 VMOVDQU 5984(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 VMOVDQU 6016(CX), Y11 VMOVDQU 6048(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 VMOVDQU 6080(CX), Y11 VMOVDQU 6112(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 VMOVDQU 6144(CX), Y11 VMOVDQU 6176(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 VMOVDQU 6208(CX), Y11 VMOVDQU 6240(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 VMOVDQU 6272(CX), Y11 VMOVDQU 6304(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 VMOVDQU 6336(CX), Y11 VMOVDQU 6368(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y9, Y9 // Store 10 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) MOVQ 24(R14), BP VMOVDQU Y1, (BP)(R15*1) MOVQ 48(R14), BP VMOVDQU Y2, (BP)(R15*1) MOVQ 72(R14), BP VMOVDQU Y3, (BP)(R15*1) MOVQ 96(R14), BP VMOVDQU Y4, (BP)(R15*1) MOVQ 120(R14), BP VMOVDQU Y5, (BP)(R15*1) MOVQ 144(R14), BP VMOVDQU Y6, (BP)(R15*1) MOVQ 168(R14), BP VMOVDQU Y7, (BP)(R15*1) MOVQ 192(R14), BP VMOVDQU Y8, (BP)(R15*1) MOVQ 216(R14), BP VMOVDQU Y9, (BP)(R15*1) // Prepare for next loop ADDQ $0x20, R15 DECQ AX JNZ mulAvxTwo_10x10_loop VZEROUPPER mulAvxTwo_10x10_end: RET reedsolomon-1.9.13/galois_gen_none.go000066400000000000000000000004031406411035300176020ustar00rootroot00000000000000//+build !amd64 noasm appengine gccgo nogen package reedsolomon const maxAvx2Inputs = 0 const maxAvx2Outputs = 0 const avx2CodeGen = false func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { panic("avx2 codegen not available") } reedsolomon-1.9.13/galois_gen_switch_amd64.go000066400000000000000000000177021406411035300211510ustar00rootroot00000000000000// Code generated by command: go generate gen.go. DO NOT EDIT. 
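// The AVX2 kernels above, such as mulAvxTwo_10x10, multiply each input byte
// by a matrix coefficient c by splitting the byte into two 4-bit nibbles and
// doing two 16-entry table lookups (VPSHUFB), then XOR-combining the halves.
// A hedged scalar sketch of the per-byte step, in terms of this package's
// mulTableLow/mulTableHigh tables (mulByteSketch is a hypothetical helper,
// not part of the generated code):
//
//	func mulByteSketch(c, in byte) byte {
//		low := mulTableLow[c][in&0x0f] // product of c and the low nibble
//		high := mulTableHigh[c][in>>4] // product of c and the high nibble
//		return low ^ high              // addition in GF(2^8) is XOR
//	}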
// +build !appengine // +build !noasm // +build gc // +build !nogen package reedsolomon import "fmt" const avx2CodeGen = true const maxAvx2Inputs = 10 const maxAvx2Outputs = 10 func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { n := stop - start n = (n >> 5) << 5 switch len(in) { case 1: switch len(out) { case 1: n = (n >> 6) << 6 mulAvxTwo_1x1_64(matrix, in, out, start, n) return n case 2: n = (n >> 6) << 6 mulAvxTwo_1x2_64(matrix, in, out, start, n) return n case 3: n = (n >> 6) << 6 mulAvxTwo_1x3_64(matrix, in, out, start, n) return n case 4: mulAvxTwo_1x4(matrix, in, out, start, n) return n case 5: mulAvxTwo_1x5(matrix, in, out, start, n) return n case 6: mulAvxTwo_1x6(matrix, in, out, start, n) return n case 7: mulAvxTwo_1x7(matrix, in, out, start, n) return n case 8: mulAvxTwo_1x8(matrix, in, out, start, n) return n case 9: mulAvxTwo_1x9(matrix, in, out, start, n) return n case 10: mulAvxTwo_1x10(matrix, in, out, start, n) return n } case 2: switch len(out) { case 1: n = (n >> 6) << 6 mulAvxTwo_2x1_64(matrix, in, out, start, n) return n case 2: n = (n >> 6) << 6 mulAvxTwo_2x2_64(matrix, in, out, start, n) return n case 3: n = (n >> 6) << 6 mulAvxTwo_2x3_64(matrix, in, out, start, n) return n case 4: mulAvxTwo_2x4(matrix, in, out, start, n) return n case 5: mulAvxTwo_2x5(matrix, in, out, start, n) return n case 6: mulAvxTwo_2x6(matrix, in, out, start, n) return n case 7: mulAvxTwo_2x7(matrix, in, out, start, n) return n case 8: mulAvxTwo_2x8(matrix, in, out, start, n) return n case 9: mulAvxTwo_2x9(matrix, in, out, start, n) return n case 10: mulAvxTwo_2x10(matrix, in, out, start, n) return n } case 3: switch len(out) { case 1: n = (n >> 6) << 6 mulAvxTwo_3x1_64(matrix, in, out, start, n) return n case 2: n = (n >> 6) << 6 mulAvxTwo_3x2_64(matrix, in, out, start, n) return n case 3: n = (n >> 6) << 6 mulAvxTwo_3x3_64(matrix, in, out, start, n) return n case 4: mulAvxTwo_3x4(matrix, in, out, start, n) return n case 5: mulAvxTwo_3x5(matrix, in, out, start, n) return n case 6: mulAvxTwo_3x6(matrix, in, out, start, n) return n case 7: mulAvxTwo_3x7(matrix, in, out, start, n) return n case 8: mulAvxTwo_3x8(matrix, in, out, start, n) return n case 9: mulAvxTwo_3x9(matrix, in, out, start, n) return n case 10: mulAvxTwo_3x10(matrix, in, out, start, n) return n } case 4: switch len(out) { case 1: n = (n >> 6) << 6 mulAvxTwo_4x1_64(matrix, in, out, start, n) return n case 2: n = (n >> 6) << 6 mulAvxTwo_4x2_64(matrix, in, out, start, n) return n case 3: n = (n >> 6) << 6 mulAvxTwo_4x3_64(matrix, in, out, start, n) return n case 4: mulAvxTwo_4x4(matrix, in, out, start, n) return n case 5: mulAvxTwo_4x5(matrix, in, out, start, n) return n case 6: mulAvxTwo_4x6(matrix, in, out, start, n) return n case 7: mulAvxTwo_4x7(matrix, in, out, start, n) return n case 8: mulAvxTwo_4x8(matrix, in, out, start, n) return n case 9: mulAvxTwo_4x9(matrix, in, out, start, n) return n case 10: mulAvxTwo_4x10(matrix, in, out, start, n) return n } case 5: switch len(out) { case 1: n = (n >> 6) << 6 mulAvxTwo_5x1_64(matrix, in, out, start, n) return n case 2: n = (n >> 6) << 6 mulAvxTwo_5x2_64(matrix, in, out, start, n) return n case 3: n = (n >> 6) << 6 mulAvxTwo_5x3_64(matrix, in, out, start, n) return n case 4: mulAvxTwo_5x4(matrix, in, out, start, n) return n case 5: mulAvxTwo_5x5(matrix, in, out, start, n) return n case 6: mulAvxTwo_5x6(matrix, in, out, start, n) return n case 7: mulAvxTwo_5x7(matrix, in, out, start, n) return n case 8: mulAvxTwo_5x8(matrix, in, out, start, n) return n 
case 9: mulAvxTwo_5x9(matrix, in, out, start, n) return n case 10: mulAvxTwo_5x10(matrix, in, out, start, n) return n } case 6: switch len(out) { case 1: n = (n >> 6) << 6 mulAvxTwo_6x1_64(matrix, in, out, start, n) return n case 2: n = (n >> 6) << 6 mulAvxTwo_6x2_64(matrix, in, out, start, n) return n case 3: n = (n >> 6) << 6 mulAvxTwo_6x3_64(matrix, in, out, start, n) return n case 4: mulAvxTwo_6x4(matrix, in, out, start, n) return n case 5: mulAvxTwo_6x5(matrix, in, out, start, n) return n case 6: mulAvxTwo_6x6(matrix, in, out, start, n) return n case 7: mulAvxTwo_6x7(matrix, in, out, start, n) return n case 8: mulAvxTwo_6x8(matrix, in, out, start, n) return n case 9: mulAvxTwo_6x9(matrix, in, out, start, n) return n case 10: mulAvxTwo_6x10(matrix, in, out, start, n) return n } case 7: switch len(out) { case 1: n = (n >> 6) << 6 mulAvxTwo_7x1_64(matrix, in, out, start, n) return n case 2: n = (n >> 6) << 6 mulAvxTwo_7x2_64(matrix, in, out, start, n) return n case 3: n = (n >> 6) << 6 mulAvxTwo_7x3_64(matrix, in, out, start, n) return n case 4: mulAvxTwo_7x4(matrix, in, out, start, n) return n case 5: mulAvxTwo_7x5(matrix, in, out, start, n) return n case 6: mulAvxTwo_7x6(matrix, in, out, start, n) return n case 7: mulAvxTwo_7x7(matrix, in, out, start, n) return n case 8: mulAvxTwo_7x8(matrix, in, out, start, n) return n case 9: mulAvxTwo_7x9(matrix, in, out, start, n) return n case 10: mulAvxTwo_7x10(matrix, in, out, start, n) return n } case 8: switch len(out) { case 1: n = (n >> 6) << 6 mulAvxTwo_8x1_64(matrix, in, out, start, n) return n case 2: n = (n >> 6) << 6 mulAvxTwo_8x2_64(matrix, in, out, start, n) return n case 3: n = (n >> 6) << 6 mulAvxTwo_8x3_64(matrix, in, out, start, n) return n case 4: mulAvxTwo_8x4(matrix, in, out, start, n) return n case 5: mulAvxTwo_8x5(matrix, in, out, start, n) return n case 6: mulAvxTwo_8x6(matrix, in, out, start, n) return n case 7: mulAvxTwo_8x7(matrix, in, out, start, n) return n case 8: mulAvxTwo_8x8(matrix, in, out, start, n) return n case 9: mulAvxTwo_8x9(matrix, in, out, start, n) return n case 10: mulAvxTwo_8x10(matrix, in, out, start, n) return n } case 9: switch len(out) { case 1: n = (n >> 6) << 6 mulAvxTwo_9x1_64(matrix, in, out, start, n) return n case 2: n = (n >> 6) << 6 mulAvxTwo_9x2_64(matrix, in, out, start, n) return n case 3: n = (n >> 6) << 6 mulAvxTwo_9x3_64(matrix, in, out, start, n) return n case 4: mulAvxTwo_9x4(matrix, in, out, start, n) return n case 5: mulAvxTwo_9x5(matrix, in, out, start, n) return n case 6: mulAvxTwo_9x6(matrix, in, out, start, n) return n case 7: mulAvxTwo_9x7(matrix, in, out, start, n) return n case 8: mulAvxTwo_9x8(matrix, in, out, start, n) return n case 9: mulAvxTwo_9x9(matrix, in, out, start, n) return n case 10: mulAvxTwo_9x10(matrix, in, out, start, n) return n } case 10: switch len(out) { case 1: n = (n >> 6) << 6 mulAvxTwo_10x1_64(matrix, in, out, start, n) return n case 2: n = (n >> 6) << 6 mulAvxTwo_10x2_64(matrix, in, out, start, n) return n case 3: n = (n >> 6) << 6 mulAvxTwo_10x3_64(matrix, in, out, start, n) return n case 4: mulAvxTwo_10x4(matrix, in, out, start, n) return n case 5: mulAvxTwo_10x5(matrix, in, out, start, n) return n case 6: mulAvxTwo_10x6(matrix, in, out, start, n) return n case 7: mulAvxTwo_10x7(matrix, in, out, start, n) return n case 8: mulAvxTwo_10x8(matrix, in, out, start, n) return n case 9: mulAvxTwo_10x9(matrix, in, out, start, n) return n case 10: mulAvxTwo_10x10(matrix, in, out, start, n) return n } } panic(fmt.Sprintf("unhandled size: %dx%d", len(in), 
len(out))) } reedsolomon-1.9.13/galois_noasm.go000066400000000000000000000014331406411035300171330ustar00rootroot00000000000000//+build !amd64 noasm appengine gccgo //+build !arm64 noasm appengine gccgo //+build !ppc64le noasm appengine gccgo // Copyright 2015, Klaus Post, see LICENSE for details. package reedsolomon func galMulSlice(c byte, in, out []byte, o *options) { out = out[:len(in)] if c == 1 { copy(out, in) return } mt := mulTable[c][:256] for n, input := range in { out[n] = mt[input] } } func galMulSliceXor(c byte, in, out []byte, o *options) { out = out[:len(in)] if c == 1 { for n, input := range in { out[n] ^= input } return } mt := mulTable[c][:256] for n, input := range in { out[n] ^= mt[input] } } // slice galois add func sliceXor(in, out []byte, o *options) { for n, input := range in { out[n] ^= input } } func init() { defaultOptions.useAVX512 = false } reedsolomon-1.9.13/galois_notamd64.go000066400000000000000000000007541406411035300174570ustar00rootroot00000000000000//+build !amd64 noasm appengine gccgo // Copyright 2020, Klaus Post, see LICENSE for details. package reedsolomon func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { panic("codeSomeShardsAvx512 should not be called if built without asm") } func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { panic("codeSomeShardsAvx512P should not be called if built without asm") } reedsolomon-1.9.13/galois_ppc64le.go000066400000000000000000000026661406411035300173040ustar00rootroot00000000000000//+build !noasm //+build !appengine //+build !gccgo // Copyright 2015, Klaus Post, see LICENSE for details. // Copyright 2018, Minio, Inc. package reedsolomon //go:noescape func galMulPpc(low, high, in, out []byte) //go:noescape func galMulPpcXor(low, high, in, out []byte) // This is what the assembler routines do in blocks of 16 bytes: /* func galMulPpc(low, high, in, out []byte) { for n, input := range in { l := input & 0xf h := input >> 4 out[n] = low[l] ^ high[h] } } func galMulPpcXor(low, high, in, out []byte) { for n, input := range in { l := input & 0xf h := input >> 4 out[n] ^= low[l] ^ high[h] } } */ func galMulSlice(c byte, in, out []byte, o *options) { if c == 1 { copy(out, in) return } done := (len(in) >> 4) << 4 if done > 0 { galMulPpc(mulTableLow[c][:], mulTableHigh[c][:], in[:done], out) } remain := len(in) - done if remain > 0 { mt := mulTable[c][:256] for i := done; i < len(in); i++ { out[i] = mt[in[i]] } } } func galMulSliceXor(c byte, in, out []byte, o *options) { if c == 1 { sliceXor(in, out, o) return } done := (len(in) >> 4) << 4 if done > 0 { galMulPpcXor(mulTableLow[c][:], mulTableHigh[c][:], in[:done], out) } remain := len(in) - done if remain > 0 { mt := mulTable[c][:256] for i := done; i < len(in); i++ { out[i] ^= mt[in[i]] } } } // slice galois add func sliceXor(in, out []byte, o *options) { for n, input := range in { out[n] ^= input } } reedsolomon-1.9.13/galois_ppc64le.s000066400000000000000000000051211406411035300171260ustar00rootroot00000000000000//+build !noasm //+build !appengine //+build !gccgo // Copyright 2015, Klaus Post, see LICENSE for details. // Copyright 2018, Minio, Inc. 
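// The two routines below use the same low/high-nibble table lookup as the
// amd64 kernels: VPERM performs the 16-byte table permutes, and each loop
// iteration processes 16 bytes of input. Commented Go equivalents of both
// routines can be found in galois_ppc64le.go above.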
#include "textflag.h" #define LOW R3 #define HIGH R4 #define IN R5 #define LEN R6 #define OUT R7 #define CONSTANTS R8 #define OFFSET R9 #define OFFSET1 R10 #define OFFSET2 R11 #define X6 VS34 #define X6_ V2 #define X7 VS35 #define X7_ V3 #define MSG VS36 #define MSG_ V4 #define MSG_HI VS37 #define MSG_HI_ V5 #define RESULT VS38 #define RESULT_ V6 #define ROTATE VS39 #define ROTATE_ V7 #define MASK VS40 #define MASK_ V8 #define FLIP VS41 #define FLIP_ V9 // func galMulPpc(low, high, in, out []byte) TEXT ·galMulPpc(SB), NOFRAME|NOSPLIT, $0-96 MOVD low+0(FP), LOW MOVD high+24(FP), HIGH MOVD in+48(FP), IN MOVD in_len+56(FP), LEN MOVD out+72(FP), OUT MOVD $16, OFFSET1 MOVD $32, OFFSET2 MOVD $·constants(SB), CONSTANTS LXVD2X (CONSTANTS)(R0), ROTATE LXVD2X (CONSTANTS)(OFFSET1), MASK LXVD2X (CONSTANTS)(OFFSET2), FLIP LXVD2X (LOW)(R0), X6 LXVD2X (HIGH)(R0), X7 VPERM X6_, V31, FLIP_, X6_ VPERM X7_, V31, FLIP_, X7_ MOVD $0, OFFSET loop: LXVD2X (IN)(OFFSET), MSG VSRB MSG_, ROTATE_, MSG_HI_ VAND MSG_, MASK_, MSG_ VPERM X6_, V31, MSG_, MSG_ VPERM X7_, V31, MSG_HI_, MSG_HI_ VXOR MSG_, MSG_HI_, MSG_ STXVD2X MSG, (OUT)(OFFSET) ADD $16, OFFSET, OFFSET CMP LEN, OFFSET BGT loop RET // func galMulPpcXorlow, high, in, out []byte) TEXT ·galMulPpcXor(SB), NOFRAME|NOSPLIT, $0-96 MOVD low+0(FP), LOW MOVD high+24(FP), HIGH MOVD in+48(FP), IN MOVD in_len+56(FP), LEN MOVD out+72(FP), OUT MOVD $16, OFFSET1 MOVD $32, OFFSET2 MOVD $·constants(SB), CONSTANTS LXVD2X (CONSTANTS)(R0), ROTATE LXVD2X (CONSTANTS)(OFFSET1), MASK LXVD2X (CONSTANTS)(OFFSET2), FLIP LXVD2X (LOW)(R0), X6 LXVD2X (HIGH)(R0), X7 VPERM X6_, V31, FLIP_, X6_ VPERM X7_, V31, FLIP_, X7_ MOVD $0, OFFSET loopXor: LXVD2X (IN)(OFFSET), MSG LXVD2X (OUT)(OFFSET), RESULT VSRB MSG_, ROTATE_, MSG_HI_ VAND MSG_, MASK_, MSG_ VPERM X6_, V31, MSG_, MSG_ VPERM X7_, V31, MSG_HI_, MSG_HI_ VXOR MSG_, MSG_HI_, MSG_ VXOR MSG_, RESULT_, RESULT_ STXVD2X RESULT, (OUT)(OFFSET) ADD $16, OFFSET, OFFSET CMP LEN, OFFSET BGT loopXor RET DATA ·constants+0x0(SB)/8, $0x0404040404040404 DATA ·constants+0x8(SB)/8, $0x0404040404040404 DATA ·constants+0x10(SB)/8, $0x0f0f0f0f0f0f0f0f DATA ·constants+0x18(SB)/8, $0x0f0f0f0f0f0f0f0f DATA ·constants+0x20(SB)/8, $0x0706050403020100 DATA ·constants+0x28(SB)/8, $0x0f0e0d0c0b0a0908 GLOBL ·constants(SB), 8, $48 reedsolomon-1.9.13/galois_test.go000066400000000000000000000146271406411035300170060ustar00rootroot00000000000000/** * Unit tests for Galois * * Copyright 2015, Klaus Post * Copyright 2015, Backblaze, Inc. 
 */

package reedsolomon

import (
	"bytes"
	"testing"
)

func TestAssociativity(t *testing.T) {
	for i := 0; i < 256; i++ {
		a := byte(i)
		for j := 0; j < 256; j++ {
			b := byte(j)
			for k := 0; k < 256; k++ {
				c := byte(k)
				x := galAdd(a, galAdd(b, c))
				y := galAdd(galAdd(a, b), c)
				if x != y {
					t.Fatal("add does not match:", x, "!=", y)
				}
				x = galMultiply(a, galMultiply(b, c))
				y = galMultiply(galMultiply(a, b), c)
				if x != y {
					t.Fatal("multiply does not match:", x, "!=", y)
				}
			}
		}
	}
}

func TestIdentity(t *testing.T) {
	for i := 0; i < 256; i++ {
		a := byte(i)
		b := galAdd(a, 0)
		if a != b {
			t.Fatal("Add zero should yield same result", a, "!=", b)
		}
		b = galMultiply(a, 1)
		if a != b {
			t.Fatal("Mul by one should yield same result", a, "!=", b)
		}
	}
}

func TestInverse(t *testing.T) {
	for i := 0; i < 256; i++ {
		a := byte(i)
		b := galSub(0, a)
		c := galAdd(a, b)
		if c != 0 {
			t.Fatal("inverse sub/add", c, "!=", 0)
		}
		if a != 0 {
			b = galDivide(1, a)
			c = galMultiply(a, b)
			if c != 1 {
				t.Fatal("inverse div/mul", c, "!=", 1)
			}
		}
	}
}

func TestCommutativity(t *testing.T) {
	for i := 0; i < 256; i++ {
		a := byte(i)
		for j := 0; j < 256; j++ {
			b := byte(j)
			x := galAdd(a, b)
			y := galAdd(b, a)
			if x != y {
				t.Fatal(x, "!= ", y)
			}
			x = galMultiply(a, b)
			y = galMultiply(b, a)
			if x != y {
				t.Fatal(x, "!= ", y)
			}
		}
	}
}

func TestDistributivity(t *testing.T) {
	for i := 0; i < 256; i++ {
		a := byte(i)
		for j := 0; j < 256; j++ {
			b := byte(j)
			for k := 0; k < 256; k++ {
				c := byte(k)
				x := galMultiply(a, galAdd(b, c))
				y := galAdd(galMultiply(a, b), galMultiply(a, c))
				if x != y {
					t.Fatal(x, "!= ", y)
				}
			}
		}
	}
}

func TestExp(t *testing.T) {
	for i := 0; i < 256; i++ {
		a := byte(i)
		power := byte(1)
		for j := 0; j < 256; j++ {
			x := galExp(a, j)
			if x != power {
				t.Fatal(x, "!=", power)
			}
			power = galMultiply(power, a)
		}
	}
}

func testGalois(t *testing.T, o *options) {
	// These values were copied from the output of the Python code.
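	// Worked check of the first case below: multiplication in GF(2^8) is
	// carry-less polynomial multiplication modulo the field polynomial.
	// 3*4 = (x+1)*x^2 = x^3+x^2 = 0b1100 = 12 (no reduction needed), and
	// 7*7 = (x^2+x+1)^2 = x^4+x^2+1 = 0b10101 = 21.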
if galMultiply(3, 4) != 12 { t.Fatal("galMultiply(3, 4) != 12") } if galMultiply(7, 7) != 21 { t.Fatal("galMultiply(7, 7) != 21") } if galMultiply(23, 45) != 41 { t.Fatal("galMultiply(23, 45) != 41") } // Test slices (>32 entries to test assembler -- AVX2 & NEON) in := []byte{0, 1, 2, 3, 4, 5, 6, 10, 50, 100, 150, 174, 201, 255, 99, 32, 67, 85, 200, 199, 198, 197, 196, 195, 194, 193, 192, 191, 190, 189, 188, 187, 186, 185} out := make([]byte, len(in)) galMulSlice(25, in, out, o) expect := []byte{0x0, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0xfa, 0xb8, 0x6d, 0xc7, 0x85, 0xc3, 0x1f, 0x22, 0x7, 0x25, 0xfe, 0xda, 0x5d, 0x44, 0x6f, 0x76, 0x39, 0x20, 0xb, 0x12, 0x11, 0x8, 0x23, 0x3a, 0x75, 0x6c, 0x47} if 0 != bytes.Compare(out, expect) { t.Errorf("got %#v, expected %#v", out, expect) } expectXor := []byte{0x0, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0x2f, 0x79, 0xf2, 0x7, 0x51, 0xd4, 0x19, 0x31, 0xc9, 0xf8, 0xfc, 0xf9, 0x4f, 0x62, 0x15, 0x38, 0xfb, 0xd6, 0xa1, 0x8c, 0x96, 0xbb, 0xcc, 0xe1, 0x22, 0xf, 0x78} galMulSliceXor(52, in, out, o) if 0 != bytes.Compare(out, expectXor) { t.Errorf("got %#v, expected %#v", out, expectXor) } galMulSlice(177, in, out, o) expect = []byte{0x0, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x9e, 0x3, 0x6, 0xe8, 0x75, 0xbd, 0x40, 0x36, 0xa3, 0x95, 0xcb, 0xc, 0xdd, 0x6c, 0xa2, 0x13, 0x23, 0x92, 0x5c, 0xed, 0x1b, 0xaa, 0x64, 0xd5, 0xe5, 0x54, 0x9a} if 0 != bytes.Compare(out, expect) { t.Errorf("got %#v, expected %#v", out, expect) } expectXor = []byte{0x0, 0xc4, 0x95, 0x51, 0x37, 0xf3, 0xa2, 0xfb, 0xec, 0xc5, 0xd0, 0xc7, 0x53, 0x88, 0xa3, 0xa5, 0x6, 0x78, 0x97, 0x9f, 0x5b, 0xa, 0xce, 0xa8, 0x6c, 0x3d, 0xf9, 0xdf, 0x1b, 0x4a, 0x8e, 0xe8, 0x2c, 0x7d} galMulSliceXor(117, in, out, o) if 0 != bytes.Compare(out, expectXor) { t.Errorf("got %#v, expected %#v", out, expectXor) } if galExp(2, 2) != 4 { t.Fatal("galExp(2, 2) != 4") } if galExp(5, 20) != 235 { t.Fatal("galExp(5, 20) != 235") } if galExp(13, 7) != 43 { t.Fatal("galExp(13, 7) != 43") } } func TestGalois(t *testing.T) { // invoke with all combinations of asm instructions o := options{} o.useSSSE3, o.useAVX2 = false, false testGalois(t, &o) o.useSSSE3, o.useAVX2 = true, false testGalois(t, &o) if defaultOptions.useAVX2 { o.useSSSE3, o.useAVX2 = false, true testGalois(t, &o) } } func TestSliceGalAdd(t *testing.T) { lengthList := []int{16, 32, 34} for _, length := range lengthList { in := make([]byte, length) fillRandom(in) out := make([]byte, length) fillRandom(out) expect := make([]byte, length) for i := range expect { expect[i] = in[i] ^ out[i] } noSSE2 := defaultOptions noSSE2.useSSE2 = false sliceXor(in, out, &noSSE2) if 0 != bytes.Compare(out, expect) { t.Errorf("got %#v, expected %#v", out, expect) } fillRandom(out) for i := range expect { expect[i] = in[i] ^ out[i] } sliceXor(in, out, &defaultOptions) if 0 != bytes.Compare(out, expect) { t.Errorf("got %#v, expected %#v", out, expect) } } for i := 0; i < 256; i++ { a := byte(i) for j := 0; j < 256; j++ { b := byte(j) for k := 0; k < 256; k++ { c := byte(k) x := galAdd(a, galAdd(b, c)) y := galAdd(galAdd(a, b), c) if x != y { t.Fatal("add does not match:", x, "!=", y) } x = galMultiply(a, galMultiply(b, c)) y = galMultiply(galMultiply(a, b), c) if x != y { t.Fatal("multiply does not match:", x, "!=", y) } } } } } func benchmarkGalois(b *testing.B, size int) { in := make([]byte, size) out := make([]byte, size) o := options{} o.useSSSE3, o.useAVX2 = !*noSSSE3, !*noAVX2 b.SetBytes(int64(size)) b.ResetTimer() for i := 0; i < b.N; i++ { galMulSlice(25, in[:], out[:], &o) } } func 
BenchmarkGalois128K(b *testing.B) { benchmarkGalois(b, 128*1024) } func BenchmarkGalois1M(b *testing.B) { benchmarkGalois(b, 1024*1024) } func benchmarkGaloisXor(b *testing.B, size int) { in := make([]byte, size) out := make([]byte, size) o := options{} o.useSSSE3, o.useAVX2 = !*noSSSE3, !*noAVX2 b.SetBytes(int64(size)) b.ResetTimer() for i := 0; i < b.N; i++ { galMulSliceXor(177, in[:], out[:], &o) } } func BenchmarkGaloisXor128K(b *testing.B) { benchmarkGaloisXor(b, 128*1024) } func BenchmarkGaloisXor1M(b *testing.B) { benchmarkGaloisXor(b, 1024*1024) } reedsolomon-1.9.13/gentables.go000066400000000000000000000063461406411035300164340ustar00rootroot00000000000000//+build ignore package main import ( "fmt" ) var logTable = [fieldSize]int16{ -1, 0, 1, 25, 2, 50, 26, 198, 3, 223, 51, 238, 27, 104, 199, 75, 4, 100, 224, 14, 52, 141, 239, 129, 28, 193, 105, 248, 200, 8, 76, 113, 5, 138, 101, 47, 225, 36, 15, 33, 53, 147, 142, 218, 240, 18, 130, 69, 29, 181, 194, 125, 106, 39, 249, 185, 201, 154, 9, 120, 77, 228, 114, 166, 6, 191, 139, 98, 102, 221, 48, 253, 226, 152, 37, 179, 16, 145, 34, 136, 54, 208, 148, 206, 143, 150, 219, 189, 241, 210, 19, 92, 131, 56, 70, 64, 30, 66, 182, 163, 195, 72, 126, 110, 107, 58, 40, 84, 250, 133, 186, 61, 202, 94, 155, 159, 10, 21, 121, 43, 78, 212, 229, 172, 115, 243, 167, 87, 7, 112, 192, 247, 140, 128, 99, 13, 103, 74, 222, 237, 49, 197, 254, 24, 227, 165, 153, 119, 38, 184, 180, 124, 17, 68, 146, 217, 35, 32, 137, 46, 55, 63, 209, 91, 149, 188, 207, 205, 144, 135, 151, 178, 220, 252, 190, 97, 242, 86, 211, 171, 20, 42, 93, 158, 132, 60, 57, 83, 71, 109, 65, 162, 31, 45, 67, 216, 183, 123, 164, 118, 196, 23, 73, 236, 127, 12, 111, 246, 108, 161, 59, 82, 41, 157, 85, 170, 251, 96, 134, 177, 187, 204, 62, 90, 203, 89, 95, 176, 156, 169, 160, 81, 11, 245, 22, 235, 122, 117, 44, 215, 79, 174, 213, 233, 230, 231, 173, 232, 116, 214, 244, 234, 168, 80, 88, 175, } const ( // The number of elements in the field. fieldSize = 256 // The polynomial used to generate the logarithm table. // // There are a number of polynomials that work to generate // a Galois field of 256 elements. The choice is arbitrary, // and we just use the first one. // // The possibilities are: 29, 43, 45, 77, 95, 99, 101, 105, //* 113, 135, 141, 169, 195, 207, 231, and 245. generatingPolynomial = 29 ) func main() { t := generateExpTable() fmt.Printf("var expTable = %#v\n", t) //t2 := generateMulTableSplit(t) //fmt.Printf("var mulTable = %#v\n", t2) low, high := generateMulTableHalf(t) fmt.Printf("var mulTableLow = %#v\n", low) fmt.Printf("var mulTableHigh = %#v\n", high) } /** * Generates the inverse log table. 
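 * The returned table is double length (2*fieldSize-2 = 510 entries) so that
 * expTable[logA+logB] can be indexed without reducing the exponent sum
 * modulo 255: logA and logB are each at most 254, so logA+logB <= 508.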
*/ func generateExpTable() []byte { result := make([]byte, fieldSize*2-2) for i := 1; i < fieldSize; i++ { log := logTable[i] result[log] = byte(i) result[log+fieldSize-1] = byte(i) } return result } func generateMulTable(expTable []byte) []byte { result := make([]byte, 256*256) for v := range result { a := byte(v & 0xff) b := byte(v >> 8) if a == 0 || b == 0 { result[v] = 0 continue } logA := int(logTable[a]) logB := int(logTable[b]) result[v] = expTable[logA+logB] } return result } func generateMulTableSplit(expTable []byte) [256][256]byte { var result [256][256]byte for a := range result { for b := range result[a] { if a == 0 || b == 0 { result[a][b] = 0 continue } logA := int(logTable[a]) logB := int(logTable[b]) result[a][b] = expTable[logA+logB] } } return result } func generateMulTableHalf(expTable []byte) (low [256][16]byte, high [256][16]byte) { for a := range low { for b := range low { result := 0 if !(a == 0 || b == 0) { logA := int(logTable[a]) logB := int(logTable[b]) result = int(expTable[logA+logB]) } if (b & 0xf) == b { low[a][b] = byte(result) } if (b & 0xf0) == b { high[a][b>>4] = byte(result) } } } return } reedsolomon-1.9.13/go.mod000066400000000000000000000001371406411035300152370ustar00rootroot00000000000000module github.com/klauspost/reedsolomon go 1.14 require github.com/klauspost/cpuid/v2 v2.0.6 reedsolomon-1.9.13/go.sum000066400000000000000000000002611406411035300152620ustar00rootroot00000000000000github.com/klauspost/cpuid/v2 v2.0.6 h1:dQ5ueTiftKxp0gyjKSx5+8BtPWkyQbd95m8Gys/RarI= github.com/klauspost/cpuid/v2 v2.0.6/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= reedsolomon-1.9.13/inversion_tree.go000066400000000000000000000135031406411035300175140ustar00rootroot00000000000000/** * A thread-safe tree which caches inverted matrices. * * Copyright 2016, Peter Collins */ package reedsolomon import ( "errors" "sync" ) // The tree uses a Reader-Writer mutex to make it thread-safe // when accessing cached matrices and inserting new ones. type inversionTree struct { mutex sync.RWMutex root inversionNode } type inversionNode struct { matrix matrix children []*inversionNode } // newInversionTree initializes a tree for storing inverted matrices. // Note that the root node is the identity matrix as it implies // there were no errors with the original data. func newInversionTree(dataShards, parityShards int) *inversionTree { identity, _ := identityMatrix(dataShards) return &inversionTree{ root: inversionNode{ matrix: identity, children: make([]*inversionNode, dataShards+parityShards), }, } } // GetInvertedMatrix returns the cached inverted matrix or nil if it // is not found in the tree keyed on the indices of invalid rows. func (t *inversionTree) GetInvertedMatrix(invalidIndices []int) matrix { if t == nil { return nil } // Lock the tree for reading before accessing the tree. t.mutex.RLock() defer t.mutex.RUnlock() // If no invalid indices were give we should return the root // identity matrix. if len(invalidIndices) == 0 { return t.root.matrix } // Recursively search for the inverted matrix in the tree, passing in // 0 as the parent index as we start at the root of the tree. return t.root.getInvertedMatrix(invalidIndices, 0) } // errAlreadySet is returned if the root node matrix is overwritten var errAlreadySet = errors.New("the root node identity matrix is already set") // InsertInvertedMatrix inserts a new inverted matrix into the tree // keyed by the indices of invalid rows. 
The total number of shards // is required for creating the proper length lists of child nodes for // each node. func (t *inversionTree) InsertInvertedMatrix(invalidIndices []int, matrix matrix, shards int) error { if t == nil { return nil } // If no invalid indices were given then we are done because the // root node is already set with the identity matrix. if len(invalidIndices) == 0 { return errAlreadySet } if !matrix.IsSquare() { return errNotSquare } // Lock the tree for writing and reading before accessing the tree. t.mutex.Lock() defer t.mutex.Unlock() // Recursively create nodes for the inverted matrix in the tree until // we reach the node to insert the matrix to. We start by passing in // 0 as the parent index as we start at the root of the tree. t.root.insertInvertedMatrix(invalidIndices, matrix, shards, 0) return nil } func (n *inversionNode) getInvertedMatrix(invalidIndices []int, parent int) matrix { // Get the child node to search next from the list of children. The // list of children starts relative to the parent index passed in // because the indices of invalid rows is sorted (by default). As we // search recursively, the first invalid index gets popped off the list, // so when searching through the list of children, use that first invalid // index to find the child node. firstIndex := invalidIndices[0] node := n.children[firstIndex-parent] // If the child node doesn't exist in the list yet, fail fast by // returning, so we can construct and insert the proper inverted matrix. if node == nil { return nil } // If there's more than one invalid index left in the list we should // keep searching recursively. if len(invalidIndices) > 1 { // Search recursively on the child node by passing in the invalid indices // with the first index popped off the front. Also the parent index to // pass down is the first index plus one. return node.getInvertedMatrix(invalidIndices[1:], firstIndex+1) } // If there aren't any more invalid indices to search, we've found our // node. Return it, however keep in mind that the matrix could still be // nil because intermediary nodes in the tree are created sometimes with // their inversion matrices uninitialized. return node.matrix } func (n *inversionNode) insertInvertedMatrix(invalidIndices []int, matrix matrix, shards, parent int) { // As above, get the child node to search next from the list of children. // The list of children starts relative to the parent index passed in // because the indices of invalid rows is sorted (by default). As we // search recursively, the first invalid index gets popped off the list, // so when searching through the list of children, use that first invalid // index to find the child node. firstIndex := invalidIndices[0] node := n.children[firstIndex-parent] // If the child node doesn't exist in the list yet, create a new // node because we have the writer lock and add it to the list // of children. if node == nil { // Make the length of the list of children equal to the number // of shards minus the first invalid index because the list of // invalid indices is sorted, so only this length of errors // are possible in the tree. node = &inversionNode{ children: make([]*inversionNode, shards-firstIndex), } // Insert the new node into the tree at the first index relative // to the parent index that was given in this recursive call. n.children[firstIndex-parent] = node } // If there's more than one invalid index left in the list we should // keep searching recursively in order to find the node to add our // matrix. 
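	// For example, with 5 total shards, the inverted matrix for missing
	// rows {1, 3} ends up at root.children[1].children[1]: the second
	// lookup is made relative to parent = 1+1, so the child index is 3-2 = 1.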
if len(invalidIndices) > 1 { // As above, search recursively on the child node by passing in // the invalid indices with the first index popped off the front. // Also the total number of shards and parent index are passed down // which is equal to the first index plus one. node.insertInvertedMatrix(invalidIndices[1:], matrix, shards, firstIndex+1) } else { // If there aren't any more invalid indices to search, we've found our // node. Cache the inverted matrix in this node. node.matrix = matrix } } reedsolomon-1.9.13/inversion_tree_test.go000066400000000000000000000056231406411035300205570ustar00rootroot00000000000000/** * Unit tests for inversion tree. * * Copyright 2016, Peter Collins */ package reedsolomon import ( "testing" ) func TestNewInversionTree(t *testing.T) { tree := newInversionTree(3, 2) children := len(tree.root.children) if children != 5 { t.Fatal("Root node children list length", children, "!=", 5) } str := tree.root.matrix.String() expect := "[[1, 0, 0], [0, 1, 0], [0, 0, 1]]" if str != expect { t.Fatal(str, "!=", expect) } } func TestGetInvertedMatrix(t *testing.T) { tree := newInversionTree(3, 2) matrix := tree.GetInvertedMatrix([]int{}) str := matrix.String() expect := "[[1, 0, 0], [0, 1, 0], [0, 0, 1]]" if str != expect { t.Fatal(str, "!=", expect) } matrix = tree.GetInvertedMatrix([]int{1}) if matrix != nil { t.Fatal(matrix, "!= nil") } matrix = tree.GetInvertedMatrix([]int{1, 2}) if matrix != nil { t.Fatal(matrix, "!= nil") } matrix, err := newMatrix(3, 3) if err != nil { t.Fatalf("Failed initializing new Matrix : %s", err) } err = tree.InsertInvertedMatrix([]int{1}, matrix, 5) if err != nil { t.Fatalf("Failed inserting new Matrix : %s", err) } cachedMatrix := tree.GetInvertedMatrix([]int{1}) if cachedMatrix == nil { t.Fatal(cachedMatrix, "== nil") } if matrix.String() != cachedMatrix.String() { t.Fatal(matrix.String(), "!=", cachedMatrix.String()) } } func TestInsertInvertedMatrix(t *testing.T) { tree := newInversionTree(3, 2) matrix, err := newMatrix(3, 3) if err != nil { t.Fatalf("Failed initializing new Matrix : %s", err) } err = tree.InsertInvertedMatrix([]int{1}, matrix, 5) if err != nil { t.Fatalf("Failed inserting new Matrix : %s", err) } err = tree.InsertInvertedMatrix([]int{}, matrix, 5) if err == nil { t.Fatal("Should have failed inserting the root node matrix", matrix) } matrix, err = newMatrix(3, 2) if err != nil { t.Fatalf("Failed initializing new Matrix : %s", err) } err = tree.InsertInvertedMatrix([]int{2}, matrix, 5) if err == nil { t.Fatal("Should have failed inserting a non-square matrix", matrix) } matrix, err = newMatrix(3, 3) if err != nil { t.Fatalf("Failed initializing new Matrix : %s", err) } err = tree.InsertInvertedMatrix([]int{0, 1}, matrix, 5) if err != nil { t.Fatalf("Failed inserting new Matrix : %s", err) } } func TestDoubleInsertInvertedMatrix(t *testing.T) { tree := newInversionTree(3, 2) matrix, err := newMatrix(3, 3) if err != nil { t.Fatalf("Failed initializing new Matrix : %s", err) } err = tree.InsertInvertedMatrix([]int{1}, matrix, 5) if err != nil { t.Fatalf("Failed inserting new Matrix : %s", err) } err = tree.InsertInvertedMatrix([]int{1}, matrix, 5) if err != nil { t.Fatalf("Failed inserting new Matrix : %s", err) } cachedMatrix := tree.GetInvertedMatrix([]int{1}) if cachedMatrix == nil { t.Fatal(cachedMatrix, "== nil") } if matrix.String() != cachedMatrix.String() { t.Fatal(matrix.String(), "!=", cachedMatrix.String()) } } 
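// A hedged sketch (not part of the original tests) of how the tree is
// typically consulted during reconstruction: look up the cached inversion
// for the sorted invalid row indices, and on a miss invert once and cache
// the result. The helper name cachedInvert is hypothetical.
func cachedInvert(tree *inversionTree, invalidIndices []int, sub matrix, shards int) (matrix, error) {
	if m := tree.GetInvertedMatrix(invalidIndices); m != nil {
		return m, nil // cache hit: reuse the earlier inversion
	}
	m, err := sub.Invert() // cache miss: do the expensive inversion once
	if err != nil {
		return nil, err
	}
	// Remember it for later reconstructions with the same missing rows.
	if err := tree.InsertInvertedMatrix(invalidIndices, m, shards); err != nil {
		return nil, err
	}
	return m, nil
}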
reedsolomon-1.9.13/matrix.go000066400000000000000000000154151406411035300157710ustar00rootroot00000000000000/**
 * Matrix Algebra over an 8-bit Galois Field
 *
 * Copyright 2015, Klaus Post
 * Copyright 2015, Backblaze, Inc.
 */

package reedsolomon

import (
	"errors"
	"fmt"
	"strconv"
	"strings"
)

// byte[row][col]
type matrix [][]byte

// newMatrix returns a matrix of zeros.
func newMatrix(rows, cols int) (matrix, error) {
	if rows <= 0 {
		return nil, errInvalidRowSize
	}
	if cols <= 0 {
		return nil, errInvalidColSize
	}

	m := matrix(make([][]byte, rows))
	for i := range m {
		m[i] = make([]byte, cols)
	}

	return m, nil
}

// newMatrixData initializes a matrix with the given row-major data.
// Note that data is not copied from input.
func newMatrixData(data [][]byte) (matrix, error) {
	m := matrix(data)
	err := m.Check()
	if err != nil {
		return nil, err
	}
	return m, nil
}

// identityMatrix returns an identity matrix of the given size.
func identityMatrix(size int) (matrix, error) {
	m, err := newMatrix(size, size)
	if err != nil {
		return nil, err
	}
	for i := range m {
		m[i][i] = 1
	}
	return m, nil
}

// errInvalidRowSize will be returned if attempting to create a matrix with a negative or zero row count.
var errInvalidRowSize = errors.New("invalid row size")

// errInvalidColSize will be returned if attempting to create a matrix with a negative or zero column count.
var errInvalidColSize = errors.New("invalid column size")

// errColSizeMismatch is returned if the matrix rows don't all have the same number of columns.
var errColSizeMismatch = errors.New("column size is not the same for all rows")

func (m matrix) Check() error {
	rows := len(m)
	if rows <= 0 {
		return errInvalidRowSize
	}
	cols := len(m[0])
	if cols <= 0 {
		return errInvalidColSize
	}

	for _, col := range m {
		if len(col) != cols {
			return errColSizeMismatch
		}
	}
	return nil
}

// String returns a human-readable string of the matrix contents.
//
// Example: [[1, 2], [3, 4]]
func (m matrix) String() string {
	rowOut := make([]string, 0, len(m))
	for _, row := range m {
		colOut := make([]string, 0, len(row))
		for _, col := range row {
			colOut = append(colOut, strconv.Itoa(int(col)))
		}
		rowOut = append(rowOut, "["+strings.Join(colOut, ", ")+"]")
	}
	return "[" + strings.Join(rowOut, ", ") + "]"
}

// Multiply multiplies this matrix (the one on the left) by another
// matrix (the one on the right) and returns a new matrix with the result.
func (m matrix) Multiply(right matrix) (matrix, error) {
	if len(m[0]) != len(right) {
		return nil, fmt.Errorf("columns on left (%d) is different than rows on right (%d)", len(m[0]), len(right))
	}
	result, _ := newMatrix(len(m), len(right[0]))
	for r, row := range result {
		for c := range row {
			var value byte
			for i := range m[0] {
				value ^= galMultiply(m[r][i], right[i][c])
			}
			result[r][c] = value
		}
	}
	return result, nil
}

// Augment returns the concatenation of this matrix and the matrix on the right.
func (m matrix) Augment(right matrix) (matrix, error) {
	if len(m) != len(right) {
		return nil, errMatrixSize
	}

	result, _ := newMatrix(len(m), len(m[0])+len(right[0]))
	for r, row := range m {
		for c := range row {
			result[r][c] = m[r][c]
		}
		cols := len(m[0])
		for c := range right[0] {
			result[r][cols+c] = right[r][c]
		}
	}
	return result, nil
}

// errMatrixSize is returned if the matrix dimensions don't match.
var errMatrixSize = errors.New("matrix sizes do not match")

func (m matrix) SameSize(n matrix) error {
	if len(m) != len(n) {
		return errMatrixSize
	}
	for i := range m {
		if len(m[i]) != len(n[i]) {
			return errMatrixSize
		}
	}
	return nil
}

// SubMatrix returns a part of this matrix.
// Data is copied.
func (m matrix) SubMatrix(rmin, cmin, rmax, cmax int) (matrix, error) {
	result, err := newMatrix(rmax-rmin, cmax-cmin)
	if err != nil {
		return nil, err
	}
	// OPTME: If used heavily, use copy function to copy slice
	for r := rmin; r < rmax; r++ {
		for c := cmin; c < cmax; c++ {
			result[r-rmin][c-cmin] = m[r][c]
		}
	}
	return result, nil
}

// SwapRows exchanges two rows in the matrix.
func (m matrix) SwapRows(r1, r2 int) error {
	if r1 < 0 || len(m) <= r1 || r2 < 0 || len(m) <= r2 {
		return errInvalidRowSize
	}
	m[r2], m[r1] = m[r1], m[r2]
	return nil
}

// IsSquare will return true if the matrix is square,
// otherwise false.
func (m matrix) IsSquare() bool {
	return len(m) == len(m[0])
}

// errSingular is returned if the matrix is singular and cannot be inverted
var errSingular = errors.New("matrix is singular")

// errNotSquare is returned if attempting to invert a non-square matrix.
var errNotSquare = errors.New("only square matrices can be inverted")

// Invert returns the inverse of this matrix.
// Returns errSingular when the matrix is singular and doesn't have an inverse.
// The matrix must be square, otherwise errNotSquare is returned.
func (m matrix) Invert() (matrix, error) {
	if !m.IsSquare() {
		return nil, errNotSquare
	}

	size := len(m)
	work, _ := identityMatrix(size)
	work, _ = m.Augment(work)

	err := work.gaussianElimination()
	if err != nil {
		return nil, err
	}

	return work.SubMatrix(0, size, size, size*2)
}

func (m matrix) gaussianElimination() error {
	rows := len(m)
	columns := len(m[0])
	// Clear out the part below the main diagonal and scale the main
	// diagonal to be 1.
	for r := 0; r < rows; r++ {
		// If the element on the diagonal is 0, find a row below
		// that has a non-zero and swap them.
		if m[r][r] == 0 {
			for rowBelow := r + 1; rowBelow < rows; rowBelow++ {
				if m[rowBelow][r] != 0 {
					err := m.SwapRows(r, rowBelow)
					if err != nil {
						return err
					}
					break
				}
			}
		}
		// If we couldn't find one, the matrix is singular.
		if m[r][r] == 0 {
			return errSingular
		}
		// Scale to 1.
		if m[r][r] != 1 {
			scale := galDivide(1, m[r][r])
			for c := 0; c < columns; c++ {
				m[r][c] = galMultiply(m[r][c], scale)
			}
		}
		// Make everything below the 1 be a 0 by subtracting
		// a multiple of it. (Subtraction and addition are
		// both exclusive or in the Galois field.)
		for rowBelow := r + 1; rowBelow < rows; rowBelow++ {
			if m[rowBelow][r] != 0 {
				scale := m[rowBelow][r]
				for c := 0; c < columns; c++ {
					m[rowBelow][c] ^= galMultiply(scale, m[r][c])
				}
			}
		}
	}

	// Now clear the part above the main diagonal.
	for d := 0; d < rows; d++ {
		for rowAbove := 0; rowAbove < d; rowAbove++ {
			if m[rowAbove][d] != 0 {
				scale := m[rowAbove][d]
				for c := 0; c < columns; c++ {
					m[rowAbove][c] ^= galMultiply(scale, m[d][c])
				}
			}
		}
	}
	return nil
}

// vandermonde creates a Vandermonde matrix, which is guaranteed to have the
// property that any subset of rows that forms a square matrix
// is invertible.
func vandermonde(rows, cols int) (matrix, error) {
	result, err := newMatrix(rows, cols)
	if err != nil {
		return nil, err
	}
	for r, row := range result {
		for c := range row {
			result[r][c] = galExp(byte(r), c)
		}
	}
	return result, nil
}

reedsolomon-1.9.13/matrix_test.go000066400000000000000000000140511406411035300170230ustar00rootroot00000000000000/**
 * Unit tests for Matrix
 *
 * Copyright 2015, Klaus Post
 * Copyright 2015, Backblaze, Inc. All rights reserved.
 */

package reedsolomon

import (
	"testing"
)

// TestNewMatrix validates the result for invalid input and the allocations made by the newMatrix method.
func TestNewMatrix(t *testing.T) { testCases := []struct { rows int columns int // flag to indicate whether the test should pass. shouldPass bool expectedResult matrix expectedErr error }{ // Test case - 1. // Test case with a negative row size. {-1, 10, false, nil, errInvalidRowSize}, // Test case - 2. // Test case with a negative column size. {10, -1, false, nil, errInvalidColSize}, // Test case - 3. // Test case with negative value for both row and column size. {-1, -1, false, nil, errInvalidRowSize}, // Test case - 4. // Test case with 0 value for row size. {0, 10, false, nil, errInvalidRowSize}, // Test case - 5. // Test case with 0 value for column size. {-1, 0, false, nil, errInvalidRowSize}, // Test case - 6. // Test case with 0 value for both row and column size. {0, 0, false, nil, errInvalidRowSize}, } for i, testCase := range testCases { actualResult, actualErr := newMatrix(testCase.rows, testCase.columns) if actualErr != nil && testCase.shouldPass { t.Errorf("Test %d: Expected to pass, but failed with: %s", i+1, actualErr.Error()) } if actualErr == nil && !testCase.shouldPass { t.Errorf("Test %d: Expected to fail with \"%s\", but passed instead.", i+1, testCase.expectedErr) } // Failed as expected, but does it fail for the expected reason. if actualErr != nil && !testCase.shouldPass { if testCase.expectedErr != actualErr { t.Errorf("Test %d: Expected to fail with error \"%s\", but instead failed with error \"%s\" instead.", i+1, testCase.expectedErr, actualErr) } } // Test passes as expected, but the output values // are verified for correctness here. if actualErr == nil && testCase.shouldPass { if testCase.rows != len(actualResult) { // End the tests here if the the size doesn't match number of rows. t.Fatalf("Test %d: Expected the size of the row of the new matrix to be `%d`, but instead found `%d`", i+1, testCase.rows, len(actualResult)) } // Iterating over each row and validating the size of the column. for j, row := range actualResult { // If the row check passes, verify the size of each columns. if testCase.columns != len(row) { t.Errorf("Test %d: Row %d: Expected the size of the column of the new matrix to be `%d`, but instead found `%d`", i+1, j+1, testCase.columns, len(row)) } } } } } // TestMatrixIdentity - validates the method for returning identity matrix of given size. func TestMatrixIdentity(t *testing.T) { m, err := identityMatrix(3) if err != nil { t.Fatal(err) } str := m.String() expect := "[[1, 0, 0], [0, 1, 0], [0, 0, 1]]" if str != expect { t.Fatal(str, "!=", expect) } } // Tests validate the output of matrix multiplication method. func TestMatrixMultiply(t *testing.T) { m1, err := newMatrixData( [][]byte{ []byte{1, 2}, []byte{3, 4}, }) if err != nil { t.Fatal(err) } m2, err := newMatrixData( [][]byte{ []byte{5, 6}, []byte{7, 8}, }) if err != nil { t.Fatal(err) } actual, err := m1.Multiply(m2) if err != nil { t.Fatal(err) } str := actual.String() expect := "[[11, 22], [19, 42]]" if str != expect { t.Fatal(str, "!=", expect) } } // Tests validate the output of the method with computes inverse of matrix. func TestMatrixInverse(t *testing.T) { testCases := []struct { matrixData [][]byte // expected inverse matrix. expectedResult string // flag indicating whether the test should pass. shouldPass bool expectedErr error }{ // Test case - 1. // Test case validating inverse of the input Matrix. { // input data to construct the matrix. [][]byte{ []byte{56, 23, 98}, []byte{3, 100, 200}, []byte{45, 201, 123}, }, // expected Inverse matrix. 
"[[175, 133, 33], [130, 13, 245], [112, 35, 126]]", // test is expected to pass. true, nil, }, // Test case - 2. // Test case validating inverse of the input Matrix. { // input data to construct the matrix. [][]byte{ []byte{1, 0, 0, 0, 0}, []byte{0, 1, 0, 0, 0}, []byte{0, 0, 0, 1, 0}, []byte{0, 0, 0, 0, 1}, []byte{7, 7, 6, 6, 1}, }, // expectedInverse matrix. "[[1, 0, 0, 0, 0]," + " [0, 1, 0, 0, 0]," + " [123, 123, 1, 122, 122]," + " [0, 0, 1, 0, 0]," + " [0, 0, 0, 1, 0]]", // test is expected to pass. true, nil, }, // Test case with a non-square matrix. // expected to fail with errNotSquare. { [][]byte{ []byte{56, 23}, []byte{3, 100}, []byte{45, 201}, }, "", false, errNotSquare, }, // Test case with singular matrix. // expected to fail with error errSingular. { [][]byte{ []byte{4, 2}, []byte{12, 6}, }, "", false, errSingular, }, } for i, testCase := range testCases { m, err := newMatrixData(testCase.matrixData) if err != nil { t.Fatalf("Test %d: Failed initializing new Matrix : %s", i+1, err) } actualResult, actualErr := m.Invert() if actualErr != nil && testCase.shouldPass { t.Errorf("Test %d: Expected to pass, but failed with: %s", i+1, actualErr.Error()) } if actualErr == nil && !testCase.shouldPass { t.Errorf("Test %d: Expected to fail with \"%s\", but passed instead.", i+1, testCase.expectedErr) } // Failed as expected, but does it fail for the expected reason. if actualErr != nil && !testCase.shouldPass { if testCase.expectedErr != actualErr { t.Errorf("Test %d: Expected to fail with error \"%s\", but instead failed with error \"%s\" instead.", i+1, testCase.expectedErr, actualErr) } } // Test passes as expected, but the output values // are verified for correctness here. if actualErr == nil && testCase.shouldPass { if testCase.expectedResult != actualResult.String() { t.Errorf("Test %d: The inverse matrix doesn't match the expected result", i+1) } } } } reedsolomon-1.9.13/options.go000066400000000000000000000117751406411035300161650ustar00rootroot00000000000000package reedsolomon import ( "runtime" "github.com/klauspost/cpuid/v2" ) // Option allows to override processing parameters. type Option func(*options) type options struct { maxGoroutines int minSplitSize int shardSize int perRound int useAVX512, useAVX2, useSSSE3, useSSE2 bool usePAR1Matrix bool useCauchy bool fastOneParity bool inversionCache bool // stream options concReads bool concWrites bool streamBS int } var defaultOptions = options{ maxGoroutines: 384, minSplitSize: -1, fastOneParity: false, inversionCache: true, // Detect CPU capabilities. useSSSE3: cpuid.CPU.Supports(cpuid.SSSE3), useSSE2: cpuid.CPU.Supports(cpuid.SSE2), useAVX2: cpuid.CPU.Supports(cpuid.AVX2), useAVX512: cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512BW), } func init() { if runtime.GOMAXPROCS(0) <= 1 { defaultOptions.maxGoroutines = 1 } } // WithMaxGoroutines is the maximum number of goroutines number for encoding & decoding. // Jobs will be split into this many parts, unless each goroutine would have to process // less than minSplitSize bytes (set with WithMinSplitSize). // For the best speed, keep this well above the GOMAXPROCS number for more fine grained // scheduling. // If n <= 0, it is ignored. func WithMaxGoroutines(n int) Option { return func(o *options) { if n > 0 { o.maxGoroutines = n } } } // WithAutoGoroutines will adjust the number of goroutines for optimal speed with a // specific shard size. // Send in the shard size you expect to send. Other shard sizes will work, but may not // run at the optimal speed. 
// Overwrites WithMaxGoroutines.
// If shardSize <= 0, it is ignored.
func WithAutoGoroutines(shardSize int) Option {
	return func(o *options) {
		o.shardSize = shardSize
	}
}

// WithMinSplitSize is the minimum encoding size in bytes per goroutine.
// By default this parameter is determined by CPU cache characteristics.
// See WithMaxGoroutines on how jobs are split.
// If n <= 0, it is ignored.
func WithMinSplitSize(n int) Option {
	return func(o *options) {
		if n > 0 {
			o.minSplitSize = n
		}
	}
}

// WithConcurrentStreams will enable concurrent reads and writes on the streams.
// Default: Disabled, meaning only one stream will be read/written at a time.
// Ignored if not used on a stream input.
func WithConcurrentStreams(enabled bool) Option {
	return func(o *options) {
		o.concReads, o.concWrites = enabled, enabled
	}
}

// WithConcurrentStreamReads will enable concurrent reads from the input streams.
// Default: Disabled, meaning only one stream will be read at a time.
// Ignored if not used on a stream input.
func WithConcurrentStreamReads(enabled bool) Option {
	return func(o *options) {
		o.concReads = enabled
	}
}

// WithConcurrentStreamWrites will enable concurrent writes to the output streams.
// Default: Disabled, meaning only one stream will be written at a time.
// Ignored if not used on a stream input.
func WithConcurrentStreamWrites(enabled bool) Option {
	return func(o *options) {
		o.concWrites = enabled
	}
}

// WithInversionCache allows you to control the inversion cache.
// This will cache reconstruction matrices so they can be reused.
// Enabled by default.
func WithInversionCache(enabled bool) Option {
	return func(o *options) {
		o.inversionCache = enabled
	}
}

// WithStreamBlockSize allows you to set a custom block size per round of reads/writes.
// If not set, any shard size set with WithAutoGoroutines will be used.
// If WithAutoGoroutines is also unset, 4MB will be used.
// Ignored if not used on stream.
func WithStreamBlockSize(n int) Option {
	return func(o *options) {
		o.streamBS = n
	}
}

func withSSSE3(enabled bool) Option {
	return func(o *options) {
		o.useSSSE3 = enabled
	}
}

func withAVX2(enabled bool) Option {
	return func(o *options) {
		o.useAVX2 = enabled
	}
}

func withSSE2(enabled bool) Option {
	return func(o *options) {
		o.useSSE2 = enabled
	}
}

func withAVX512(enabled bool) Option {
	return func(o *options) {
		o.useAVX512 = enabled
	}
}

// WithPAR1Matrix causes the encoder to build the matrix the way PARv1
// does. Note that the method they use is buggy, and may lead to cases
// where recovery is impossible, even if there are enough parity
// shards.
func WithPAR1Matrix() Option {
	return func(o *options) {
		o.usePAR1Matrix = true
		o.useCauchy = false
	}
}

// WithCauchyMatrix will make the encoder build a Cauchy style matrix.
// The output of this is not compatible with the standard output.
// A Cauchy matrix is faster to generate. This does not affect data throughput,
// but will result in slightly faster start-up time.
func WithCauchyMatrix() Option {
	return func(o *options) {
		o.useCauchy = true
		o.usePAR1Matrix = false
	}
}

// WithFastOneParityMatrix will switch the matrix to a simple xor
// if there is only one parity shard.
// The PAR1 matrix already has this property so it has little effect there.
func WithFastOneParityMatrix() Option {
	return func(o *options) {
		o.fastOneParity = true
	}
}

reedsolomon-1.9.13/reedsolomon.go000066400000000000000000000726461406411035300170220ustar00rootroot00000000000000/**
 * Reed-Solomon Coding over 8-bit values.
 *
 * Copyright 2015, Klaus Post
 * Copyright 2015, Backblaze, Inc.
 */

// Package reedsolomon enables Erasure Coding in Go
//
// For usage and examples, see https://github.com/klauspost/reedsolomon
//
package reedsolomon

import (
	"bytes"
	"errors"
	"io"
	"runtime"
	"sync"

	"github.com/klauspost/cpuid/v2"
)

// Encoder is an interface to encode Reed-Solomon parity sets for your data.
type Encoder interface {
	// Encode parity for a set of data shards.
	// Input is 'shards' containing data shards followed by parity shards.
	// The number of shards must match the number given to New().
	// Each shard is a byte array, and they must all be the same size.
	// The parity shards will always be overwritten and the data shards
	// will remain the same, so it is safe for you to read from the
	// data shards while this is running.
	Encode(shards [][]byte) error

	// Verify returns true if the parity shards contain correct data.
	// The data is the same format as Encode. No data is modified, so
	// you are allowed to read from data while this is running.
	Verify(shards [][]byte) (bool, error)

	// Reconstruct will recreate the missing shards if possible.
	//
	// Given a list of shards, some of which contain data, fills in the
	// ones that don't have data.
	//
	// The length of the array must be equal to the total number of shards.
	// You indicate that a shard is missing by setting it to nil or zero-length.
	// If a shard is zero-length but has sufficient capacity, that memory will
	// be used, otherwise a new []byte will be allocated.
	//
	// If there are too few shards to reconstruct the missing
	// ones, ErrTooFewShards will be returned.
	//
	// The reconstructed shard set is complete, but integrity is not verified.
	// Use the Verify function to check if the data set is ok.
	Reconstruct(shards [][]byte) error

	// ReconstructData will recreate any missing data shards, if possible.
	//
	// Given a list of shards, some of which contain data, fills in the
	// data shards that don't have data.
	//
	// The length of the array must be equal to Shards.
	// You indicate that a shard is missing by setting it to nil or zero-length.
	// If a shard is zero-length but has sufficient capacity, that memory will
	// be used, otherwise a new []byte will be allocated.
	//
	// If there are too few shards to reconstruct the missing
	// ones, ErrTooFewShards will be returned.
	//
	// As the reconstructed shard set may contain missing parity shards,
	// calling the Verify function is likely to fail.
	ReconstructData(shards [][]byte) error

	// Update is used to change a few data shards and update their parity.
	// Input 'newDatashards' contains the changed data shards.
	// Input 'shards' contains the old data shards (an unchanged data shard may be nil) and the old parity shards.
	// The new parity shards are written to shards[DataShards:].
	// Update is very useful if DataShards is much larger than ParityShards and only a few data shards have changed.
	// It is faster than Encode and does not need to read all data shards to encode.
	Update(shards [][]byte, newDatashards [][]byte) error

	// Split a data slice into the number of shards given to the encoder,
	// and create empty parity shards.
	//
	// The data will be split into equally sized shards.
	// If the data size isn't divisible by the number of shards,
	// the last shard will contain extra zeros.
	//
	// There must be at least 1 byte, otherwise ErrShortData will be
	// returned.
	//
	// The data will not be copied, except for the last shard, so you
	// should not modify the data of the input slice afterwards.
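	//
	// A hedged end-to-end sketch (shard counts and lost shards are
	// illustrative; errors elided):
	//
	//	enc, _ := New(10, 3)
	//	shards, _ := enc.Split(data)
	//	_ = enc.Encode(shards)
	//	shards[0], shards[4] = nil, nil // lose up to ParityShards shards
	//	_ = enc.Reconstruct(shards)
	//	_ = enc.Join(dst, shards, len(data))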
Split(data []byte) ([][]byte, error) // Join the shards and write the data segment to dst. // // Only the data shards are considered. // You must supply the exact output size you want. // If there are too few shards given, ErrTooFewShards will be returned. // If the total data size is less than outSize, ErrShortData will be returned. Join(dst io.Writer, shards [][]byte, outSize int) error } const ( avx2CodeGenMinSize = 64 avx2CodeGenMinShards = 3 avx2CodeGenMaxGoroutines = 8 ) // reedSolomon contains a matrix for a specific // distribution of data shards and parity shards. // Construct it using New(). type reedSolomon struct { DataShards int // Number of data shards, should not be modified. ParityShards int // Number of parity shards, should not be modified. Shards int // Total number of shards. Calculated, and should not be modified. m matrix tree *inversionTree parity [][]byte o options mPool sync.Pool } // ErrInvShardNum will be returned by New, if you attempt to create // an Encoder with less than one data shard or less than zero parity // shards. var ErrInvShardNum = errors.New("cannot create Encoder with less than one data shard or less than zero parity shards") // ErrMaxShardNum will be returned by New, if you attempt to create an // Encoder where data and parity shards combined exceed the order of // GF(2^8). var ErrMaxShardNum = errors.New("cannot create Encoder with more than 256 data+parity shards") // buildMatrix creates the matrix to use for encoding, given the // number of data shards and the number of total shards. // // The top square of the matrix is guaranteed to be an identity // matrix, which means that the data shards are unchanged after // encoding. func buildMatrix(dataShards, totalShards int) (matrix, error) { // Start with a Vandermonde matrix. This matrix would work, // in theory, but doesn't have the property that the data // shards are unchanged after encoding. vm, err := vandermonde(totalShards, dataShards) if err != nil { return nil, err } // Multiply by the inverse of the top square of the matrix. // This will make the top square be the identity matrix, but // preserve the property that any square subset of rows is // invertible. top, err := vm.SubMatrix(0, 0, dataShards, dataShards) if err != nil { return nil, err } topInv, err := top.Invert() if err != nil { return nil, err } return vm.Multiply(topInv) } // buildMatrixPAR1 creates the matrix to use for encoding according to // the PARv1 spec, given the number of data shards and the number of // total shards. Note that the method they use is buggy, and may lead // to cases where recovery is impossible, even if there are enough // parity shards. // // The top square of the matrix is guaranteed to be an identity // matrix, which means that the data shards are unchanged after // encoding. func buildMatrixPAR1(dataShards, totalShards int) (matrix, error) { result, err := newMatrix(totalShards, dataShards) if err != nil { return nil, err } for r, row := range result { // The top portion of the matrix is the identity // matrix, and the bottom is a transposed Vandermonde // matrix starting at 1 instead of 0.
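		// As a worked illustration (values computed from galExp below,
		// not an extra code path): with dataShards == 3 the parity rows
		// come out as
		//
		//	exponent 0: [1 1 1]
		//	exponent 1: [1 2 3]
		//	exponent 2: [1 4 5]  // 3*3 == 5 in GF(2^8)
		//
		// i.e. column c of parity row r holds galExp(byte(c+1), r-dataShards).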
if r < dataShards { result[r][r] = 1 } else { for c := range row { result[r][c] = galExp(byte(c+1), r-dataShards) } } } return result, nil } func buildMatrixCauchy(dataShards, totalShards int) (matrix, error) { result, err := newMatrix(totalShards, dataShards) if err != nil { return nil, err } for r, row := range result { // The top portion of the matrix is the identity // matrix, and the bottom is a transposed Cauchy matrix. if r < dataShards { result[r][r] = 1 } else { for c := range row { result[r][c] = invTable[(byte(r ^ c))] } } } return result, nil } // buildXorMatrix can be used to build a matrix with pure XOR // operations if there is only one parity shard. func buildXorMatrix(dataShards, totalShards int) (matrix, error) { if dataShards+1 != totalShards { return nil, errors.New("internal error") } result, err := newMatrix(totalShards, dataShards) if err != nil { return nil, err } for r, row := range result { // The top portion of the matrix is the identity // matrix. if r < dataShards { result[r][r] = 1 } else { // Set all values to 1 (XOR) for c := range row { result[r][c] = 1 } } } return result, nil } // New creates a new encoder and initializes it to // the number of data shards and parity shards that // you want to use. You can reuse this encoder. // Note that the maximum number of total shards is 256. // If no options are supplied, default options are used. func New(dataShards, parityShards int, opts ...Option) (Encoder, error) { r := reedSolomon{ DataShards: dataShards, ParityShards: parityShards, Shards: dataShards + parityShards, o: defaultOptions, } for _, opt := range opts { opt(&r.o) } if dataShards <= 0 || parityShards < 0 { return nil, ErrInvShardNum } if dataShards+parityShards > 256 { return nil, ErrMaxShardNum } if parityShards == 0 { return &r, nil } var err error switch { case r.o.fastOneParity && parityShards == 1: r.m, err = buildXorMatrix(dataShards, r.Shards) case r.o.useCauchy: r.m, err = buildMatrixCauchy(dataShards, r.Shards) case r.o.usePAR1Matrix: r.m, err = buildMatrixPAR1(dataShards, r.Shards) default: r.m, err = buildMatrix(dataShards, r.Shards) } if err != nil { return nil, err } // Calculate what we want per round r.o.perRound = cpuid.CPU.Cache.L2 if r.o.perRound <= 0 { // Set to 128K if undetectable. r.o.perRound = 128 << 10 } if cpuid.CPU.ThreadsPerCore > 1 && r.o.maxGoroutines > cpuid.CPU.PhysicalCores { // If multiple threads per core, make sure they don't contend for cache. r.o.perRound /= cpuid.CPU.ThreadsPerCore } // 1 input + parity must fit in cache, and we add one more to be safer. r.o.perRound = r.o.perRound / (1 + parityShards) // Align to 64 bytes. r.o.perRound = ((r.o.perRound + 63) / 64) * 64 if r.o.minSplitSize <= 0 { // Set minsplit as high as we can, but still have parity in L1. cacheSize := cpuid.CPU.Cache.L1D if cacheSize <= 0 { cacheSize = 32 << 10 } r.o.minSplitSize = cacheSize / (parityShards + 1) // Min 1K if r.o.minSplitSize < 1024 { r.o.minSplitSize = 1024 } } if r.o.perRound < r.o.minSplitSize { r.o.perRound = r.o.minSplitSize } if r.o.shardSize > 0 { p := runtime.GOMAXPROCS(0) if p == 1 || r.o.shardSize <= r.o.minSplitSize*2 { // Not worth it. r.o.maxGoroutines = 1 } else { g := r.o.shardSize / r.o.perRound // Overprovision by a factor of 2. if g < p*2 && r.o.perRound > r.o.minSplitSize*2 { g = p * 2 r.o.perRound /= 2 } // Have g be multiple of p g += p - 1 g -= g % p r.o.maxGoroutines = g } } // Generated AVX2 does not need data to stay in L1 cache between runs. // We will be purely limited by RAM speed. 
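	// As a worked example of the sizing above (illustrative numbers only,
	// assuming more goroutines than physical cores): with a 256 KiB L2
	// cache, 2 threads per core and 4 parity shards, perRound becomes
	// 262144/2 = 131072, then 131072/(1+4) = 26214, and finally 26240
	// after rounding up to a multiple of 64 bytes.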
if r.canAVX2C(avx2CodeGenMinSize, r.DataShards, r.ParityShards) && r.o.maxGoroutines > avx2CodeGenMaxGoroutines { r.o.maxGoroutines = avx2CodeGenMaxGoroutines } // Inverted matrices are cached in a tree keyed by the indices // of the invalid rows of the data to reconstruct. // The inversion root node will have the identity matrix as // its inversion matrix because it implies there are no errors // with the original data. if r.o.inversionCache { r.tree = newInversionTree(dataShards, parityShards) } r.parity = make([][]byte, parityShards) for i := range r.parity { r.parity[i] = r.m[dataShards+i] } if avx2CodeGen && r.o.useAVX2 { r.mPool.New = func() interface{} { return make([]byte, r.Shards*2*32) } } return &r, err } // ErrTooFewShards is returned if too few shards were given to // Encode/Verify/Reconstruct/Update. It will also be returned from Reconstruct // if there were too few shards to reconstruct the missing data. var ErrTooFewShards = errors.New("too few shards given") // Encode parity for a set of data shards. // Input is 'shards' containing data shards followed by parity shards. // The number of shards must match the number given to New. // Each shard is a byte array, and they must all be the same size. // The parity shards will always be overwritten and the data shards // will remain the same. func (r *reedSolomon) Encode(shards [][]byte) error { if len(shards) != r.Shards { return ErrTooFewShards } err := checkShards(shards, false) if err != nil { return err } // Get the slice of output buffers. output := shards[r.DataShards:] // Do the coding. r.codeSomeShards(r.parity, shards[0:r.DataShards], output, r.ParityShards, len(shards[0])) return nil } // ErrInvalidInput is returned if an invalid input parameter is given to Update. var ErrInvalidInput = errors.New("invalid input") func (r *reedSolomon) Update(shards [][]byte, newDatashards [][]byte) error { if len(shards) != r.Shards { return ErrTooFewShards } if len(newDatashards) != r.DataShards { return ErrTooFewShards } err := checkShards(shards, true) if err != nil { return err } err = checkShards(newDatashards, true) if err != nil { return err } for i := range newDatashards { if newDatashards[i] != nil && shards[i] == nil { return ErrInvalidInput } } for _, p := range shards[r.DataShards:] { if p == nil { return ErrInvalidInput } } shardSize := shardSize(shards) // Get the slice of output buffers. output := shards[r.DataShards:] // Do the coding.
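	// Parity is a linear function of the data over GF(2^8) (parity = M*data),
	// so it can be patched instead of recomputed: for each changed shard c,
	// delta = old_c XOR new_c, and each parity row i is updated as
	// parity_i ^= M[i][c] * delta (GF(2^8) multiplication). That is what
	// updateParityShards does below, reusing the old inputs as scratch
	// space for the deltas.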
r.updateParityShards(r.parity, shards[0:r.DataShards], newDatashards[0:r.DataShards], output, r.ParityShards, shardSize) return nil } func (r *reedSolomon) updateParityShards(matrixRows, oldinputs, newinputs, outputs [][]byte, outputCount, byteCount int) { if len(outputs) == 0 { return } if r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize { r.updateParityShardsP(matrixRows, oldinputs, newinputs, outputs, outputCount, byteCount) return } for c := 0; c < r.DataShards; c++ { in := newinputs[c] if in == nil { continue } oldin := oldinputs[c] // oldinputs data will be changed sliceXor(in, oldin, &r.o) for iRow := 0; iRow < outputCount; iRow++ { galMulSliceXor(matrixRows[iRow][c], oldin, outputs[iRow], &r.o) } } } func (r *reedSolomon) updateParityShardsP(matrixRows, oldinputs, newinputs, outputs [][]byte, outputCount, byteCount int) { var wg sync.WaitGroup do := byteCount / r.o.maxGoroutines if do < r.o.minSplitSize { do = r.o.minSplitSize } start := 0 for start < byteCount { if start+do > byteCount { do = byteCount - start } wg.Add(1) go func(start, stop int) { for c := 0; c < r.DataShards; c++ { in := newinputs[c] if in == nil { continue } oldin := oldinputs[c] // oldinputs data will be changed sliceXor(in[start:stop], oldin[start:stop], &r.o) for iRow := 0; iRow < outputCount; iRow++ { galMulSliceXor(matrixRows[iRow][c], oldin[start:stop], outputs[iRow][start:stop], &r.o) } } wg.Done() }(start, start+do) start += do } wg.Wait() } // Verify returns true if the parity shards contain the right data. // The data is the same format as Encode. No data is modified. func (r *reedSolomon) Verify(shards [][]byte) (bool, error) { if len(shards) != r.Shards { return false, ErrTooFewShards } err := checkShards(shards, false) if err != nil { return false, err } // Slice of buffers being checked. toCheck := shards[r.DataShards:] // Do the checking. return r.checkSomeShards(r.parity, shards[0:r.DataShards], toCheck, r.ParityShards, len(shards[0])), nil } func (r *reedSolomon) canAVX2C(byteCount int, inputs, outputs int) bool { return avx2CodeGen && r.o.useAVX2 && byteCount >= avx2CodeGenMinSize && inputs+outputs >= avx2CodeGenMinShards && inputs <= maxAvx2Inputs && outputs <= maxAvx2Outputs } // Multiplies a subset of rows from a coding matrix by a full set of // input shards to produce some output shards. // 'matrixRows' are the rows from the matrix to use. // 'inputs' is an array of byte arrays, each of which is one input shard. // The number of inputs used is determined by the length of each matrix row. // 'outputs' are byte arrays where the computed shards are stored. // The number of outputs computed, and the // number of matrix rows used, is determined by // outputCount, which is the number of outputs to compute.
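// In other words, for each output row i and byte position b this computes
//
//	outputs[i][b] = XOR over c of (matrixRows[i][c] * inputs[c][b] in GF(2^8))
//
// i.e. a GF(2^8) matrix-vector product, evaluated a slice at a time.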
func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { if len(outputs) == 0 { return } switch { case r.o.useAVX512 && r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize && len(inputs) >= 4 && len(outputs) >= 2: r.codeSomeShardsAvx512P(matrixRows, inputs, outputs, outputCount, byteCount) return case r.o.useAVX512 && len(inputs) >= 4 && len(outputs) >= 2: r.codeSomeShardsAvx512(matrixRows, inputs, outputs, outputCount, byteCount) return case r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize: r.codeSomeShardsP(matrixRows, inputs, outputs, outputCount, byteCount) return } // Process using no goroutines start, end := 0, r.o.perRound if end > len(inputs[0]) { end = len(inputs[0]) } if r.canAVX2C(byteCount, len(inputs), len(outputs)) { m := genAvx2Matrix(matrixRows, len(inputs), len(outputs), r.mPool.Get().([]byte)) start += galMulSlicesAvx2(m, inputs, outputs, 0, byteCount) r.mPool.Put(m) end = len(inputs[0]) } for start < len(inputs[0]) { for c := 0; c < r.DataShards; c++ { in := inputs[c][start:end] for iRow := 0; iRow < outputCount; iRow++ { if c == 0 { galMulSlice(matrixRows[iRow][c], in, outputs[iRow][start:end], &r.o) } else { galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][start:end], &r.o) } } } start = end end += r.o.perRound if end > len(inputs[0]) { end = len(inputs[0]) } } } // Perform the same as codeSomeShards, but split the workload into // several goroutines. func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { var wg sync.WaitGroup gor := r.o.maxGoroutines var avx2Matrix []byte useAvx2 := r.canAVX2C(byteCount, len(inputs), len(outputs)) if useAvx2 { avx2Matrix = genAvx2Matrix(matrixRows, len(inputs), len(outputs), r.mPool.Get().([]byte)) defer r.mPool.Put(avx2Matrix) } do := byteCount / gor if do < r.o.minSplitSize { do = r.o.minSplitSize } // Make sizes divisible by 64 do = (do + 63) & (^63) start := 0 for start < byteCount { if start+do > byteCount { do = byteCount - start } wg.Add(1) go func(start, stop int) { if useAvx2 && stop-start >= 64 { start += galMulSlicesAvx2(avx2Matrix, inputs, outputs, start, stop) } lstart, lstop := start, start+r.o.perRound if lstop > stop { lstop = stop } for lstart < stop { for c := 0; c < r.DataShards; c++ { in := inputs[c][lstart:lstop] for iRow := 0; iRow < outputCount; iRow++ { if c == 0 { galMulSlice(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o) } else { galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o) } } } lstart = lstop lstop += r.o.perRound if lstop > stop { lstop = stop } } wg.Done() }(start, start+do) start += do } wg.Wait() } // checkSomeShards is mostly the same as codeSomeShards, // except this will check values and return // as soon as a difference is found. func (r *reedSolomon) checkSomeShards(matrixRows, inputs, toCheck [][]byte, outputCount, byteCount int) bool { if len(toCheck) == 0 { return true } outputs := make([][]byte, len(toCheck)) for i := range outputs { outputs[i] = make([]byte, byteCount) } r.codeSomeShards(matrixRows, inputs, outputs, outputCount, byteCount) for i, calc := range outputs { if !bytes.Equal(calc, toCheck[i]) { return false } } return true } // ErrShardNoData will be returned if there are no shards, // or if the length of all shards is zero. var ErrShardNoData = errors.New("no shard data") // ErrShardSize is returned if shard length isn't the same for all // shards. 
var ErrShardSize = errors.New("shard sizes do not match") // checkShards will check if shards are the same size // or 0, if allowed. An error is returned if this fails. // An error is also returned if all shards are size 0. func checkShards(shards [][]byte, nilok bool) error { size := shardSize(shards) if size == 0 { return ErrShardNoData } for _, shard := range shards { if len(shard) != size { if len(shard) != 0 || !nilok { return ErrShardSize } } } return nil } // shardSize returns the size of a single shard. // The first non-zero size is returned, // or 0 if all shards are size 0. func shardSize(shards [][]byte) int { for _, shard := range shards { if len(shard) != 0 { return len(shard) } } return 0 } // Reconstruct will recreate the missing shards, if possible. // // Given a list of shards, some of which contain data, fills in the // ones that don't have data. // // The length of the array must be equal to Shards. // You indicate that a shard is missing by setting it to nil or zero-length. // If a shard is zero-length but has sufficient capacity, that memory will // be used, otherwise a new []byte will be allocated. // // If there are too few shards to reconstruct the missing // ones, ErrTooFewShards will be returned. // // The reconstructed shard set is complete, but integrity is not verified. // Use the Verify function to check if the data set is ok. func (r *reedSolomon) Reconstruct(shards [][]byte) error { return r.reconstruct(shards, false) } // ReconstructData will recreate any missing data shards, if possible. // // Given a list of shards, some of which contain data, fills in the // data shards that don't have data. // // The length of the array must be equal to Shards. // You indicate that a shard is missing by setting it to nil or zero-length. // If a shard is zero-length but has sufficient capacity, that memory will // be used, otherwise a new []byte will be allocated. // // If there are too few shards to reconstruct the missing // ones, ErrTooFewShards will be returned. // // As the reconstructed shard set may contain missing parity shards, // calling the Verify function is likely to fail. func (r *reedSolomon) ReconstructData(shards [][]byte) error { return r.reconstruct(shards, true) } // reconstruct will recreate the missing data shards, and unless // dataOnly is true, also the missing parity shards. // // The length of the array must be equal to Shards. // You indicate that a shard is missing by setting it to nil. // // If there are too few shards to reconstruct the missing // ones, ErrTooFewShards will be returned. func (r *reedSolomon) reconstruct(shards [][]byte, dataOnly bool) error { if len(shards) != r.Shards { return ErrTooFewShards } // Check arguments. err := checkShards(shards, true) if err != nil { return err } shardSize := shardSize(shards) // Quick check: are all of the shards present? If so, there's // nothing to do. numberPresent := 0 dataPresent := 0 for i := 0; i < r.Shards; i++ { if len(shards[i]) != 0 { numberPresent++ if i < r.DataShards { dataPresent++ } } } if numberPresent == r.Shards || dataOnly && dataPresent == r.DataShards { // Cool. All of the shards have data. We don't // need to do anything. return nil } // More complete sanity check if numberPresent < r.DataShards { return ErrTooFewShards } // Pull out an array holding just the shards that // correspond to the rows of the submatrix. These shards // will be the input to the decoding process that re-creates // the missing data shards.
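	// For example (an illustrative walk-through only): with 5 data + 2
	// parity shards and only shard 1 missing, the loop below selects rows
	// 0, 2, 3, 4 and 5 as validIndices and records invalidIndices = [1];
	// row 6 is never examined because five valid rows have already been
	// found.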
// // Also, create an array of indices of the valid rows we do have // and the invalid rows we don't have up until we have enough valid rows. subShards := make([][]byte, r.DataShards) validIndices := make([]int, r.DataShards) invalidIndices := make([]int, 0) subMatrixRow := 0 for matrixRow := 0; matrixRow < r.Shards && subMatrixRow < r.DataShards; matrixRow++ { if len(shards[matrixRow]) != 0 { subShards[subMatrixRow] = shards[matrixRow] validIndices[subMatrixRow] = matrixRow subMatrixRow++ } else { invalidIndices = append(invalidIndices, matrixRow) } } // Attempt to get the cached inverted matrix out of the tree // based on the indices of the invalid rows. dataDecodeMatrix := r.tree.GetInvertedMatrix(invalidIndices) // If the inverted matrix isn't cached in the tree yet we must // construct it ourselves and insert it into the tree for the // future. In this way the inversion tree is lazily loaded. if dataDecodeMatrix == nil { // Pull out the rows of the matrix that correspond to the // shards that we have and build a square matrix. This // matrix could be used to generate the shards that we have // from the original data. subMatrix, _ := newMatrix(r.DataShards, r.DataShards) for subMatrixRow, validIndex := range validIndices { for c := 0; c < r.DataShards; c++ { subMatrix[subMatrixRow][c] = r.m[validIndex][c] } } // Invert the matrix, so we can go from the encoded shards // back to the original data. Then pull out the row that // generates the shard that we want to decode. Note that // since this matrix maps back to the original data, it can // be used to create a data shard, but not a parity shard. dataDecodeMatrix, err = subMatrix.Invert() if err != nil { return err } // Cache the inverted matrix in the tree for future use keyed on the // indices of the invalid rows. err = r.tree.InsertInvertedMatrix(invalidIndices, dataDecodeMatrix, r.Shards) if err != nil { return err } } // Re-create any data shards that were missing. // // The input to the coding is all of the shards we actually // have, and the output is the missing data shards. The computation // is done using the special decode matrix we just built. outputs := make([][]byte, r.ParityShards) matrixRows := make([][]byte, r.ParityShards) outputCount := 0 for iShard := 0; iShard < r.DataShards; iShard++ { if len(shards[iShard]) == 0 { if cap(shards[iShard]) >= shardSize { shards[iShard] = shards[iShard][0:shardSize] } else { shards[iShard] = make([]byte, shardSize) } outputs[outputCount] = shards[iShard] matrixRows[outputCount] = dataDecodeMatrix[iShard] outputCount++ } } r.codeSomeShards(matrixRows, subShards, outputs[:outputCount], outputCount, shardSize) if dataOnly { // Exit early if we are only interested in the data shards return nil } // Now that we have all of the data shards intact, we can // compute any of the parity that is missing. // // The input to the coding is ALL of the data shards, including // any that we just calculated. The output is whichever of the // parity shards were missing.
outputCount = 0 for iShard := r.DataShards; iShard < r.Shards; iShard++ { if len(shards[iShard]) == 0 { if cap(shards[iShard]) >= shardSize { shards[iShard] = shards[iShard][0:shardSize] } else { shards[iShard] = make([]byte, shardSize) } outputs[outputCount] = shards[iShard] matrixRows[outputCount] = r.parity[iShard-r.DataShards] outputCount++ } } r.codeSomeShards(matrixRows, shards[:r.DataShards], outputs[:outputCount], outputCount, shardSize) return nil } // ErrShortData will be returned by Split(), if there isn't enough data // to fill the number of shards. var ErrShortData = errors.New("not enough data to fill the number of requested shards") // Split a data slice into the number of shards given to the encoder, // and create empty parity shards if necessary. // // The data will be split into equally sized shards. // If the data size isn't divisible by the number of shards, // the last shard will contain extra zeros. // // There must be at least 1 byte, otherwise ErrShortData will be // returned. // // The data will not be copied, except for the last shard, so you // should not modify the data of the input slice afterwards. func (r *reedSolomon) Split(data []byte) ([][]byte, error) { if len(data) == 0 { return nil, ErrShortData } // Calculate number of bytes per data shard. perShard := (len(data) + r.DataShards - 1) / r.DataShards if cap(data) > len(data) { data = data[:cap(data)] } // Only allocate memory if necessary var padding []byte if len(data) < (r.Shards * perShard) { // calculate maximum number of full shards in `data` slice fullShards := len(data) / perShard padding = make([]byte, r.Shards*perShard-perShard*fullShards) copy(padding, data[perShard*fullShards:]) data = data[0 : perShard*fullShards] } // Split into equal-length shards. dst := make([][]byte, r.Shards) i := 0 for ; i < len(dst) && len(data) >= perShard; i++ { dst[i] = data[:perShard:perShard] data = data[perShard:] } for j := 0; i+j < len(dst); j++ { dst[i+j] = padding[:perShard:perShard] padding = padding[perShard:] } return dst, nil } // ErrReconstructRequired is returned if too few data shards are intact and a // reconstruction is required before you can successfully join the shards. var ErrReconstructRequired = errors.New("reconstruction required as one or more required data shards are nil") // Join the shards and write the data segment to dst. // // Only the data shards are considered. // You must supply the exact output size you want. // // If there are too few shards given, ErrTooFewShards will be returned. // If the total data size is less than outSize, ErrShortData will be returned. // If one or more required data shards are nil, ErrReconstructRequired will be returned. func (r *reedSolomon) Join(dst io.Writer, shards [][]byte, outSize int) error { // Do we have enough shards? if len(shards) < r.DataShards { return ErrTooFewShards } shards = shards[:r.DataShards] // Do we have enough data? size := 0 for _, shard := range shards { if shard == nil { return ErrReconstructRequired } size += len(shard) // Do we have enough data already?
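		// Once outSize bytes are accounted for we can stop scanning; the
		// copy loop below truncates the final shard so that exactly
		// outSize bytes are written to dst.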
if size >= outSize { break } } if size < outSize { return ErrShortData } // Copy data to dst write := outSize for _, shard := range shards { if write < len(shard) { _, err := dst.Write(shard[:write]) return err } n, err := dst.Write(shard) if err != nil { return err } write -= n } return nil } reedsolomon-1.9.13/reedsolomon_test.go000066400000000000000000001127561406411035300200600ustar00rootroot00000000000000/** * Unit tests for ReedSolomon * * Copyright 2015, Klaus Post * Copyright 2015, Backblaze, Inc. All rights reserved. */ package reedsolomon import ( "bytes" "flag" "fmt" "math/rand" "os" "runtime" "testing" ) var noSSE2 = flag.Bool("no-sse2", !defaultOptions.useSSE2, "Disable SSE2") var noSSSE3 = flag.Bool("no-ssse3", !defaultOptions.useSSSE3, "Disable SSSE3") var noAVX2 = flag.Bool("no-avx2", !defaultOptions.useAVX2, "Disable AVX2") var noAVX512 = flag.Bool("no-avx512", !defaultOptions.useAVX512, "Disable AVX512") func TestMain(m *testing.M) { flag.Parse() os.Exit(m.Run()) } func testOptions(o ...Option) []Option { if *noSSSE3 { o = append(o, withSSSE3(false)) } if *noSSE2 { o = append(o, withSSE2(false)) } if *noAVX2 { o = append(o, withAVX2(false)) } if *noAVX512 { o = append(o, withAVX512(false)) } return o } func isIncreasingAndContainsDataRow(indices []int) bool { cols := len(indices) for i := 0; i < cols-1; i++ { if indices[i] >= indices[i+1] { return false } } // Data rows are in the upper square portion of the matrix. return indices[0] < cols } func incrementIndices(indices []int, indexBound int) (valid bool) { for i := len(indices) - 1; i >= 0; i-- { indices[i]++ if indices[i] < indexBound { break } if i == 0 { return false } indices[i] = 0 } return true } func incrementIndicesUntilIncreasingAndContainsDataRow( indices []int, maxIndex int) bool { for { valid := incrementIndices(indices, maxIndex) if !valid { return false } if isIncreasingAndContainsDataRow(indices) { return true } } } func findSingularSubMatrix(m matrix) (matrix, error) { rows := len(m) cols := len(m[0]) rowIndices := make([]int, cols) for incrementIndicesUntilIncreasingAndContainsDataRow(rowIndices, rows) { subMatrix, _ := newMatrix(cols, cols) for i, r := range rowIndices { for c := 0; c < cols; c++ { subMatrix[i][c] = m[r][c] } } _, err := subMatrix.Invert() if err == errSingular { return subMatrix, nil } else if err != nil { return nil, err } } return nil, nil } func TestBuildMatrixPAR1Singular(t *testing.T) { totalShards := 8 dataShards := 4 m, err := buildMatrixPAR1(dataShards, totalShards) if err != nil { t.Fatal(err) } singularSubMatrix, err := findSingularSubMatrix(m) if err != nil { t.Fatal(err) } if singularSubMatrix == nil { t.Fatal("No singular sub-matrix found") } t.Logf("matrix %s has singular sub-matrix %s", m, singularSubMatrix) } func testOpts() [][]Option { if testing.Short() { return [][]Option{ {WithPAR1Matrix()}, {WithCauchyMatrix()}, } } opts := [][]Option{ {WithPAR1Matrix()}, {WithCauchyMatrix()}, {WithFastOneParityMatrix()}, {WithPAR1Matrix(), WithFastOneParityMatrix()}, {WithCauchyMatrix(), WithFastOneParityMatrix()}, {WithMaxGoroutines(1), WithMinSplitSize(500), withSSSE3(false), withAVX2(false), withAVX512(false)}, {WithMaxGoroutines(5000), WithMinSplitSize(50), withSSSE3(false), withAVX2(false), withAVX512(false)}, {WithMaxGoroutines(5000), WithMinSplitSize(500000), withSSSE3(false), withAVX2(false), withAVX512(false)}, {WithMaxGoroutines(1), WithMinSplitSize(500000), withSSSE3(false), withAVX2(false), withAVX512(false)}, {WithAutoGoroutines(50000), WithMinSplitSize(500)}, 
{WithInversionCache(false)}, } for _, o := range opts[:] { if defaultOptions.useSSSE3 { n := make([]Option, len(o), len(o)+1) copy(n, o) n = append(n, withSSSE3(true)) opts = append(opts, n) } if defaultOptions.useAVX2 { n := make([]Option, len(o), len(o)+1) copy(n, o) n = append(n, withAVX2(true)) opts = append(opts, n) } if defaultOptions.useAVX512 { n := make([]Option, len(o), len(o)+1) copy(n, o) n = append(n, withAVX512(true)) opts = append(opts, n) } } return opts } func TestEncoding(t *testing.T) { t.Run("default", func(t *testing.T) { testEncoding(t, testOptions()...) }) for i, o := range testOpts() { t.Run(fmt.Sprintf("opt-%d", i), func(t *testing.T) { testEncoding(t, o...) }) } } // matrix sizes to test. // note that the par1 matrix will fail on some combinations. var testSizes = [][2]int{ {1, 0}, {3, 0}, {5, 0}, {8, 0}, {10, 0}, {12, 0}, {14, 0}, {41, 0}, {49, 0}, {1, 1}, {1, 2}, {3, 3}, {3, 1}, {5, 3}, {8, 4}, {10, 30}, {12, 10}, {14, 7}, {41, 17}, {49, 1}} var testDataSizes = []int{10, 100, 1000, 10001, 100003, 1000055} var testDataSizesShort = []int{10, 10001, 100003} func testEncoding(t *testing.T, o ...Option) { for _, size := range testSizes { data, parity := size[0], size[1] rng := rand.New(rand.NewSource(0xabadc0cac01a)) t.Run(fmt.Sprintf("%dx%d", data, parity), func(t *testing.T) { sz := testDataSizes if testing.Short() { sz = testDataSizesShort } for _, perShard := range sz { t.Run(fmt.Sprint(perShard), func(t *testing.T) { r, err := New(data, parity, testOptions(o...)...) if err != nil { t.Fatal(err) } shards := make([][]byte, data+parity) for s := range shards { shards[s] = make([]byte, perShard) } for s := 0; s < data; s++ { rng.Read(shards[s]) } err = r.Encode(shards) if err != nil { t.Fatal(err) } ok, err := r.Verify(shards) if err != nil { t.Fatal(err) } if !ok { t.Fatal("Verification failed") } if parity == 0 { // Check that Reconstruct and ReconstructData do nothing err = r.ReconstructData(shards) if err != nil { t.Fatal(err) } err = r.Reconstruct(shards) if err != nil { t.Fatal(err) } // Skip integrity checks return } // Delete one in data idx := rng.Intn(data) want := shards[idx] shards[idx] = nil err = r.ReconstructData(shards) if err != nil { t.Fatal(err) } if !bytes.Equal(shards[idx], want) { t.Fatal("did not ReconstructData correctly") } // Delete one randomly idx = rng.Intn(data + parity) want = shards[idx] shards[idx] = nil err = r.Reconstruct(shards) if err != nil { t.Fatal(err) } if !bytes.Equal(shards[idx], want) { t.Fatal("did not Reconstruct correctly") } err = r.Encode(make([][]byte, 1)) if err != ErrTooFewShards { t.Errorf("expected %v, got %v", ErrTooFewShards, err) } // Make one too short. shards[idx] = shards[idx][:perShard-1] err = r.Encode(shards) if err != ErrShardSize { t.Errorf("expected %v, got %v", ErrShardSize, err) } }) } }) } } func TestUpdate(t *testing.T) { for i, o := range testOpts() { t.Run(fmt.Sprintf("options %d", i), func(t *testing.T) { testUpdate(t, o...) }) } } func testUpdate(t *testing.T, o ...Option) { rand.Seed(0) for _, size := range [][2]int{{10, 3}, {17, 2}} { data, parity := size[0], size[1] t.Run(fmt.Sprintf("%dx%d", data, parity), func(t *testing.T) { sz := testDataSizesShort if testing.Short() { sz = []int{50000} } for _, perShard := range sz { t.Run(fmt.Sprint(perShard), func(t *testing.T) { r, err := New(data, parity, testOptions(o...)...)
if err != nil { t.Fatal(err) } shards := make([][]byte, data+parity) for s := range shards { shards[s] = make([]byte, perShard) } for s := range shards { fillRandom(shards[s]) } err = r.Encode(shards) if err != nil { t.Fatal(err) } ok, err := r.Verify(shards) if err != nil { t.Fatal(err) } if !ok { t.Fatal("Verification failed") } newdatashards := make([][]byte, data) for s := range newdatashards { newdatashards[s] = make([]byte, perShard) fillRandom(newdatashards[s]) err = r.Update(shards, newdatashards) if err != nil { t.Fatal(err) } shards[s] = newdatashards[s] ok, err := r.Verify(shards) if err != nil { t.Fatal(err) } if !ok { t.Fatal("Verification failed") } newdatashards[s] = nil } for s := 0; s < len(newdatashards)-1; s++ { newdatashards[s] = make([]byte, perShard) newdatashards[s+1] = make([]byte, perShard) fillRandom(newdatashards[s]) fillRandom(newdatashards[s+1]) err = r.Update(shards, newdatashards) if err != nil { t.Fatal(err) } shards[s] = newdatashards[s] shards[s+1] = newdatashards[s+1] ok, err := r.Verify(shards) if err != nil { t.Fatal(err) } if !ok { t.Fatal("Verification failed") } newdatashards[s] = nil newdatashards[s+1] = nil } for newNum := 1; newNum <= data; newNum++ { for s := 0; s <= data-newNum; s++ { for i := 0; i < newNum; i++ { newdatashards[s+i] = make([]byte, perShard) fillRandom(newdatashards[s+i]) } err = r.Update(shards, newdatashards) if err != nil { t.Fatal(err) } for i := 0; i < newNum; i++ { shards[s+i] = newdatashards[s+i] } ok, err := r.Verify(shards) if err != nil { t.Fatal(err) } if !ok { t.Fatal("Verification failed") } for i := 0; i < newNum; i++ { newdatashards[s+i] = nil } } } }) } }) } } func TestReconstruct(t *testing.T) { testReconstruct(t) for i, o := range testOpts() { t.Run(fmt.Sprintf("options %d", i), func(t *testing.T) { testReconstruct(t, o...) }) } } func testReconstruct(t *testing.T, o ...Option) { perShard := 50000 r, err := New(10, 3, testOptions(o...)...) if err != nil { t.Fatal(err) } shards := make([][]byte, 13) for s := range shards { shards[s] = make([]byte, perShard) } rand.Seed(0) for s := 0; s < 13; s++ { fillRandom(shards[s]) } err = r.Encode(shards) if err != nil { t.Fatal(err) } // Reconstruct with all shards present err = r.Reconstruct(shards) if err != nil { t.Fatal(err) } // Reconstruct with 10 shards present. Use pre-allocated memory for one of them. shards[0] = nil shards[7] = nil shard11 := shards[11] shards[11] = shard11[:0] fillRandom(shard11) err = r.Reconstruct(shards) if err != nil { t.Fatal(err) } ok, err := r.Verify(shards) if err != nil { t.Fatal(err) } if !ok { t.Fatal("Verification failed") } if &shard11[0] != &shards[11][0] { t.Errorf("Shard was not reconstructed into pre-allocated memory") } // Reconstruct with 9 shards present (should fail) shards[0] = nil shards[4] = nil shards[7] = nil shards[11] = nil err = r.Reconstruct(shards) if err != ErrTooFewShards { t.Errorf("expected %v, got %v", ErrTooFewShards, err) } err = r.Reconstruct(make([][]byte, 1)) if err != ErrTooFewShards { t.Errorf("expected %v, got %v", ErrTooFewShards, err) } err = r.Reconstruct(make([][]byte, 13)) if err != ErrShardNoData { t.Errorf("expected %v, got %v", ErrShardNoData, err) } } func TestReconstructData(t *testing.T) { testReconstructData(t) for i, o := range testOpts() { t.Run(fmt.Sprintf("options %d", i), func(t *testing.T) { testReconstruct(t, o...) }) } } func testReconstructData(t *testing.T, o ...Option) { perShard := 100000 r, err := New(8, 5, testOptions(o...)...) 
if err != nil { t.Fatal(err) } shards := make([][]byte, 13) for s := range shards { shards[s] = make([]byte, perShard) } rand.Seed(0) for s := 0; s < 13; s++ { fillRandom(shards[s]) } err = r.Encode(shards) if err != nil { t.Fatal(err) } // Reconstruct with all shards present err = r.ReconstructData(shards) if err != nil { t.Fatal(err) } // Reconstruct with 10 shards present. Use pre-allocated memory for one of them. shards[0] = nil shards[2] = nil shard4 := shards[4] shards[4] = shard4[:0] fillRandom(shard4) err = r.ReconstructData(shards) if err != nil { t.Fatal(err) } // Since all parity shards are available, verification will succeed ok, err := r.Verify(shards) if err != nil { t.Fatal(err) } if !ok { t.Fatal("Verification failed") } if &shard4[0] != &shards[4][0] { t.Errorf("Shard was not reconstructed into pre-allocated memory") } // Reconstruct with 6 data and 4 parity shards shards[0] = nil shards[2] = nil shards[12] = nil err = r.ReconstructData(shards) if err != nil { t.Fatal(err) } // Verification will fail now due to absence of a parity block _, err = r.Verify(shards) if err != ErrShardSize { t.Errorf("expected %v, got %v", ErrShardSize, err) } // Reconstruct with 7 data and 1 parity shards shards[0] = nil shards[9] = nil shards[10] = nil shards[11] = nil shards[12] = nil err = r.ReconstructData(shards) if err != nil { t.Fatal(err) } _, err = r.Verify(shards) if err != ErrShardSize { t.Errorf("expected %v, got %v", ErrShardSize, err) } // Reconstruct with 6 data and 1 parity shards (should fail) shards[0] = nil shards[1] = nil shards[9] = nil shards[10] = nil shards[11] = nil shards[12] = nil err = r.ReconstructData(shards) if err != ErrTooFewShards { t.Errorf("expected %v, got %v", ErrTooFewShards, err) } err = r.ReconstructData(make([][]byte, 1)) if err != ErrTooFewShards { t.Errorf("expected %v, got %v", ErrTooFewShards, err) } err = r.ReconstructData(make([][]byte, 13)) if err != ErrShardNoData { t.Errorf("expected %v, got %v", ErrShardNoData, err) } } func TestReconstructPAR1Singular(t *testing.T) { perShard := 50 r, err := New(4, 4, testOptions(WithPAR1Matrix())...) if err != nil { t.Fatal(err) } shards := make([][]byte, 8) for s := range shards { shards[s] = make([]byte, perShard) } rand.Seed(0) for s := 0; s < 8; s++ { fillRandom(shards[s]) } err = r.Encode(shards) if err != nil { t.Fatal(err) } // Reconstruct with only the last data shard present, and the // first, second, and fourth parity shard present (based on // the result of TestBuildMatrixPAR1Singular). This should // fail. shards[0] = nil shards[1] = nil shards[2] = nil shards[6] = nil err = r.Reconstruct(shards) if err != errSingular { t.Errorf("expected %v, got %v", errSingular, err) } } func TestVerify(t *testing.T) { testVerify(t) for i, o := range testOpts() { t.Run(fmt.Sprintf("options %d", i), func(t *testing.T) { testVerify(t, o...) }) } } func testVerify(t *testing.T, o ...Option) { perShard := 33333 r, err := New(10, 4, testOptions(o...)...) if err != nil { t.Fatal(err) } shards := make([][]byte, 14) for s := range shards { shards[s] = make([]byte, perShard) } rand.Seed(0) for s := 0; s < 10; s++ { fillRandom(shards[s]) } err = r.Encode(shards) if err != nil { t.Fatal(err) } ok, err := r.Verify(shards) if err != nil { t.Fatal(err) } if !ok { t.Error("Verification failed") return } // Put in random data.
Verification should fail fillRandom(shards[10]) ok, err = r.Verify(shards) if err != nil { t.Fatal(err) } if ok { t.Fatal("Verification did not fail") } // Re-encode err = r.Encode(shards) if err != nil { t.Fatal(err) } // Fill a data segment with random data fillRandom(shards[0]) ok, err = r.Verify(shards) if err != nil { t.Fatal(err) } if ok { t.Fatal("Verification did not fail") } _, err = r.Verify(make([][]byte, 1)) if err != ErrTooFewShards { t.Errorf("expected %v, got %v", ErrTooFewShards, err) } _, err = r.Verify(make([][]byte, 14)) if err != ErrShardNoData { t.Errorf("expected %v, got %v", ErrShardNoData, err) } } func TestOneEncode(t *testing.T) { codec, err := New(5, 5, testOptions()...) if err != nil { t.Fatal(err) } shards := [][]byte{ {0, 1}, {4, 5}, {2, 3}, {6, 7}, {8, 9}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, } codec.Encode(shards) if shards[5][0] != 12 || shards[5][1] != 13 { t.Fatal("shard 5 mismatch") } if shards[6][0] != 10 || shards[6][1] != 11 { t.Fatal("shard 6 mismatch") } if shards[7][0] != 14 || shards[7][1] != 15 { t.Fatal("shard 7 mismatch") } if shards[8][0] != 90 || shards[8][1] != 91 { t.Fatal("shard 8 mismatch") } if shards[9][0] != 94 || shards[9][1] != 95 { t.Fatal("shard 9 mismatch") } ok, err := codec.Verify(shards) if err != nil { t.Fatal(err) } if !ok { t.Fatal("did not verify") } shards[8][0]++ ok, err = codec.Verify(shards) if err != nil { t.Fatal(err) } if ok { t.Fatal("verify did not fail as expected") } } func fillRandom(p []byte) { for i := 0; i < len(p); i += 7 { val := rand.Int63() for j := 0; i+j < len(p) && j < 7; j++ { p[i+j] = byte(val) val >>= 8 } } } func benchmarkEncode(b *testing.B, dataShards, parityShards, shardSize int) { r, err := New(dataShards, parityShards, testOptions(WithAutoGoroutines(shardSize))...) if err != nil { b.Fatal(err) } shards := make([][]byte, dataShards+parityShards) for s := range shards { shards[s] = make([]byte, shardSize) } rand.Seed(0) for s := 0; s < dataShards; s++ { fillRandom(shards[s]) } b.SetBytes(int64(shardSize * (dataShards + parityShards))) b.ResetTimer() for i := 0; i < b.N; i++ { err = r.Encode(shards) if err != nil { b.Fatal(err) } } } func BenchmarkEncode2x1x1M(b *testing.B) { benchmarkEncode(b, 2, 1, 1024*1024) } func BenchmarkEncode10x2x10000(b *testing.B) { benchmarkEncode(b, 10, 2, 10000) } func BenchmarkEncode100x20x10000(b *testing.B) { benchmarkEncode(b, 100, 20, 10000) } func BenchmarkEncode17x3x1M(b *testing.B) { benchmarkEncode(b, 17, 3, 1024*1024) } // Benchmark 10 data shards and 4 parity shards with 16MB each. func BenchmarkEncode10x4x16M(b *testing.B) { benchmarkEncode(b, 10, 4, 16*1024*1024) } // Benchmark 5 data shards and 2 parity shards with 1MB each. func BenchmarkEncode5x2x1M(b *testing.B) { benchmarkEncode(b, 5, 2, 1024*1024) } // Benchmark 10 data shards and 2 parity shards with 1MB each. func BenchmarkEncode10x2x1M(b *testing.B) { benchmarkEncode(b, 10, 2, 1024*1024) } // Benchmark 10 data shards and 4 parity shards with 1MB each. func BenchmarkEncode10x4x1M(b *testing.B) { benchmarkEncode(b, 10, 4, 1024*1024) } // Benchmark 50 data shards and 20 parity shards with 1MB each. func BenchmarkEncode50x20x1M(b *testing.B) { benchmarkEncode(b, 50, 20, 1024*1024) } // Benchmark 17 data shards and 3 parity shards with 16MB each.
func BenchmarkEncode17x3x16M(b *testing.B) { benchmarkEncode(b, 17, 3, 16*1024*1024) } func BenchmarkEncode_8x4x8M(b *testing.B) { benchmarkEncode(b, 8, 4, 8*1024*1024) } func BenchmarkEncode_12x4x12M(b *testing.B) { benchmarkEncode(b, 12, 4, 12*1024*1024) } func BenchmarkEncode_16x4x16M(b *testing.B) { benchmarkEncode(b, 16, 4, 16*1024*1024) } func BenchmarkEncode_16x4x32M(b *testing.B) { benchmarkEncode(b, 16, 4, 32*1024*1024) } func BenchmarkEncode_16x4x64M(b *testing.B) { benchmarkEncode(b, 16, 4, 64*1024*1024) } func BenchmarkEncode_8x5x8M(b *testing.B) { benchmarkEncode(b, 8, 5, 8*1024*1024) } func BenchmarkEncode_8x6x8M(b *testing.B) { benchmarkEncode(b, 8, 6, 8*1024*1024) } func BenchmarkEncode_8x7x8M(b *testing.B) { benchmarkEncode(b, 8, 7, 8*1024*1024) } func BenchmarkEncode_8x9x8M(b *testing.B) { benchmarkEncode(b, 8, 9, 8*1024*1024) } func BenchmarkEncode_8x10x8M(b *testing.B) { benchmarkEncode(b, 8, 10, 8*1024*1024) } func BenchmarkEncode_8x11x8M(b *testing.B) { benchmarkEncode(b, 8, 11, 8*1024*1024) } func BenchmarkEncode_8x8x05M(b *testing.B) { benchmarkEncode(b, 8, 8, 1*1024*1024/2) } func BenchmarkEncode_8x8x1M(b *testing.B) { benchmarkEncode(b, 8, 8, 1*1024*1024) } func BenchmarkEncode_8x8x8M(b *testing.B) { benchmarkEncode(b, 8, 8, 8*1024*1024) } func BenchmarkEncode_8x8x32M(b *testing.B) { benchmarkEncode(b, 8, 8, 32*1024*1024) } func BenchmarkEncode_24x8x24M(b *testing.B) { benchmarkEncode(b, 24, 8, 24*1024*1024) } func BenchmarkEncode_24x8x48M(b *testing.B) { benchmarkEncode(b, 24, 8, 48*1024*1024) } func benchmarkVerify(b *testing.B, dataShards, parityShards, shardSize int) { r, err := New(dataShards, parityShards, testOptions(WithAutoGoroutines(shardSize))...) if err != nil { b.Fatal(err) } shards := make([][]byte, parityShards+dataShards) for s := range shards { shards[s] = make([]byte, shardSize) } rand.Seed(0) for s := 0; s < dataShards; s++ { fillRandom(shards[s]) } err = r.Encode(shards) if err != nil { b.Fatal(err) } b.SetBytes(int64(shardSize * (dataShards + parityShards))) b.ResetTimer() for i := 0; i < b.N; i++ { _, err = r.Verify(shards) if err != nil { b.Fatal(err) } } } // Benchmark 10 data slices with 2 parity slices holding 10000 bytes each func BenchmarkVerify10x2x10000(b *testing.B) { benchmarkVerify(b, 10, 2, 10000) } // Benchmark 50 data slices with 5 parity slices holding 100000 bytes each func BenchmarkVerify50x5x50000(b *testing.B) { benchmarkVerify(b, 50, 5, 100000) } // Benchmark 10 data slices with 2 parity slices holding 1MB each func BenchmarkVerify10x2x1M(b *testing.B) { benchmarkVerify(b, 10, 2, 1024*1024) } // Benchmark 5 data slices with 2 parity slices holding 1MB each func BenchmarkVerify5x2x1M(b *testing.B) { benchmarkVerify(b, 5, 2, 1024*1024) } // Benchmark 10 data slices with 4 parity slices holding 1MB each func BenchmarkVerify10x4x1M(b *testing.B) { benchmarkVerify(b, 10, 4, 1024*1024) } // Benchmark 50 data slices with 20 parity slices holding 1MB each func BenchmarkVerify50x20x1M(b *testing.B) { benchmarkVerify(b, 50, 20, 1024*1024) } // Benchmark 10 data slices with 4 parity slices holding 16MB each func BenchmarkVerify10x4x16M(b *testing.B) { benchmarkVerify(b, 10, 4, 16*1024*1024) } func corruptRandom(shards [][]byte, dataShards, parityShards int) { shardsToCorrupt := rand.Intn(parityShards) + 1 for i := 0; i < shardsToCorrupt; i++ { n := rand.Intn(dataShards + parityShards) shards[n] = shards[n][:0] } } func benchmarkReconstruct(b *testing.B, dataShards, parityShards, shardSize int) { r, err :=
New(dataShards, parityShards, testOptions(WithAutoGoroutines(shardSize))...) if err != nil { b.Fatal(err) } shards := make([][]byte, parityShards+dataShards) for s := range shards { shards[s] = make([]byte, shardSize) } rand.Seed(0) for s := 0; s < dataShards; s++ { fillRandom(shards[s]) } err = r.Encode(shards) if err != nil { b.Fatal(err) } b.SetBytes(int64(shardSize * (dataShards + parityShards))) b.ResetTimer() for i := 0; i < b.N; i++ { corruptRandom(shards, dataShards, parityShards) err = r.Reconstruct(shards) if err != nil { b.Fatal(err) } } } // Benchmark 10 data slices with 2 parity slices holding 10000 bytes each func BenchmarkReconstruct10x2x10000(b *testing.B) { benchmarkReconstruct(b, 10, 2, 10000) } // Benchmark 50 data slices with 5 parity slices holding 100000 bytes each func BenchmarkReconstruct50x5x50000(b *testing.B) { benchmarkReconstruct(b, 50, 5, 100000) } // Benchmark 10 data slices with 2 parity slices holding 1MB each func BenchmarkReconstruct10x2x1M(b *testing.B) { benchmarkReconstruct(b, 10, 2, 1024*1024) } // Benchmark 5 data slices with 2 parity slices holding 1MB each func BenchmarkReconstruct5x2x1M(b *testing.B) { benchmarkReconstruct(b, 5, 2, 1024*1024) } // Benchmark 10 data slices with 4 parity slices holding 1MB each func BenchmarkReconstruct10x4x1M(b *testing.B) { benchmarkReconstruct(b, 10, 4, 1024*1024) } // Benchmark 50 data slices with 20 parity slices holding 1MB each func BenchmarkReconstruct50x20x1M(b *testing.B) { benchmarkReconstruct(b, 50, 20, 1024*1024) } // Benchmark 10 data slices with 4 parity slices holding 16MB each func BenchmarkReconstruct10x4x16M(b *testing.B) { benchmarkReconstruct(b, 10, 4, 16*1024*1024) } func corruptRandomData(shards [][]byte, dataShards, parityShards int) { shardsToCorrupt := rand.Intn(parityShards) + 1 for i := 1; i <= shardsToCorrupt; i++ { n := rand.Intn(dataShards) shards[n] = shards[n][:0] } } func benchmarkReconstructData(b *testing.B, dataShards, parityShards, shardSize int) { r, err := New(dataShards, parityShards, testOptions(WithAutoGoroutines(shardSize))...)
if err != nil { b.Fatal(err) } shards := make([][]byte, parityShards+dataShards) for s := range shards { shards[s] = make([]byte, shardSize) } rand.Seed(0) for s := 0; s < dataShards; s++ { fillRandom(shards[s]) } err = r.Encode(shards) if err != nil { b.Fatal(err) } b.SetBytes(int64(shardSize * (dataShards + parityShards))) b.ResetTimer() for i := 0; i < b.N; i++ { corruptRandomData(shards, dataShards, parityShards) err = r.ReconstructData(shards) if err != nil { b.Fatal(err) } } } // Benchmark 10 data slices with 2 parity slices holding 10000 bytes each func BenchmarkReconstructData10x2x10000(b *testing.B) { benchmarkReconstructData(b, 10, 2, 10000) } // Benchmark 50 data slices with 5 parity slices holding 100000 bytes each func BenchmarkReconstructData50x5x50000(b *testing.B) { benchmarkReconstructData(b, 50, 5, 100000) } // Benchmark 10 data slices with 2 parity slices holding 1MB each func BenchmarkReconstructData10x2x1M(b *testing.B) { benchmarkReconstructData(b, 10, 2, 1024*1024) } // Benchmark 5 data slices with 2 parity slices holding 1MB each func BenchmarkReconstructData5x2x1M(b *testing.B) { benchmarkReconstructData(b, 5, 2, 1024*1024) } // Benchmark 10 data slices with 4 parity slices holding 1MB each func BenchmarkReconstructData10x4x1M(b *testing.B) { benchmarkReconstructData(b, 10, 4, 1024*1024) } // Benchmark 50 data slices with 20 parity slices holding 1MB each func BenchmarkReconstructData50x20x1M(b *testing.B) { benchmarkReconstructData(b, 50, 20, 1024*1024) } // Benchmark 10 data slices with 4 parity slices holding 16MB each func BenchmarkReconstructData10x4x16M(b *testing.B) { benchmarkReconstructData(b, 10, 4, 16*1024*1024) } func benchmarkReconstructP(b *testing.B, dataShards, parityShards, shardSize int) { r, err := New(dataShards, parityShards, testOptions(WithMaxGoroutines(1))...) if err != nil { b.Fatal(err) } b.SetBytes(int64(shardSize * (dataShards + parityShards))) b.ResetTimer() b.RunParallel(func(pb *testing.PB) { shards := make([][]byte, parityShards+dataShards) for s := range shards { shards[s] = make([]byte, shardSize) } rand.Seed(0) for s := 0; s < dataShards; s++ { fillRandom(shards[s]) } err = r.Encode(shards) if err != nil { b.Fatal(err) } b.ResetTimer() for pb.Next() { corruptRandom(shards, dataShards, parityShards) err = r.Reconstruct(shards) if err != nil { b.Fatal(err) } } }) } // Benchmark 10 data slices with 2 parity slices holding 10000 bytes each func BenchmarkReconstructP10x2x10000(b *testing.B) { benchmarkReconstructP(b, 10, 2, 10000) } // Benchmark 10 data slices with 5 parity slices holding 20000 bytes each func BenchmarkReconstructP10x5x20000(b *testing.B) { benchmarkReconstructP(b, 10, 5, 20000) } func TestEncoderReconstruct(t *testing.T) { testEncoderReconstruct(t) for _, o := range testOpts() { testEncoderReconstruct(t, o...) } } func testEncoderReconstruct(t *testing.T, o ...Option) { // Create some sample data var data = make([]byte, 250000) fillRandom(data) // Create 5 data slices of 50000 elements each enc, err := New(5, 3, testOptions(o...)...)
if err != nil { t.Fatal(err) } shards, err := enc.Split(data) if err != nil { t.Fatal(err) } err = enc.Encode(shards) if err != nil { t.Fatal(err) } // Check that it verifies ok, err := enc.Verify(shards) if !ok || err != nil { t.Fatal("not ok:", ok, "err:", err) } // Delete a shard shards[0] = nil // Should reconstruct err = enc.Reconstruct(shards) if err != nil { t.Fatal(err) } // Check that it verifies ok, err = enc.Verify(shards) if !ok || err != nil { t.Fatal("not ok:", ok, "err:", err) } // Recover original bytes buf := new(bytes.Buffer) err = enc.Join(buf, shards, len(data)) if err != nil { t.Fatal(err) } if !bytes.Equal(buf.Bytes(), data) { t.Fatal("recovered bytes do not match") } // Corrupt a shard shards[0] = nil shards[1][0], shards[1][500] = 75, 75 // Should reconstruct (but with corrupted data) err = enc.Reconstruct(shards) if err != nil { t.Fatal(err) } // Check that it verifies ok, err = enc.Verify(shards) if ok || err != nil { t.Fatal("error or ok:", ok, "err:", err) } // Recovered data should not match original buf.Reset() err = enc.Join(buf, shards, len(data)) if err != nil { t.Fatal(err) } if bytes.Equal(buf.Bytes(), data) { t.Fatal("corrupted data matches original") } } func TestSplitJoin(t *testing.T) { var data = make([]byte, 250000) rand.Seed(0) fillRandom(data) enc, _ := New(5, 3, testOptions()...) shards, err := enc.Split(data) if err != nil { t.Fatal(err) } _, err = enc.Split([]byte{}) if err != ErrShortData { t.Errorf("expected %v, got %v", ErrShortData, err) } buf := new(bytes.Buffer) err = enc.Join(buf, shards, 50) if err != nil { t.Fatal(err) } if !bytes.Equal(buf.Bytes(), data[:50]) { t.Fatal("recovered data does not match original") } err = enc.Join(buf, [][]byte{}, 0) if err != ErrTooFewShards { t.Errorf("expected %v, got %v", ErrTooFewShards, err) } err = enc.Join(buf, shards, len(data)+1) if err != ErrShortData { t.Errorf("expected %v, got %v", ErrShortData, err) } shards[0] = nil err = enc.Join(buf, shards, len(data)) if err != ErrReconstructRequired { t.Errorf("expected %v, got %v", ErrReconstructRequired, err) } } func TestCodeSomeShards(t *testing.T) { var data = make([]byte, 250000) fillRandom(data) enc, _ := New(5, 3, testOptions()...) r := enc.(*reedSolomon) // need to access private methods shards, _ := enc.Split(data) old := runtime.GOMAXPROCS(1) r.codeSomeShards(r.parity, shards[:r.DataShards], shards[r.DataShards:], r.ParityShards, len(shards[0])) // hopefully more than 1 CPU runtime.GOMAXPROCS(runtime.NumCPU()) r.codeSomeShards(r.parity, shards[:r.DataShards], shards[r.DataShards:], r.ParityShards, len(shards[0])) // reset MAXPROCS, otherwise testing complains runtime.GOMAXPROCS(old) } func TestStandardMatrices(t *testing.T) { if testing.Short() || runtime.GOMAXPROCS(0) < 4 { // Runtime ~15s. t.Skip("Skipping slow matrix check") } for i := 1; i < 256; i++ { i := i t.Run(fmt.Sprintf("x%d", i), func(t *testing.T) { t.Parallel() // i == n.o. datashards var shards = make([][]byte, 255) for p := range shards { v := byte(i) shards[p] = []byte{v} } rng := rand.New(rand.NewSource(0)) for j := 1; j < 256; j++ { // j == n.o. parity shards if i+j > 255 { continue } sh := shards[:i+j] r, err := New(i, j, testOptions(WithFastOneParityMatrix())...) if err != nil { // We are not supposed to write to t from goroutines. t.Fatal("creating matrix size", i, j, ":", err) } err = r.Encode(sh) if err != nil { t.Fatal("encoding", i, j, ":", err) } for k := 0; k < j; k++ { // Remove random shard.
r := int(rng.Int63n(int64(i + j))) sh[r] = sh[r][:0] } err = r.Reconstruct(sh) if err != nil { t.Fatal("reconstructing", i, j, ":", err) } ok, err := r.Verify(sh) if err != nil { t.Fatal("verifying", i, j, ":", err) } if !ok { t.Fatal(i, j, ok) } for k := range sh { if k == i { // Only check data shards break } if sh[k][0] != byte(i) { t.Fatal("does not match", i, j, k, sh[0], sh[k]) } } } }) } } func TestCauchyMatrices(t *testing.T) { if testing.Short() || runtime.GOMAXPROCS(0) < 4 { // Runtime ~15s. t.Skip("Skipping slow matrix check") } for i := 1; i < 256; i++ { i := i t.Run(fmt.Sprintf("x%d", i), func(t *testing.T) { t.Parallel() var shards = make([][]byte, 255) for p := range shards { v := byte(i) shards[p] = []byte{v} } rng := rand.New(rand.NewSource(0)) for j := 1; j < 256; j++ { // j == n.o. parity shards if i+j > 255 { continue } sh := shards[:i+j] r, err := New(i, j, testOptions(WithCauchyMatrix(), WithFastOneParityMatrix())...) if err != nil { // We are not supposed to write to t from goroutines. t.Fatal("creating matrix size", i, j, ":", err) } err = r.Encode(sh) if err != nil { t.Fatal("encoding", i, j, ":", err) } for k := 0; k < j; k++ { // Remove random shard. r := int(rng.Int63n(int64(i + j))) sh[r] = sh[r][:0] } err = r.Reconstruct(sh) if err != nil { t.Fatal("reconstructing", i, j, ":", err) } ok, err := r.Verify(sh) if err != nil { t.Fatal("verifying", i, j, ":", err) } if !ok { t.Fatal(i, j, ok) } for k := range sh { if k == i { // Only check data shards break } if sh[k][0] != byte(i) { t.Fatal("does not match", i, j, k, sh[0], sh[k]) } } } }) } } func TestPar1Matrices(t *testing.T) { if testing.Short() || runtime.GOMAXPROCS(0) < 4 { // Runtime ~15s. t.Skip("Skipping slow matrix check") } for i := 1; i < 256; i++ { i := i t.Run(fmt.Sprintf("x%d", i), func(t *testing.T) { t.Parallel() var shards = make([][]byte, 255) for p := range shards { v := byte(i) shards[p] = []byte{v} } rng := rand.New(rand.NewSource(0)) for j := 1; j < 256; j++ { // j == n.o. parity shards if i+j > 255 { continue } sh := shards[:i+j] r, err := New(i, j, testOptions(WithPAR1Matrix())...) if err != nil { // We are not supposed to write to t from goroutines. t.Fatal("creating matrix size", i, j, ":", err) } err = r.Encode(sh) if err != nil { t.Fatal("encoding", i, j, ":", err) } for k := 0; k < j; k++ { // Remove random shard. r := int(rng.Int63n(int64(i + j))) sh[r] = sh[r][:0] } err = r.Reconstruct(sh) if err != nil { if err == errSingular { t.Logf("Singular: %d (data), %d (parity)", i, j) for p := range sh { if len(sh[p]) == 0 { shards[p] = []byte{byte(i)} } } continue } t.Fatal("reconstructing", i, j, ":", err) } ok, err := r.Verify(sh) if err != nil { t.Fatal("verifying", i, j, ":", err) } if !ok { t.Fatal(i, j, ok) } for k := range sh { if k == i { // Only check data shards break } if sh[k][0] != byte(i) { t.Fatal("does not match", i, j, k, sh[0], sh[k]) } } } }) } } func TestNew(t *testing.T) { tests := []struct { data, parity int err error }{ {127, 127, nil}, {128, 128, nil}, {255, 1, nil}, {255, 0, nil}, {1, 0, nil}, {256, 256, ErrMaxShardNum}, {0, 1, ErrInvShardNum}, {1, -1, ErrInvShardNum}, {256, 1, ErrMaxShardNum}, // overflow causes r.Shards to be negative {256, int(^uint(0) >> 1), errInvalidRowSize}, } for _, test := range tests { _, err := New(test.data, test.parity, testOptions()...) if err != test.err { t.Errorf("New(%v, %v): expected %v, got %v", test.data, test.parity, test.err, err) } } } // Benchmark 10 data shards and 4 parity shards and 160MB data. 
func BenchmarkSplit10x4x160M(b *testing.B) { benchmarkSplit(b, 10, 4, 160*1024*1024) } // Benchmark 5 data shards and 2 parity shards with 5MB data. func BenchmarkSplit5x2x5M(b *testing.B) { benchmarkSplit(b, 5, 2, 5*1024*1024) } // Benchmark 10 data shards and 2 parity shards with 1MB data. func BenchmarkSplit10x2x1M(b *testing.B) { benchmarkSplit(b, 10, 2, 1024*1024) } // Benchmark 10 data shards and 4 parity shards with 10MB data. func BenchmarkSplit10x4x10M(b *testing.B) { benchmarkSplit(b, 10, 4, 10*1024*1024) } // Benchmark 50 data shards and 20 parity shards with 50MB data. func BenchmarkSplit50x20x50M(b *testing.B) { benchmarkSplit(b, 50, 20, 50*1024*1024) } // Benchmark 17 data shards and 3 parity shards with 272MB data. func BenchmarkSplit17x3x272M(b *testing.B) { benchmarkSplit(b, 17, 3, 272*1024*1024) } func benchmarkSplit(b *testing.B, shards, parity, dataSize int) { r, err := New(shards, parity, testOptions(WithAutoGoroutines(dataSize))...) if err != nil { b.Fatal(err) } data := make([]byte, dataSize) b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { _, err = r.Split(data) if err != nil { b.Fatal(err) } } } func benchmarkParallel(b *testing.B, dataShards, parityShards, shardSize int) { // Run max 1 goroutine per operation. r, err := New(dataShards, parityShards, testOptions(WithMaxGoroutines(1))...) if err != nil { b.Fatal(err) } c := runtime.GOMAXPROCS(0) // Note that concurrency also affects total data size and will make caches less effective. b.Log("Total data:", (c*dataShards*shardSize)>>20, "MiB", "parity:", (c*parityShards*shardSize)>>20, "MiB") // Create independent shards shardsCh := make(chan [][]byte, c) for i := 0; i < c; i++ { rand.Seed(int64(i)) shards := make([][]byte, dataShards+parityShards) for s := range shards { shards[s] = make([]byte, shardSize) } for s := 0; s < dataShards; s++ { fillRandom(shards[s]) } shardsCh <- shards } b.SetBytes(int64(shardSize * (dataShards + parityShards))) b.SetParallelism(c) b.ReportAllocs() b.ResetTimer() b.RunParallel(func(pb *testing.PB) { for pb.Next() { shards := <-shardsCh err = r.Encode(shards) if err != nil { b.Fatal(err) } shardsCh <- shards } }) } func BenchmarkParallel_8x8x64K(b *testing.B) { benchmarkParallel(b, 8, 8, 64<<10) } func BenchmarkParallel_8x8x05M(b *testing.B) { benchmarkParallel(b, 8, 8, 512<<10) } func BenchmarkParallel_20x10x05M(b *testing.B) { benchmarkParallel(b, 20, 10, 512<<10) } func BenchmarkParallel_8x8x1M(b *testing.B) { benchmarkParallel(b, 8, 8, 1<<20) } func BenchmarkParallel_8x8x8M(b *testing.B) { benchmarkParallel(b, 8, 8, 8<<20) } func BenchmarkParallel_8x8x32M(b *testing.B) { benchmarkParallel(b, 8, 8, 32<<20) } func BenchmarkParallel_8x3x1M(b *testing.B) { benchmarkParallel(b, 8, 3, 1<<20) } func BenchmarkParallel_8x4x1M(b *testing.B) { benchmarkParallel(b, 8, 4, 1<<20) } func BenchmarkParallel_8x5x1M(b *testing.B) { benchmarkParallel(b, 8, 5, 1<<20) } reedsolomon-1.9.13/streaming.go000066400000000000000000000366631406411035300164620ustar00rootroot00000000000000/**
 * Reed-Solomon Coding over 8-bit values.
 *
 * Copyright 2015, Klaus Post
 * Copyright 2015, Backblaze, Inc.
 */

package reedsolomon

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"sync"
)

// StreamEncoder is an interface to encode Reed-Solomon parity sets for your data.
// It provides a fully streaming interface, and processes data in blocks of up to 4MB.
//
// For small shard sizes, 10MB and below, it is recommended to use the in-memory interface,
// since the streaming interface has a start-up overhead.
//
// For all operations, readers and writers should not assume any order/size of
// individual reads/writes.
//
// For usage examples, see "stream-encoder.go" and "stream-decoder.go" in the examples
// folder.
type StreamEncoder interface {
	// Encode parity shards for a set of data shards.
	//
	// Input is 'data' containing readers for the data shards, and 'parity'
	// containing io.Writers that receive the parity shards.
	//
	// The number of shards must match the number given to NewStream().
	//
	// Each reader must supply the same number of bytes.
	//
	// The parity shards will be written to the writer.
	// The number of bytes written will match the input size.
	//
	// If a data stream returns an error, a StreamReadError type error
	// will be returned. If a parity writer returns an error, a
	// StreamWriteError will be returned.
	Encode(data []io.Reader, parity []io.Writer) error

	// Verify returns true if the parity shards contain correct data.
	//
	// The number of shards must match the total number of data+parity shards
	// given to NewStream().
	//
	// Each reader must supply the same number of bytes.
	// If a shard stream returns an error, a StreamReadError type error
	// will be returned.
	Verify(shards []io.Reader) (bool, error)

	// Reconstruct will recreate the missing shards if possible.
	//
	// Given a list of valid shards (to read) and invalid shards (to write).
	//
	// You indicate that a shard is missing by setting it to nil in the 'valid'
	// slice and at the same time setting a non-nil writer in "fill".
	// An index cannot contain both a non-nil 'valid' and 'fill' entry.
	// If both are provided 'ErrReconstructMismatch' is returned.
	//
	// If there are too few shards to reconstruct the missing
	// ones, ErrTooFewShards will be returned.
	//
	// The reconstructed shard set is complete, but integrity is not verified.
	// Use the Verify function to check if the data set is ok.
	Reconstruct(valid []io.Reader, fill []io.Writer) error

	// Split an input stream into the number of shards given to the encoder.
	//
	// The data will be split into equally sized shards.
	// If the data size isn't divisible by the number of shards,
	// the last shard will contain extra zeros.
	//
	// You must supply the total size of your input.
	// 'ErrShortData' will be returned if it is unable to retrieve the
	// number of bytes indicated.
	Split(data io.Reader, dst []io.Writer, size int64) (err error)

	// Join the shards and write the data segment to dst.
	//
	// Only the data shards are considered.
	//
	// You must supply the exact output size you want.
	// If there are too few shards given, ErrTooFewShards will be returned.
	// If the total data size is less than outSize, ErrShortData will be returned.
	Join(dst io.Writer, shards []io.Reader, outSize int64) error
}

// StreamReadError is returned when a read error is encountered
// that relates to a supplied stream.
// This will allow you to find out which reader has failed.
type StreamReadError struct {
	Err    error // The error
	Stream int   // The stream number on which the error occurred
}

// Error returns the error as a string
func (s StreamReadError) Error() string {
	return fmt.Sprintf("error reading stream %d: %s", s.Stream, s.Err)
}

// String returns the error as a string
func (s StreamReadError) String() string {
	return s.Error()
}
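// The sketch below is an editor's illustration, not part of the original
// library: it shows how a caller might find out which stream failed by
// checking for the StreamReadError type. The function name and arguments
// are hypothetical.
func exampleInspectStreamError(enc StreamEncoder, shards []io.Reader) {
	_, err := enc.Verify(shards)
	if err != nil {
		if se, ok := err.(StreamReadError); ok {
			// se.Stream holds the index of the failing reader,
			// se.Err the underlying cause.
			fmt.Println("read error on stream", se.Stream, ":", se.Err)
		}
	}
}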
// StreamWriteError is returned when a write error is encountered
// that relates to a supplied stream. This will allow you to
// find out which writer has failed.
type StreamWriteError struct {
	Err    error // The error
	Stream int   // The stream number on which the error occurred
}

// Error returns the error as a string
func (s StreamWriteError) Error() string {
	return fmt.Sprintf("error writing stream %d: %s", s.Stream, s.Err)
}

// String returns the error as a string
func (s StreamWriteError) String() string {
	return s.Error()
}

// rsStream contains a matrix for a specific
// distribution of data shards and parity shards.
// Construct it using NewStream().
type rsStream struct {
	r *reedSolomon
	o options

	// Shard reader
	readShards func(dst [][]byte, in []io.Reader) error
	// Shard writer
	writeShards func(out []io.Writer, in [][]byte) error

	blockPool sync.Pool
}

// NewStream creates a new encoder and initializes it to
// the number of data shards and parity shards that
// you want to use. You can reuse this encoder.
// Note that the maximum number of data shards is 256.
func NewStream(dataShards, parityShards int, o ...Option) (StreamEncoder, error) {
	r := rsStream{o: defaultOptions}
	for _, opt := range o {
		opt(&r.o)
	}
	// Override block size if shard size is set.
	if r.o.streamBS == 0 && r.o.shardSize > 0 {
		r.o.streamBS = r.o.shardSize
	}
	if r.o.streamBS <= 0 {
		r.o.streamBS = 4 << 20
	}
	if r.o.shardSize == 0 && r.o.maxGoroutines == defaultOptions.maxGoroutines {
		o = append(o, WithAutoGoroutines(r.o.streamBS))
	}
	enc, err := New(dataShards, parityShards, o...)
	if err != nil {
		return nil, err
	}
	r.r = enc.(*reedSolomon)
	r.blockPool.New = func() interface{} {
		out := make([][]byte, dataShards+parityShards)
		for i := range out {
			out[i] = make([]byte, r.o.streamBS)
		}
		return out
	}
	r.readShards = readShards
	r.writeShards = writeShards
	if r.o.concReads {
		r.readShards = cReadShards
	}
	if r.o.concWrites {
		r.writeShards = cWriteShards
	}
	return &r, err
}

// NewStreamC creates a new encoder and initializes it to
// the number of data shards and parity shards given.
//
// This functions as 'NewStream', but allows you to enable CONCURRENT reads and writes.
func NewStreamC(dataShards, parityShards int, conReads, conWrites bool, o ...Option) (StreamEncoder, error) {
	return NewStream(dataShards, parityShards, append(o, WithConcurrentStreamReads(conReads), WithConcurrentStreamWrites(conWrites))...)
}

func (r *rsStream) createSlice() [][]byte {
	out := r.blockPool.Get().([][]byte)
	for i := range out {
		out[i] = out[i][:r.o.streamBS]
	}
	return out
}
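// Editor's illustrative sketch, not part of the original file: creating a
// streaming encoder with concurrent reads and writes enabled, mirroring
// what NewStreamC does. The shard counts are arbitrary example values.
func exampleNewStream() (StreamEncoder, error) {
	// 10 data shards and 3 parity shards, reading and writing
	// the individual streams concurrently.
	return NewStream(10, 3,
		WithConcurrentStreamReads(true),
		WithConcurrentStreamWrites(true))
}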
// Encodes parity shards for a set of data shards.
//
// Input is 'data' containing readers for the data shards, and 'parity'
// containing io.Writers that receive the parity shards.
//
// The number of shards must match the number given to NewStream().
//
// Each reader must supply the same number of bytes.
//
// The parity shards will be written to the writer.
// The number of bytes written will match the input size.
//
// If a data stream returns an error, a StreamReadError type error
// will be returned. If a parity writer returns an error, a
// StreamWriteError will be returned.
func (r *rsStream) Encode(data []io.Reader, parity []io.Writer) error {
	if len(data) != r.r.DataShards {
		return ErrTooFewShards
	}
	if len(parity) != r.r.ParityShards {
		return ErrTooFewShards
	}
	all := r.createSlice()
	defer r.blockPool.Put(all)
	in := all[:r.r.DataShards]
	out := all[r.r.DataShards:]
	read := 0
	for {
		err := r.readShards(in, data)
		switch err {
		case nil:
		case io.EOF:
			if read == 0 {
				return ErrShardNoData
			}
			return nil
		default:
			return err
		}
		out = trimShards(out, shardSize(in))
		read += shardSize(in)
		err = r.r.Encode(all)
		if err != nil {
			return err
		}
		err = r.writeShards(parity, out)
		if err != nil {
			return err
		}
	}
}

// Trim the shards so they are all the same size
func trimShards(in [][]byte, size int) [][]byte {
	for i := range in {
		if len(in[i]) != 0 {
			in[i] = in[i][0:size]
		}
		if len(in[i]) < size {
			in[i] = in[i][:0]
		}
	}
	return in
}

func readShards(dst [][]byte, in []io.Reader) error {
	if len(in) != len(dst) {
		panic("internal error: in and dst size do not match")
	}
	size := -1
	for i := range in {
		if in[i] == nil {
			dst[i] = dst[i][:0]
			continue
		}
		n, err := io.ReadFull(in[i], dst[i])
		// The error is EOF only if no bytes were read.
		// If an EOF happens after reading some but not all the bytes,
		// ReadFull returns ErrUnexpectedEOF.
		switch err {
		case io.ErrUnexpectedEOF, io.EOF:
			if size < 0 {
				size = n
			} else if n != size {
				// Shard sizes must match.
				return ErrShardSize
			}
			dst[i] = dst[i][0:n]
		case nil:
			continue
		default:
			return StreamReadError{Err: err, Stream: i}
		}
	}
	if size == 0 {
		return io.EOF
	}
	return nil
}

func writeShards(out []io.Writer, in [][]byte) error {
	if len(out) != len(in) {
		panic("internal error: in and out size do not match")
	}
	for i := range in {
		if out[i] == nil {
			continue
		}
		n, err := out[i].Write(in[i])
		if err != nil {
			return StreamWriteError{Err: err, Stream: i}
		}
		if n != len(in[i]) {
			return StreamWriteError{Err: io.ErrShortWrite, Stream: i}
		}
	}
	return nil
}

type readResult struct {
	n    int
	size int
	err  error
}
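// Editor's illustrative sketch, not part of the original file: feeding
// Encode with in-memory buffers. Real callers would typically pass files
// or network streams; the function name and the parity count of 3 are
// hypothetical and must match what the encoder was created with.
func exampleStreamEncode(enc StreamEncoder, input [][]byte) ([]*bytes.Buffer, error) {
	data := make([]io.Reader, len(input))
	for i := range input {
		data[i] = bytes.NewBuffer(input[i])
	}
	parity := make([]*bytes.Buffer, 3)
	out := make([]io.Writer, len(parity))
	for i := range parity {
		parity[i] = new(bytes.Buffer)
		out[i] = parity[i]
	}
	return parity, enc.Encode(data, out)
}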
// cReadShards reads shards concurrently
func cReadShards(dst [][]byte, in []io.Reader) error {
	if len(in) != len(dst) {
		panic("internal error: in and dst size do not match")
	}
	var wg sync.WaitGroup
	wg.Add(len(in))
	res := make(chan readResult, len(in))
	for i := range in {
		if in[i] == nil {
			dst[i] = dst[i][:0]
			wg.Done()
			continue
		}
		go func(i int) {
			defer wg.Done()
			n, err := io.ReadFull(in[i], dst[i])
			// The error is EOF only if no bytes were read.
			// If an EOF happens after reading some but not all the bytes,
			// ReadFull returns ErrUnexpectedEOF.
			res <- readResult{size: n, err: err, n: i}
		}(i)
	}
	wg.Wait()
	close(res)
	size := -1
	for r := range res {
		switch r.err {
		case io.ErrUnexpectedEOF, io.EOF:
			if size < 0 {
				size = r.size
			} else if r.size != size {
				// Shard sizes must match.
				return ErrShardSize
			}
			dst[r.n] = dst[r.n][0:r.size]
		case nil:
		default:
			return StreamReadError{Err: r.err, Stream: r.n}
		}
	}
	if size == 0 {
		return io.EOF
	}
	return nil
}

// cWriteShards writes shards concurrently
func cWriteShards(out []io.Writer, in [][]byte) error {
	if len(out) != len(in) {
		panic("internal error: in and out size do not match")
	}
	var errs = make(chan error, len(out))
	var wg sync.WaitGroup
	wg.Add(len(out))
	for i := range in {
		go func(i int) {
			defer wg.Done()
			if out[i] == nil {
				errs <- nil
				return
			}
			n, err := out[i].Write(in[i])
			if err != nil {
				errs <- StreamWriteError{Err: err, Stream: i}
				return
			}
			if n != len(in[i]) {
				errs <- StreamWriteError{Err: io.ErrShortWrite, Stream: i}
			}
		}(i)
	}
	wg.Wait()
	close(errs)
	for err := range errs {
		if err != nil {
			return err
		}
	}
	return nil
}

// Verify returns true if the parity shards contain correct data.
//
// The number of shards must match the total number of data+parity shards
// given to NewStream().
//
// Each reader must supply the same number of bytes.
// If a shard stream returns an error, a StreamReadError type error
// will be returned.
func (r *rsStream) Verify(shards []io.Reader) (bool, error) {
	if len(shards) != r.r.Shards {
		return false, ErrTooFewShards
	}
	read := 0
	all := r.createSlice()
	defer r.blockPool.Put(all)
	for {
		err := r.readShards(all, shards)
		if err == io.EOF {
			if read == 0 {
				return false, ErrShardNoData
			}
			return true, nil
		}
		if err != nil {
			return false, err
		}
		read += shardSize(all)
		ok, err := r.r.Verify(all)
		if !ok || err != nil {
			return ok, err
		}
	}
}

// ErrReconstructMismatch is returned by the StreamEncoder if you supply
// "valid" and "fill" streams on the same index.
// Therefore it is impossible to see if you consider the shard valid
// or would like to have it reconstructed.
var ErrReconstructMismatch = errors.New("valid shards and fill shards are mutually exclusive")

// Reconstruct will recreate the missing shards if possible.
//
// Given a list of valid shards (to read) and invalid shards (to write).
//
// You indicate that a shard is missing by setting it to nil in the 'valid'
// slice and at the same time setting a non-nil writer in "fill".
// An index cannot contain both a non-nil 'valid' and 'fill' entry.
//
// If there are too few shards to reconstruct the missing
// ones, ErrTooFewShards will be returned.
//
// The reconstructed shard set is complete when explicitly asked for all missing shards.
// However, its integrity is not automatically verified.
// Use the Verify function to check if the data set is ok.
func (r *rsStream) Reconstruct(valid []io.Reader, fill []io.Writer) error {
	if len(valid) != r.r.Shards {
		return ErrTooFewShards
	}
	if len(fill) != r.r.Shards {
		return ErrTooFewShards
	}
	all := r.createSlice()
	defer r.blockPool.Put(all)
	reconDataOnly := true
	for i := range valid {
		if valid[i] != nil && fill[i] != nil {
			return ErrReconstructMismatch
		}
		if i >= r.r.DataShards && fill[i] != nil {
			reconDataOnly = false
		}
	}
	read := 0
	for {
		err := r.readShards(all, valid)
		if err == io.EOF {
			if read == 0 {
				return ErrShardNoData
			}
			return nil
		}
		if err != nil {
			return err
		}
		read += shardSize(all)
		all = trimShards(all, shardSize(all))
		if reconDataOnly {
			err = r.r.ReconstructData(all) // just reconstruct missing data shards
		} else {
			err = r.r.Reconstruct(all) // reconstruct all missing shards
		}
		if err != nil {
			return err
		}
		err = r.writeShards(fill, all)
		if err != nil {
			return err
		}
	}
}
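// Editor's illustrative sketch, not part of the original file:
// reconstructing a single missing shard with the valid/fill convention
// described above. The function name is hypothetical.
func exampleStreamReconstruct(enc StreamEncoder, shards []io.Reader) (*bytes.Buffer, error) {
	valid := make([]io.Reader, len(shards))
	copy(valid, shards)
	fill := make([]io.Writer, len(shards))
	rebuilt := new(bytes.Buffer)
	valid[0] = nil    // mark shard 0 as missing...
	fill[0] = rebuilt // ...and ask for it to be written here.
	return rebuilt, enc.Reconstruct(valid, fill)
}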
// Join the shards and write the data segment to dst.
//
// Only the data shards are considered.
//
// You must supply the exact output size you want.
// If there are too few shards given, ErrTooFewShards will be returned.
// If the total data size is less than outSize, ErrShortData will be returned.
func (r *rsStream) Join(dst io.Writer, shards []io.Reader, outSize int64) error {
	// Do we have enough shards?
	if len(shards) < r.r.DataShards {
		return ErrTooFewShards
	}

	// Trim off parity shards if any
	shards = shards[:r.r.DataShards]
	for i := range shards {
		if shards[i] == nil {
			return StreamReadError{Err: ErrShardNoData, Stream: i}
		}
	}
	// Join all shards
	src := io.MultiReader(shards...)

	// Copy data to dst
	n, err := io.CopyN(dst, src, outSize)
	if err == io.EOF {
		return ErrShortData
	}
	if err != nil {
		return err
	}
	if n != outSize {
		return ErrShortData
	}
	return nil
}

// Split an input stream into the number of shards given to the encoder.
//
// The data will be split into equally sized shards.
// If the data size isn't divisible by the number of shards,
// the last shard will contain extra zeros.
//
// You must supply the total size of your input.
// 'ErrShortData' will be returned if it is unable to retrieve the
// number of bytes indicated.
func (r *rsStream) Split(data io.Reader, dst []io.Writer, size int64) error {
	if size == 0 {
		return ErrShortData
	}
	if len(dst) != r.r.DataShards {
		return ErrInvShardNum
	}

	for i := range dst {
		if dst[i] == nil {
			return StreamWriteError{Err: ErrShardNoData, Stream: i}
		}
	}

	// Calculate number of bytes per shard.
	perShard := (size + int64(r.r.DataShards) - 1) / int64(r.r.DataShards)

	// Pad data to r.Shards*perShard.
	padding := make([]byte, (int64(r.r.Shards)*perShard)-size)
	data = io.MultiReader(data, bytes.NewBuffer(padding))

	// Split into equal-length shards and copy.
	for i := range dst {
		n, err := io.CopyN(dst[i], data, perShard)
		if err != io.EOF && err != nil {
			return err
		}
		if n != perShard {
			return ErrShortData
		}
	}
	return nil
}
reedsolomon-1.9.13/streaming_test.go000066400000000000000000000406501406411035300175140ustar00rootroot00000000000000/** * Unit tests for ReedSolomon Streaming API * * Copyright 2015, Klaus Post */ package reedsolomon import ( "bytes" "io" "io/ioutil" "math/rand" "testing" ) func TestStreamEncoding(t *testing.T) { perShard := 10 << 20 if testing.Short() { perShard = 50000 } r, err := NewStream(10, 3, testOptions()...) if err != nil { t.Fatal(err) } rand.Seed(0) input := randomBytes(10, perShard) data := toBuffers(input) par := emptyBuffers(3) err = r.Encode(toReaders(data), toWriters(par)) if err != nil { t.Fatal(err) } // Reset Data data = toBuffers(input) all := append(toReaders(data), toReaders(par)...)
ok, err := r.Verify(all) if err != nil { t.Fatal(err) } if !ok { t.Fatal("Verification failed") } err = r.Encode(toReaders(emptyBuffers(1)), toWriters(emptyBuffers(1))) if err != ErrTooFewShards { t.Errorf("expected %v, got %v", ErrTooFewShards, err) } err = r.Encode(toReaders(emptyBuffers(10)), toWriters(emptyBuffers(1))) if err != ErrTooFewShards { t.Errorf("expected %v, got %v", ErrTooFewShards, err) } err = r.Encode(toReaders(emptyBuffers(10)), toWriters(emptyBuffers(3))) if err != ErrShardNoData { t.Errorf("expected %v, got %v", ErrShardNoData, err) } badShards := emptyBuffers(10) badShards[0] = randomBuffer(123) err = r.Encode(toReaders(badShards), toWriters(emptyBuffers(3))) if err != ErrShardSize { t.Errorf("expected %v, got %v", ErrShardSize, err) } } func TestStreamEncodingConcurrent(t *testing.T) { perShard := 10 << 20 if testing.Short() { perShard = 50000 } r, err := NewStreamC(10, 3, true, true, testOptions()...) if err != nil { t.Fatal(err) } rand.Seed(0) input := randomBytes(10, perShard) data := toBuffers(input) par := emptyBuffers(3) err = r.Encode(toReaders(data), toWriters(par)) if err != nil { t.Fatal(err) } // Reset Data data = toBuffers(input) all := append(toReaders(data), toReaders(par)...) ok, err := r.Verify(all) if err != nil { t.Fatal(err) } if !ok { t.Fatal("Verification failed") } err = r.Encode(toReaders(emptyBuffers(1)), toWriters(emptyBuffers(1))) if err != ErrTooFewShards { t.Errorf("expected %v, got %v", ErrTooFewShards, err) } err = r.Encode(toReaders(emptyBuffers(10)), toWriters(emptyBuffers(1))) if err != ErrTooFewShards { t.Errorf("expected %v, got %v", ErrTooFewShards, err) } err = r.Encode(toReaders(emptyBuffers(10)), toWriters(emptyBuffers(3))) if err != ErrShardNoData { t.Errorf("expected %v, got %v", ErrShardNoData, err) } badShards := emptyBuffers(10) badShards[0] = randomBuffer(123) badShards[1] = randomBuffer(123) err = r.Encode(toReaders(badShards), toWriters(emptyBuffers(3))) if err != ErrShardSize { t.Errorf("expected %v, got %v", ErrShardSize, err) } } func TestStreamZeroParity(t *testing.T) { perShard := 10 << 20 if testing.Short() { perShard = 50000 } r, err := NewStream(10, 0, testOptions()...) if err != nil { t.Fatal(err) } rand.Seed(0) input := randomBytes(10, perShard) data := toBuffers(input) err = r.Encode(toReaders(data), []io.Writer{}) if err != nil { t.Fatal(err) } // Reset Data data = toBuffers(input) all := toReaders(data) ok, err := r.Verify(all) if err != nil { t.Fatal(err) } if !ok { t.Fatal("Verification failed") } // Reset Data data = toBuffers(input) // Check that Reconstruct does nothing all = toReaders(data) err = r.Reconstruct(all, nilWriters(10)) if err != nil { t.Fatal(err) } } func TestStreamZeroParityConcurrent(t *testing.T) { perShard := 10 << 20 if testing.Short() { perShard = 50000 } r, err := NewStreamC(10, 0, true, true, testOptions()...) 
if err != nil { t.Fatal(err) } rand.Seed(0) input := randomBytes(10, perShard) data := toBuffers(input) err = r.Encode(toReaders(data), []io.Writer{}) if err != nil { t.Fatal(err) } // Reset Data data = toBuffers(input) all := toReaders(data) ok, err := r.Verify(all) if err != nil { t.Fatal(err) } if !ok { t.Fatal("Verification failed") } // Reset Data data = toBuffers(input) // Check that Reconstruct does nothing all = toReaders(data) err = r.Reconstruct(all, nilWriters(10)) if err != nil { t.Fatal(err) } } func randomBuffer(length int) *bytes.Buffer { b := make([]byte, length) fillRandom(b) return bytes.NewBuffer(b) } func randomBytes(n, length int) [][]byte { bufs := make([][]byte, n) for j := range bufs { bufs[j] = make([]byte, length) fillRandom(bufs[j]) } return bufs } func toBuffers(in [][]byte) []*bytes.Buffer { out := make([]*bytes.Buffer, len(in)) for i := range in { out[i] = bytes.NewBuffer(in[i]) } return out } func toReaders(in []*bytes.Buffer) []io.Reader { out := make([]io.Reader, len(in)) for i := range in { out[i] = in[i] } return out } func toWriters(in []*bytes.Buffer) []io.Writer { out := make([]io.Writer, len(in)) for i := range in { out[i] = in[i] } return out } func nilWriters(n int) []io.Writer { out := make([]io.Writer, n) for i := range out { out[i] = nil } return out } func emptyBuffers(n int) []*bytes.Buffer { b := make([]*bytes.Buffer, n) for i := range b { b[i] = &bytes.Buffer{} } return b } func toBytes(in []*bytes.Buffer) [][]byte { b := make([][]byte, len(in)) for i := range in { b[i] = in[i].Bytes() } return b } func TestStreamReconstruct(t *testing.T) { perShard := 10 << 20 if testing.Short() { perShard = 50000 } r, err := NewStream(10, 3, testOptions()...) if err != nil { t.Fatal(err) } rand.Seed(0) shards := randomBytes(10, perShard) parb := emptyBuffers(3) err = r.Encode(toReaders(toBuffers(shards)), toWriters(parb)) if err != nil { t.Fatal(err) } parity := toBytes(parb) all := append(toReaders(toBuffers(shards)), toReaders(toBuffers(parity))...) fill := make([]io.Writer, 13) // Reconstruct with all shards present, all fill nil err = r.Reconstruct(all, fill) if err != nil { t.Fatal(err) } all = append(toReaders(toBuffers(shards)), toReaders(toBuffers(parity))...) // Reconstruct with 10 shards present, asking for all shards to be reconstructed all[0] = nil fill[0] = emptyBuffers(1)[0] all[7] = nil fill[7] = emptyBuffers(1)[0] all[11] = nil fill[11] = emptyBuffers(1)[0] err = r.Reconstruct(all, fill) if err != nil { t.Fatal(err) } shards[0] = fill[0].(*bytes.Buffer).Bytes() shards[7] = fill[7].(*bytes.Buffer).Bytes() parity[1] = fill[11].(*bytes.Buffer).Bytes() all = append(toReaders(toBuffers(shards)), toReaders(toBuffers(parity))...) ok, err := r.Verify(all) if err != nil { t.Fatal(err) } if !ok { t.Fatal("Verification failed") } all = append(toReaders(toBuffers(shards)), toReaders(toBuffers(parity))...) // Reconstruct with 10 shards present, asking for just data shards to be reconstructed all[0] = nil fill[0] = emptyBuffers(1)[0] all[7] = nil fill[7] = emptyBuffers(1)[0] all[11] = nil fill[11] = nil err = r.Reconstruct(all, fill) if err != nil { t.Fatal(err) } if fill[11] != nil { t.Fatal("Unexpected parity block reconstructed") } all = append(toReaders(toBuffers(shards)), toReaders(toBuffers(parity))...) 
// Reconstruct with 9 shards present (should fail) all[0] = nil fill[0] = emptyBuffers(1)[0] all[4] = nil fill[4] = emptyBuffers(1)[0] all[7] = nil fill[7] = emptyBuffers(1)[0] all[11] = nil fill[11] = emptyBuffers(1)[0] err = r.Reconstruct(all, fill) if err != ErrTooFewShards { t.Errorf("expected %v, got %v", ErrTooFewShards, err) } err = r.Reconstruct(toReaders(emptyBuffers(3)), toWriters(emptyBuffers(3))) if err != ErrTooFewShards { t.Errorf("expected %v, got %v", ErrTooFewShards, err) } err = r.Reconstruct(toReaders(emptyBuffers(13)), toWriters(emptyBuffers(3))) if err != ErrTooFewShards { t.Errorf("expected %v, got %v", ErrTooFewShards, err) } err = r.Reconstruct(toReaders(emptyBuffers(13)), toWriters(emptyBuffers(13))) if err != ErrReconstructMismatch { t.Errorf("expected %v, got %v", ErrReconstructMismatch, err) } err = r.Reconstruct(toReaders(emptyBuffers(13)), nilWriters(13)) if err != ErrShardNoData { t.Errorf("expected %v, got %v", ErrShardNoData, err) } } func TestStreamVerify(t *testing.T) { perShard := 10 << 20 if testing.Short() { perShard = 50000 } r, err := NewStream(10, 4, testOptions()...) if err != nil { t.Fatal(err) } shards := randomBytes(10, perShard) parb := emptyBuffers(4) err = r.Encode(toReaders(toBuffers(shards)), toWriters(parb)) if err != nil { t.Fatal(err) } parity := toBytes(parb) all := append(toReaders(toBuffers(shards)), toReaders(parb)...) ok, err := r.Verify(all) if err != nil { t.Fatal(err) } if !ok { t.Fatal("Verification failed") } // Flip bits in a random byte parity[0][len(parity[0])-20000] = parity[0][len(parity[0])-20000] ^ 0xff all = append(toReaders(toBuffers(shards)), toReaders(toBuffers(parity))...) ok, err = r.Verify(all) if err != nil { t.Fatal(err) } if ok { t.Fatal("Verification did not fail") } // Re-encode err = r.Encode(toReaders(toBuffers(shards)), toWriters(parb)) if err != nil { t.Fatal(err) } // Fill a data segment with random data shards[0][len(shards[0])-30000] = shards[0][len(shards[0])-30000] ^ 0xff all = append(toReaders(toBuffers(shards)), toReaders(parb)...) ok, err = r.Verify(all) if err != nil { t.Fatal(err) } if ok { t.Fatal("Verification did not fail") } _, err = r.Verify(toReaders(emptyBuffers(10))) if err != ErrTooFewShards { t.Errorf("expected %v, got %v", ErrTooFewShards, err) } _, err = r.Verify(toReaders(emptyBuffers(14))) if err != ErrShardNoData { t.Errorf("expected %v, got %v", ErrShardNoData, err) } } func TestStreamOneEncode(t *testing.T) { codec, err := NewStream(5, 5, testOptions()...) if err != nil { t.Fatal(err) } shards := [][]byte{ {0, 1}, {4, 5}, {2, 3}, {6, 7}, {8, 9}, } parb := emptyBuffers(5) codec.Encode(toReaders(toBuffers(shards)), toWriters(parb)) parity := toBytes(parb) if parity[0][0] != 12 || parity[0][1] != 13 { t.Fatal("shard 5 mismatch") } if parity[1][0] != 10 || parity[1][1] != 11 { t.Fatal("shard 6 mismatch") } if parity[2][0] != 14 || parity[2][1] != 15 { t.Fatal("shard 7 mismatch") } if parity[3][0] != 90 || parity[3][1] != 91 { t.Fatal("shard 8 mismatch") } if parity[4][0] != 94 || parity[4][1] != 95 { t.Fatal("shard 9 mismatch") } all := append(toReaders(toBuffers(shards)), toReaders(toBuffers(parity))...) ok, err := codec.Verify(all) if err != nil { t.Fatal(err) } if !ok { t.Fatal("did not verify") } shards[3][0]++ all = append(toReaders(toBuffers(shards)), toReaders(toBuffers(parity))...) 
ok, err = codec.Verify(all) if err != nil { t.Fatal(err) } if ok { t.Fatal("verify did not fail as expected") } } func benchmarkStreamEncode(b *testing.B, dataShards, parityShards, shardSize int) { r, err := NewStream(dataShards, parityShards, testOptions(WithAutoGoroutines(shardSize))...) if err != nil { b.Fatal(err) } shards := make([][]byte, dataShards) for s := range shards { shards[s] = make([]byte, shardSize) } rand.Seed(0) for s := 0; s < dataShards; s++ { fillRandom(shards[s]) } b.SetBytes(int64(shardSize * dataShards)) b.ResetTimer() out := make([]io.Writer, parityShards) for i := range out { out[i] = ioutil.Discard } for i := 0; i < b.N; i++ { err = r.Encode(toReaders(toBuffers(shards)), out) if err != nil { b.Fatal(err) } } } func BenchmarkStreamEncode10x2x10000(b *testing.B) { benchmarkStreamEncode(b, 10, 2, 10000) } func BenchmarkStreamEncode100x20x10000(b *testing.B) { benchmarkStreamEncode(b, 100, 20, 10000) } func BenchmarkStreamEncode17x3x1M(b *testing.B) { benchmarkStreamEncode(b, 17, 3, 1024*1024) } // Benchmark 10 data shards and 4 parity shards with 16MB each. func BenchmarkStreamEncode10x4x16M(b *testing.B) { benchmarkStreamEncode(b, 10, 4, 16*1024*1024) } // Benchmark 5 data shards and 2 parity shards with 1MB each. func BenchmarkStreamEncode5x2x1M(b *testing.B) { benchmarkStreamEncode(b, 5, 2, 1024*1024) } // Benchmark 10 data shards and 2 parity shards with 1MB each. func BenchmarkStreamEncode10x2x1M(b *testing.B) { benchmarkStreamEncode(b, 10, 2, 1024*1024) } // Benchmark 10 data shards and 4 parity shards with 1MB each. func BenchmarkStreamEncode10x4x1M(b *testing.B) { benchmarkStreamEncode(b, 10, 4, 1024*1024) } // Benchmark 50 data shards and 20 parity shards with 1MB each. func BenchmarkStreamEncode50x20x1M(b *testing.B) { benchmarkStreamEncode(b, 50, 20, 1024*1024) } // Benchmark 17 data shards and 3 parity shards with 16MB each. func BenchmarkStreamEncode17x3x16M(b *testing.B) { benchmarkStreamEncode(b, 17, 3, 16*1024*1024) } func benchmarkStreamVerify(b *testing.B, dataShards, parityShards, shardSize int) { r, err := NewStream(dataShards, parityShards, testOptions(WithAutoGoroutines(shardSize))...)
if err != nil { b.Fatal(err) } shards := make([][]byte, parityShards+dataShards) for s := range shards { shards[s] = make([]byte, shardSize) } rand.Seed(0) for s := 0; s < dataShards; s++ { fillRandom(shards[s]) } err = r.Encode(toReaders(toBuffers(shards[:dataShards])), toWriters(toBuffers(shards[dataShards:]))) if err != nil { b.Fatal(err) } b.SetBytes(int64(shardSize * dataShards)) b.ResetTimer() for i := 0; i < b.N; i++ { _, err = r.Verify(toReaders(toBuffers(shards))) if err != nil { b.Fatal(err) } } } // Benchmark 10 data slices with 2 parity slices holding 10000 bytes each func BenchmarkStreamVerify10x2x10000(b *testing.B) { benchmarkStreamVerify(b, 10, 2, 10000) } // Benchmark 50 data slices with 5 parity slices holding 100000 bytes each func BenchmarkStreamVerify50x5x50000(b *testing.B) { benchmarkStreamVerify(b, 50, 5, 100000) } // Benchmark 10 data slices with 2 parity slices holding 1MB each func BenchmarkStreamVerify10x2x1M(b *testing.B) { benchmarkStreamVerify(b, 10, 2, 1024*1024) } // Benchmark 5 data slices with 2 parity slices holding 1MB each func BenchmarkStreamVerify5x2x1M(b *testing.B) { benchmarkStreamVerify(b, 5, 2, 1024*1024) } // Benchmark 10 data slices with 4 parity slices holding 1MB each func BenchmarkStreamVerify10x4x1M(b *testing.B) { benchmarkStreamVerify(b, 10, 4, 1024*1024) } // Benchmark 50 data slices with 20 parity slices holding 1MB each func BenchmarkStreamVerify50x20x1M(b *testing.B) { benchmarkStreamVerify(b, 50, 20, 1024*1024) } // Benchmark 10 data slices with 4 parity slices holding 16MB each func BenchmarkStreamVerify10x4x16M(b *testing.B) { benchmarkStreamVerify(b, 10, 4, 16*1024*1024) } func TestStreamSplitJoin(t *testing.T) { var data = make([]byte, 250000) rand.Seed(0) fillRandom(data) enc, _ := NewStream(5, 3, testOptions()...) split := emptyBuffers(5) err := enc.Split(bytes.NewBuffer(data), toWriters(split), int64(len(data))) if err != nil { t.Fatal(err) } splits := toBytes(split) expect := len(data) / 5 // Beware, if changing data size if split[0].Len() != expect { t.Errorf("unexpected size. expected %d, got %d", expect, split[0].Len()) } err = enc.Split(bytes.NewBuffer([]byte{}), toWriters(emptyBuffers(3)), 0) if err != ErrShortData { t.Errorf("expected %v, got %v", ErrShortData, err) } buf := new(bytes.Buffer) err = enc.Join(buf, toReaders(toBuffers(splits)), int64(len(data))) if err != nil { t.Fatal(err) } joined := buf.Bytes() if !bytes.Equal(joined, data) { t.Fatal("recovered data does not match original", joined[:8], data[:8], "...
lengths:", len(joined), len(data)) } err = enc.Join(buf, toReaders(emptyBuffers(2)), 0) if err != ErrTooFewShards { t.Errorf("expected %v, got %v", ErrTooFewShards, err) } bufs := toReaders(emptyBuffers(5)) bufs[2] = nil err = enc.Join(buf, bufs, 0) if se, ok := err.(StreamReadError); ok { if se.Err != ErrShardNoData { t.Errorf("expected %v, got %v", ErrShardNoData, se.Err) } if se.Stream != 2 { t.Errorf("Expected error on stream 2, got %d", se.Stream) } } else { t.Errorf("expected error type %T, got %T", StreamReadError{}, err) } err = enc.Join(buf, toReaders(toBuffers(splits)), int64(len(data)+1)) if err != ErrShortData { t.Errorf("expected %v, got %v", ErrShortData, err) } } func TestNewStream(t *testing.T) { tests := []struct { data, parity int err error }{ {127, 127, nil}, {1, 0, nil}, {256, 256, ErrMaxShardNum}, {0, 1, ErrInvShardNum}, {1, -1, ErrInvShardNum}, {257, 1, ErrMaxShardNum}, // overflow causes r.Shards to be negative {256, int(^uint(0) >> 1), errInvalidRowSize}, } for _, test := range tests { _, err := NewStream(test.data, test.parity, testOptions()...) if err != test.err { t.Errorf("New(%v, %v): expected %v, got %v", test.data, test.parity, test.err, err) } } }