==== xorsimd-0.4.1/.gitattributes ====
*.s linguist-language=go

==== xorsimd-0.4.1/.github/workflows/unit-test.yml ====
name: unit-test
on:
  push:
    branches:
      - master
      - release/*
  pull_request:
    branches:
      - master

jobs:
  test:
    name: Test
    runs-on: ubuntu-latest
    steps:
      - name: Set up Go 1.13
        uses: actions/setup-go@v1
        with:
          go-version: 1.13
        id: go

      - name: Check out code into the Go module directory
        uses: actions/checkout@v1

      - name: Get dependencies
        run: |
          go get -v -t -d ./...
          if [ -f Gopkg.toml ]; then
            curl https://raw.githubusercontent.com/golang/dep/master/install.sh | sh
            dep ensure
          fi

      - name: Run test
        run: CGO_ENABLED=1 GO111MODULE=on go test -v -race

==== xorsimd-0.4.1/.gitignore ====
# Binaries for programs and plugins
*.exe
*.exe~
*.dll
*.so
*.dylib

# Test binary, build with `go test -c`
*.test

# Output of the go coverage tool, specifically when used with LiteIDE
*.out

.idea

==== xorsimd-0.4.1/LICENSE ====
MIT License

Copyright (c) 2019 Temple3x (temple3x@gmail.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
==== xorsimd-0.4.1/README.md ====
# XOR SIMD

[![GoDoc][1]][2] [![MIT licensed][3]][4] [![Build Status][5]][6] [![Go Report Card][7]][8] [![Sourcegraph][9]][10]

[1]: https://godoc.org/github.com/templexxx/xorsimd?status.svg
[2]: https://godoc.org/github.com/templexxx/xorsimd
[3]: https://img.shields.io/badge/license-MIT-blue.svg
[4]: LICENSE
[5]: https://github.com/templexxx/xorsimd/workflows/unit-test/badge.svg
[6]: https://github.com/templexxx/xorsimd
[7]: https://goreportcard.com/badge/github.com/templexxx/xorsimd
[8]: https://goreportcard.com/report/github.com/templexxx/xorsimd
[9]: https://sourcegraph.com/github.com/templexxx/xorsimd/-/badge.svg
[10]: https://sourcegraph.com/github.com/templexxx/xorsimd?badge

## Introduction

>- XOR code engine in pure Go.
>
>- [High Performance](https://github.com/templexxx/xorsimd#performance): more than 270 GB/s per physical core.

## Performance

Performance depends mainly on:

>- CPU instruction extension.
>
>- Number of source row vectors.

**Platform:** *AWS c5d.xlarge (Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz)*

**All tests run on a single core.**

`I/O = (src_num + 1) * vector_size / cost`

| Src Num | Vector Size | AVX512 I/O (MB/S) | AVX2 I/O (MB/S) | SSE2 I/O (MB/S) |
|---------|-------------|-------------------|-----------------|-----------------|
| 5       | 4KB         | 270403.73         | 142825.25       | 74443.91        |
| 5       | 1MB         | 26948.34          | 26887.37        | 26950.65        |
| 5       | 8MB         | 17881.32          | 17212.56        | 16402.97        |
| 10      | 4KB         | 190445.30         | 102953.59       | 53244.04        |
| 10      | 1MB         | 26424.44          | 26618.65        | 26094.39        |
| 10      | 8MB         | 15471.31          | 14866.72        | 13565.80        |

==== xorsimd-0.4.1/go.mod ====
module github.com/templexxx/xorsimd

require github.com/templexxx/cpu v0.0.1

go 1.13

==== xorsimd-0.4.1/go.sum ====
github.com/templexxx/cpu v0.0.1 h1:hY4WdLOgKdc8y13EYklu9OUTXik80BkxHoWvTO6MQQY=
github.com/templexxx/cpu v0.0.1/go.mod h1:w7Tb+7qgcAlIyX4NhLuDKt78AHA5SzPmq0Wj6HiEnnk=
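A minimal usage sketch of the public API declared in xor.go below (this example file is not part of the repository; only the module path and the `Encode`/`Bytes` signatures are taken from the sources):

```go
package main

import (
	"fmt"

	"github.com/templexxx/xorsimd"
)

func main() {
	// Encode XORs all source vectors into dst and returns the number of
	// bytes written: the minimum of len(dst) and every len(src[i]).
	src := [][]byte{
		{0x01, 0x02, 0x03, 0x04},
		{0x10, 0x20, 0x30, 0x40},
		{0xff, 0xff, 0xff, 0xff},
	}
	dst := make([]byte, 4)
	n := xorsimd.Encode(dst, src)
	fmt.Println(n, dst) // 4 [238 221 204 187], i.e. 0xee 0xdd 0xcc 0xbb

	// Bytes is the two-source convenience wrapper around Encode.
	xorsimd.Bytes(dst, src[0], src[1])
}
```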
==== xorsimd-0.4.1/xor.go ====
// Copyright (c) 2019. Temple3x (temple3x@gmail.com)
//
// Use of this source code is governed by the MIT License
// that can be found in the LICENSE file.

package xorsimd

import "github.com/templexxx/cpu"

// EnableAVX512 controls whether the AVX512 code paths may be used.
// AVX512 may lower the CPU clock on some parts (maybe not).
// TODO need more research:
// https://lemire.me/blog/2018/04/19/by-how-much-does-avx-512-slow-down-your-cpu-a-first-experiment/
var EnableAVX512 = true

// cpuFeature indicates which instruction set will be used.
var cpuFeature = getCPUFeature()

const (
	avx512 = iota
	avx2
	sse2
	generic
)

// TODO: Add ARM feature...
func getCPUFeature() int {
	if hasAVX512() && EnableAVX512 {
		return avx512
	} else if cpu.X86.HasAVX2 {
		return avx2
	} else {
		return sse2 // amd64 is guaranteed to have SSE2
	}
}

func hasAVX512() (ok bool) {
	return cpu.X86.HasAVX512VL &&
		cpu.X86.HasAVX512BW &&
		cpu.X86.HasAVX512F &&
		cpu.X86.HasAVX512DQ
}

// Encode encodes elements from source slice into a
// destination slice. The source and destination may overlap.
// Encode returns the number of bytes encoded, which will be the minimum of
// len(src[i]) and len(dst).
func Encode(dst []byte, src [][]byte) (n int) {
	n = checkLen(dst, src)
	if n == 0 {
		return
	}

	dst = dst[:n]
	for i := range src {
		src[i] = src[i][:n]
	}

	if len(src) == 1 {
		copy(dst, src[0])
		return
	}

	encode(dst, src)
	return
}

func checkLen(dst []byte, src [][]byte) int {
	n := len(dst)
	for i := range src {
		if len(src[i]) < n {
			n = len(src[i])
		}
	}

	if n <= 0 {
		return 0
	}
	return n
}

// Bytes XORs the bytes in a and b into a
// destination slice. The source and destination may overlap.
//
// Bytes returns the number of bytes encoded, which will be the minimum of
// len(dst), len(a), len(b).
func Bytes(dst, a, b []byte) int {
	return Encode(dst, [][]byte{a, b})
}

==== xorsimd-0.4.1/xor_amd64.go ====
// Copyright (c) 2019. Temple3x (temple3x@gmail.com)
//
// Use of this source code is governed by the MIT License
// that can be found in the LICENSE file.

package xorsimd

func encode(dst []byte, src [][]byte) {
	switch cpuFeature {
	case avx512:
		encodeAVX512(dst, src)
	case avx2:
		encodeAVX2(dst, src)
	default:
		encodeSSE2(dst, src)
	}
}

// Bytes8 XORs 8 bytes of a and b into dst.
// The slice arguments dst, a and b are assumed to have length at least 8;
// if not, Bytes8 will panic.
func Bytes8(dst, a, b []byte) {
	bytes8(&dst[0], &a[0], &b[0])
}

// Bytes16 XORs 16 bytes of a and b into dst.
// The slice arguments dst, a and b are assumed to have length at least 16;
// if not, Bytes16 will panic.
func Bytes16(dst, a, b []byte) {
	bytes16(&dst[0], &a[0], &b[0])
}

// Bytes8Align XORs 8 bytes of a and b into dst.
// The slice arguments dst, a and b are assumed to have length at least 8;
// if not, Bytes8Align will panic.
func Bytes8Align(dst, a, b []byte) {
	bytes8(&dst[0], &a[0], &b[0])
}

// Bytes16Align XORs 16 bytes of a and b into dst.
// The slice arguments dst, a and b are assumed to have length at least 16;
// if not, Bytes16Align will panic.
func Bytes16Align(dst, a, b []byte) {
	bytes16(&dst[0], &a[0], &b[0])
}

// BytesA XORs the len(a) bytes in a and b into a
// destination slice.
// The destination must have enough space.
//
// It's used for encoding small byte slices (up to a few dozen bytes)
// that may not be aligned to 8 or 16 bytes.
// For longer slices, 'func Bytes(dst, a, b []byte)' gives
// better performance.
func BytesA(dst, a, b []byte) {
	bytesN(&dst[0], &a[0], &b[0], len(a))
}

// BytesB XORs the len(b) bytes in a and b into a
// destination slice.
// The destination must have enough space.
//
// It's used for encoding small byte slices (up to a few dozen bytes)
// that may not be aligned to 8 or 16 bytes.
// For longer slices, 'func Bytes(dst, a, b []byte)' gives
// better performance.
func BytesB(dst, a, b []byte) {
	bytesN(&dst[0], &a[0], &b[0], len(b))
}

//go:noescape
func encodeAVX512(dst []byte, src [][]byte)

//go:noescape
func encodeAVX2(dst []byte, src [][]byte)

//go:noescape
func encodeSSE2(dst []byte, src [][]byte)

//go:noescape
func bytesN(dst, a, b *byte, n int)

//go:noescape
func bytes8(dst, a, b *byte)

//go:noescape
func bytes16(dst, a, b *byte)
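For reference, a pure-Go sketch (not a file in the repository) that is equivalent to the `bytes8` kernel implemented in xorbytes_amd64.s below: load two 8-byte words, XOR them, store the result. XOR is bytewise, so the byte order chosen for the loads does not matter as long as the store uses the same one.

```go
package main

import (
	"encoding/binary"
	"fmt"
)

// bytes8Ref mirrors the bytes8 assembly kernel: one 64-bit load per
// operand, one XOR, one 64-bit store. Panics if any slice is shorter
// than 8 bytes, matching the documented behavior of Bytes8.
func bytes8Ref(dst, a, b []byte) {
	x := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b)
	binary.LittleEndian.PutUint64(dst, x)
}

func main() {
	a := []byte{1, 2, 3, 4, 5, 6, 7, 8}
	b := []byte{8, 7, 6, 5, 4, 3, 2, 1}
	dst := make([]byte, 8)
	bytes8Ref(dst, a, b)
	fmt.Println(dst) // [9 5 5 1 1 5 5 9]
}
```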
==== xorsimd-0.4.1/xor_generic.go ====
// Copyright (c) 2019. Temple3x (temple3x@gmail.com)
//
// Use of this source code is governed by the MIT License
// that can be found in the LICENSE file.
//
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !amd64

package xorsimd

import (
	"runtime"
	"unsafe"
)

const wordSize = int(unsafe.Sizeof(uintptr(0)))

const supportsUnaligned = runtime.GOARCH == "386" ||
	runtime.GOARCH == "ppc64" ||
	runtime.GOARCH == "ppc64le" ||
	runtime.GOARCH == "s390x"

func encode(dst []byte, src [][]byte) {
	if supportsUnaligned {
		fastEncode(dst, src, len(dst))
	} else {
		// TODO(hanwen): if (dst, a, b) have common alignment
		// we could still try fastEncode. It is not clear
		// how often this happens, and it's only worth it if
		// the block encryption itself is hardware
		// accelerated.
		safeEncode(dst, src, len(dst))
	}
}

// fastEncode XORs in bulk, a word at a time. It only works on
// architectures that support unaligned reads/writes.
func fastEncode(dst []byte, src [][]byte, n int) {
	w := n / wordSize
	if w > 0 {
		wordBytes := w * wordSize

		wordAlignSrc := make([][]byte, len(src))
		for i := range src {
			wordAlignSrc[i] = src[i][:wordBytes]
		}
		fastEnc(dst[:wordBytes], wordAlignSrc)
	}

	for i := n - n%wordSize; i < n; i++ {
		s := src[0][i]
		for j := 1; j < len(src); j++ {
			s ^= src[j][i]
		}
		dst[i] = s
	}
}

func fastEnc(dst []byte, src [][]byte) {
	dw := *(*[]uintptr)(unsafe.Pointer(&dst))
	sw := make([][]uintptr, len(src))
	for i := range src {
		sw[i] = *(*[]uintptr)(unsafe.Pointer(&src[i]))
	}

	n := len(dst) / wordSize
	for i := 0; i < n; i++ {
		s := sw[0][i]
		for j := 1; j < len(sw); j++ {
			s ^= sw[j][i]
		}
		dw[i] = s
	}
}

func safeEncode(dst []byte, src [][]byte, n int) {
	for i := 0; i < n; i++ {
		s := src[0][i]
		for j := 1; j < len(src); j++ {
			s ^= src[j][i]
		}
		dst[i] = s
	}
}

// Bytes8 XORs 8 bytes of a and b into dst, a word at a time.
// The slice arguments dst, a and b are assumed to have length at least 8;
// if not, Bytes8 will panic.
func Bytes8(dst, a, b []byte) {
	bytesWords(dst[:8], a[:8], b[:8])
}

// Bytes16 XORs 16 bytes of a and b into dst, a word at a time.
// The slice arguments dst, a and b are assumed to have length at least 16;
// if not, Bytes16 will panic.
func Bytes16(dst, a, b []byte) {
	bytesWords(dst[:16], a[:16], b[:16])
}

// bytesWords XORs multiples of 4 or 8 bytes (depending on architecture).
// The slice arguments a and b are assumed to be of equal length.
func bytesWords(dst, a, b []byte) {
	if supportsUnaligned {
		dw := *(*[]uintptr)(unsafe.Pointer(&dst))
		aw := *(*[]uintptr)(unsafe.Pointer(&a))
		bw := *(*[]uintptr)(unsafe.Pointer(&b))
		n := len(b) / wordSize
		for i := 0; i < n; i++ {
			dw[i] = aw[i] ^ bw[i]
		}
	} else {
		n := len(b)
		for i := 0; i < n; i++ {
			dst[i] = a[i] ^ b[i]
		}
	}
}

// Bytes8Align XORs 8 bytes of a and b into dst.
// The slice arguments dst, a and b are assumed to have length at least 8;
// if not, Bytes8Align will panic.
//
// All the byte slices must be aligned to wordSize.
func Bytes8Align(dst, a, b []byte) {
	bytesWordsAlign(dst[:8], a[:8], b[:8])
}

// Bytes16Align XORs 16 bytes of a and b into dst.
// The slice arguments dst, a and b are assumed to have length at least 16;
// if not, Bytes16Align will panic.
//
// All the byte slices must be aligned to wordSize.
func Bytes16Align(dst, a, b []byte) {
	bytesWordsAlign(dst[:16], a[:16], b[:16])
}

// bytesWordsAlign XORs multiples of 4 or 8 bytes (depending on architecture).
// The slice arguments a and b are assumed to be of equal length.
//
// All the byte slices must be aligned to wordSize.
func bytesWordsAlign(dst, a, b []byte) {
	dw := *(*[]uintptr)(unsafe.Pointer(&dst))
	aw := *(*[]uintptr)(unsafe.Pointer(&a))
	bw := *(*[]uintptr)(unsafe.Pointer(&b))
	n := len(b) / wordSize
	for i := 0; i < n; i++ {
		dw[i] = aw[i] ^ bw[i]
	}
}

// BytesA XORs the len(a) bytes in a and b into a
// destination slice.
// The destination must have enough space.
//
// It's used for encoding small byte slices (up to a few dozen bytes)
// that may not be aligned to 8 or 16 bytes.
// For longer slices, 'func Bytes(dst, a, b []byte)' gives
// better performance.
func BytesA(dst, a, b []byte) {
	n := len(a)
	bytesN(dst[:n], a[:n], b[:n], n)
}

// BytesB XORs the len(b) bytes in a and b into a
// destination slice.
// The destination must have enough space.
//
// It's used for encoding small byte slices (up to a few dozen bytes)
// that may not be aligned to 8 or 16 bytes.
// For longer slices, 'func Bytes(dst, a, b []byte)' gives
// better performance.
func BytesB(dst, a, b []byte) {
	n := len(b)
	bytesN(dst[:n], a[:n], b[:n], n)
}

func bytesN(dst, a, b []byte, n int) {
	switch {
	case supportsUnaligned:
		w := n / wordSize
		if w > 0 {
			dw := *(*[]uintptr)(unsafe.Pointer(&dst))
			aw := *(*[]uintptr)(unsafe.Pointer(&a))
			bw := *(*[]uintptr)(unsafe.Pointer(&b))
			for i := 0; i < w; i++ {
				dw[i] = aw[i] ^ bw[i]
			}
		}

		for i := n - n%wordSize; i < n; i++ {
			dst[i] = a[i] ^ b[i]
		}
	default:
		for i := 0; i < n; i++ {
			dst[i] = a[i] ^ b[i]
		}
	}
}
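To make the word-view trick in fastEnc and bytesWords concrete, a standalone sketch (not part of the repository) using the same slice-header reinterpretation. Like the code above, it assumes the platform tolerates word-sized unaligned access, and it only indexes the first len/wordSize elements of the reinterpreted slices, since the header's len field still counts bytes:

```go
package main

import (
	"fmt"
	"unsafe"
)

const wordSize = int(unsafe.Sizeof(uintptr(0)))

func main() {
	a := []byte("abcdefgh12345678")
	b := []byte("hgfedcba87654321")
	dst := make([]byte, 16)

	// Reinterpret each []byte header as a []uintptr header, as fastEnc
	// does, so one loop iteration XORs wordSize bytes at once.
	dw := *(*[]uintptr)(unsafe.Pointer(&dst))
	aw := *(*[]uintptr)(unsafe.Pointer(&a))
	bw := *(*[]uintptr)(unsafe.Pointer(&b))
	for i := 0; i < len(dst)/wordSize; i++ {
		dw[i] = aw[i] ^ bw[i]
	}
	fmt.Printf("%x\n", dst) // each output byte is a[i]^b[i]
}
```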
==== xorsimd-0.4.1/xor_test.go ====
// Copyright (c) 2019. Temple3x (temple3x@gmail.com)
//
// Use of this source code is governed by the MIT License
// that can be found in the LICENSE file.
//
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//
// Parts of these tests are adapted from the Go standard library:
// crypto/cipher/xor_test.go

package xorsimd

import (
	"bytes"
	"fmt"
	"math/rand"
	"testing"
	"time"
	"unsafe"
)

const (
	kb = 1024
	mb = 1024 * 1024

	testSize = kb
)

func TestBytes8(t *testing.T) {
	rand.Seed(time.Now().UnixNano())

	for j := 0; j < 1024; j++ {
		a := make([]byte, 8)
		b := make([]byte, 8)
		fillRandom(a)
		fillRandom(b)

		dst0 := make([]byte, 8)
		Bytes8(dst0, a, b)

		dst1 := make([]byte, 8)
		for i := 0; i < 8; i++ {
			dst1[i] = a[i] ^ b[i]
		}

		if !bytes.Equal(dst0, dst1) {
			t.Fatal("not equal", a, b, dst0, dst1)
		}
	}
}

func TestBytes16(t *testing.T) {
	rand.Seed(time.Now().UnixNano())

	for j := 0; j < 1024; j++ {
		a := make([]byte, 16)
		b := make([]byte, 16)
		fillRandom(a)
		fillRandom(b)

		dst0 := make([]byte, 16)
		Bytes16(dst0, a, b)

		dst1 := make([]byte, 16)
		for i := 0; i < 16; i++ {
			dst1[i] = a[i] ^ b[i]
		}

		if !bytes.Equal(dst0, dst1) {
			t.Fatal("not equal", dst0, dst1, a, b)
		}
	}
}

// ptrSize is the machine word size. (It is named ptrSize rather than
// wordSize so the test file still compiles on !amd64 builds, where
// xor_generic.go already declares wordSize in this package.)
const ptrSize = int(unsafe.Sizeof(uintptr(0)))

func TestBytes8Align(t *testing.T) {
	rand.Seed(time.Now().UnixNano())

	for j := 0; j < 1024; j++ {
		a := make([]byte, 8+ptrSize)
		b := make([]byte, 8+ptrSize)
		dst0 := make([]byte, 8+ptrSize)
		dst1 := make([]byte, 8+ptrSize)

		al := alignment(a)
		offset := 0
		if al != 0 {
			offset = ptrSize - al
		}
		a = a[offset : offset+8]

		al = alignment(b)
		offset = 0
		if al != 0 {
			offset = ptrSize - al
		}
		b = b[offset : offset+8]

		al = alignment(dst0)
		offset = 0
		if al != 0 {
			offset = ptrSize - al
		}
		dst0 = dst0[offset : offset+8]

		al = alignment(dst1)
		offset = 0
		if al != 0 {
			offset = ptrSize - al
		}
		dst1 = dst1[offset : offset+8]

		fillRandom(a)
		fillRandom(b)

		Bytes8Align(dst0, a, b)

		for i := 0; i < 8; i++ {
			dst1[i] = a[i] ^ b[i]
		}

		if !bytes.Equal(dst0, dst1) {
			t.Fatal("not equal", a, b, dst0, dst1)
		}
	}
}

func alignment(s []byte) int {
	return int(uintptr(unsafe.Pointer(&s[0])) & uintptr(ptrSize-1))
}
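// alignTo is an illustrative helper added for exposition; it is not used
// by the tests, which inline this same logic. It advances past at most
// ptrSize-1 bytes so the slice's first byte is word-aligned, then cuts
// the slice to n bytes. The backing array must therefore hold at least
// n+ptrSize bytes, which is why the tests allocate 8+ptrSize/16+ptrSize.
func alignTo(s []byte, n int) []byte {
	offset := 0
	if al := alignment(s); al != 0 {
		offset = ptrSize - al
	}
	return s[offset : offset+n]
}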
func TestBytes16Align(t *testing.T) {
	rand.Seed(time.Now().UnixNano())

	for j := 0; j < 1024; j++ {
		a := make([]byte, 16+ptrSize)
		b := make([]byte, 16+ptrSize)
		dst0 := make([]byte, 16+ptrSize)
		dst1 := make([]byte, 16+ptrSize)

		al := alignment(a)
		offset := 0
		if al != 0 {
			offset = ptrSize - al
		}
		a = a[offset : offset+16]

		al = alignment(b)
		offset = 0
		if al != 0 {
			offset = ptrSize - al
		}
		b = b[offset : offset+16]

		al = alignment(dst0)
		offset = 0
		if al != 0 {
			offset = ptrSize - al
		}
		dst0 = dst0[offset : offset+16]

		al = alignment(dst1)
		offset = 0
		if al != 0 {
			offset = ptrSize - al
		}
		dst1 = dst1[offset : offset+16]

		fillRandom(a)
		fillRandom(b)

		Bytes16Align(dst0, a, b)

		for i := 0; i < 16; i++ {
			dst1[i] = a[i] ^ b[i]
		}

		if !bytes.Equal(dst0, dst1) {
			t.Fatal("not equal", a, b, dst0, dst1)
		}
	}
}

func TestBytesA(t *testing.T) {
	rand.Seed(time.Now().UnixNano())

	for j := 2; j <= 1024; j++ {
		for alignP := 0; alignP < 2; alignP++ {
			p := make([]byte, j)[alignP:]
			q := make([]byte, j)
			d1 := make([]byte, j)
			d2 := make([]byte, j)

			fillRandom(p)
			fillRandom(q)

			BytesA(d1, p, q)
			for i := 0; i < j-alignP; i++ {
				d2[i] = p[i] ^ q[i]
			}

			if !bytes.Equal(d1, d2) {
				t.Fatal("not equal")
			}
		}
	}
}

func TestBytesB(t *testing.T) {
	rand.Seed(time.Now().UnixNano())

	for j := 2; j <= 1024; j++ {
		for alignQ := 0; alignQ < 2; alignQ++ {
			p := make([]byte, j)
			q := make([]byte, j)[alignQ:]
			d1 := make([]byte, j)
			d2 := make([]byte, j)

			fillRandom(p)
			fillRandom(q)

			BytesB(d1, p, q)
			for i := 0; i < j-alignQ; i++ {
				d2[i] = p[i] ^ q[i]
			}

			if !bytes.Equal(d1, d2) {
				t.Fatal("not equal")
			}
		}
	}
}

func TestBytes(t *testing.T) {
	rand.Seed(time.Now().UnixNano())

	for j := 1; j <= 1024; j++ {
		for alignP := 0; alignP < 2; alignP++ {
			for alignQ := 0; alignQ < 2; alignQ++ {
				for alignD := 0; alignD < 2; alignD++ {
					p := make([]byte, j)[alignP:]
					q := make([]byte, j)[alignQ:]
					d1 := make([]byte, j)[alignD:]
					d2 := make([]byte, j)[alignD:]

					fillRandom(p)
					fillRandom(q)

					Bytes(d1, p, q)
					n := min(p, q, d1)
					for i := 0; i < n; i++ {
						d2[i] = p[i] ^ q[i]
					}

					if !bytes.Equal(d1, d2) {
						t.Fatal("not equal")
					}
				}
			}
		}
	}
}

func min(a, b, c []byte) int {
	n := len(a)
	if len(b) < n {
		n = len(b)
	}
	if len(c) < n {
		n = len(c)
	}
	return n
}

func TestEncodeWithFeature(t *testing.T) {
	max := testSize
	switch getCPUFeature() {
	case avx512:
		testEncode(t, max, sse2, -1)
		testEncode(t, max, avx2, sse2)
		testEncode(t, max, avx512, avx2)
	case avx2:
		testEncode(t, max, sse2, -1)
		testEncode(t, max, avx2, sse2)
	case sse2:
		testEncode(t, max, sse2, -1)
	case generic:
		testEncode(t, max, generic, -1)
	}
}
func testEncode(t *testing.T, maxSize, feat, cmpFeat int) {
	rand.Seed(time.Now().UnixNano())
	srcN := randIntn(10, 2) // srcN must be >= 2: with a single source, Encode copies instead of calling encode (see xor.go).

	fs := featToStr(feat)
	for size := 1; size <= maxSize; size++ {
		exp := make([]byte, size)
		src := make([][]byte, srcN)
		for j := 0; j < srcN; j++ {
			src[j] = make([]byte, size)
			fillRandom(src[j])
		}

		if cmpFeat < 0 {
			encodeTested(exp, src)
		} else {
			cpuFeature = cmpFeat
			Encode(exp, src)
		}

		act := make([]byte, size)
		cpuFeature = feat
		Encode(act, src)

		if !bytes.Equal(exp, act) {
			t.Fatalf("%s mismatched with %s, src_num: %d, size: %d",
				fs, featToStr(cmpFeat), srcN, size)
		}
	}

	t.Logf("%s pass src_num:%d, max_size: %d",
		fs, srcN, maxSize)
}

func featToStr(f int) string {
	switch f {
	case avx512:
		return "AVX512"
	case avx2:
		return "AVX2"
	case sse2:
		return "SSE2"
	case generic:
		return "Generic"
	default:
		return "Tested"
	}
}

func encodeTested(dst []byte, src [][]byte) {
	n := len(dst)
	for i := 0; i < n; i++ {
		s := src[0][i]
		for j := 1; j < len(src); j++ {
			s ^= src[j][i]
		}
		dst[i] = s
	}
}

// randIntn returns, as an int, a non-negative pseudo-random number
// in [min,n) from the default Source.
func randIntn(n, min int) int {
	m := rand.Intn(n)
	if m < min {
		m = min
	}
	return m
}

func BenchmarkBytes8(b *testing.B) {
	s0 := make([]byte, 8)
	s1 := make([]byte, 8)
	fillRandom(s0)
	fillRandom(s1)
	dst0 := make([]byte, 8)

	b.ResetTimer()
	b.SetBytes(8)
	for i := 0; i < b.N; i++ {
		Bytes8(dst0, s0, s1)
	}
}

func BenchmarkBytes16(b *testing.B) {
	s0 := make([]byte, 16)
	s1 := make([]byte, 16)
	fillRandom(s0)
	fillRandom(s1)
	dst0 := make([]byte, 16)

	b.ResetTimer()
	b.SetBytes(16)
	for i := 0; i < b.N; i++ {
		Bytes16(dst0, s0, s1)
	}
}

func BenchmarkBytesN_16Bytes(b *testing.B) {
	s0 := make([]byte, 16)
	s1 := make([]byte, 16)
	fillRandom(s0)
	fillRandom(s1)
	dst0 := make([]byte, 16)

	b.ResetTimer()
	b.SetBytes(16)
	for i := 0; i < b.N; i++ {
		BytesA(dst0, s0, s1)
	}
}

func BenchmarkEncode(b *testing.B) {
	sizes := []int{4 * kb, mb, 8 * mb}
	srcNums := []int{5, 10}

	var feats []int
	switch getCPUFeature() {
	case avx512:
		feats = append(feats, avx512)
		feats = append(feats, avx2)
		feats = append(feats, sse2)
	case avx2:
		feats = append(feats, avx2)
		feats = append(feats, sse2)
	case sse2:
		feats = append(feats, sse2)
	default:
		feats = append(feats, generic)
	}

	b.Run("", benchEncRun(benchEnc, srcNums, sizes, feats))
}

func benchEncRun(f func(*testing.B, int, int, int), srcNums, sizes, feats []int) func(*testing.B) {
	return func(b *testing.B) {
		for _, feat := range feats {
			for _, srcNum := range srcNums {
				for _, size := range sizes {
					b.Run(fmt.Sprintf("(%d+1)-%s-%s", srcNum, byteToStr(size), featToStr(feat)),
						func(b *testing.B) {
							f(b, srcNum, size, feat)
						})
				}
			}
		}
	}
}

func benchEnc(b *testing.B, srcNum, size, feat int) {
	dst := make([]byte, size)
	src := make([][]byte, srcNum)
	for i := 0; i < srcNum; i++ {
		src[i] = make([]byte, size)
		fillRandom(src[i])
	}
	cpuFeature = feat

	b.SetBytes(int64((srcNum + 1) * size))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		encode(dst, src)
	}
}

func fillRandom(p []byte) {
	rand.Read(p)
}

func byteToStr(n int) string {
	if n >= mb {
		return fmt.Sprintf("%dMB", n/mb)
	}
	return fmt.Sprintf("%dKB", n/kb)
}
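Before the assembly files: all three encode kernels walk `src [][]byte` by advancing a raw byte offset of 24 per element (`ADDQ $24, d2src_off`, annotated `len(slice) = 24`). That stride is the size of a slice header on amd64, which is three 8-byte words: data pointer, length, capacity. A tiny sketch (not part of the repository) that checks the assumption:

```go
package main

import (
	"fmt"
	"unsafe"
)

func main() {
	var s []byte
	// A slice header is {data uintptr, len int, cap int}:
	// 3 * 8 = 24 bytes on amd64, matching the assembly's stride.
	fmt.Println(unsafe.Sizeof(s)) // 24
}
```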
==== xorsimd-0.4.1/xoravx2_amd64.s ====
// Copyright (c) 2019. Temple3x (temple3x@gmail.com)
//
// Use of this source code is governed by the MIT License
// that can be found in the LICENSE file.

#include "textflag.h"

#define dst BX   // parity's address
#define d2src SI // two-dimension src_slice's address
#define csrc CX  // cnt of src
#define len DX   // len of vect
#define pos R8   // job position in vect

#define csrc_tmp R9
#define d2src_off R10
#define src_tmp R11
#define not_aligned_len R12
#define src_val0 R13
#define src_val1 R14

// func encodeAVX2(dst []byte, src [][]byte)
TEXT ·encodeAVX2(SB), NOSPLIT, $0
	MOVQ  d+0(FP), dst
	MOVQ  src+24(FP), d2src
	MOVQ  c+32(FP), csrc
	MOVQ  l+8(FP), len
	TESTQ $127, len
	JNZ   not_aligned

aligned:
	MOVQ $0, pos

loop128b:
	MOVQ    csrc, csrc_tmp                // store src_cnt -> csrc_tmp
	SUBQ    $2, csrc_tmp
	MOVQ    $0, d2src_off
	MOVQ    (d2src)(d2src_off*1), src_tmp // get first src_vect's addr -> src_tmp
	VMOVDQU (src_tmp)(pos*1), Y0
	VMOVDQU 32(src_tmp)(pos*1), Y1
	VMOVDQU 64(src_tmp)(pos*1), Y2
	VMOVDQU 96(src_tmp)(pos*1), Y3

next_vect:
	ADDQ    $24, d2src_off                // len(slice) = 24
	MOVQ    (d2src)(d2src_off*1), src_tmp // next data_vect
	VMOVDQU (src_tmp)(pos*1), Y4
	VMOVDQU 32(src_tmp)(pos*1), Y5
	VMOVDQU 64(src_tmp)(pos*1), Y6
	VMOVDQU 96(src_tmp)(pos*1), Y7
	VPXOR   Y4, Y0, Y0
	VPXOR   Y5, Y1, Y1
	VPXOR   Y6, Y2, Y2
	VPXOR   Y7, Y3, Y3
	SUBQ    $1, csrc_tmp
	JGE     next_vect

	VMOVDQU Y0, (dst)(pos*1)
	VMOVDQU Y1, 32(dst)(pos*1)
	VMOVDQU Y2, 64(dst)(pos*1)
	VMOVDQU Y3, 96(dst)(pos*1)

	ADDQ $128, pos
	CMPQ len, pos
	JNE  loop128b
	VZEROUPPER
	RET

loop_1b:
	MOVQ csrc, csrc_tmp
	MOVQ $0, d2src_off
	MOVQ (d2src)(d2src_off*1), src_tmp
	SUBQ $2, csrc_tmp
	MOVB -1(src_tmp)(len*1), src_val0 // encode from the end of src

next_vect_1b:
	ADDQ $24, d2src_off
	MOVQ (d2src)(d2src_off*1), src_tmp
	MOVB -1(src_tmp)(len*1), src_val1
	XORB src_val1, src_val0
	SUBQ $1, csrc_tmp
	JGE  next_vect_1b

	MOVB  src_val0, -1(dst)(len*1)
	SUBQ  $1, len
	TESTQ $7, len
	JNZ   loop_1b

	CMPQ len, $0
	JE   ret

	TESTQ $127, len
	JZ    aligned

not_aligned:
	TESTQ $7, len
	JNE   loop_1b
	MOVQ  len, not_aligned_len
	ANDQ  $127, not_aligned_len

loop_8b:
	MOVQ csrc, csrc_tmp
	MOVQ $0, d2src_off
	MOVQ (d2src)(d2src_off*1), src_tmp
	SUBQ $2, csrc_tmp
	MOVQ -8(src_tmp)(len*1), src_val0

next_vect_8b:
	ADDQ $24, d2src_off
	MOVQ (d2src)(d2src_off*1), src_tmp
	MOVQ -8(src_tmp)(len*1), src_val1
	XORQ src_val1, src_val0
	SUBQ $1, csrc_tmp
	JGE  next_vect_8b

	MOVQ src_val0, -8(dst)(len*1)
	SUBQ $8, len
	SUBQ $8, not_aligned_len
	JG   loop_8b

	CMPQ len, $128
	JGE  aligned
	RET

ret:
	RET
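The AVX2 kernel above and the AVX512/SSE2 kernels below share one control-flow shape: peel single bytes off the end until the remaining length is a multiple of 8, then peel 8-byte words until it is a multiple of the SIMD block (128/256/64 bytes respectively), then run the wide main loop from the front. A Go rendering of that shape for the two-operand case (a sketch of the control flow only, with bytewise loops standing in for the SIMD and 8-byte steps):

```go
package main

import "fmt"

// xorBlocked mirrors the kernels' structure: loop_1b, loop_8b, then the
// wide main loop. block must be a power of two (64, 128 or 256 here).
func xorBlocked(dst, a, b []byte, block int) {
	n := len(dst)
	for n&7 != 0 { // loop_1b: trim from the end until 8-byte multiple
		n--
		dst[n] = a[n] ^ b[n]
	}
	for n&(block-1) != 0 { // loop_8b: trim 8 bytes at a time
		n -= 8
		for i := n; i < n+8; i++ {
			dst[i] = a[i] ^ b[i]
		}
	}
	for i := 0; i < n; i++ { // main loop: done with SIMD in the assembly
		dst[i] = a[i] ^ b[i]
	}
}

func main() {
	a, b, dst := make([]byte, 137), make([]byte, 137), make([]byte, 137)
	for i := range a {
		a[i], b[i] = byte(i), byte(^i)
	}
	xorBlocked(dst, a, b, 128)
	fmt.Println(dst[0], dst[136]) // 255 255: i ^ ^i is all ones
}
```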
==== xorsimd-0.4.1/xoravx512_amd64.s ====
// Copyright (c) 2019. Temple3x (temple3x@gmail.com)
//
// Use of this source code is governed by the MIT License
// that can be found in the LICENSE file.

#include "textflag.h"

#define dst BX   // parity's address
#define d2src SI // two-dimension src_slice's address
#define csrc CX  // cnt of src
#define len DX   // len of vect
#define pos R8   // job position in vect

#define csrc_tmp R9
#define d2src_off R10
#define src_tmp R11
#define not_aligned_len R12
#define src_val0 R13
#define src_val1 R14

// func encodeAVX512(dst []byte, src [][]byte)
TEXT ·encodeAVX512(SB), NOSPLIT, $0
	MOVQ  d+0(FP), dst
	MOVQ  src+24(FP), d2src
	MOVQ  c+32(FP), csrc
	MOVQ  l+8(FP), len
	TESTQ $255, len
	JNZ   not_aligned

aligned:
	MOVQ $0, pos

loop256b:
	MOVQ     csrc, csrc_tmp                // store src_cnt -> csrc_tmp
	SUBQ     $2, csrc_tmp
	MOVQ     $0, d2src_off
	MOVQ     (d2src)(d2src_off*1), src_tmp // get first src_vect's addr -> src_tmp
	VMOVDQU8 (src_tmp)(pos*1), Z0
	VMOVDQU8 64(src_tmp)(pos*1), Z1
	VMOVDQU8 128(src_tmp)(pos*1), Z2
	VMOVDQU8 192(src_tmp)(pos*1), Z3

next_vect:
	ADDQ     $24, d2src_off                // len(slice) = 24
	MOVQ     (d2src)(d2src_off*1), src_tmp // next data_vect
	VMOVDQU8 (src_tmp)(pos*1), Z4
	VMOVDQU8 64(src_tmp)(pos*1), Z5
	VMOVDQU8 128(src_tmp)(pos*1), Z6
	VMOVDQU8 192(src_tmp)(pos*1), Z7
	VPXORQ   Z4, Z0, Z0
	VPXORQ   Z5, Z1, Z1
	VPXORQ   Z6, Z2, Z2
	VPXORQ   Z7, Z3, Z3
	SUBQ     $1, csrc_tmp
	JGE      next_vect

	VMOVDQU8 Z0, (dst)(pos*1)
	VMOVDQU8 Z1, 64(dst)(pos*1)
	VMOVDQU8 Z2, 128(dst)(pos*1)
	VMOVDQU8 Z3, 192(dst)(pos*1)

	ADDQ $256, pos
	CMPQ len, pos
	JNE  loop256b
	VZEROUPPER
	RET

loop_1b:
	MOVQ csrc, csrc_tmp
	MOVQ $0, d2src_off
	MOVQ (d2src)(d2src_off*1), src_tmp
	SUBQ $2, csrc_tmp
	MOVB -1(src_tmp)(len*1), src_val0 // encode from the end of src

next_vect_1b:
	ADDQ $24, d2src_off
	MOVQ (d2src)(d2src_off*1), src_tmp
	MOVB -1(src_tmp)(len*1), src_val1
	XORB src_val1, src_val0
	SUBQ $1, csrc_tmp
	JGE  next_vect_1b

	MOVB  src_val0, -1(dst)(len*1)
	SUBQ  $1, len
	TESTQ $7, len
	JNZ   loop_1b

	CMPQ len, $0
	JE   ret

	TESTQ $255, len
	JZ    aligned

not_aligned:
	TESTQ $7, len
	JNE   loop_1b
	MOVQ  len, not_aligned_len
	ANDQ  $255, not_aligned_len

loop_8b:
	MOVQ csrc, csrc_tmp
	MOVQ $0, d2src_off
	MOVQ (d2src)(d2src_off*1), src_tmp
	SUBQ $2, csrc_tmp
	MOVQ -8(src_tmp)(len*1), src_val0

next_vect_8b:
	ADDQ $24, d2src_off
	MOVQ (d2src)(d2src_off*1), src_tmp
	MOVQ -8(src_tmp)(len*1), src_val1
	XORQ src_val1, src_val0
	SUBQ $1, csrc_tmp
	JGE  next_vect_8b

	MOVQ src_val0, -8(dst)(len*1)
	SUBQ $8, len
	SUBQ $8, not_aligned_len
	JG   loop_8b

	CMPQ len, $256
	JGE  aligned
	RET

ret:
	RET
==== xorsimd-0.4.1/xorbytes_amd64.s ====
#include "textflag.h"

// func bytesN(dst, a, b *byte, n int)
TEXT ·bytesN(SB), NOSPLIT, $0
	MOVQ  d+0(FP), BX
	MOVQ  a+8(FP), SI
	MOVQ  b+16(FP), CX
	MOVQ  n+24(FP), DX
	TESTQ $15, DX // AND 15 & len, if not zero jump to not_aligned.
	JNZ   not_aligned

aligned:
	MOVQ $0, AX // position in slices

loop16b:
	MOVOU (SI)(AX*1), X0 // XOR 16byte forwards.
	MOVOU (CX)(AX*1), X1
	PXOR  X1, X0
	MOVOU X0, (BX)(AX*1)
	ADDQ  $16, AX
	CMPQ  DX, AX
	JNE   loop16b
	RET

loop_1b:
	SUBQ  $1, DX // XOR 1byte backwards.
	MOVB  (SI)(DX*1), DI
	MOVB  (CX)(DX*1), AX
	XORB  AX, DI
	MOVB  DI, (BX)(DX*1)
	TESTQ $7, DX // AND 7 & len, if not zero jump to loop_1b.
	JNZ   loop_1b

	CMPQ DX, $0 // if len is 0, ret.
	JE   ret

	TESTQ $15, DX // AND 15 & len, if zero jump to aligned.
	JZ    aligned

not_aligned:
	TESTQ $7, DX // AND $7 & len, if not zero jump to loop_1b.
	JNE   loop_1b
	SUBQ  $8, DX // XOR 8bytes backwards.
	MOVQ  (SI)(DX*1), DI
	MOVQ  (CX)(DX*1), AX
	XORQ  AX, DI
	MOVQ  DI, (BX)(DX*1)

	CMPQ DX, $16 // if len is greater or equal 16 here, it must be aligned.
	JGE  aligned

ret:
	RET

// func bytes8(dst, a, b *byte)
TEXT ·bytes8(SB), NOSPLIT, $0
	MOVQ d+0(FP), BX
	MOVQ a+8(FP), SI
	MOVQ b+16(FP), CX
	MOVQ (SI), DI
	MOVQ (CX), AX
	XORQ AX, DI
	MOVQ DI, (BX)
	RET

// func bytes16(dst, a, b *byte)
TEXT ·bytes16(SB), NOSPLIT, $0
	MOVQ  d+0(FP), BX
	MOVQ  a+8(FP), SI
	MOVQ  b+16(FP), CX
	MOVOU (SI), X0
	MOVOU (CX), X1
	PXOR  X1, X0
	MOVOU X0, (BX)
	RET

==== xorsimd-0.4.1/xorsse2_amd64.s ====
// Copyright (c) 2019. Temple3x (temple3x@gmail.com)
//
// Use of this source code is governed by the MIT License
// that can be found in the LICENSE file.

#include "textflag.h"

#define dst BX   // parity's address
#define d2src SI // two-dimension src_slice's address
#define csrc CX  // cnt of src
#define len DX   // len of vect
#define pos R8   // job position in vect

#define csrc_tmp R9
#define d2src_off R10
#define src_tmp R11
#define not_aligned_len R12
#define src_val0 R13
#define src_val1 R14

// func encodeSSE2(dst []byte, src [][]byte)
TEXT ·encodeSSE2(SB), NOSPLIT, $0
	MOVQ  d+0(FP), dst
	MOVQ  src+24(FP), d2src
	MOVQ  c+32(FP), csrc
	MOVQ  l+8(FP), len
	TESTQ $63, len
	JNZ   not_aligned

aligned:
	MOVQ $0, pos

loop64b:
	MOVQ  csrc, csrc_tmp
	SUBQ  $2, csrc_tmp
	MOVQ  $0, d2src_off
	MOVQ  (d2src)(d2src_off*1), src_tmp
	MOVOU (src_tmp)(pos*1), X0
	MOVOU 16(src_tmp)(pos*1), X1
	MOVOU 32(src_tmp)(pos*1), X2
	MOVOU 48(src_tmp)(pos*1), X3

next_vect:
	ADDQ  $24, d2src_off
	MOVQ  (d2src)(d2src_off*1), src_tmp
	MOVOU (src_tmp)(pos*1), X4
	MOVOU 16(src_tmp)(pos*1), X5
	MOVOU 32(src_tmp)(pos*1), X6
	MOVOU 48(src_tmp)(pos*1), X7
	PXOR  X4, X0
	PXOR  X5, X1
	PXOR  X6, X2
	PXOR  X7, X3
	SUBQ  $1, csrc_tmp
	JGE   next_vect

	MOVOU X0, (dst)(pos*1)
	MOVOU X1, 16(dst)(pos*1)
	MOVOU X2, 32(dst)(pos*1)
	MOVOU X3, 48(dst)(pos*1)

	ADDQ $64, pos
	CMPQ len, pos
	JNE  loop64b
	RET

loop_1b:
	MOVQ csrc, csrc_tmp
	MOVQ $0, d2src_off
	MOVQ (d2src)(d2src_off*1), src_tmp
	SUBQ $2, csrc_tmp
	MOVB -1(src_tmp)(len*1), src_val0

next_vect_1b:
	ADDQ $24, d2src_off
	MOVQ (d2src)(d2src_off*1), src_tmp
	MOVB -1(src_tmp)(len*1), src_val1
	XORB src_val1, src_val0
	SUBQ $1, csrc_tmp
	JGE  next_vect_1b

	MOVB  src_val0, -1(dst)(len*1)
	SUBQ  $1, len
	TESTQ $7, len
	JNZ   loop_1b

	CMPQ len, $0
	JE   ret

	TESTQ $63, len
	JZ    aligned

not_aligned:
	TESTQ $7, len
	JNE   loop_1b
	MOVQ  len, not_aligned_len
	ANDQ  $63, not_aligned_len

loop_8b:
	MOVQ csrc, csrc_tmp
	MOVQ $0, d2src_off
	MOVQ (d2src)(d2src_off*1), src_tmp
	SUBQ $2, csrc_tmp
	MOVQ -8(src_tmp)(len*1), src_val0

next_vect_8b:
	ADDQ $24, d2src_off
	MOVQ (d2src)(d2src_off*1), src_tmp
	MOVQ -8(src_tmp)(len*1), src_val1
	XORQ src_val1, src_val0
	SUBQ $1, csrc_tmp
	JGE  next_vect_8b

	MOVQ src_val0, -8(dst)(len*1)
	SUBQ $8, len
	SUBQ $8, not_aligned_len
	JG   loop_8b

	CMPQ len, $64
	JGE  aligned
	RET

ret:
	RET