pax_global_header00006660000000000000000000000064146147205530014521gustar00rootroot0000000000000052 comment=4f5562c16b85a956bc8d650dc15344e1e1eb7c8c blake3-1.3.0/000077500000000000000000000000001461472055300126635ustar00rootroot00000000000000blake3-1.3.0/LICENSE000066400000000000000000000020701461472055300136670ustar00rootroot00000000000000The MIT License (MIT) Copyright (c) 2020 Luke Champine Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. blake3-1.3.0/README.md000066400000000000000000000042041461472055300141420ustar00rootroot00000000000000blake3 ------ [![GoDoc](https://godoc.org/lukechampine.com/blake3?status.svg)](https://godoc.org/lukechampine.com/blake3) [![Go Report Card](http://goreportcard.com/badge/lukechampine.com/blake3)](https://goreportcard.com/report/lukechampine.com/blake3) ``` go get lukechampine.com/blake3 ``` `blake3` implements the [BLAKE3 cryptographic hash function](https://github.com/BLAKE3-team/BLAKE3). 
This implementation aims to be performant without sacrificing (too much) readability, in the hopes of eventually landing in `x/crypto`. In addition to the pure-Go implementation, this package also contains AVX-512 and AVX2 routines (generated by [`avo`](https://github.com/mmcloughlin/avo)) that greatly increase performance for large inputs and outputs. ## Benchmarks Tested on a 2020 MacBook Air (i5-7600K @ 3.80GHz). Benchmarks will improve as soon as I get access to a beefier AVX-512 machine. :wink: ### AVX-512 ``` BenchmarkSum256/64 120 ns/op 533.00 MB/s BenchmarkSum256/1024 2229 ns/op 459.36 MB/s BenchmarkSum256/65536 16245 ns/op 4034.11 MB/s BenchmarkWrite 245 ns/op 4177.38 MB/s BenchmarkXOF 246 ns/op 4159.30 MB/s ``` ### AVX2 ``` BenchmarkSum256/64 120 ns/op 533.00 MB/s BenchmarkSum256/1024 2229 ns/op 459.36 MB/s BenchmarkSum256/65536 31137 ns/op 2104.76 MB/s BenchmarkWrite 487 ns/op 2103.12 MB/s BenchmarkXOF 329 ns/op 3111.27 MB/s ``` ### Pure Go ``` BenchmarkSum256/64 120 ns/op 533.00 MB/s BenchmarkSum256/1024 2229 ns/op 459.36 MB/s BenchmarkSum256/65536 133505 ns/op 490.89 MB/s BenchmarkWrite 2022 ns/op 506.36 MB/s BenchmarkXOF 1914 ns/op 534.98 MB/s ``` ## Shortcomings There is no assembly routine for single-block compressions. This is most noticeable for ~1KB inputs. Each assembly routine inlines all 7 rounds, causing thousands of lines of duplicated code. Ideally the routines could be merged such that only a single routine is generated for AVX-512 and AVX2, without sacrificing too much performance. blake3-1.3.0/avo/000077500000000000000000000000001461472055300134505ustar00rootroot00000000000000blake3-1.3.0/avo/gen.go000066400000000000000000000333451461472055300145600ustar00rootroot00000000000000//go:build ignore // +build ignore package main import ( "fmt" . "github.com/mmcloughlin/avo/build" . "github.com/mmcloughlin/avo/operand" . 
"github.com/mmcloughlin/avo/reg" ) func main() { genGlobals() genCompressBlocksAVX512() genCompressChunksAVX512() genCompressBlocksAVX2() genCompressChunksAVX2() genCompressParentsAVX2() Generate() } var globals struct { iv Mem seq Mem seq64 Mem // for loadCounter shuffleRot8 Mem shuffleRot16 Mem } func genGlobals() { globals.iv = GLOBL("iv", RODATA|NOPTR) DATA(0*4, U32(0x6A09E667)) DATA(1*4, U32(0xBB67AE85)) DATA(2*4, U32(0x3C6EF372)) DATA(3*4, U32(0xA54FF53A)) globals.seq = GLOBL("seq", RODATA|NOPTR) for i := 0; i < 16; i++ { DATA(i*4, U32(i)) } globals.seq64 = GLOBL("seq64", RODATA|NOPTR) for i := 0; i < 8; i++ { DATA(i*8, U64(i)) } globals.shuffleRot8 = GLOBL("shuffle_rot8", RODATA|NOPTR) for i := 0; i < 8; i++ { DATA(i*4, U32(0x00030201+0x04040404*i)) } globals.shuffleRot16 = GLOBL("shuffle_rot16", RODATA|NOPTR) for i := 0; i < 8; i++ { DATA(i*4, U32(0x01000302+0x04040404*i)) } } func genCompressBlocksAVX512() { TEXT("compressBlocksAVX512", NOSPLIT, "func(out *[1024]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)") out := Mem{Base: Load(Param("out"), GP64())} block := Mem{Base: Load(Param("block"), GP64())} cv := Mem{Base: Load(Param("cv"), GP64())} counter, _ := Param("counter").Resolve() blockLen, _ := Param("blockLen").Resolve() flags, _ := Param("flags").Resolve() Comment("Initialize block vectors") var vs, mv [16]VecVirtual for i := range vs { vs[i], mv[i] = ZMM(), ZMM() VPBROADCASTD(block.Offset(i*4), mv[i]) } Comment("Initialize state vectors") for i, v := range vs { switch i { case 0, 1, 2, 3, 4, 5, 6, 7: // cv VPBROADCASTD(cv.Offset(i*4), v) case 8, 9, 10, 11: // iv VPBROADCASTD(globals.iv.Offset((i-8)*4), v) case 12: // counter VPBROADCASTD(counter.Addr, vs[12]) VPADDD(globals.seq, vs[12], vs[12]) // set a 1 bit in K1 for each overflowed counter in vs[12] VPCMPUD(Imm(1), globals.seq, vs[12], K1) // add 1 to each counter in vs[13] for each 1 bit in K1 VPBROADCASTD(counter.Addr.Offset(1*4), vs[13]) 
VPADDD_BCST(globals.seq.Offset(4), vs[13], K1, vs[13]) case 14: // blockLen VPBROADCASTD(blockLen.Addr, v) case 15: // flags VPBROADCASTD(flags.Addr, v) } } performRoundsAVX512(vs, mv) Comment("Finalize CVs") for i, v := range vs[:8] { VPXORD(v, vs[i+8], v) } for i, v := range vs[8:] { VPXORD_BCST(cv.Offset(i*4), v, v) } stride := ZMM() VMOVDQU32(globals.seq, stride) VPSLLD(Imm(6), stride, stride) // stride of 64 for i, v := range vs { KXNORD(K1, K1, K1) // fastest way to set all bits to 1 VPSCATTERDD(v, K1, out.Offset(i*4).Idx(stride, 1)) } RET() } func genCompressChunksAVX512() { TEXT("compressChunksAVX512", NOSPLIT, "func(cvs *[16][8]uint32, buf *[16384]byte, key *[8]uint32, counter uint64, flags uint32)") cvs := Mem{Base: Load(Param("cvs"), GP64())} buf := Mem{Base: Load(Param("buf"), GP64())} key := Mem{Base: Load(Param("key"), GP64())} counter, _ := Param("counter").Resolve() flags, _ := Param("flags").Resolve() var vs, mv [16]VecVirtual for i := range vs { vs[i], mv[i] = ZMM(), ZMM() } Comment("Initialize counter") counterLo := AllocLocal(64) counterHi := AllocLocal(64) VPBROADCASTD(counter.Addr, vs[0]) VPADDD(globals.seq, vs[0], vs[0]) VPCMPUD(Imm(1), globals.seq, vs[0], K1) VPBROADCASTD(counter.Addr.Offset(4), vs[1]) VPADDD_BCST(globals.seq.Offset(4), vs[1], K1, vs[1]) VMOVDQU32(vs[0], counterLo) VMOVDQU32(vs[1], counterHi) Comment("Initialize flags") chunkFlags := AllocLocal(16 * 4) VPBROADCASTD(flags.Addr, vs[0]) VMOVDQU32(vs[0], chunkFlags) ORL(Imm(1), chunkFlags.Offset(0*4)) ORL(Imm(2), chunkFlags.Offset(15*4)) Comment("Load key") for i := 0; i < 8; i++ { VPBROADCASTD(key.Offset(i*4), vs[i]) } Comment("Loop index") loop := GP64() XORQ(loop, loop) Label("loop") Comment("Load transposed block") VMOVDQU32(globals.seq, vs[8]) VPSLLD(Imm(10), vs[8], vs[8]) // stride of 1024 for i, m := range mv { KXNORD(K1, K1, K1) VPGATHERDD(buf.Offset(i*4).Idx(vs[8], 1), K1, m) } ADDQ(Imm(64), buf.Base) Comment("Reload state vectors (other than CVs)") for i := 0; i < 4; 
i++ { VPBROADCASTD(globals.iv.Offset(i*4), vs[8+i]) } VMOVDQU32(counterLo, vs[12]) VMOVDQU32(counterHi, vs[13]) VPBROADCASTD(globals.seq.Offset(4), vs[14]) VPSLLD(Imm(6), vs[14], vs[14]) // 64 VPBROADCASTD(chunkFlags.Idx(loop, 4), vs[15]) performRoundsAVX512(vs, mv) Comment("Finalize CVs") for i := range vs[:8] { VPXORD(vs[i], vs[i+8], vs[i]) } Comment("Loop") INCQ(loop) CMPQ(loop, U32(16)) JNE(LabelRef("loop")) Comment("Finished; transpose CVs") VMOVDQU32(globals.seq, vs[8]) VPSLLD(Imm(5), vs[8], vs[8]) // stride of 32 for i, v := range vs[:8] { KXNORD(K1, K1, K1) // fastest way to set all bits to 1 VPSCATTERDD(v, K1, cvs.Offset(i*4).Idx(vs[8], 1)) } RET() } func performRoundsAVX512(vs, mv [16]VecVirtual) { g := func(a, b, c, d, mx, my VecVirtual) { VPADDD(a, b, a) VPADDD(mx, a, a) VPXORD(d, a, d) VPRORD(Imm(16), d, d) VPADDD(c, d, c) VPXORD(b, c, b) VPRORD(Imm(12), b, b) VPADDD(a, b, a) VPADDD(my, a, a) VPXORD(d, a, d) VPRORD(Imm(8), d, d) VPADDD(c, d, c) VPXORD(b, c, b) VPRORD(Imm(7), b, b) } for i := 0; i < 7; i++ { Comment(fmt.Sprintf("Round %v", i+1)) g(vs[0], vs[4], vs[8], vs[12], mv[0], mv[1]) g(vs[1], vs[5], vs[9], vs[13], mv[2], mv[3]) g(vs[2], vs[6], vs[10], vs[14], mv[4], mv[5]) g(vs[3], vs[7], vs[11], vs[15], mv[6], mv[7]) g(vs[0], vs[5], vs[10], vs[15], mv[8], mv[9]) g(vs[1], vs[6], vs[11], vs[12], mv[10], mv[11]) g(vs[2], vs[7], vs[8], vs[13], mv[12], mv[13]) g(vs[3], vs[4], vs[9], vs[14], mv[14], mv[15]) // permute mv = [16]VecVirtual{ mv[2], mv[6], mv[3], mv[10], mv[7], mv[0], mv[4], mv[13], mv[1], mv[11], mv[12], mv[5], mv[9], mv[14], mv[15], mv[8], } } } func genCompressBlocksAVX2() { TEXT("compressBlocksAVX2", NOSPLIT, "func(out *[512]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)") out := Mem{Base: Load(Param("out"), GP64())} block := Mem{Base: Load(Param("block"), GP64())} cv := Mem{Base: Load(Param("cv"), GP64())} counter, _ := Param("counter").Resolve() blockLen, _ := Param("blockLen").Resolve() flags, 
_ := Param("flags").Resolve() var vs [16]VecVirtual var mv [16]Mem for i := range vs { vs[i] = YMM() mv[i] = AllocLocal(32) } Comment("Load block") for i := 0; i < 16; i++ { VPBROADCASTD(block.Offset(i*4), vs[0]) VMOVDQU(vs[0], mv[i]) } Comment("Initialize state vectors") for i, v := range vs { switch i { case 0, 1, 2, 3, 4, 5, 6, 7: // cv VPBROADCASTD(cv.Offset(i*4), v) case 8, 9, 10, 11: // iv VPBROADCASTD(globals.iv.Offset((i-8)*4), v) case 12: // counter loadCounter(counter.Addr, vs[12:14], vs[14:16]) case 14: // blockLen VPBROADCASTD(blockLen.Addr, v) case 15: // flags VPBROADCASTD(flags.Addr, v) } } performRoundsAVX2(vs, mv) Comment("Finalize CVs") for i := 8; i < 16; i++ { VMOVDQU(vs[i], mv[i]) } for i := range vs[:8] { VPXOR(vs[i], vs[i+8], vs[i]) } transpose(vs[:8], vs[8:]) for i, v := range vs[8:] { VMOVDQU(v, out.Offset(i*64)) } for i := 8; i < 16; i++ { VMOVDQU(mv[i], vs[i]) } for i, v := range vs[8:] { VPBROADCASTD(cv.Offset(i*4), vs[0]) VPXOR(vs[0], v, v) } transpose(vs[8:], vs[:8]) for i, v := range vs[:8] { VMOVDQU(v, out.Offset(i*64+32)) } // See https://community.intel.com/t5/Intel-ISA-Extensions/What-is-the-status-of-VZEROUPPER-use/m-p/1098375 VZEROUPPER() RET() } func genCompressChunksAVX2() { TEXT("compressChunksAVX2", NOSPLIT, "func(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32)") cvs := Mem{Base: Load(Param("cvs"), GP64())} buf := Mem{Base: Load(Param("buf"), GP64())} key := Mem{Base: Load(Param("key"), GP64())} counter, _ := Param("counter").Resolve() flags, _ := Param("flags").Resolve() var vs [16]VecVirtual var mv [16]Mem for i := range vs { vs[i] = YMM() mv[i] = AllocLocal(32) } Comment("Load key") for i := 0; i < 8; i++ { VPBROADCASTD(key.Offset(i*4), vs[i]) } Comment("Initialize counter") counterLo := AllocLocal(32) counterHi := AllocLocal(32) loadCounter(counter.Addr, vs[12:14], vs[14:16]) VMOVDQU(vs[12], counterLo) VMOVDQU(vs[13], counterHi) Comment("Initialize flags") chunkFlags := AllocLocal(16 * 
4) VPBROADCASTD(flags.Addr, vs[14]) VMOVDQU(vs[14], chunkFlags.Offset(0*32)) VMOVDQU(vs[14], chunkFlags.Offset(1*32)) ORL(Imm(1), chunkFlags.Offset(0*4)) ORL(Imm(2), chunkFlags.Offset(15*4)) Comment("Loop index") loop := GP64() XORQ(loop, loop) Label("loop") Comment("Load transposed block") VMOVDQU(globals.seq, vs[9]) VPSLLD(Imm(10), vs[9], vs[9]) // stride of 1024 for i := 0; i < 16; i++ { VPCMPEQD(vs[8], vs[8], vs[8]) // fastest way to set all bits to 1 VPGATHERDD(vs[8], buf.Offset(i*4).Idx(vs[9], 1), vs[10]) VMOVDQU(vs[10], mv[i]) } ADDQ(Imm(64), buf.Base) Comment("Reload state vectors (other than CVs)") for i := 0; i < 4; i++ { VPBROADCASTD(globals.iv.Offset(i*4), vs[8+i]) } VMOVDQU(counterLo, vs[12]) VMOVDQU(counterHi, vs[13]) VPBROADCASTD(globals.seq.Offset(4), vs[14]) VPSLLD(Imm(6), vs[14], vs[14]) // 64 VPBROADCASTD(chunkFlags.Idx(loop, 4), vs[15]) performRoundsAVX2(vs, mv) Comment("Finalize CVs") for i := range vs[:8] { VPXOR(vs[i], vs[i+8], vs[i]) } Comment("Loop") INCQ(loop) CMPQ(loop, U32(16)) JNE(LabelRef("loop")) Comment("Finished; transpose CVs") transpose(vs[:8], vs[8:]) for i, v := range vs[8:] { VMOVDQU(v, cvs.Offset(i*32)) } VZEROUPPER() RET() } func genCompressParentsAVX2() { TEXT("compressParentsAVX2", NOSPLIT, "func(parents *[8][8]uint32, cvs *[16][8]uint32, key *[8]uint32, flags uint32)") parents := Mem{Base: Load(Param("parents"), GP64())} cvs := Mem{Base: Load(Param("cvs"), GP64())} key := Mem{Base: Load(Param("key"), GP64())} flags, _ := Param("flags").Resolve() var vs [16]VecVirtual var mv [16]Mem for i := range vs { vs[i] = YMM() mv[i] = AllocLocal(32) } Comment("Load transposed block") VMOVDQU(globals.seq, vs[9]) VPSLLD(Imm(6), vs[9], vs[9]) // stride of 64 for i := 0; i < 16; i++ { VPCMPEQD(vs[8], vs[8], vs[8]) // fastest way to set all bits to 1 VPGATHERDD(vs[8], cvs.Offset(i*4).Idx(vs[9], 1), vs[10]) VMOVDQU(vs[10], mv[i]) } Comment("Initialize state vectors") for i, v := range vs { switch i { case 0, 1, 2, 3, 4, 5, 6, 7: // cv 
VPBROADCASTD(key.Offset(i*4), v) case 8, 9, 10, 11: // iv VPBROADCASTD(globals.iv.Offset((i-8)*4), v) case 12, 13: // counter VPXOR(v, v, v) case 14: // blockLen VPBROADCASTD(globals.seq.Offset(1*4), v) VPSLLD(Imm(6), v, v) // 64 case 15: // flags ORL(Imm(4), flags.Addr) // flagParent VPBROADCASTD(flags.Addr, v) } } performRoundsAVX2(vs, mv) Comment("Finalize CVs") for i := range vs[:8] { VPXOR(vs[i], vs[i+8], vs[i]) } transpose(vs[:8], vs[8:]) for i, v := range vs[8:] { VMOVDQU(v, parents.Offset(i*32)) } VZEROUPPER() RET() } func performRoundsAVX2(sv [16]VecVirtual, mv [16]Mem) { spillMem := AllocLocal(32) tmp := sv[8] g := func(a, b, c, d VecVirtual, mx, my Mem) { // Helper function for performing rotations. Also manages c, tmp and // spillMem: if c == tmp, we need to spill and reload c using spillMem. rotr := func(v VecVirtual, n uint64, dst VecVirtual) { switch n { case 8, 16: shuf := [...]Mem{8: globals.shuffleRot8, 16: globals.shuffleRot16}[n] VPSHUFB(shuf, v, dst) if c == tmp { VMOVDQU(spillMem, c) } case 7, 12: if c == tmp { VMOVDQU(c, spillMem) } VPSRLD(Imm(n), v, tmp) VPSLLD(Imm(32-n), v, dst) VPOR(dst, tmp, dst) } } VPADDD(a, b, a) VPADDD(mx, a, a) VPXOR(d, a, d) rotr(d, 16, d) VPADDD(c, d, c) VPXOR(b, c, b) rotr(b, 12, b) VPADDD(a, b, a) VPADDD(my, a, a) VPXOR(d, a, d) rotr(d, 8, d) VPADDD(c, d, c) VPXOR(b, c, b) rotr(b, 7, b) } VMOVDQU(sv[8], spillMem) // spill for i := 0; i < 7; i++ { Comment(fmt.Sprintf("Round %v", i+1)) g(sv[0], sv[4], sv[8], sv[12], mv[0], mv[1]) g(sv[1], sv[5], sv[9], sv[13], mv[2], mv[3]) g(sv[2], sv[6], sv[10], sv[14], mv[4], mv[5]) g(sv[3], sv[7], sv[11], sv[15], mv[6], mv[7]) g(sv[0], sv[5], sv[10], sv[15], mv[8], mv[9]) g(sv[1], sv[6], sv[11], sv[12], mv[10], mv[11]) g(sv[2], sv[7], sv[8], sv[13], mv[12], mv[13]) g(sv[3], sv[4], sv[9], sv[14], mv[14], mv[15]) // permute mv = [16]Mem{ mv[2], mv[6], mv[3], mv[10], mv[7], mv[0], mv[4], mv[13], mv[1], mv[11], mv[12], mv[5], mv[9], mv[14], mv[15], mv[8], } } VMOVDQU(spillMem, 
sv[8]) // reload } func loadCounter(counter Mem, dst, scratch []VecVirtual) { // fill dst[0] and dst[1] with counter + 0,1,2,3,4,5,6,7, then transpose so // that dst[0] contains low 32 bits and dst[1] contains high 32 bits. VPBROADCASTQ(counter, dst[0]) VPBROADCASTQ(counter, dst[1]) VPADDQ(globals.seq64.Offset(0*4), dst[0], dst[0]) VPADDQ(globals.seq64.Offset(8*4), dst[1], dst[1]) VPUNPCKLDQ(dst[1], dst[0], scratch[0]) VPUNPCKHDQ(dst[1], dst[0], scratch[1]) VPUNPCKLDQ(scratch[1], scratch[0], dst[0]) VPUNPCKHDQ(scratch[1], scratch[0], dst[1]) const perm = 0<<0 | 2<<2 | 1<<4 | 3<<6 VPERMQ(Imm(perm), dst[0], dst[0]) VPERMQ(Imm(perm), dst[1], dst[1]) } func transpose(src, dst []VecVirtual) { // interleave uint32s for i := 0; i < 8; i += 2 { VPUNPCKLDQ(src[i+1], src[i], dst[i+0]) VPUNPCKHDQ(src[i+1], src[i], dst[i+1]) } // interleave groups of two uint32s for i := 0; i < 4; i++ { j := i*2 - i%2 // j := 0,1,4,5 VPUNPCKLQDQ(dst[j+2], dst[j], src[i*2+0]) VPUNPCKHQDQ(dst[j+2], dst[j], src[i*2+1]) } // interleave groups of four uint32s for i := 0; i < 4; i++ { VPERM2I128(Imm(0x20), src[i+4], src[i], dst[i+0]) VPERM2I128(Imm(0x31), src[i+4], src[i], dst[i+4]) } } blake3-1.3.0/bao/000077500000000000000000000000001461472055300134245ustar00rootroot00000000000000blake3-1.3.0/bao/bao.go000066400000000000000000000260761461472055300145270ustar00rootroot00000000000000// Package bao implements BLAKE3 verified streaming. 
package bao import ( "bytes" "encoding/binary" "errors" "io" "math/bits" "lukechampine.com/blake3/guts" ) func bytesToCV(b []byte) (cv [8]uint32) { _ = b[31] // bounds check hint for i := range cv { cv[i] = binary.LittleEndian.Uint32(b[4*i:]) } return cv } func cvToBytes(cv *[8]uint32) *[32]byte { var b [32]byte for i, w := range cv { binary.LittleEndian.PutUint32(b[4*i:], w) } return &b } func compressGroup(p []byte, counter uint64) guts.Node { var stack [54 - guts.MaxSIMD][8]uint32 var sc uint64 pushSubtree := func(cv [8]uint32) { i := 0 for sc&(1< 0 { if buflen == len(buf) { pushSubtree(guts.ChainingValue(guts.CompressBuffer(&buf, buflen, &guts.IV, counter+(sc*guts.MaxSIMD), 0))) buflen = 0 } n := copy(buf[buflen:], p) buflen += n p = p[n:] } n := guts.CompressBuffer(&buf, buflen, &guts.IV, counter+(sc*guts.MaxSIMD), 0) for i := bits.TrailingZeros64(sc); i < bits.Len64(sc); i++ { if sc&(1< 0 { chunks := (dataLen + groupSize - 1) / groupSize cvs := 2*chunks - 2 // no I will not elaborate size += cvs * 32 } if !outboard { size += dataLen } return size } // Encode computes the intermediate BLAKE3 tree hashes of data and writes them // to dst. If outboard is false, the contents of data are also written to dst, // interleaved with the tree hashes. It also returns the tree root, i.e. the // 256-bit BLAKE3 hash. The group parameter controls how many chunks are hashed // per "group," as a power of 2; for standard Bao, use 0. // // Note that dst is not written sequentially, and therefore must be initialized // with sufficient capacity to hold the encoding; see EncodedSize. 
func Encode(dst io.WriterAt, data io.Reader, dataLen int64, group int, outboard bool) ([32]byte, error) { groupSize := uint64(guts.ChunkSize << group) buf := make([]byte, groupSize) var err error read := func(p []byte) []byte { if err == nil { _, err = io.ReadFull(data, p) } return p } write := func(p []byte, off uint64) { if err == nil { _, err = dst.WriteAt(p, int64(off)) } } var counter uint64 // NOTE: unlike the reference implementation, we write directly in // pre-order, rather than writing in post-order and then flipping. This cuts // the I/O required in half, at the cost of making it a lot trickier to hash // multiple groups in SIMD. However, you can still get the SIMD speedup if // group > 0, so maybe just do that. var rec func(bufLen uint64, flags uint32, off uint64) (uint64, [8]uint32) rec = func(bufLen uint64, flags uint32, off uint64) (uint64, [8]uint32) { if err != nil { return 0, [8]uint32{} } else if bufLen <= groupSize { g := read(buf[:bufLen]) if !outboard { write(g, off) } n := compressGroup(g, counter) counter += bufLen / guts.ChunkSize n.Flags |= flags return 0, guts.ChainingValue(n) } mid := uint64(1) << (bits.Len64(bufLen-1) - 1) lchildren, l := rec(mid, 0, off+64) llen := lchildren * 32 if !outboard { llen += (mid / groupSize) * groupSize } rchildren, r := rec(bufLen-mid, 0, off+64+llen) write(cvToBytes(&l)[:], off) write(cvToBytes(&r)[:], off+32) return 2 + lchildren + rchildren, guts.ChainingValue(guts.ParentNode(l, r, &guts.IV, flags)) } binary.LittleEndian.PutUint64(buf[:8], uint64(dataLen)) write(buf[:8], 0) _, root := rec(uint64(dataLen), guts.FlagRoot, 8) return *cvToBytes(&root), err } // Decode reads content and tree data from the provided reader(s), and // streams the verified content to dst. It returns false if verification fails. // If the content and tree data are interleaved, outboard should be nil. 
func Decode(dst io.Writer, data, outboard io.Reader, group int, root [32]byte) (bool, error) { if outboard == nil { outboard = data } groupSize := uint64(guts.ChunkSize << group) buf := make([]byte, groupSize) var err error read := func(r io.Reader, p []byte) []byte { if err == nil { _, err = io.ReadFull(r, p) } return p } write := func(w io.Writer, p []byte) { if err == nil { _, err = w.Write(p) } } readParent := func() (l, r [8]uint32) { read(outboard, buf[:64]) return bytesToCV(buf[:32]), bytesToCV(buf[32:]) } var counter uint64 var rec func(cv [8]uint32, bufLen uint64, flags uint32) bool rec = func(cv [8]uint32, bufLen uint64, flags uint32) bool { if err != nil { return false } else if bufLen <= groupSize { n := compressGroup(read(data, buf[:bufLen]), counter) counter += bufLen / guts.ChunkSize n.Flags |= flags valid := cv == guts.ChainingValue(n) if valid { write(dst, buf[:bufLen]) } return valid } l, r := readParent() n := guts.ParentNode(l, r, &guts.IV, flags) mid := uint64(1) << (bits.Len64(bufLen-1) - 1) return guts.ChainingValue(n) == cv && rec(l, mid, 0) && rec(r, bufLen-mid, 0) } read(outboard, buf[:8]) dataLen := binary.LittleEndian.Uint64(buf[:8]) ok := rec(bytesToCV(root[:]), dataLen, guts.FlagRoot) return ok, err } type bufferAt struct { buf []byte } func (b *bufferAt) WriteAt(p []byte, off int64) (int, error) { if copy(b.buf[off:], p) != len(p) { panic("bad buffer size") } return len(p), nil } // EncodeBuf returns the Bao encoding and root (i.e. BLAKE3 hash) for data. func EncodeBuf(data []byte, group int, outboard bool) ([]byte, [32]byte) { buf := bufferAt{buf: make([]byte, EncodedSize(len(data), group, outboard))} root, _ := Encode(&buf, bytes.NewReader(data), int64(len(data)), group, outboard) return buf.buf, root } // VerifyBuf verifies the Bao encoding and root (i.e. BLAKE3 hash) for data. // If the content and tree data are interleaved, outboard should be nil. 
func VerifyBuf(data, outboard []byte, group int, root [32]byte) bool { d, o := bytes.NewBuffer(data), bytes.NewBuffer(outboard) var or io.Reader = o if outboard == nil { or = nil } ok, _ := Decode(io.Discard, d, or, group, root) return ok && d.Len() == 0 && o.Len() == 0 // check for trailing data } // ExtractSlice returns the slice encoding for the given offset and length. When // extracting from an outboard encoding, data should contain only the chunk // groups that will be present in the slice. func ExtractSlice(dst io.Writer, data, outboard io.Reader, group int, offset uint64, length uint64) error { combinedEncoding := outboard == nil if combinedEncoding { outboard = data } groupSize := uint64(guts.ChunkSize << group) buf := make([]byte, groupSize) var err error read := func(r io.Reader, n uint64, copy bool) { if err == nil { _, err = io.ReadFull(r, buf[:n]) if err == nil && copy { _, err = dst.Write(buf[:n]) } } } var rec func(pos, bufLen uint64) rec = func(pos, bufLen uint64) { inSlice := pos < (offset+length) && offset < (pos+bufLen) if err != nil { return } else if bufLen <= groupSize { if combinedEncoding || inSlice { read(data, bufLen, inSlice) } return } read(outboard, 64, inSlice) mid := uint64(1) << (bits.Len64(bufLen-1) - 1) rec(pos, mid) rec(pos+mid, bufLen-mid) } read(outboard, 8, true) dataLen := binary.LittleEndian.Uint64(buf[:8]) if dataLen < offset+length { return errors.New("invalid slice length") } rec(0, dataLen) return err } // DecodeSlice reads from data, which must contain a slice encoding for the // given offset and length, and streams verified content to dst. It returns // false if verification fails. 
func DecodeSlice(dst io.Writer, data io.Reader, group int, offset, length uint64, root [32]byte) (bool, error) { groupSize := uint64(guts.ChunkSize << group) buf := make([]byte, groupSize) var err error read := func(n uint64) []byte { if err == nil { _, err = io.ReadFull(data, buf[:n]) } return buf[:n] } readParent := func() (l, r [8]uint32) { read(64) return bytesToCV(buf[:32]), bytesToCV(buf[32:]) } write := func(p []byte) { if err == nil { _, err = dst.Write(p) } } var rec func(cv [8]uint32, pos, bufLen uint64, flags uint32) bool rec = func(cv [8]uint32, pos, bufLen uint64, flags uint32) bool { inSlice := pos < (offset+length) && offset < (pos+bufLen) if err != nil { return false } else if bufLen <= groupSize { if !inSlice { return true } n := compressGroup(read(bufLen), pos/guts.ChunkSize) n.Flags |= flags valid := cv == guts.ChainingValue(n) if valid { // only write within range p := buf[:bufLen] if pos+bufLen > offset+length { p = p[:offset+length-pos] } if pos < offset { p = p[offset-pos:] } write(p) } return valid } if !inSlice { return true } l, r := readParent() n := guts.ParentNode(l, r, &guts.IV, flags) mid := uint64(1) << (bits.Len64(bufLen-1) - 1) return guts.ChainingValue(n) == cv && rec(l, pos, mid, 0) && rec(r, pos+mid, bufLen-mid, 0) } dataLen := binary.LittleEndian.Uint64(read(8)) if dataLen < offset+length { return false, errors.New("invalid slice length") } ok := rec(bytesToCV(root[:]), 0, dataLen, guts.FlagRoot) return ok, err } // VerifySlice verifies the Bao slice encoding in data, returning the // verified bytes. func VerifySlice(data []byte, group int, offset uint64, length uint64, root [32]byte) ([]byte, bool) { d := bytes.NewBuffer(data) var buf bytes.Buffer if ok, _ := DecodeSlice(&buf, d, group, offset, length, root); !ok || d.Len() > 0 { return nil, false } return buf.Bytes(), true } // VerifyChunks verifies the provided chunks with a full outboard encoding. 
func VerifyChunk(chunks, outboard []byte, group int, offset uint64, root [32]byte) bool { cbuf := bytes.NewBuffer(chunks) obuf := bytes.NewBuffer(outboard) groupSize := uint64(guts.ChunkSize << group) length := uint64(len(chunks)) nodesWithin := func(bufLen uint64) int { n := int(bufLen / groupSize) if bufLen%groupSize == 0 { n-- } return n } var rec func(cv [8]uint32, pos, bufLen uint64, flags uint32) bool rec = func(cv [8]uint32, pos, bufLen uint64, flags uint32) bool { inSlice := pos < (offset+length) && offset < (pos+bufLen) if bufLen <= groupSize { if !inSlice { return true } n := compressGroup(cbuf.Next(int(groupSize)), pos/guts.ChunkSize) n.Flags |= flags return cv == guts.ChainingValue(n) } if !inSlice { _ = obuf.Next(64 * nodesWithin(bufLen)) // skip return true } l, r := bytesToCV(obuf.Next(32)), bytesToCV(obuf.Next(32)) n := guts.ParentNode(l, r, &guts.IV, flags) mid := uint64(1) << (bits.Len64(bufLen-1) - 1) return guts.ChainingValue(n) == cv && rec(l, pos, mid, 0) && rec(r, pos+mid, bufLen-mid, 0) } if obuf.Len() < 8 { return false } dataLen := binary.LittleEndian.Uint64(obuf.Next(8)) if dataLen < offset+length || obuf.Len() != 64*nodesWithin(dataLen) { return false } return rec(bytesToCV(root[:]), 0, dataLen, guts.FlagRoot) } blake3-1.3.0/bao/bao_test.go000066400000000000000000000173641461472055300155660ustar00rootroot00000000000000package bao_test import ( "bytes" "encoding/binary" "encoding/hex" "fmt" "os" "testing" "lukechampine.com/blake3" "lukechampine.com/blake3/bao" ) func toHex(data []byte) string { return hex.EncodeToString(data) } func TestBaoGolden(t *testing.T) { data, err := os.ReadFile("../testdata/vectors.json") if err != nil { t.Fatal(err) } goldenInterleaved, err := os.ReadFile("../testdata/bao-golden.bao") if err != nil { t.Fatal(err) } goldenOutboard, err := os.ReadFile("../testdata/bao-golden.obao") if err != nil { t.Fatal(err) } interleaved, root := bao.EncodeBuf(data, 0, false) if toHex(root[:]) != 
"6654fbd1836b531b25e2782c9cc9b792c80abb36b024f59db5d5f6bd3187ddfe" { t.Errorf("bad root: %x", root) } else if !bytes.Equal(interleaved, goldenInterleaved) { t.Error("bad interleaved encoding") } outboard, root := bao.EncodeBuf(data, 0, true) if toHex(root[:]) != "6654fbd1836b531b25e2782c9cc9b792c80abb36b024f59db5d5f6bd3187ddfe" { t.Errorf("bad root: %x", root) } else if !bytes.Equal(outboard, goldenOutboard) { t.Error("bad outboard encoding") } // test empty input interleaved, root = bao.EncodeBuf(nil, 0, false) if toHex(root[:]) != "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262" { t.Errorf("bad root: %x", root) } else if toHex(interleaved[:]) != "0000000000000000" { t.Errorf("bad interleaved encoding: %x", interleaved) } else if !bao.VerifyBuf(interleaved, nil, 0, root) { t.Error("verify failed") } outboard, root = bao.EncodeBuf(nil, 0, true) if toHex(root[:]) != "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262" { t.Errorf("bad root: %x", root) } else if toHex(outboard[:]) != "0000000000000000" { t.Errorf("bad outboard encoding: %x", outboard) } else if !bao.VerifyBuf(nil, outboard, 0, root) { t.Error("verify failed") } } func TestBaoInterleaved(t *testing.T) { data := make([]byte, 1<<20) blake3.New(0, nil).XOF().Read(data) for group := 0; group < 10; group++ { interleaved, root := bao.EncodeBuf(data, group, false) if !bao.VerifyBuf(interleaved, nil, group, root) { t.Fatal("verify failed") } badRoot := root badRoot[0] ^= 1 if bao.VerifyBuf(interleaved, nil, group, badRoot) { t.Fatal("verify succeeded with bad root") } badPrefix := append([]byte(nil), interleaved...) badPrefix[0] ^= 1 if bao.VerifyBuf(badPrefix, nil, group, root) { t.Fatal("verify succeeded with bad length prefix") } badCVs := append([]byte(nil), interleaved...) badCVs[8] ^= 1 if bao.VerifyBuf(badCVs, nil, group, root) { t.Fatal("verify succeeded with bad cv data") } badData := append([]byte(nil), interleaved...) 
badData[len(badData)-1] ^= 1 if bao.VerifyBuf(badData, nil, group, root) { t.Fatal("verify succeeded with bad content") } extraData := append(append([]byte(nil), interleaved...), 1, 2, 3) if bao.VerifyBuf(extraData, nil, group, root) { t.Fatal("verify succeeded with extra data") } } } func TestBaoOutboard(t *testing.T) { data := make([]byte, 1<<20) blake3.New(0, nil).XOF().Read(data) for group := 0; group < 10; group++ { outboard, root := bao.EncodeBuf(data, group, true) if !bao.VerifyBuf(data, outboard, group, root) { t.Fatal("verify failed") } badRoot := root badRoot[0] ^= 1 if bao.VerifyBuf(data, outboard, group, badRoot) { t.Fatal("verify succeeded with bad root") } badPrefix := append([]byte(nil), outboard...) badPrefix[0] ^= 1 if bao.VerifyBuf(data, badPrefix, group, root) { t.Fatal("verify succeeded with bad length prefix") } badCVs := append([]byte(nil), outboard...) badCVs[8] ^= 1 if bao.VerifyBuf(data, badCVs, group, root) { t.Fatal("verify succeeded with bad cv data") } } } func TestBaoChunkGroup(t *testing.T) { // from https://github.com/n0-computer/abao/blob/9b756ec8097afc782d76f7aec0a5ac9f4b82329a/tests/test_vectors.json const group = 4 // 16 KiB baoInput := func(n int) (in []byte) { for i := uint32(1); len(in) < n; i++ { in = binary.LittleEndian.AppendUint32(in, i) } return in[:n] } for _, test := range []struct { inputLen int exp string }{ {0, "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262"}, {1, "48fc721fbbc172e0925fa27af1671de225ba927134802998b10a1568a188652b"}, {1023, "15f8c1ae1049fe7e837186612c8ce732e66835841a4569b71e4ac3e3d3411b90"}, {1024, "f749c19181983b839cd97fe121cebaf076bc951e8c8e6d64accfedad5951ec22"}, {1025, "3613596275c4ea790774dedf20835b2daf86cacc892feef6ce720c121572f1f9"}, {16383, "f0970fbfe2f1c5145fa6aa31833779803d5c53743a8443ed1395218f511834ba"}, {16384, "b318758645c4467406c829a5f3da7cab00010fccccf4b7c314525cd85e2d0af8"}, {16385, "12a6a6b0554e7f3eed485f668bfd3b37382a2beee5e7ed5594c4a91c4c70f4aa"}, {32768, 
"8008de557073cab60f851191359ad9dc1afe9dc6152668ee01825c56ac5a754e"}, {49152, "91823357fefc308b57bb85ebed1d1edeba3c355e804dc63fa98fcb82554b1566"}, {180224, "4742cbae9485ce1b86ab359c1a84e203f819795d018b22a5c70c5c4577dd732e"}, {212992, "760c549edfe95c734b1d6a9b846d81692ed3ca022b541442949a0e42fe570df2"}, } { input := baoInput(test.inputLen) _, root := bao.EncodeBuf(input, group, false) if out := fmt.Sprintf("%x", root); out != test.exp { t.Errorf("output %v did not match test vector:\n\texpected: %v...\n\t got: %v...", test.inputLen, test.exp[:10], out[:10]) } } } func TestBaoStreaming(t *testing.T) { data := make([]byte, 1<<20) blake3.New(0, nil).XOF().Read(data) enc, root := bao.EncodeBuf(data, 0, false) if root != blake3.Sum256(data) { t.Fatal("bad root") } var buf bytes.Buffer if ok, err := bao.Decode(&buf, bytes.NewReader(enc), nil, 0, root); err != nil || !ok { t.Fatal("decode failed") } else if !bytes.Equal(buf.Bytes(), data) { t.Fatal("bad decode") } // corrupt root; nothing should be written to buf buf.Reset() if ok, err := bao.Decode(&buf, bytes.NewReader(enc), nil, 0, [32]byte{}); err != nil { t.Fatal("decode failed") } else if ok { t.Fatal("decode succeeded with bad root") } else if buf.Len() != 0 { t.Fatal("buf was written with bad root") } // corrupt a byte halfway through; buf should only be partially written buf.Reset() enc[len(enc)/2] ^= 1 if ok, err := bao.Decode(&buf, bytes.NewReader(enc), nil, 0, root); err != nil { t.Fatal("decode failed") } else if ok { t.Fatal("decode succeeded with bad data") } else if !bytes.Equal(buf.Bytes(), data[:buf.Len()]) { t.Fatal("invalid data was written to buf") } } func TestBaoSlice(t *testing.T) { data := make([]byte, 1<<20) blake3.New(0, nil).XOF().Read(data) for _, test := range []struct { off, len uint64 }{ {0, uint64(len(data))}, {0, 1024}, {1024, 1024}, {0, 10}, {1020, 10}, {1030, uint64(len(data) - 1030)}, } { // combined encoding { enc, root := bao.EncodeBuf(data, 0, false) var buf bytes.Buffer if err := 
bao.ExtractSlice(&buf, bytes.NewReader(enc), nil, 0, test.off, test.len); err != nil { t.Error(err) } else if vdata, ok := bao.VerifySlice(buf.Bytes(), 0, test.off, test.len, root); !ok { t.Error("combined verify failed", test) } else if !bytes.Equal(vdata, data[test.off:][:test.len]) { t.Error("combined bad decode", test, vdata, data[test.off:][:test.len]) } } // outboard encoding { enc, root := bao.EncodeBuf(data, 0, true) start, end := (test.off/1024)*1024, ((test.off+test.len+1024-1)/1024)*1024 if end > uint64(len(data)) { end = uint64(len(data)) } var buf bytes.Buffer if err := bao.ExtractSlice(&buf, bytes.NewReader(data[start:end]), bytes.NewReader(enc), 0, test.off, test.len); err != nil { t.Error(err) } else if vdata, ok := bao.VerifySlice(buf.Bytes(), 0, test.off, test.len, root); !ok { t.Error("outboard verify failed", test) } else if !bytes.Equal(vdata, data[test.off:][:test.len]) { t.Error("outboard bad decode", test, vdata, data[test.off:][:test.len]) } } } } blake3-1.3.0/blake3.go000066400000000000000000000203211461472055300143510ustar00rootroot00000000000000// Package blake3 implements the BLAKE3 cryptographic hash function. package blake3 // import "lukechampine.com/blake3" import ( "encoding/binary" "errors" "hash" "io" "math" "math/bits" "lukechampine.com/blake3/bao" "lukechampine.com/blake3/guts" ) // Hasher implements hash.Hash. type Hasher struct { key [8]uint32 flags uint32 size int // output size, for Sum // log(n) set of Merkle subtree roots, at most one per height. 
stack [64 - (guts.MaxSIMD + 10)][8]uint32 // 10 = log2(guts.ChunkSize) counter uint64 // number of buffers hashed; also serves as a bit vector indicating which stack elems are occupied buf [guts.MaxSIMD * guts.ChunkSize]byte buflen int } func (h *Hasher) hasSubtreeAtHeight(i int) bool { return h.counter&(1< 0 { if h.buflen == len(h.buf) { n := guts.CompressBuffer(&h.buf, h.buflen, &h.key, h.counter*guts.MaxSIMD, h.flags) h.pushSubtree(guts.ChainingValue(n)) h.buflen = 0 } n := copy(h.buf[h.buflen:], p) h.buflen += n p = p[n:] } return lenp, nil } // Sum implements hash.Hash. func (h *Hasher) Sum(b []byte) (sum []byte) { // We need to append h.Size() bytes to b. Reuse b's capacity if possible; // otherwise, allocate a new slice. if total := len(b) + h.Size(); cap(b) >= total { sum = b[:total] } else { sum = make([]byte, total) copy(sum, b) } // Read into the appended portion of sum. Use a low-latency-low-throughput // path for small digests (requiring a single compression), and a // high-latency-high-throughput path for large digests. if dst := sum[len(b):]; len(dst) <= 64 { out := guts.WordsToBytes(guts.CompressNode(h.rootNode())) copy(dst, out[:]) } else { h.XOF().Read(dst) } return } // Reset implements hash.Hash. func (h *Hasher) Reset() { h.counter = 0 h.buflen = 0 } // BlockSize implements hash.Hash. func (h *Hasher) BlockSize() int { return 64 } // Size implements hash.Hash. func (h *Hasher) Size() int { return h.size } // XOF returns an OutputReader initialized with the current hash state. func (h *Hasher) XOF() *OutputReader { return &OutputReader{ n: h.rootNode(), } } func newHasher(key [8]uint32, flags uint32, size int) *Hasher { return &Hasher{ key: key, flags: flags, size: size, } } // New returns a Hasher for the specified digest size and key. If key is nil, // the hash is unkeyed. Otherwise, len(key) must be 32. 
func New(size int, key []byte) *Hasher { if key == nil { return newHasher(guts.IV, 0, size) } var keyWords [8]uint32 for i := range keyWords { keyWords[i] = binary.LittleEndian.Uint32(key[i*4:]) } return newHasher(keyWords, guts.FlagKeyedHash, size) } // Sum256 and Sum512 always use the same hasher state, so we can save some time // when hashing small inputs by constructing the hasher ahead of time. var defaultHasher = New(64, nil) // Sum256 returns the unkeyed BLAKE3 hash of b, truncated to 256 bits. func Sum256(b []byte) (out [32]byte) { out512 := Sum512(b) copy(out[:], out512[:]) return } // Sum512 returns the unkeyed BLAKE3 hash of b, truncated to 512 bits. func Sum512(b []byte) (out [64]byte) { var n guts.Node if len(b) <= guts.BlockSize { var block [64]byte copy(block[:], b) return guts.WordsToBytes(guts.CompressNode(guts.Node{ CV: guts.IV, Block: guts.BytesToWords(block), BlockLen: uint32(len(b)), Flags: guts.FlagChunkStart | guts.FlagChunkEnd | guts.FlagRoot, })) } else if len(b) <= guts.ChunkSize { n = guts.CompressChunk(b, &guts.IV, 0, 0) n.Flags |= guts.FlagRoot } else { h := *defaultHasher h.Write(b) n = h.rootNode() } return guts.WordsToBytes(guts.CompressNode(n)) } // DeriveKey derives a subkey from ctx and srcKey. ctx should be hardcoded, // globally unique, and application-specific. A good format for ctx strings is: // // [application] [commit timestamp] [purpose] // // e.g.: // // example.com 2019-12-25 16:18:03 session tokens v1 // // The purpose of these requirements is to ensure that an attacker cannot trick // two different applications into using the same context string. 
func DeriveKey(subKey []byte, ctx string, srcKey []byte) { // construct the derivation Hasher const derivationIVLen = 32 h := newHasher(guts.IV, guts.FlagDeriveKeyContext, 32) h.Write([]byte(ctx)) derivationIV := h.Sum(make([]byte, 0, derivationIVLen)) var ivWords [8]uint32 for i := range ivWords { ivWords[i] = binary.LittleEndian.Uint32(derivationIV[i*4:]) } h = newHasher(ivWords, guts.FlagDeriveKeyMaterial, 0) // derive the subKey h.Write(srcKey) h.XOF().Read(subKey) } // An OutputReader produces an seekable stream of 2^64 - 1 pseudorandom output // bytes. type OutputReader struct { n guts.Node buf [guts.MaxSIMD * guts.BlockSize]byte off uint64 } // Read implements io.Reader. Callers may assume that Read returns len(p), nil // unless the read would extend beyond the end of the stream. func (or *OutputReader) Read(p []byte) (int, error) { if or.off == math.MaxUint64 { return 0, io.EOF } else if rem := math.MaxUint64 - or.off; uint64(len(p)) > rem { p = p[:rem] } lenp := len(p) for len(p) > 0 { if or.off%(guts.MaxSIMD*guts.BlockSize) == 0 { or.n.Counter = or.off / guts.BlockSize guts.CompressBlocks(&or.buf, or.n) } n := copy(p, or.buf[or.off%(guts.MaxSIMD*guts.BlockSize):]) p = p[n:] or.off += uint64(n) } return lenp, nil } // Seek implements io.Seeker. func (or *OutputReader) Seek(offset int64, whence int) (int64, error) { off := or.off switch whence { case io.SeekStart: if offset < 0 { return 0, errors.New("seek position cannot be negative") } off = uint64(offset) case io.SeekCurrent: if offset < 0 { if uint64(-offset) > off { return 0, errors.New("seek position cannot be negative") } off -= uint64(-offset) } else { off += uint64(offset) } case io.SeekEnd: off = uint64(offset) - 1 default: panic("invalid whence") } or.off = off or.n.Counter = uint64(off) / guts.BlockSize if or.off%(guts.MaxSIMD*guts.BlockSize) != 0 { guts.CompressBlocks(&or.buf, or.n) } // NOTE: or.off >= 2^63 will result in a negative return value. // Nothing we can do about this. 
return int64(or.off), nil } // ensure that Hasher implements hash.Hash var _ hash.Hash = (*Hasher)(nil) // EncodedSize returns the size of a Bao encoding for the provided quantity // of data. // // Deprecated: Use bao.EncodedSize instead. func BaoEncodedSize(dataLen int, outboard bool) int { return bao.EncodedSize(dataLen, 0, outboard) } // BaoEncode computes the intermediate BLAKE3 tree hashes of data and writes // them to dst. // // Deprecated: Use bao.Encode instead. func BaoEncode(dst io.WriterAt, data io.Reader, dataLen int64, outboard bool) ([32]byte, error) { return bao.Encode(dst, data, dataLen, 0, outboard) } // BaoDecode reads content and tree data from the provided reader(s), and // streams the verified content to dst. // // Deprecated: Use bao.Decode instead. func BaoDecode(dst io.Writer, data, outboard io.Reader, root [32]byte) (bool, error) { return bao.Decode(dst, data, outboard, 0, root) } // BaoEncodeBuf returns the Bao encoding and root (i.e. BLAKE3 hash) for data. // // Deprecated: Use bao.EncodeBuf instead. func BaoEncodeBuf(data []byte, outboard bool) ([]byte, [32]byte) { return bao.EncodeBuf(data, 0, outboard) } // BaoVerifyBuf verifies the Bao encoding and root (i.e. BLAKE3 hash) for data. // // Deprecated: Use bao.VerifyBuf instead. 
func BaoVerifyBuf(data, outboard []byte, root [32]byte) bool { return bao.VerifyBuf(data, outboard, 0, root) } blake3-1.3.0/blake3_test.go000066400000000000000000000133021461472055300154110ustar00rootroot00000000000000package blake3_test import ( "bytes" "encoding/hex" "encoding/json" "io" "os" "testing" "lukechampine.com/blake3" ) func toHex(data []byte) string { return hex.EncodeToString(data) } var testVectors = func() (vecs struct { Key string Cases []struct { InputLen int `json:"input_len"` Hash string `json:"hash"` KeyedHash string `json:"keyed_hash"` DeriveKey string `json:"derive_key"` } }) { data, err := os.ReadFile("testdata/vectors.json") if err != nil { panic(err) } if err := json.Unmarshal(data, &vecs); err != nil { panic(err) } return }() var testInput = func() []byte { input := make([]byte, 1e6) for i := range input { input[i] = byte(i % 251) } return input }() func TestVectors(t *testing.T) { for _, vec := range testVectors.Cases { in := testInput[:vec.InputLen] // regular h := blake3.New(len(vec.Hash)/2, nil) h.Write(in) if out := toHex(h.Sum(nil)); out != vec.Hash { t.Errorf("output did not match test vector:\n\texpected: %v...\n\t got: %v...", vec.Hash[:10], out[:10]) } // keyed h = blake3.New(len(vec.KeyedHash)/2, []byte(testVectors.Key)) h.Write(in) if out := toHex(h.Sum(nil)); out != vec.KeyedHash { t.Errorf("output did not match test vector:\n\texpected: %v...\n\t got: %v...", vec.KeyedHash[:10], out[:10]) } // derive key const ctx = "BLAKE3 2019-12-27 16:29:52 test vectors context" subKey := make([]byte, len(vec.DeriveKey)/2) blake3.DeriveKey(subKey, ctx, in) if out := toHex(subKey); out != vec.DeriveKey { t.Errorf("output did not match test vector:\n\texpected: %v...\n\t got: %v...", vec.DeriveKey[:10], out[:10]) } } } func TestXOF(t *testing.T) { for _, vec := range testVectors.Cases { in := testInput[:vec.InputLen] // XOF should produce same output as Sum, even when outputting 7 bytes at a time h := blake3.New(len(vec.Hash)/2, nil) 
h.Write(in) var xofBuf bytes.Buffer io.CopyBuffer(&xofBuf, io.LimitReader(h.XOF(), int64(len(vec.Hash)/2)), make([]byte, 7)) if out := toHex(xofBuf.Bytes()); out != vec.Hash { t.Errorf("XOF output did not match test vector:\n\texpected: %v...\n\t got: %v...", vec.Hash[:10], out[:10]) } // Should be able to Seek around in the output stream without affecting correctness seeks := []struct { offset int64 whence int }{ {0, io.SeekStart}, {17, io.SeekCurrent}, {-5, io.SeekCurrent}, {int64(h.Size()), io.SeekStart}, {int64(h.Size()), io.SeekCurrent}, } xof := h.XOF() outR := bytes.NewReader(xofBuf.Bytes()) for _, s := range seeks { outRead := make([]byte, 10) xofRead := make([]byte, 10) offset, _ := outR.Seek(s.offset, s.whence) n, _ := outR.Read(outRead) xof.Seek(s.offset, s.whence) xof.Read(xofRead[:n]) if !bytes.Equal(outRead[:n], xofRead[:n]) { t.Errorf("XOF output did not match test vector at offset %v:\n\texpected: %x...\n\t got: %x...", offset, outRead[:10], xofRead[:10]) } } } // test behavior at end of stream xof := blake3.New(0, nil).XOF() buf := make([]byte, 1024) xof.Seek(-1000, io.SeekEnd) n, err := xof.Read(buf) if n != 1000 || err != nil { t.Errorf("expected (1000, nil) when reading near end of stream, got (%v, %v)", n, err) } n, err = xof.Read(buf) if n != 0 || err != io.EOF { t.Errorf("expected (0, EOF) when reading past end of stream, got (%v, %v)", n, err) } // test invalid seek offsets _, err = xof.Seek(-1, io.SeekStart) if err == nil { t.Error("expected invalid offset error, got nil") } xof.Seek(0, io.SeekStart) _, err = xof.Seek(-1, io.SeekCurrent) if err == nil { t.Error("expected invalid offset error, got nil") } // test invalid seek whence didPanic := func() (p bool) { defer func() { p = recover() != nil }() xof.Seek(0, 17) return }() if !didPanic { t.Error("expected panic when seeking with invalid whence") } } func TestSum(t *testing.T) { for _, vec := range testVectors.Cases { in := testInput[:vec.InputLen] var exp256 [32]byte h := blake3.New(32, 
nil) h.Write(in) h.Sum(exp256[:0]) if got256 := blake3.Sum256(in); exp256 != got256 { t.Errorf("Sum256 output did not match Sum output:\n\texpected: %x...\n\t got: %x...", exp256[:5], got256[:5]) } var exp512 [64]byte h = blake3.New(64, nil) h.Write(in) h.Sum(exp512[:0]) if got512 := blake3.Sum512(in); exp512 != got512 { t.Errorf("Sum512 output did not match Sum output:\n\texpected: %x...\n\t got: %x...", exp512[:5], got512[:5]) } } } func TestReset(t *testing.T) { for _, vec := range testVectors.Cases { in := testInput[:vec.InputLen] h := blake3.New(32, nil) h.Write(in) out1 := h.Sum(nil) h.Reset() h.Write(in) out2 := h.Sum(nil) if !bytes.Equal(out1, out2) { t.Error("Reset did not reset Hasher state properly") } } // gotta have 100% test coverage... if blake3.New(0, nil).BlockSize() != 64 { t.Error("incorrect block size") } } type nopReader struct{} func (nopReader) Read(p []byte) (int, error) { return len(p), nil } func BenchmarkWrite(b *testing.B) { b.ReportAllocs() b.SetBytes(1024) io.CopyN(blake3.New(0, nil), nopReader{}, int64(b.N*1024)) } func BenchmarkXOF(b *testing.B) { b.ReportAllocs() b.SetBytes(1024) io.CopyN(io.Discard, blake3.New(0, nil).XOF(), int64(b.N*1024)) } func BenchmarkSum256(b *testing.B) { b.Run("64", func(b *testing.B) { b.ReportAllocs() b.SetBytes(64) buf := make([]byte, 64) for i := 0; i < b.N; i++ { blake3.Sum256(buf) } }) b.Run("1024", func(b *testing.B) { b.ReportAllocs() b.SetBytes(1024) buf := make([]byte, 1024) for i := 0; i < b.N; i++ { blake3.Sum256(buf) } }) b.Run("65536", func(b *testing.B) { b.ReportAllocs() b.SetBytes(65536) buf := make([]byte, 65536) for i := 0; i < b.N; i++ { blake3.Sum256(buf) } }) } blake3-1.3.0/go.mod000066400000000000000000000001261461472055300137700ustar00rootroot00000000000000module lukechampine.com/blake3 go 1.17 require github.com/klauspost/cpuid/v2 v2.0.9 blake3-1.3.0/go.sum000066400000000000000000000002611461472055300140150ustar00rootroot00000000000000github.com/klauspost/cpuid/v2 v2.0.9 
h1:lgaqFMSdTdQYdZ04uHyN2d/eKdOMyi2YLSvlQIBFYa4= github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= blake3-1.3.0/guts/000077500000000000000000000000001461472055300136455ustar00rootroot00000000000000blake3-1.3.0/guts/compress_amd64.go000066400000000000000000000111221461472055300170170ustar00rootroot00000000000000package guts import "unsafe" //go:generate go run avo/gen.go -out blake3_amd64.s //go:noescape func compressChunksAVX512(cvs *[16][8]uint32, buf *[16 * ChunkSize]byte, key *[8]uint32, counter uint64, flags uint32) //go:noescape func compressChunksAVX2(cvs *[8][8]uint32, buf *[8 * ChunkSize]byte, key *[8]uint32, counter uint64, flags uint32) //go:noescape func compressBlocksAVX512(out *[1024]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32) //go:noescape func compressBlocksAVX2(out *[512]byte, msgs *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32) //go:noescape func compressParentsAVX2(parents *[8][8]uint32, cvs *[16][8]uint32, key *[8]uint32, flags uint32) func compressBufferAVX512(buf *[MaxSIMD * ChunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) Node { var cvs [MaxSIMD][8]uint32 compressChunksAVX512(&cvs, buf, key, counter, flags) numChunks := uint64(buflen / ChunkSize) if buflen%ChunkSize != 0 { // use non-asm for remainder partialChunk := buf[buflen-buflen%ChunkSize : buflen] cvs[numChunks] = ChainingValue(CompressChunk(partialChunk, key, counter+numChunks, flags)) numChunks++ } return mergeSubtrees(&cvs, numChunks, key, flags) } func compressBufferAVX2(buf *[MaxSIMD * ChunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) Node { var cvs [MaxSIMD][8]uint32 cvHalves := (*[2][8][8]uint32)(unsafe.Pointer(&cvs)) bufHalves := (*[2][8 * ChunkSize]byte)(unsafe.Pointer(buf)) compressChunksAVX2(&cvHalves[0], &bufHalves[0], key, counter, flags) numChunks := uint64(buflen / ChunkSize) if numChunks > 8 { 
compressChunksAVX2(&cvHalves[1], &bufHalves[1], key, counter+8, flags) } if buflen%ChunkSize != 0 { // use non-asm for remainder partialChunk := buf[buflen-buflen%ChunkSize : buflen] cvs[numChunks] = ChainingValue(CompressChunk(partialChunk, key, counter+numChunks, flags)) numChunks++ } return mergeSubtrees(&cvs, numChunks, key, flags) } // CompressBuffer compresses up to MaxSIMD chunks in parallel and returns their // root node. func CompressBuffer(buf *[MaxSIMD * ChunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) Node { if buflen <= ChunkSize { return CompressChunk(buf[:buflen], key, counter, flags) } switch { case haveAVX512 && buflen >= ChunkSize*2: return compressBufferAVX512(buf, buflen, key, counter, flags) case haveAVX2 && buflen >= ChunkSize*2: return compressBufferAVX2(buf, buflen, key, counter, flags) default: return compressBufferGeneric(buf, buflen, key, counter, flags) } } // CompressChunk compresses a single chunk, returning its final (uncompressed) // node. func CompressChunk(chunk []byte, key *[8]uint32, counter uint64, flags uint32) Node { n := Node{ CV: *key, Counter: counter, BlockLen: BlockSize, Flags: flags | FlagChunkStart, } blockBytes := (*[64]byte)(unsafe.Pointer(&n.Block))[:] for len(chunk) > BlockSize { copy(blockBytes, chunk) chunk = chunk[BlockSize:] n.CV = ChainingValue(n) n.Flags &^= FlagChunkStart } // pad last block with zeros n.Block = [16]uint32{} copy(blockBytes, chunk) n.BlockLen = uint32(len(chunk)) n.Flags |= FlagChunkEnd return n } // CompressBlocks compresses MaxSIMD copies of n with successive counter values, // storing the results in out. 
func CompressBlocks(out *[MaxSIMD * BlockSize]byte, n Node) { switch { case haveAVX512: compressBlocksAVX512(out, &n.Block, &n.CV, n.Counter, n.BlockLen, n.Flags) case haveAVX2: outs := (*[2][512]byte)(unsafe.Pointer(out)) compressBlocksAVX2(&outs[0], &n.Block, &n.CV, n.Counter, n.BlockLen, n.Flags) compressBlocksAVX2(&outs[1], &n.Block, &n.CV, n.Counter+8, n.BlockLen, n.Flags) default: outs := (*[MaxSIMD][64]byte)(unsafe.Pointer(out)) compressBlocksGeneric(outs, n) } } func mergeSubtrees(cvs *[MaxSIMD][8]uint32, numCVs uint64, key *[8]uint32, flags uint32) Node { if !haveAVX2 { return mergeSubtreesGeneric(cvs, numCVs, key, flags) } for numCVs > 2 { if numCVs%2 == 0 { compressParentsAVX2((*[8][8]uint32)(unsafe.Pointer(cvs)), cvs, key, flags) } else { keep := cvs[numCVs-1] compressParentsAVX2((*[8][8]uint32)(unsafe.Pointer(cvs)), cvs, key, flags) cvs[numCVs/2] = keep numCVs++ } numCVs /= 2 } return ParentNode(cvs[0], cvs[1], key, flags) } // BytesToWords converts an array of 64 bytes to an array of 16 bytes. func BytesToWords(bytes [64]byte) [16]uint32 { return *(*[16]uint32)(unsafe.Pointer(&bytes)) } // WordsToBytes converts an array of 16 words to an array of 64 bytes. func WordsToBytes(words [16]uint32) [64]byte { return *(*[64]byte)(unsafe.Pointer(&words)) } blake3-1.3.0/guts/compress_amd64.s000066400000000000000000003757771461472055300167110ustar00rootroot00000000000000// Code generated by command: go run gen.go -out compress_amd64.s. DO NOT EDIT. 
#include "textflag.h"

// iv: four 32-bit constants. compressBlocksAVX512 broadcasts them into
// Z16, Z18, Z20 and Z22 when it sets up its state vectors.
DATA iv<>+0(SB)/4, $0x6a09e667
DATA iv<>+4(SB)/4, $0xbb67ae85
DATA iv<>+8(SB)/4, $0x3c6ef372
DATA iv<>+12(SB)/4, $0xa54ff53a
GLOBL iv<>(SB), RODATA|NOPTR, $16

// seq: the 32-bit values 0..15, one per 512-bit lane. The AVX-512 routines
// add it to the broadcast counter to get per-lane counters, and shift it left
// to build per-lane byte offsets for gather/scatter. seq<>+4 (the value 1) is
// also used on its own as a broadcast operand.
DATA seq<>+0(SB)/4, $0x00000000
DATA seq<>+4(SB)/4, $0x00000001
DATA seq<>+8(SB)/4, $0x00000002
DATA seq<>+12(SB)/4, $0x00000003
DATA seq<>+16(SB)/4, $0x00000004
DATA seq<>+20(SB)/4, $0x00000005
DATA seq<>+24(SB)/4, $0x00000006
DATA seq<>+28(SB)/4, $0x00000007
DATA seq<>+32(SB)/4, $0x00000008
DATA seq<>+36(SB)/4, $0x00000009
DATA seq<>+40(SB)/4, $0x0000000a
DATA seq<>+44(SB)/4, $0x0000000b
DATA seq<>+48(SB)/4, $0x0000000c
DATA seq<>+52(SB)/4, $0x0000000d
DATA seq<>+56(SB)/4, $0x0000000e
DATA seq<>+60(SB)/4, $0x0000000f
GLOBL seq<>(SB), RODATA|NOPTR, $64

// seq64: the 64-bit values 0..7.
// NOTE(review): not referenced by the AVX-512 routines that follow; presumably
// used by the AVX2 routines later in the file -- confirm.
DATA seq64<>+0(SB)/8, $0x0000000000000000
DATA seq64<>+8(SB)/8, $0x0000000000000001
DATA seq64<>+16(SB)/8, $0x0000000000000002
DATA seq64<>+24(SB)/8, $0x0000000000000003
DATA seq64<>+32(SB)/8, $0x0000000000000004
DATA seq64<>+40(SB)/8, $0x0000000000000005
DATA seq64<>+48(SB)/8, $0x0000000000000006
DATA seq64<>+56(SB)/8, $0x0000000000000007
GLOBL seq64<>(SB), RODATA|NOPTR, $64

// shuffle_rot8: 32 byte indices; within every 4-byte group the order is
// 1, 2, 3, 0.
// NOTE(review): presumably a byte-shuffle control mask that rotates each
// 32-bit word right by 8 bits for the AVX2 routines -- confirm at the use site.
DATA shuffle_rot8<>+0(SB)/4, $0x00030201
DATA shuffle_rot8<>+4(SB)/4, $0x04070605
DATA shuffle_rot8<>+8(SB)/4, $0x080b0a09
DATA shuffle_rot8<>+12(SB)/4, $0x0c0f0e0d
DATA shuffle_rot8<>+16(SB)/4, $0x10131211
DATA shuffle_rot8<>+20(SB)/4, $0x14171615
DATA shuffle_rot8<>+24(SB)/4, $0x181b1a19
DATA shuffle_rot8<>+28(SB)/4, $0x1c1f1e1d
GLOBL shuffle_rot8<>(SB), RODATA|NOPTR, $32

// shuffle_rot16: 32 byte indices; within every 4-byte group the order is
// 2, 3, 0, 1.
// NOTE(review): presumably the matching control mask for a 16-bit rotation of
// each 32-bit word -- confirm at the use site.
DATA shuffle_rot16<>+0(SB)/4, $0x01000302
DATA shuffle_rot16<>+4(SB)/4, $0x05040706
DATA shuffle_rot16<>+8(SB)/4, $0x09080b0a
DATA shuffle_rot16<>+12(SB)/4, $0x0d0c0f0e
DATA shuffle_rot16<>+16(SB)/4, $0x11101312
DATA shuffle_rot16<>+20(SB)/4, $0x15141716
DATA shuffle_rot16<>+24(SB)/4, $0x19181b1a
DATA shuffle_rot16<>+28(SB)/4, $0x1d1c1f1e
GLOBL shuffle_rot16<>(SB), RODATA|NOPTR, $32

// func compressBlocksAVX512(out *[1024]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32,
flags uint32)
// Requires: AVX512BW, AVX512F
//
// Runs the same seven-round compression on 16 lanes at once (one 32-bit lane
// per ZMM element). block, cv, blockLen and flags are broadcast, so every lane
// sees identical inputs except for the counter: lane i uses counter+i.
//
// Register layout: state word i lives in Z(2*i) (Z0..Z30, even), message word
// i lives in Z(2*i+1) (Z1..Z31, odd). State words 0-7 come from cv, 8-11 from
// iv, 12/13 are the low/high counter halves, 14 is blockLen and 15 is flags.
//
// On return, lane i's 16 output words are stored at out + 64*i.
TEXT ·compressBlocksAVX512(SB), NOSPLIT, $0-40
	MOVQ out+0(FP), AX
	MOVQ block+8(FP), CX
	MOVQ cv+16(FP), DX

	// Initialize block vectors
	VPBROADCASTD (CX), Z1
	VPBROADCASTD 4(CX), Z3
	VPBROADCASTD 8(CX), Z5
	VPBROADCASTD 12(CX), Z7
	VPBROADCASTD 16(CX), Z9
	VPBROADCASTD 20(CX), Z11
	VPBROADCASTD 24(CX), Z13
	VPBROADCASTD 28(CX), Z15
	VPBROADCASTD 32(CX), Z17
	VPBROADCASTD 36(CX), Z19
	VPBROADCASTD 40(CX), Z21
	VPBROADCASTD 44(CX), Z23
	VPBROADCASTD 48(CX), Z25
	VPBROADCASTD 52(CX), Z27
	VPBROADCASTD 56(CX), Z29
	VPBROADCASTD 60(CX), Z31

	// Initialize state vectors
	VPBROADCASTD (DX), Z0
	VPBROADCASTD 4(DX), Z2
	VPBROADCASTD 8(DX), Z4
	VPBROADCASTD 12(DX), Z6
	VPBROADCASTD 16(DX), Z8
	VPBROADCASTD 20(DX), Z10
	VPBROADCASTD 24(DX), Z12
	VPBROADCASTD 28(DX), Z14
	VPBROADCASTD iv<>+0(SB), Z16
	VPBROADCASTD iv<>+4(SB), Z18
	VPBROADCASTD iv<>+8(SB), Z20
	VPBROADCASTD iv<>+12(SB), Z22
	// Per-lane 64-bit counter: low half = counter.lo + lane index. K1 marks the
	// lanes where that 32-bit add wrapped (sum < lane index, unsigned); only
	// those lanes get 1 (seq<>+4) added to the high half.
	VPBROADCASTD counter+24(FP), Z24
	VPADDD seq<>+0(SB), Z24, Z24
	VPCMPUD $0x01, seq<>+0(SB), Z24, K1
	VPBROADCASTD counter+28(FP), Z26
	VPADDD.BCST seq<>+4(SB), Z26, K1, Z26
	VPBROADCASTD blockLen+32(FP), Z28
	VPBROADCASTD flags+36(FP), Z30

	// Round 1
	// Every round is eight 14-instruction groups. Each group mixes four state
	// registers with two message registers using add/xor/rotate-right by
	// 16, 12, 8 and 7. The first four groups work on the columns of the 4x4
	// state, the last four on its diagonals. Rounds differ only in which
	// message registers feed each group.
	VPADDD Z0, Z8, Z0
	VPADDD Z1, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z0, Z8, Z0
	VPADDD Z3, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x07, Z8, Z8
	VPADDD Z2, Z10, Z2
	VPADDD Z5, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z2, Z10, Z2
	VPADDD Z7, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z4, Z12, Z4
	VPADDD Z9, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z4, Z12, Z4
	VPADDD Z11, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z6, Z14, Z6
	VPADDD Z13, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z6, Z14, Z6
	VPADDD Z15, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x07, Z14, Z14
	// Round 1, diagonals
	VPADDD Z0, Z10, Z0
	VPADDD Z17, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z19, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z2, Z12, Z2
	VPADDD Z21, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z23, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z4, Z14, Z4
	VPADDD Z25, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z27, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z6, Z8, Z6
	VPADDD Z29, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z31, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Round 2
	VPADDD Z0, Z8, Z0
	VPADDD Z5, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z0, Z8, Z0
	VPADDD Z13, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x07, Z8, Z8
	VPADDD Z2, Z10, Z2
	VPADDD Z7, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z2, Z10, Z2
	VPADDD Z21, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z4, Z12, Z4
	VPADDD Z15, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z4, Z12, Z4
	VPADDD Z1, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z6, Z14, Z6
	VPADDD Z9, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z6, Z14, Z6
	VPADDD Z27, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x07, Z14, Z14
	// Round 2, diagonals
	VPADDD Z0, Z10, Z0
	VPADDD Z3, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z23, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z2, Z12, Z2
	VPADDD Z25, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z11, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z4, Z14, Z4
	VPADDD Z19, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z29, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z6, Z8, Z6
	VPADDD Z31, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z17, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Round 3
	VPADDD Z0, Z8, Z0
	VPADDD Z7, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z0, Z8, Z0
	VPADDD Z9, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x07, Z8, Z8
	VPADDD Z2, Z10, Z2
	VPADDD Z21, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z2, Z10, Z2
	VPADDD Z25, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z4, Z12, Z4
	VPADDD Z27, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z4, Z12, Z4
	VPADDD Z5, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z6, Z14, Z6
	VPADDD Z15, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z6, Z14, Z6
	VPADDD Z29, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x07, Z14, Z14
	// Round 3, diagonals
	VPADDD Z0, Z10, Z0
	VPADDD Z13, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z11, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z2, Z12, Z2
	VPADDD Z19, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z1, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z4, Z14, Z4
	VPADDD Z23, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z31, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z6, Z8, Z6
	VPADDD Z17, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z3, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Round 4
	VPADDD Z0, Z8, Z0
	VPADDD Z21, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z0, Z8, Z0
	VPADDD Z15, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x07, Z8, Z8
	VPADDD Z2, Z10, Z2
	VPADDD Z25, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z2, Z10, Z2
	VPADDD Z19, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z4, Z12, Z4
	VPADDD Z29, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z4, Z12, Z4
	VPADDD Z7, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z6, Z14, Z6
	VPADDD Z27, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z6, Z14, Z6
	VPADDD Z31, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x07, Z14, Z14
	// Round 4, diagonals
	VPADDD Z0, Z10, Z0
	VPADDD Z9, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z1, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z2, Z12, Z2
	VPADDD Z23, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z5, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z4, Z14, Z4
	VPADDD Z11, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z17, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z6, Z8, Z6
	VPADDD Z3, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z13, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Round 5
	VPADDD Z0, Z8, Z0
	VPADDD Z25, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z0, Z8, Z0
	VPADDD Z27, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x07, Z8, Z8
	VPADDD Z2, Z10, Z2
	VPADDD Z19, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z2, Z10, Z2
	VPADDD Z23, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z4, Z12, Z4
	VPADDD Z31, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z4, Z12, Z4
	VPADDD Z21, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z6, Z14, Z6
	VPADDD Z29, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z6, Z14, Z6
	VPADDD Z17, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x07, Z14, Z14
	// Round 5, diagonals
	VPADDD Z0, Z10, Z0
	VPADDD Z15, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z5, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z2, Z12, Z2
	VPADDD Z11, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z7, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z4, Z14, Z4
	VPADDD Z1, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z3, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z6, Z8, Z6
	VPADDD Z13, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z9, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Round 6
	VPADDD Z0, Z8, Z0
	VPADDD Z19, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z0, Z8, Z0
	VPADDD Z29, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x07, Z8, Z8
	VPADDD Z2, Z10, Z2
	VPADDD Z23, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z2, Z10, Z2
	VPADDD Z11, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z4, Z12, Z4
	VPADDD Z17, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z4, Z12, Z4
	VPADDD Z25, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z6, Z14, Z6
	VPADDD Z31, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z6, Z14, Z6
	VPADDD Z3, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x07, Z14, Z14
	// Round 6, diagonals
	VPADDD Z0, Z10, Z0
	VPADDD Z27, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z7, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z2, Z12, Z2
	VPADDD Z1, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z21, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z4, Z14, Z4
	VPADDD Z5, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z13, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z6, Z8, Z6
	VPADDD Z9, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z15, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Round 7
	VPADDD Z0, Z8, Z0
	VPADDD Z23, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z0, Z8, Z0
	VPADDD Z31, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x07, Z8, Z8
	VPADDD Z2, Z10, Z2
	VPADDD Z11, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z2, Z10, Z2
	VPADDD Z1, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z4, Z12, Z4
	VPADDD Z3, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z4, Z12, Z4
	VPADDD Z19, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z6, Z14, Z6
	VPADDD Z17, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z6, Z14, Z6
	VPADDD Z13, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x07, Z14, Z14
	// Round 7, diagonals
	VPADDD Z0, Z10, Z0
	VPADDD Z29, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z21, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10
	VPADDD Z2, Z12, Z2
	VPADDD Z5, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z25, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12
	VPADDD Z4, Z14, Z4
	VPADDD Z7, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z9, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14
	VPADDD Z6, Z8, Z6
	VPADDD Z15, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z27, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Finalize CVs
	// Output words 0-7 are state[i] ^ state[i+8]; output words 8-15 are
	// state[i+8] ^ cv[i], with cv re-read from memory via DX.
	VPXORD Z0, Z16, Z0
	VPXORD Z2, Z18, Z2
	VPXORD Z4, Z20, Z4
	VPXORD Z6, Z22, Z6
	VPXORD Z8, Z24, Z8
	VPXORD Z10, Z26, Z10
	VPXORD Z12, Z28, Z12
	VPXORD Z14, Z30, Z14
	VPXORD.BCST (DX), Z16, Z16
	VPXORD.BCST 4(DX), Z18, Z18
	VPXORD.BCST 8(DX), Z20, Z20
	VPXORD.BCST 12(DX), Z22, Z22
	VPXORD.BCST 16(DX), Z24, Z24
	VPXORD.BCST 20(DX), Z26, Z26
	VPXORD.BCST 24(DX), Z28, Z28
	VPXORD.BCST 28(DX), Z30, Z30
	// Z1 = lane index * 64: the byte offset of each lane's 64-byte output
	// block within out. Each scatter below writes one output word for all 16
	// lanes. K1 is set to all ones before every scatter (per the Intel SDM the
	// instruction clears its mask register as elements complete).
	VMOVDQU32 seq<>+0(SB), Z1
	VPSLLD $0x06, Z1, Z1
	KXNORD K1, K1, K1
	VPSCATTERDD Z0, K1, (AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z2, K1, 4(AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z4, K1, 8(AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z6, K1, 12(AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z8, K1, 16(AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z10, K1, 20(AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z12, K1, 24(AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z14, K1, 28(AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z16, K1, 32(AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z18, K1, 36(AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z20, K1, 40(AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z22, K1, 44(AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z24, K1, 48(AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z26, K1, 52(AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z28, K1, 56(AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z30, K1, 60(AX)(Z1*1)
	RET
// func compressChunksAVX512(cvs *[16][8]uint32, buf *[16384]byte, key *[8]uint32, counter uint64, flags uint32) // Requires: AVX512BW, AVX512F TEXT ·compressChunksAVX512(SB), NOSPLIT, $192-36 MOVQ cvs+0(FP), AX MOVQ buf+8(FP), CX MOVQ key+16(FP), DX // Initialize counter VPBROADCASTD counter+24(FP), Z0 VPADDD seq<>+0(SB), Z0, Z0 VPCMPUD $0x01, seq<>+0(SB), Z0, K1 VPBROADCASTD counter+28(FP), Z2 VPADDD.BCST seq<>+4(SB), Z2, K1, Z2 VMOVDQU32 Z0, (SP) VMOVDQU32 Z2, 64(SP) // Initialize flags VPBROADCASTD flags+32(FP), Z0 VMOVDQU32 Z0, 128(SP) ORL $0x01, 128(SP) ORL $0x02, 188(SP) // Load key VPBROADCASTD (DX), Z0 VPBROADCASTD 4(DX), Z2 VPBROADCASTD 8(DX), Z4 VPBROADCASTD 12(DX), Z6 VPBROADCASTD 16(DX), Z8 VPBROADCASTD 20(DX), Z10 VPBROADCASTD 24(DX), Z12 VPBROADCASTD 28(DX), Z14 // Loop index XORQ DX, DX loop: // Load transposed block VMOVDQU32 seq<>+0(SB), Z16 VPSLLD $0x0a, Z16, Z16 KXNORD K1, K1, K1 VPGATHERDD (CX)(Z16*1), K1, Z1 KXNORD K1, K1, K1 VPGATHERDD 4(CX)(Z16*1), K1, Z3 KXNORD K1, K1, K1 VPGATHERDD 8(CX)(Z16*1), K1, Z5 KXNORD K1, K1, K1 VPGATHERDD 12(CX)(Z16*1), K1, Z7 KXNORD K1, K1, K1 VPGATHERDD 16(CX)(Z16*1), K1, Z9 KXNORD K1, K1, K1 VPGATHERDD 20(CX)(Z16*1), K1, Z11 KXNORD K1, K1, K1 VPGATHERDD 24(CX)(Z16*1), K1, Z13 KXNORD K1, K1, K1 VPGATHERDD 28(CX)(Z16*1), K1, Z15
KXNORD K1, K1, K1 VPGATHERDD 32(CX)(Z16*1), K1, Z17 KXNORD K1, K1, K1 VPGATHERDD 36(CX)(Z16*1), K1, Z19 KXNORD K1, K1, K1 VPGATHERDD 40(CX)(Z16*1), K1, Z21 KXNORD K1, K1, K1 VPGATHERDD 44(CX)(Z16*1), K1, Z23 KXNORD K1, K1, K1 VPGATHERDD 48(CX)(Z16*1), K1, Z25 KXNORD K1, K1, K1 VPGATHERDD 52(CX)(Z16*1), K1, Z27 KXNORD K1, K1, K1 VPGATHERDD 56(CX)(Z16*1), K1, Z29 KXNORD K1, K1, K1 VPGATHERDD 60(CX)(Z16*1), K1, Z31 ADDQ $0x40, CX // Reload state vectors (other than CVs) VPBROADCASTD iv<>+0(SB), Z16 VPBROADCASTD iv<>+4(SB), Z18 VPBROADCASTD iv<>+8(SB), Z20 VPBROADCASTD iv<>+12(SB), Z22 VMOVDQU32 (SP), Z24 VMOVDQU32 64(SP), Z26 VPBROADCASTD seq<>+4(SB), Z28 VPSLLD $0x06, Z28, Z28 VPBROADCASTD 128(SP)(DX*4), Z30 // Round 1 VPADDD Z0, Z8, Z0 VPADDD Z1, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z0, Z8, Z0 VPADDD Z3, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x07, Z8, Z8 VPADDD Z2, Z10, Z2 VPADDD Z5, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z2, Z10, Z2 VPADDD Z7, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z4, Z12, Z4 VPADDD Z9, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z4, Z12, Z4 VPADDD Z11, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z6, Z14, Z6 VPADDD Z13, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z6, Z14, Z6 VPADDD Z15, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z0, Z10, Z0 VPADDD Z17, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, 
Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z0, Z10, Z0 VPADDD Z19, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z2, Z12, Z2 VPADDD Z21, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z2, Z12, Z2 VPADDD Z23, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z4, Z14, Z4 VPADDD Z25, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z4, Z14, Z4 VPADDD Z27, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z6, Z8, Z6 VPADDD Z29, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z6, Z8, Z6 VPADDD Z31, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x07, Z8, Z8 // Round 2 VPADDD Z0, Z8, Z0 VPADDD Z5, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z0, Z8, Z0 VPADDD Z13, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x07, Z8, Z8 VPADDD Z2, Z10, Z2 VPADDD Z7, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z2, Z10, Z2 VPADDD Z21, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z4, Z12, Z4 VPADDD Z15, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z4, Z12, Z4 VPADDD Z1, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z6, Z14, Z6 VPADDD Z9, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x10, Z30, Z30 VPADDD 
Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z6, Z14, Z6 VPADDD Z27, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z0, Z10, Z0 VPADDD Z3, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z0, Z10, Z0 VPADDD Z23, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z2, Z12, Z2 VPADDD Z25, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z2, Z12, Z2 VPADDD Z11, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z4, Z14, Z4 VPADDD Z19, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z4, Z14, Z4 VPADDD Z29, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z6, Z8, Z6 VPADDD Z31, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z6, Z8, Z6 VPADDD Z17, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x07, Z8, Z8 // Round 3 VPADDD Z0, Z8, Z0 VPADDD Z7, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z0, Z8, Z0 VPADDD Z9, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x07, Z8, Z8 VPADDD Z2, Z10, Z2 VPADDD Z21, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z2, Z10, Z2 VPADDD Z25, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z4, Z12, Z4 VPADDD Z27, Z4, Z4 VPXORD Z28, Z4, 
Z28 VPRORD $0x10, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z4, Z12, Z4 VPADDD Z5, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z6, Z14, Z6 VPADDD Z15, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z6, Z14, Z6 VPADDD Z29, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z0, Z10, Z0 VPADDD Z13, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z0, Z10, Z0 VPADDD Z11, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z2, Z12, Z2 VPADDD Z19, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z2, Z12, Z2 VPADDD Z1, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z4, Z14, Z4 VPADDD Z23, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z4, Z14, Z4 VPADDD Z31, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z6, Z8, Z6 VPADDD Z17, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z6, Z8, Z6 VPADDD Z3, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x07, Z8, Z8 // Round 4 VPADDD Z0, Z8, Z0 VPADDD Z21, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z0, Z8, Z0 VPADDD Z15, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x07, Z8, Z8 VPADDD Z2, Z10, Z2 
VPADDD Z25, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z2, Z10, Z2 VPADDD Z19, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z4, Z12, Z4 VPADDD Z29, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z4, Z12, Z4 VPADDD Z7, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z6, Z14, Z6 VPADDD Z27, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z6, Z14, Z6 VPADDD Z31, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z0, Z10, Z0 VPADDD Z9, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z0, Z10, Z0 VPADDD Z1, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z2, Z12, Z2 VPADDD Z23, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z2, Z12, Z2 VPADDD Z5, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z4, Z14, Z4 VPADDD Z11, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z4, Z14, Z4 VPADDD Z17, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z6, Z8, Z6 VPADDD Z3, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z6, Z8, Z6 VPADDD Z13, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD 
$0x07, Z8, Z8 // Round 5 VPADDD Z0, Z8, Z0 VPADDD Z25, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z0, Z8, Z0 VPADDD Z27, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x07, Z8, Z8 VPADDD Z2, Z10, Z2 VPADDD Z19, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z2, Z10, Z2 VPADDD Z23, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z4, Z12, Z4 VPADDD Z31, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z4, Z12, Z4 VPADDD Z21, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z6, Z14, Z6 VPADDD Z29, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z6, Z14, Z6 VPADDD Z17, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z0, Z10, Z0 VPADDD Z15, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z0, Z10, Z0 VPADDD Z5, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z2, Z12, Z2 VPADDD Z11, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z2, Z12, Z2 VPADDD Z7, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z4, Z14, Z4 VPADDD Z1, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z4, Z14, Z4 VPADDD Z3, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x08, Z26, Z26 VPADDD 
Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z6, Z8, Z6 VPADDD Z13, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z6, Z8, Z6 VPADDD Z9, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x07, Z8, Z8 // Round 6 VPADDD Z0, Z8, Z0 VPADDD Z19, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z0, Z8, Z0 VPADDD Z29, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x07, Z8, Z8 VPADDD Z2, Z10, Z2 VPADDD Z23, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z2, Z10, Z2 VPADDD Z11, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z4, Z12, Z4 VPADDD Z17, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z4, Z12, Z4 VPADDD Z25, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z6, Z14, Z6 VPADDD Z31, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z6, Z14, Z6 VPADDD Z3, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z0, Z10, Z0 VPADDD Z27, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z0, Z10, Z0 VPADDD Z7, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z2, Z12, Z2 VPADDD Z1, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z2, Z12, Z2 VPADDD Z21, Z2, Z2 VPXORD Z24, Z2, 
Z24 VPRORD $0x08, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z4, Z14, Z4 VPADDD Z5, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z4, Z14, Z4 VPADDD Z13, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z6, Z8, Z6 VPADDD Z9, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z6, Z8, Z6 VPADDD Z15, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x07, Z8, Z8 // Round 7 VPADDD Z0, Z8, Z0 VPADDD Z23, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z0, Z8, Z0 VPADDD Z31, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x07, Z8, Z8 VPADDD Z2, Z10, Z2 VPADDD Z11, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z2, Z10, Z2 VPADDD Z1, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z4, Z12, Z4 VPADDD Z3, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z4, Z12, Z4 VPADDD Z19, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z6, Z14, Z6 VPADDD Z17, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z6, Z14, Z6 VPADDD Z13, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z0, Z10, Z0 VPADDD Z29, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z0, Z10, Z0 
VPADDD Z21, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z2, Z12, Z2 VPADDD Z5, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z2, Z12, Z2 VPADDD Z25, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z4, Z14, Z4 VPADDD Z7, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z4, Z14, Z4 VPADDD Z9, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z6, Z8, Z6 VPADDD Z15, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z6, Z8, Z6 VPADDD Z27, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x07, Z8, Z8 // Finalize CVs VPXORD Z0, Z16, Z0 VPXORD Z2, Z18, Z2 VPXORD Z4, Z20, Z4 VPXORD Z6, Z22, Z6 VPXORD Z8, Z24, Z8 VPXORD Z10, Z26, Z10 VPXORD Z12, Z28, Z12 VPXORD Z14, Z30, Z14 // Loop INCQ DX CMPQ DX, $0x00000010 JNE loop // Finished; transpose CVs VMOVDQU32 seq<>+0(SB), Z16 VPSLLD $0x05, Z16, Z16 KXNORD K1, K1, K1 VPSCATTERDD Z0, K1, (AX)(Z16*1) KXNORD K1, K1, K1 VPSCATTERDD Z2, K1, 4(AX)(Z16*1) KXNORD K1, K1, K1 VPSCATTERDD Z4, K1, 8(AX)(Z16*1) KXNORD K1, K1, K1 VPSCATTERDD Z6, K1, 12(AX)(Z16*1) KXNORD K1, K1, K1 VPSCATTERDD Z8, K1, 16(AX)(Z16*1) KXNORD K1, K1, K1 VPSCATTERDD Z10, K1, 20(AX)(Z16*1) KXNORD K1, K1, K1 VPSCATTERDD Z12, K1, 24(AX)(Z16*1) KXNORD K1, K1, K1 VPSCATTERDD Z14, K1, 28(AX)(Z16*1) RET // func compressBlocksAVX2(out *[512]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32) // Requires: AVX, AVX2 TEXT ·compressBlocksAVX2(SB), NOSPLIT, $544-40 MOVQ out+0(FP), AX MOVQ block+8(FP), CX MOVQ cv+16(FP), DX // Load block VPBROADCASTD (CX), Y0 
VMOVDQU Y0, (SP) VPBROADCASTD 4(CX), Y0 VMOVDQU Y0, 32(SP) VPBROADCASTD 8(CX), Y0 VMOVDQU Y0, 64(SP) VPBROADCASTD 12(CX), Y0 VMOVDQU Y0, 96(SP) VPBROADCASTD 16(CX), Y0 VMOVDQU Y0, 128(SP) VPBROADCASTD 20(CX), Y0 VMOVDQU Y0, 160(SP) VPBROADCASTD 24(CX), Y0 VMOVDQU Y0, 192(SP) VPBROADCASTD 28(CX), Y0 VMOVDQU Y0, 224(SP) VPBROADCASTD 32(CX), Y0 VMOVDQU Y0, 256(SP) VPBROADCASTD 36(CX), Y0 VMOVDQU Y0, 288(SP) VPBROADCASTD 40(CX), Y0 VMOVDQU Y0, 320(SP) VPBROADCASTD 44(CX), Y0 VMOVDQU Y0, 352(SP) VPBROADCASTD 48(CX), Y0 VMOVDQU Y0, 384(SP) VPBROADCASTD 52(CX), Y0 VMOVDQU Y0, 416(SP) VPBROADCASTD 56(CX), Y0 VMOVDQU Y0, 448(SP) VPBROADCASTD 60(CX), Y0 VMOVDQU Y0, 480(SP) // Initialize state vectors VPBROADCASTD (DX), Y0 VPBROADCASTD 4(DX), Y1 VPBROADCASTD 8(DX), Y2 VPBROADCASTD 12(DX), Y3 VPBROADCASTD 16(DX), Y4 VPBROADCASTD 20(DX), Y5 VPBROADCASTD 24(DX), Y6 VPBROADCASTD 28(DX), Y7 VPBROADCASTD iv<>+0(SB), Y8 VPBROADCASTD iv<>+4(SB), Y9 VPBROADCASTD iv<>+8(SB), Y10 VPBROADCASTD iv<>+12(SB), Y11 VPBROADCASTQ counter+24(FP), Y12 VPBROADCASTQ counter+24(FP), Y13 VPADDQ seq64<>+0(SB), Y12, Y12 VPADDQ seq64<>+32(SB), Y13, Y13 VPUNPCKLDQ Y13, Y12, Y14 VPUNPCKHDQ Y13, Y12, Y15 VPUNPCKLDQ Y15, Y14, Y12 VPUNPCKHDQ Y15, Y14, Y13 VPERMQ $0xd8, Y12, Y12 VPERMQ $0xd8, Y13, Y13 VPBROADCASTD blockLen+32(FP), Y14 VPBROADCASTD flags+36(FP), Y15 VMOVDQU Y8, 512(SP) // Round 1 VPADDD Y0, Y4, Y0 VPADDD (SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 32(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 64(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD 
$0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 96(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 128(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 160(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 256(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 288(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 384(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD 
$0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 416(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 448(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 2 VPADDD Y0, Y4, Y0 VPADDD 64(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 192(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 96(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 224(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD (SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 128(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, 
Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 416(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 32(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 352(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 288(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 448(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 3 VPADDD Y0, Y4, Y0 VPADDD 96(SP), Y0, Y0 VPXOR Y12, 
Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 128(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 416(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 64(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 448(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 192(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 160(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 
VPADDD 288(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD (SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 352(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 480(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 32(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 4 VPADDD Y0, Y4, Y0 VPADDD 320(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 224(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 288(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 
VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 448(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 96(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 416(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 128(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD (SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 64(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 160(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 256(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 
VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 32(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 5 VPADDD Y0, Y4, Y0 VPADDD 384(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 416(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 288(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 480(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 320(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 448(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 
256(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 224(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 64(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 96(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD (SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 32(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 128(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 6 VPADDD Y0, Y4, Y0 VPADDD 288(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD 
$0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 448(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 256(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 384(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 32(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 416(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 96(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD (SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, 
Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 64(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 192(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 128(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 7 VPADDD Y0, Y4, Y0 VPADDD 352(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 480(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD (SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 32(SP), Y2, Y2 
VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 288(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 448(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 320(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 64(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 96(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 128(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 
VPADDD Y3, Y4, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 416(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VMOVDQU 512(SP), Y8 // Finalize CVs VMOVDQU Y8, 256(SP) VMOVDQU Y9, 288(SP) VMOVDQU Y10, 320(SP) VMOVDQU Y11, 352(SP) VMOVDQU Y12, 384(SP) VMOVDQU Y13, 416(SP) VMOVDQU Y14, 448(SP) VMOVDQU Y15, 480(SP) VPXOR Y0, Y8, Y0 VPXOR Y1, Y9, Y1 VPXOR Y2, Y10, Y2 VPXOR Y3, Y11, Y3 VPXOR Y4, Y12, Y4 VPXOR Y5, Y13, Y5 VPXOR Y6, Y14, Y6 VPXOR Y7, Y15, Y7 VPUNPCKLDQ Y1, Y0, Y8 VPUNPCKHDQ Y1, Y0, Y9 VPUNPCKLDQ Y3, Y2, Y10 VPUNPCKHDQ Y3, Y2, Y11 VPUNPCKLDQ Y5, Y4, Y12 VPUNPCKHDQ Y5, Y4, Y13 VPUNPCKLDQ Y7, Y6, Y14 VPUNPCKHDQ Y7, Y6, Y15 VPUNPCKLQDQ Y10, Y8, Y0 VPUNPCKHQDQ Y10, Y8, Y1 VPUNPCKLQDQ Y11, Y9, Y2 VPUNPCKHQDQ Y11, Y9, Y3 VPUNPCKLQDQ Y14, Y12, Y4 VPUNPCKHQDQ Y14, Y12, Y5 VPUNPCKLQDQ Y15, Y13, Y6 VPUNPCKHQDQ Y15, Y13, Y7 VPERM2I128 $0x20, Y4, Y0, Y8 VPERM2I128 $0x31, Y4, Y0, Y12 VPERM2I128 $0x20, Y5, Y1, Y9 VPERM2I128 $0x31, Y5, Y1, Y13 VPERM2I128 $0x20, Y6, Y2, Y10 VPERM2I128 $0x31, Y6, Y2, Y14 VPERM2I128 $0x20, Y7, Y3, Y11 VPERM2I128 $0x31, Y7, Y3, Y15 VMOVDQU Y8, (AX) VMOVDQU Y9, 64(AX) VMOVDQU Y10, 128(AX) VMOVDQU Y11, 192(AX) VMOVDQU Y12, 256(AX) VMOVDQU Y13, 320(AX) VMOVDQU Y14, 384(AX) VMOVDQU Y15, 448(AX) VMOVDQU 256(SP), Y8 VMOVDQU 288(SP), Y9 VMOVDQU 320(SP), Y10 VMOVDQU 352(SP), Y11 VMOVDQU 384(SP), Y12 VMOVDQU 416(SP), Y13 VMOVDQU 448(SP), Y14 VMOVDQU 480(SP), Y15 VPBROADCASTD (DX), Y0 VPXOR Y0, Y8, Y8 VPBROADCASTD 4(DX), Y0 VPXOR Y0, Y9, Y9 VPBROADCASTD 8(DX), Y0 VPXOR Y0, Y10, Y10 VPBROADCASTD 12(DX), Y0 VPXOR Y0, Y11, Y11 VPBROADCASTD 16(DX), Y0 VPXOR Y0, Y12, Y12 VPBROADCASTD 20(DX), Y0 VPXOR Y0, Y13, Y13 VPBROADCASTD 24(DX), Y0 VPXOR Y0, Y14, Y14 VPBROADCASTD 28(DX), Y0 VPXOR 
Y0, Y15, Y15
	VPUNPCKLDQ Y9, Y8, Y0
	VPUNPCKHDQ Y9, Y8, Y1
	VPUNPCKLDQ Y11, Y10, Y2
	VPUNPCKHDQ Y11, Y10, Y3
	VPUNPCKLDQ Y13, Y12, Y4
	VPUNPCKHDQ Y13, Y12, Y5
	VPUNPCKLDQ Y15, Y14, Y6
	VPUNPCKHDQ Y15, Y14, Y7
	VPUNPCKLQDQ Y2, Y0, Y8
	VPUNPCKHQDQ Y2, Y0, Y9
	VPUNPCKLQDQ Y3, Y1, Y10
	VPUNPCKHQDQ Y3, Y1, Y11
	VPUNPCKLQDQ Y6, Y4, Y12
	VPUNPCKHQDQ Y6, Y4, Y13
	VPUNPCKLQDQ Y7, Y5, Y14
	VPUNPCKHQDQ Y7, Y5, Y15
	VPERM2I128 $0x20, Y12, Y8, Y0
	VPERM2I128 $0x31, Y12, Y8, Y4
	VPERM2I128 $0x20, Y13, Y9, Y1
	VPERM2I128 $0x31, Y13, Y9, Y5
	VPERM2I128 $0x20, Y14, Y10, Y2
	VPERM2I128 $0x31, Y14, Y10, Y6
	VPERM2I128 $0x20, Y15, Y11, Y3
	VPERM2I128 $0x31, Y15, Y11, Y7
	VMOVDQU Y0, 32(AX)
	VMOVDQU Y1, 96(AX)
	VMOVDQU Y2, 160(AX)
	VMOVDQU Y3, 224(AX)
	VMOVDQU Y4, 288(AX)
	VMOVDQU Y5, 352(AX)
	VMOVDQU Y6, 416(AX)
	VMOVDQU Y7, 480(AX)
	VZEROUPPER
	RET

// func compressChunksAVX2(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32)
// Requires: AVX, AVX2
//
// compressChunksAVX2 runs eight compressions side by side, one per 32-bit
// lane of the YMM registers, and stores the eight resulting 8-word chaining
// values to cvs. Lane i gathers its input from buf at byte offset seq[i]<<10
// (presumably i*1024, i.e. eight consecutive 1024-byte chunks; seq<> is
// defined elsewhere in this file -- confirm). Each of the 16 loop iterations
// consumes one 64-byte block per lane.
//
// Register use: Y0-Y7 hold the chaining value (seeded from key) and persist
// across iterations; Y8-Y15 are reloaded at the top of every iteration.
//
// Stack frame (672 bytes):
//     0..511(SP)  16 transposed message words of the current block, 32 bytes each
//   512..543(SP)  low 32 bits of each lane's counter
//   544..575(SP)  high 32 bits of each lane's counter
//   576..639(SP)  16 flag words, one per loop iteration
//   640..671(SP)  spill slot for state vector Y8 (Y8 doubles as scratch)
TEXT ·compressChunksAVX2(SB), NOSPLIT, $672-36
	MOVQ cvs+0(FP), AX
	MOVQ buf+8(FP), CX
	MOVQ key+16(FP), DX

	// Load key
	// Each key word is broadcast to all eight lanes, so every lane starts
	// from the same chaining value.
	VPBROADCASTD (DX), Y0
	VPBROADCASTD 4(DX), Y1
	VPBROADCASTD 8(DX), Y2
	VPBROADCASTD 12(DX), Y3
	VPBROADCASTD 16(DX), Y4
	VPBROADCASTD 20(DX), Y5
	VPBROADCASTD 24(DX), Y6
	VPBROADCASTD 28(DX), Y7

	// Initialize counter
	// Eight 64-bit values counter+seq64[i] are formed in Y12/Y13, then the
	// unpack/permute sequence de-interleaves them so that 512(SP) receives the
	// eight low dwords and 544(SP) the eight high dwords, in lane order.
	VPBROADCASTQ counter+24(FP), Y12
	VPBROADCASTQ counter+24(FP), Y13
	VPADDQ seq64<>+0(SB), Y12, Y12
	VPADDQ seq64<>+32(SB), Y13, Y13
	VPUNPCKLDQ Y13, Y12, Y14
	VPUNPCKHDQ Y13, Y12, Y15
	VPUNPCKLDQ Y15, Y14, Y12
	VPUNPCKHDQ Y15, Y14, Y13
	VPERMQ $0xd8, Y12, Y12
	VPERMQ $0xd8, Y13, Y13
	VMOVDQU Y12, 512(SP)
	VMOVDQU Y13, 544(SP)

	// Initialize flags
	// Fill all 16 per-iteration flag slots with the caller's flags, then OR
	// 0x01 into the first slot and 0x02 into the last one (576+15*4 = 636).
	// NOTE(review): presumably BLAKE3's CHUNK_START / CHUNK_END bits -- confirm.
	VPBROADCASTD flags+32(FP), Y14
	VMOVDQU Y14, 576(SP)
	VMOVDQU Y14, 608(SP)
	ORL $0x01, 576(SP)
	ORL $0x02, 636(SP)

	// Loop index
	XORQ DX, DX

loop:
	// Load transposed block
	// Y9 = seq<> << 10 gives the per-lane byte offsets. Each gather pulls one
	// dword per lane, so the result is message word k for all eight lanes.
	// VPGATHERDD clears its mask register, hence Y8 is reset to all-ones
	// (VPCMPEQD of a register with itself) before every gather.
	VMOVDQU seq<>+0(SB), Y9
	VPSLLD $0x0a, Y9, Y9
	VPCMPEQD Y8, Y8, Y8
	VPGATHERDD Y8, (CX)(Y9*1), Y10
	VMOVDQU Y10, (SP)
	VPCMPEQD Y8, Y8, Y8
	VPGATHERDD Y8, 4(CX)(Y9*1), Y10
	VMOVDQU Y10, 32(SP)
	VPCMPEQD Y8, Y8, Y8
	VPGATHERDD Y8, 8(CX)(Y9*1), Y10
	VMOVDQU Y10, 64(SP)
	VPCMPEQD Y8, Y8, Y8
	VPGATHERDD Y8, 12(CX)(Y9*1), Y10
	VMOVDQU Y10, 96(SP)
	VPCMPEQD Y8, Y8, Y8
	VPGATHERDD Y8, 16(CX)(Y9*1), Y10
	VMOVDQU Y10, 128(SP)
	VPCMPEQD Y8, Y8, Y8
	VPGATHERDD Y8, 20(CX)(Y9*1), Y10
	VMOVDQU Y10, 160(SP)
	VPCMPEQD Y8, Y8, Y8
	VPGATHERDD Y8, 24(CX)(Y9*1), Y10
	VMOVDQU Y10, 192(SP)
	VPCMPEQD Y8, Y8, Y8
	VPGATHERDD Y8, 28(CX)(Y9*1), Y10
	VMOVDQU Y10, 224(SP)
	VPCMPEQD Y8, Y8, Y8
	VPGATHERDD Y8, 32(CX)(Y9*1), Y10
	VMOVDQU Y10, 256(SP)
	VPCMPEQD Y8, Y8, Y8
	VPGATHERDD Y8, 36(CX)(Y9*1), Y10
	VMOVDQU Y10, 288(SP)
	VPCMPEQD Y8, Y8, Y8
	VPGATHERDD Y8, 40(CX)(Y9*1), Y10
	VMOVDQU Y10, 320(SP)
	VPCMPEQD Y8, Y8, Y8
	VPGATHERDD Y8, 44(CX)(Y9*1), Y10
	VMOVDQU Y10, 352(SP)
	VPCMPEQD Y8, Y8, Y8
	VPGATHERDD Y8, 48(CX)(Y9*1), Y10
	VMOVDQU Y10, 384(SP)
	VPCMPEQD Y8, Y8, Y8
	VPGATHERDD Y8, 52(CX)(Y9*1), Y10
	VMOVDQU Y10, 416(SP)
	VPCMPEQD Y8, Y8, Y8
	VPGATHERDD Y8, 56(CX)(Y9*1), Y10
	VMOVDQU Y10, 448(SP)
	VPCMPEQD Y8, Y8, Y8
	VPGATHERDD Y8, 60(CX)(Y9*1), Y10
	VMOVDQU Y10, 480(SP)
	ADDQ $0x40, CX // advance to the next 64-byte block of every lane

	// Reload state vectors (other than CVs)
	// Y8-Y11: first four iv<> words; Y12/Y13: counter low/high halves;
	// Y14: seq<>+4 broadcast and shifted left by 6 (presumably 1<<6 = 64, the
	// block length -- confirm against seq<>); Y15: flag word for iteration DX.
	VPBROADCASTD iv<>+0(SB), Y8
	VPBROADCASTD iv<>+4(SB), Y9
	VPBROADCASTD iv<>+8(SB), Y10
	VPBROADCASTD iv<>+12(SB), Y11
	VMOVDQU 512(SP), Y12
	VMOVDQU 544(SP), Y13
	VPBROADCASTD seq<>+4(SB), Y14
	VPSLLD $0x06, Y14, Y14
	VPBROADCASTD 576(SP)(DX*4), Y15
	VMOVDQU Y8, 640(SP) // park Y8 in its spill slot; the register is scratch from here on

	// Round 1
	// Every round applies the same quarter-round to the four columns
	// (Y0,Y4,Y8,Y12) .. (Y3,Y7,Y11,Y15) and then to the four diagonals
	// (Y0,Y5,Y10,Y15), (Y1,Y6,Y11,Y12), (Y2,Y7,Y8,Y13), (Y3,Y4,Y9,Y14).
	// Only the message-word offsets into 0..511(SP) differ between rounds.
	// Quarter-rounds that touch state vector 8 reload and re-spill it
	// around each use because Y8 is the temporary for the shift-based rotates.

	// Column (Y0, Y4, Y8, Y12)
	VPADDD Y0, Y4, Y0
	VPADDD (SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 // presumably a byte shuffle that rotates each dword by 16 bits (table defined elsewhere)
	VMOVDQU 640(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 640(SP)
	VPSRLD $0x0c, Y4, Y8 // rotate right by 12: (x >> 12) | (x << 20)
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y0, Y4, Y0
	VPADDD 32(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 // presumably the 8-bit counterpart of shuffle_rot16
	VMOVDQU 640(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 640(SP)
	VPSRLD $0x07, Y4, Y8 // rotate right by 7: (x >> 7) | (x << 25)
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4

	// Column (Y1, Y5, Y9, Y13)
	VPADDD Y1, Y5, Y1
	VPADDD 64(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y5, Y1
	VPADDD 96(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5

	// Column (Y2, Y6, Y10, Y14)
	VPADDD Y2, Y6, Y2
	VPADDD 128(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y6, Y2
	VPADDD 160(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6

	// Column (Y3, Y7, Y11, Y15)
	VPADDD Y3, Y7, Y3
	VPADDD 192(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y7, Y3
	VPADDD 224(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7

	// Diagonal (Y0, Y5, Y10, Y15)
	VPADDD Y0, Y5, Y0
	VPADDD 256(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y0, Y5, Y0
	VPADDD 288(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5

	// Diagonal (Y1, Y6, Y11, Y12)
	VPADDD Y1, Y6, Y1
	VPADDD 320(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y1, Y6, Y1
	VPADDD 352(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6

	// Diagonal (Y2, Y7, Y8, Y13)
	VPADDD Y2, Y7, Y2
	VPADDD 384(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 640(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 640(SP)
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y2, Y7, Y2
	VPADDD 416(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 640(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 640(SP)
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7

	// Diagonal (Y3, Y4, Y9, Y14)
	VPADDD Y3, Y4, Y3
	VPADDD 448(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y3, Y4, Y3
	VPADDD 480(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4

	// Round 2
	VPADDD Y0, Y4, Y0
	VPADDD 64(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 640(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 640(SP)
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y0, Y4, Y0
	VPADDD 192(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 640(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 640(SP)
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y1, Y5, Y1
	VPADDD 96(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y5, Y1
	VPADDD 320(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y2, Y6, Y2
	VPADDD 224(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y6, Y2
	VPADDD (SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y3, Y7, Y3
	VPADDD 128(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y7, Y3
	VPADDD 416(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y0, Y5, Y0
	VPADDD 32(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y0, Y5, Y0
	VPADDD 352(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y6, Y1
	VPADDD 384(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y1, Y6, Y1
	VPADDD 160(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y7, Y2
	VPADDD 288(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 640(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 640(SP)
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y2, Y7, Y2
	VPADDD 448(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 640(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 640(SP)
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y4, Y3
	VPADDD 480(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y3, Y4, Y3
	VPADDD 256(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4

	// Round 3
	VPADDD Y0, Y4, Y0
	VPADDD 96(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 640(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 640(SP)
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y0, Y4, Y0
	VPADDD 128(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 640(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 640(SP)
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y1, Y5, Y1
	VPADDD 320(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y5, Y1
	VPADDD 384(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y2, Y6, Y2
	VPADDD 416(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y6, Y2
	VPADDD 64(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y3, Y7, Y3
	VPADDD 224(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y7, Y3
	VPADDD 448(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y0, Y5, Y0
	VPADDD 192(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y0, Y5, Y0
	VPADDD 160(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y6, Y1
	VPADDD 288(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y1, Y6, Y1
	VPADDD (SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y7, Y2
	VPADDD 352(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 640(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 640(SP)
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y2, Y7, Y2
	VPADDD 480(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 640(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 640(SP)
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y4, Y3
	VPADDD 256(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y3, Y4, Y3
	VPADDD 32(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4

	// Round 4
	VPADDD Y0, Y4, Y0
	VPADDD 320(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 640(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 640(SP)
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y0, Y4, Y0
	VPADDD 224(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 640(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 640(SP)
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y1, Y5, Y1
	VPADDD 384(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y5, Y1
	VPADDD 288(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y2, Y6, Y2
	VPADDD 448(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y6, Y2
	VPADDD 96(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y3, Y7, Y3
	VPADDD 416(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y7, Y3
	VPADDD 480(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y0, Y5, Y0
	VPADDD 128(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y0, Y5, Y0
	VPADDD (SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y6, Y1
	VPADDD 352(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y1, Y6, Y1
	VPADDD 64(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y7, Y2
	VPADDD 160(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 640(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 640(SP)
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y2, Y7, Y2
	VPADDD 256(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 640(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 640(SP)
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y4, Y3
	VPADDD 32(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y3, Y4, Y3
	VPADDD 192(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4

	// Round 5
	VPADDD Y0, Y4, Y0
	VPADDD 384(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 640(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 640(SP)
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y0, Y4, Y0
	VPADDD 416(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 640(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 640(SP)
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y1, Y5, Y1
	VPADDD 288(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y5, Y1
	VPADDD 352(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y2, Y6, Y2
	VPADDD 480(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y6, Y2
	VPADDD 320(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y3, Y7, Y3
	VPADDD 448(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y7, Y3
	VPADDD 256(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y0, Y5, Y0
	VPADDD 224(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y0, Y5, Y0
	VPADDD 64(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y6, Y1
	VPADDD 160(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y1, Y6, Y1
	VPADDD 96(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y7, Y2
	VPADDD (SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 640(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 640(SP)
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y2, Y7, Y2
	VPADDD 32(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 640(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 640(SP)
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y4, Y3
	VPADDD 192(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y3, Y4, Y3
	VPADDD 128(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4

	// Round 6
	VPADDD Y0, Y4, Y0
	VPADDD 288(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 640(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 640(SP)
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y0, Y4, Y0
	VPADDD 448(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 640(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 640(SP)
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y1, Y5, Y1
	VPADDD 352(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y5, Y1
	VPADDD 160(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y2, Y6, Y2
	VPADDD 256(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y6, Y2
	VPADDD 384(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y3, Y7, Y3
	VPADDD 480(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y7, Y3
	VPADDD 32(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y0, Y5, Y0
	VPADDD 416(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y0, Y5, Y0
	VPADDD 96(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y6, Y1
	VPADDD (SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y1, Y6, Y1
	VPADDD 320(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y7, Y2
	VPADDD 64(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 640(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 640(SP)
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y2, Y7, Y2
	VPADDD 192(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 640(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 640(SP)
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y4, Y3
	VPADDD 128(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y3, Y4, Y3
	VPADDD 224(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4

	// Round 7
	VPADDD Y0, Y4, Y0
	VPADDD 352(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 640(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 640(SP)
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y0, Y4, Y0
	VPADDD 480(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 640(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 640(SP)
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y1, Y5, Y1
	VPADDD 160(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y5, Y1
	VPADDD (SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y2, Y6, Y2
	VPADDD 32(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y6, Y2
	VPADDD 288(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y3, Y7, Y3
	VPADDD 256(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y7, Y3
	VPADDD 192(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y0, Y5, Y0
	VPADDD 448(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y0, Y5, Y0
	VPADDD 320(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y6, Y1
	VPADDD 64(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y1, Y6, Y1
	VPADDD 384(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y7, Y2
	VPADDD 96(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 640(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 640(SP)
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y2, Y7, Y2
	VPADDD 128(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 640(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 640(SP)
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y4, Y3
	VPADDD 224(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y3, Y4, Y3
	VPADDD 416(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4
	VMOVDQU 640(SP), Y8 // bring state vector 8 back from its spill slot

	// Finalize CVs
	// The new chaining value is the XOR of state vectors 0-7 with 8-15; it
	// stays in Y0-Y7 as the input chaining value of the next iteration.
	VPXOR Y0, Y8, Y0
	VPXOR Y1, Y9, Y1
	VPXOR Y2, Y10, Y2
	VPXOR Y3, Y11, Y3
	VPXOR Y4, Y12, Y4
	VPXOR Y5, Y13, Y5
	VPXOR Y6, Y14, Y6
	VPXOR Y7, Y15, Y7

	// Loop
	// 16 iterations in total; DX also indexes the per-iteration flag word.
	INCQ DX
	CMPQ DX, $0x00000010
	JNE loop

	// Finished; transpose CVs
	// Y0-Y7 hold word k of every lane; the dword/qword unpacks followed by
	// the 128-bit permutes regroup them so that each output vector holds the
	// eight words of one lane, stored as eight consecutive 32-byte rows of cvs.
	VPUNPCKLDQ Y1, Y0, Y8
	VPUNPCKHDQ Y1, Y0, Y9
	VPUNPCKLDQ Y3, Y2, Y10
	VPUNPCKHDQ Y3, Y2, Y11
	VPUNPCKLDQ Y5, Y4, Y12
	VPUNPCKHDQ Y5, Y4, Y13
	VPUNPCKLDQ Y7, Y6, Y14
	VPUNPCKHDQ Y7, Y6, Y15
	VPUNPCKLQDQ Y10, Y8, Y0
	VPUNPCKHQDQ Y10, Y8, Y1
	VPUNPCKLQDQ Y11, Y9, Y2
	VPUNPCKHQDQ Y11, Y9, Y3
	VPUNPCKLQDQ Y14, Y12, Y4
	VPUNPCKHQDQ Y14, Y12, Y5
	VPUNPCKLQDQ Y15, Y13, Y6
	VPUNPCKHQDQ Y15, Y13, Y7
	VPERM2I128 $0x20, Y4, Y0, Y8
	VPERM2I128 $0x31, Y4, Y0, Y12
	VPERM2I128 $0x20, Y5, Y1, Y9
	VPERM2I128 $0x31, Y5, Y1, Y13
	VPERM2I128 $0x20, Y6, Y2, Y10
	VPERM2I128 $0x31, Y6, Y2, Y14
	VPERM2I128 $0x20, Y7, Y3, Y11
	VPERM2I128 $0x31, Y7, Y3, Y15
	VMOVDQU Y8, (AX)
	VMOVDQU Y9, 32(AX)
	VMOVDQU Y10, 64(AX)
	VMOVDQU Y11, 96(AX)
	VMOVDQU Y12, 128(AX)
	VMOVDQU Y13, 160(AX)
	VMOVDQU Y14, 192(AX)
	VMOVDQU Y15, 224(AX)
	VZEROUPPER
	RET

// func compressParentsAVX2(parents *[8][8]uint32, cvs *[16][8]uint32, key *[8]uint32, flags uint32)
// Requires: AVX, AVX2
TEXT ·compressParentsAVX2(SB), NOSPLIT, $544-28
	MOVQ parents+0(FP), AX
	MOVQ cvs+8(FP), CX
	MOVQ key+16(FP), DX

	// Load transposed block
	VMOVDQU seq<>+0(SB), Y9
	VPSLLD $0x06, Y9, Y9
	VPCMPEQD Y8, Y8, Y8
	VPGATHERDD Y8, (CX)(Y9*1), Y10
	VMOVDQU Y10, (SP)
	VPCMPEQD Y8, Y8, Y8
	VPGATHERDD Y8, 4(CX)(Y9*1), Y10
	VMOVDQU Y10, 32(SP)
	VPCMPEQD Y8, Y8, Y8
	VPGATHERDD Y8, 8(CX)(Y9*1), Y10
	VMOVDQU Y10, 64(SP)
	VPCMPEQD Y8, Y8, Y8
	VPGATHERDD Y8, 12(CX)(Y9*1), Y10
	VMOVDQU Y10, 96(SP)
VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 16(CX)(Y9*1), Y10 VMOVDQU Y10, 128(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 20(CX)(Y9*1), Y10 VMOVDQU Y10, 160(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 24(CX)(Y9*1), Y10 VMOVDQU Y10, 192(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 28(CX)(Y9*1), Y10 VMOVDQU Y10, 224(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 32(CX)(Y9*1), Y10 VMOVDQU Y10, 256(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 36(CX)(Y9*1), Y10 VMOVDQU Y10, 288(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 40(CX)(Y9*1), Y10 VMOVDQU Y10, 320(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 44(CX)(Y9*1), Y10 VMOVDQU Y10, 352(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 48(CX)(Y9*1), Y10 VMOVDQU Y10, 384(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 52(CX)(Y9*1), Y10 VMOVDQU Y10, 416(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 56(CX)(Y9*1), Y10 VMOVDQU Y10, 448(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 60(CX)(Y9*1), Y10 VMOVDQU Y10, 480(SP) // Initialize state vectors VPBROADCASTD (DX), Y0 VPBROADCASTD 4(DX), Y1 VPBROADCASTD 8(DX), Y2 VPBROADCASTD 12(DX), Y3 VPBROADCASTD 16(DX), Y4 VPBROADCASTD 20(DX), Y5 VPBROADCASTD 24(DX), Y6 VPBROADCASTD 28(DX), Y7 VPBROADCASTD iv<>+0(SB), Y8 VPBROADCASTD iv<>+4(SB), Y9 VPBROADCASTD iv<>+8(SB), Y10 VPBROADCASTD iv<>+12(SB), Y11 VPXOR Y12, Y12, Y12 VPXOR Y13, Y13, Y13 VPBROADCASTD seq<>+4(SB), Y14 VPSLLD $0x06, Y14, Y14 ORL $0x04, flags+24(FP) VPBROADCASTD flags+24(FP), Y15 VMOVDQU Y8, 512(SP) // Round 1 VPADDD Y0, Y4, Y0 VPADDD (SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 32(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 64(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 
VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 96(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 128(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 160(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 256(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 288(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 384(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) 
VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 416(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 448(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 2 VPADDD Y0, Y4, Y0 VPADDD 64(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 192(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 96(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 224(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD (SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 128(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB 
shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 416(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 32(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 352(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 288(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 448(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 3 VPADDD Y0, Y4, Y0 VPADDD 
96(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 128(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 416(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 64(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 448(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 192(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 160(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, 
Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 288(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD (SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 352(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 480(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 32(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 4 VPADDD Y0, Y4, Y0 VPADDD 320(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 224(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 288(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB 
shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 448(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 96(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 416(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 128(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD (SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 64(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 160(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 256(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB 
shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 32(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 5 VPADDD Y0, Y4, Y0 VPADDD 384(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 416(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 288(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 480(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 320(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 448(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, 
Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 224(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 64(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 96(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD (SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 32(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 128(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 6 VPADDD Y0, Y4, Y0 VPADDD 288(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, 
Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 448(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 256(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 384(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 32(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 416(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 96(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD (SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 
VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 64(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 192(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 128(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 7 VPADDD Y0, Y4, Y0 VPADDD 352(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 480(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD (SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, 
Y6, Y2 VPADDD 32(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 288(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 448(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 320(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 64(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 96(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 128(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD 
$0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 416(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VMOVDQU 512(SP), Y8 // Finalize CVs VPXOR Y0, Y8, Y0 VPXOR Y1, Y9, Y1 VPXOR Y2, Y10, Y2 VPXOR Y3, Y11, Y3 VPXOR Y4, Y12, Y4 VPXOR Y5, Y13, Y5 VPXOR Y6, Y14, Y6 VPXOR Y7, Y15, Y7 VPUNPCKLDQ Y1, Y0, Y8 VPUNPCKHDQ Y1, Y0, Y9 VPUNPCKLDQ Y3, Y2, Y10 VPUNPCKHDQ Y3, Y2, Y11 VPUNPCKLDQ Y5, Y4, Y12 VPUNPCKHDQ Y5, Y4, Y13 VPUNPCKLDQ Y7, Y6, Y14 VPUNPCKHDQ Y7, Y6, Y15 VPUNPCKLQDQ Y10, Y8, Y0 VPUNPCKHQDQ Y10, Y8, Y1 VPUNPCKLQDQ Y11, Y9, Y2 VPUNPCKHQDQ Y11, Y9, Y3 VPUNPCKLQDQ Y14, Y12, Y4 VPUNPCKHQDQ Y14, Y12, Y5 VPUNPCKLQDQ Y15, Y13, Y6 VPUNPCKHQDQ Y15, Y13, Y7 VPERM2I128 $0x20, Y4, Y0, Y8 VPERM2I128 $0x31, Y4, Y0, Y12 VPERM2I128 $0x20, Y5, Y1, Y9 VPERM2I128 $0x31, Y5, Y1, Y13 VPERM2I128 $0x20, Y6, Y2, Y10 VPERM2I128 $0x31, Y6, Y2, Y14 VPERM2I128 $0x20, Y7, Y3, Y11 VPERM2I128 $0x31, Y7, Y3, Y15 VMOVDQU Y8, (AX) VMOVDQU Y9, 32(AX) VMOVDQU Y10, 64(AX) VMOVDQU Y11, 96(AX) VMOVDQU Y12, 128(AX) VMOVDQU Y13, 160(AX) VMOVDQU Y14, 192(AX) VMOVDQU Y15, 224(AX) VZEROUPPER RET blake3-1.3.0/guts/compress_generic.go000066400000000000000000000131631461472055300175270ustar00rootroot00000000000000package guts import ( "bytes" "math/bits" ) // CompressNode compresses a node into a 16-word output. 
func CompressNode(n Node) (out [16]uint32) {
	// g is the mixing step applied to four state words (one column or one
	// diagonal of the 4x4 state), folding in the two message words mx and my.
	// bits.RotateLeft32 with a negative count rotates right, so the four
	// rotations are right-rotations by 16, 12, 8 and 7 bits.
	g := func(a, b, c, d, mx, my uint32) (uint32, uint32, uint32, uint32) {
		a += b + mx
		d = bits.RotateLeft32(d^a, -16)
		c += d
		b = bits.RotateLeft32(b^c, -12)
		a += b + my
		d = bits.RotateLeft32(d^a, -8)
		c += d
		b = bits.RotateLeft32(b^c, -7)
		return a, b, c, d
	}

	// NOTE: we unroll all of the rounds, as well as the permutations that occur
	// between rounds.
	//
	// Because the permutation is unrolled, n.Block itself is never shuffled;
	// each round below simply indexes the message words in the order that
	// round requires. The index pairs are therefore order-critical.

	// round 1 (also initializes state)
	//
	// The state is seeded directly through the arguments of the first four g
	// calls: s0..s7 come from n.CV, s8..s11 from IV[0..3], s12/s13 from the low
	// and high 32 bits of n.Counter, s14 from n.BlockLen and s15 from n.Flags.
	// columns
	s0, s4, s8, s12 := g(n.CV[0], n.CV[4], IV[0], uint32(n.Counter), n.Block[0], n.Block[1])
	s1, s5, s9, s13 := g(n.CV[1], n.CV[5], IV[1], uint32(n.Counter>>32), n.Block[2], n.Block[3])
	s2, s6, s10, s14 := g(n.CV[2], n.CV[6], IV[2], n.BlockLen, n.Block[4], n.Block[5])
	s3, s7, s11, s15 := g(n.CV[3], n.CV[7], IV[3], n.Flags, n.Block[6], n.Block[7])
	// diagonals
	s0, s5, s10, s15 = g(s0, s5, s10, s15, n.Block[8], n.Block[9])
	s1, s6, s11, s12 = g(s1, s6, s11, s12, n.Block[10], n.Block[11])
	s2, s7, s8, s13 = g(s2, s7, s8, s13, n.Block[12], n.Block[13])
	s3, s4, s9, s14 = g(s3, s4, s9, s14, n.Block[14], n.Block[15])

	// round 2
	// (rounds 2-7 follow the same layout as round 1: four column calls, then
	// four diagonal calls)
	s0, s4, s8, s12 = g(s0, s4, s8, s12, n.Block[2], n.Block[6])
	s1, s5, s9, s13 = g(s1, s5, s9, s13, n.Block[3], n.Block[10])
	s2, s6, s10, s14 = g(s2, s6, s10, s14, n.Block[7], n.Block[0])
	s3, s7, s11, s15 = g(s3, s7, s11, s15, n.Block[4], n.Block[13])
	s0, s5, s10, s15 = g(s0, s5, s10, s15, n.Block[1], n.Block[11])
	s1, s6, s11, s12 = g(s1, s6, s11, s12, n.Block[12], n.Block[5])
	s2, s7, s8, s13 = g(s2, s7, s8, s13, n.Block[9], n.Block[14])
	s3, s4, s9, s14 = g(s3, s4, s9, s14, n.Block[15], n.Block[8])

	// round 3
	s0, s4, s8, s12 = g(s0, s4, s8, s12, n.Block[3], n.Block[4])
	s1, s5, s9, s13 = g(s1, s5, s9, s13, n.Block[10], n.Block[12])
	s2, s6, s10, s14 = g(s2, s6, s10, s14, n.Block[13], n.Block[2])
	s3, s7, s11, s15 = g(s3, s7, s11, s15, n.Block[7], n.Block[14])
	s0, s5, s10, s15 = g(s0, s5, s10, s15, n.Block[6], n.Block[5])
	s1, s6, s11, s12 = g(s1, s6, s11, s12, n.Block[9], n.Block[0])
	s2, s7, s8, s13 = g(s2, s7, s8, s13, n.Block[11], n.Block[15])
	s3, s4, s9, s14 = g(s3, s4, s9, s14, n.Block[8], n.Block[1])

	// round 4
	s0, s4, s8, s12 = g(s0, s4, s8, s12, n.Block[10], n.Block[7])
	s1, s5, s9, s13 = g(s1, s5, s9, s13, n.Block[12], n.Block[9])
	s2, s6, s10, s14 = g(s2, s6, s10, s14, n.Block[14], n.Block[3])
	s3, s7, s11, s15 = g(s3, s7, s11, s15, n.Block[13], n.Block[15])
	s0, s5, s10, s15 = g(s0, s5, s10, s15, n.Block[4], n.Block[0])
	s1, s6, s11, s12 = g(s1, s6, s11, s12, n.Block[11], n.Block[2])
	s2, s7, s8, s13 = g(s2, s7, s8, s13, n.Block[5], n.Block[8])
	s3, s4, s9, s14 = g(s3, s4, s9, s14, n.Block[1], n.Block[6])

	// round 5
	s0, s4, s8, s12 = g(s0, s4, s8, s12, n.Block[12], n.Block[13])
	s1, s5, s9, s13 = g(s1, s5, s9, s13, n.Block[9], n.Block[11])
	s2, s6, s10, s14 = g(s2, s6, s10, s14, n.Block[15], n.Block[10])
	s3, s7, s11, s15 = g(s3, s7, s11, s15, n.Block[14], n.Block[8])
	s0, s5, s10, s15 = g(s0, s5, s10, s15, n.Block[7], n.Block[2])
	s1, s6, s11, s12 = g(s1, s6, s11, s12, n.Block[5], n.Block[3])
	s2, s7, s8, s13 = g(s2, s7, s8, s13, n.Block[0], n.Block[1])
	s3, s4, s9, s14 = g(s3, s4, s9, s14, n.Block[6], n.Block[4])

	// round 6
	s0, s4, s8, s12 = g(s0, s4, s8, s12, n.Block[9], n.Block[14])
	s1, s5, s9, s13 = g(s1, s5, s9, s13, n.Block[11], n.Block[5])
	s2, s6, s10, s14 = g(s2, s6, s10, s14, n.Block[8], n.Block[12])
	s3, s7, s11, s15 = g(s3, s7, s11, s15, n.Block[15], n.Block[1])
	s0, s5, s10, s15 = g(s0, s5, s10, s15, n.Block[13], n.Block[3])
	s1, s6, s11, s12 = g(s1, s6, s11, s12, n.Block[0], n.Block[10])
	s2, s7, s8, s13 = g(s2, s7, s8, s13, n.Block[2], n.Block[6])
	s3, s4, s9, s14 = g(s3, s4, s9, s14, n.Block[4], n.Block[7])

	// round 7
	s0, s4, s8, s12 = g(s0, s4, s8, s12, n.Block[11], n.Block[15])
	s1, s5, s9, s13 = g(s1, s5, s9, s13, n.Block[5], n.Block[0])
	s2, s6, s10, s14 = g(s2, s6, s10, s14, n.Block[1], n.Block[9])
	s3, s7, s11, s15 = g(s3, s7, s11, s15, n.Block[8], n.Block[6])
	s0, s5, s10, s15 = g(s0, s5, s10, s15, n.Block[14], n.Block[10])
	s1, s6, s11, s12 = g(s1, s6, s11, s12, n.Block[2], n.Block[12])
	s2, s7, s8, s13 = g(s2, s7, s8, s13, n.Block[3], n.Block[4])
	s3, s4, s9, s14 = g(s3, s4, s9, s14, n.Block[7], n.Block[13])

	// finalization
	//
	// Output words 0-7 are the lower state half XORed with the upper half;
	// output words 8-15 are the upper state half XORed with the input CV.
	return [16]uint32{
		s0 ^ s8, s1 ^ s9, s2 ^ s10, s3 ^ s11,
		s4 ^ s12, s5 ^ s13, s6 ^ s14, s7 ^ s15,
		s8 ^ n.CV[0], s9 ^ n.CV[1], s10 ^ n.CV[2], s11 ^ n.CV[3],
		s12 ^ n.CV[4], s13 ^ n.CV[5], s14 ^ n.CV[6], s15 ^ n.CV[7],
	}
}

// ChainingValue compresses n and returns the first 8 output words.
func ChainingValue(n Node) (cv [8]uint32) {
	full := CompressNode(n)
	// copy stops at len(cv), so only words 0-7 of the 16-word output are kept.
	copy(cv[:], full[:])
	return
}

// compressBufferGeneric is the portable implementation behind CompressBuffer.
// It hashes the first buflen bytes of buf and returns the resulting node.
//
// If the data fits in a single chunk (buflen <= ChunkSize) the chunk's own
// node is returned directly. Otherwise the data is cut into ChunkSize pieces
// (the last one may be shorter), each piece is compressed with counter values
// counter, counter+1, ... and reduced to its chaining value, and the chaining
// values are combined by mergeSubtrees.
func compressBufferGeneric(buf *[MaxSIMD * ChunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) (n Node) {
	if buflen <= ChunkSize {
		return CompressChunk(buf[:buflen], key, counter, flags)
	}
	var cvs [MaxSIMD][8]uint32
	var numCVs uint64
	// bb.Next consumes up to ChunkSize bytes per iteration; numCVs doubles as
	// the chunk's offset from the starting counter.
	for bb := bytes.NewBuffer(buf[:buflen]); bb.Len() > 0; numCVs++ {
		cvs[numCVs] = ChainingValue(CompressChunk(bb.Next(ChunkSize), key, counter+numCVs, flags))
	}
	return mergeSubtrees(&cvs, numCVs, key, flags)
}

// compressBlocksGeneric fills every entry of outs with the 64-byte
// serialization of CompressNode(n), incrementing n.Counter after each entry.
// n is received by value, so the caller's node is not modified.
func compressBlocksGeneric(outs *[MaxSIMD][64]byte, n Node) {
	for i := range outs {
		outs[i] = WordsToBytes(CompressNode(n))
		n.Counter++
	}
}

// mergeSubtreesGeneric reduces the first numCVs chaining values in cvs to a
// single parent node. While more than two values remain, adjacent pairs are
// replaced by the chaining value of their parent and an unpaired trailing
// value is carried up unchanged. The parent of the final two values is
// returned as a Node (it is not compressed here).
//
// cvs is overwritten in place. The final step always reads cvs[0] and cvs[1].
// NOTE(review): this presumes numCVs >= 2; the only caller visible here
// (compressBufferGeneric) reaches it only when buflen > ChunkSize -- confirm
// for other callers.
func mergeSubtreesGeneric(cvs *[MaxSIMD][8]uint32, numCVs uint64, key *[8]uint32, flags uint32) Node {
	for numCVs > 2 {
		rem := numCVs / 2
		for i := range cvs[:rem] {
			cvs[i] = ChainingValue(ParentNode(cvs[i*2], cvs[i*2+1], key, flags))
		}
		if numCVs%2 != 0 {
			// Odd count: move the leftover value next to the merged ones.
			cvs[rem] = cvs[rem*2]
			rem++
		}
		numCVs = rem
	}
	return ParentNode(cvs[0], cvs[1], key, flags)
}
blake3-1.3.0/guts/compress_noasm.go000066400000000000000000000035411461472055300172270ustar00rootroot00000000000000
//go:build !amd64
// +build !amd64

package guts

import "encoding/binary"

// CompressBuffer compresses up to MaxSIMD chunks in parallel and returns their
// root node.
func CompressBuffer(buf *[MaxSIMD * ChunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) Node {
	// This is the !amd64 build of the package: forward straight to the
	// portable implementation.
	return compressBufferGeneric(buf, buflen, key, counter, flags)
}

// CompressChunk compresses a single chunk, returning its final (uncompressed)
// node.
//
// Every block except the last is folded into the running chaining value. The
// last block is zero-padded, tagged with FlagChunkEnd and returned without
// being compressed. An empty chunk yields a node with BlockLen 0 that carries
// both FlagChunkStart and FlagChunkEnd.
func CompressChunk(chunk []byte, key *[8]uint32, counter uint64, flags uint32) Node {
	n := Node{
		CV:       *key,
		Counter:  counter,
		BlockLen: BlockSize,
		Flags:    flags | FlagChunkStart,
	}
	var block [BlockSize]byte
	// Strictly greater-than: when len(chunk) is an exact multiple of
	// BlockSize, the final full block is left for the FlagChunkEnd node below.
	for len(chunk) > BlockSize {
		copy(block[:], chunk)
		chunk = chunk[BlockSize:]
		n.Block = BytesToWords(block)
		n.CV = ChainingValue(n)
		// Only the first block of the chunk carries FlagChunkStart.
		n.Flags &^= FlagChunkStart
	}
	// pad last block with zeros
	block = [BlockSize]byte{}
	// copy reports how many bytes of real data the last block holds.
	n.BlockLen = uint32(copy(block[:], chunk))
	n.Block = BytesToWords(block)
	n.Flags |= FlagChunkEnd
	return n
}

// CompressBlocks compresses MaxSIMD copies of n with successive counter values,
// storing the results in out.
func CompressBlocks(out *[MaxSIMD * BlockSize]byte, n Node) {
	var outs [MaxSIMD][64]byte
	compressBlocksGeneric(&outs, n)
	// Lay the per-counter 64-byte outputs back to back in out.
	for i := range outs {
		copy(out[i*64:], outs[i][:])
	}
}

// mergeSubtrees combines the first numCVs chaining values in cvs into a single
// parent node. In this build it forwards to the portable implementation.
func mergeSubtrees(cvs *[MaxSIMD][8]uint32, numCVs uint64, key *[8]uint32, flags uint32) Node {
	return mergeSubtreesGeneric(cvs, numCVs, key, flags)
}

// BytesToWords converts an array of 64 bytes to an array of 16 words. Each
// word is read as a little-endian uint32 from four consecutive bytes.
func BytesToWords(bytes [64]byte) (words [16]uint32) {
	for i := range words {
		words[i] = binary.LittleEndian.Uint32(bytes[4*i:])
	}
	return
}

// WordsToBytes converts an array of 16 words to an array of 64 bytes.
func WordsToBytes(words [16]uint32) (block [64]byte) {
	// Word i is written as four little-endian bytes starting at offset 4*i.
	for i, w := range words {
		binary.LittleEndian.PutUint32(block[4*i:], w)
	}
	return
}
blake3-1.3.0/guts/cpu.go000066400000000000000000000003011461472055300147550ustar00rootroot00000000000000
//go:build !darwin
// +build !darwin

package guts

import "github.com/klauspost/cpuid/v2"

// CPU feature flags for every platform except darwin, taken directly from
// cpuid.
var (
	haveAVX2   = cpuid.CPU.Supports(cpuid.AVX2)
	haveAVX512 = cpuid.CPU.Supports(cpuid.AVX512F)
)
blake3-1.3.0/guts/cpu_darwin.go000066400000000000000000000006151461472055300163310ustar00rootroot00000000000000
package guts

import (
	"syscall"

	"github.com/klauspost/cpuid/v2"
)

// CPU feature flags on darwin; populated by init below.
var (
	haveAVX2   bool
	haveAVX512 bool
)

// init detects AVX2 and AVX-512F support via cpuid and, when cpuid does not
// report AVX-512F, consults the hw.optional.avx512f sysctl as a second source.
func init() {
	haveAVX2 = cpuid.CPU.Supports(cpuid.AVX2)
	haveAVX512 = cpuid.CPU.Supports(cpuid.AVX512F)
	if !haveAVX512 {
		// On some Macs, AVX512 detection is buggy, so fall back to sysctl.
		// The Sysctl error is discarded: the length check below treats an
		// empty result as "not supported", so haveAVX512 simply stays false.
		b, _ := syscall.Sysctl("hw.optional.avx512f")
		// Supported only if the first byte of the raw sysctl value is 1.
		haveAVX512 = len(b) > 0 && b[0] == 1
	}
}
blake3-1.3.0/guts/node.go000066400000000000000000000021501461472055300151170ustar00rootroot00000000000000
// Package guts provides a low-level interface to the BLAKE3 cryptographic hash
// function.
package guts

// Various constants.
const (
	// Flag bits stored in Node.Flags. Each is a distinct power of two
	// (1 << iota), so they can be combined with bitwise OR.
	FlagChunkStart = 1 << iota
	FlagChunkEnd
	FlagParent
	FlagRoot
	FlagKeyedHash
	FlagDeriveKeyContext
	FlagDeriveKeyMaterial

	BlockSize = 64   // bytes per block
	ChunkSize = 1024 // bytes per chunk

	MaxSIMD = 16 // AVX-512 vectors can store 16 words
)

// IV is the BLAKE3 initialization vector.
var IV = [8]uint32{
	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
	0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
}

// A Node represents a chunk or parent in the BLAKE3 Merkle tree.
type Node struct {
	CV       [8]uint32  // chaining value from previous node
	Block    [16]uint32 // message block as 16 words (see BytesToWords)
	Counter  uint64     // counter fed to the compression function
	BlockLen uint32     // number of data bytes in Block, at most BlockSize
	Flags    uint32     // bitwise OR of the Flag* constants
}

// ParentNode returns a Node that incorporates the chaining values of two child
// nodes.
func ParentNode(left, right [8]uint32, key *[8]uint32, flags uint32) Node { n := Node{ CV: *key, Counter: 0, // counter is reset for parents BlockLen: BlockSize, // block is full Flags: flags | FlagParent, } copy(n.Block[:8], left[:]) copy(n.Block[8:], right[:]) return n } blake3-1.3.0/testdata/000077500000000000000000000000001461472055300144745ustar00rootroot00000000000000blake3-1.3.0/testdata/bao-golden.bao000066400000000000000000000505551461472055300172000ustar00rootroot00000000000000L)3\ w`8L87:t!;'6CXHh0r3hy u8)!@gr7x5o֊60~4X@.dj)Ң7"݀Kf z*n='c{!74&L@!5N}uíMj8u~z"\|[i-&5dJgM:[4x lZ?Y[w˝H0 …- "5ҥz1=-׽΃Pco[fpIkZi[S_)txa6a906a4a24bbd5acb2d123a37b28f9e9a81bbaae360d58f85e5fc9d75f7c370a0cc09b6522d9c8d822f2f28f485", "keyed_hash": "c951ecdf03288d0fcc96ee3413563d8a6d3589547f2c2fb36d9786470f1b9d6e890316d2e6d8b8c25b0a5b2180f94fb1a158ef508c3cde45e2966bd796a696d3e13efd86259d756387d9becf5c8bf1ce2192b87025152907b6d8cc33d17826d8b7b9bc97e38c3c85108ef09f013e01c229c20a83d9e8efac5b37470da28575fd755a10", "derive_key": "74a16c1c3d44368a86e1ca6df64be6a2f64cce8f09220787450722d85725dea59c413264404661e9e4d955409dfe4ad3aa487871bcd454ed12abfe2c2b1eb7757588cf6cb18d2eccad49e018c0d0fec323bec82bf1644c6325717d13ea712e6840d3e6e730d35553f59eff5377a9c350bcc1556694b924b858f329c44ee64b884ef00d" }, { "input_len": 1024, "hash": "42214739f095a406f3fc83deb889744ac00df831c10daa55189b5d121c855af71cf8107265ecdaf8505b95d8fcec83a98a6a96ea5109d2c179c47a387ffbb404756f6eeae7883b446b70ebb144527c2075ab8ab204c0086bb22b7c93d465efc57f8d917f0b385c6df265e77003b85102967486ed57db5c5ca170ba441427ed9afa684e", "keyed_hash": "75c46f6f3d9eb4f55ecaaee480db732e6c2105546f1e675003687c31719c7ba4a78bc838c72852d4f49c864acb7adafe2478e824afe51c8919d06168414c265f298a8094b1ad813a9b8614acabac321f24ce61c5a5346eb519520d38ecc43e89b5000236df0597243e4d2493fd626730e2ba17ac4d8824d09d1a4a8f57b8227778e2de", "derive_key": 
"7356cd7720d5b66b6d0697eb3177d9f8d73a4a5c5e968896eb6a6896843027066c23b601d3ddfb391e90d5c8eccdef4ae2a264bce9e612ba15e2bc9d654af1481b2e75dbabe615974f1070bba84d56853265a34330b4766f8e75edd1f4a1650476c10802f22b64bd3919d246ba20a17558bc51c199efdec67e80a227251808d8ce5bad" }, { "input_len": 1025, "hash": "d00278ae47eb27b34faecf67b4fe263f82d5412916c1ffd97c8cb7fb814b8444f4c4a22b4b399155358a994e52bf255de60035742ec71bd08ac275a1b51cc6bfe332b0ef84b409108cda080e6269ed4b3e2c3f7d722aa4cdc98d16deb554e5627be8f955c98e1d5f9565a9194cad0c4285f93700062d9595adb992ae68ff12800ab67a", "keyed_hash": "357dc55de0c7e382c900fd6e320acc04146be01db6a8ce7210b7189bd664ea69362396b77fdc0d2634a552970843722066c3c15902ae5097e00ff53f1e116f1cd5352720113a837ab2452cafbdF/o7fyEJr  dފז+]e!vܝna)➭iJY-}B6ܻ 0 /@720FcDB$qe4d54085d9cf5d21ca613071551b25d52e69d6c81123872b6f19cd3bc1333edf0c52b94de23ba772cf82636cff4542540a7738d5b930", "derive_key": "effaa245f065fbf82ac186839a249707c3bddf6d3fdda22d1b95a3c970379bcb5d31013a167509e9066273ab6e2123bc835b408b067d88f96addb550d96b6852dad38e320b9d940f86db74d398c770f462118b35d2724efa13da97194491d96dd37c3c09cbef665953f2ee85ec83d88b88d11547a6f911c8217cca46defa2751e7f3ad" }, { "input_len": 2048, "hash": "e776b6028c7cd22a4d0ba182a8bf62205d2ef576467e838ed6f2529b85fba24a9a60bf80001410ec9eea6698cd537939fad4749edd484cb541aced55cd9bf54764d063f23f6f1e32e12958ba5cfeb1bf618ad094266d4fc3c968c2088f677454c288c67ba0dba337b9d91c7e1ba586dc9a5bc2d5e90c14f53a8863ac75655461cea8f9", "keyed_hash": "879cf1fa2ea0e79126cb1063617a05b6ad9d0b696d0d757cf053439f60a99dd10173b961cd574288194b23ece278c330fbb8585485e74967f31352a8183aa782b2b22f26cdcadb61eed1a5bc144b8198fbb0c13abbf8e3192c145d0a5c21633b0ef86054f42809df823389ee40811a5910dcbd1018af31c3b43aa55201ed4edaac74fe", "derive_key": 
"7b2945cb4fef70885cc5d78a87bf6f6207dd901ff239201351ffac04e1088a23e2c11a1ebffcea4d80447867b61badb1383d842d4e79645d48dd82ccba290769caa7af8eaa1bd78a2a5e6e94fbdab78d9c7b74e894879f6a515257ccf6f95056f4e25390f24f6b35ffbb74b766202569b1d797f2d4bd9d17524c720107f985f4ddc583" }, { "input_len": 2049, "hash": "5f4d72f40d7a5f82b15ca2b2e44b1de3c2ef86c426c95c1af0b687952256303096de31d71d74103403822a2e0bc1eb193e7aecc9643a76b7bbc0c9f9c52e8783aae98764ca468962b5c2ec92f0c74eb5448d519713e09413719431c802f948dd5d90425a4ecdadece9eb178d80f26efccae630734dff63340285adec2aed3b51073ad3", "keyed_hash": "9f29700902f7c86e514ddc4df1e3049f258b2472b6dd5267f61bf13983b78dd5f9a88abfefdfa1e00b418971f2b39c64ca621e8eb37fceac57fd0c8fc8e117d43b81447be22d5d8186f8f5919ba6bcc6846bd7d50726c06d245672c2ad4f61702c646499ee1173daa061ffe15bf45a631e2946d616a4c345822f1151284712f76b2b0e", "derive_key": "2ea477c5515cc3dd606512ee72bb3e0e758cfae7232826f35fb98ca1bcbdf27316d8e9e79081a80b046b60f6a263616f33ca464bd78d79fa18200d06c7fc9bffd808cc4755!_f~SSFSICM_)Bhhf1lb277a7d5e09da0f29ed150f6537ea9bed946227ff184cc66a72a5f8c1e4bd8b04e81cf40fe6dc4427ad5678311a61f4ffc39d195589bdbc670f63ae70f4b6" }, { "input_len": 3072, "hash": "b98cb0ff3623be03326b373de6b9095218513e64f1ee2edd2525c7ad1e5cffd29a3f6b0b978d6608335c09dc94ccf682f9951cdfc501bfe47b9c9189a6fc7b404d120258506341a6d802857322fbd20d3e5dae05b95c88793fa83db1cb08e7d8008d1599b6209d78336e24839724c191b2a52a80448306e0daa84a3fdb566661a37e11", "keyed_hash": "044a0e7b172a312dc02a4c9a818c036ffa2776368d7f528268d2e6b5df19177022f302d0529e4174cc507c463671217975e81dab02b8fdeb0d7ccc7568dd22574c783a76be215441b32e91b9a904be8ea81f7a0afd14bad8ee7c8efc305ace5d3dd61b996febe8da4f56ca0919359a7533216e2999fc87ff7d8f176fbecb3d6f34278b", "derive_key": "050df97f8c2ead654d9bb3ab8c9178edcd902a32f8495949feadcc1e0480c46b3604131bbd6e3ba573b6dd682fa0a63e5b165d39fc43a625d00207607a2bfeb65ff1d29292152e26b298868e3b87be95d6458f6f2ce6118437b632415abe6ad522874bcd79e4030a5e7bad2efa90a7a7c67e93f0a18fb28369d0a9329ab5c24134ccb0" 
}, { "input_len": 3073, "hash": "7124b49501012f81cc7f11ca069ec9226cecb8a2c850cfe644e327d22d3e1cd39a27ae3b79d68d89da9bf25bc27139ae65a324918a5f9b7828181e52cf373c84f35b639b7fccbb985b6f2fa56aea0c18f531203497b8bbd3a07ceb5926f1cab74d14bd66486d9a91eba99059a98bd1cd25876b2af5a76c3e9eed554ed72ea952b603bf", "keyed_hash": "68dede9bef00ba89e43f31a6825f4cf433389fedae75c04ee9f0cf16a427c95a96d6da3fe985054d3478865be9a092250839a697bbda74e279e8a9e69f0025e4cfddd6cfb434b1cd9543aaf97c635d1b451a4386041e4bb100f5e45407cbbc24fa53ea2de3536ccb329e4eb9466ec37093a42cf62b82903c696a93a50b702c80f3c3c5", "derive_key": "72613c9ec9ff7e40f8f5c173784c532ad852e827dba2bf85b2ab4b76f7079081576288e552647a9d86481c2cae75c2dd4e7c5195fb9ada1ef50e9c5098c249d743929191441301c69e1f48505a4305ec1778450ee48b8e69dc23a25960fe33070ea549119599760a8a2d28aeca06b8c5e9ba58bc19e11fe57b6ee98aa44b2a8e6b14a5" }, { "input_len": 4096, "hash": "015094013f57a5277b59d8475c0501042c0b642e531b0a1c8f58d2163229e9690289e9409ddb1b99768eafe16236Hw* O/O_25գ I}9:}VjWOݬF;2M3680]]4O4Bs,b.Z)/Vz]ЄSPHS;/? 
uJ2^i/8N6zPM da896faf7e1114bebeadc1be30829b6f8af707d85c298f4f0ff4d9438aef948335612ae921e76d411c3a9111df62d27eaf871959ae0062b5492a0feb98ef3ed4af277f5395172dbe5c311918ea0074ce0036454f620", "keyed_hash": "befc660aea2f1718884cd8deb9902811d332f4fc4a38cf7c7300d597a081bfc0bbb64a36edb564e01e4b4aaf3b060092a6b838bea44afebd2deb8298fa562b7b597c757b9df4c911c3ca462e2ac89e9a787357aaf74c3b56d5c07bc93ce899568a3eb17d9250c20f6c5f6c1e792ec9a2dcb715398d5a6ec6d5c54f586a00403a1af1de", "derive_key": "1e0d7f3db8c414c97c6307cbda6cd27ac3b030949da8e23be1a1a924ad2f25b9d78038f7b198596c6cc4a9ccf93223c08722d684f240ff6569075ed81591fd93f9fff1110b3a75bc67e426012e5588959cc5a4c192173a03c00731cf84544f65a2fb9378989f72e9694a6a394a8a30997c2e67f95a504e631cd2c5f55246024761b245" }, { "input_len": 4097, "hash": "9b4052b38f1c5fc8b1f9ff7ac7b27cd242487b3d890d15c96a1c25b8aa0fb99505f91b0b5600a11251652eacfa9497b31cd3c409ce2e45cfe6c0a016967316c426bd26f619eab5d70af9a418b845c608840390f361630bd497b1ab44019316357c61dbe091ce72fc16dc340ac3d6e009e050b3adac4b5b2c92e722cffdc46501531956", "keyed_hash": "00df940cd36bb9fa7cbbc3556744e0dbc8191401afe70520ba292ee3ca80abbc606db4976cfdd266ae0abf667d9481831ff12e0caa268e7d3e57260c0824115a54ce595ccc897786d9dcbf495599cfd90157186a46ec800a6763f1c59e36197e9939e900809f7077c102f888caaf864b253bc41eea812656d46742e4ea42769f89b83f", "derive_key": "aca51029626b55fda7117b42a7c211f8c6e9ba4fe5b7a8ca922f34299500ead8a897f66a400fed9198fd61dd2d58d382458e64e100128075fc54b860934e8de2e84170734b06e1d212a117100820dbc48292d148afa50567b8b84b1ec336ae10d40c8c975a624996e12de31abbe135d9d159375739c333798a80c64ae895e51e22f3ad" }, { "input_len": 5120, "hash": "9cadc15fed8b5d854562b26a9536d9707cadeda9b143978f319ab34230535833acc61c8fdc114a2010ce8038c853e121e1544985133fccdd0a2d507e8e615e611e9a0ba4f47915f49e53d721816a9198e8b30f12d20ec3689989175f1bf7a300eee0d9321fad8da232ece6efb8e9fd81b42ad161f6b9550a069e66b11b40487a5f5059", "keyed_hash": 
"2c493e48e9b9bf31e0553a22b23503c0a3388f035cece68eb438d22fa1943e209b4dc9209cdyLcEI߃XZYTs/u6B柸2.b?\Ǒ*2h{980ce7c1f7c9a744658e7e288465717ae6e56d5463d4f80cdb2ef56495f6a4f5487f69749af0c34c2cdfa857f3056bf8d807336a14d7b89bf62bef2fb54f9af6a546f818dc1e98b9e07f8a5834da50fa28fb5874af91bf06020d1bf0120e", "derive_key": "7a7acac8a02adcf3038d74cdd1d34527de8a0fcc0ee3399d1262397ce5817f6055d0cefd84d9d57fe792d65a278fd20384ac6c30fdb340092f1a74a92ace99c482b28f0fc0ef3b923e56ade20c6dba47e49227166251337d80a037e987ad3a7f728b5ab6dfafd6e2ab1bd583a95d9c895ba9c2422c24ea0f62961f0dca45cad47bfa0d" }, { "input_len": 5121, "hash": "628bd2cb2004694adaab7bbd778a25df25c47b9d4155a55f8fbd79f2fe154cff96adaab0613a6146cdaabe498c3a94e529d3fc1da2bd08edf54ed64d40dcd6777647eac51d8277d70219a9694334a68bc8f0f23e20b0ff70ada6f844542dfa32cd4204ca1846ef76d811cdb296f65e260227f477aa7aa008bac878f72257484f2b6c95", "keyed_hash": "6ccf1c34753e7a044db80798ecd0782a8f76f33563accaddbfbb2e0ea4b2d0240d07e63f13667a8d1490e5e04f13eb617aea16a8c8a5aaed1ef6fbde1b0515e3c81050b361af6ead126032998290b563e3caddeaebfab592e155f2e161fb7cba939092133f23f9e65245e58ec23457b78a2e8a125588aad6e07d7f11a85b88d375b72d", "derive_key": "b07f01e518e702f7ccb44a267e9e112d403a7b3f4883a47ffbed4b48339b3c341a0add0ac032ab5aaea1e4e5b004707ec5681ae0fcbe3796974c0b1cf31a194740c14519273eedaabec832e8a784b6e7cfc2c5952677e6c3f2c3914454082d7eb1ce1766ac7d75a4d3001fc89544dd46b5147382240d689bbbaefc359fb6ae30263165" }, { "input_len": 6144, "hash": "3e2e5b74e048f3add6d21faab3f83aa44d3b2278afb83b80b3c35164ebeca2054d742022da6fdda444ebc384b04a54c3ac5839b49da7d39f6d8a9db03deab32aade156c1c0311e9b3435cde0ddba0dce7b26a376cad121294b689193508dd63151603c6ddb866ad16c2ee41585d1633a2cea093bea714f4c5d6b903522045b20395c83", "keyed_hash": 
"3d6b6d21281d0ade5b2b016ae4034c5dec10ca7e475f90f76eac7138e9bc8f1dc35754060091dc5caf3efabe0603c60f45e415bb3407db67e6beb3d11cf8e4f7907561f05dace0c15807f4b5f389c841eb114d81a82c02a00b57206b1d11fa6e803486b048a5ce87105a686dee041207e095323dfe172df73deb8c9532066d88f9da7e", "derive_key": "2a95beae63ddce523762355cf4b9c1d8f131465780a391286a5d01abb56~[1ByCOld̊Q‚:"c[HEOyTDEm*;,n~{bܼY>јuMb)|ghJgkA 7ɺ:ۋ83a1597099e3c6488aab6c48f3c15dbe1942d21dbcdc12115d19a8b8465fb54e9053323a9178e4275647f1a9927f6439e52b7031a0b465c861a3fc531527f7758b2b888cf2f20582e9e2c593709c0a44f9c6e0f8b963994882ea4168827823eef1f64169fef" }, { "input_len": 6145, "hash": "f1323a8631446cc50536a9f705ee5cb619424d46887f3c376c695b70e0f0507f18a2cfdd73c6e39dd75ce7c1c6e3ef238fd54465f053b25d21044ccb2093beb015015532b108313b5829c3621ce324b8e14229091b7c93f32db2e4e63126a377d2a63a3597997d4f1cba59309cb4af240ba70cebff9a23d5e3ff0cdae2cfd54e070022", "keyed_hash": "9ac301e9e39e45e3250a7e3b3df701aa0fb6889fbd80eeecf28dbc6300fbc539f3c184ca2f59780e27a576c1d1fb9772e99fd17881d02ac7dfd39675aca918453283ed8c3169085ef4a466b91c1649cc341dfdee60e32231fc34c9c4e0b9a2ba87ca8f372589c744c15fd6f985eec15e98136f25beeb4b13c4e43dc84abcc79cd4646c", "derive_key": "379bcc61d0051dd489f686c13de00d5b14c505245103dc040d9e4dd1facab8e5114493d029bdbd295aaa744a59e31f35c7f52dba9c3642f773dd0b4262a9980a2aef811697e1305d37ba9d8b6d850ef07fe41108993180cf779aeece363704c76483458603bbeeb693cffbbe5588d1f3535dcad888893e53d977424bb707201569a8d2" }, { "input_len": 7168, "hash": "61da957ec2499a95d6b8023e2b0e604ec7f6b50e80a9678b89d2628e99ada77a5707c321c83361793b9af62a40f43b523df1c8633cecb4cd14d00bdc79c78fca5165b863893f6d38b02ff7236c5a9a8ad2dba87d24c547cab046c29fc5bc1ed142e1de4763613bb162a5a538e6ef05ed05199d751f9eb58d332791b8d73fb74e4fce95", "keyed_hash": 
"b42835e40e9d4a7f42ad8cc04f85a963a76e18198377ed84adddeaecacc6f3fca2f01d5277d69bb681c70fa8d36094f73ec06e452c80d2ff2257ed82e7ba348400989a65ee8daa7094ae0933e3d2210ac6395c4af24f91c2b590ef87d7788d7066ea3eaebca4c08a4f14b9a27644f99084c3543711b64a070b94f2c9d1d8a90d035d52", "derive_key": "11c37a112765370c94a51415d0d651190c288566e295d505defdad895dae223730d5a5175a38841693020669c7638f40b9bc1f9f39cf98bda7a5b54ae24218a800a2116b34665aa95d846d97ea988bfcb53dd9c055d588fa21ba78996776ea6c40bc428b53c62b5f3ccf200f647a5aae8067f0ea1976391fcc72af1945100e2a6dcb88" }, { "input_len": 7169, "hash": "a003fc7a5175Ni'yyW"iG[HNnczCFyyQPK͵4a9b3c7fae0367ab3d782dccf28855a03d435f8cfe74605e781798a8b20534be1ca9eb2ae2df3fae2ea60e48c6fb0b850b1385b5de0fe460dbe9d9f9b0d8db4435da75c601156df9d047f4ede008732eb17adc05d96180f8a73548522840779e6062d643b79478a6e8dbce68927f36ebf676ffa7d72d5f68f050b119c8", "keyed_hash": "ed9b1a922c046fdb3d423ae34e143b05ca1bf28b710432857bf738bcedbfa5113c9e28d72fcbfc020814ce3f5d4fc867f01c8f5b6caf305b3ea8a8ba2da3ab69fabcb438f19ff11f5378ad4484d75c478de425fb8e6ee809b54eec9bdb184315dc856617c09f5340451bf42fd3270a7b0b6566169f242e533777604c118a6358250f54", "derive_key": "554b0a5efea9ef183f2f9b931b7497995d9eb26f5c5c6dad2b97d62fc5ac31d99b20652c016d88ba2a611bbd761668d5eda3e568e940faae24b0d9991c3bd25a65f770b89fdcadabcb3d1a9c1cb63e69721cacf1ae69fefdcef1e3ef41bc5312ccc17222199e47a26552c6adc460cf47a72319cb5039369d0060eaea59d6c65130f1dd" }, { "input_len": 8192, "hash": "aae792484c8efe4f19e2ca7d371d8c467ffb10748d8a5a1ae579948f718a2a635fe51a27db045a567c1ad51be5aa34c01c6651c4d9b5b5ac5d0fd58cf18dd61a47778566b797a8c67df7b1d60b97b19288d2d877bb2df417ace009dcb0241ca1257d62712b6a4043b4ff33f690d849da91ea3bf711ed583cb7b7a7da2839ba71309bbf", "keyed_hash": 
"dc9637c8845a770b4cbf76b8daec0eebf7dc2eac11498517f08d44c8fc00d58a4834464159dcbc12a0ba0c6d6eb41bac0ed6585cabfe0aca36a375e6c5480c22afdc40785c170f5a6b8a1107dbee282318d00d915ac9ed1143ad40765ec120042ee121cd2baa36250c618adaf9e27260fda2f94dea8fb6f08c04f8f10c78292aa46102", "derive_key": "ad01d7ae4ad059b0d33baa3c01319dcf8088094d0359e5fd45d6aeaa8b2d0c3d4c9e58958553513b67f84f8eac653aeeb02ae1d5672dcecf91cd9985a0e67f4501910ecba25555395427ccc7241d70dc21c190e2aadee875e5aae6bf1912837e53411dabf7a56cbf8e4fb780432b0d7fe6cec45024a0788cf5874616407757e9e6bef7" }, { "input_len": 8193, "hash": "bab6c09cb8ce8cf459261398d2e7aef35700bf488116ceb94a36d0f5f1b7bc3bb2282aa69be089359ea1154b9a9286c4a56af4de975a9aa4a5c497654914d279bea60bb6d2cf7225a2fa0ff5ef56bbe4b149f3ed15860f78b4e2ad04e158e375c1e0c0b551cd7dfc82f1b155c11b6b3ed51ec9edb30d133653bb5709d1dbd55f4e1ff6", "keyed_hash@MS9y3U1Xg&{XZŪȚb( C6%ص7FeK.o>V@3Cxٰxޱ>5{\!-Gs1%:BUȏaS<;d": "954a2a75420c8d6547e3ba5b98d963e6fa6491addc8c023189cc519821b4a1f5f03228648fd983aef045c2fa8290934b0866b615f585149587dda2299039965328835a2b18f1d63b7e300fc76ff260b571839fe44876a4eae66cbac8c67694411ed7e09df51068a22c6e67d6d3dd2cca8ff12e3275384006c80f4db68023f24eebba57", "derive_key": "af1e0346e389b17c23200270a64aa4e1ead98c61695d917de7d5b00491c9b0f12f20a01d6d622edf3de026a4db4e4526225debb93c1237934d71c7340bb5916158cbdafe9ac3225476b6ab57a12357db3abbad7a26c6e66290e44034fb08a20a8d0ec264f309994d2810c49cfba6989d7abb095897459f5425adb48aba07c5fb3c83c0" }, { "input_len": 16384, "hash": "f875d6646de28985646f34ee13be9a576fd515f76b5b0a26bb324735041ddde49d764c270176e53e97bdffa58d549073f2c660be0e81293767ed4e4929f9ad34bbb39a529334c57c4a381ffd2a6d4bfdbf1482651b172aa883cc13408fa67758a3e47503f93f87720a3177325f7823251b85275f64636a8f1d599c2e49722f42e93893", "keyed_hash": 
"9e9fc4eb7cf081ea7c47d1807790ed211bfec56aa25bb7037784c13c4b707b0df9e601b101e4cf63a404dfe50f2e1865bb12edc8fca166579ce0c70dba5a5c0fc960ad6f3772183416a00bd29d4c6e651ea7620bb100c9449858bf14e1ddc9ecd35725581ca5b9160de04060045993d972571c3e8f71e9d0496bfa744656861b169d65", "derive_key": "160e18b5878cd0df1c3af85eb25a0db5344d43a6fbd7a8ef4ed98d0714c3f7e160dc0b1f09caa35f2f417b9ef309dfe5ebd67f4c9507995a531374d099cf8ae317542e885ec6f589378864d3ea98716b3bbb65ef4ab5e0ab5bb298a501f19a41ec19af84a5e6b428ecd813b1a47ed91c9657c3fba11c406bc316768b58f6802c9e9b57" }, { "input_len": 31744, "hash": "62b6960e1a44bcc1eb1a611a8d6235b6b4b78f32e7abc4fb4c6cdcce94895c47860cc51f2b0c28a7b77304bd55fe73af663c02d3f52ea053ba43431ca5bab7bfea2f5e9d7121770d88f70ae9649ea713087d1914f7f312147e247f87eb2d4ffef0ac978bf7b6579d57d533355aa20b8b77b13fd09748728a5cc327a8ec470f4013226f", "keyed_hash": "efa53b389ab67c593dba624d898d0f7353ab99e4ac9d42302ee64cbf9939a4193a7258db2d9cd32a7a3ecfce46144114b15c2fcb68a618a976bd74515d47be08b628be420b5e830fade7c080e351a076fbc38641ad80c736c8a18fe3c66ce12f95c61c2462a9770d60d0f77115bbcd3782b593016a4e728d4c06cee4505cb0c08a42ec",Hb"?4Z\l= $&WOѭ8Ul}VBjE1L.T&q {{Sg "derive_key": "39772aef80e0ebe60596361e45b061e8f417429d529171b6764468c22928e28e9759adeb797a3fbf771b1bcea30150a020e317982bf0d6e7d14dd9f064bc11025c25f31e81bd78a921db0174f03dd481d30e93fd8e90f8b2fee209f849f2d2a52f31719a490fb0ba7aea1e09814ee912eba111a9fde9d5c274185f7bae8ba85d300a2b" }, { "input_len": 100000, "hash": "d93c23eedaf165a7e0be908ba86f1a7a520d568d2d13cde787c8580c5c72cc54902b765d0e69ff7f278ef2f8bb839b673f0db20afa0566c78965ad819674822fd11a507251555fc6daec7437074bc7b7307dfe122411b3676a932b5b0360d5ad495f8e7431d3d025fac5b4e955ce893a3504f2569f838eea47cf1bb21c4ae659db522f", "keyed_hash": 
"74c836d008247adebbc032d1bced2e71d19050b5c39fa03c43d4160ad8d170732f3b73e374a4500825c13d2c8c9384ce12c033adc49245ce42f50d5b48237397b8447bd414b0693bef98518db8a3494e6e8e3abc931f92f472d938f07eac97d1cc69b375426bce26c5e829b5b41cacbb5543544977749d503fa78309e7a158640e579c", "derive_key": "039c0c0d76eacefea9c8d042698bd012d3cef4091ed5c5a7e32a30e4d51718930a99481bb11214d9e9e79e58d11875a789447731a887aa77499843148d35b1752c6314af6d36559341bd6895c5ee0a452c99cb47a9b22dfe36042932fc9a423d245b91b6246c85e4b0d415cbece3e0545d6e242853da7f3dd1f9b0f146ec72706b8c28" } ] } blake3-1.3.0/testdata/bao-golden.obao000066400000000000000000000023101461472055300173410ustar00rootroot00000000000000L)3\ w`8L87:t!;'6CXHh0r3hy u8)!@gr7x5o֊60~4X@.dj)Ң7"݀Kf z*n='c{!74&L@!5N}uíMj8u~z"\|[i-&5dJgM:[4x lZ?Y[w˝H0 …- "5ҥz1=-׽΃Pco[fpIkZi[S_)txF/o7fyEJr  dފז+]e!vܝna)➭iJY-}B6ܻ 0 /@720FcDB$q!_f~SSFSICM_)Bhhf1lb6Hw* O/O_25գ I}9:}VjWOݬF;2M3680]]4O4Bs,b.Z)/Vz]ЄSPHS;/? uJ2^i/8N6zPM yLcEI߃XZYTs/u6B柸2.b?\Ǒ*2h{9~[1ByCOld̊Q‚:"c[HEOyTDEm*;,n~{bܼY>јuMb)|ghJgkA 7ɺ:ۋNi'yyW"iG[HNnczCFyyQPK͵@MS9y3U1Xg&{XZŪȚb( C6%ص7FeK.o>V@3Cxٰxޱ>5{\!-Gs1%:BUȏaS<;dHb"?4Z\l= $&WOѭ8Ul}VBjE1L.T&q {{Sgblake3-1.3.0/testdata/vectors.json000066400000000000000000000462451461472055300170670ustar00rootroot00000000000000{ "key": "whats the Elvish word for friend", "cases": [ { "input_len": 0, "hash": "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262e00f03e7b69af26b7faaf09fcd333050338ddfe085b8cc869ca98b206c08243a26f5487789e8f660afe6c99ef9e0c52b92e7393024a80459cf91f476f9ffdbda7001c22e159b402631f277ca96f2defdf1078282314e763699a31c5363165421cce14d", "keyed_hash": "92b2b75604ed3c761f9d6f62392c8a9227ad0ea3f09573e783f1498a4ed60d26b18171a2f22a4b94822c701f107153dba24918c4bae4d2945c20ece13387627d3b73cbf97b797d5e59948c7ef788f54372df45e45e4293c7dc18c1d41144a9758be58960856be1eabbe22c2653190de560ca3b2ac4aa692a9210694254c371e851bc8f", "derive_key": 
"2cc39783c223154fea8dfb7c1b1660f2ac2dcbd1c1de8277b0b0dd39b7e50d7d905630c8be290dfcf3e6842f13bddd573c098c3f17361f1f206b8cad9d088aa4a3f746752c6b0ce6a83b0da81d59649257cdf8eb3e9f7d4998e41021fac119deefb896224ac99f860011f73609e6e0e4540f93b273e56547dfd3aa1a035ba6689d89a0" }, { "input_len": 1, "hash": "2d3adedff11b61f14c886e35afa036736dcd87a74d27b5c1510225d0f592e213c3a6cb8bf623e20cdb535f8d1a5ffb86342d9c0b64aca3bce1d31f60adfa137b358ad4d79f97b47c3d5e79f179df87a3b9776ef8325f8329886ba42f07fb138bb502f4081cbcec3195c5871e6c23e2cc97d3c69a613eba131e5f1351f3f1da786545e5", "keyed_hash": "6d7878dfff2f485635d39013278ae14f1454b8c0a3a2d34bc1ab38228a80c95b6568c0490609413006fbd428eb3fd14e7756d90f73a4725fad147f7bf70fd61c4e0cf7074885e92b0e3f125978b4154986d4fb202a3f331a3fb6cf349a3a70e49990f98fe4289761c8602c4e6ab1138d31d3b62218078b2f3ba9a88e1d08d0dd4cea11", "derive_key": "b3e2e340a117a499c6cf2398a19ee0d29cca2bb7404c73063382693bf66cb06c5827b91bf889b6b97c5477f535361caefca0b5d8c4746441c57617111933158950670f9aa8a05d791daae10ac683cbef8faf897c84e6114a59d2173c3f417023a35d6983f2c7dfa57e7fc559ad751dbfb9ffab39c2ef8c4aafebc9ae973a64f0c76551" }, { "input_len": 1023, "hash": "10108970eeda3eb932baac1428c7a2163b0e924c9a9e25b35bba72b28f70bd11a182d27a591b05592b15607500e1e8dd56bc6c7fc063715b7a1d737df5bad3339c56778957d870eb9717b57ea3d9fb68d1b55127bba6a906a4a24bbd5acb2d123a37b28f9e9a81bbaae360d58f85e5fc9d75f7c370a0cc09b6522d9c8d822f2f28f485", "keyed_hash": "c951ecdf03288d0fcc96ee3413563d8a6d3589547f2c2fb36d9786470f1b9d6e890316d2e6d8b8c25b0a5b2180f94fb1a158ef508c3cde45e2966bd796a696d3e13efd86259d756387d9becf5c8bf1ce2192b87025152907b6d8cc33d17826d8b7b9bc97e38c3c85108ef09f013e01c229c20a83d9e8efac5b37470da28575fd755a10", "derive_key": "74a16c1c3d44368a86e1ca6df64be6a2f64cce8f09220787450722d85725dea59c413264404661e9e4d955409dfe4ad3aa487871bcd454ed12abfe2c2b1eb7757588cf6cb18d2eccad49e018c0d0fec323bec82bf1644c6325717d13ea712e6840d3e6e730d35553f59eff5377a9c350bcc1556694b924b858f329c44ee64b884ef00d" }, { "input_len": 1024, 
"hash": "42214739f095a406f3fc83deb889744ac00df831c10daa55189b5d121c855af71cf8107265ecdaf8505b95d8fcec83a98a6a96ea5109d2c179c47a387ffbb404756f6eeae7883b446b70ebb144527c2075ab8ab204c0086bb22b7c93d465efc57f8d917f0b385c6df265e77003b85102967486ed57db5c5ca170ba441427ed9afa684e", "keyed_hash": "75c46f6f3d9eb4f55ecaaee480db732e6c2105546f1e675003687c31719c7ba4a78bc838c72852d4f49c864acb7adafe2478e824afe51c8919d06168414c265f298a8094b1ad813a9b8614acabac321f24ce61c5a5346eb519520d38ecc43e89b5000236df0597243e4d2493fd626730e2ba17ac4d8824d09d1a4a8f57b8227778e2de", "derive_key": "7356cd7720d5b66b6d0697eb3177d9f8d73a4a5c5e968896eb6a6896843027066c23b601d3ddfb391e90d5c8eccdef4ae2a264bce9e612ba15e2bc9d654af1481b2e75dbabe615974f1070bba84d56853265a34330b4766f8e75edd1f4a1650476c10802f22b64bd3919d246ba20a17558bc51c199efdec67e80a227251808d8ce5bad" }, { "input_len": 1025, "hash": "d00278ae47eb27b34faecf67b4fe263f82d5412916c1ffd97c8cb7fb814b8444f4c4a22b4b399155358a994e52bf255de60035742ec71bd08ac275a1b51cc6bfe332b0ef84b409108cda080e6269ed4b3e2c3f7d722aa4cdc98d16deb554e5627be8f955c98e1d5f9565a9194cad0c4285f93700062d9595adb992ae68ff12800ab67a", "keyed_hash": "357dc55de0c7e382c900fd6e320acc04146be01db6a8ce7210b7189bd664ea69362396b77fdc0d2634a552970843722066c3c15902ae5097e00ff53f1e116f1cd5352720113a837ab2452cafbde4d54085d9cf5d21ca613071551b25d52e69d6c81123872b6f19cd3bc1333edf0c52b94de23ba772cf82636cff4542540a7738d5b930", "derive_key": "effaa245f065fbf82ac186839a249707c3bddf6d3fdda22d1b95a3c970379bcb5d31013a167509e9066273ab6e2123bc835b408b067d88f96addb550d96b6852dad38e320b9d940f86db74d398c770f462118b35d2724efa13da97194491d96dd37c3c09cbef665953f2ee85ec83d88b88d11547a6f911c8217cca46defa2751e7f3ad" }, { "input_len": 2048, "hash": "e776b6028c7cd22a4d0ba182a8bf62205d2ef576467e838ed6f2529b85fba24a9a60bf80001410ec9eea6698cd537939fad4749edd484cb541aced55cd9bf54764d063f23f6f1e32e12958ba5cfeb1bf618ad094266d4fc3c968c2088f677454c288c67ba0dba337b9d91c7e1ba586dc9a5bc2d5e90c14f53a8863ac75655461cea8f9", 
"keyed_hash": "879cf1fa2ea0e79126cb1063617a05b6ad9d0b696d0d757cf053439f60a99dd10173b961cd574288194b23ece278c330fbb8585485e74967f31352a8183aa782b2b22f26cdcadb61eed1a5bc144b8198fbb0c13abbf8e3192c145d0a5c21633b0ef86054f42809df823389ee40811a5910dcbd1018af31c3b43aa55201ed4edaac74fe", "derive_key": "7b2945cb4fef70885cc5d78a87bf6f6207dd901ff239201351ffac04e1088a23e2c11a1ebffcea4d80447867b61badb1383d842d4e79645d48dd82ccba290769caa7af8eaa1bd78a2a5e6e94fbdab78d9c7b74e894879f6a515257ccf6f95056f4e25390f24f6b35ffbb74b766202569b1d797f2d4bd9d17524c720107f985f4ddc583" }, { "input_len": 2049, "hash": "5f4d72f40d7a5f82b15ca2b2e44b1de3c2ef86c426c95c1af0b687952256303096de31d71d74103403822a2e0bc1eb193e7aecc9643a76b7bbc0c9f9c52e8783aae98764ca468962b5c2ec92f0c74eb5448d519713e09413719431c802f948dd5d90425a4ecdadece9eb178d80f26efccae630734dff63340285adec2aed3b51073ad3", "keyed_hash": "9f29700902f7c86e514ddc4df1e3049f258b2472b6dd5267f61bf13983b78dd5f9a88abfefdfa1e00b418971f2b39c64ca621e8eb37fceac57fd0c8fc8e117d43b81447be22d5d8186f8f5919ba6bcc6846bd7d50726c06d245672c2ad4f61702c646499ee1173daa061ffe15bf45a631e2946d616a4c345822f1151284712f76b2b0e", "derive_key": "2ea477c5515cc3dd606512ee72bb3e0e758cfae7232826f35fb98ca1bcbdf27316d8e9e79081a80b046b60f6a263616f33ca464bd78d79fa18200d06c7fc9bffd808cc4755277a7d5e09da0f29ed150f6537ea9bed946227ff184cc66a72a5f8c1e4bd8b04e81cf40fe6dc4427ad5678311a61f4ffc39d195589bdbc670f63ae70f4b6" }, { "input_len": 3072, "hash": "b98cb0ff3623be03326b373de6b9095218513e64f1ee2edd2525c7ad1e5cffd29a3f6b0b978d6608335c09dc94ccf682f9951cdfc501bfe47b9c9189a6fc7b404d120258506341a6d802857322fbd20d3e5dae05b95c88793fa83db1cb08e7d8008d1599b6209d78336e24839724c191b2a52a80448306e0daa84a3fdb566661a37e11", "keyed_hash": "044a0e7b172a312dc02a4c9a818c036ffa2776368d7f528268d2e6b5df19177022f302d0529e4174cc507c463671217975e81dab02b8fdeb0d7ccc7568dd22574c783a76be215441b32e91b9a904be8ea81f7a0afd14bad8ee7c8efc305ace5d3dd61b996febe8da4f56ca0919359a7533216e2999fc87ff7d8f176fbecb3d6f34278b", 
"derive_key": "050df97f8c2ead654d9bb3ab8c9178edcd902a32f8495949feadcc1e0480c46b3604131bbd6e3ba573b6dd682fa0a63e5b165d39fc43a625d00207607a2bfeb65ff1d29292152e26b298868e3b87be95d6458f6f2ce6118437b632415abe6ad522874bcd79e4030a5e7bad2efa90a7a7c67e93f0a18fb28369d0a9329ab5c24134ccb0" }, { "input_len": 3073, "hash": "7124b49501012f81cc7f11ca069ec9226cecb8a2c850cfe644e327d22d3e1cd39a27ae3b79d68d89da9bf25bc27139ae65a324918a5f9b7828181e52cf373c84f35b639b7fccbb985b6f2fa56aea0c18f531203497b8bbd3a07ceb5926f1cab74d14bd66486d9a91eba99059a98bd1cd25876b2af5a76c3e9eed554ed72ea952b603bf", "keyed_hash": "68dede9bef00ba89e43f31a6825f4cf433389fedae75c04ee9f0cf16a427c95a96d6da3fe985054d3478865be9a092250839a697bbda74e279e8a9e69f0025e4cfddd6cfb434b1cd9543aaf97c635d1b451a4386041e4bb100f5e45407cbbc24fa53ea2de3536ccb329e4eb9466ec37093a42cf62b82903c696a93a50b702c80f3c3c5", "derive_key": "72613c9ec9ff7e40f8f5c173784c532ad852e827dba2bf85b2ab4b76f7079081576288e552647a9d86481c2cae75c2dd4e7c5195fb9ada1ef50e9c5098c249d743929191441301c69e1f48505a4305ec1778450ee48b8e69dc23a25960fe33070ea549119599760a8a2d28aeca06b8c5e9ba58bc19e11fe57b6ee98aa44b2a8e6b14a5" }, { "input_len": 4096, "hash": "015094013f57a5277b59d8475c0501042c0b642e531b0a1c8f58d2163229e9690289e9409ddb1b99768eafe1623da896faf7e1114bebeadc1be30829b6f8af707d85c298f4f0ff4d9438aef948335612ae921e76d411c3a9111df62d27eaf871959ae0062b5492a0feb98ef3ed4af277f5395172dbe5c311918ea0074ce0036454f620", "keyed_hash": "befc660aea2f1718884cd8deb9902811d332f4fc4a38cf7c7300d597a081bfc0bbb64a36edb564e01e4b4aaf3b060092a6b838bea44afebd2deb8298fa562b7b597c757b9df4c911c3ca462e2ac89e9a787357aaf74c3b56d5c07bc93ce899568a3eb17d9250c20f6c5f6c1e792ec9a2dcb715398d5a6ec6d5c54f586a00403a1af1de", "derive_key": "1e0d7f3db8c414c97c6307cbda6cd27ac3b030949da8e23be1a1a924ad2f25b9d78038f7b198596c6cc4a9ccf93223c08722d684f240ff6569075ed81591fd93f9fff1110b3a75bc67e426012e5588959cc5a4c192173a03c00731cf84544f65a2fb9378989f72e9694a6a394a8a30997c2e67f95a504e631cd2c5f55246024761b245" }, { 
"input_len": 4097, "hash": "9b4052b38f1c5fc8b1f9ff7ac7b27cd242487b3d890d15c96a1c25b8aa0fb99505f91b0b5600a11251652eacfa9497b31cd3c409ce2e45cfe6c0a016967316c426bd26f619eab5d70af9a418b845c608840390f361630bd497b1ab44019316357c61dbe091ce72fc16dc340ac3d6e009e050b3adac4b5b2c92e722cffdc46501531956", "keyed_hash": "00df940cd36bb9fa7cbbc3556744e0dbc8191401afe70520ba292ee3ca80abbc606db4976cfdd266ae0abf667d9481831ff12e0caa268e7d3e57260c0824115a54ce595ccc897786d9dcbf495599cfd90157186a46ec800a6763f1c59e36197e9939e900809f7077c102f888caaf864b253bc41eea812656d46742e4ea42769f89b83f", "derive_key": "aca51029626b55fda7117b42a7c211f8c6e9ba4fe5b7a8ca922f34299500ead8a897f66a400fed9198fd61dd2d58d382458e64e100128075fc54b860934e8de2e84170734b06e1d212a117100820dbc48292d148afa50567b8b84b1ec336ae10d40c8c975a624996e12de31abbe135d9d159375739c333798a80c64ae895e51e22f3ad" }, { "input_len": 5120, "hash": "9cadc15fed8b5d854562b26a9536d9707cadeda9b143978f319ab34230535833acc61c8fdc114a2010ce8038c853e121e1544985133fccdd0a2d507e8e615e611e9a0ba4f47915f49e53d721816a9198e8b30f12d20ec3689989175f1bf7a300eee0d9321fad8da232ece6efb8e9fd81b42ad161f6b9550a069e66b11b40487a5f5059", "keyed_hash": "2c493e48e9b9bf31e0553a22b23503c0a3388f035cece68eb438d22fa1943e209b4dc9209cd80ce7c1f7c9a744658e7e288465717ae6e56d5463d4f80cdb2ef56495f6a4f5487f69749af0c34c2cdfa857f3056bf8d807336a14d7b89bf62bef2fb54f9af6a546f818dc1e98b9e07f8a5834da50fa28fb5874af91bf06020d1bf0120e", "derive_key": "7a7acac8a02adcf3038d74cdd1d34527de8a0fcc0ee3399d1262397ce5817f6055d0cefd84d9d57fe792d65a278fd20384ac6c30fdb340092f1a74a92ace99c482b28f0fc0ef3b923e56ade20c6dba47e49227166251337d80a037e987ad3a7f728b5ab6dfafd6e2ab1bd583a95d9c895ba9c2422c24ea0f62961f0dca45cad47bfa0d" }, { "input_len": 5121, "hash": 
"628bd2cb2004694adaab7bbd778a25df25c47b9d4155a55f8fbd79f2fe154cff96adaab0613a6146cdaabe498c3a94e529d3fc1da2bd08edf54ed64d40dcd6777647eac51d8277d70219a9694334a68bc8f0f23e20b0ff70ada6f844542dfa32cd4204ca1846ef76d811cdb296f65e260227f477aa7aa008bac878f72257484f2b6c95", "keyed_hash": "6ccf1c34753e7a044db80798ecd0782a8f76f33563accaddbfbb2e0ea4b2d0240d07e63f13667a8d1490e5e04f13eb617aea16a8c8a5aaed1ef6fbde1b0515e3c81050b361af6ead126032998290b563e3caddeaebfab592e155f2e161fb7cba939092133f23f9e65245e58ec23457b78a2e8a125588aad6e07d7f11a85b88d375b72d", "derive_key": "b07f01e518e702f7ccb44a267e9e112d403a7b3f4883a47ffbed4b48339b3c341a0add0ac032ab5aaea1e4e5b004707ec5681ae0fcbe3796974c0b1cf31a194740c14519273eedaabec832e8a784b6e7cfc2c5952677e6c3f2c3914454082d7eb1ce1766ac7d75a4d3001fc89544dd46b5147382240d689bbbaefc359fb6ae30263165" }, { "input_len": 6144, "hash": "3e2e5b74e048f3add6d21faab3f83aa44d3b2278afb83b80b3c35164ebeca2054d742022da6fdda444ebc384b04a54c3ac5839b49da7d39f6d8a9db03deab32aade156c1c0311e9b3435cde0ddba0dce7b26a376cad121294b689193508dd63151603c6ddb866ad16c2ee41585d1633a2cea093bea714f4c5d6b903522045b20395c83", "keyed_hash": "3d6b6d21281d0ade5b2b016ae4034c5dec10ca7e475f90f76eac7138e9bc8f1dc35754060091dc5caf3efabe0603c60f45e415bb3407db67e6beb3d11cf8e4f7907561f05dace0c15807f4b5f389c841eb114d81a82c02a00b57206b1d11fa6e803486b048a5ce87105a686dee041207e095323dfe172df73deb8c9532066d88f9da7e", "derive_key": "2a95beae63ddce523762355cf4b9c1d8f131465780a391286a5d01abb5683a1597099e3c6488aab6c48f3c15dbe1942d21dbcdc12115d19a8b8465fb54e9053323a9178e4275647f1a9927f6439e52b7031a0b465c861a3fc531527f7758b2b888cf2f20582e9e2c593709c0a44f9c6e0f8b963994882ea4168827823eef1f64169fef" }, { "input_len": 6145, "hash": "f1323a8631446cc50536a9f705ee5cb619424d46887f3c376c695b70e0f0507f18a2cfdd73c6e39dd75ce7c1c6e3ef238fd54465f053b25d21044ccb2093beb015015532b108313b5829c3621ce324b8e14229091b7c93f32db2e4e63126a377d2a63a3597997d4f1cba59309cb4af240ba70cebff9a23d5e3ff0cdae2cfd54e070022", "keyed_hash": 
"9ac301e9e39e45e3250a7e3b3df701aa0fb6889fbd80eeecf28dbc6300fbc539f3c184ca2f59780e27a576c1d1fb9772e99fd17881d02ac7dfd39675aca918453283ed8c3169085ef4a466b91c1649cc341dfdee60e32231fc34c9c4e0b9a2ba87ca8f372589c744c15fd6f985eec15e98136f25beeb4b13c4e43dc84abcc79cd4646c", "derive_key": "379bcc61d0051dd489f686c13de00d5b14c505245103dc040d9e4dd1facab8e5114493d029bdbd295aaa744a59e31f35c7f52dba9c3642f773dd0b4262a9980a2aef811697e1305d37ba9d8b6d850ef07fe41108993180cf779aeece363704c76483458603bbeeb693cffbbe5588d1f3535dcad888893e53d977424bb707201569a8d2" }, { "input_len": 7168, "hash": "61da957ec2499a95d6b8023e2b0e604ec7f6b50e80a9678b89d2628e99ada77a5707c321c83361793b9af62a40f43b523df1c8633cecb4cd14d00bdc79c78fca5165b863893f6d38b02ff7236c5a9a8ad2dba87d24c547cab046c29fc5bc1ed142e1de4763613bb162a5a538e6ef05ed05199d751f9eb58d332791b8d73fb74e4fce95", "keyed_hash": "b42835e40e9d4a7f42ad8cc04f85a963a76e18198377ed84adddeaecacc6f3fca2f01d5277d69bb681c70fa8d36094f73ec06e452c80d2ff2257ed82e7ba348400989a65ee8daa7094ae0933e3d2210ac6395c4af24f91c2b590ef87d7788d7066ea3eaebca4c08a4f14b9a27644f99084c3543711b64a070b94f2c9d1d8a90d035d52", "derive_key": "11c37a112765370c94a51415d0d651190c288566e295d505defdad895dae223730d5a5175a38841693020669c7638f40b9bc1f9f39cf98bda7a5b54ae24218a800a2116b34665aa95d846d97ea988bfcb53dd9c055d588fa21ba78996776ea6c40bc428b53c62b5f3ccf200f647a5aae8067f0ea1976391fcc72af1945100e2a6dcb88" }, { "input_len": 7169, "hash": "a003fc7a51754a9b3c7fae0367ab3d782dccf28855a03d435f8cfe74605e781798a8b20534be1ca9eb2ae2df3fae2ea60e48c6fb0b850b1385b5de0fe460dbe9d9f9b0d8db4435da75c601156df9d047f4ede008732eb17adc05d96180f8a73548522840779e6062d643b79478a6e8dbce68927f36ebf676ffa7d72d5f68f050b119c8", "keyed_hash": "ed9b1a922c046fdb3d423ae34e143b05ca1bf28b710432857bf738bcedbfa5113c9e28d72fcbfc020814ce3f5d4fc867f01c8f5b6caf305b3ea8a8ba2da3ab69fabcb438f19ff11f5378ad4484d75c478de425fb8e6ee809b54eec9bdb184315dc856617c09f5340451bf42fd3270a7b0b6566169f242e533777604c118a6358250f54", "derive_key": 
"554b0a5efea9ef183f2f9b931b7497995d9eb26f5c5c6dad2b97d62fc5ac31d99b20652c016d88ba2a611bbd761668d5eda3e568e940faae24b0d9991c3bd25a65f770b89fdcadabcb3d1a9c1cb63e69721cacf1ae69fefdcef1e3ef41bc5312ccc17222199e47a26552c6adc460cf47a72319cb5039369d0060eaea59d6c65130f1dd" }, { "input_len": 8192, "hash": "aae792484c8efe4f19e2ca7d371d8c467ffb10748d8a5a1ae579948f718a2a635fe51a27db045a567c1ad51be5aa34c01c6651c4d9b5b5ac5d0fd58cf18dd61a47778566b797a8c67df7b1d60b97b19288d2d877bb2df417ace009dcb0241ca1257d62712b6a4043b4ff33f690d849da91ea3bf711ed583cb7b7a7da2839ba71309bbf", "keyed_hash": "dc9637c8845a770b4cbf76b8daec0eebf7dc2eac11498517f08d44c8fc00d58a4834464159dcbc12a0ba0c6d6eb41bac0ed6585cabfe0aca36a375e6c5480c22afdc40785c170f5a6b8a1107dbee282318d00d915ac9ed1143ad40765ec120042ee121cd2baa36250c618adaf9e27260fda2f94dea8fb6f08c04f8f10c78292aa46102", "derive_key": "ad01d7ae4ad059b0d33baa3c01319dcf8088094d0359e5fd45d6aeaa8b2d0c3d4c9e58958553513b67f84f8eac653aeeb02ae1d5672dcecf91cd9985a0e67f4501910ecba25555395427ccc7241d70dc21c190e2aadee875e5aae6bf1912837e53411dabf7a56cbf8e4fb780432b0d7fe6cec45024a0788cf5874616407757e9e6bef7" }, { "input_len": 8193, "hash": "bab6c09cb8ce8cf459261398d2e7aef35700bf488116ceb94a36d0f5f1b7bc3bb2282aa69be089359ea1154b9a9286c4a56af4de975a9aa4a5c497654914d279bea60bb6d2cf7225a2fa0ff5ef56bbe4b149f3ed15860f78b4e2ad04e158e375c1e0c0b551cd7dfc82f1b155c11b6b3ed51ec9edb30d133653bb5709d1dbd55f4e1ff6", "keyed_hash": "954a2a75420c8d6547e3ba5b98d963e6fa6491addc8c023189cc519821b4a1f5f03228648fd983aef045c2fa8290934b0866b615f585149587dda2299039965328835a2b18f1d63b7e300fc76ff260b571839fe44876a4eae66cbac8c67694411ed7e09df51068a22c6e67d6d3dd2cca8ff12e3275384006c80f4db68023f24eebba57", "derive_key": "af1e0346e389b17c23200270a64aa4e1ead98c61695d917de7d5b00491c9b0f12f20a01d6d622edf3de026a4db4e4526225debb93c1237934d71c7340bb5916158cbdafe9ac3225476b6ab57a12357db3abbad7a26c6e66290e44034fb08a20a8d0ec264f309994d2810c49cfba6989d7abb095897459f5425adb48aba07c5fb3c83c0" }, { "input_len": 
16384, "hash": "f875d6646de28985646f34ee13be9a576fd515f76b5b0a26bb324735041ddde49d764c270176e53e97bdffa58d549073f2c660be0e81293767ed4e4929f9ad34bbb39a529334c57c4a381ffd2a6d4bfdbf1482651b172aa883cc13408fa67758a3e47503f93f87720a3177325f7823251b85275f64636a8f1d599c2e49722f42e93893", "keyed_hash": "9e9fc4eb7cf081ea7c47d1807790ed211bfec56aa25bb7037784c13c4b707b0df9e601b101e4cf63a404dfe50f2e1865bb12edc8fca166579ce0c70dba5a5c0fc960ad6f3772183416a00bd29d4c6e651ea7620bb100c9449858bf14e1ddc9ecd35725581ca5b9160de04060045993d972571c3e8f71e9d0496bfa744656861b169d65", "derive_key": "160e18b5878cd0df1c3af85eb25a0db5344d43a6fbd7a8ef4ed98d0714c3f7e160dc0b1f09caa35f2f417b9ef309dfe5ebd67f4c9507995a531374d099cf8ae317542e885ec6f589378864d3ea98716b3bbb65ef4ab5e0ab5bb298a501f19a41ec19af84a5e6b428ecd813b1a47ed91c9657c3fba11c406bc316768b58f6802c9e9b57" }, { "input_len": 31744, "hash": "62b6960e1a44bcc1eb1a611a8d6235b6b4b78f32e7abc4fb4c6cdcce94895c47860cc51f2b0c28a7b77304bd55fe73af663c02d3f52ea053ba43431ca5bab7bfea2f5e9d7121770d88f70ae9649ea713087d1914f7f312147e247f87eb2d4ffef0ac978bf7b6579d57d533355aa20b8b77b13fd09748728a5cc327a8ec470f4013226f", "keyed_hash": "efa53b389ab67c593dba624d898d0f7353ab99e4ac9d42302ee64cbf9939a4193a7258db2d9cd32a7a3ecfce46144114b15c2fcb68a618a976bd74515d47be08b628be420b5e830fade7c080e351a076fbc38641ad80c736c8a18fe3c66ce12f95c61c2462a9770d60d0f77115bbcd3782b593016a4e728d4c06cee4505cb0c08a42ec", "derive_key": "39772aef80e0ebe60596361e45b061e8f417429d529171b6764468c22928e28e9759adeb797a3fbf771b1bcea30150a020e317982bf0d6e7d14dd9f064bc11025c25f31e81bd78a921db0174f03dd481d30e93fd8e90f8b2fee209f849f2d2a52f31719a490fb0ba7aea1e09814ee912eba111a9fde9d5c274185f7bae8ba85d300a2b" }, { "input_len": 100000, "hash": "d93c23eedaf165a7e0be908ba86f1a7a520d568d2d13cde787c8580c5c72cc54902b765d0e69ff7f278ef2f8bb839b673f0db20afa0566c78965ad819674822fd11a507251555fc6daec7437074bc7b7307dfe122411b3676a932b5b0360d5ad495f8e7431d3d025fac5b4e955ce893a3504f2569f838eea47cf1bb21c4ae659db522f", 
"keyed_hash": "74c836d008247adebbc032d1bced2e71d19050b5c39fa03c43d4160ad8d170732f3b73e374a4500825c13d2c8c9384ce12c033adc49245ce42f50d5b48237397b8447bd414b0693bef98518db8a3494e6e8e3abc931f92f472d938f07eac97d1cc69b375426bce26c5e829b5b41cacbb5543544977749d503fa78309e7a158640e579c", "derive_key": "039c0c0d76eacefea9c8d042698bd012d3cef4091ed5c5a7e32a30e4d51718930a99481bb11214d9e9e79e58d11875a789447731a887aa77499843148d35b1752c6314af6d36559341bd6895c5ee0a452c99cb47a9b22dfe36042932fc9a423d245b91b6246c85e4b0d415cbece3e0545d6e242853da7f3dd1f9b0f146ec72706b8c28" } ] }