pax_global_header00006660000000000000000000000064141154417670014524gustar00rootroot0000000000000052 comment=09d3897aaa0fdbc603afcf54cfe25bf5d677a6b4 blake3-1.1.6/000077500000000000000000000000001411544176700126725ustar00rootroot00000000000000blake3-1.1.6/LICENSE000066400000000000000000000020701411544176700136760ustar00rootroot00000000000000The MIT License (MIT) Copyright (c) 2020 Luke Champine Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. blake3-1.1.6/README.md000066400000000000000000000044461411544176700141610ustar00rootroot00000000000000blake3 ------ [![GoDoc](https://godoc.org/lukechampine.com/blake3?status.svg)](https://godoc.org/lukechampine.com/blake3) [![Go Report Card](http://goreportcard.com/badge/lukechampine.com/blake3)](https://goreportcard.com/report/lukechampine.com/blake3) ``` go get lukechampine.com/blake3 ``` `blake3` implements the [BLAKE3 cryptographic hash function](https://github.com/BLAKE3-team/BLAKE3). 
This implementation aims to be performant without sacrificing (too much) readability, in the hopes of eventually landing in `x/crypto`. In addition to the pure-Go implementation, this package also contains AVX-512 and AVX2 routines (generated by [`avo`](https://github.com/mmcloughlin/avo)) that greatly increase performance for large inputs and outputs. Contributions are greatly appreciated. [All contributors are eligible to receive an Urbit planet.](https://twitter.com/lukechampine/status/1274797924522885134) ## Benchmarks Tested on a 2020 MacBook Air (i5-7600K @ 3.80GHz). Benchmarks will improve as soon as I get access to a beefier AVX-512 machine. :wink: ### AVX-512 ``` BenchmarkSum256/64 120 ns/op 533.00 MB/s BenchmarkSum256/1024 2229 ns/op 459.36 MB/s BenchmarkSum256/65536 16245 ns/op 4034.11 MB/s BenchmarkWrite 245 ns/op 4177.38 MB/s BenchmarkXOF 246 ns/op 4159.30 MB/s ``` ### AVX2 ``` BenchmarkSum256/64 120 ns/op 533.00 MB/s BenchmarkSum256/1024 2229 ns/op 459.36 MB/s BenchmarkSum256/65536 31137 ns/op 2104.76 MB/s BenchmarkWrite 487 ns/op 2103.12 MB/s BenchmarkXOF 329 ns/op 3111.27 MB/s ``` ### Pure Go ``` BenchmarkSum256/64 120 ns/op 533.00 MB/s BenchmarkSum256/1024 2229 ns/op 459.36 MB/s BenchmarkSum256/65536 133505 ns/op 490.89 MB/s BenchmarkWrite 2022 ns/op 506.36 MB/s BenchmarkXOF 1914 ns/op 534.98 MB/s ``` ## Shortcomings There is no assembly routine for single-block compressions. This is most noticeable for ~1KB inputs. Each assembly routine inlines all 7 rounds, causing thousands of lines of duplicated code. Ideally the routines could be merged such that only a single routine is generated for AVX-512 and AVX2, without sacrificing too much performance. blake3-1.1.6/avo/000077500000000000000000000000001411544176700134575ustar00rootroot00000000000000blake3-1.1.6/avo/gen.go000066400000000000000000000413741411544176700145700ustar00rootroot00000000000000// +build ignore package main import ( "fmt" . 
"github.com/mmcloughlin/avo/build" "github.com/mmcloughlin/avo/ir" . "github.com/mmcloughlin/avo/operand" . "github.com/mmcloughlin/avo/reg" ) func main() { genGlobals() genCompressBlocksAVX512() genCompressChunksAVX512() genCompressBlocksAVX2() genCompressChunksAVX2() genCompressParentsAVX2() Generate() } var globals struct { iv Mem seq Mem seq64 Mem // for loadCounter shuffleRot8 Mem shuffleRot16 Mem } func genGlobals() { globals.iv = GLOBL("iv", RODATA|NOPTR) DATA(0*4, U32(0x6A09E667)) DATA(1*4, U32(0xBB67AE85)) DATA(2*4, U32(0x3C6EF372)) DATA(3*4, U32(0xA54FF53A)) globals.seq = GLOBL("seq", RODATA|NOPTR) for i := 0; i < 16; i++ { DATA(i*4, U32(i)) } globals.seq64 = GLOBL("seq64", RODATA|NOPTR) for i := 0; i < 8; i++ { DATA(i*8, U64(i)) } globals.shuffleRot8 = GLOBL("shuffle_rot8", RODATA|NOPTR) for i := 0; i < 8; i++ { DATA(i*4, U32(0x00030201+0x04040404*i)) } globals.shuffleRot16 = GLOBL("shuffle_rot16", RODATA|NOPTR) for i := 0; i < 8; i++ { DATA(i*4, U32(0x01000302+0x04040404*i)) } } func genCompressBlocksAVX512() { TEXT("compressBlocksAVX512", NOSPLIT, "func(out *[1024]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)") out := Mem{Base: Load(Param("out"), GP64())} block := Mem{Base: Load(Param("block"), GP64())} cv := Mem{Base: Load(Param("cv"), GP64())} counter, _ := Param("counter").Resolve() blockLen, _ := Param("blockLen").Resolve() flags, _ := Param("flags").Resolve() Comment("Initialize block vectors") var vs, mv [16]VecVirtual for i := range vs { vs[i], mv[i] = ZMM(), ZMM() VPBROADCASTD_Z(block.Offset(i*4), mv[i]) } Comment("Initialize state vectors") for i, v := range vs { switch i { case 0, 1, 2, 3, 4, 5, 6, 7: // cv VPBROADCASTD_Z(cv.Offset(i*4), v) case 8, 9, 10, 11: // iv VPBROADCASTD_Z(globals.iv.Offset((i-8)*4), v) case 12: // counter VPBROADCASTD_Z(counter.Addr, vs[12]) VPADDD_Z(globals.seq, vs[12], vs[12]) // set a 1 bit in K1 for each overflowed counter in vs[12] VPCMPUD(Imm(1), globals.seq, vs[12], K1) 
// add 1 to each counter in vs[13] for each 1 bit in K1 VPBROADCASTD_Z(counter.Addr.Offset(1*4), vs[13]) VPADDD_ZBK(globals.seq.Offset(4), vs[13], K1, vs[13]) case 14: // blockLen VPBROADCASTD_Z(blockLen.Addr, v) case 15: // flags VPBROADCASTD_Z(flags.Addr, v) } } performRoundsAVX512(vs, mv) Comment("Finalize CVs") for i, v := range vs[:8] { VPXORD_Z(v, vs[i+8], v) } for i, v := range vs[8:] { VPXORD_ZB(cv.Offset(i*4), v, v) } stride := ZMM() VMOVDQU32_Z(globals.seq, stride) VPSLLD_Z(Imm(6), stride, stride) // stride of 64 for i, v := range vs { KXNORD(K1, K1, K1) // fastest way to set all bits to 1 VPSCATTERDD_Z(v, K1, out.Offset(i*4).Idx(stride, 1)) } RET() } func genCompressChunksAVX512() { TEXT("compressChunksAVX512", NOSPLIT, "func(cvs *[16][8]uint32, buf *[16384]byte, key *[8]uint32, counter uint64, flags uint32)") cvs := Mem{Base: Load(Param("cvs"), GP64())} buf := Mem{Base: Load(Param("buf"), GP64())} key := Mem{Base: Load(Param("key"), GP64())} counter, _ := Param("counter").Resolve() flags, _ := Param("flags").Resolve() var vs, mv [16]VecVirtual for i := range vs { vs[i], mv[i] = ZMM(), ZMM() } Comment("Initialize counter") counterLo := AllocLocal(64) counterHi := AllocLocal(64) VPBROADCASTD_Z(counter.Addr, vs[0]) VPADDD_Z(globals.seq, vs[0], vs[0]) VPCMPUD(Imm(1), globals.seq, vs[0], K1) VPBROADCASTD_Z(counter.Addr.Offset(4), vs[1]) VPADDD_ZBK(globals.seq.Offset(4), vs[1], K1, vs[1]) VMOVDQU32_Z(vs[0], counterLo) VMOVDQU32_Z(vs[1], counterHi) Comment("Initialize flags") chunkFlags := AllocLocal(16 * 4) VPBROADCASTD_Z(flags.Addr, vs[0]) VMOVDQU32_Z(vs[0], chunkFlags) ORL(Imm(1), chunkFlags.Offset(0*4)) ORL(Imm(2), chunkFlags.Offset(15*4)) Comment("Load key") for i := 0; i < 8; i++ { VPBROADCASTD_Z(key.Offset(i*4), vs[i]) } Comment("Loop index") loop := GP64() XORQ(loop, loop) Label("loop") Comment("Load transposed block") VMOVDQU32_Z(globals.seq, vs[8]) VPSLLD_Z(Imm(10), vs[8], vs[8]) // stride of 1024 for i, m := range mv { KXNORD(K1, K1, K1) 
VPGATHERDD_Z(buf.Offset(i*4).Idx(vs[8], 1), K1, m) } ADDQ(Imm(64), buf.Base) Comment("Reload state vectors (other than CVs)") for i := 0; i < 4; i++ { VPBROADCASTD_Z(globals.iv.Offset(i*4), vs[8+i]) } VMOVDQU32_Z(counterLo, vs[12]) VMOVDQU32_Z(counterHi, vs[13]) VPBROADCASTD_Z(globals.seq.Offset(4), vs[14]) VPSLLD_Z(Imm(6), vs[14], vs[14]) // 64 VPBROADCASTD_Z(chunkFlags.Idx(loop, 4), vs[15]) performRoundsAVX512(vs, mv) Comment("Finalize CVs") for i := range vs[:8] { VPXORD_Z(vs[i], vs[i+8], vs[i]) } Comment("Loop") INCQ(loop) CMPQ(loop, U32(16)) JNE(LabelRef("loop")) Comment("Finished; transpose CVs") VMOVDQU32_Z(globals.seq, vs[8]) VPSLLD_Z(Imm(5), vs[8], vs[8]) // stride of 32 for i, v := range vs[:8] { KXNORD(K1, K1, K1) // fastest way to set all bits to 1 VPSCATTERDD_Z(v, K1, cvs.Offset(i*4).Idx(vs[8], 1)) } RET() } func performRoundsAVX512(vs, mv [16]VecVirtual) { g := func(a, b, c, d, mx, my VecVirtual) { VPADDD_Z(a, b, a) VPADDD_Z(mx, a, a) VPXORD_Z(d, a, d) VPRORD_Z(Imm(16), d, d) VPADDD_Z(c, d, c) VPXORD_Z(b, c, b) VPRORD_Z(Imm(12), b, b) VPADDD_Z(a, b, a) VPADDD_Z(my, a, a) VPXORD_Z(d, a, d) VPRORD_Z(Imm(8), d, d) VPADDD_Z(c, d, c) VPXORD_Z(b, c, b) VPRORD_Z(Imm(7), b, b) } for i := 0; i < 7; i++ { Comment(fmt.Sprintf("Round %v", i+1)) g(vs[0], vs[4], vs[8], vs[12], mv[0], mv[1]) g(vs[1], vs[5], vs[9], vs[13], mv[2], mv[3]) g(vs[2], vs[6], vs[10], vs[14], mv[4], mv[5]) g(vs[3], vs[7], vs[11], vs[15], mv[6], mv[7]) g(vs[0], vs[5], vs[10], vs[15], mv[8], mv[9]) g(vs[1], vs[6], vs[11], vs[12], mv[10], mv[11]) g(vs[2], vs[7], vs[8], vs[13], mv[12], mv[13]) g(vs[3], vs[4], vs[9], vs[14], mv[14], mv[15]) // permute mv = [16]VecVirtual{ mv[2], mv[6], mv[3], mv[10], mv[7], mv[0], mv[4], mv[13], mv[1], mv[11], mv[12], mv[5], mv[9], mv[14], mv[15], mv[8], } } } func genCompressBlocksAVX2() { TEXT("compressBlocksAVX2", NOSPLIT, "func(out *[512]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)") out := Mem{Base: 
Load(Param("out"), GP64())} block := Mem{Base: Load(Param("block"), GP64())} cv := Mem{Base: Load(Param("cv"), GP64())} counter, _ := Param("counter").Resolve() blockLen, _ := Param("blockLen").Resolve() flags, _ := Param("flags").Resolve() var vs [16]VecVirtual var mv [16]Mem for i := range vs { vs[i] = YMM() mv[i] = AllocLocal(32) } Comment("Load block") for i := 0; i < 16; i++ { VPBROADCASTD(block.Offset(i*4), vs[0]) VMOVDQU(vs[0], mv[i]) } Comment("Initialize state vectors") for i, v := range vs { switch i { case 0, 1, 2, 3, 4, 5, 6, 7: // cv VPBROADCASTD(cv.Offset(i*4), v) case 8, 9, 10, 11: // iv VPBROADCASTD(globals.iv.Offset((i-8)*4), v) case 12: // counter loadCounter(counter.Addr, vs[12:14], vs[14:16]) case 14: // blockLen VPBROADCASTD(blockLen.Addr, v) case 15: // flags VPBROADCASTD(flags.Addr, v) } } performRoundsAVX2(vs, mv) Comment("Finalize CVs") for i := 8; i < 16; i++ { VMOVDQU(vs[i], mv[i]) } for i := range vs[:8] { VPXOR(vs[i], vs[i+8], vs[i]) } transpose(vs[:8], vs[8:]) for i, v := range vs[8:] { VMOVDQU(v, out.Offset(i*64)) } for i := 8; i < 16; i++ { VMOVDQU(mv[i], vs[i]) } for i, v := range vs[8:] { VPBROADCASTD(cv.Offset(i*4), vs[0]) VPXOR(vs[0], v, v) } transpose(vs[8:], vs[:8]) for i, v := range vs[:8] { VMOVDQU(v, out.Offset(i*64+32)) } RET() } func genCompressChunksAVX2() { TEXT("compressChunksAVX2", NOSPLIT, "func(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32)") cvs := Mem{Base: Load(Param("cvs"), GP64())} buf := Mem{Base: Load(Param("buf"), GP64())} key := Mem{Base: Load(Param("key"), GP64())} counter, _ := Param("counter").Resolve() flags, _ := Param("flags").Resolve() var vs [16]VecVirtual var mv [16]Mem for i := range vs { vs[i] = YMM() mv[i] = AllocLocal(32) } Comment("Load key") for i := 0; i < 8; i++ { VPBROADCASTD(key.Offset(i*4), vs[i]) } Comment("Initialize counter") counterLo := AllocLocal(32) counterHi := AllocLocal(32) loadCounter(counter.Addr, vs[12:14], vs[14:16]) VMOVDQU(vs[12], 
counterLo) VMOVDQU(vs[13], counterHi) Comment("Initialize flags") chunkFlags := AllocLocal(16 * 4) VPBROADCASTD(flags.Addr, vs[14]) VMOVDQU(vs[14], chunkFlags.Offset(0*32)) VMOVDQU(vs[14], chunkFlags.Offset(1*32)) ORL(Imm(1), chunkFlags.Offset(0*4)) ORL(Imm(2), chunkFlags.Offset(15*4)) Comment("Loop index") loop := GP64() XORQ(loop, loop) Label("loop") Comment("Load transposed block") VMOVDQU(globals.seq, vs[9]) VPSLLD(Imm(10), vs[9], vs[9]) // stride of 1024 for i := 0; i < 16; i++ { VPCMPEQD(vs[8], vs[8], vs[8]) // fastest way to set all bits to 1 VPGATHERDD(vs[8], buf.Offset(i*4).Idx(vs[9], 1), vs[10]) VMOVDQU(vs[10], mv[i]) } ADDQ(Imm(64), buf.Base) Comment("Reload state vectors (other than CVs)") for i := 0; i < 4; i++ { VPBROADCASTD(globals.iv.Offset(i*4), vs[8+i]) } VMOVDQU(counterLo, vs[12]) VMOVDQU(counterHi, vs[13]) VPBROADCASTD(globals.seq.Offset(4), vs[14]) VPSLLD(Imm(6), vs[14], vs[14]) // 64 VPBROADCASTD(chunkFlags.Idx(loop, 4), vs[15]) performRoundsAVX2(vs, mv) Comment("Finalize CVs") for i := range vs[:8] { VPXOR(vs[i], vs[i+8], vs[i]) } Comment("Loop") INCQ(loop) CMPQ(loop, U32(16)) JNE(LabelRef("loop")) Comment("Finished; transpose CVs") transpose(vs[:8], vs[8:]) for i, v := range vs[8:] { VMOVDQU(v, cvs.Offset(i*32)) } RET() } func genCompressParentsAVX2() { TEXT("compressParentsAVX2", NOSPLIT, "func(parents *[8][8]uint32, cvs *[16][8]uint32, key *[8]uint32, flags uint32)") parents := Mem{Base: Load(Param("parents"), GP64())} cvs := Mem{Base: Load(Param("cvs"), GP64())} key := Mem{Base: Load(Param("key"), GP64())} flags, _ := Param("flags").Resolve() var vs [16]VecVirtual var mv [16]Mem for i := range vs { vs[i] = YMM() mv[i] = AllocLocal(32) } Comment("Load transposed block") VMOVDQU(globals.seq, vs[9]) VPSLLD(Imm(6), vs[9], vs[9]) // stride of 64 for i := 0; i < 16; i++ { VPCMPEQD(vs[8], vs[8], vs[8]) // fastest way to set all bits to 1 VPGATHERDD(vs[8], cvs.Offset(i*4).Idx(vs[9], 1), vs[10]) VMOVDQU(vs[10], mv[i]) } Comment("Initialize state 
vectors") for i, v := range vs { switch i { case 0, 1, 2, 3, 4, 5, 6, 7: // cv VPBROADCASTD(key.Offset(i*4), v) case 8, 9, 10, 11: // iv VPBROADCASTD(globals.iv.Offset((i-8)*4), v) case 12, 13: // counter VPXOR(v, v, v) case 14: // blockLen VPBROADCASTD(globals.seq.Offset(1*4), v) VPSLLD(Imm(6), v, v) // 64 case 15: // flags ORL(Imm(4), flags.Addr) // flagParent VPBROADCASTD(flags.Addr, v) } } performRoundsAVX2(vs, mv) Comment("Finalize CVs") for i := range vs[:8] { VPXOR(vs[i], vs[i+8], vs[i]) } transpose(vs[:8], vs[8:]) for i, v := range vs[8:] { VMOVDQU(v, parents.Offset(i*32)) } RET() } func performRoundsAVX2(sv [16]VecVirtual, mv [16]Mem) { spillMem := AllocLocal(32) tmp := sv[8] g := func(a, b, c, d VecVirtual, mx, my Mem) { // Helper function for performing rotations. Also manages c, tmp and // spillMem: if c == tmp, we need to spill and reload c using spillMem. rotr := func(v VecVirtual, n uint64, dst VecVirtual) { switch n { case 8, 16: shuf := [...]Mem{8: globals.shuffleRot8, 16: globals.shuffleRot16}[n] VPSHUFB(shuf, v, dst) if c == tmp { VMOVDQU(spillMem, c) } case 7, 12: if c == tmp { VMOVDQU(c, spillMem) } VPSRLD(Imm(n), v, tmp) VPSLLD(Imm(32-n), v, dst) VPOR(dst, tmp, dst) } } VPADDD(a, b, a) VPADDD(mx, a, a) VPXOR(d, a, d) rotr(d, 16, d) VPADDD(c, d, c) VPXOR(b, c, b) rotr(b, 12, b) VPADDD(a, b, a) VPADDD(my, a, a) VPXOR(d, a, d) rotr(d, 8, d) VPADDD(c, d, c) VPXOR(b, c, b) rotr(b, 7, b) } VMOVDQU(sv[8], spillMem) // spill for i := 0; i < 7; i++ { Comment(fmt.Sprintf("Round %v", i+1)) g(sv[0], sv[4], sv[8], sv[12], mv[0], mv[1]) g(sv[1], sv[5], sv[9], sv[13], mv[2], mv[3]) g(sv[2], sv[6], sv[10], sv[14], mv[4], mv[5]) g(sv[3], sv[7], sv[11], sv[15], mv[6], mv[7]) g(sv[0], sv[5], sv[10], sv[15], mv[8], mv[9]) g(sv[1], sv[6], sv[11], sv[12], mv[10], mv[11]) g(sv[2], sv[7], sv[8], sv[13], mv[12], mv[13]) g(sv[3], sv[4], sv[9], sv[14], mv[14], mv[15]) // permute mv = [16]Mem{ mv[2], mv[6], mv[3], mv[10], mv[7], mv[0], mv[4], mv[13], mv[1], mv[11], 
mv[12], mv[5], mv[9], mv[14], mv[15], mv[8], } } VMOVDQU(spillMem, sv[8]) // reload } func loadCounter(counter Mem, dst, scratch []VecVirtual) { // fill dst[0] and dst[1] with counter + 0,1,2,3,4,5,6,7, then transpose so // that dst[0] contains low 32 bits and dst[1] contains high 32 bits. VPBROADCASTQ(counter, dst[0]) VPBROADCASTQ(counter, dst[1]) VPADDQ(globals.seq64.Offset(0*4), dst[0], dst[0]) VPADDQ(globals.seq64.Offset(8*4), dst[1], dst[1]) VPUNPCKLDQ(dst[1], dst[0], scratch[0]) VPUNPCKHDQ(dst[1], dst[0], scratch[1]) VPUNPCKLDQ(scratch[1], scratch[0], dst[0]) VPUNPCKHDQ(scratch[1], scratch[0], dst[1]) const perm = 0<<0 | 2<<2 | 1<<4 | 3<<6 VPERMQ(Imm(perm), dst[0], dst[0]) VPERMQ(Imm(perm), dst[1], dst[1]) } func transpose(src, dst []VecVirtual) { // interleave uint32s for i := 0; i < 8; i += 2 { VPUNPCKLDQ(src[i+1], src[i], dst[i+0]) VPUNPCKHDQ(src[i+1], src[i], dst[i+1]) } // interleave groups of two uint32s for i := 0; i < 4; i++ { j := i*2 - i%2 // j := 0,1,4,5 VPUNPCKLQDQ(dst[j+2], dst[j], src[i*2+0]) VPUNPCKHQDQ(dst[j+2], dst[j], src[i*2+1]) } // interleave groups of four uint32s for i := 0; i < 4; i++ { VPERM2I128(Imm(0x20), src[i+4], src[i], dst[i+0]) VPERM2I128(Imm(0x31), src[i+4], src[i], dst[i+4]) } } // AVX-512 is not currently supported by avo, so we need to manually define the // instructions we need type maskReg = LabelRef // hack; avo doesn't allow custom Op types const K0 maskReg = "K0" const K1 maskReg = "K1" const K2 maskReg = "K2" func VMOVDQU32_Z(src, dst Op) { Instruction(&ir.Instruction{ Opcode: "VMOVDQU32", Operands: []Op{src, dst}, Inputs: []Op{src}, Outputs: []Op{dst}, ISA: []string{"AVX512F"}, }) } func VPBROADCASTD_Z(src, dst Op) { Instruction(&ir.Instruction{ Opcode: "VPBROADCASTD", Operands: []Op{src, dst}, Inputs: []Op{src}, Outputs: []Op{dst}, ISA: []string{"AVX512F"}, }) } func VPGATHERDD_Z(src, mask, dst Op) { Instruction(&ir.Instruction{ Opcode: "VPGATHERDD", Operands: []Op{src, mask, dst}, Inputs: []Op{src, mask}, Outputs: 
[]Op{dst}, ISA: []string{"AVX512F"}, }) } func VPSCATTERDD_Z(src, mask, dst Op) { Instruction(&ir.Instruction{ Opcode: "VPSCATTERDD", Operands: []Op{src, mask, dst}, Inputs: []Op{src, mask}, Outputs: []Op{dst}, ISA: []string{"AVX512F"}, }) } func VPORD_Z(x, y, dst Op) { Instruction(&ir.Instruction{ Opcode: "VPORD", Operands: []Op{x, y, dst}, Inputs: []Op{x, y}, Outputs: []Op{dst}, ISA: []string{"AVX512F"}, }) } func VPXORD_Z(x, y, dst Op) { Instruction(&ir.Instruction{ Opcode: "VPXORD", Operands: []Op{x, y, dst}, Inputs: []Op{x, y}, Outputs: []Op{dst}, ISA: []string{"AVX512F"}, }) } func VPXORD_ZB(x, y, dst Op) { Instruction(&ir.Instruction{ Opcode: "VPXORD.BCST", Operands: []Op{x, y, dst}, Inputs: []Op{x, y}, Outputs: []Op{dst}, ISA: []string{"AVX512F"}, }) } func VPRORD_Z(n, src, dst Op) { Instruction(&ir.Instruction{ Opcode: "VPRORD", Operands: []Op{n, src, dst}, Inputs: []Op{n, src}, Outputs: []Op{dst}, ISA: []string{"AVX512F"}, }) } func VPSLLD_Z(n, src, dst Op) { Instruction(&ir.Instruction{ Opcode: "VPSLLD", Operands: []Op{n, src, dst}, Inputs: []Op{n, src}, Outputs: []Op{dst}, ISA: []string{"AVX512F"}, }) } func VPADDD_Z(x, y, dst Op) { Instruction(&ir.Instruction{ Opcode: "VPADDD", Operands: []Op{x, y, dst}, Inputs: []Op{x, y}, Outputs: []Op{dst}, ISA: []string{"AVX512F"}, }) } func VPADDD_ZB(x, y, dst Op) { Instruction(&ir.Instruction{ Opcode: "VPADDD.BCST", Operands: []Op{x, y, dst}, Inputs: []Op{x, y}, Outputs: []Op{dst}, ISA: []string{"AVX512F"}, }) } func VPADDD_ZBK(x, y, mask, dst Op) { Instruction(&ir.Instruction{ Opcode: "VPADDD.BCST", Operands: []Op{x, y, mask, dst}, Inputs: []Op{x, y, mask}, Outputs: []Op{dst}, ISA: []string{"AVX512F"}, }) } func KXNORD(x, y, dst Op) { Instruction(&ir.Instruction{ Opcode: "KXNORD", Operands: []Op{x, y, dst}, Inputs: []Op{x, y}, Outputs: []Op{dst}, ISA: []string{"AVX512F"}, }) } func VPCMPUD(pred, x, y, dst Op) { Instruction(&ir.Instruction{ Opcode: "VPCMPUD", Operands: []Op{pred, x, y, dst}, Inputs: []Op{pred, x, 
y}, Outputs: []Op{dst}, ISA: []string{"AVX512F"}, }) } blake3-1.1.6/blake3.go000066400000000000000000000165711411544176700143740ustar00rootroot00000000000000// Package blake3 implements the BLAKE3 cryptographic hash function. package blake3 // import "lukechampine.com/blake3" import ( "encoding/binary" "errors" "hash" "io" "math" "math/bits" ) const ( flagChunkStart = 1 << iota flagChunkEnd flagParent flagRoot flagKeyedHash flagDeriveKeyContext flagDeriveKeyMaterial blockSize = 64 chunkSize = 1024 maxSIMD = 16 // AVX-512 vectors can store 16 words ) var iv = [8]uint32{ 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, } // A node represents a chunk or parent in the BLAKE3 Merkle tree. type node struct { cv [8]uint32 // chaining value from previous node block [16]uint32 counter uint64 blockLen uint32 flags uint32 } // parentNode returns a node that incorporates the chaining values of two child // nodes. func parentNode(left, right [8]uint32, key [8]uint32, flags uint32) node { n := node{ cv: key, counter: 0, // counter is reset for parents blockLen: blockSize, // block is full flags: flags | flagParent, } copy(n.block[:8], left[:]) copy(n.block[8:], right[:]) return n } // Hasher implements hash.Hash. type Hasher struct { key [8]uint32 flags uint32 size int // output size, for Sum // log(n) set of Merkle subtree roots, at most one per height. stack [50][8]uint32 // 2^50 * maxSIMD * chunkSize = 2^64 counter uint64 // number of buffers hashed; also serves as a bit vector indicating which stack elems are occupied buf [maxSIMD * chunkSize]byte buflen int } func (h *Hasher) hasSubtreeAtHeight(i int) bool { return h.counter&(1< 0 { if h.buflen == len(h.buf) { n := compressBuffer(&h.buf, h.buflen, &h.key, h.counter*maxSIMD, h.flags) h.pushSubtree(chainingValue(n)) h.buflen = 0 } n := copy(h.buf[h.buflen:], p) h.buflen += n p = p[n:] } return lenp, nil } // Sum implements hash.Hash. 
func (h *Hasher) Sum(b []byte) (sum []byte) { // We need to append h.Size() bytes to b. Reuse b's capacity if possible; // otherwise, allocate a new slice. if total := len(b) + h.Size(); cap(b) >= total { sum = b[:total] } else { sum = make([]byte, total) copy(sum, b) } // Read into the appended portion of sum. Use a low-latency-low-throughput // path for small digests (requiring a single compression), and a // high-latency-high-throughput path for large digests. if dst := sum[len(b):]; len(dst) <= 64 { var out [64]byte wordsToBytes(compressNode(h.rootNode()), &out) copy(dst, out[:]) } else { h.XOF().Read(dst) } return } // Reset implements hash.Hash. func (h *Hasher) Reset() { h.counter = 0 h.buflen = 0 } // BlockSize implements hash.Hash. func (h *Hasher) BlockSize() int { return 64 } // Size implements hash.Hash. func (h *Hasher) Size() int { return h.size } // XOF returns an OutputReader initialized with the current hash state. func (h *Hasher) XOF() *OutputReader { return &OutputReader{ n: h.rootNode(), } } func newHasher(key [8]uint32, flags uint32, size int) *Hasher { return &Hasher{ key: key, flags: flags, size: size, } } // New returns a Hasher for the specified size and key. If key is nil, the hash // is unkeyed. Otherwise, len(key) must be 32. func New(size int, key []byte) *Hasher { if key == nil { return newHasher(iv, 0, size) } var keyWords [8]uint32 for i := range keyWords { keyWords[i] = binary.LittleEndian.Uint32(key[i*4:]) } return newHasher(keyWords, flagKeyedHash, size) } // Sum256 and Sum512 always use the same hasher state, so we can save some time // when hashing small inputs by constructing the hasher ahead of time. var defaultHasher = New(0, nil) // Sum256 returns the unkeyed BLAKE3 hash of b, truncated to 256 bits. func Sum256(b []byte) (out [32]byte) { out512 := Sum512(b) copy(out[:], out512[:]) return } // Sum512 returns the unkeyed BLAKE3 hash of b, truncated to 512 bits. 
func Sum512(b []byte) (out [64]byte) { var n node if len(b) <= blockSize { hashBlock(&out, b) return } else if len(b) <= chunkSize { n = compressChunk(b, &iv, 0, 0) n.flags |= flagRoot } else { h := *defaultHasher h.Write(b) n = h.rootNode() } wordsToBytes(compressNode(n), &out) return } // DeriveKey derives a subkey from ctx and srcKey. ctx should be hardcoded, // globally unique, and application-specific. A good format for ctx strings is: // // [application] [commit timestamp] [purpose] // // e.g.: // // example.com 2019-12-25 16:18:03 session tokens v1 // // The purpose of these requirements is to ensure that an attacker cannot trick // two different applications into using the same context string. func DeriveKey(subKey []byte, ctx string, srcKey []byte) { // construct the derivation Hasher const derivationIVLen = 32 h := newHasher(iv, flagDeriveKeyContext, 32) h.Write([]byte(ctx)) derivationIV := h.Sum(make([]byte, 0, derivationIVLen)) var ivWords [8]uint32 for i := range ivWords { ivWords[i] = binary.LittleEndian.Uint32(derivationIV[i*4:]) } h = newHasher(ivWords, flagDeriveKeyMaterial, 0) // derive the subKey h.Write(srcKey) h.XOF().Read(subKey) } // An OutputReader produces an seekable stream of 2^64 - 1 pseudorandom output // bytes. type OutputReader struct { n node buf [maxSIMD * blockSize]byte off uint64 } // Read implements io.Reader. Callers may assume that Read returns len(p), nil // unless the read would extend beyond the end of the stream. func (or *OutputReader) Read(p []byte) (int, error) { if or.off == math.MaxUint64 { return 0, io.EOF } else if rem := math.MaxUint64 - or.off; uint64(len(p)) > rem { p = p[:rem] } lenp := len(p) for len(p) > 0 { if or.off%(maxSIMD*blockSize) == 0 { or.n.counter = or.off / blockSize compressBlocks(&or.buf, or.n) } n := copy(p, or.buf[or.off%(maxSIMD*blockSize):]) p = p[n:] or.off += uint64(n) } return lenp, nil } // Seek implements io.Seeker. 
func (or *OutputReader) Seek(offset int64, whence int) (int64, error) { off := or.off switch whence { case io.SeekStart: if offset < 0 { return 0, errors.New("seek position cannot be negative") } off = uint64(offset) case io.SeekCurrent: if offset < 0 { if uint64(-offset) > off { return 0, errors.New("seek position cannot be negative") } off -= uint64(-offset) } else { off += uint64(offset) } case io.SeekEnd: off = uint64(offset) - 1 default: panic("invalid whence") } or.off = off or.n.counter = uint64(off) / blockSize if or.off%(maxSIMD*blockSize) != 0 { compressBlocks(&or.buf, or.n) } // NOTE: or.off >= 2^63 will result in a negative return value. // Nothing we can do about this. return int64(or.off), nil } // ensure that Hasher implements hash.Hash var _ hash.Hash = (*Hasher)(nil) blake3-1.1.6/blake3_amd64.s000066400000000000000000003757051411544176700152330ustar00rootroot00000000000000// Code generated by command: go run gen.go -out blake3_amd64.s. DO NOT EDIT. #include "textflag.h" DATA iv<>+0(SB)/4, $0x6a09e667 DATA iv<>+4(SB)/4, $0xbb67ae85 DATA iv<>+8(SB)/4, $0x3c6ef372 DATA iv<>+12(SB)/4, $0xa54ff53a GLOBL iv<>(SB), RODATA|NOPTR, $16 DATA seq<>+0(SB)/4, $0x00000000 DATA seq<>+4(SB)/4, $0x00000001 DATA seq<>+8(SB)/4, $0x00000002 DATA seq<>+12(SB)/4, $0x00000003 DATA seq<>+16(SB)/4, $0x00000004 DATA seq<>+20(SB)/4, $0x00000005 DATA seq<>+24(SB)/4, $0x00000006 DATA seq<>+28(SB)/4, $0x00000007 DATA seq<>+32(SB)/4, $0x00000008 DATA seq<>+36(SB)/4, $0x00000009 DATA seq<>+40(SB)/4, $0x0000000a DATA seq<>+44(SB)/4, $0x0000000b DATA seq<>+48(SB)/4, $0x0000000c DATA seq<>+52(SB)/4, $0x0000000d DATA seq<>+56(SB)/4, $0x0000000e DATA seq<>+60(SB)/4, $0x0000000f GLOBL seq<>(SB), RODATA|NOPTR, $64 DATA seq64<>+0(SB)/8, $0x0000000000000000 DATA seq64<>+8(SB)/8, $0x0000000000000001 DATA seq64<>+16(SB)/8, $0x0000000000000002 DATA seq64<>+24(SB)/8, $0x0000000000000003 DATA seq64<>+32(SB)/8, $0x0000000000000004 DATA seq64<>+40(SB)/8, $0x0000000000000005 DATA seq64<>+48(SB)/8, 
$0x0000000000000006 DATA seq64<>+56(SB)/8, $0x0000000000000007 GLOBL seq64<>(SB), RODATA|NOPTR, $64 DATA shuffle_rot8<>+0(SB)/4, $0x00030201 DATA shuffle_rot8<>+4(SB)/4, $0x04070605 DATA shuffle_rot8<>+8(SB)/4, $0x080b0a09 DATA shuffle_rot8<>+12(SB)/4, $0x0c0f0e0d DATA shuffle_rot8<>+16(SB)/4, $0x10131211 DATA shuffle_rot8<>+20(SB)/4, $0x14171615 DATA shuffle_rot8<>+24(SB)/4, $0x181b1a19 DATA shuffle_rot8<>+28(SB)/4, $0x1c1f1e1d GLOBL shuffle_rot8<>(SB), RODATA|NOPTR, $32 DATA shuffle_rot16<>+0(SB)/4, $0x01000302 DATA shuffle_rot16<>+4(SB)/4, $0x05040706 DATA shuffle_rot16<>+8(SB)/4, $0x09080b0a DATA shuffle_rot16<>+12(SB)/4, $0x0d0c0f0e DATA shuffle_rot16<>+16(SB)/4, $0x11101312 DATA shuffle_rot16<>+20(SB)/4, $0x15141716 DATA shuffle_rot16<>+24(SB)/4, $0x19181b1a DATA shuffle_rot16<>+28(SB)/4, $0x1d1c1f1e GLOBL shuffle_rot16<>(SB), RODATA|NOPTR, $32 // func compressBlocksAVX512(out *[1024]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32) // Requires: AVX512F TEXT ·compressBlocksAVX512(SB), NOSPLIT, $0-40 MOVQ out+0(FP), AX MOVQ block+8(FP), CX MOVQ cv+16(FP), DX // Initialize block vectors VPBROADCASTD (CX), Z1 VPBROADCASTD 4(CX), Z3 VPBROADCASTD 8(CX), Z5 VPBROADCASTD 12(CX), Z7 VPBROADCASTD 16(CX), Z9 VPBROADCASTD 20(CX), Z11 VPBROADCASTD 24(CX), Z13 VPBROADCASTD 28(CX), Z15 VPBROADCASTD 32(CX), Z17 VPBROADCASTD 36(CX), Z19 VPBROADCASTD 40(CX), Z21 VPBROADCASTD 44(CX), Z23 VPBROADCASTD 48(CX), Z25 VPBROADCASTD 52(CX), Z27 VPBROADCASTD 56(CX), Z29 VPBROADCASTD 60(CX), Z31 // Initialize state vectors VPBROADCASTD (DX), Z0 VPBROADCASTD 4(DX), Z2 VPBROADCASTD 8(DX), Z4 VPBROADCASTD 12(DX), Z6 VPBROADCASTD 16(DX), Z8 VPBROADCASTD 20(DX), Z10 VPBROADCASTD 24(DX), Z12 VPBROADCASTD 28(DX), Z14 VPBROADCASTD iv<>+0(SB), Z16 VPBROADCASTD iv<>+4(SB), Z18 VPBROADCASTD iv<>+8(SB), Z20 VPBROADCASTD iv<>+12(SB), Z22 VPBROADCASTD counter+24(FP), Z24 VPADDD seq<>+0(SB), Z24, Z24 VPCMPUD $0x01, seq<>+0(SB), Z24, K1 VPBROADCASTD counter+28(FP), 
Z26 VPADDD.BCST seq<>+4(SB), Z26, K1, Z26 VPBROADCASTD blockLen+32(FP), Z28 VPBROADCASTD flags+36(FP), Z30 // Round 1 VPADDD Z0, Z8, Z0 VPADDD Z1, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z0, Z8, Z0 VPADDD Z3, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x07, Z8, Z8 VPADDD Z2, Z10, Z2 VPADDD Z5, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z2, Z10, Z2 VPADDD Z7, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z4, Z12, Z4 VPADDD Z9, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z4, Z12, Z4 VPADDD Z11, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z6, Z14, Z6 VPADDD Z13, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z6, Z14, Z6 VPADDD Z15, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z0, Z10, Z0 VPADDD Z17, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z0, Z10, Z0 VPADDD Z19, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z2, Z12, Z2 VPADDD Z21, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z2, Z12, Z2 VPADDD Z23, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z4, Z14, Z4 VPADDD Z25, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x0c, Z14, 
Z14 VPADDD Z4, Z14, Z4 VPADDD Z27, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z6, Z8, Z6 VPADDD Z29, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z6, Z8, Z6 VPADDD Z31, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x07, Z8, Z8 // Round 2 VPADDD Z0, Z8, Z0 VPADDD Z5, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z0, Z8, Z0 VPADDD Z13, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x07, Z8, Z8 VPADDD Z2, Z10, Z2 VPADDD Z7, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z2, Z10, Z2 VPADDD Z21, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z4, Z12, Z4 VPADDD Z15, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z4, Z12, Z4 VPADDD Z1, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z6, Z14, Z6 VPADDD Z9, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z6, Z14, Z6 VPADDD Z27, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z0, Z10, Z0 VPADDD Z3, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z0, Z10, Z0 VPADDD Z23, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z2, Z12, Z2 VPADDD Z25, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD 
Z12, Z22, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z2, Z12, Z2 VPADDD Z11, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z4, Z14, Z4 VPADDD Z19, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z4, Z14, Z4 VPADDD Z29, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z6, Z8, Z6 VPADDD Z31, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z6, Z8, Z6 VPADDD Z17, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x07, Z8, Z8 // Round 3 VPADDD Z0, Z8, Z0 VPADDD Z7, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z0, Z8, Z0 VPADDD Z9, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x07, Z8, Z8 VPADDD Z2, Z10, Z2 VPADDD Z21, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z2, Z10, Z2 VPADDD Z25, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z4, Z12, Z4 VPADDD Z27, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z4, Z12, Z4 VPADDD Z5, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z6, Z14, Z6 VPADDD Z15, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z6, Z14, Z6 VPADDD Z29, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z0, Z10, Z0 VPADDD Z13, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x10, Z30, 
Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z0, Z10, Z0 VPADDD Z11, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z2, Z12, Z2 VPADDD Z19, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z2, Z12, Z2 VPADDD Z1, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z4, Z14, Z4 VPADDD Z23, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z4, Z14, Z4 VPADDD Z31, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z6, Z8, Z6 VPADDD Z17, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z6, Z8, Z6 VPADDD Z3, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x07, Z8, Z8 // Round 4 VPADDD Z0, Z8, Z0 VPADDD Z21, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z0, Z8, Z0 VPADDD Z15, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x07, Z8, Z8 VPADDD Z2, Z10, Z2 VPADDD Z25, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z2, Z10, Z2 VPADDD Z19, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z4, Z12, Z4 VPADDD Z29, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z4, Z12, Z4 VPADDD Z7, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z6, Z14, Z6 VPADDD Z27, Z6, Z6 VPXORD 
Z30, Z6, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z6, Z14, Z6 VPADDD Z31, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z0, Z10, Z0 VPADDD Z9, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z0, Z10, Z0 VPADDD Z1, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z2, Z12, Z2 VPADDD Z23, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z2, Z12, Z2 VPADDD Z5, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z4, Z14, Z4 VPADDD Z11, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z4, Z14, Z4 VPADDD Z17, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z6, Z8, Z6 VPADDD Z3, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z6, Z8, Z6 VPADDD Z13, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x07, Z8, Z8 // Round 5 VPADDD Z0, Z8, Z0 VPADDD Z25, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z0, Z8, Z0 VPADDD Z27, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x07, Z8, Z8 VPADDD Z2, Z10, Z2 VPADDD Z19, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z2, Z10, Z2 VPADDD Z23, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z4, 
Z12, Z4 VPADDD Z31, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z4, Z12, Z4 VPADDD Z21, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z6, Z14, Z6 VPADDD Z29, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z6, Z14, Z6 VPADDD Z17, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z0, Z10, Z0 VPADDD Z15, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z0, Z10, Z0 VPADDD Z5, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z2, Z12, Z2 VPADDD Z11, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z2, Z12, Z2 VPADDD Z7, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z4, Z14, Z4 VPADDD Z1, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z4, Z14, Z4 VPADDD Z3, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z6, Z8, Z6 VPADDD Z13, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z6, Z8, Z6 VPADDD Z9, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x07, Z8, Z8 // Round 6 VPADDD Z0, Z8, Z0 VPADDD Z19, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z0, Z8, Z0 VPADDD Z29, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 
VPRORD $0x07, Z8, Z8 VPADDD Z2, Z10, Z2 VPADDD Z23, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z2, Z10, Z2 VPADDD Z11, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z4, Z12, Z4 VPADDD Z17, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z4, Z12, Z4 VPADDD Z25, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z6, Z14, Z6 VPADDD Z31, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z6, Z14, Z6 VPADDD Z3, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z0, Z10, Z0 VPADDD Z27, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z0, Z10, Z0 VPADDD Z7, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z2, Z12, Z2 VPADDD Z1, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z2, Z12, Z2 VPADDD Z21, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z4, Z14, Z4 VPADDD Z5, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z4, Z14, Z4 VPADDD Z13, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z6, Z8, Z6 VPADDD Z9, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z6, Z8, Z6 VPADDD Z15, Z6, Z6 VPXORD Z28, Z6, Z28 VPRORD $0x08, Z28, Z28 VPADDD 
Z18, Z28, Z18 VPXORD Z8, Z18, Z8 VPRORD $0x07, Z8, Z8 // Round 7 VPADDD Z0, Z8, Z0 VPADDD Z23, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x0c, Z8, Z8 VPADDD Z0, Z8, Z0 VPADDD Z31, Z0, Z0 VPXORD Z24, Z0, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z16, Z24, Z16 VPXORD Z8, Z16, Z8 VPRORD $0x07, Z8, Z8 VPADDD Z2, Z10, Z2 VPADDD Z11, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z2, Z10, Z2 VPADDD Z1, Z2, Z2 VPXORD Z26, Z2, Z26 VPRORD $0x08, Z26, Z26 VPADDD Z18, Z26, Z18 VPXORD Z10, Z18, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z4, Z12, Z4 VPADDD Z3, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x10, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z4, Z12, Z4 VPADDD Z19, Z4, Z4 VPXORD Z28, Z4, Z28 VPRORD $0x08, Z28, Z28 VPADDD Z20, Z28, Z20 VPXORD Z12, Z20, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z6, Z14, Z6 VPADDD Z17, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z6, Z14, Z6 VPADDD Z13, Z6, Z6 VPXORD Z30, Z6, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z22, Z30, Z22 VPXORD Z14, Z22, Z14 VPRORD $0x07, Z14, Z14 VPADDD Z0, Z10, Z0 VPADDD Z29, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x10, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x0c, Z10, Z10 VPADDD Z0, Z10, Z0 VPADDD Z21, Z0, Z0 VPXORD Z30, Z0, Z30 VPRORD $0x08, Z30, Z30 VPADDD Z20, Z30, Z20 VPXORD Z10, Z20, Z10 VPRORD $0x07, Z10, Z10 VPADDD Z2, Z12, Z2 VPADDD Z5, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x10, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x0c, Z12, Z12 VPADDD Z2, Z12, Z2 VPADDD Z25, Z2, Z2 VPXORD Z24, Z2, Z24 VPRORD $0x08, Z24, Z24 VPADDD Z22, Z24, Z22 VPXORD Z12, Z22, Z12 VPRORD $0x07, Z12, Z12 VPADDD Z4, Z14, Z4 VPADDD Z7, Z4, Z4 VPXORD Z26, Z4, Z26 VPRORD $0x10, Z26, Z26 VPADDD Z16, Z26, Z16 VPXORD Z14, Z16, Z14 VPRORD $0x0c, Z14, Z14 VPADDD Z4, Z14, Z4 VPADDD Z9, Z4, Z4 VPXORD Z26, 
Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14

	VPADDD Z6, Z8, Z6
	VPADDD Z15, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z27, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Finalize CVs
	VPXORD Z0, Z16, Z0
	VPXORD Z2, Z18, Z2
	VPXORD Z4, Z20, Z4
	VPXORD Z6, Z22, Z6
	VPXORD Z8, Z24, Z8
	VPXORD Z10, Z26, Z10
	VPXORD Z12, Z28, Z12
	VPXORD Z14, Z30, Z14
	VPXORD.BCST (DX), Z16, Z16
	VPXORD.BCST 4(DX), Z18, Z18
	VPXORD.BCST 8(DX), Z20, Z20
	VPXORD.BCST 12(DX), Z22, Z22
	VPXORD.BCST 16(DX), Z24, Z24
	VPXORD.BCST 20(DX), Z26, Z26
	VPXORD.BCST 24(DX), Z28, Z28
	VPXORD.BCST 28(DX), Z30, Z30
	VMOVDQU32 seq<>+0(SB), Z1
	VPSLLD $0x06, Z1, Z1
	KXNORD K1, K1, K1
	VPSCATTERDD Z0, K1, (AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z2, K1, 4(AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z4, K1, 8(AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z6, K1, 12(AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z8, K1, 16(AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z10, K1, 20(AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z12, K1, 24(AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z14, K1, 28(AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z16, K1, 32(AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z18, K1, 36(AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z20, K1, 40(AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z22, K1, 44(AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z24, K1, 48(AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z26, K1, 52(AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z28, K1, 56(AX)(Z1*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z30, K1, 60(AX)(Z1*1)
	RET

// func compressChunksAVX512(cvs *[16][8]uint32, buf *[16384]byte, key *[8]uint32, counter uint64, flags uint32)
// Requires: AVX512F
//
// compressChunksAVX512 runs 16 compressions side by side, one per 32-bit lane
// of the ZMM registers. buf is read as 16 regions spaced 1024 bytes apart (the
// gather indices are seq<> shifted left by 10), each region is consumed as 16
// blocks of 64 bytes, and the resulting 8-word chaining values are scattered
// to cvs at a 32-byte stride.
// NOTE(review): seq<> and iv<> are defined outside this routine; the layout
// described here assumes seq<> holds the lane indices 0..15 - confirm.
//
// Arguments:
//   cvs     - output: 16 chaining values of 8 words each
//   buf     - input: 16384 bytes
//   key     - 8 words broadcast into the starting chaining value of every lane
//   counter - 64-bit base counter; lane i uses counter + seq<>[i]
//   flags   - flag word applied to every block; 0x01 is additionally set for
//             the first block and 0x02 for the last block of each lane
//             (presumably BLAKE3's CHUNK_START / CHUNK_END - confirm)
//
// Register use:
//   Z0, Z2, ..., Z14   state words 0-7 (the chaining value, carried across blocks)
//   Z16, Z18, ..., Z30 state words 8-15 (rebuilt for every block)
//   Z1, Z3, ..., Z31   the 16 message words of the current block
//   AX = cvs, CX = read cursor into buf, DX = key pointer, later the block index
//
// Stack frame (192 bytes):
//   0(SP)   per-lane low 32 bits of the counter
//   64(SP)  per-lane high 32 bits of the counter
//   128(SP) 16 flag words, one per block index
//
// Each round applies the same 14-instruction add/xor/rotate group (rotations
// by 16, 12, 8 and 7 bits) eight times: first to the four columns of the 4x4
// state (words i, i+4, i+8, i+12), then to its four diagonals. Only the pair
// of message registers fed to each group changes from round to round.
TEXT ·compressChunksAVX512(SB), NOSPLIT, $192-40
	MOVQ cvs+0(FP), AX
	MOVQ buf+8(FP), CX
	MOVQ key+16(FP), DX

	// Initialize counter
	// Add seq<> to the low counter word in every lane. K1 marks the lanes whose
	// sum compares unsigned-less-than seq<> (predicate $0x01), which happens
	// exactly when the 32-bit addition wrapped; only those lanes get the dword
	// at seq<>+4 added to the high counter word.
	VPBROADCASTD counter+24(FP), Z0
	VPADDD seq<>+0(SB), Z0, Z0
	VPCMPUD $0x01, seq<>+0(SB), Z0, K1
	VPBROADCASTD counter+28(FP), Z2
	VPADDD.BCST seq<>+4(SB), Z2, K1, Z2
	VMOVDQU32 Z0, (SP)
	VMOVDQU32 Z2, 64(SP)

	// Initialize flags
	// Fill all 16 per-block flag slots with flags, then OR 0x01 into slot 0
	// (128(SP)) and 0x02 into slot 15 (188(SP) = 128 + 15*4).
	VPBROADCASTD flags+32(FP), Z0
	VMOVDQU32 Z0, 128(SP)
	ORL $0x01, 128(SP)
	ORL $0x02, 188(SP)

	// Load key
	// Every lane starts from the same 8 key words.
	VPBROADCASTD (DX), Z0
	VPBROADCASTD 4(DX), Z2
	VPBROADCASTD 8(DX), Z4
	VPBROADCASTD 12(DX), Z6
	VPBROADCASTD 16(DX), Z8
	VPBROADCASTD 20(DX), Z10
	VPBROADCASTD 24(DX), Z12
	VPBROADCASTD 28(DX), Z14

	// Loop index
	// The key pointer is no longer needed, so DX is reused as the block index.
	XORQ DX, DX

loop:
	// Load transposed block
	// Z16 = seq<> << 10 gives per-lane byte offsets 1024 apart. Each gather
	// pulls one message word (displacement 0, 4, ..., 60) from all lanes at
	// once. K1 is reset to all ones before every gather because the gather
	// clears mask bits as it completes elements.
	VMOVDQU32 seq<>+0(SB), Z16
	VPSLLD $0x0a, Z16, Z16
	KXNORD K1, K1, K1
	VPGATHERDD (CX)(Z16*1), K1, Z1
	KXNORD K1, K1, K1
	VPGATHERDD 4(CX)(Z16*1), K1, Z3
	KXNORD K1, K1, K1
	VPGATHERDD 8(CX)(Z16*1), K1, Z5
	KXNORD K1, K1, K1
	VPGATHERDD 12(CX)(Z16*1), K1, Z7
	KXNORD K1, K1, K1
	VPGATHERDD 16(CX)(Z16*1), K1, Z9
	KXNORD K1, K1, K1
	VPGATHERDD 20(CX)(Z16*1), K1, Z11
	KXNORD K1, K1, K1
	VPGATHERDD 24(CX)(Z16*1), K1, Z13
	KXNORD K1, K1, K1
	VPGATHERDD 28(CX)(Z16*1), K1, Z15
	KXNORD K1, K1, K1
	VPGATHERDD 32(CX)(Z16*1), K1, Z17
	KXNORD K1, K1, K1
	VPGATHERDD 36(CX)(Z16*1), K1, Z19
	KXNORD K1, K1, K1
	VPGATHERDD 40(CX)(Z16*1), K1, Z21
	KXNORD K1, K1, K1
	VPGATHERDD 44(CX)(Z16*1), K1, Z23
	KXNORD K1, K1, K1
	VPGATHERDD 48(CX)(Z16*1), K1, Z25
	KXNORD K1, K1, K1
	VPGATHERDD 52(CX)(Z16*1), K1, Z27
	KXNORD K1, K1, K1
	VPGATHERDD 56(CX)(Z16*1), K1, Z29
	KXNORD K1, K1, K1
	VPGATHERDD 60(CX)(Z16*1), K1, Z31
	// Advance the read cursor to the next 64-byte block.
	ADDQ $0x40, CX

	// Reload state vectors (other than CVs)
	// Words 8-11 come from iv<>, words 12-13 are the counter halves saved on
	// the stack, word 14 is the dword at seq<>+4 shifted left by 6 (64, the
	// block length, if that dword is 1 - confirm), and word 15 is the flag
	// word for block index DX.
	VPBROADCASTD iv<>+0(SB), Z16
	VPBROADCASTD iv<>+4(SB), Z18
	VPBROADCASTD iv<>+8(SB), Z20
	VPBROADCASTD iv<>+12(SB), Z22
	VMOVDQU32 (SP), Z24
	VMOVDQU32 64(SP), Z26
	VPBROADCASTD seq<>+4(SB), Z28
	VPSLLD $0x06, Z28, Z28
	VPBROADCASTD 128(SP)(DX*4), Z30

	// Round 1
	VPADDD Z0, Z8, Z0
	VPADDD Z1, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z0, Z8, Z0
	VPADDD Z3, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x07, Z8, Z8

	VPADDD Z2, Z10, Z2
	VPADDD Z5, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z2, Z10, Z2
	VPADDD Z7, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x07, Z10, Z10

	VPADDD Z4, Z12, Z4
	VPADDD Z9, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z4, Z12, Z4
	VPADDD Z11, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x07, Z12, Z12

	VPADDD Z6, Z14, Z6
	VPADDD Z13, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z6, Z14, Z6
	VPADDD Z15, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x07, Z14, Z14

	VPADDD Z0, Z10, Z0
	VPADDD Z17, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z19, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10

	VPADDD Z2, Z12, Z2
	VPADDD Z21, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z23, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12

	VPADDD Z4, Z14, Z4
	VPADDD Z25, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z27, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14

	VPADDD Z6, Z8, Z6
	VPADDD Z29, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z31, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Round 2
	VPADDD Z0, Z8, Z0
	VPADDD Z5, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z0, Z8, Z0
	VPADDD Z13, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x07, Z8, Z8

	VPADDD Z2, Z10, Z2
	VPADDD Z7, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z2, Z10, Z2
	VPADDD Z21, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x07, Z10, Z10

	VPADDD Z4, Z12, Z4
	VPADDD Z15, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z4, Z12, Z4
	VPADDD Z1, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x07, Z12, Z12

	VPADDD Z6, Z14, Z6
	VPADDD Z9, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z6, Z14, Z6
	VPADDD Z27, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x07, Z14, Z14

	VPADDD Z0, Z10, Z0
	VPADDD Z3, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z23, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10

	VPADDD Z2, Z12, Z2
	VPADDD Z25, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z11, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12

	VPADDD Z4, Z14, Z4
	VPADDD Z19, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z29, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14

	VPADDD Z6, Z8, Z6
	VPADDD Z31, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z17, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Round 3
	VPADDD Z0, Z8, Z0
	VPADDD Z7, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z0, Z8, Z0
	VPADDD Z9, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x07, Z8, Z8

	VPADDD Z2, Z10, Z2
	VPADDD Z21, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z2, Z10, Z2
	VPADDD Z25, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x07, Z10, Z10

	VPADDD Z4, Z12, Z4
	VPADDD Z27, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z4, Z12, Z4
	VPADDD Z5, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x07, Z12, Z12

	VPADDD Z6, Z14, Z6
	VPADDD Z15, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z6, Z14, Z6
	VPADDD Z29, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x07, Z14, Z14

	VPADDD Z0, Z10, Z0
	VPADDD Z13, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z11, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10

	VPADDD Z2, Z12, Z2
	VPADDD Z19, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z1, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12

	VPADDD Z4, Z14, Z4
	VPADDD Z23, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z31, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14

	VPADDD Z6, Z8, Z6
	VPADDD Z17, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z3, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Round 4
	VPADDD Z0, Z8, Z0
	VPADDD Z21, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z0, Z8, Z0
	VPADDD Z15, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x07, Z8, Z8

	VPADDD Z2, Z10, Z2
	VPADDD Z25, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z2, Z10, Z2
	VPADDD Z19, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x07, Z10, Z10

	VPADDD Z4, Z12, Z4
	VPADDD Z29, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z4, Z12, Z4
	VPADDD Z7, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x07, Z12, Z12

	VPADDD Z6, Z14, Z6
	VPADDD Z27, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z6, Z14, Z6
	VPADDD Z31, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x07, Z14, Z14

	VPADDD Z0, Z10, Z0
	VPADDD Z9, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z1, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10

	VPADDD Z2, Z12, Z2
	VPADDD Z23, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z5, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12

	VPADDD Z4, Z14, Z4
	VPADDD Z11, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z17, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14

	VPADDD Z6, Z8, Z6
	VPADDD Z3, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z13, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Round 5
	VPADDD Z0, Z8, Z0
	VPADDD Z25, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z0, Z8, Z0
	VPADDD Z27, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x07, Z8, Z8

	VPADDD Z2, Z10, Z2
	VPADDD Z19, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z2, Z10, Z2
	VPADDD Z23, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x07, Z10, Z10

	VPADDD Z4, Z12, Z4
	VPADDD Z31, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z4, Z12, Z4
	VPADDD Z21, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x07, Z12, Z12

	VPADDD Z6, Z14, Z6
	VPADDD Z29, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z6, Z14, Z6
	VPADDD Z17, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x07, Z14, Z14

	VPADDD Z0, Z10, Z0
	VPADDD Z15, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z5, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10

	VPADDD Z2, Z12, Z2
	VPADDD Z11, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z7, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12

	VPADDD Z4, Z14, Z4
	VPADDD Z1, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z3, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14

	VPADDD Z6, Z8, Z6
	VPADDD Z13, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z9, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Round 6
	VPADDD Z0, Z8, Z0
	VPADDD Z19, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z0, Z8, Z0
	VPADDD Z29, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x07, Z8, Z8

	VPADDD Z2, Z10, Z2
	VPADDD Z23, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z2, Z10, Z2
	VPADDD Z11, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x07, Z10, Z10

	VPADDD Z4, Z12, Z4
	VPADDD Z17, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z4, Z12, Z4
	VPADDD Z25, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x07, Z12, Z12

	VPADDD Z6, Z14, Z6
	VPADDD Z31, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z6, Z14, Z6
	VPADDD Z3, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x07, Z14, Z14

	VPADDD Z0, Z10, Z0
	VPADDD Z27, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z7, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10

	VPADDD Z2, Z12, Z2
	VPADDD Z1, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z21, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12

	VPADDD Z4, Z14, Z4
	VPADDD Z5, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z13, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14

	VPADDD Z6, Z8, Z6
	VPADDD Z9, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z15, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Round 7
	VPADDD Z0, Z8, Z0
	VPADDD Z23, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z0, Z8, Z0
	VPADDD Z31, Z0, Z0
	VPXORD Z24, Z0, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z16, Z24, Z16
	VPXORD Z8, Z16, Z8
	VPRORD $0x07, Z8, Z8

	VPADDD Z2, Z10, Z2
	VPADDD Z11, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z2, Z10, Z2
	VPADDD Z1, Z2, Z2
	VPXORD Z26, Z2, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z18, Z26, Z18
	VPXORD Z10, Z18, Z10
	VPRORD $0x07, Z10, Z10

	VPADDD Z4, Z12, Z4
	VPADDD Z3, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z4, Z12, Z4
	VPADDD Z19, Z4, Z4
	VPXORD Z28, Z4, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z20, Z28, Z20
	VPXORD Z12, Z20, Z12
	VPRORD $0x07, Z12, Z12

	VPADDD Z6, Z14, Z6
	VPADDD Z17, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z6, Z14, Z6
	VPADDD Z13, Z6, Z6
	VPXORD Z30, Z6, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z22, Z30, Z22
	VPXORD Z14, Z22, Z14
	VPRORD $0x07, Z14, Z14

	VPADDD Z0, Z10, Z0
	VPADDD Z29, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x10, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x0c, Z10, Z10
	VPADDD Z0, Z10, Z0
	VPADDD Z21, Z0, Z0
	VPXORD Z30, Z0, Z30
	VPRORD $0x08, Z30, Z30
	VPADDD Z20, Z30, Z20
	VPXORD Z10, Z20, Z10
	VPRORD $0x07, Z10, Z10

	VPADDD Z2, Z12, Z2
	VPADDD Z5, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x10, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x0c, Z12, Z12
	VPADDD Z2, Z12, Z2
	VPADDD Z25, Z2, Z2
	VPXORD Z24, Z2, Z24
	VPRORD $0x08, Z24, Z24
	VPADDD Z22, Z24, Z22
	VPXORD Z12, Z22, Z12
	VPRORD $0x07, Z12, Z12

	VPADDD Z4, Z14, Z4
	VPADDD Z7, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x10, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x0c, Z14, Z14
	VPADDD Z4, Z14, Z4
	VPADDD Z9, Z4, Z4
	VPXORD Z26, Z4, Z26
	VPRORD $0x08, Z26, Z26
	VPADDD Z16, Z26, Z16
	VPXORD Z14, Z16, Z14
	VPRORD $0x07, Z14, Z14

	VPADDD Z6, Z8, Z6
	VPADDD Z15, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x10, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x0c, Z8, Z8
	VPADDD Z6, Z8, Z6
	VPADDD Z27, Z6, Z6
	VPXORD Z28, Z6, Z28
	VPRORD $0x08, Z28, Z28
	VPADDD Z18, Z28, Z18
	VPXORD Z8, Z18, Z8
	VPRORD $0x07, Z8, Z8

	// Finalize CVs
	// The next chaining value is state word i XOR state word i+8; it stays in
	// Z0, Z2, ..., Z14 as the input to the following block.
	VPXORD Z0, Z16, Z0
	VPXORD Z2, Z18, Z2
	VPXORD Z4, Z20, Z4
	VPXORD Z6, Z22, Z6
	VPXORD Z8, Z24, Z8
	VPXORD Z10, Z26, Z10
	VPXORD Z12, Z28, Z12
	VPXORD Z14, Z30, Z14

	// Loop
	// Repeat until all 16 blocks (0x10) have been processed.
	INCQ DX
	CMPQ DX, $0x00000010
	JNE loop

	// Finished; transpose CVs
	// Z16 = seq<> << 5 gives per-lane byte offsets 32 apart, the size of one
	// [8]uint32, so each scatter writes one chaining-value word for all lanes.
	// K1 is reset before every scatter for the same reason as for the gathers.
	VMOVDQU32 seq<>+0(SB), Z16
	VPSLLD $0x05, Z16, Z16
	KXNORD K1, K1, K1
	VPSCATTERDD Z0, K1, (AX)(Z16*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z2, K1, 4(AX)(Z16*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z4, K1, 8(AX)(Z16*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z6, K1, 12(AX)(Z16*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z8, K1, 16(AX)(Z16*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z10, K1, 20(AX)(Z16*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z12, K1, 24(AX)(Z16*1)
	KXNORD K1, K1, K1
	VPSCATTERDD Z14, K1, 28(AX)(Z16*1)
	RET

// func compressBlocksAVX2(out *[512]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)
// Requires: AVX, AVX2
TEXT ·compressBlocksAVX2(SB), NOSPLIT, $544-40
	MOVQ out+0(FP), AX
	MOVQ block+8(FP), CX
	MOVQ cv+16(FP), DX

	// Load block
	VPBROADCASTD (CX), Y0
	VMOVDQU Y0, (SP)
	VPBROADCASTD 4(CX), Y0
	VMOVDQU Y0, 32(SP)
	VPBROADCASTD 8(CX), Y0
	VMOVDQU Y0, 64(SP)
	VPBROADCASTD 12(CX), Y0
	VMOVDQU Y0, 96(SP)
	VPBROADCASTD 16(CX), Y0
	VMOVDQU Y0, 128(SP)
	VPBROADCASTD 20(CX), Y0
	VMOVDQU Y0, 160(SP)
	VPBROADCASTD 24(CX), Y0
	VMOVDQU Y0, 192(SP)
	VPBROADCASTD 28(CX), Y0
	VMOVDQU Y0, 224(SP)
	VPBROADCASTD 32(CX), Y0
	VMOVDQU Y0, 256(SP)
	VPBROADCASTD 36(CX), Y0
	VMOVDQU Y0, 288(SP)
	VPBROADCASTD 40(CX), Y0
	VMOVDQU Y0, 320(SP)
	VPBROADCASTD 44(CX), Y0
	VMOVDQU Y0, 352(SP)
	VPBROADCASTD 48(CX), Y0
	VMOVDQU Y0, 384(SP)
	VPBROADCASTD 52(CX), Y0
	VMOVDQU Y0, 416(SP)
	VPBROADCASTD 56(CX), Y0
	VMOVDQU Y0, 448(SP)
	VPBROADCASTD 60(CX), Y0
	VMOVDQU Y0, 480(SP)

	// Initialize state vectors
	VPBROADCASTD (DX), Y0
	VPBROADCASTD 4(DX), Y1
	VPBROADCASTD 8(DX), Y2
	VPBROADCASTD 12(DX), Y3
	VPBROADCASTD 16(DX), Y4
	VPBROADCASTD 20(DX), Y5
	VPBROADCASTD 24(DX), Y6
	VPBROADCASTD 28(DX), Y7
	VPBROADCASTD iv<>+0(SB), Y8
	VPBROADCASTD iv<>+4(SB), Y9
	VPBROADCASTD iv<>+8(SB), Y10
	VPBROADCASTD iv<>+12(SB), Y11
	VPBROADCASTQ counter+24(FP), Y12
	VPBROADCASTQ
counter+24(FP), Y13 VPADDQ seq64<>+0(SB), Y12, Y12 VPADDQ seq64<>+32(SB), Y13, Y13 VPUNPCKLDQ Y13, Y12, Y14 VPUNPCKHDQ Y13, Y12, Y15 VPUNPCKLDQ Y15, Y14, Y12 VPUNPCKHDQ Y15, Y14, Y13 VPERMQ $0xd8, Y12, Y12 VPERMQ $0xd8, Y13, Y13 VPBROADCASTD blockLen+32(FP), Y14 VPBROADCASTD flags+36(FP), Y15 VMOVDQU Y8, 512(SP) // Round 1 VPADDD Y0, Y4, Y0 VPADDD (SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 32(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 64(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 96(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 128(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 160(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 256(SP), Y0, Y0 
VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 288(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 384(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 416(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 448(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 2 VPADDD Y0, Y4, Y0 VPADDD 64(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 192(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 
VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 96(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 224(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD (SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 128(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 416(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 32(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 352(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 
VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 288(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 448(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 3 VPADDD Y0, Y4, Y0 VPADDD 96(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 128(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 416(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 64(SP), Y2, Y2 VPXOR 
Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 448(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 192(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 160(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 288(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD (SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 352(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 480(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, 
Y4, Y3 VPADDD 32(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 4 VPADDD Y0, Y4, Y0 VPADDD 320(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 224(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 288(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 448(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 96(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 416(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 128(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD 
$0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD (SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 64(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 160(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 256(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 32(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 5 VPADDD Y0, Y4, Y0 VPADDD 384(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 416(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 288(SP), Y1, Y1 VPXOR Y13, 
Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 480(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 320(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 448(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 224(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 64(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 96(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD (SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, 
Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 32(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 128(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 6 VPADDD Y0, Y4, Y0 VPADDD 288(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 448(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 256(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 384(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 
VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 32(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 416(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 96(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD (SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 64(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 192(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 128(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, 
Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 7 VPADDD Y0, Y4, Y0 VPADDD 352(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 480(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD (SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 32(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 288(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 448(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 320(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, 
Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 64(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 96(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 128(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 416(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VMOVDQU 512(SP), Y8 // Finalize CVs VMOVDQU Y8, 256(SP) VMOVDQU Y9, 288(SP) VMOVDQU Y10, 320(SP) VMOVDQU Y11, 352(SP) VMOVDQU Y12, 384(SP) VMOVDQU Y13, 416(SP) VMOVDQU Y14, 448(SP) VMOVDQU Y15, 480(SP) VPXOR Y0, Y8, Y0 VPXOR Y1, Y9, Y1 VPXOR Y2, Y10, Y2 VPXOR Y3, Y11, Y3 VPXOR Y4, Y12, Y4 VPXOR Y5, Y13, Y5 VPXOR Y6, Y14, Y6 VPXOR Y7, Y15, Y7 VPUNPCKLDQ Y1, Y0, Y8 VPUNPCKHDQ Y1, Y0, Y9 VPUNPCKLDQ Y3, Y2, Y10 VPUNPCKHDQ Y3, Y2, Y11 VPUNPCKLDQ Y5, Y4, Y12 VPUNPCKHDQ Y5, Y4, Y13 VPUNPCKLDQ Y7, Y6, Y14 VPUNPCKHDQ Y7, Y6, Y15 VPUNPCKLQDQ Y10, Y8, Y0 VPUNPCKHQDQ Y10, Y8, Y1 VPUNPCKLQDQ Y11, Y9, Y2 VPUNPCKHQDQ Y11, Y9, Y3 VPUNPCKLQDQ Y14, Y12, Y4 VPUNPCKHQDQ Y14, Y12, 
Y5 VPUNPCKLQDQ Y15, Y13, Y6 VPUNPCKHQDQ Y15, Y13, Y7 VPERM2I128 $0x20, Y4, Y0, Y8 VPERM2I128 $0x31, Y4, Y0, Y12 VPERM2I128 $0x20, Y5, Y1, Y9 VPERM2I128 $0x31, Y5, Y1, Y13 VPERM2I128 $0x20, Y6, Y2, Y10 VPERM2I128 $0x31, Y6, Y2, Y14 VPERM2I128 $0x20, Y7, Y3, Y11 VPERM2I128 $0x31, Y7, Y3, Y15 VMOVDQU Y8, (AX) VMOVDQU Y9, 64(AX) VMOVDQU Y10, 128(AX) VMOVDQU Y11, 192(AX) VMOVDQU Y12, 256(AX) VMOVDQU Y13, 320(AX) VMOVDQU Y14, 384(AX) VMOVDQU Y15, 448(AX) VMOVDQU 256(SP), Y8 VMOVDQU 288(SP), Y9 VMOVDQU 320(SP), Y10 VMOVDQU 352(SP), Y11 VMOVDQU 384(SP), Y12 VMOVDQU 416(SP), Y13 VMOVDQU 448(SP), Y14 VMOVDQU 480(SP), Y15 VPBROADCASTD (DX), Y0 VPXOR Y0, Y8, Y8 VPBROADCASTD 4(DX), Y0 VPXOR Y0, Y9, Y9 VPBROADCASTD 8(DX), Y0 VPXOR Y0, Y10, Y10 VPBROADCASTD 12(DX), Y0 VPXOR Y0, Y11, Y11 VPBROADCASTD 16(DX), Y0 VPXOR Y0, Y12, Y12 VPBROADCASTD 20(DX), Y0 VPXOR Y0, Y13, Y13 VPBROADCASTD 24(DX), Y0 VPXOR Y0, Y14, Y14 VPBROADCASTD 28(DX), Y0 VPXOR Y0, Y15, Y15 VPUNPCKLDQ Y9, Y8, Y0 VPUNPCKHDQ Y9, Y8, Y1 VPUNPCKLDQ Y11, Y10, Y2 VPUNPCKHDQ Y11, Y10, Y3 VPUNPCKLDQ Y13, Y12, Y4 VPUNPCKHDQ Y13, Y12, Y5 VPUNPCKLDQ Y15, Y14, Y6 VPUNPCKHDQ Y15, Y14, Y7 VPUNPCKLQDQ Y2, Y0, Y8 VPUNPCKHQDQ Y2, Y0, Y9 VPUNPCKLQDQ Y3, Y1, Y10 VPUNPCKHQDQ Y3, Y1, Y11 VPUNPCKLQDQ Y6, Y4, Y12 VPUNPCKHQDQ Y6, Y4, Y13 VPUNPCKLQDQ Y7, Y5, Y14 VPUNPCKHQDQ Y7, Y5, Y15 VPERM2I128 $0x20, Y12, Y8, Y0 VPERM2I128 $0x31, Y12, Y8, Y4 VPERM2I128 $0x20, Y13, Y9, Y1 VPERM2I128 $0x31, Y13, Y9, Y5 VPERM2I128 $0x20, Y14, Y10, Y2 VPERM2I128 $0x31, Y14, Y10, Y6 VPERM2I128 $0x20, Y15, Y11, Y3 VPERM2I128 $0x31, Y15, Y11, Y7 VMOVDQU Y0, 32(AX) VMOVDQU Y1, 96(AX) VMOVDQU Y2, 160(AX) VMOVDQU Y3, 224(AX) VMOVDQU Y4, 288(AX) VMOVDQU Y5, 352(AX) VMOVDQU Y6, 416(AX) VMOVDQU Y7, 480(AX) RET // func compressChunksAVX2(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32) // Requires: AVX, AVX2 TEXT ·compressChunksAVX2(SB), NOSPLIT, $672-40 MOVQ cvs+0(FP), AX MOVQ buf+8(FP), CX MOVQ key+16(FP), DX // Load key 
VPBROADCASTD (DX), Y0 VPBROADCASTD 4(DX), Y1 VPBROADCASTD 8(DX), Y2 VPBROADCASTD 12(DX), Y3 VPBROADCASTD 16(DX), Y4 VPBROADCASTD 20(DX), Y5 VPBROADCASTD 24(DX), Y6 VPBROADCASTD 28(DX), Y7 // Initialize counter VPBROADCASTQ counter+24(FP), Y12 VPBROADCASTQ counter+24(FP), Y13 VPADDQ seq64<>+0(SB), Y12, Y12 VPADDQ seq64<>+32(SB), Y13, Y13 VPUNPCKLDQ Y13, Y12, Y14 VPUNPCKHDQ Y13, Y12, Y15 VPUNPCKLDQ Y15, Y14, Y12 VPUNPCKHDQ Y15, Y14, Y13 VPERMQ $0xd8, Y12, Y12 VPERMQ $0xd8, Y13, Y13 VMOVDQU Y12, 512(SP) VMOVDQU Y13, 544(SP) // Initialize flags VPBROADCASTD flags+32(FP), Y14 VMOVDQU Y14, 576(SP) VMOVDQU Y14, 608(SP) ORL $0x01, 576(SP) ORL $0x02, 636(SP) // Loop index XORQ DX, DX loop: // Load transposed block VMOVDQU seq<>+0(SB), Y9 VPSLLD $0x0a, Y9, Y9 VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, (CX)(Y9*1), Y10 VMOVDQU Y10, (SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 4(CX)(Y9*1), Y10 VMOVDQU Y10, 32(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 8(CX)(Y9*1), Y10 VMOVDQU Y10, 64(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 12(CX)(Y9*1), Y10 VMOVDQU Y10, 96(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 16(CX)(Y9*1), Y10 VMOVDQU Y10, 128(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 20(CX)(Y9*1), Y10 VMOVDQU Y10, 160(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 24(CX)(Y9*1), Y10 VMOVDQU Y10, 192(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 28(CX)(Y9*1), Y10 VMOVDQU Y10, 224(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 32(CX)(Y9*1), Y10 VMOVDQU Y10, 256(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 36(CX)(Y9*1), Y10 VMOVDQU Y10, 288(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 40(CX)(Y9*1), Y10 VMOVDQU Y10, 320(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 44(CX)(Y9*1), Y10 VMOVDQU Y10, 352(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 48(CX)(Y9*1), Y10 VMOVDQU Y10, 384(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 52(CX)(Y9*1), Y10 VMOVDQU Y10, 416(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 56(CX)(Y9*1), Y10 VMOVDQU Y10, 448(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 60(CX)(Y9*1), Y10 VMOVDQU Y10, 480(SP) ADDQ $0x40, CX // Reload state vectors (other than CVs) 
VPBROADCASTD iv<>+0(SB), Y8 VPBROADCASTD iv<>+4(SB), Y9 VPBROADCASTD iv<>+8(SB), Y10 VPBROADCASTD iv<>+12(SB), Y11 VMOVDQU 512(SP), Y12 VMOVDQU 544(SP), Y13 VPBROADCASTD seq<>+4(SB), Y14 VPSLLD $0x06, Y14, Y14 VPBROADCASTD 576(SP)(DX*4), Y15 VMOVDQU Y8, 640(SP) // Round 1 VPADDD Y0, Y4, Y0 VPADDD (SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 640(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 640(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 32(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 640(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 640(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 64(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 96(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 128(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 160(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 256(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), 
Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 288(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 384(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 640(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 640(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 416(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 640(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 640(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 448(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 2 VPADDD Y0, Y4, Y0 VPADDD 64(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 640(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 640(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 192(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 640(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 640(SP) VPSRLD $0x07, Y4, Y8 VPSLLD 
$0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 96(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 224(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD (SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 128(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 416(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 32(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 352(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, 
Y2 VPADDD 288(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 640(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 640(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 448(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 640(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 640(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 3 VPADDD Y0, Y4, Y0 VPADDD 96(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 640(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 640(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 128(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 640(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 640(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 416(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 64(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 
VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 448(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 192(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 160(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 288(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD (SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 352(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 640(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 640(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 480(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 640(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 640(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 32(SP), Y3, Y3 VPXOR Y14, Y3, Y14 
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 4 VPADDD Y0, Y4, Y0 VPADDD 320(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 640(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 640(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 224(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 640(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 640(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 288(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 448(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 96(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 416(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 128(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 
VPADDD (SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 64(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 160(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 640(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 640(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 256(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 640(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 640(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 32(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 5 VPADDD Y0, Y4, Y0 VPADDD 384(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 640(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 640(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 416(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 640(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 640(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 288(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 
VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 480(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 320(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 448(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 224(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 64(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 96(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD (SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 640(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR 
Y7, Y8, Y7 VMOVDQU Y8, 640(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 32(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 640(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 640(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 128(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 6 VPADDD Y0, Y4, Y0 VPADDD 288(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 640(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 640(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 448(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 640(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 640(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 256(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 384(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 480(SP), 
Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 32(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 416(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 96(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD (SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 64(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 640(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 640(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 192(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 640(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 640(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 128(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // 
Round 7 VPADDD Y0, Y4, Y0 VPADDD 352(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 640(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 640(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 480(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 640(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 640(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD (SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 32(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 288(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 448(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 320(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 
VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 64(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 96(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 640(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 640(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 128(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 640(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 640(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 416(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VMOVDQU 640(SP), Y8 // Finalize CVs VPXOR Y0, Y8, Y0 VPXOR Y1, Y9, Y1 VPXOR Y2, Y10, Y2 VPXOR Y3, Y11, Y3 VPXOR Y4, Y12, Y4 VPXOR Y5, Y13, Y5 VPXOR Y6, Y14, Y6 VPXOR Y7, Y15, Y7 // Loop INCQ DX CMPQ DX, $0x00000010 JNE loop // Finished; transpose CVs VPUNPCKLDQ Y1, Y0, Y8 VPUNPCKHDQ Y1, Y0, Y9 VPUNPCKLDQ Y3, Y2, Y10 VPUNPCKHDQ Y3, Y2, Y11 VPUNPCKLDQ Y5, Y4, Y12 VPUNPCKHDQ Y5, Y4, Y13 VPUNPCKLDQ Y7, Y6, Y14 VPUNPCKHDQ Y7, Y6, Y15 VPUNPCKLQDQ Y10, Y8, Y0 VPUNPCKHQDQ Y10, Y8, Y1 VPUNPCKLQDQ Y11, Y9, Y2 VPUNPCKHQDQ Y11, Y9, Y3 VPUNPCKLQDQ Y14, Y12, Y4 VPUNPCKHQDQ Y14, Y12, Y5 VPUNPCKLQDQ Y15, Y13, Y6 VPUNPCKHQDQ Y15, Y13, Y7 VPERM2I128 $0x20, Y4, Y0, Y8 VPERM2I128 $0x31, Y4, Y0, Y12 VPERM2I128 $0x20, Y5, Y1, Y9 
VPERM2I128 $0x31, Y5, Y1, Y13 VPERM2I128 $0x20, Y6, Y2, Y10 VPERM2I128 $0x31, Y6, Y2, Y14 VPERM2I128 $0x20, Y7, Y3, Y11 VPERM2I128 $0x31, Y7, Y3, Y15 VMOVDQU Y8, (AX) VMOVDQU Y9, 32(AX) VMOVDQU Y10, 64(AX) VMOVDQU Y11, 96(AX) VMOVDQU Y12, 128(AX) VMOVDQU Y13, 160(AX) VMOVDQU Y14, 192(AX) VMOVDQU Y15, 224(AX) RET // func compressParentsAVX2(parents *[8][8]uint32, cvs *[16][8]uint32, key *[8]uint32, flags uint32) // Requires: AVX, AVX2 TEXT ·compressParentsAVX2(SB), NOSPLIT, $544-32 MOVQ parents+0(FP), AX MOVQ cvs+8(FP), CX MOVQ key+16(FP), DX // Load transposed block VMOVDQU seq<>+0(SB), Y9 VPSLLD $0x06, Y9, Y9 VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, (CX)(Y9*1), Y10 VMOVDQU Y10, (SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 4(CX)(Y9*1), Y10 VMOVDQU Y10, 32(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 8(CX)(Y9*1), Y10 VMOVDQU Y10, 64(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 12(CX)(Y9*1), Y10 VMOVDQU Y10, 96(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 16(CX)(Y9*1), Y10 VMOVDQU Y10, 128(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 20(CX)(Y9*1), Y10 VMOVDQU Y10, 160(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 24(CX)(Y9*1), Y10 VMOVDQU Y10, 192(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 28(CX)(Y9*1), Y10 VMOVDQU Y10, 224(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 32(CX)(Y9*1), Y10 VMOVDQU Y10, 256(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 36(CX)(Y9*1), Y10 VMOVDQU Y10, 288(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 40(CX)(Y9*1), Y10 VMOVDQU Y10, 320(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 44(CX)(Y9*1), Y10 VMOVDQU Y10, 352(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 48(CX)(Y9*1), Y10 VMOVDQU Y10, 384(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 52(CX)(Y9*1), Y10 VMOVDQU Y10, 416(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 56(CX)(Y9*1), Y10 VMOVDQU Y10, 448(SP) VPCMPEQD Y8, Y8, Y8 VPGATHERDD Y8, 60(CX)(Y9*1), Y10 VMOVDQU Y10, 480(SP) // Initialize state vectors VPBROADCASTD (DX), Y0 VPBROADCASTD 4(DX), Y1 VPBROADCASTD 8(DX), Y2 VPBROADCASTD 12(DX), Y3 VPBROADCASTD 16(DX), Y4 VPBROADCASTD 20(DX), Y5 VPBROADCASTD 24(DX), Y6 VPBROADCASTD 
28(DX), Y7 VPBROADCASTD iv<>+0(SB), Y8 VPBROADCASTD iv<>+4(SB), Y9 VPBROADCASTD iv<>+8(SB), Y10 VPBROADCASTD iv<>+12(SB), Y11 VPXOR Y12, Y12, Y12 VPXOR Y13, Y13, Y13 VPBROADCASTD seq<>+4(SB), Y14 VPSLLD $0x06, Y14, Y14 ORL $0x04, flags+24(FP) VPBROADCASTD flags+24(FP), Y15 VMOVDQU Y8, 512(SP) // Round 1 VPADDD Y0, Y4, Y0 VPADDD (SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 32(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 64(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 96(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 128(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 160(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 256(SP), Y0, Y0 VPXOR Y15, Y0, Y15 
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 288(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 384(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 416(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 448(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 2 VPADDD Y0, Y4, Y0 VPADDD 64(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 192(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) 
VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 96(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 224(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD (SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 128(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 416(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 32(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 352(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 
VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 288(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 448(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 3 VPADDD Y0, Y4, Y0 VPADDD 96(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 128(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 416(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 64(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB 
shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 448(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 192(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 160(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 288(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD (SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 352(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 480(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 32(SP), 
Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 4 VPADDD Y0, Y4, Y0 VPADDD 320(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 224(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 288(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 448(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 96(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 416(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 128(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, 
Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD (SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 64(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 160(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 256(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 32(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 5 VPADDD Y0, Y4, Y0 VPADDD 384(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 416(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 288(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB 
shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 480(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 320(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 448(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 224(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 64(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 96(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD (SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 
512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 32(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 128(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 6 VPADDD Y0, Y4, Y0 VPADDD 288(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 448(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 352(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 256(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 384(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, 
Y6 VPADDD Y3, Y7, Y3 VPADDD 480(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 32(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 416(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 96(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD (SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 320(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 64(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 192(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 128(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD 
$0x19, Y4, Y4 VPOR Y4, Y8, Y4 // Round 7 VPADDD Y0, Y4, Y0 VPADDD 352(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y0, Y4, Y0 VPADDD 480(SP), Y0, Y0 VPXOR Y12, Y0, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VMOVDQU 512(SP), Y8 VPADDD Y8, Y12, Y8 VPXOR Y4, Y8, Y4 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y1, Y5, Y1 VPADDD 160(SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y5, Y1 VPADDD (SP), Y1, Y1 VPXOR Y13, Y1, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VPADDD Y9, Y13, Y9 VPXOR Y5, Y9, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y2, Y6, Y2 VPADDD 32(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y6, Y2 VPADDD 288(SP), Y2, Y2 VPXOR Y14, Y2, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y10, Y14, Y10 VPXOR Y6, Y10, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y3, Y7, Y3 VPADDD 256(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y7, Y3 VPADDD 192(SP), Y3, Y3 VPXOR Y15, Y3, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y11, Y15, Y11 VPXOR Y7, Y11, Y7 VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y0, Y5, Y0 VPADDD 448(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR Y5, Y10, Y5 VPSRLD $0x0c, Y5, Y8 VPSLLD $0x14, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y0, Y5, Y0 VPADDD 320(SP), Y0, Y0 VPXOR Y15, Y0, Y15 VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15 VPADDD Y10, Y15, Y10 VPXOR 
Y5, Y10, Y5 VPSRLD $0x07, Y5, Y8 VPSLLD $0x19, Y5, Y5 VPOR Y5, Y8, Y5 VPADDD Y1, Y6, Y1 VPADDD 64(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x0c, Y6, Y8 VPSLLD $0x14, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y1, Y6, Y1 VPADDD 384(SP), Y1, Y1 VPXOR Y12, Y1, Y12 VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12 VPADDD Y11, Y12, Y11 VPXOR Y6, Y11, Y6 VPSRLD $0x07, Y6, Y8 VPSLLD $0x19, Y6, Y6 VPOR Y6, Y8, Y6 VPADDD Y2, Y7, Y2 VPADDD 96(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x0c, Y7, Y8 VPSLLD $0x14, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y2, Y7, Y2 VPADDD 128(SP), Y2, Y2 VPXOR Y13, Y2, Y13 VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13 VMOVDQU 512(SP), Y8 VPADDD Y8, Y13, Y8 VPXOR Y7, Y8, Y7 VMOVDQU Y8, 512(SP) VPSRLD $0x07, Y7, Y8 VPSLLD $0x19, Y7, Y7 VPOR Y7, Y8, Y7 VPADDD Y3, Y4, Y3 VPADDD 224(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x0c, Y4, Y8 VPSLLD $0x14, Y4, Y4 VPOR Y4, Y8, Y4 VPADDD Y3, Y4, Y3 VPADDD 416(SP), Y3, Y3 VPXOR Y14, Y3, Y14 VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14 VPADDD Y9, Y14, Y9 VPXOR Y4, Y9, Y4 VPSRLD $0x07, Y4, Y8 VPSLLD $0x19, Y4, Y4 VPOR Y4, Y8, Y4 VMOVDQU 512(SP), Y8 // Finalize CVs VPXOR Y0, Y8, Y0 VPXOR Y1, Y9, Y1 VPXOR Y2, Y10, Y2 VPXOR Y3, Y11, Y3 VPXOR Y4, Y12, Y4 VPXOR Y5, Y13, Y5 VPXOR Y6, Y14, Y6 VPXOR Y7, Y15, Y7 VPUNPCKLDQ Y1, Y0, Y8 VPUNPCKHDQ Y1, Y0, Y9 VPUNPCKLDQ Y3, Y2, Y10 VPUNPCKHDQ Y3, Y2, Y11 VPUNPCKLDQ Y5, Y4, Y12 VPUNPCKHDQ Y5, Y4, Y13 VPUNPCKLDQ Y7, Y6, Y14 VPUNPCKHDQ Y7, Y6, Y15 VPUNPCKLQDQ Y10, Y8, Y0 VPUNPCKHQDQ Y10, Y8, Y1 VPUNPCKLQDQ Y11, Y9, Y2 VPUNPCKHQDQ Y11, Y9, Y3 VPUNPCKLQDQ Y14, Y12, Y4 VPUNPCKHQDQ Y14, Y12, Y5 VPUNPCKLQDQ Y15, Y13, Y6 VPUNPCKHQDQ Y15, Y13, Y7 VPERM2I128 $0x20, Y4, Y0, Y8 VPERM2I128 $0x31, Y4, Y0, Y12 VPERM2I128 $0x20, Y5, Y1, Y9 VPERM2I128 $0x31, Y5, Y1, Y13 VPERM2I128 
$0x20, Y6, Y2, Y10 VPERM2I128 $0x31, Y6, Y2, Y14 VPERM2I128 $0x20, Y7, Y3, Y11 VPERM2I128 $0x31, Y7, Y3, Y15 VMOVDQU Y8, (AX) VMOVDQU Y9, 32(AX) VMOVDQU Y10, 64(AX) VMOVDQU Y11, 96(AX) VMOVDQU Y12, 128(AX) VMOVDQU Y13, 160(AX) VMOVDQU Y14, 192(AX) VMOVDQU Y15, 224(AX) RET blake3-1.1.6/blake3_test.go000066400000000000000000000133211411544176700154210ustar00rootroot00000000000000package blake3_test import ( "bytes" "encoding/hex" "encoding/json" "io" "io/ioutil" "testing" "lukechampine.com/blake3" ) func toHex(data []byte) string { return hex.EncodeToString(data) } var testVectors = func() (vecs struct { Key string Cases []struct { InputLen int `json:"input_len"` Hash string `json:"hash"` KeyedHash string `json:"keyed_hash"` DeriveKey string `json:"derive_key"` } }) { data, err := ioutil.ReadFile("testdata/vectors.json") if err != nil { panic(err) } if err := json.Unmarshal(data, &vecs); err != nil { panic(err) } return }() var testInput = func() []byte { input := make([]byte, 1e6) for i := range input { input[i] = byte(i % 251) } return input }() func TestVectors(t *testing.T) { for _, vec := range testVectors.Cases { in := testInput[:vec.InputLen] // regular h := blake3.New(len(vec.Hash)/2, nil) h.Write(in) if out := toHex(h.Sum(nil)); out != vec.Hash { t.Errorf("output did not match test vector:\n\texpected: %v...\n\t got: %v...", vec.Hash[:10], out[:10]) } // keyed h = blake3.New(len(vec.KeyedHash)/2, []byte(testVectors.Key)) h.Write(in) if out := toHex(h.Sum(nil)); out != vec.KeyedHash { t.Errorf("output did not match test vector:\n\texpected: %v...\n\t got: %v...", vec.KeyedHash[:10], out[:10]) } // derive key const ctx = "BLAKE3 2019-12-27 16:29:52 test vectors context" subKey := make([]byte, len(vec.DeriveKey)/2) blake3.DeriveKey(subKey, ctx, in) if out := toHex(subKey); out != vec.DeriveKey { t.Errorf("output did not match test vector:\n\texpected: %v...\n\t got: %v...", vec.DeriveKey[:10], out[:10]) } } } func TestXOF(t *testing.T) { for _, vec := range 
testVectors.Cases { in := testInput[:vec.InputLen] // XOF should produce same output as Sum, even when outputting 7 bytes at a time h := blake3.New(len(vec.Hash)/2, nil) h.Write(in) var xofBuf bytes.Buffer io.CopyBuffer(&xofBuf, io.LimitReader(h.XOF(), int64(len(vec.Hash)/2)), make([]byte, 7)) if out := toHex(xofBuf.Bytes()); out != vec.Hash { t.Errorf("XOF output did not match test vector:\n\texpected: %v...\n\t got: %v...", vec.Hash[:10], out[:10]) } // Should be able to Seek around in the output stream without affecting correctness seeks := []struct { offset int64 whence int }{ {0, io.SeekStart}, {17, io.SeekCurrent}, {-5, io.SeekCurrent}, {int64(h.Size()), io.SeekStart}, {int64(h.Size()), io.SeekCurrent}, } xof := h.XOF() outR := bytes.NewReader(xofBuf.Bytes()) for _, s := range seeks { outRead := make([]byte, 10) xofRead := make([]byte, 10) offset, _ := outR.Seek(s.offset, s.whence) n, _ := outR.Read(outRead) xof.Seek(s.offset, s.whence) xof.Read(xofRead[:n]) if !bytes.Equal(outRead[:n], xofRead[:n]) { t.Errorf("XOF output did not match test vector at offset %v:\n\texpected: %x...\n\t got: %x...", offset, outRead[:10], xofRead[:10]) } } } // test behavior at end of stream xof := blake3.New(0, nil).XOF() buf := make([]byte, 1024) xof.Seek(-1000, io.SeekEnd) n, err := xof.Read(buf) if n != 1000 || err != nil { t.Errorf("expected (1000, nil) when reading near end of stream, got (%v, %v)", n, err) } n, err = xof.Read(buf) if n != 0 || err != io.EOF { t.Errorf("expected (0, EOF) when reading past end of stream, got (%v, %v)", n, err) } // test invalid seek offsets _, err = xof.Seek(-1, io.SeekStart) if err == nil { t.Error("expected invalid offset error, got nil") } xof.Seek(0, io.SeekStart) _, err = xof.Seek(-1, io.SeekCurrent) if err == nil { t.Error("expected invalid offset error, got nil") } // test invalid seek whence didPanic := func() (p bool) { defer func() { p = recover() != nil }() xof.Seek(0, 17) return }() if !didPanic { t.Error("expected panic when 
seeking with invalid whence") } } func TestSum(t *testing.T) { for _, vec := range testVectors.Cases { in := testInput[:vec.InputLen] var exp256 [32]byte h := blake3.New(32, nil) h.Write(in) h.Sum(exp256[:0]) if got256 := blake3.Sum256(in); exp256 != got256 { t.Errorf("Sum256 output did not match Sum output:\n\texpected: %x...\n\t got: %x...", exp256[:5], got256[:5]) } var exp512 [64]byte h = blake3.New(64, nil) h.Write(in) h.Sum(exp512[:0]) if got512 := blake3.Sum512(in); exp512 != got512 { t.Errorf("Sum512 output did not match Sum output:\n\texpected: %x...\n\t got: %x...", exp512[:5], got512[:5]) } } } func TestReset(t *testing.T) { for _, vec := range testVectors.Cases { in := testInput[:vec.InputLen] h := blake3.New(32, nil) h.Write(in) out1 := h.Sum(nil) h.Reset() h.Write(in) out2 := h.Sum(nil) if !bytes.Equal(out1, out2) { t.Error("Reset did not reset Hasher state properly") } } // gotta have 100% test coverage... if blake3.New(0, nil).BlockSize() != 64 { t.Error("incorrect block size") } } type nopReader struct{} func (nopReader) Read(p []byte) (int, error) { return len(p), nil } func BenchmarkWrite(b *testing.B) { b.ReportAllocs() b.SetBytes(1024) io.CopyN(blake3.New(0, nil), nopReader{}, int64(b.N*1024)) } func BenchmarkXOF(b *testing.B) { b.ReportAllocs() b.SetBytes(1024) io.CopyN(ioutil.Discard, blake3.New(0, nil).XOF(), int64(b.N*1024)) } func BenchmarkSum256(b *testing.B) { b.Run("64", func(b *testing.B) { b.ReportAllocs() b.SetBytes(64) buf := make([]byte, 64) for i := 0; i < b.N; i++ { blake3.Sum256(buf) } }) b.Run("1024", func(b *testing.B) { b.ReportAllocs() b.SetBytes(1024) buf := make([]byte, 1024) for i := 0; i < b.N; i++ { blake3.Sum256(buf) } }) b.Run("65536", func(b *testing.B) { b.ReportAllocs() b.SetBytes(65536) buf := make([]byte, 65536) for i := 0; i < b.N; i++ { blake3.Sum256(buf) } }) } blake3-1.1.6/compress_amd64.go000066400000000000000000000105731411544176700160550ustar00rootroot00000000000000package blake3 import "unsafe" 
//go:generate go run avo/gen.go -out blake3_amd64.s //go:noescape func compressChunksAVX512(cvs *[16][8]uint32, buf *[16 * chunkSize]byte, key *[8]uint32, counter uint64, flags uint32) //go:noescape func compressChunksAVX2(cvs *[8][8]uint32, buf *[8 * chunkSize]byte, key *[8]uint32, counter uint64, flags uint32) //go:noescape func compressBlocksAVX512(out *[1024]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32) //go:noescape func compressBlocksAVX2(out *[512]byte, msgs *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32) //go:noescape func compressParentsAVX2(parents *[8][8]uint32, cvs *[16][8]uint32, key *[8]uint32, flags uint32) func compressNode(n node) (out [16]uint32) { compressNodeGeneric(&out, n) return } func compressBufferAVX512(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node { var cvs [maxSIMD][8]uint32 compressChunksAVX512(&cvs, buf, key, counter, flags) numChunks := uint64(buflen / chunkSize) if buflen%chunkSize != 0 { // use non-asm for remainder partialChunk := buf[buflen-buflen%chunkSize : buflen] cvs[numChunks] = chainingValue(compressChunk(partialChunk, key, counter+numChunks, flags)) numChunks++ } return mergeSubtrees(&cvs, numChunks, key, flags) } func compressBufferAVX2(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node { var cvs [maxSIMD][8]uint32 cvHalves := (*[2][8][8]uint32)(unsafe.Pointer(&cvs)) bufHalves := (*[2][8 * chunkSize]byte)(unsafe.Pointer(buf)) compressChunksAVX2(&cvHalves[0], &bufHalves[0], key, counter, flags) numChunks := uint64(buflen / chunkSize) if numChunks > 8 { compressChunksAVX2(&cvHalves[1], &bufHalves[1], key, counter+8, flags) } if buflen%chunkSize != 0 { // use non-asm for remainder partialChunk := buf[buflen-buflen%chunkSize : buflen] cvs[numChunks] = chainingValue(compressChunk(partialChunk, key, counter+numChunks, flags)) numChunks++ } return mergeSubtrees(&cvs, 
numChunks, key, flags) } func compressBuffer(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node { switch { case haveAVX512 && buflen >= chunkSize*2: return compressBufferAVX512(buf, buflen, key, counter, flags) case haveAVX2 && buflen >= chunkSize*2: return compressBufferAVX2(buf, buflen, key, counter, flags) default: return compressBufferGeneric(buf, buflen, key, counter, flags) } } func compressChunk(chunk []byte, key *[8]uint32, counter uint64, flags uint32) node { n := node{ cv: *key, counter: counter, blockLen: blockSize, flags: flags | flagChunkStart, } blockBytes := (*[64]byte)(unsafe.Pointer(&n.block))[:] for len(chunk) > blockSize { copy(blockBytes, chunk) chunk = chunk[blockSize:] n.cv = chainingValue(n) n.flags &^= flagChunkStart } // pad last block with zeros n.block = [16]uint32{} copy(blockBytes, chunk) n.blockLen = uint32(len(chunk)) n.flags |= flagChunkEnd return n } func hashBlock(out *[64]byte, buf []byte) { var block [16]uint32 copy((*[64]byte)(unsafe.Pointer(&block))[:], buf) compressNodeGeneric((*[16]uint32)(unsafe.Pointer(out)), node{ cv: iv, block: block, blockLen: uint32(len(buf)), flags: flagChunkStart | flagChunkEnd | flagRoot, }) } func compressBlocks(out *[maxSIMD * blockSize]byte, n node) { switch { case haveAVX512: compressBlocksAVX512(out, &n.block, &n.cv, n.counter, n.blockLen, n.flags) case haveAVX2: outs := (*[2][512]byte)(unsafe.Pointer(out)) compressBlocksAVX2(&outs[0], &n.block, &n.cv, n.counter, n.blockLen, n.flags) compressBlocksAVX2(&outs[1], &n.block, &n.cv, n.counter+8, n.blockLen, n.flags) default: outs := (*[maxSIMD][64]byte)(unsafe.Pointer(out)) compressBlocksGeneric(outs, n) } } func mergeSubtrees(cvs *[maxSIMD][8]uint32, numCVs uint64, key *[8]uint32, flags uint32) node { if !haveAVX2 { return mergeSubtreesGeneric(cvs, numCVs, key, flags) } for numCVs > 2 { if numCVs%2 == 0 { compressParentsAVX2((*[8][8]uint32)(unsafe.Pointer(cvs)), cvs, key, flags) } else { keep := 
cvs[numCVs-1] compressParentsAVX2((*[8][8]uint32)(unsafe.Pointer(cvs)), cvs, key, flags) cvs[numCVs/2] = keep numCVs++ } numCVs /= 2 } return parentNode(cvs[0], cvs[1], *key, flags) } func wordsToBytes(words [16]uint32, block *[64]byte) { *block = *(*[64]byte)(unsafe.Pointer(&words)) } blake3-1.1.6/compress_generic.go000066400000000000000000000130011411544176700165430ustar00rootroot00000000000000package blake3 import ( "bytes" "math/bits" ) func compressNodeGeneric(out *[16]uint32, n node) { g := func(a, b, c, d, mx, my uint32) (uint32, uint32, uint32, uint32) { a += b + mx d = bits.RotateLeft32(d^a, -16) c += d b = bits.RotateLeft32(b^c, -12) a += b + my d = bits.RotateLeft32(d^a, -8) c += d b = bits.RotateLeft32(b^c, -7) return a, b, c, d } // NOTE: we unroll all of the rounds, as well as the permutations that occur // between rounds. // round 1 (also initializes state) // columns s0, s4, s8, s12 := g(n.cv[0], n.cv[4], iv[0], uint32(n.counter), n.block[0], n.block[1]) s1, s5, s9, s13 := g(n.cv[1], n.cv[5], iv[1], uint32(n.counter>>32), n.block[2], n.block[3]) s2, s6, s10, s14 := g(n.cv[2], n.cv[6], iv[2], n.blockLen, n.block[4], n.block[5]) s3, s7, s11, s15 := g(n.cv[3], n.cv[7], iv[3], n.flags, n.block[6], n.block[7]) // diagonals s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[8], n.block[9]) s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[10], n.block[11]) s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[12], n.block[13]) s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[14], n.block[15]) // round 2 s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[2], n.block[6]) s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[3], n.block[10]) s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[7], n.block[0]) s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[4], n.block[13]) s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[1], n.block[11]) s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[12], n.block[5]) s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[9], n.block[14]) s3, s4, s9, s14 = g(s3, s4, s9, s14, 
n.block[15], n.block[8]) // round 3 s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[3], n.block[4]) s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[10], n.block[12]) s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[13], n.block[2]) s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[7], n.block[14]) s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[6], n.block[5]) s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[9], n.block[0]) s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[11], n.block[15]) s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[8], n.block[1]) // round 4 s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[10], n.block[7]) s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[12], n.block[9]) s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[14], n.block[3]) s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[13], n.block[15]) s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[4], n.block[0]) s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[11], n.block[2]) s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[5], n.block[8]) s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[1], n.block[6]) // round 5 s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[12], n.block[13]) s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[9], n.block[11]) s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[15], n.block[10]) s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[14], n.block[8]) s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[7], n.block[2]) s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[5], n.block[3]) s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[0], n.block[1]) s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[6], n.block[4]) // round 6 s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[9], n.block[14]) s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[11], n.block[5]) s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[8], n.block[12]) s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[15], n.block[1]) s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[13], n.block[3]) s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[0], n.block[10]) s2, s7, s8, s13 = g(s2, s7, s8, s13, 
n.block[2], n.block[6]) s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[4], n.block[7]) // round 7 s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[11], n.block[15]) s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[5], n.block[0]) s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[1], n.block[9]) s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[8], n.block[6]) s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[14], n.block[10]) s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[2], n.block[12]) s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[3], n.block[4]) s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[7], n.block[13]) // finalization *out = [16]uint32{ s0 ^ s8, s1 ^ s9, s2 ^ s10, s3 ^ s11, s4 ^ s12, s5 ^ s13, s6 ^ s14, s7 ^ s15, s8 ^ n.cv[0], s9 ^ n.cv[1], s10 ^ n.cv[2], s11 ^ n.cv[3], s12 ^ n.cv[4], s13 ^ n.cv[5], s14 ^ n.cv[6], s15 ^ n.cv[7], } } func chainingValue(n node) (cv [8]uint32) { full := compressNode(n) copy(cv[:], full[:]) return } func compressBufferGeneric(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) (n node) { if buflen <= chunkSize { return compressChunk(buf[:buflen], key, counter, flags) } var cvs [maxSIMD][8]uint32 var numCVs uint64 for bb := bytes.NewBuffer(buf[:buflen]); bb.Len() > 0; numCVs++ { cvs[numCVs] = chainingValue(compressChunk(bb.Next(chunkSize), key, counter+numCVs, flags)) } return mergeSubtrees(&cvs, numCVs, key, flags) } func compressBlocksGeneric(outs *[maxSIMD][64]byte, n node) { for i := range outs { wordsToBytes(compressNode(n), &outs[i]) n.counter++ } } func mergeSubtreesGeneric(cvs *[maxSIMD][8]uint32, numCVs uint64, key *[8]uint32, flags uint32) node { for numCVs > 2 { rem := numCVs / 2 for i := range cvs[:rem] { cvs[i] = chainingValue(parentNode(cvs[i*2], cvs[i*2+1], *key, flags)) } if numCVs%2 != 0 { cvs[rem] = cvs[rem*2] rem++ } numCVs = rem } return parentNode(cvs[0], cvs[1], *key, flags) } blake3-1.1.6/compress_noasm.go000066400000000000000000000034621411544176700162560ustar00rootroot00000000000000// 
+build !amd64 package blake3 import "encoding/binary" func compressNode(n node) (out [16]uint32) { compressNodeGeneric(&out, n) return } func compressBuffer(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node { return compressBufferGeneric(buf, buflen, key, counter, flags) } func compressChunk(chunk []byte, key *[8]uint32, counter uint64, flags uint32) node { n := node{ cv: *key, counter: counter, blockLen: blockSize, flags: flags | flagChunkStart, } var block [blockSize]byte for len(chunk) > blockSize { copy(block[:], chunk) chunk = chunk[blockSize:] bytesToWords(block, &n.block) n.cv = chainingValue(n) n.flags &^= flagChunkStart } // pad last block with zeros block = [blockSize]byte{} n.blockLen = uint32(len(chunk)) copy(block[:], chunk) bytesToWords(block, &n.block) n.flags |= flagChunkEnd return n } func hashBlock(out *[64]byte, buf []byte) { var block [64]byte var words [16]uint32 copy(block[:], buf) bytesToWords(block, &words) compressNodeGeneric(&words, node{ cv: iv, block: words, blockLen: uint32(len(buf)), flags: flagChunkStart | flagChunkEnd | flagRoot, }) wordsToBytes(words, out) } func compressBlocks(out *[maxSIMD * blockSize]byte, n node) { var outs [maxSIMD][64]byte compressBlocksGeneric(&outs, n) for i := range outs { copy(out[i*64:], outs[i][:]) } } func mergeSubtrees(cvs *[maxSIMD][8]uint32, numCVs uint64, key *[8]uint32, flags uint32) node { return mergeSubtreesGeneric(cvs, numCVs, key, flags) } func bytesToWords(bytes [64]byte, words *[16]uint32) { for i := range words { words[i] = binary.LittleEndian.Uint32(bytes[4*i:]) } } func wordsToBytes(words [16]uint32, block *[64]byte) { for i, w := range words { binary.LittleEndian.PutUint32(block[4*i:], w) } } blake3-1.1.6/cpu.go000066400000000000000000000002601411544176700140060ustar00rootroot00000000000000// +build !darwin package blake3 import "github.com/klauspost/cpuid/v2" var ( haveAVX2 = cpuid.CPU.Supports(cpuid.AVX2) haveAVX512 = 
cpuid.CPU.Supports(cpuid.AVX512F) ) blake3-1.1.6/cpu_darwin.go000066400000000000000000000006171411544176700153600ustar00rootroot00000000000000package blake3 import ( "syscall" "github.com/klauspost/cpuid/v2" ) var ( haveAVX2 bool haveAVX512 bool ) func init() { haveAVX2 = cpuid.CPU.Supports(cpuid.AVX2) haveAVX512 = cpuid.CPU.Supports(cpuid.AVX512F) if !haveAVX512 { // On some Macs, AVX512 detection is buggy, so fallback to sysctl b, _ := syscall.Sysctl("hw.optional.avx512f") haveAVX512 = len(b) > 0 && b[0] == 1 } } blake3-1.1.6/go.mod000066400000000000000000000001261411544176700137770ustar00rootroot00000000000000module lukechampine.com/blake3 go 1.13 require github.com/klauspost/cpuid/v2 v2.0.9 blake3-1.1.6/go.sum000066400000000000000000000002611411544176700140240ustar00rootroot00000000000000github.com/klauspost/cpuid/v2 v2.0.9 h1:lgaqFMSdTdQYdZ04uHyN2d/eKdOMyi2YLSvlQIBFYa4= github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= blake3-1.1.6/testdata/000077500000000000000000000000001411544176700145035ustar00rootroot00000000000000blake3-1.1.6/testdata/vectors.json000066400000000000000000000462451411544176700170760ustar00rootroot00000000000000{ "key": "whats the Elvish word for friend", "cases": [ { "input_len": 0, "hash": "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262e00f03e7b69af26b7faaf09fcd333050338ddfe085b8cc869ca98b206c08243a26f5487789e8f660afe6c99ef9e0c52b92e7393024a80459cf91f476f9ffdbda7001c22e159b402631f277ca96f2defdf1078282314e763699a31c5363165421cce14d", "keyed_hash": "92b2b75604ed3c761f9d6f62392c8a9227ad0ea3f09573e783f1498a4ed60d26b18171a2f22a4b94822c701f107153dba24918c4bae4d2945c20ece13387627d3b73cbf97b797d5e59948c7ef788f54372df45e45e4293c7dc18c1d41144a9758be58960856be1eabbe22c2653190de560ca3b2ac4aa692a9210694254c371e851bc8f", "derive_key": 
"2cc39783c223154fea8dfb7c1b1660f2ac2dcbd1c1de8277b0b0dd39b7e50d7d905630c8be290dfcf3e6842f13bddd573c098c3f17361f1f206b8cad9d088aa4a3f746752c6b0ce6a83b0da81d59649257cdf8eb3e9f7d4998e41021fac119deefb896224ac99f860011f73609e6e0e4540f93b273e56547dfd3aa1a035ba6689d89a0" }, { "input_len": 1, "hash": "2d3adedff11b61f14c886e35afa036736dcd87a74d27b5c1510225d0f592e213c3a6cb8bf623e20cdb535f8d1a5ffb86342d9c0b64aca3bce1d31f60adfa137b358ad4d79f97b47c3d5e79f179df87a3b9776ef8325f8329886ba42f07fb138bb502f4081cbcec3195c5871e6c23e2cc97d3c69a613eba131e5f1351f3f1da786545e5", "keyed_hash": "6d7878dfff2f485635d39013278ae14f1454b8c0a3a2d34bc1ab38228a80c95b6568c0490609413006fbd428eb3fd14e7756d90f73a4725fad147f7bf70fd61c4e0cf7074885e92b0e3f125978b4154986d4fb202a3f331a3fb6cf349a3a70e49990f98fe4289761c8602c4e6ab1138d31d3b62218078b2f3ba9a88e1d08d0dd4cea11", "derive_key": "b3e2e340a117a499c6cf2398a19ee0d29cca2bb7404c73063382693bf66cb06c5827b91bf889b6b97c5477f535361caefca0b5d8c4746441c57617111933158950670f9aa8a05d791daae10ac683cbef8faf897c84e6114a59d2173c3f417023a35d6983f2c7dfa57e7fc559ad751dbfb9ffab39c2ef8c4aafebc9ae973a64f0c76551" }, { "input_len": 1023, "hash": "10108970eeda3eb932baac1428c7a2163b0e924c9a9e25b35bba72b28f70bd11a182d27a591b05592b15607500e1e8dd56bc6c7fc063715b7a1d737df5bad3339c56778957d870eb9717b57ea3d9fb68d1b55127bba6a906a4a24bbd5acb2d123a37b28f9e9a81bbaae360d58f85e5fc9d75f7c370a0cc09b6522d9c8d822f2f28f485", "keyed_hash": "c951ecdf03288d0fcc96ee3413563d8a6d3589547f2c2fb36d9786470f1b9d6e890316d2e6d8b8c25b0a5b2180f94fb1a158ef508c3cde45e2966bd796a696d3e13efd86259d756387d9becf5c8bf1ce2192b87025152907b6d8cc33d17826d8b7b9bc97e38c3c85108ef09f013e01c229c20a83d9e8efac5b37470da28575fd755a10", "derive_key": "74a16c1c3d44368a86e1ca6df64be6a2f64cce8f09220787450722d85725dea59c413264404661e9e4d955409dfe4ad3aa487871bcd454ed12abfe2c2b1eb7757588cf6cb18d2eccad49e018c0d0fec323bec82bf1644c6325717d13ea712e6840d3e6e730d35553f59eff5377a9c350bcc1556694b924b858f329c44ee64b884ef00d" }, { "input_len": 1024, 
"hash": "42214739f095a406f3fc83deb889744ac00df831c10daa55189b5d121c855af71cf8107265ecdaf8505b95d8fcec83a98a6a96ea5109d2c179c47a387ffbb404756f6eeae7883b446b70ebb144527c2075ab8ab204c0086bb22b7c93d465efc57f8d917f0b385c6df265e77003b85102967486ed57db5c5ca170ba441427ed9afa684e", "keyed_hash": "75c46f6f3d9eb4f55ecaaee480db732e6c2105546f1e675003687c31719c7ba4a78bc838c72852d4f49c864acb7adafe2478e824afe51c8919d06168414c265f298a8094b1ad813a9b8614acabac321f24ce61c5a5346eb519520d38ecc43e89b5000236df0597243e4d2493fd626730e2ba17ac4d8824d09d1a4a8f57b8227778e2de", "derive_key": "7356cd7720d5b66b6d0697eb3177d9f8d73a4a5c5e968896eb6a6896843027066c23b601d3ddfb391e90d5c8eccdef4ae2a264bce9e612ba15e2bc9d654af1481b2e75dbabe615974f1070bba84d56853265a34330b4766f8e75edd1f4a1650476c10802f22b64bd3919d246ba20a17558bc51c199efdec67e80a227251808d8ce5bad" }, { "input_len": 1025, "hash": "d00278ae47eb27b34faecf67b4fe263f82d5412916c1ffd97c8cb7fb814b8444f4c4a22b4b399155358a994e52bf255de60035742ec71bd08ac275a1b51cc6bfe332b0ef84b409108cda080e6269ed4b3e2c3f7d722aa4cdc98d16deb554e5627be8f955c98e1d5f9565a9194cad0c4285f93700062d9595adb992ae68ff12800ab67a", "keyed_hash": "357dc55de0c7e382c900fd6e320acc04146be01db6a8ce7210b7189bd664ea69362396b77fdc0d2634a552970843722066c3c15902ae5097e00ff53f1e116f1cd5352720113a837ab2452cafbde4d54085d9cf5d21ca613071551b25d52e69d6c81123872b6f19cd3bc1333edf0c52b94de23ba772cf82636cff4542540a7738d5b930", "derive_key": "effaa245f065fbf82ac186839a249707c3bddf6d3fdda22d1b95a3c970379bcb5d31013a167509e9066273ab6e2123bc835b408b067d88f96addb550d96b6852dad38e320b9d940f86db74d398c770f462118b35d2724efa13da97194491d96dd37c3c09cbef665953f2ee85ec83d88b88d11547a6f911c8217cca46defa2751e7f3ad" }, { "input_len": 2048, "hash": "e776b6028c7cd22a4d0ba182a8bf62205d2ef576467e838ed6f2529b85fba24a9a60bf80001410ec9eea6698cd537939fad4749edd484cb541aced55cd9bf54764d063f23f6f1e32e12958ba5cfeb1bf618ad094266d4fc3c968c2088f677454c288c67ba0dba337b9d91c7e1ba586dc9a5bc2d5e90c14f53a8863ac75655461cea8f9", 
"keyed_hash": "879cf1fa2ea0e79126cb1063617a05b6ad9d0b696d0d757cf053439f60a99dd10173b961cd574288194b23ece278c330fbb8585485e74967f31352a8183aa782b2b22f26cdcadb61eed1a5bc144b8198fbb0c13abbf8e3192c145d0a5c21633b0ef86054f42809df823389ee40811a5910dcbd1018af31c3b43aa55201ed4edaac74fe", "derive_key": "7b2945cb4fef70885cc5d78a87bf6f6207dd901ff239201351ffac04e1088a23e2c11a1ebffcea4d80447867b61badb1383d842d4e79645d48dd82ccba290769caa7af8eaa1bd78a2a5e6e94fbdab78d9c7b74e894879f6a515257ccf6f95056f4e25390f24f6b35ffbb74b766202569b1d797f2d4bd9d17524c720107f985f4ddc583" }, { "input_len": 2049, "hash": "5f4d72f40d7a5f82b15ca2b2e44b1de3c2ef86c426c95c1af0b687952256303096de31d71d74103403822a2e0bc1eb193e7aecc9643a76b7bbc0c9f9c52e8783aae98764ca468962b5c2ec92f0c74eb5448d519713e09413719431c802f948dd5d90425a4ecdadece9eb178d80f26efccae630734dff63340285adec2aed3b51073ad3", "keyed_hash": "9f29700902f7c86e514ddc4df1e3049f258b2472b6dd5267f61bf13983b78dd5f9a88abfefdfa1e00b418971f2b39c64ca621e8eb37fceac57fd0c8fc8e117d43b81447be22d5d8186f8f5919ba6bcc6846bd7d50726c06d245672c2ad4f61702c646499ee1173daa061ffe15bf45a631e2946d616a4c345822f1151284712f76b2b0e", "derive_key": "2ea477c5515cc3dd606512ee72bb3e0e758cfae7232826f35fb98ca1bcbdf27316d8e9e79081a80b046b60f6a263616f33ca464bd78d79fa18200d06c7fc9bffd808cc4755277a7d5e09da0f29ed150f6537ea9bed946227ff184cc66a72a5f8c1e4bd8b04e81cf40fe6dc4427ad5678311a61f4ffc39d195589bdbc670f63ae70f4b6" }, { "input_len": 3072, "hash": "b98cb0ff3623be03326b373de6b9095218513e64f1ee2edd2525c7ad1e5cffd29a3f6b0b978d6608335c09dc94ccf682f9951cdfc501bfe47b9c9189a6fc7b404d120258506341a6d802857322fbd20d3e5dae05b95c88793fa83db1cb08e7d8008d1599b6209d78336e24839724c191b2a52a80448306e0daa84a3fdb566661a37e11", "keyed_hash": "044a0e7b172a312dc02a4c9a818c036ffa2776368d7f528268d2e6b5df19177022f302d0529e4174cc507c463671217975e81dab02b8fdeb0d7ccc7568dd22574c783a76be215441b32e91b9a904be8ea81f7a0afd14bad8ee7c8efc305ace5d3dd61b996febe8da4f56ca0919359a7533216e2999fc87ff7d8f176fbecb3d6f34278b", 
"derive_key": "050df97f8c2ead654d9bb3ab8c9178edcd902a32f8495949feadcc1e0480c46b3604131bbd6e3ba573b6dd682fa0a63e5b165d39fc43a625d00207607a2bfeb65ff1d29292152e26b298868e3b87be95d6458f6f2ce6118437b632415abe6ad522874bcd79e4030a5e7bad2efa90a7a7c67e93f0a18fb28369d0a9329ab5c24134ccb0" }, { "input_len": 3073, "hash": "7124b49501012f81cc7f11ca069ec9226cecb8a2c850cfe644e327d22d3e1cd39a27ae3b79d68d89da9bf25bc27139ae65a324918a5f9b7828181e52cf373c84f35b639b7fccbb985b6f2fa56aea0c18f531203497b8bbd3a07ceb5926f1cab74d14bd66486d9a91eba99059a98bd1cd25876b2af5a76c3e9eed554ed72ea952b603bf", "keyed_hash": "68dede9bef00ba89e43f31a6825f4cf433389fedae75c04ee9f0cf16a427c95a96d6da3fe985054d3478865be9a092250839a697bbda74e279e8a9e69f0025e4cfddd6cfb434b1cd9543aaf97c635d1b451a4386041e4bb100f5e45407cbbc24fa53ea2de3536ccb329e4eb9466ec37093a42cf62b82903c696a93a50b702c80f3c3c5", "derive_key": "72613c9ec9ff7e40f8f5c173784c532ad852e827dba2bf85b2ab4b76f7079081576288e552647a9d86481c2cae75c2dd4e7c5195fb9ada1ef50e9c5098c249d743929191441301c69e1f48505a4305ec1778450ee48b8e69dc23a25960fe33070ea549119599760a8a2d28aeca06b8c5e9ba58bc19e11fe57b6ee98aa44b2a8e6b14a5" }, { "input_len": 4096, "hash": "015094013f57a5277b59d8475c0501042c0b642e531b0a1c8f58d2163229e9690289e9409ddb1b99768eafe1623da896faf7e1114bebeadc1be30829b6f8af707d85c298f4f0ff4d9438aef948335612ae921e76d411c3a9111df62d27eaf871959ae0062b5492a0feb98ef3ed4af277f5395172dbe5c311918ea0074ce0036454f620", "keyed_hash": "befc660aea2f1718884cd8deb9902811d332f4fc4a38cf7c7300d597a081bfc0bbb64a36edb564e01e4b4aaf3b060092a6b838bea44afebd2deb8298fa562b7b597c757b9df4c911c3ca462e2ac89e9a787357aaf74c3b56d5c07bc93ce899568a3eb17d9250c20f6c5f6c1e792ec9a2dcb715398d5a6ec6d5c54f586a00403a1af1de", "derive_key": "1e0d7f3db8c414c97c6307cbda6cd27ac3b030949da8e23be1a1a924ad2f25b9d78038f7b198596c6cc4a9ccf93223c08722d684f240ff6569075ed81591fd93f9fff1110b3a75bc67e426012e5588959cc5a4c192173a03c00731cf84544f65a2fb9378989f72e9694a6a394a8a30997c2e67f95a504e631cd2c5f55246024761b245" }, { 
"input_len": 4097, "hash": "9b4052b38f1c5fc8b1f9ff7ac7b27cd242487b3d890d15c96a1c25b8aa0fb99505f91b0b5600a11251652eacfa9497b31cd3c409ce2e45cfe6c0a016967316c426bd26f619eab5d70af9a418b845c608840390f361630bd497b1ab44019316357c61dbe091ce72fc16dc340ac3d6e009e050b3adac4b5b2c92e722cffdc46501531956", "keyed_hash": "00df940cd36bb9fa7cbbc3556744e0dbc8191401afe70520ba292ee3ca80abbc606db4976cfdd266ae0abf667d9481831ff12e0caa268e7d3e57260c0824115a54ce595ccc897786d9dcbf495599cfd90157186a46ec800a6763f1c59e36197e9939e900809f7077c102f888caaf864b253bc41eea812656d46742e4ea42769f89b83f", "derive_key": "aca51029626b55fda7117b42a7c211f8c6e9ba4fe5b7a8ca922f34299500ead8a897f66a400fed9198fd61dd2d58d382458e64e100128075fc54b860934e8de2e84170734b06e1d212a117100820dbc48292d148afa50567b8b84b1ec336ae10d40c8c975a624996e12de31abbe135d9d159375739c333798a80c64ae895e51e22f3ad" }, { "input_len": 5120, "hash": "9cadc15fed8b5d854562b26a9536d9707cadeda9b143978f319ab34230535833acc61c8fdc114a2010ce8038c853e121e1544985133fccdd0a2d507e8e615e611e9a0ba4f47915f49e53d721816a9198e8b30f12d20ec3689989175f1bf7a300eee0d9321fad8da232ece6efb8e9fd81b42ad161f6b9550a069e66b11b40487a5f5059", "keyed_hash": "2c493e48e9b9bf31e0553a22b23503c0a3388f035cece68eb438d22fa1943e209b4dc9209cd80ce7c1f7c9a744658e7e288465717ae6e56d5463d4f80cdb2ef56495f6a4f5487f69749af0c34c2cdfa857f3056bf8d807336a14d7b89bf62bef2fb54f9af6a546f818dc1e98b9e07f8a5834da50fa28fb5874af91bf06020d1bf0120e", "derive_key": "7a7acac8a02adcf3038d74cdd1d34527de8a0fcc0ee3399d1262397ce5817f6055d0cefd84d9d57fe792d65a278fd20384ac6c30fdb340092f1a74a92ace99c482b28f0fc0ef3b923e56ade20c6dba47e49227166251337d80a037e987ad3a7f728b5ab6dfafd6e2ab1bd583a95d9c895ba9c2422c24ea0f62961f0dca45cad47bfa0d" }, { "input_len": 5121, "hash": 
"628bd2cb2004694adaab7bbd778a25df25c47b9d4155a55f8fbd79f2fe154cff96adaab0613a6146cdaabe498c3a94e529d3fc1da2bd08edf54ed64d40dcd6777647eac51d8277d70219a9694334a68bc8f0f23e20b0ff70ada6f844542dfa32cd4204ca1846ef76d811cdb296f65e260227f477aa7aa008bac878f72257484f2b6c95", "keyed_hash": "6ccf1c34753e7a044db80798ecd0782a8f76f33563accaddbfbb2e0ea4b2d0240d07e63f13667a8d1490e5e04f13eb617aea16a8c8a5aaed1ef6fbde1b0515e3c81050b361af6ead126032998290b563e3caddeaebfab592e155f2e161fb7cba939092133f23f9e65245e58ec23457b78a2e8a125588aad6e07d7f11a85b88d375b72d", "derive_key": "b07f01e518e702f7ccb44a267e9e112d403a7b3f4883a47ffbed4b48339b3c341a0add0ac032ab5aaea1e4e5b004707ec5681ae0fcbe3796974c0b1cf31a194740c14519273eedaabec832e8a784b6e7cfc2c5952677e6c3f2c3914454082d7eb1ce1766ac7d75a4d3001fc89544dd46b5147382240d689bbbaefc359fb6ae30263165" }, { "input_len": 6144, "hash": "3e2e5b74e048f3add6d21faab3f83aa44d3b2278afb83b80b3c35164ebeca2054d742022da6fdda444ebc384b04a54c3ac5839b49da7d39f6d8a9db03deab32aade156c1c0311e9b3435cde0ddba0dce7b26a376cad121294b689193508dd63151603c6ddb866ad16c2ee41585d1633a2cea093bea714f4c5d6b903522045b20395c83", "keyed_hash": "3d6b6d21281d0ade5b2b016ae4034c5dec10ca7e475f90f76eac7138e9bc8f1dc35754060091dc5caf3efabe0603c60f45e415bb3407db67e6beb3d11cf8e4f7907561f05dace0c15807f4b5f389c841eb114d81a82c02a00b57206b1d11fa6e803486b048a5ce87105a686dee041207e095323dfe172df73deb8c9532066d88f9da7e", "derive_key": "2a95beae63ddce523762355cf4b9c1d8f131465780a391286a5d01abb5683a1597099e3c6488aab6c48f3c15dbe1942d21dbcdc12115d19a8b8465fb54e9053323a9178e4275647f1a9927f6439e52b7031a0b465c861a3fc531527f7758b2b888cf2f20582e9e2c593709c0a44f9c6e0f8b963994882ea4168827823eef1f64169fef" }, { "input_len": 6145, "hash": "f1323a8631446cc50536a9f705ee5cb619424d46887f3c376c695b70e0f0507f18a2cfdd73c6e39dd75ce7c1c6e3ef238fd54465f053b25d21044ccb2093beb015015532b108313b5829c3621ce324b8e14229091b7c93f32db2e4e63126a377d2a63a3597997d4f1cba59309cb4af240ba70cebff9a23d5e3ff0cdae2cfd54e070022", "keyed_hash": 
"9ac301e9e39e45e3250a7e3b3df701aa0fb6889fbd80eeecf28dbc6300fbc539f3c184ca2f59780e27a576c1d1fb9772e99fd17881d02ac7dfd39675aca918453283ed8c3169085ef4a466b91c1649cc341dfdee60e32231fc34c9c4e0b9a2ba87ca8f372589c744c15fd6f985eec15e98136f25beeb4b13c4e43dc84abcc79cd4646c", "derive_key": "379bcc61d0051dd489f686c13de00d5b14c505245103dc040d9e4dd1facab8e5114493d029bdbd295aaa744a59e31f35c7f52dba9c3642f773dd0b4262a9980a2aef811697e1305d37ba9d8b6d850ef07fe41108993180cf779aeece363704c76483458603bbeeb693cffbbe5588d1f3535dcad888893e53d977424bb707201569a8d2" }, { "input_len": 7168, "hash": "61da957ec2499a95d6b8023e2b0e604ec7f6b50e80a9678b89d2628e99ada77a5707c321c83361793b9af62a40f43b523df1c8633cecb4cd14d00bdc79c78fca5165b863893f6d38b02ff7236c5a9a8ad2dba87d24c547cab046c29fc5bc1ed142e1de4763613bb162a5a538e6ef05ed05199d751f9eb58d332791b8d73fb74e4fce95", "keyed_hash": "b42835e40e9d4a7f42ad8cc04f85a963a76e18198377ed84adddeaecacc6f3fca2f01d5277d69bb681c70fa8d36094f73ec06e452c80d2ff2257ed82e7ba348400989a65ee8daa7094ae0933e3d2210ac6395c4af24f91c2b590ef87d7788d7066ea3eaebca4c08a4f14b9a27644f99084c3543711b64a070b94f2c9d1d8a90d035d52", "derive_key": "11c37a112765370c94a51415d0d651190c288566e295d505defdad895dae223730d5a5175a38841693020669c7638f40b9bc1f9f39cf98bda7a5b54ae24218a800a2116b34665aa95d846d97ea988bfcb53dd9c055d588fa21ba78996776ea6c40bc428b53c62b5f3ccf200f647a5aae8067f0ea1976391fcc72af1945100e2a6dcb88" }, { "input_len": 7169, "hash": "a003fc7a51754a9b3c7fae0367ab3d782dccf28855a03d435f8cfe74605e781798a8b20534be1ca9eb2ae2df3fae2ea60e48c6fb0b850b1385b5de0fe460dbe9d9f9b0d8db4435da75c601156df9d047f4ede008732eb17adc05d96180f8a73548522840779e6062d643b79478a6e8dbce68927f36ebf676ffa7d72d5f68f050b119c8", "keyed_hash": "ed9b1a922c046fdb3d423ae34e143b05ca1bf28b710432857bf738bcedbfa5113c9e28d72fcbfc020814ce3f5d4fc867f01c8f5b6caf305b3ea8a8ba2da3ab69fabcb438f19ff11f5378ad4484d75c478de425fb8e6ee809b54eec9bdb184315dc856617c09f5340451bf42fd3270a7b0b6566169f242e533777604c118a6358250f54", "derive_key": 
"554b0a5efea9ef183f2f9b931b7497995d9eb26f5c5c6dad2b97d62fc5ac31d99b20652c016d88ba2a611bbd761668d5eda3e568e940faae24b0d9991c3bd25a65f770b89fdcadabcb3d1a9c1cb63e69721cacf1ae69fefdcef1e3ef41bc5312ccc17222199e47a26552c6adc460cf47a72319cb5039369d0060eaea59d6c65130f1dd" }, { "input_len": 8192, "hash": "aae792484c8efe4f19e2ca7d371d8c467ffb10748d8a5a1ae579948f718a2a635fe51a27db045a567c1ad51be5aa34c01c6651c4d9b5b5ac5d0fd58cf18dd61a47778566b797a8c67df7b1d60b97b19288d2d877bb2df417ace009dcb0241ca1257d62712b6a4043b4ff33f690d849da91ea3bf711ed583cb7b7a7da2839ba71309bbf", "keyed_hash": "dc9637c8845a770b4cbf76b8daec0eebf7dc2eac11498517f08d44c8fc00d58a4834464159dcbc12a0ba0c6d6eb41bac0ed6585cabfe0aca36a375e6c5480c22afdc40785c170f5a6b8a1107dbee282318d00d915ac9ed1143ad40765ec120042ee121cd2baa36250c618adaf9e27260fda2f94dea8fb6f08c04f8f10c78292aa46102", "derive_key": "ad01d7ae4ad059b0d33baa3c01319dcf8088094d0359e5fd45d6aeaa8b2d0c3d4c9e58958553513b67f84f8eac653aeeb02ae1d5672dcecf91cd9985a0e67f4501910ecba25555395427ccc7241d70dc21c190e2aadee875e5aae6bf1912837e53411dabf7a56cbf8e4fb780432b0d7fe6cec45024a0788cf5874616407757e9e6bef7" }, { "input_len": 8193, "hash": "bab6c09cb8ce8cf459261398d2e7aef35700bf488116ceb94a36d0f5f1b7bc3bb2282aa69be089359ea1154b9a9286c4a56af4de975a9aa4a5c497654914d279bea60bb6d2cf7225a2fa0ff5ef56bbe4b149f3ed15860f78b4e2ad04e158e375c1e0c0b551cd7dfc82f1b155c11b6b3ed51ec9edb30d133653bb5709d1dbd55f4e1ff6", "keyed_hash": "954a2a75420c8d6547e3ba5b98d963e6fa6491addc8c023189cc519821b4a1f5f03228648fd983aef045c2fa8290934b0866b615f585149587dda2299039965328835a2b18f1d63b7e300fc76ff260b571839fe44876a4eae66cbac8c67694411ed7e09df51068a22c6e67d6d3dd2cca8ff12e3275384006c80f4db68023f24eebba57", "derive_key": "af1e0346e389b17c23200270a64aa4e1ead98c61695d917de7d5b00491c9b0f12f20a01d6d622edf3de026a4db4e4526225debb93c1237934d71c7340bb5916158cbdafe9ac3225476b6ab57a12357db3abbad7a26c6e66290e44034fb08a20a8d0ec264f309994d2810c49cfba6989d7abb095897459f5425adb48aba07c5fb3c83c0" }, { "input_len": 
16384, "hash": "f875d6646de28985646f34ee13be9a576fd515f76b5b0a26bb324735041ddde49d764c270176e53e97bdffa58d549073f2c660be0e81293767ed4e4929f9ad34bbb39a529334c57c4a381ffd2a6d4bfdbf1482651b172aa883cc13408fa67758a3e47503f93f87720a3177325f7823251b85275f64636a8f1d599c2e49722f42e93893", "keyed_hash": "9e9fc4eb7cf081ea7c47d1807790ed211bfec56aa25bb7037784c13c4b707b0df9e601b101e4cf63a404dfe50f2e1865bb12edc8fca166579ce0c70dba5a5c0fc960ad6f3772183416a00bd29d4c6e651ea7620bb100c9449858bf14e1ddc9ecd35725581ca5b9160de04060045993d972571c3e8f71e9d0496bfa744656861b169d65", "derive_key": "160e18b5878cd0df1c3af85eb25a0db5344d43a6fbd7a8ef4ed98d0714c3f7e160dc0b1f09caa35f2f417b9ef309dfe5ebd67f4c9507995a531374d099cf8ae317542e885ec6f589378864d3ea98716b3bbb65ef4ab5e0ab5bb298a501f19a41ec19af84a5e6b428ecd813b1a47ed91c9657c3fba11c406bc316768b58f6802c9e9b57" }, { "input_len": 31744, "hash": "62b6960e1a44bcc1eb1a611a8d6235b6b4b78f32e7abc4fb4c6cdcce94895c47860cc51f2b0c28a7b77304bd55fe73af663c02d3f52ea053ba43431ca5bab7bfea2f5e9d7121770d88f70ae9649ea713087d1914f7f312147e247f87eb2d4ffef0ac978bf7b6579d57d533355aa20b8b77b13fd09748728a5cc327a8ec470f4013226f", "keyed_hash": "efa53b389ab67c593dba624d898d0f7353ab99e4ac9d42302ee64cbf9939a4193a7258db2d9cd32a7a3ecfce46144114b15c2fcb68a618a976bd74515d47be08b628be420b5e830fade7c080e351a076fbc38641ad80c736c8a18fe3c66ce12f95c61c2462a9770d60d0f77115bbcd3782b593016a4e728d4c06cee4505cb0c08a42ec", "derive_key": "39772aef80e0ebe60596361e45b061e8f417429d529171b6764468c22928e28e9759adeb797a3fbf771b1bcea30150a020e317982bf0d6e7d14dd9f064bc11025c25f31e81bd78a921db0174f03dd481d30e93fd8e90f8b2fee209f849f2d2a52f31719a490fb0ba7aea1e09814ee912eba111a9fde9d5c274185f7bae8ba85d300a2b" }, { "input_len": 100000, "hash": "d93c23eedaf165a7e0be908ba86f1a7a520d568d2d13cde787c8580c5c72cc54902b765d0e69ff7f278ef2f8bb839b673f0db20afa0566c78965ad819674822fd11a507251555fc6daec7437074bc7b7307dfe122411b3676a932b5b0360d5ad495f8e7431d3d025fac5b4e955ce893a3504f2569f838eea47cf1bb21c4ae659db522f", 
"keyed_hash": "74c836d008247adebbc032d1bced2e71d19050b5c39fa03c43d4160ad8d170732f3b73e374a4500825c13d2c8c9384ce12c033adc49245ce42f50d5b48237397b8447bd414b0693bef98518db8a3494e6e8e3abc931f92f472d938f07eac97d1cc69b375426bce26c5e829b5b41cacbb5543544977749d503fa78309e7a158640e579c", "derive_key": "039c0c0d76eacefea9c8d042698bd012d3cef4091ed5c5a7e32a30e4d51718930a99481bb11214d9e9e79e58d11875a789447731a887aa77499843148d35b1752c6314af6d36559341bd6895c5ee0a452c99cb47a9b22dfe36042932fc9a423d245b91b6246c85e4b0d415cbece3e0545d6e242853da7f3dd1f9b0f146ec72706b8c28" } ] }