pax_global_header00006660000000000000000000000064132140631640014512gustar00rootroot0000000000000052 comment=d88e6fde102c5fea622247b3441ad07988aae980 intel-ipsec-mb-0.48/000077500000000000000000000000001321406316400142555ustar00rootroot00000000000000intel-ipsec-mb-0.48/.gitignore000066400000000000000000000000541321406316400162440ustar00rootroot00000000000000*.o *.so *.a *~ ipsec_MB_testapp ipsec_perf intel-ipsec-mb-0.48/LICENSE000066400000000000000000000027411321406316400152660ustar00rootroot00000000000000Copyright (c) 2012-2017, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. intel-ipsec-mb-0.48/LibPerfApp/000077500000000000000000000000001321406316400162415ustar00rootroot00000000000000intel-ipsec-mb-0.48/LibPerfApp/Makefile000077500000000000000000000054371321406316400177150ustar00rootroot00000000000000# Copyright (c) 2017, Intel Corporation # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of Intel Corporation nor the names of its contributors # may be used to endorse or promote products derived from this software # without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. EXE=ipsec_perf LIBDIR ?= ../ LDFLAGS = -L$(LIBDIR) -fPIE -z noexecstack -z relro -z now -pthread LDLIBS = -lIPSec_MB INCLUDE_DIRS := $(LIBDIR) $(LIBDIR)/include INCLUDES := $(foreach i,$(INCLUDE_DIRS),-I$i) CFLAGS := -DLINUX $(INCLUDES) \ -W -Wall -Wextra -Wmissing-declarations -Wpointer-arith \ -Wcast-qual -Wundef -Wwrite-strings \ -Wformat -Wformat-security \ -Wunreachable-code -Wmissing-noreturn -Wsign-compare -Wno-endif-labels \ -Wstrict-prototypes -Wmissing-prototypes -Wold-style-definition \ -pthread ifeq ($(DEBUG),y) CFLAGS += -g -DDEBUG -O0 LDFLAGS += -g else CFLAGS += -O3 -fPIE -fstack-protector -D_FORTIFY_SOURCE=2 endif SOURCES := ipsec_perf.c OBJECTS := $(SOURCES:%.c=%.o) CHECKPATCH ?= checkpatch.pl CPPCHECK ?= cppcheck .PHONY: all clean style cppcheck all: $(EXE) $(EXE): $(OBJECTS) $(CC) $(LDFLAGS) $^ $(LDLIBS) -o $@ ipsec_perf.o: ipsec_perf.c .PHONY: clean clean: -rm -f $(OBJECTS) -rm -f $(EXE) SOURCES_STYLE := $(foreach infile,$(SOURCES),-f $(infile)) CHECKPATCH?=checkpatch.pl .PHONY: style style: $(CHECKPATCH) --no-tree --no-signoff --emacs --no-color \ --ignore CODE_INDENT,INITIALISED_STATIC,LEADING_SPACE,SPLIT_STRING,\ UNSPECIFIED_INT,ARRAY_SIZE,BLOCK_COMMENT_STYLE,GLOBAL_INITIALISERS,\ COMPLEX_MACRO,SPACING,STORAGE_CLASS $(SOURCES_STYLE) intel-ipsec-mb-0.48/LibPerfApp/README ======================================================================== README for Intel(R) Multi-Buffer Crypto for IPsec Library API performance measurement tool February 2017 ======================================================================== Contents ======== - Overview - Files - Compilation - Usage - Legal Disclaimer Overview ======== This test tool performs multiple executions of functions included in the Intel Multi-Buffer Crypto for IPsec Library. Files ===== ipsec_perf.c - Tool which produces text-formatted output representing average execution times of ipsec_mb functions. ipsec_diff_tool.py - Tool which interprets the text data produced by ipsec_perf. Compilation =========== Required tools: - GNU make - gcc (GCC) 4.8.3 (or newer) Simply run "make" to compile the tool. To clean the build please run "make clean". You can point to another directory containing the IPSec MB library by setting LIBDIR. for ex: LIBDIR=../ipsec_mb_lib make In order to perform static code analysis or a style check you can do: make cppcheck or make style Be aware that you will need to have the cppcheck tool installed and the checkpatch.pl script copied into one of the directories listed in $PATH. You can also set the CPPCHECK and/or CHECKPATCH variables if you want to give paths to these tools placed in different directories. 
for ex: CPPCHECK=~/tools/cppcheck make cppcheck CHECKPATCH=~/scripts/checkpatch.pl make style Usage ===== You can simply check list of arguments by typing: ./ipsec_perf -h Usage example: ./ipsec_perf -c --no-avx512 --no-gcm -o 24 Later you can pass output to ipsec_diff_tool.py for data analysis: ./ipsec_diff_tool.py out1.txt out2.txt 5 Run ipsec_diff_tool.py -h too see help page. Legal Disclaimer ================ THIS SOFTWARE IS PROVIDED BY INTEL"AS IS". NO LICENSE, EXPRESS OR IMPLIED, BY ESTOPPEL OR OTHERWISE, TO ANY INTELLECTUAL PROPERTY RIGHTS ARE GRANTED THROUGH USE. EXCEPT AS PROVIDED IN INTEL'S TERMS AND CONDITIONS OF SALE, INTEL ASSUMES NO LIABILITY WHATSOEVER AND INTEL DISCLAIMS ANY EXPRESS OR IMPLIED WARRANTY, RELATING TO SALE AND/OR USE OF INTEL PRODUCTS INCLUDING LIABILITY OR WARRANTIES RELATING TO FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABILITY, OR INFRINGEMENT OF ANY PATENT, COPYRIGHT OR OTHER INTELLECTUAL PROPERTY RIGHT. intel-ipsec-mb-0.48/LibPerfApp/ipsec_diff_tool.py000077500000000000000000000247611321406316400217600ustar00rootroot00000000000000#!/usr/bin/env python """ ********************************************************************** Copyright(c) 2017 Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ********************************************************************** """ import sys # Number of parameters (ARCH, CIPHER_MODE, DIR, HASH_ALG, KEY_SIZE) PAR_NUM = 5 class Variant(object): """ Class representing one test including chosen parameters and results of average execution times """ def __init__(self, **args): self.params = (args['arch'], args['cipher'], args['dir'], args['alg'], args['keysize']) self.avg_times = [] self.slope = None self.intercept = None def set_times(self, avg_times): """ Fills test execution time list """ self.avg_times = avg_times def lin_reg(self, sizes): """ Computes linear regression of set of coordinates (x,y) """ n = len(sizes) if n != len(self.avg_times): print "Error!" 
return None sumx = sum(sizes) sumy = sum(self.avg_times) sumxy = sum([x * y for x, y in zip(sizes, self.avg_times)]) sumsqrx = sum([pow(x, 2) for x in sizes]) self.slope = (n * sumxy - sumx * sumy) / float(n * sumsqrx - pow(sumx, 2)) self.intercept = (sumy - self.slope * sumx) / float(n) def get_params_str(self): """ Returns all parameters concatenated into one string """ return "\t".join(i for i in self.params) def get_lin_func_str(self): """ Returns string having linear coefficients """ slope = "{:.5f}".format(self.slope) intercept = "{:.5f}".format(self.intercept) return "{}\t{}".format(slope, intercept) class VarList(list): """ Class used to store all test variants as a list of objects """ def find_obj(self, params): """ Finds the first occurrence of an object containing the given parameters """ ret_val = None matches = (obj for obj in self if obj.params == params) try: ret_val = next(matches) except StopIteration: pass return ret_val def compare(self, list_b, tolerance): """ Finds matching variants from two data sets and compares their linear regression coefficients. Compares list_b against this list. """ if tolerance is None: tolerance = 5.0 if tolerance < 0.0: print "Bad argument: Tolerance must not be less than 0%" exit(1) print "TOLERANCE: {:.2f}%".format(tolerance) warning = False print "NO\tARCH\tCIPHER\tDIR\tHASH\tKEYSZ\tSLOPE A\tINTERCEPT A\tSLOPE B\tINTERCEPT B" for i, obj_a in enumerate(self): obj_b = list_b.find_obj(obj_a.params) if obj_b != None: if obj_a.slope < 0.0: obj_a.slope = 0 if obj_b.slope < 0.0: obj_b.slope = 0 slope_bv = 0.01 * tolerance * obj_a.slope # border value intercept_bv = 0.01 * tolerance * obj_a.intercept diff_slope = obj_b.slope - obj_a.slope diff_intercept = obj_b.intercept - obj_a.intercept if (obj_a.slope > 0.001 and obj_b.slope > 0.001 and diff_slope > slope_bv) or diff_intercept > intercept_bv: warning = True print "{}\t{}\t{}\t{}".format(i + 1, obj_b.get_params_str(), obj_a.get_lin_func_str(), obj_b.get_lin_func_str()) if not warning: print "No differences found." 
return warning def printout(self): """ Prints out readable representation of the list """ print "NO\tARCH\tCIPHER\tDIR\tHASH\tKEYSZ\tSLOPE \tINTERCEPT" for i, obj in enumerate(self): print "{}\t{}\t{}".format(i + 1, obj.get_params_str(), obj.get_lin_func_str()) class Parser(object): """ Class used to parse a text file containing performance data """ def __init__(self, fname, verbose): self.fname = fname self.verbose = verbose @staticmethod def convert2int(in_tuple): """ Converts a tuple of strings into a list of integers """ result = list(in_tuple) # Converting to list result = [int(i) for i in result] # Converting str to int return result def load(self): """ Reads a text file by columns, stores data in objects for further comparison of performance """ v_list = VarList() # Reading by columns, results in list of tuples # Each tuple represents a column from the text file try: f = open(self.fname, 'r') except IOError: print "Error reading {} file.".format(self.fname) exit(1) else: with f: cols = zip(*(line.strip().split('\t') for line in f)) # Reading first column with payload sizes, omitting first 5 rows sizes = self.convert2int(cols[0][PAR_NUM:]) if self.verbose: print "Available buffer sizes:\n" print sizes print "========================================================" print "\n\nVariants:\n" # Reading remaining columns containing performance data for row in cols[1:]: # First rows are run options arch, c_mode, c_dir, h_alg, key_size = row[:PAR_NUM] if self.verbose: print arch, c_mode, c_dir, h_alg, key_size # Getting average times avg_times = self.convert2int(row[PAR_NUM:]) if self.verbose: print avg_times print "------" # Putting new object into the result list v_list.append(Variant(arch=arch, cipher=c_mode, dir=c_dir, alg=h_alg, keysize=key_size)) v_list[-1].set_times(avg_times) # Finding linear function representation of data set v_list[-1].lin_reg(sizes) if self.verbose: print "({}, {})".format(v_list[-1].slope, v_list[-1].intercept) print "============\n" return v_list, sizes class DiffTool(object): """ Main class """ def __init__(self): self.fname_a = None self.fname_b = None self.tolerance = None self.verbose = False self.analyze = False @staticmethod def usage(): """ Prints usage """ print "This tool compares file_b against file_a printing out differences." 
print "Usage:" print "\tipsec_diff_tool.py [-v] [-a] file_a file_b [tol]\n" print "\t-v - verbose" print "\t-a - takes only one argument: name of the file to analyze" print "\tfile_a, file_b - text files containing output from ipsec_perf tool" print "\ttol - tolerance [%], must be >= 0, default 5\n" print "Examples:" print "\tipsec_diff_tool.py file01.txt file02.txt 10" print "\tipsec_diff_tool.py -a file02.txt" print "\tipsec_diff_tool.py -v -a file01.txt" def parse_args(self): """ Get commandline arguments """ if len(sys.argv) < 3 or sys.argv[1] == "-h": self.usage() exit(1) if sys.argv[1] == "-a": self.analyze = True self.fname_a = sys.argv[2] elif sys.argv[2] == "-a": if sys.argv[1] == "-v": self.verbose = True self.analyze = True self.fname_a = sys.argv[3] elif sys.argv[1] == "-v": self.verbose = True self.fname_a = sys.argv[2] self.fname_b = sys.argv[3] if len(sys.argv) >= 5: self.tolerance = float(sys.argv[4]) else: self.fname_a = sys.argv[1] self.fname_b = sys.argv[2] if len(sys.argv) >= 4: self.tolerance = float(sys.argv[3]) def run(self): """ Main method """ self.parse_args() parser_a = Parser(self.fname_a, self.verbose) list_a, sizes_a = parser_a.load() if not self.analyze: parser_b = Parser(self.fname_b, self.verbose) list_b, sizes_b = parser_b.load() if sizes_a != sizes_b: print "Error. Buffer size lists in two compared " \ "data sets differ! Aborting.\n" exit(1) warning = list_a.compare(list_b, self.tolerance) # Compares list_b against list_a if warning: exit(2) else: list_a.printout() # Takes only one file and prints it out if __name__ == '__main__': DiffTool().run() intel-ipsec-mb-0.48/LibPerfApp/ipsec_perf.c000077500000000000000000001003451321406316400205320ustar00rootroot00000000000000/********************************************************************** Copyright(c) 2017 Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************/ #include #include #include #ifdef _WIN32 #include #include #include #define __forceinline static __forceinline #else #include #define __forceinline static inline __attribute__((always_inline)) #include #endif #include "mb_mgr.h" #include "gcm_defines.h" #define BUFSIZE (512 * 1024 * 1024) #define JOB_SIZE (2 * 1024) #define JOB_SIZE_STEP 16 #define REGION_SIZE (JOB_SIZE + 3003) #define NUM_OFFSETS (BUFSIZE / REGION_SIZE) #define NUM_RUNS 16 #define KEYS_PER_JOB 15 #define ITER_SCALE 200000 #define NUM_ARCHS 4 /* SSE, AVX, AVX2, AVX512 */ #define NUM_TYPES 4 /* AES_HMAC, AES_DOCSIS, AES_GCM, AES_CCM */ #define MAX_NUM_THREADS 16 /* Maximum number of threads that can be created */ #define CIPHER_MODES_AES 4 /* CBC, CNTR, CNTR+8, NULL_CIPHER */ #define CIPHER_MODES_DOCSIS 4 /* AES DOCSIS, AES DOCSIS+8, DES DOCSIS, DES DOCSIS+8 */ #define CIPHER_MODES_GCM 1 /* GCM */ #define CIPHER_MODES_CCM 1 /* CCM */ #define DIRECTIONS 2 /* ENC, DEC */ #define HASH_ALGS_AES 8 /* SHA1, SHA256, SHA224, SHA384, SHA512, XCBC, MD5, NULL_HASH */ #define HASH_ALGS_DOCSIS 1 /* NULL_HASH */ #define HASH_ALGS_GCM 1 /* GCM */ #define HASH_ALGS_CCM 1 /* CCM */ #define KEY_SIZES_AES 3 /* 16, 24, 32 */ #define KEY_SIZES_DOCSIS 1 /* 16 or 8 */ #define KEY_SIZES_GCM 3 /* 16, 24, 32 */ #define KEY_SIZES_CCM 1 /* 16 */ /* Those defines tell how many different test cases are to be performed. * Have to be multiplied by number of chosen architectures. */ #define VARIANTS_PER_ARCH_AES (CIPHER_MODES_AES * DIRECTIONS * \ HASH_ALGS_AES * KEY_SIZES_AES) #define VARIANTS_PER_ARCH_DOCSIS (CIPHER_MODES_DOCSIS * DIRECTIONS * \ HASH_ALGS_DOCSIS * KEY_SIZES_DOCSIS) #define VARIANTS_PER_ARCH_GCM (CIPHER_MODES_GCM * DIRECTIONS * \ HASH_ALGS_GCM * KEY_SIZES_GCM) #define VARIANTS_PER_ARCH_CCM (CIPHER_MODES_CCM * DIRECTIONS * \ HASH_ALGS_CCM * KEY_SIZES_CCM) /* Typedefs used for GCM callbacks */ typedef void (*aesni_gcm_t)(const struct gcm_key_data *, struct gcm_context_data *, uint8_t *, const uint8_t *, uint64_t, const uint8_t *, const uint8_t *, uint64_t, uint8_t *, uint64_t); typedef void (*aesni_gcm_pre_t)(const void *, struct gcm_key_data *); /* AES_HMAC, DOCSIS callbacks */ struct funcs_s { init_mb_mgr_t init_mb_mgr; get_next_job_t get_next_job; submit_job_t submit_job; get_completed_job_t get_completed_job; flush_job_t flush_job; }; /* GCM callbacks */ struct funcs_gcm_s { aesni_gcm_pre_t aesni_gcm_pre; aesni_gcm_t aesni_gcm_enc; aesni_gcm_t aesni_gcm_dec; }; enum arch_type_e { ARCH_SSE = 0, ARCH_AVX, ARCH_AVX2, ARCH_AVX512 }; enum test_type_e { TTYPE_AES_HMAC, TTYPE_AES_DOCSIS, TTYPE_AES_GCM, TTYPE_AES_CCM }; /* This enum will be mostly translated to JOB_CIPHER_MODE */ enum test_cipher_mode_e { TEST_CBC = 1, TEST_CNTR, TEST_CNTR8, /* CNTR with increased buffer by 8 */ TEST_NULL_CIPHER, TEST_AESDOCSIS, TEST_AESDOCSIS8, /* AES DOCSIS with increased buffer size by 8 */ TEST_DESDOCSIS, TEST_DESDOCSIS4, /* DES DOCSIS with increased buffer size by 4 */ TEST_GCM, /* Additional field used by GCM, not translated */ TEST_CCM }; /* This enum will be mostly translated to JOB_HASH_ALG */ enum test_hash_alg_e { TEST_SHA1 = 1, TEST_SHA_224, TEST_SHA_256, TEST_SHA_384, TEST_SHA_512, TEST_XCBC, TEST_MD5, TEST_NULL_HASH, TEST_HASH_GCM, /* Additional field used by GCM, not translated */ TEST_CUSTOM_HASH, /* unused */ TEST_HASH_CCM }; /* Struct storing cipher parameters */ struct params_s { JOB_CIPHER_DIRECTION cipher_dir; enum test_type_e test_type; /* AES, DOCSIS, GCM */ enum 
test_cipher_mode_e cipher_mode; enum test_hash_alg_e hash_alg; uint32_t aes_key_size; uint32_t size_aes; uint32_t num_sizes; uint32_t num_variants; }; /* This struct stores all information about performed test case */ struct variant_s { uint32_t arch; struct params_s params; uint64_t *avg_times; }; enum cache_type_e { WARM = 0, COLD = 1 }; #ifdef DEBUG #define FUNCS(A) { \ init_mb_mgr_##A, \ get_next_job_##A, \ submit_job_##A, \ get_completed_job_##A, \ flush_job_##A \ } #else #define FUNCS(A) { \ init_mb_mgr_##A, \ get_next_job_##A, \ submit_job_nocheck_##A, \ get_completed_job_##A, \ flush_job_##A \ } #endif #define FUNCS_GCM(A) \ {aes_gcm_pre_128_##A, aes_gcm_enc_128_##A, aes_gcm_dec_128_##A}, \ {aes_gcm_pre_192_##A, aes_gcm_enc_192_##A, aes_gcm_dec_192_##A}, \ {aes_gcm_pre_256_##A, aes_gcm_enc_256_##A, aes_gcm_dec_256_##A} /* Function pointers used by TTYPE_AES_HMAC, TTYPE_AES_DOCSIS */ struct funcs_s func_sets[NUM_ARCHS] = { FUNCS(sse), FUNCS(avx), FUNCS(avx2), FUNCS(avx512) }; /* Function pointers used by TTYPE_AES_GCM */ struct funcs_gcm_s func_sets_gcm[NUM_ARCHS - 1][3] = { {FUNCS_GCM(sse)}, {FUNCS_GCM(avx_gen2)}, /* AVX */ {FUNCS_GCM(avx_gen4)} /* AVX2 */ }; enum cache_type_e cache_type = WARM; /* SHA1, SHA224, SHA256, SHA384, SHA512, XCBC, MD5, NULL, GMAC, CUSTOM, CCM */ const uint32_t auth_tag_length_bytes[11] = { 12, 14, 16, 24, 32, 12, 12, 0, 8, 0, 16 }; uint8_t *buf = NULL; uint32_t index_limit; uint128_t *keys = NULL; uint64_t *offset_ptr = NULL; uint32_t key_idxs[NUM_OFFSETS]; uint32_t offsets[NUM_OFFSETS]; int sha_size_incr = 24; uint8_t archs[NUM_ARCHS] = {1, 1, 1, 1}; /* uses all function sets */ uint8_t test_types[NUM_TYPES] = {1, 1, 1, 1}; /* AES, DOCSIS, GCM, CCM */ int use_gcm_job_api = 0; /* Those inline functions run different types of ipsec_mb library functions. 
* They run different functions depending on the chosen architecture */ __forceinline void init_mb_mgr(MB_MGR *mgr, uint32_t arch) { func_sets[arch].init_mb_mgr(mgr); } __forceinline JOB_AES_HMAC *get_next_job(MB_MGR *mgr, const uint32_t arch) { return func_sets[arch].get_next_job(mgr); } __forceinline JOB_AES_HMAC *submit_job(MB_MGR *mgr, const uint32_t arch) { return func_sets[arch].submit_job(mgr); } __forceinline JOB_AES_HMAC *get_completed_job(MB_MGR *mgr, const uint32_t arch) { return func_sets[arch].get_completed_job(mgr); } __forceinline JOB_AES_HMAC *flush_job(MB_MGR *mgr, const uint32_t arch) { return func_sets[arch].flush_job(mgr); } /* GCM functions take also key size argument (128, 192, 256bit) */ __forceinline void aesni_gcm_pre(const uint32_t arch, const uint8_t key_sz, uint8_t *key, struct gcm_key_data *gdata) { func_sets_gcm[arch][key_sz].aesni_gcm_pre(key, gdata); } __forceinline void aesni_gcm_enc(const uint32_t arch, const uint8_t key_sz, const struct gcm_key_data *gdata, struct gcm_context_data *ctx, uint8_t *out, uint8_t const *in, uint64_t len, uint8_t *iv, uint8_t const *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len) { func_sets_gcm[arch][key_sz].aesni_gcm_enc(gdata, ctx, out, in, len, iv, aad, aad_len, auth_tag, auth_tag_len); } __forceinline void aesni_gcm_dec(const uint32_t arch, const uint8_t key_sz, const struct gcm_key_data *gdata, struct gcm_context_data *ctx, uint8_t *out, uint8_t const *in, uint64_t len, uint8_t *iv, uint8_t const *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len) { func_sets_gcm[arch][key_sz].aesni_gcm_dec(gdata, ctx, out, in, len, iv, aad, aad_len, auth_tag, auth_tag_len); } /* Freeing allocated memory */ static void free_mem(void) { if (offset_ptr != NULL) free(offset_ptr); if (buf != NULL) free(buf); } /* Input buffer initialization */ static void init_buf(enum cache_type_e ctype) { uint32_t tmp_off; uint64_t offset; int i; buf = (uint8_t *) malloc(BUFSIZE + REGION_SIZE); if (!buf) { fprintf(stderr, "Could not malloc buf\n"); exit(EXIT_FAILURE); } offset_ptr = (uint64_t *) malloc(NUM_OFFSETS * KEYS_PER_JOB * sizeof(uint128_t) + 0x0F); if (!offset_ptr) { fprintf(stderr, "Could not malloc keys\n"); free_mem(); exit(EXIT_FAILURE); } offset = (uint64_t) offset_ptr; keys = (uint128_t *) ((offset + 0x0F) & ~0x0F); /* align to 16 bytes */ if (ctype == COLD) { for (i = 0; i < NUM_OFFSETS; i++) { offsets[i] = i * REGION_SIZE + (rand() & 0x3F0); key_idxs[i] = i * KEYS_PER_JOB; } for (i = NUM_OFFSETS - 1; i >= 0; i--) { offset = rand(); offset *= i; offset /= RAND_MAX; tmp_off = offsets[offset]; offsets[offset] = offsets[i]; offsets[i] = tmp_off; tmp_off = key_idxs[offset]; key_idxs[offset] = key_idxs[i]; key_idxs[i] = tmp_off; } index_limit = NUM_OFFSETS; } else {/* WARM */ for (i = 0; i < NUM_OFFSETS; i += 2) { offsets[i] = (2 * i + 0) * REGION_SIZE + (rand() & 0x3F0); offsets[i + 1] = (2 * i + 1) * REGION_SIZE + (rand() & 0x3F0); key_idxs[i] = (2 * i + 0) * KEYS_PER_JOB; } index_limit = 8; } } /* This function translates enum test_ciper_mode_e to be used by ipsec_mb * library */ static JOB_CIPHER_MODE translate_cipher_mode(enum test_cipher_mode_e test_mode) { JOB_CIPHER_MODE c_mode = NULL_CIPHER; switch (test_mode) { case TEST_CBC: c_mode = CBC; break; case TEST_CNTR: case TEST_CNTR8: c_mode = CNTR; break; case TEST_NULL_CIPHER: c_mode = NULL_CIPHER; break; case TEST_AESDOCSIS: case TEST_AESDOCSIS8: c_mode = DOCSIS_SEC_BPI; break; case TEST_DESDOCSIS: case TEST_DESDOCSIS4: c_mode = DOCSIS_DES; break; case 
TEST_GCM: c_mode = GCM; break; case TEST_CCM: c_mode = CCM; break; default: break; } return c_mode; } /* Performs test using AES_HMAC or DOCSIS */ static uint64_t do_test(const uint32_t arch, MB_MGR *mb_mgr, struct params_s *params, const uint32_t num_iter) { JOB_AES_HMAC *job; JOB_AES_HMAC job_template; uint32_t i; static uint32_t index = 0; static DECLARE_ALIGNED(uint128_t iv, 16); static uint32_t ipad[5], opad[5], digest[3]; static DECLARE_ALIGNED(uint32_t k1_expanded[11 * 4], 16); static DECLARE_ALIGNED(uint8_t k2[16], 16); static DECLARE_ALIGNED(uint8_t k3[16], 16); static DECLARE_ALIGNED(struct gcm_key_data gdata_key, 16); uint32_t size_aes; uint64_t time = 0; uint32_t aux; if ((params->cipher_mode == TEST_AESDOCSIS8) || (params->cipher_mode == TEST_CNTR8)) size_aes = params->size_aes + 8; else if (params->cipher_mode == TEST_DESDOCSIS4) size_aes = params->size_aes + 4; else size_aes = params->size_aes; job_template.msg_len_to_cipher_in_bytes = size_aes; job_template.msg_len_to_hash_in_bytes = size_aes + sha_size_incr; job_template.hash_start_src_offset_in_bytes = 0; job_template.cipher_start_src_offset_in_bytes = sha_size_incr; job_template.iv = (uint8_t *) &iv; job_template.iv_len_in_bytes = 16; job_template.auth_tag_output = (uint8_t *) digest; switch (params->hash_alg) { case TEST_XCBC: job_template._k1_expanded = k1_expanded; job_template._k2 = k2; job_template._k3 = k3; break; case TEST_HASH_CCM: break; case TEST_HASH_GCM: break; case TEST_NULL_HASH: break; default: /* hash alg is SHA1 or MD5 */ job_template.hashed_auth_key_xor_ipad = (uint8_t *) ipad; job_template.hashed_auth_key_xor_opad = (uint8_t *) opad; break; } job_template.cipher_direction = params->cipher_dir; if (params->cipher_mode == TEST_NULL_CIPHER) { job_template.chain_order = HASH_CIPHER; } else { if (job_template.cipher_direction == ENCRYPT) job_template.chain_order = CIPHER_HASH; else job_template.chain_order = HASH_CIPHER; } /* Translating enum to the API's one */ job_template.cipher_mode = translate_cipher_mode(params->cipher_mode); job_template.aes_key_len_in_bytes = params->aes_key_size; if (job_template.cipher_mode == GCM) { uint8_t key[32]; aesni_gcm_pre(arch, (params->aes_key_size / 8) - 2, key, &gdata_key); job_template.aes_enc_key_expanded = &gdata_key; job_template.aes_dec_key_expanded = &gdata_key; job_template.u.GCM.aad_len_in_bytes = 12; job_template.iv_len_in_bytes = 12; } else if (job_template.cipher_mode == CCM) { job_template.msg_len_to_cipher_in_bytes = size_aes; job_template.msg_len_to_hash_in_bytes = size_aes; job_template.hash_start_src_offset_in_bytes = 0; job_template.cipher_start_src_offset_in_bytes = 0; job_template.u.CCM.aad_len_in_bytes = 8; job_template.iv_len_in_bytes = 13; } else if (job_template.cipher_mode == DES || job_template.cipher_mode == DOCSIS_DES) { job_template.aes_key_len_in_bytes = 8; job_template.iv_len_in_bytes = 8; } job_template.hash_alg = (JOB_HASH_ALG) params->hash_alg; job_template.auth_tag_output_len_in_bytes = (uint64_t) auth_tag_length_bytes[job_template.hash_alg - 1]; time = __rdtscp(&aux); for (i = 0; i < num_iter; i++) { job = get_next_job(mb_mgr, arch); *job = job_template; job->src = buf + offsets[index]; job->dst = buf + offsets[index] + sha_size_incr; if (job->cipher_mode == GCM) { job->u.GCM.aad = job->src; } else if (job->cipher_mode == CCM) { job->u.CCM.aad = job->src; job->aes_enc_key_expanded = job->aes_dec_key_expanded = (uint32_t *) &keys[key_idxs[index]]; } else { job->aes_enc_key_expanded = job->aes_dec_key_expanded = (uint32_t *) 
&keys[key_idxs[index]]; } index += 2; if (index >= index_limit) index = 0; job = submit_job(mb_mgr, arch); while (job) { #ifdef DEBUG if (job->status != STS_COMPLETED) fprintf(stderr, "failed job, status:%d\n", job->status); #endif job = get_completed_job(mb_mgr, arch); } } while ((job = flush_job(mb_mgr, arch))) { #ifdef DEBUG if (job->status != STS_COMPLETED) fprintf(stderr, "failed job, status:%d\n", job->status); #endif } time = __rdtscp(&aux) - time; return time / num_iter; } /* Performs test using GCM */ static uint64_t do_test_gcm(const uint32_t arch, struct params_s *params, const uint32_t num_iter) { struct gcm_key_data gdata_key; struct gcm_context_data gdata_ctx; uint8_t *key; static uint32_t index = 0; uint8_t key_sz = params->aes_key_size / 8 - 2; uint32_t size_aes = params->size_aes; uint32_t i; uint8_t aad[12]; uint8_t auth_tag[12]; DECLARE_ALIGNED(uint8_t iv[16], 16); uint64_t time = 0; uint32_t aux; key = (uint8_t *) malloc(sizeof(uint8_t) * params->aes_key_size); if (!key) { fprintf(stderr, "Could not malloc key\n"); free_mem(); exit(EXIT_FAILURE); } aesni_gcm_pre(arch, key_sz, key, &gdata_key); if (params->cipher_dir == ENCRYPT) { time = __rdtscp(&aux); for (i = 0; i < num_iter; i++) { aesni_gcm_enc(arch, key_sz, &gdata_key, &gdata_ctx, buf + offsets[index] + sha_size_incr, buf + offsets[index] + sha_size_incr, size_aes, iv, aad, sizeof(aad), auth_tag, sizeof(auth_tag)); index += 2; if (index >= index_limit) index = 0; } time = __rdtscp(&aux) - time; } else { /*DECRYPT*/ time = __rdtscp(&aux); for (i = 0; i < num_iter; i++) { aesni_gcm_dec(arch, key_sz, &gdata_key, &gdata_ctx, buf + offsets[index] + sha_size_incr, buf + offsets[index] + sha_size_incr, size_aes, iv, aad, sizeof(aad), auth_tag, sizeof(auth_tag)); index += 2; if (index >= index_limit) index = 0; } time = __rdtscp(&aux) - time; } free(key); return time / num_iter; } /* Method used by qsort to compare 2 values */ static int compare_uint64_t(const void *a, const void *b) { return (int)(int64_t)(*(const uint64_t *)a - *(const uint64_t *)b); } /* Computes mean of set of times after dropping bottom and top quarters */ static uint64_t mean_median(uint64_t *array, uint32_t size) { uint32_t quarter = size / 4; uint32_t i; uint64_t sum; /* these are single threaded runs, so we skip * the hardware thread related skew clipping * thus skipping "ignore first and last eighth" */ /* ignore lowest and highest quarter */ qsort(array, size, sizeof(uint64_t), compare_uint64_t); /* dropping the bottom and top quarters * after sorting to remove noise/variations */ array += quarter; size -= quarter * 2; if ((size == 0) || (size & 0x80000000)) { fprintf(stderr, "not enough data points\n"); free_mem(); exit(EXIT_FAILURE); } sum = 0; for (i = 0; i < size; i++) sum += array[i]; sum = (sum + size / 2) / size; return sum; } /* Runs test for each buffer size and stores averaged execution time */ static void process_variant(MB_MGR *mgr, const uint32_t arch, struct params_s *params, struct variant_s *variant_ptr, const uint32_t run) { const uint32_t sizes = params->num_sizes; uint64_t *times = &variant_ptr->avg_times[run]; uint32_t sz; for (sz = 0; sz < sizes; sz++) { const uint32_t size_aes = (sz + 1) * JOB_SIZE_STEP; const uint32_t num_iter = ITER_SCALE / size_aes; params->size_aes = size_aes; if (params->test_type == TTYPE_AES_GCM && (!use_gcm_job_api)) *times = do_test_gcm(arch, params, 2 * num_iter); else *times = do_test(arch, mgr, params, num_iter); times += NUM_RUNS; } variant_ptr->params = *params; variant_ptr->arch = arch; } /* 
Sets cipher mode, hash algorithm */ static void do_variants(MB_MGR *mgr, const uint32_t arch, struct params_s *params, const uint32_t run, struct variant_s **variant_ptr, uint32_t *variant) { uint32_t hash_alg; uint32_t h_start = TEST_SHA1; uint32_t h_end = TEST_NULL_HASH; uint32_t c_mode; uint32_t c_start = TEST_CBC; uint32_t c_end = TEST_NULL_CIPHER; switch (params->test_type) { case TTYPE_AES_DOCSIS: h_start = TEST_NULL_HASH; c_start = TEST_AESDOCSIS; c_end = TEST_DESDOCSIS4; break; case TTYPE_AES_GCM: h_start = TEST_HASH_GCM; h_end = TEST_HASH_GCM; c_start = TEST_GCM; c_end = TEST_GCM; break; case TTYPE_AES_CCM: h_start = TEST_HASH_CCM; h_end = TEST_HASH_CCM; c_start = TEST_CCM; c_end = TEST_CCM; break; default: break; } for (c_mode = c_start; c_mode <= c_end; c_mode++) { params->cipher_mode = (enum test_cipher_mode_e) c_mode; for (hash_alg = h_start; hash_alg <= h_end; hash_alg++) { params->hash_alg = (enum test_hash_alg_e) hash_alg; process_variant(mgr, arch, params, *variant_ptr, run); (*variant)++; (*variant_ptr)++; } } } /* Sets cipher direction and key size */ static void run_dir_test(MB_MGR *mgr, const uint32_t arch, struct params_s *params, const uint32_t run, struct variant_s **variant_ptr, uint32_t *variant) { uint32_t dir; uint32_t k; /* Key size */ uint32_t limit = AES_256_BYTES; /* Key size value limit */ if (params->test_type == TTYPE_AES_DOCSIS || params->test_type == TTYPE_AES_CCM) limit = AES_128_BYTES; init_mb_mgr(mgr, arch); for (dir = ENCRYPT; dir <= DECRYPT; dir++) { params->cipher_dir = (JOB_CIPHER_DIRECTION) dir; for (k = AES_128_BYTES; k <= limit; k += 8) { params->aes_key_size = k; do_variants(mgr, arch, params, run, variant_ptr, variant); } } } /* Generates output containing averaged times for each test variant */ static void print_times(struct variant_s *variant_list, struct params_s *params, const uint32_t total_variants) { const uint32_t sizes = params->num_sizes; uint32_t col; uint32_t sz; /* Temporary variables */ struct params_s par; uint8_t c_mode; uint8_t c_dir; uint8_t h_alg; const char *func_names[4] = { "SSE", "AVX", "AVX2", "AVX512" }; const char *c_mode_names[10] = { "CBC", "CNTR", "CNTR+8", "NULL_CIPHER", "DOCAES", "DOCAES+8", "DOCDES", "DOCDES+4", "GCM", "CCM" }; const char *c_dir_names[2] = { "ENCRYPT", "DECRYPT" }; const char *h_alg_names[11] = { "SHA1", "SHA_224", "SHA_256", "SHA_384", "SHA_512", "XCBC", "MD5", "NULL_HASH", "GCM", "CUSTOM", "CCM" }; printf("ARCH"); for (col = 0; col < total_variants; col++) printf("\t%s", func_names[variant_list[col].arch]); printf("\n"); printf("CIPHER"); for (col = 0; col < total_variants; col++) { par = variant_list[col].params; c_mode = par.cipher_mode - CBC; printf("\t%s", c_mode_names[c_mode]); } printf("\n"); printf("DIR"); for (col = 0; col < total_variants; col++) { par = variant_list[col].params; c_dir = par.cipher_dir - ENCRYPT; printf("\t%s", c_dir_names[c_dir]); } printf("\n"); printf("HASH_ALG"); for (col = 0; col < total_variants; col++) { par = variant_list[col].params; h_alg = par.hash_alg - SHA1; printf("\t%s", h_alg_names[h_alg]); } printf("\n"); printf("KEY_SIZE"); for (col = 0; col < total_variants; col++) { par = variant_list[col].params; printf("\tAES-%u", par.aes_key_size * 8); } printf("\n"); for (sz = 0; sz < sizes; sz++) { printf("%d", (sz + 1) * JOB_SIZE_STEP); for (col = 0; col < total_variants; col++) { uint64_t *time_ptr = &variant_list[col].avg_times[sz * NUM_RUNS]; const unsigned long long val = mean_median(time_ptr, NUM_RUNS); printf("\t%llu", val); } printf("\n"); } } /* 
Prepares data structure for test variants storage, sets test configuration */ #ifdef _WIN32 static void #else static void * #endif run_tests(void *arg) { uint32_t i; const int do_print_times = (arg == NULL) ? 0 : 1; MB_MGR mb_mgr; struct params_s params; uint32_t num_variants[NUM_TYPES] = {0, 0, 0}; uint32_t type, at_size, run, arch; uint32_t variants_per_arch, max_arch; uint32_t variant; uint32_t total_variants = 0; struct variant_s *variant_ptr; struct variant_s *variant_list; params.num_sizes = JOB_SIZE / JOB_SIZE_STEP; for (type = TTYPE_AES_HMAC; type < NUM_TYPES; type++) { if (test_types[type] == 0) continue; switch (type) { default: case TTYPE_AES_HMAC: variants_per_arch = VARIANTS_PER_ARCH_AES; max_arch = NUM_ARCHS; break; case TTYPE_AES_DOCSIS: variants_per_arch = VARIANTS_PER_ARCH_DOCSIS; max_arch = NUM_ARCHS; break; case TTYPE_AES_GCM: variants_per_arch = VARIANTS_PER_ARCH_GCM; max_arch = NUM_ARCHS - 1; /* No AVX512 for GCM */ break; case TTYPE_AES_CCM: variants_per_arch = VARIANTS_PER_ARCH_CCM; max_arch = NUM_ARCHS; break; } /* Calculating number of all variants */ for (arch = 0; arch < max_arch; arch++) { if (archs[arch] == 0) continue; num_variants[type] += variants_per_arch; } total_variants += num_variants[type]; } variant_list = (struct variant_s *) malloc(total_variants * sizeof(struct variant_s)); if (!variant_list) { fprintf(stderr, "Cannot allocate memory\n"); free_mem(); exit(EXIT_FAILURE); } at_size = NUM_RUNS * params.num_sizes * sizeof(uint64_t); for (variant = 0, variant_ptr = variant_list; variant < total_variants; variant++, variant_ptr++) { variant_ptr->avg_times = (uint64_t *) malloc(at_size); if (!variant_ptr->avg_times) { fprintf(stderr, "Cannot allocate memory\n"); free_mem(); exit(EXIT_FAILURE); } } for (run = 0; run < NUM_RUNS; run++) { fprintf(stderr, "Starting run %d of %d\n", run+1, NUM_RUNS); variant = 0; variant_ptr = variant_list; for (type = TTYPE_AES_HMAC; type < NUM_TYPES; type++) { if (test_types[type] == 0) continue; if (type == TTYPE_AES_GCM) /* No AVX512 for GCM */ max_arch = NUM_ARCHS - 1; else max_arch = NUM_ARCHS; params.num_variants = num_variants[type]; params.test_type = type; /* Performing tests for each selected architecture */ for (arch = 0; arch < max_arch; arch++) { if (archs[arch] == 0) continue; run_dir_test(&mb_mgr, arch, ¶ms, run, &variant_ptr, &variant); } } /* end for type */ } /* end for run */ if (do_print_times == 1) print_times(variant_list, ¶ms, total_variants); if (variant_list != NULL) { /* Freeing variants list */ for (i = 0; i < total_variants; i++) free(variant_list[i].avg_times); free(variant_list); } #ifndef _WIN32 return NULL; #endif } static void usage(void) { fprintf(stderr, "Usage: ipsec_perf [args], " "where args are zero or more\n" "-h: print this message\n" "-c: Use cold cache, it uses warm as default\n" "-w: Use warm cache\n" "--no-avx512: Don't do AVX512\n" "--no-avx2: Don't do AVX2\n" "--no-avx: Don't do AVX\n" "--no-sse: Don't do SSE\n" "-o val: Use for the SHA size increment, default is 24\n" "--shani-on: use SHA extensions, default: auto-detect\n" "--shani-off: don't use SHA extensions\n" "--no-gcm: do not run GCM perf tests\n" "--no-aes: do not run standard AES + HMAC perf tests\n" "--no-docsis: do not run DOCSIS cipher perf tests\n" "--no-ccm: do not run CCM cipher perf tests\n" "--gcm-job-api: use JOB API for GCM perf tests" " (raw GCM API is default)\n" "--threads num: for the number of threads to run." 
"Max: %d\n", MAX_NUM_THREADS + 1); } int main(int argc, char *argv[]) { MB_MGR lmgr; int i, num_t = 0; #ifdef _WIN32 HANDLE threads[MAX_NUM_THREADS]; #else pthread_t tids[MAX_NUM_THREADS]; #endif for (i = 1; i < argc; i++) if (strcmp(argv[i], "-h") == 0) { usage(); return EXIT_SUCCESS; } else if (strcmp(argv[i], "-c") == 0) { cache_type = COLD; fprintf(stderr, "Cold cache, "); } else if (strcmp(argv[i], "-w") == 0) { cache_type = WARM; fprintf(stderr, "Warm cache, "); } else if (strcmp(argv[i], "--no-avx512") == 0) { archs[ARCH_AVX512] = 0; } else if (strcmp(argv[i], "--no-avx2") == 0) { archs[ARCH_AVX2] = 0; } else if (strcmp(argv[i], "--no-avx") == 0) { archs[ARCH_AVX] = 0; } else if (strcmp(argv[i], "--no-sse") == 0) { archs[ARCH_SSE] = 0; } else if (strcmp(argv[i], "--shani-on") == 0) { sse_sha_ext_usage = SHA_EXT_PRESENT; } else if (strcmp(argv[i], "--shani-off") == 0) { sse_sha_ext_usage = SHA_EXT_NOT_PRESENT; } else if (strcmp(argv[i], "--no-gcm") == 0) { test_types[TTYPE_AES_GCM] = 0; } else if (strcmp(argv[i], "--no-aes") == 0) { test_types[TTYPE_AES_HMAC] = 0; } else if (strcmp(argv[i], "--no-docsis") == 0) { test_types[TTYPE_AES_DOCSIS] = 0; } else if (strcmp(argv[i], "--no-ccm") == 0) { test_types[TTYPE_AES_CCM] = 0; } else if (strcmp(argv[i], "--gcm-job-api") == 0) { use_gcm_job_api = 1; } else if ((strcmp(argv[i], "-o") == 0) && (i < argc - 1)) { i++; sha_size_incr = atoi(argv[i]); } else if (strcmp(argv[i], "--threads") == 0) { num_t = atoi(argv[++i]); if (num_t > (MAX_NUM_THREADS + 1)) { fprintf(stderr, "Invalid number of threads!\n"); return EXIT_FAILURE; } } else { usage(); return EXIT_FAILURE; } fprintf(stderr, "SHA size incr = %d\n", sha_size_incr); init_mb_mgr_sse(&lmgr); if (archs[ARCH_SSE]) { fprintf(stderr, "%s SHA extensions (shani) for SSE arch\n", (sse_sha_ext_usage == SHA_EXT_PRESENT) ? "Using" : "Not using"); } init_buf(cache_type); if (num_t > 1) for (i = 0; i < num_t - 1; i++) { #ifdef _WIN32 threads[i] = (HANDLE)_beginthread(&run_tests, 0, NULL); #else pthread_attr_t attr; pthread_attr_init(&attr); pthread_create(&tids[i], &attr, run_tests, NULL); #endif } run_tests((void *)1); if (num_t > 1) { #ifdef _WIN32 WaitForMultipleObjects(num_t, threads, FALSE, INFINITE); #endif for (i = 0; i < num_t - 1; i++) { fprintf(stderr, "Waiting on thread %d to finish...\n", i+2); #ifdef _WIN32 CloseHandle(threads[i]); #else pthread_join(tids[i], NULL); #endif } } free_mem(); return EXIT_SUCCESS; } intel-ipsec-mb-0.48/LibPerfApp/win_x64.mak000066400000000000000000000040311321406316400202270ustar00rootroot00000000000000# # Copyright (c) 2017, Intel Corporation # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of Intel Corporation nor the names of its contributors # may be used to endorse or promote products derived from this software # without specific prior written permission. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # APP = ipsec_perf IPSECLIB = ..\libIPSec_MB.lib !ifdef DEBUG DCFLAGS = /Od /DDEBUG /Z7 DLFLAGS = /debug !else DCFLAGS = /O2 /Oi DLFLAGS = !endif CC = cl CFLAGS = /nologo $(DCFLAGS) /Y- /W3 /WX- /Gm- /fp:precise /EHsc /I.. /I..\include LNK = link LFLAGS = /out:$(APP).exe $(DLFLAGS) all: $(APP).exe $(APP).exe: ipsec_perf.obj $(IPSECLIB) $(LNK) $(LFLAGS) ipsec_perf.obj $(IPSECLIB) ipsec_perf.obj: ipsec_perf.c $(CC) /c $(CFLAGS) ipsec_perf.c clean: del /q ipsec_perf.obj $(APP).exe $(APP).pdb $(APP).ilk intel-ipsec-mb-0.48/LibTestApp/000077500000000000000000000000001321406316400162645ustar00rootroot00000000000000intel-ipsec-mb-0.48/LibTestApp/Makefile000066400000000000000000000050711321406316400177270ustar00rootroot00000000000000# # Copyright (c) 2012-2017, Intel Corporation # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of Intel Corporation nor the names of its contributors # may be used to endorse or promote products derived from this software # without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # APP := ipsec_MB_testapp LIBDIR ?= ../ LDFLAGS = -L$(LIBDIR) -fPIE -z noexecstack -z relro -z now LDLIBS = -lIPSec_MB CFLAGS = -g -DLINUX -I../include -I.. 
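# Usage note (an illustrative addition, not part of the original Makefile):
# LIBDIR above defaults to the parent directory and is only used for the link
# path (-L$(LIBDIR)); DEBUG=y below selects an -O0 build with debug symbols,
# for ex:
#     make LIBDIR=../ DEBUG=y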
ifeq ($(DEBUG),y) CFLAGS += -O0 -DDEBUG LDFLAGS += -g else CFLAGS += -O3 endif SOURCES := main.c gcm_test.c ctr_test.c customop_test.c des_test.c ccm_test.c OBJECTS := $(SOURCES:%.c=%.o) all: $(APP) $(APP): $(OBJECTS) $(CC) $(LDFLAGS) $^ $(LDLIBS) -o $@ main.o: main.c do_test.h gcm_test.o: gcm_test.c gcm_ctr_vectors_test.h ctr_test.o: ctr_test.c gcm_ctr_vectors_test.h des_test.o: des_test.c ccm_test.o: ccm_test.c customop_test.o: customop_test.c customop_test.h .PHONY: clean clean: -rm -f $(OBJECTS) $(APP) SOURCES_STYLE := $(foreach infile,$(SOURCES),-f $(infile)) CHECKPATCH?=checkpatch.pl .PHONY: style style: $(CHECKPATCH) --no-tree --no-signoff --emacs --no-color --max-line-length=100 \ --ignore CODE_INDENT,INITIALISED_STATIC,LEADING_SPACE,SPLIT_STRING,\ UNSPECIFIED_INT,ARRAY_SIZE,BLOCK_COMMENT_STYLE,GLOBAL_INITIALISERS,\ AVOID_EXTERNS,USE_FUNC,CONSTANT_COMPARISON $(SOURCES_STYLE) intel-ipsec-mb-0.48/LibTestApp/ccm_test.c000066400000000000000000002365561321406316400202520ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2017, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include #include #include #include #include #include #include "gcm_ctr_vectors_test.h" /* * Test vectors from https://tools.ietf.org/html/rfc3610 */ /* * =============== Packet Vector #1 ================== * AES Key = C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF * Nonce = 00 00 00 03 02 01 00 A0 A1 A2 A3 A4 A5 * Total packet length = 31. 
[Input with 8 cleartext header octets] * 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F * 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E * CBC IV in: 59 00 00 00 03 02 01 00 A0 A1 A2 A3 A4 A5 00 17 * CBC IV out:EB 9D 55 47 73 09 55 AB 23 1E 0A 2D FE 4B 90 D6 * After xor: EB 95 55 46 71 0A 51 AE 25 19 0A 2D FE 4B 90 D6 [hdr] * After AES: CD B6 41 1E 3C DC 9B 4F 5D 92 58 B6 9E E7 F0 91 * After xor: C5 BF 4B 15 30 D1 95 40 4D 83 4A A5 8A F2 E6 86 [msg] * After AES: 9C 38 40 5E A0 3C 1B C9 04 B5 8B 40 C7 6C A2 EB * After xor: 84 21 5A 45 BC 21 05 C9 04 B5 8B 40 C7 6C A2 EB [msg] * After AES: 2D C6 97 E4 11 CA 83 A8 60 C2 C4 06 CC AA 54 2F * CBC-MAC : 2D C6 97 E4 11 CA 83 A8 * CTR Start: 01 00 00 00 03 02 01 00 A0 A1 A2 A3 A4 A5 00 01 * CTR[0001]: 50 85 9D 91 6D CB 6D DD E0 77 C2 D1 D4 EC 9F 97 * CTR[0002]: 75 46 71 7A C6 DE 9A FF 64 0C 9C 06 DE 6D 0D 8F * CTR[MAC ]: 3A 2E 46 C8 EC 33 A5 48 * Total packet length = 39. [Authenticated and Encrypted Output] * 00 01 02 03 04 05 06 07 58 8C 97 9A 61 C6 63 D2 * F0 66 D0 C2 C0 F9 89 80 6D 5F 6B 61 DA C3 84 17 * E8 D1 2C FD F9 26 E0 */ static const uint8_t keys_01[] = { 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF }; static const uint8_t nonce_01[] = { 0x00, 0x00, 0x00, 0x03, 0x02, 0x01, 0x00, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5 }; static const uint8_t packet_in_01[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E }; static const uint8_t packet_out_01[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x58, 0x8C, 0x97, 0x9A, 0x61, 0xC6, 0x63, 0xD2, 0xF0, 0x66, 0xD0, 0xC2, 0xC0, 0xF9, 0x89, 0x80, 0x6D, 0x5F, 0x6B, 0x61, 0xDA, 0xC3, 0x84, 0x17, 0xE8, 0xD1, 0x2C, 0xFD, 0xF9, 0x26, 0xE0 }; #define clear_len_01 8 #define auth_len_01 8 /* * =============== Packet Vector #2 ================== * AES Key = C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF * Nonce = 00 00 00 04 03 02 01 A0 A1 A2 A3 A4 A5 * Total packet length = 32. [Input with 8 cleartext header octets] * 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F * 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F * CBC IV in: 59 00 00 00 04 03 02 01 A0 A1 A2 A3 A4 A5 00 18 * CBC IV out:F0 C2 54 D3 CA 03 E2 39 70 BD 24 A8 4C 39 9E 77 * After xor: F0 CA 54 D2 C8 00 E6 3C 76 BA 24 A8 4C 39 9E 77 [hdr] * After AES: 48 DE 8B 86 28 EA 4A 40 00 AA 42 C2 95 BF 4A 8C * After xor: 40 D7 81 8D 24 E7 44 4F 10 BB 50 D1 81 AA 5C 9B [msg] * After AES: 0F 89 FF BC A6 2B C2 4F 13 21 5F 16 87 96 AA 33 * After xor: 17 90 E5 A7 BA 36 DC 50 13 21 5F 16 87 96 AA 33 [msg] * After AES: F7 B9 05 6A 86 92 6C F3 FB 16 3D C4 99 EF AA 11 * CBC-MAC : F7 B9 05 6A 86 92 6C F3 * CTR Start: 01 00 00 00 04 03 02 01 A0 A1 A2 A3 A4 A5 00 01 * CTR[0001]: 7A C0 10 3D ED 38 F6 C0 39 0D BA 87 1C 49 91 F4 * CTR[0002]: D4 0C DE 22 D5 F9 24 24 F7 BE 9A 56 9D A7 9F 51 * CTR[MAC ]: 57 28 D0 04 96 D2 65 E5 * Total packet length = 40. 
[Authenticated and Encrypted Output] * 00 01 02 03 04 05 06 07 72 C9 1A 36 E1 35 F8 CF * 29 1C A8 94 08 5C 87 E3 CC 15 C4 39 C9 E4 3A 3B * A0 91 D5 6E 10 40 09 16 */ static const uint8_t keys_02[] = { 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF }; static const uint8_t nonce_02[] = { 0x00, 0x00, 0x00, 0x04, 0x03, 0x02, 0x01, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5 }; static const uint8_t packet_in_02[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F }; static const uint8_t packet_out_02[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x72, 0xC9, 0x1A, 0x36, 0xE1, 0x35, 0xF8, 0xCF, 0x29, 0x1C, 0xA8, 0x94, 0x08, 0x5C, 0x87, 0xE3, 0xCC, 0x15, 0xC4, 0x39, 0xC9, 0xE4, 0x3A, 0x3B, 0xA0, 0x91, 0xD5, 0x6E, 0x10, 0x40, 0x09, 0x16 }; #define clear_len_02 8 #define auth_len_02 8 /* * =============== Packet Vector #3 ================== * AES Key = C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF * Nonce = 00 00 00 05 04 03 02 A0 A1 A2 A3 A4 A5 * Total packet length = 33. [Input with 8 cleartext header octets] * 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F * 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F * 20 * CBC IV in: 59 00 00 00 05 04 03 02 A0 A1 A2 A3 A4 A5 00 19 * CBC IV out:6F 8A 12 F7 BF 8D 4D C5 A1 19 6E 95 DF F0 B4 27 * After xor: 6F 82 12 F6 BD 8E 49 C0 A7 1E 6E 95 DF F0 B4 27 [hdr] * After AES: 37 E9 B7 8C C2 20 17 E7 33 80 43 0C BE F4 28 24 * After xor: 3F E0 BD 87 CE 2D 19 E8 23 91 51 1F AA E1 3E 33 [msg] * After AES: 90 CA 05 13 9F 4D 4E CF 22 6F E9 81 C5 9E 2D 40 * After xor: 88 D3 1F 08 83 50 50 D0 02 6F E9 81 C5 9E 2D 40 [msg] * After AES: 73 B4 67 75 C0 26 DE AA 41 03 97 D6 70 FE 5F B0 * CBC-MAC : 73 B4 67 75 C0 26 DE AA * CTR Start: 01 00 00 00 05 04 03 02 A0 A1 A2 A3 A4 A5 00 01 * CTR[0001]: 59 B8 EF FF 46 14 73 12 B4 7A 1D 9D 39 3D 3C FF * CTR[0002]: 69 F1 22 A0 78 C7 9B 89 77 89 4C 99 97 5C 23 78 * CTR[MAC ]: 39 6E C0 1A 7D B9 6E 6F * Total packet length = 41. [Authenticated and Encrypted Output] * 00 01 02 03 04 05 06 07 51 B1 E5 F4 4A 19 7D 1D * A4 6B 0F 8E 2D 28 2A E8 71 E8 38 BB 64 DA 85 96 * 57 4A DA A7 6F BD 9F B0 C5 */ static const uint8_t keys_03[] = { 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF }; static const uint8_t nonce_03[] = { 0x00, 0x00, 0x00, 0x05, 0x04, 0x03, 0x02, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5 }; static const uint8_t packet_in_03[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20 }; static const uint8_t packet_out_03[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x51, 0xB1, 0xE5, 0xF4, 0x4A, 0x19, 0x7D, 0x1D, 0xA4, 0x6B, 0x0F, 0x8E, 0x2D, 0x28, 0x2A, 0xE8, 0x71, 0xE8, 0x38, 0xBB, 0x64, 0xDA, 0x85, 0x96, 0x57, 0x4A, 0xDA, 0xA7, 0x6F, 0xBD, 0x9F, 0xB0, 0xC5 }; #define clear_len_03 8 #define auth_len_03 8 /* * =============== Packet Vector #4 ================== * AES Key = C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF * Nonce = 00 00 00 06 05 04 03 A0 A1 A2 A3 A4 A5 * Total packet length = 31. 
[Input with 12 cleartext header octets] * 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F * 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E * CBC IV in: 59 00 00 00 06 05 04 03 A0 A1 A2 A3 A4 A5 00 13 * CBC IV out:06 65 2C 60 0E F5 89 63 CA C3 25 A9 CD 3E 2B E1 * After xor: 06 69 2C 61 0C F6 8D 66 CC C4 2D A0 C7 35 2B E1 [hdr] * After AES: A0 75 09 AC 15 C2 58 86 04 2F 80 60 54 FE A6 86 * After xor: AC 78 07 A3 05 D3 4A 95 10 3A 96 77 4C E7 BC 9D [msg] * After AES: 64 4C 09 90 D9 1B 83 E9 AB 4B 8E ED 06 6F F5 BF * After xor: 78 51 17 90 D9 1B 83 E9 AB 4B 8E ED 06 6F F5 BF [msg] * After AES: 4B 4F 4B 39 B5 93 E6 BF B0 B2 C2 B7 0F 29 CD 7A * CBC-MAC : 4B 4F 4B 39 B5 93 E6 BF * CTR Start: 01 00 00 00 06 05 04 03 A0 A1 A2 A3 A4 A5 00 01 * CTR[0001]: AE 81 66 6A 83 8B 88 6A EE BF 4A 5B 32 84 50 8A * CTR[0002]: D1 B1 92 06 AC 93 9E 2F B6 DD CE 10 A7 74 FD 8D * CTR[MAC ]: DD 87 2A 80 7C 75 F8 4E * Total packet length = 39. [Authenticated and Encrypted Output] * 00 01 02 03 04 05 06 07 08 09 0A 0B A2 8C 68 65 * 93 9A 9A 79 FA AA 5C 4C 2A 9D 4A 91 CD AC 8C 96 * C8 61 B9 C9 E6 1E F1 */ static const uint8_t keys_04[] = { 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF }; static const uint8_t nonce_04[] = { 0x00, 0x00, 0x00, 0x06, 0x05, 0x04, 0x03, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5 }; static const uint8_t packet_in_04[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E }; static const uint8_t packet_out_04[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xA2, 0x8C, 0x68, 0x65, 0x93, 0x9A, 0x9A, 0x79, 0xFA, 0xAA, 0x5C, 0x4C, 0x2A, 0x9D, 0x4A, 0x91, 0xCD, 0xAC, 0x8C, 0x96, 0xC8, 0x61, 0xB9, 0xC9, 0xE6, 0x1E, 0xF1 }; #define clear_len_04 12 #define auth_len_04 8 /* * =============== Packet Vector #5 ================== * AES Key = C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF * Nonce = 00 00 00 07 06 05 04 A0 A1 A2 A3 A4 A5 * Total packet length = 32. [Input with 12 cleartext header octets] * 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F * 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F * CBC IV in: 59 00 00 00 07 06 05 04 A0 A1 A2 A3 A4 A5 00 14 * CBC IV out:00 4C 50 95 45 80 3C 48 51 CD E1 3B 56 C8 9A 85 * After xor: 00 40 50 94 47 83 38 4D 57 CA E9 32 5C C3 9A 85 [hdr] * After AES: E2 B8 F7 CE 49 B2 21 72 84 A8 EA 84 FA AD 67 5C * After xor: EE B5 F9 C1 59 A3 33 61 90 BD FC 93 E2 B4 7D 47 [msg] * After AES: 3E FB 36 72 25 DB 11 01 D3 C2 2F 0E CA FF 44 F3 * After xor: 22 E6 28 6D 25 DB 11 01 D3 C2 2F 0E CA FF 44 F3 [msg] * After AES: 48 B9 E8 82 55 05 4A B5 49 0A 95 F9 34 9B 4B 5E * CBC-MAC : 48 B9 E8 82 55 05 4A B5 * CTR Start: 01 00 00 00 07 06 05 04 A0 A1 A2 A3 A4 A5 00 01 * CTR[0001]: D0 FC F5 74 4D 8F 31 E8 89 5B 05 05 4B 7C 90 C3 * CTR[0002]: 72 A0 D4 21 9F 0D E1 D4 04 83 BC 2D 3D 0C FC 2A * CTR[MAC ]: 19 51 D7 85 28 99 67 26 * Total packet length = 40. 
[Authenticated and Encrypted Output] * 00 01 02 03 04 05 06 07 08 09 0A 0B DC F1 FB 7B * 5D 9E 23 FB 9D 4E 13 12 53 65 8A D8 6E BD CA 3E * 51 E8 3F 07 7D 9C 2D 93 */ static const uint8_t keys_05[] = { 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF }; static const uint8_t nonce_05[] = { 0x00, 0x00, 0x00, 0x07, 0x06, 0x05, 0x04, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5 }; static const uint8_t packet_in_05[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F }; static const uint8_t packet_out_05[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xDC, 0xF1, 0xFB, 0x7B, 0x5D, 0x9E, 0x23, 0xFB, 0x9D, 0x4E, 0x13, 0x12, 0x53, 0x65, 0x8A, 0xD8, 0x6E, 0xBD, 0xCA, 0x3E, 0x51, 0xE8, 0x3F, 0x07, 0x7D, 0x9C, 0x2D, 0x93 }; #define clear_len_05 12 #define auth_len_05 8 /* * =============== Packet Vector #6 ================== * AES Key = C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF * Nonce = 00 00 00 08 07 06 05 A0 A1 A2 A3 A4 A5 * Total packet length = 33. [Input with 12 cleartext header octets] * 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F * 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F * 20 * CBC IV in: 59 00 00 00 08 07 06 05 A0 A1 A2 A3 A4 A5 00 15 * CBC IV out:04 72 DA 4C 6F F6 0A 63 06 52 1A 06 04 80 CD E5 * After xor: 04 7E DA 4D 6D F5 0E 66 00 55 12 0F 0E 8B CD E5 [hdr] * After AES: 64 4C 36 A5 A2 27 37 62 0B 89 F1 D7 BF F2 73 D4 * After xor: 68 41 38 AA B2 36 25 71 1F 9C E7 C0 A7 EB 69 CF [msg] * After AES: 41 E1 19 CD 19 24 CE 77 F1 2F A6 60 C1 6E BB 4E * After xor: 5D FC 07 D2 39 24 CE 77 F1 2F A6 60 C1 6E BB 4E [msg] * After AES: A5 27 D8 15 6A C3 59 BF 1C B8 86 E6 2F 29 91 29 * CBC-MAC : A5 27 D8 15 6A C3 59 BF * CTR Start: 01 00 00 00 08 07 06 05 A0 A1 A2 A3 A4 A5 00 01 * CTR[0001]: 63 CC BE 1E E0 17 44 98 45 64 B2 3A 8D 24 5C 80 * CTR[0002]: 39 6D BA A2 A7 D2 CB D4 B5 E1 7C 10 79 45 BB C0 * CTR[MAC ]: E5 7D DC 56 C6 52 92 2B * Total packet length = 41. [Authenticated and Encrypted Output] * 00 01 02 03 04 05 06 07 08 09 0A 0B 6F C1 B0 11 * F0 06 56 8B 51 71 A4 2D 95 3D 46 9B 25 70 A4 BD * 87 40 5A 04 43 AC 91 CB 94 */ static const uint8_t keys_06[] = { 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF }; static const uint8_t nonce_06[] = { 0x00, 0x00, 0x00, 0x08, 0x07, 0x06, 0x05, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5 }; static const uint8_t packet_in_06[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20 }; static const uint8_t packet_out_06[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x6F, 0xC1, 0xB0, 0x11, 0xF0, 0x06, 0x56, 0x8B, 0x51, 0x71, 0xA4, 0x2D, 0x95, 0x3D, 0x46, 0x9B, 0x25, 0x70, 0xA4, 0xBD, 0x87, 0x40, 0x5A, 0x04, 0x43, 0xAC, 0x91, 0xCB, 0x94 }; #define clear_len_06 12 #define auth_len_06 8 /* * =============== Packet Vector #7 ================== * AES Key = C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF * Nonce = 00 00 00 09 08 07 06 A0 A1 A2 A3 A4 A5 * Total packet length = 31. 
[Input with 8 cleartext header octets] * 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F * 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E * CBC IV in: 61 00 00 00 09 08 07 06 A0 A1 A2 A3 A4 A5 00 17 * CBC IV out:60 06 C5 72 DA 23 9C BF A0 5B 0A DE D2 CD A8 1E * After xor: 60 0E C5 73 D8 20 98 BA A6 5C 0A DE D2 CD A8 1E [hdr] * After AES: 41 7D E2 AE 94 E2 EA D9 00 FC 44 FC D0 69 52 27 * After xor: 49 74 E8 A5 98 EF E4 D6 10 ED 56 EF C4 7C 44 30 [msg] * After AES: 2A 6C 42 CA 49 D7 C7 01 C5 7D 59 FF 87 16 49 0E * After xor: 32 75 58 D1 55 CA D9 01 C5 7D 59 FF 87 16 49 0E [msg] * After AES: 89 8B D6 45 4E 27 20 BB D2 7E F3 15 7A 7C 90 B2 * CBC-MAC : 89 8B D6 45 4E 27 20 BB D2 7E * CTR Start: 01 00 00 00 09 08 07 06 A0 A1 A2 A3 A4 A5 00 01 * CTR[0001]: 09 3C DB B9 C5 52 4F DA C1 C5 EC D2 91 C4 70 AF * CTR[0002]: 11 57 83 86 E2 C4 72 B4 8E CC 8A AD AB 77 6F CB * CTR[MAC ]: 8D 07 80 25 62 B0 8C 00 A6 EE * Total packet length = 41. [Authenticated and Encrypted Output] * 00 01 02 03 04 05 06 07 01 35 D1 B2 C9 5F 41 D5 * D1 D4 FE C1 85 D1 66 B8 09 4E 99 9D FE D9 6C 04 * 8C 56 60 2C 97 AC BB 74 90 */ static const uint8_t keys_07[] = { 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF }; static const uint8_t nonce_07[] = { 0x00, 0x00, 0x00, 0x09, 0x08, 0x07, 0x06, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5 }; static const uint8_t packet_in_07[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E }; static const uint8_t packet_out_07[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x01, 0x35, 0xD1, 0xB2, 0xC9, 0x5F, 0x41, 0xD5, 0xD1, 0xD4, 0xFE, 0xC1, 0x85, 0xD1, 0x66, 0xB8, 0x09, 0x4E, 0x99, 0x9D, 0xFE, 0xD9, 0x6C, 0x04, 0x8C, 0x56, 0x60, 0x2C, 0x97, 0xAC, 0xBB, 0x74, 0x90 }; #define clear_len_07 8 #define auth_len_07 10 /* * =============== Packet Vector #8 ================== * AES Key = C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF * Nonce = 00 00 00 0A 09 08 07 A0 A1 A2 A3 A4 A5 * Total packet length = 32. [Input with 8 cleartext header octets] * 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F * 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F * CBC IV in: 61 00 00 00 0A 09 08 07 A0 A1 A2 A3 A4 A5 00 18 * CBC IV out:63 A3 FA E4 6C 79 F3 FA 78 38 B8 A2 80 36 B6 0B * After xor: 63 AB FA E5 6E 7A F7 FF 7E 3F B8 A2 80 36 B6 0B [hdr] * After AES: 1C 99 1A 3D B7 60 79 27 34 40 79 1F AD 8B 5B 02 * After xor: 14 90 10 36 BB 6D 77 28 24 51 6B 0C B9 9E 4D 15 [msg] * After AES: 14 19 E8 E8 CB BE 75 58 E1 E3 BE 4B 6C 9F 82 E3 * After xor: 0C 00 F2 F3 D7 A3 6B 47 E1 E3 BE 4B 6C 9F 82 E3 [msg] * After AES: E0 16 E8 1C 7F 7B 8A 38 A5 38 F2 CB 5B B6 C1 F2 * CBC-MAC : E0 16 E8 1C 7F 7B 8A 38 A5 38 * CTR Start: 01 00 00 00 0A 09 08 07 A0 A1 A2 A3 A4 A5 00 01 * CTR[0001]: 73 7C 33 91 CC 8E 13 DD E0 AA C5 4B 6D B7 EB 98 * CTR[0002]: 74 B7 71 77 C5 AA C5 3B 04 A4 F8 70 8E 92 EB 2B * CTR[MAC ]: 21 6D AC 2F 8B 4F 1C 07 91 8C * Total packet length = 42. 
[Authenticated and Encrypted Output] * 00 01 02 03 04 05 06 07 7B 75 39 9A C0 83 1D D2 * F0 BB D7 58 79 A2 FD 8F 6C AE 6B 6C D9 B7 DB 24 * C1 7B 44 33 F4 34 96 3F 34 B4 */ static const uint8_t keys_08[] = { 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF }; static const uint8_t nonce_08[] = { 0x00, 0x00, 0x00, 0x0a, 0x09, 0x08, 0x07, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5 }; static const uint8_t packet_in_08[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F }; static const uint8_t packet_out_08[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x7B, 0x75, 0x39, 0x9A, 0xC0, 0x83, 0x1D, 0xD2, 0xF0, 0xBB, 0xD7, 0x58, 0x79, 0xA2, 0xFD, 0x8F, 0x6C, 0xAE, 0x6B, 0x6C, 0xD9, 0xB7, 0xDB, 0x24, 0xC1, 0x7B, 0x44, 0x33, 0xF4, 0x34, 0x96, 0x3F, 0x34, 0xB4 }; #define clear_len_08 8 #define auth_len_08 10 /* * =============== Packet Vector #9 ================== * AES Key = C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF * Nonce = 00 00 00 0B 0A 09 08 A0 A1 A2 A3 A4 A5 * Total packet length = 33. [Input with 8 cleartext header octets] * 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F * 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F * 20 * CBC IV in: 61 00 00 00 0B 0A 09 08 A0 A1 A2 A3 A4 A5 00 19 * CBC IV out:4F 2C 86 11 1E 08 2A DD 6B 44 21 3A B5 13 13 16 * After xor: 4F 24 86 10 1C 0B 2E D8 6D 43 21 3A B5 13 13 16 [hdr] * After AES: F6 EC 56 87 3C 57 12 DC 9C C5 3C A8 D4 D1 ED 0A * After xor: FE E5 5C 8C 30 5A 1C D3 8C D4 2E BB C0 C4 FB 1D [msg] * After AES: 17 C1 80 A5 31 53 D4 C3 03 85 0C 95 65 80 34 52 * After xor: 0F D8 9A BE 2D 4E CA DC 23 85 0C 95 65 80 34 52 [msg] * After AES: 46 A1 F6 E2 B1 6E 75 F8 1C F5 6B 1A 80 04 44 1B * CBC-MAC : 46 A1 F6 E2 B1 6E 75 F8 1C F5 * CTR Start: 01 00 00 00 0B 0A 09 08 A0 A1 A2 A3 A4 A5 00 01 * CTR[0001]: 8A 5A 10 6B C0 29 9A 55 5B 93 6B 0B 0E A0 DE 5A * CTR[0002]: EA 05 FD E2 AB 22 5C FE B7 73 12 CB 88 D9 A5 4A * CTR[MAC ]: AC 3D F1 07 DA 30 C4 86 43 BB * Total packet length = 43. [Authenticated and Encrypted Output] * 00 01 02 03 04 05 06 07 82 53 1A 60 CC 24 94 5A * 4B 82 79 18 1A B5 C8 4D F2 1C E7 F9 B7 3F 42 E1 * 97 EA 9C 07 E5 6B 5E B1 7E 5F 4E */ static const uint8_t keys_09[] = { 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF }; static const uint8_t nonce_09[] = { 0x00, 0x00, 0x00, 0x0b, 0x0a, 0x09, 0x08, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5 }; static const uint8_t packet_in_09[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20 }; static const uint8_t packet_out_09[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x82, 0x53, 0x1A, 0x60, 0xCC, 0x24, 0x94, 0x5A, 0x4B, 0x82, 0x79, 0x18, 0x1A, 0xB5, 0xC8, 0x4D, 0xF2, 0x1C, 0xE7, 0xF9, 0xB7, 0x3F, 0x42, 0xE1, 0x97, 0xEA, 0x9C, 0x07, 0xE5, 0x6B, 0x5E, 0xB1, 0x7E, 0x5F, 0x4E }; #define clear_len_09 8 #define auth_len_09 10 /* * =============== Packet Vector #10 ================== * AES Key = C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF * Nonce = 00 00 00 0C 0B 0A 09 A0 A1 A2 A3 A4 A5 * Total packet length = 31. 
[Input with 12 cleartext header octets] * 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F * 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E * CBC IV in: 61 00 00 00 0C 0B 0A 09 A0 A1 A2 A3 A4 A5 00 13 * CBC IV out:7F B8 0A 32 E9 80 57 46 EC 31 6C 3A B2 A2 EB 5D * After xor: 7F B4 0A 33 EB 83 53 43 EA 36 64 33 B8 A9 EB 5D [hdr] * After AES: 7E 96 96 BF F1 56 D6 A8 6E AC F5 7B 7F 23 47 5A * After xor: 72 9B 98 B0 E1 47 C4 BB 7A B9 E3 6C 67 3A 5D 41 [msg] * After AES: 8B 4A EE 42 04 24 8A 59 FA CC 88 66 57 66 DD 72 * After xor: 97 57 F0 42 04 24 8A 59 FA CC 88 66 57 66 DD 72 [msg] * After AES: 41 63 89 36 62 ED D7 EB CD 6E 15 C1 89 48 62 05 * CBC-MAC : 41 63 89 36 62 ED D7 EB CD 6E * CTR Start: 01 00 00 00 0C 0B 0A 09 A0 A1 A2 A3 A4 A5 00 01 * CTR[0001]: 0B 39 2B 9B 05 66 97 06 3F 12 56 8F 2B 13 A1 0F * CTR[0002]: 07 89 65 25 23 40 94 3B 9E 69 B2 56 CC 5E F7 31 * CTR[MAC ]: 17 09 20 76 09 A0 4E 72 45 B3 * Total packet length = 41. [Authenticated and Encrypted Output] * 00 01 02 03 04 05 06 07 08 09 0A 0B 07 34 25 94 * 15 77 85 15 2B 07 40 98 33 0A BB 14 1B 94 7B 56 * 6A A9 40 6B 4D 99 99 88 DD */ static const uint8_t keys_10[] = { 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF }; static const uint8_t nonce_10[] = { 0x00, 0x00, 0x00, 0x0c, 0x0b, 0x0a, 0x09, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5 }; static const uint8_t packet_in_10[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E }; static const uint8_t packet_out_10[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x07, 0x34, 0x25, 0x94, 0x15, 0x77, 0x85, 0x15, 0x2B, 0x07, 0x40, 0x98, 0x33, 0x0A, 0xBB, 0x14, 0x1B, 0x94, 0x7B, 0x56, 0x6A, 0xA9, 0x40, 0x6B, 0x4D, 0x99, 0x99, 0x88, 0xDD }; #define clear_len_10 12 #define auth_len_10 10 /* * =============== Packet Vector #11 ================== * AES Key = C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF * Nonce = 00 00 00 0D 0C 0B 0A A0 A1 A2 A3 A4 A5 * Total packet length = 32. [Input with 12 cleartext header octets] * 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F * 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F * CBC IV in: 61 00 00 00 0D 0C 0B 0A A0 A1 A2 A3 A4 A5 00 14 * CBC IV out:B0 84 85 79 51 D2 FA 42 76 EF 3A D7 14 B9 62 87 * After xor: B0 88 85 78 53 D1 FE 47 70 E8 32 DE 1E B2 62 87 [hdr] * After AES: C9 B3 64 7E D8 79 2A 5C 65 B7 CE CC 19 0A 97 0A * After xor: C5 BE 6A 71 C8 68 38 4F 71 A2 D8 DB 01 13 8D 11 [msg] * After AES: 34 0F 69 17 FA B9 19 D6 1D AC D0 35 36 D6 55 8B * After xor: 28 12 77 08 FA B9 19 D6 1D AC D0 35 36 D6 55 8B [msg] * After AES: 6B 5E 24 34 12 CC C2 AD 6F 1B 11 C3 A1 A9 D8 BC * CBC-MAC : 6B 5E 24 34 12 CC C2 AD 6F 1B * CTR Start: 01 00 00 00 0D 0C 0B 0A A0 A1 A2 A3 A4 A5 00 01 * CTR[0001]: 6B 66 BC 0C 90 A1 F1 12 FC BE 6F 4E 12 20 77 BC * CTR[0002]: 97 9E 57 2B BE 65 8A E5 CC 20 11 83 2A 9A 9B 5B * CTR[MAC ]: 9E 64 86 DD 02 B6 49 C1 6D 37 * Total packet length = 42. 
[Authenticated and Encrypted Output] * 00 01 02 03 04 05 06 07 08 09 0A 0B 67 6B B2 03 * 80 B0 E3 01 E8 AB 79 59 0A 39 6D A7 8B 83 49 34 * F5 3A A2 E9 10 7A 8B 6C 02 2C */ static const uint8_t keys_11[] = { 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF }; static const uint8_t nonce_11[] = { 0x00, 0x00, 0x00, 0x0d, 0x0c, 0x0b, 0x0a, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5 }; static const uint8_t packet_in_11[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F }; static const uint8_t packet_out_11[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x67, 0x6B, 0xB2, 0x03, 0x80, 0xB0, 0xE3, 0x01, 0xE8, 0xAB, 0x79, 0x59, 0x0A, 0x39, 0x6D, 0xA7, 0x8B, 0x83, 0x49, 0x34, 0xF5, 0x3A, 0xA2, 0xE9, 0x10, 0x7A, 0x8B, 0x6C, 0x02, 0x2C }; #define clear_len_11 12 #define auth_len_11 10 /* * =============== Packet Vector #12 ================== * AES Key = C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF * Nonce = 00 00 00 0E 0D 0C 0B A0 A1 A2 A3 A4 A5 * Total packet length = 33. [Input with 12 cleartext header octets] * 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F * 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F * 20 * CBC IV in: 61 00 00 00 0E 0D 0C 0B A0 A1 A2 A3 A4 A5 00 15 * CBC IV out:5F 8E 8D 02 AD 95 7C 5A 36 14 CF 63 40 16 97 4F * After xor: 5F 82 8D 03 AF 96 78 5F 30 13 C7 6A 4A 1D 97 4F [hdr] * After AES: 63 FA BD 69 B9 55 65 FF 54 AA F4 60 88 7D EC 9F * After xor: 6F F7 B3 66 A9 44 77 EC 40 BF E2 77 90 64 F6 84 [msg] * After AES: 5A 76 5F 0B 93 CE 4F 6A B4 1D 91 30 18 57 6A D7 * After xor: 46 6B 41 14 B3 CE 4F 6A B4 1D 91 30 18 57 6A D7 [msg] * After AES: 9D 66 92 41 01 08 D5 B6 A1 45 85 AC AF 86 32 E8 * CBC-MAC : 9D 66 92 41 01 08 D5 B6 A1 45 * CTR Start: 01 00 00 00 0E 0D 0C 0B A0 A1 A2 A3 A4 A5 00 01 * CTR[0001]: CC F2 AE D9 E0 4A C9 74 E6 58 55 B3 2B 94 30 BF * CTR[0002]: A2 CA AC 11 63 F4 07 E5 E5 F6 E3 B3 79 0F 79 F8 * CTR[MAC ]: 50 7C 31 57 63 EF 78 D3 77 9E * Total packet length = 43. [Authenticated and Encrypted Output] * 00 01 02 03 04 05 06 07 08 09 0A 0B C0 FF A0 D6 * F0 5B DB 67 F2 4D 43 A4 33 8D 2A A4 BE D7 B2 0E * 43 CD 1A A3 16 62 E7 AD 65 D6 DB */ static const uint8_t keys_12[] = { 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF }; static const uint8_t nonce_12[] = { 0x00, 0x00, 0x00, 0x0e, 0x0d, 0x0c, 0x0b, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5 }; static const uint8_t packet_in_12[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20 }; static const uint8_t packet_out_12[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xC0, 0xFF, 0xA0, 0xD6, 0xF0, 0x5B, 0xDB, 0x67, 0xF2, 0x4D, 0x43, 0xA4, 0x33, 0x8D, 0x2A, 0xA4, 0xBE, 0xD7, 0xB2, 0x0E, 0x43, 0xCD, 0x1A, 0xA3, 0x16, 0x62, 0xE7, 0xAD, 0x65, 0xD6, 0xDB }; #define clear_len_12 12 #define auth_len_12 10 /* * =============== Packet Vector #13 ================== * AES Key = D7 82 8D 13 B2 B0 BD C3 25 A7 62 36 DF 93 CC 6B * Nonce = 00 41 2B 4E A9 CD BE 3C 96 96 76 6C FA * Total packet length = 31. 
[Input with 8 cleartext header octets] * 0B E1 A8 8B AC E0 18 B1 08 E8 CF 97 D8 20 EA 25 * 84 60 E9 6A D9 CF 52 89 05 4D 89 5C EA C4 7C * CBC IV in: 59 00 41 2B 4E A9 CD BE 3C 96 96 76 6C FA 00 17 * CBC IV out:33 AE C3 1A 1F B7 CC 35 E5 DA D2 BA C0 90 D9 A3 * After xor: 33 A6 C8 FB B7 3C 60 D5 FD 6B D2 BA C0 90 D9 A3 [hdr] * After AES: B7 56 CA 1E 5B 42 C6 9C 58 E3 0A F5 2B F7 7C FD * After xor: BF BE 05 89 83 62 2C B9 DC 83 E3 9F F2 38 2E 74 [msg] * After AES: 33 3D 3A 3D 07 B5 3C 7B 22 0E 96 1A 18 A9 A1 9E * After xor: 36 70 B3 61 ED 71 40 7B 22 0E 96 1A 18 A9 A1 9E [msg] * After AES: 14 BD DB 6B F9 01 63 4D FB 56 51 83 BC 74 93 F7 * CBC-MAC : 14 BD DB 6B F9 01 63 4D * CTR Start: 01 00 41 2B 4E A9 CD BE 3C 96 96 76 6C FA 00 01 * CTR[0001]: 44 51 B0 11 7A 84 82 BF 03 19 AE C1 59 5E BD DA * CTR[0002]: 83 EB 76 E1 3A 44 84 7F 92 20 09 07 76 B8 25 C5 * CTR[MAC ]: F3 31 2C A0 F5 DC B4 FE * Total packet length = 39. [Authenticated and Encrypted Output] * 0B E1 A8 8B AC E0 18 B1 4C B9 7F 86 A2 A4 68 9A * 87 79 47 AB 80 91 EF 53 86 A6 FF BD D0 80 F8 E7 * 8C F7 CB 0C DD D7 B3 */ static const uint8_t keys_13[] = { 0xD7, 0x82, 0x8D, 0x13, 0xB2, 0xB0, 0xBD, 0xC3, 0x25, 0xA7, 0x62, 0x36, 0xDF, 0x93, 0xCC, 0x6B }; static const uint8_t nonce_13[] = { 0x00, 0x41, 0x2b, 0x4e, 0xa9, 0xcd, 0xbe, 0x3c, 0x96, 0x96, 0x76, 0x6c, 0xfa }; static const uint8_t packet_in_13[] = { 0x0B, 0xE1, 0xA8, 0x8B, 0xAC, 0xE0, 0x18, 0xB1, 0x08, 0xE8, 0xCF, 0x97, 0xD8, 0x20, 0xEA, 0x25, 0x84, 0x60, 0xE9, 0x6A, 0xD9, 0xCF, 0x52, 0x89, 0x05, 0x4D, 0x89, 0x5C, 0xEA, 0xC4, 0x7C }; static const uint8_t packet_out_13[] = { 0x0B, 0xE1, 0xA8, 0x8B, 0xAC, 0xE0, 0x18, 0xB1, 0x4C, 0xB9, 0x7F, 0x86, 0xA2, 0xA4, 0x68, 0x9A, 0x87, 0x79, 0x47, 0xAB, 0x80, 0x91, 0xEF, 0x53, 0x86, 0xA6, 0xFF, 0xBD, 0xD0, 0x80, 0xF8, 0xE7, 0x8C, 0xF7, 0xCB, 0x0C, 0xDD, 0xD7, 0xB3 }; #define clear_len_13 8 #define auth_len_13 8 /* * =============== Packet Vector #14 ================== * AES Key = D7 82 8D 13 B2 B0 BD C3 25 A7 62 36 DF 93 CC 6B * Nonce = 00 33 56 8E F7 B2 63 3C 96 96 76 6C FA * Total packet length = 32. [Input with 8 cleartext header octets] * 63 01 8F 76 DC 8A 1B CB 90 20 EA 6F 91 BD D8 5A * FA 00 39 BA 4B AF F9 BF B7 9C 70 28 94 9C D0 EC * CBC IV in: 59 00 33 56 8E F7 B2 63 3C 96 96 76 6C FA 00 18 * CBC IV out:42 0D B1 50 BB 0C 44 DA 83 E4 52 09 55 99 67 E3 * After xor: 42 05 D2 51 34 7A 98 50 98 2F 52 09 55 99 67 E3 [hdr] * After AES: EA D1 CA 56 02 02 09 5C E6 12 B0 D2 18 A0 DD 44 * After xor: 7A F1 20 39 93 BF D1 06 1C 12 89 68 53 0F 24 FB [msg] * After AES: 51 77 41 69 C3 DE 6B 24 13 27 74 90 F5 FF C5 62 * After xor: E6 EB 31 41 57 42 BB C8 13 27 C5 62 [msg] * After AES: D4 CC 3B 82 DF 9F CC 56 7E E5 83 61 D7 8D FB 5E * CBC-MAC : D4 CC 3B 82 DF 9F CC 56 * CTR Start: 01 00 33 56 8E F7 B2 63 3C 96 96 76 6C FA 00 01 * CTR[0001]: DC EB F4 13 38 3C 66 A0 5A 72 55 EF 98 D7 FF AD * CTR[0002]: 2F 54 2C BA 15 D6 6C DF E1 EC 46 8F 0E 68 A1 24 * CTR[MAC ]: 11 E2 D3 9F A2 E8 0C DC * Total packet length = 40. 
[Authenticated and Encrypted Output] * 63 01 8F 76 DC 8A 1B CB 4C CB 1E 7C A9 81 BE FA * A0 72 6C 55 D3 78 06 12 98 C8 5C 92 81 4A BC 33 * C5 2E E8 1D 7D 77 C0 8A */ static const uint8_t keys_14[] = { 0xD7, 0x82, 0x8D, 0x13, 0xB2, 0xB0, 0xBD, 0xC3, 0x25, 0xA7, 0x62, 0x36, 0xDF, 0x93, 0xCC, 0x6B }; static const uint8_t nonce_14[] = { 0x00, 0x33, 0x56, 0x8E, 0xF7, 0xB2, 0x63, 0x3C, 0x96, 0x96, 0x76, 0x6C, 0xFA }; static const uint8_t packet_in_14[] = { 0x63, 0x01, 0x8F, 0x76, 0xDC, 0x8A, 0x1B, 0xCB, 0x90, 0x20, 0xEA, 0x6F, 0x91, 0xBD, 0xD8, 0x5A, 0xFA, 0x00, 0x39, 0xBA, 0x4B, 0xAF, 0xF9, 0xBF, 0xB7, 0x9C, 0x70, 0x28, 0x94, 0x9C, 0xD0, 0xEC, }; static const uint8_t packet_out_14[] = { 0x63, 0x01, 0x8F, 0x76, 0xDC, 0x8A, 0x1B, 0xCB, 0x4C, 0xCB, 0x1E, 0x7C, 0xA9, 0x81, 0xBE, 0xFA, 0xA0, 0x72, 0x6C, 0x55, 0xD3, 0x78, 0x06, 0x12, 0x98, 0xC8, 0x5C, 0x92, 0x81, 0x4A, 0xBC, 0x33, 0xC5, 0x2E, 0xE8, 0x1D, 0x7D, 0x77, 0xC0, 0x8A }; #define clear_len_14 8 #define auth_len_14 8 /* * =============== Packet Vector #15 ================== * AES Key = D7 82 8D 13 B2 B0 BD C3 25 A7 62 36 DF 93 CC 6B * Nonce = 00 10 3F E4 13 36 71 3C 96 96 76 6C FA * Total packet length = 33. [Input with 8 cleartext header octets] * AA 6C FA 36 CA E8 6B 40 B9 16 E0 EA CC 1C 00 D7 * DC EC 68 EC 0B 3B BB 1A 02 DE 8A 2D 1A A3 46 13 * 2E * CBC IV in: 59 00 10 3F E4 13 36 71 3C 96 96 76 6C FA 00 19 * CBC IV out:B3 26 49 FF D5 9F 56 0F 02 2D 11 E2 62 C5 BE EA * After xor: B3 2E E3 93 2F A9 9C E7 69 6D 11 E2 62 C5 BE EA [hdr] * After AES: 82 50 9E E5 B2 FF DB CA 9B D0 2E 20 6B 3F B7 AD * After xor: 3B 46 7E 0F 7E E3 DB 1D 47 3C 46 CC 60 04 0C B7 [msg] * After AES: 80 46 0E 4C 08 3A D0 3F B9 A9 13 BE E4 DE 2F 66 * After xor: 82 98 84 61 12 99 96 2C 97 A9 13 BE E4 DE 2F 66 [msg] * After AES: 47 29 CB 00 31 F1 81 C1 92 68 4B 89 A4 71 50 E7 * CBC-MAC : 47 29 CB 00 31 F1 81 C1 * CTR Start: 01 00 10 3F E4 13 36 71 3C 96 96 76 6C FA 00 01 * CTR[0001]: 08 C4 DA C8 EC C1 C0 7B 4C E1 F2 4C 37 5A 47 EE * CTR[0002]: A7 87 2E 6C 6D C4 4E 84 26 02 50 4C 3F A5 73 C5 * CTR[MAC ]: E0 5F B2 6E EA 83 B4 C7 * Total packet length = 41. [Authenticated and Encrypted Output] * AA 6C FA 36 CA E8 6B 40 B1 D2 3A 22 20 DD C0 AC * 90 0D 9A A0 3C 61 FC F4 A5 59 A4 41 77 67 08 97 * 08 A7 76 79 6E DB 72 35 06 */ static const uint8_t keys_15[] = { 0xD7, 0x82, 0x8D, 0x13, 0xB2, 0xB0, 0xBD, 0xC3, 0x25, 0xA7, 0x62, 0x36, 0xDF, 0x93, 0xCC, 0x6B }; static const uint8_t nonce_15[] = { 0x00, 0x10, 0x3F, 0xE4, 0x13, 0x36, 0x71, 0x3C, 0x96, 0x96, 0x76, 0x6C, 0xFA }; static const uint8_t packet_in_15[] = { 0xAA, 0x6C, 0xFA, 0x36, 0xCA, 0xE8, 0x6B, 0x40, 0xB9, 0x16, 0xE0, 0xEA, 0xCC, 0x1C, 0x00, 0xD7, 0xDC, 0xEC, 0x68, 0xEC, 0x0B, 0x3B, 0xBB, 0x1A, 0x02, 0xDE, 0x8A, 0x2D, 0x1A, 0xA3, 0x46, 0x13, 0x2E }; static const uint8_t packet_out_15[] = { 0xAA, 0x6C, 0xFA, 0x36, 0xCA, 0xE8, 0x6B, 0x40, 0xB1, 0xD2, 0x3A, 0x22, 0x20, 0xDD, 0xC0, 0xAC, 0x90, 0x0D, 0x9A, 0xA0, 0x3C, 0x61, 0xFC, 0xF4, 0xA5, 0x59, 0xA4, 0x41, 0x77, 0x67, 0x08, 0x97, 0x08, 0xA7, 0x76, 0x79, 0x6E, 0xDB, 0x72, 0x35, 0x06 }; #define clear_len_15 8 #define auth_len_15 8 /* * =============== Packet Vector #16 ================== * AES Key = D7 82 8D 13 B2 B0 BD C3 25 A7 62 36 DF 93 CC 6B * Nonce = 00 76 4C 63 B8 05 8E 3C 96 96 76 6C FA * Total packet length = 31. 
[Input with 12 cleartext header octets] * D0 D0 73 5C 53 1E 1B EC F0 49 C2 44 12 DA AC 56 * 30 EF A5 39 6F 77 0C E1 A6 6B 21 F7 B2 10 1C * CBC IV in: 59 00 76 4C 63 B8 05 8E 3C 96 96 76 6C FA 00 13 * CBC IV out:AB DC 4E C9 AA 72 33 97 DF 2D AD 76 33 DE 3B 0D * After xor: AB D0 9E 19 D9 2E 60 89 C4 C1 5D 3F F1 9A 3B 0D [hdr] * After AES: 62 86 F6 2F 23 42 63 B0 1C FD 8C 37 40 74 81 EB * After xor: 70 5C 5A 79 13 AD C6 89 73 8A 80 D6 E6 1F A0 1C [msg] * After AES: 88 95 84 18 CF 79 CA BE EB C0 0C C4 86 E6 01 F7 * After xor: 3A 85 98 18 CF 79 CA BE EB C0 0C C4 86 E6 01 F7 [msg] * After AES: C1 85 92 D9 84 CD 67 80 63 D1 D9 6D C1 DF A1 11 * CBC-MAC : C1 85 92 D9 84 CD 67 80 * CTR Start: 01 00 76 4C 63 B8 05 8E 3C 96 96 76 6C FA 00 01 * CTR[0001]: 06 08 FF 95 A6 94 D5 59 F4 0B B7 9D EF FA 41 DF * CTR[0002]: 80 55 3A 75 78 38 04 A9 64 8B 68 DD 7F DC DD 7A * CTR[MAC ]: 5B EA DB 4E DF 07 B9 2F * Total packet length = 39. [Authenticated and Encrypted Output] * D0 D0 73 5C 53 1E 1B EC F0 49 C2 44 14 D2 53 C3 * 96 7B 70 60 9B 7C BB 7C 49 91 60 28 32 45 26 9A * 6F 49 97 5B CA DE AF */ static const uint8_t keys_16[] = { 0xD7, 0x82, 0x8D, 0x13, 0xB2, 0xB0, 0xBD, 0xC3, 0x25, 0xA7, 0x62, 0x36, 0xDF, 0x93, 0xCC, 0x6B }; static const uint8_t nonce_16[] = { 0x00, 0x76, 0x4C, 0x63, 0xB8, 0x05, 0x8E, 0x3C, 0x96, 0x96, 0x76, 0x6C, 0xFA }; static const uint8_t packet_in_16[] = { 0xD0, 0xD0, 0x73, 0x5C, 0x53, 0x1E, 0x1B, 0xEC, 0xF0, 0x49, 0xC2, 0x44, 0x12, 0xDA, 0xAC, 0x56, 0x30, 0xEF, 0xA5, 0x39, 0x6F, 0x77, 0x0C, 0xE1, 0xA6, 0x6B, 0x21, 0xF7, 0xB2, 0x10, 0x1C }; static const uint8_t packet_out_16[] = { 0xD0, 0xD0, 0x73, 0x5C, 0x53, 0x1E, 0x1B, 0xEC, 0xF0, 0x49, 0xC2, 0x44, 0x14, 0xD2, 0x53, 0xC3, 0x96, 0x7B, 0x70, 0x60, 0x9B, 0x7C, 0xBB, 0x7C, 0x49, 0x91, 0x60, 0x28, 0x32, 0x45, 0x26, 0x9A, 0x6F, 0x49, 0x97, 0x5B, 0xCA, 0xDE, 0xAF }; #define clear_len_16 12 #define auth_len_16 8 /* * =============== Packet Vector #17 ================== * AES Key = D7 82 8D 13 B2 B0 BD C3 25 A7 62 36 DF 93 CC 6B * Nonce = 00 F8 B6 78 09 4E 3B 3C 96 96 76 6C FA * Total packet length = 32. [Input with 12 cleartext header octets] * 77 B6 0F 01 1C 03 E1 52 58 99 BC AE E8 8B 6A 46 * C7 8D 63 E5 2E B8 C5 46 EF B5 DE 6F 75 E9 CC 0D * CBC IV in: 59 00 F8 B6 78 09 4E 3B 3C 96 96 76 6C FA 00 14 * CBC IV out:F4 68 FE 5D B1 53 0B 7A 5A A5 FB 27 40 CF 6E 33 * After xor: F4 64 89 EB BE 52 17 79 BB F7 A3 BE FC 61 6E 33 [hdr] * After AES: 23 29 0E 0B 33 45 9A 83 32 2D E4 06 86 67 10 04 * After xor: CB A2 64 4D F4 C8 F9 66 1C 95 21 40 69 D2 CE 6B [msg] * After AES: 8F BE D4 0F 8B 89 B7 B8 20 D5 5F E0 3C E2 43 11 * After xor: FA 57 18 02 8B 89 B7 B8 20 D5 5F E0 3C E2 43 11 [msg] * After AES: 6A DB 15 B6 71 81 B2 E2 2B E3 4A F2 B2 83 E2 29 * CBC-MAC : 6A DB 15 B6 71 81 B2 E2 * CTR Start: 01 00 F8 B6 78 09 4E 3B 3C 96 96 76 6C FA 00 01 * CTR[0001]: BD CE 95 5C CF D3 81 0A 91 EA 77 A6 A4 5B C0 4C * CTR[0002]: 43 2E F2 32 AE 36 D8 92 22 BF 63 37 E6 B2 6C E8 * CTR[MAC ]: 1C F7 19 C1 35 7F CC DE * Total packet length = 40. 
[Authenticated and Encrypted Output] * 77 B6 0F 01 1C 03 E1 52 58 99 BC AE 55 45 FF 1A * 08 5E E2 EF BF 52 B2 E0 4B EE 1E 23 36 C7 3E 3F * 76 2C 0C 77 44 FE 7E 3C */ static const uint8_t keys_17[] = { 0xD7, 0x82, 0x8D, 0x13, 0xB2, 0xB0, 0xBD, 0xC3, 0x25, 0xA7, 0x62, 0x36, 0xDF, 0x93, 0xCC, 0x6B }; static const uint8_t nonce_17[] = { 0x00, 0xF8, 0xB6, 0x78, 0x09, 0x4E, 0x3B, 0x3C, 0x96, 0x96, 0x76, 0x6C, 0xFA }; static const uint8_t packet_in_17[] = { 0x77, 0xB6, 0x0F, 0x01, 0x1C, 0x03, 0xE1, 0x52, 0x58, 0x99, 0xBC, 0xAE, 0xE8, 0x8B, 0x6A, 0x46, 0xC7, 0x8D, 0x63, 0xE5, 0x2E, 0xB8, 0xC5, 0x46, 0xEF, 0xB5, 0xDE, 0x6F, 0x75, 0xE9, 0xCC, 0x0D }; static const uint8_t packet_out_17[] = { 0x77, 0xB6, 0x0F, 0x01, 0x1C, 0x03, 0xE1, 0x52, 0x58, 0x99, 0xBC, 0xAE, 0x55, 0x45, 0xFF, 0x1A, 0x08, 0x5E, 0xE2, 0xEF, 0xBF, 0x52, 0xB2, 0xE0, 0x4B, 0xEE, 0x1E, 0x23, 0x36, 0xC7, 0x3E, 0x3F, 0x76, 0x2C, 0x0C, 0x77, 0x44, 0xFE, 0x7E, 0x3C }; #define clear_len_17 12 #define auth_len_17 8 /* * =============== Packet Vector #18 ================== * AES Key = D7 82 8D 13 B2 B0 BD C3 25 A7 62 36 DF 93 CC 6B * Nonce = 00 D5 60 91 2D 3F 70 3C 96 96 76 6C FA * Total packet length = 33. [Input with 12 cleartext header octets] * CD 90 44 D2 B7 1F DB 81 20 EA 60 C0 64 35 AC BA * FB 11 A8 2E 2F 07 1D 7C A4 A5 EB D9 3A 80 3B A8 * 7F * CBC IV in: 59 00 D5 60 91 2D 3F 70 3C 96 96 76 6C FA 00 15 * CBC IV out:BA 37 74 54 D7 20 A4 59 25 97 F6 A3 D1 D6 BA 67 * After xor: BA 3B B9 C4 93 F2 13 46 FE 16 D6 49 B1 16 BA 67 [hdr] * After AES: 81 6A 20 20 38 D0 A6 30 CB E0 B7 3C 39 BB CE 05 * After xor: E5 5F 8C 9A C3 C1 0E 1E E4 E7 AA 40 9D 1E 25 DC [msg] * After AES: 6D 5C 15 FD 85 2D 5C 3C E3 03 3D 85 DA 57 BD AC * After xor: 57 DC 2E 55 FA 2D 5C 3C E3 03 3D 85 DA 57 BD AC [msg] * After AES: B0 4A 1C 23 BC 39 B6 51 76 FD 5B FF 9B C1 28 5E * CBC-MAC : B0 4A 1C 23 BC 39 B6 51 * CTR Start: 01 00 D5 60 91 2D 3F 70 3C 96 96 76 6C FA 00 01 * CTR[0001]: 64 A2 C5 56 50 CE E0 4C 7A 93 D8 EE F5 43 E8 8E * CTR[0002]: 18 E7 65 AC B7 B0 E9 AF 09 2B D0 20 6C A1 C8 3C * CTR[MAC ]: F7 43 82 79 5C 49 F3 00 * Total packet length = 41. [Authenticated and Encrypted Output] * CD 90 44 D2 B7 1F DB 81 20 EA 60 C0 00 97 69 EC * AB DF 48 62 55 94 C5 92 51 E6 03 57 22 67 5E 04 * C8 47 09 9E 5A E0 70 45 51 */ static const uint8_t keys_18[] = { 0xD7, 0x82, 0x8D, 0x13, 0xB2, 0xB0, 0xBD, 0xC3, 0x25, 0xA7, 0x62, 0x36, 0xDF, 0x93, 0xCC, 0x6B }; static const uint8_t nonce_18[] = { 0x00, 0xD5, 0x60, 0x91, 0x2D, 0x3F, 0x70, 0x3C, 0x96, 0x96, 0x76, 0x6C, 0xFA }; static const uint8_t packet_in_18[] = { 0xCD, 0x90, 0x44, 0xD2, 0xB7, 0x1F, 0xDB, 0x81, 0x20, 0xEA, 0x60, 0xC0, 0x64, 0x35, 0xAC, 0xBA, 0xFB, 0x11, 0xA8, 0x2E, 0x2F, 0x07, 0x1D, 0x7C, 0xA4, 0xA5, 0xEB, 0xD9, 0x3A, 0x80, 0x3B, 0xA8, 0x7F }; static const uint8_t packet_out_18[] = { 0xCD, 0x90, 0x44, 0xD2, 0xB7, 0x1F, 0xDB, 0x81, 0x20, 0xEA, 0x60, 0xC0, 0x00, 0x97, 0x69, 0xEC, 0xAB, 0xDF, 0x48, 0x62, 0x55, 0x94, 0xC5, 0x92, 0x51, 0xE6, 0x03, 0x57, 0x22, 0x67, 0x5E, 0x04, 0xC8, 0x47, 0x09, 0x9E, 0x5A, 0xE0, 0x70, 0x45, 0x51 }; #define clear_len_18 12 #define auth_len_18 8 /* * =============== Packet Vector #19 ================== * AES Key = D7 82 8D 13 B2 B0 BD C3 25 A7 62 36 DF 93 CC 6B * Nonce = 00 42 FF F8 F1 95 1C 3C 96 96 76 6C FA * Total packet length = 31. 
[Input with 8 cleartext header octets] * D8 5B C7 E6 9F 94 4F B8 8A 19 B9 50 BC F7 1A 01 * 8E 5E 67 01 C9 17 87 65 98 09 D6 7D BE DD 18 * CBC IV in: 61 00 42 FF F8 F1 95 1C 3C 96 96 76 6C FA 00 17 * CBC IV out:44 F7 CC 9C 2B DD 2F 45 F6 38 25 6B 73 6E 1D 7A * After xor: 44 FF 14 C7 EC 3B B0 D1 B9 80 25 6B 73 6E 1D 7A [hdr] * After AES: 57 C3 73 F8 00 AA 5F CC 7B CF 1D 1B DD BB 4C 52 * After xor: DD DA CA A8 BC 5D 45 CD F5 91 7A 1A 14 AC CB 37 [msg] * After AES: 42 4E 93 72 72 C8 79 B6 11 C7 A5 9F 47 8D 9F D8 * After xor: DA 47 45 0F CC 15 61 B6 11 C7 A5 9F 47 8D 9F D8 [msg] * After AES: 9A CB 03 F8 B9 DB C8 D2 D2 D7 A4 B4 95 25 08 67 * CBC-MAC : 9A CB 03 F8 B9 DB C8 D2 D2 D7 * CTR Start: 01 00 42 FF F8 F1 95 1C 3C 96 96 76 6C FA 00 01 * CTR[0001]: 36 38 34 FA 28 83 3D B7 55 66 0D 98 65 0D 68 46 * CTR[0002]: 35 E9 63 54 87 16 72 56 3F 0C 08 AF 78 44 31 A9 * CTR[MAC ]: F9 B7 FA 46 7B 9B 40 45 14 6D * Total packet length = 41. [Authenticated and Encrypted Output] * D8 5B C7 E6 9F 94 4F B8 BC 21 8D AA 94 74 27 B6 * DB 38 6A 99 AC 1A EF 23 AD E0 B5 29 39 CB 6A 63 * 7C F9 BE C2 40 88 97 C6 BA */ static const uint8_t keys_19[] = { 0xD7, 0x82, 0x8D, 0x13, 0xB2, 0xB0, 0xBD, 0xC3, 0x25, 0xA7, 0x62, 0x36, 0xDF, 0x93, 0xCC, 0x6B }; static const uint8_t nonce_19[] = { 0x00, 0x42, 0xFF, 0xF8, 0xF1, 0x95, 0x1C, 0x3C, 0x96, 0x96, 0x76, 0x6C, 0xFA }; static const uint8_t packet_in_19[] = { 0xD8, 0x5B, 0xC7, 0xE6, 0x9F, 0x94, 0x4F, 0xB8, 0x8A, 0x19, 0xB9, 0x50, 0xBC, 0xF7, 0x1A, 0x01, 0x8E, 0x5E, 0x67, 0x01, 0xC9, 0x17, 0x87, 0x65, 0x98, 0x09, 0xD6, 0x7D, 0xBE, 0xDD, 0x18 }; static const uint8_t packet_out_19[] = { 0xD8, 0x5B, 0xC7, 0xE6, 0x9F, 0x94, 0x4F, 0xB8, 0xBC, 0x21, 0x8D, 0xAA, 0x94, 0x74, 0x27, 0xB6, 0xDB, 0x38, 0x6A, 0x99, 0xAC, 0x1A, 0xEF, 0x23, 0xAD, 0xE0, 0xB5, 0x29, 0x39, 0xCB, 0x6A, 0x63, 0x7C, 0xF9, 0xBE, 0xC2, 0x40, 0x88, 0x97, 0xC6, 0xBA }; #define clear_len_19 8 #define auth_len_19 10 /* * ================= Packet Vector #20 ================== * AES Key = D7 82 8D 13 B2 B0 BD C3 25 A7 62 36 DF 93 CC 6B * Nonce = 00 92 0F 40 E5 6C DC 3C 96 96 76 6C FA * Total packet length = 32. [Input with 8 cleartext header octets] * 74 A0 EB C9 06 9F 5B 37 17 61 43 3C 37 C5 A3 5F * C1 F3 9F 40 63 02 EB 90 7C 61 63 BE 38 C9 84 37 * CBC IV in: 61 00 92 0F 40 E5 6C DC 3C 96 96 76 6C FA 00 18 * CBC IV out:60 CB 21 CE 40 06 50 AE 2A D2 BE 52 9F 5F 0F C2 * After xor: 60 C3 55 6E AB CF 56 31 71 E5 BE 52 9F 5F 0F C2 [hdr] * After AES: 03 20 64 14 35 32 5D 95 C8 A2 50 40 93 28 DA 9B * After xor: 14 41 27 28 02 F7 FE CA 09 51 CF 00 F0 2A 31 0B [msg] * After AES: B9 E8 87 95 ED F7 F0 08 15 15 F0 14 E2 FE 0E 48 * After xor: C5 89 E4 2B D5 3E 74 3F 15 15 F0 14 E2 FE 0E 48 [msg] * After AES: 8F AD 0C 23 E9 63 7E 87 FA 21 45 51 1B 47 DE F1 * CBC-MAC : 8F AD 0C 23 E9 63 7E 87 FA 21 * CTR Start: 01 00 92 0F 40 E5 6C DC 3C 96 96 76 6C FA 00 01 * CTR[0001]: 4F 71 A5 C1 12 42 E3 7D 29 F0 FE E4 1B E1 02 5F * CTR[0002]: 34 2B D3 F1 7C B7 7B C1 79 0B 05 05 61 59 27 2C * CTR[MAC ]: 7F 09 7B EF C6 AA C1 D3 73 65 * Total packet length = 42. 
[Authenticated and Encrypted Output] * 74 A0 EB C9 06 9F 5B 37 58 10 E6 FD 25 87 40 22 * E8 03 61 A4 78 E3 E9 CF 48 4A B0 4F 44 7E FF F6 * F0 A4 77 CC 2F C9 BF 54 89 44 */ static const uint8_t keys_20[] = { 0xD7, 0x82, 0x8D, 0x13, 0xB2, 0xB0, 0xBD, 0xC3, 0x25, 0xA7, 0x62, 0x36, 0xDF, 0x93, 0xCC, 0x6B }; static const uint8_t nonce_20[] = { 0x00, 0x92, 0x0F, 0x40, 0xE5, 0x6C, 0xDC, 0x3C, 0x96, 0x96, 0x76, 0x6C, 0xFA }; static const uint8_t packet_in_20[] = { 0x74, 0xA0, 0xEB, 0xC9, 0x06, 0x9F, 0x5B, 0x37, 0x17, 0x61, 0x43, 0x3C, 0x37, 0xC5, 0xA3, 0x5F, 0xC1, 0xF3, 0x9F, 0x40, 0x63, 0x02, 0xEB, 0x90, 0x7C, 0x61, 0x63, 0xBE, 0x38, 0xC9, 0x84, 0x37 }; static const uint8_t packet_out_20[] = { 0x74, 0xA0, 0xEB, 0xC9, 0x06, 0x9F, 0x5B, 0x37, 0x58, 0x10, 0xE6, 0xFD, 0x25, 0x87, 0x40, 0x22, 0xE8, 0x03, 0x61, 0xA4, 0x78, 0xE3, 0xE9, 0xCF, 0x48, 0x4A, 0xB0, 0x4F, 0x44, 0x7E, 0xFF, 0xF6, 0xF0, 0xA4, 0x77, 0xCC, 0x2F, 0xC9, 0xBF, 0x54, 0x89, 0x44 }; #define clear_len_20 8 #define auth_len_20 10 /* * =============== Packet Vector #21 ================== * AES Key = D7 82 8D 13 B2 B0 BD C3 25 A7 62 36 DF 93 CC 6B * Nonce = 00 27 CA 0C 71 20 BC 3C 96 96 76 6C FA * Total packet length = 33. [Input with 8 cleartext header octets] * 44 A3 AA 3A AE 64 75 CA A4 34 A8 E5 85 00 C6 E4 * 15 30 53 88 62 D6 86 EA 9E 81 30 1B 5A E4 22 6B * FA * CBC IV in: 61 00 27 CA 0C 71 20 BC 3C 96 96 76 6C FA 00 19 * CBC IV out:43 07 C0 73 A8 9E E1 D5 05 27 B2 9A 62 48 D6 D2 * After xor: 43 0F 84 D0 02 A4 4F B1 70 ED B2 9A 62 48 D6 D2 [hdr] * After AES: B6 0B C6 F5 84 01 75 BC 01 27 70 F1 11 8D 75 10 * After xor: 12 3F 6E 10 01 01 B3 58 14 17 23 79 73 5B F3 FA [msg] * After AES: 7D 5E 64 92 CE 2C B9 EA 7E 4C 4A 09 09 89 C8 FB * After xor: E3 DF 54 89 94 C8 9B 81 84 4C 4A 09 09 89 C8 FB [msg] * After AES: 68 5F 8D 79 D2 2B 9B 74 21 DF 4C 3E 87 BA 0A AF * CBC-MAC : 68 5F 8D 79 D2 2B 9B 74 21 DF * CTR Start: 01 00 27 CA 0C 71 20 BC 3C 96 96 76 6C FA 00 01 * CTR[0001]: 56 8A 45 9E 40 09 48 67 EB 85 E0 9E 6A 2E 64 76 * CTR[0002]: A6 00 AA 92 92 03 54 9A AE EF 2C CC 59 13 7A 57 * CTR[MAC ]: 25 1E DC DD 3F 11 10 F3 98 11 * Total packet length = 43. [Authenticated and Encrypted Output] * 44 A3 AA 3A AE 64 75 CA F2 BE ED 7B C5 09 8E 83 * FE B5 B3 16 08 F8 E2 9C 38 81 9A 89 C8 E7 76 F1 * 54 4D 41 51 A4 ED 3A 8B 87 B9 CE */ static const uint8_t keys_21[] = { 0xD7, 0x82, 0x8D, 0x13, 0xB2, 0xB0, 0xBD, 0xC3, 0x25, 0xA7, 0x62, 0x36, 0xDF, 0x93, 0xCC, 0x6B }; static const uint8_t nonce_21[] = { 0x00, 0x27, 0xCA, 0x0C, 0x71, 0x20, 0xBC, 0x3C, 0x96, 0x96, 0x76, 0x6C, 0xFA }; static const uint8_t packet_in_21[] = { 0x44, 0xA3, 0xAA, 0x3A, 0xAE, 0x64, 0x75, 0xCA, 0xA4, 0x34, 0xA8, 0xE5, 0x85, 0x00, 0xC6, 0xE4, 0x15, 0x30, 0x53, 0x88, 0x62, 0xD6, 0x86, 0xEA, 0x9E, 0x81, 0x30, 0x1B, 0x5A, 0xE4, 0x22, 0x6B, 0xFA }; static const uint8_t packet_out_21[] = { 0x44, 0xA3, 0xAA, 0x3A, 0xAE, 0x64, 0x75, 0xCA, 0xF2, 0xBE, 0xED, 0x7B, 0xC5, 0x09, 0x8E, 0x83, 0xFE, 0xB5, 0xB3, 0x16, 0x08, 0xF8, 0xE2, 0x9C, 0x38, 0x81, 0x9A, 0x89, 0xC8, 0xE7, 0x76, 0xF1, 0x54, 0x4D, 0x41, 0x51, 0xA4, 0xED, 0x3A, 0x8B, 0x87, 0xB9, 0xCE }; #define clear_len_21 8 #define auth_len_21 10 /* * =============== Packet Vector #22 ================== * AES Key = D7 82 8D 13 B2 B0 BD C3 25 A7 62 36 DF 93 CC 6B * Nonce = 00 5B 8C CB CD 9A F8 3C 96 96 76 6C FA * Total packet length = 31. 
[Input with 12 cleartext header octets] * EC 46 BB 63 B0 25 20 C3 3C 49 FD 70 B9 6B 49 E2 * 1D 62 17 41 63 28 75 DB 7F 6C 92 43 D2 D7 C2 * CBC IV in: 61 00 5B 8C CB CD 9A F8 3C 96 96 76 6C FA 00 13 * CBC IV out:91 14 AD 06 B6 CC 02 35 76 9A B6 14 C4 82 95 03 * After xor: 91 18 41 40 0D AF B2 10 56 59 8A 5D 39 F2 95 03 [hdr] * After AES: 29 BD 7C 27 83 E3 E8 D3 C3 5C 01 F4 4C EC BB FA * After xor: 90 D6 35 C5 9E 81 FF 92 A0 74 74 2F 33 80 29 B9 [msg] * After AES: 4E DA F4 0D 21 0B D4 5F FE 97 90 B9 AA EC 34 4C * After xor: 9C 0D 36 0D 21 0B D4 5F FE 97 90 B9 AA EC 34 4C [msg] * After AES: 21 9E F8 90 EA 64 C2 11 A5 37 88 83 E1 BA 22 0D * CBC-MAC : 21 9E F8 90 EA 64 C2 11 A5 37 * CTR Start: 01 00 5B 8C CB CD 9A F8 3C 96 96 76 6C FA 00 01 * CTR[0001]: 88 BC 19 42 80 C1 FA 3E BE FC EF FB 4D C6 2D 54 * CTR[0002]: 3E 59 7D A5 AE 21 CC A4 00 9E 4C 0C 91 F6 22 49 * CTR[MAC ]: 5C BC 30 98 66 02 A9 F4 64 A0 * Total packet length = 41. [Authenticated and Encrypted Output] * EC 46 BB 63 B0 25 20 C3 3C 49 FD 70 31 D7 50 A0 * 9D A3 ED 7F DD D4 9A 20 32 AA BF 17 EC 8E BF 7D * 22 C8 08 8C 66 6B E5 C1 97 */ static const uint8_t keys_22[] = { 0xD7, 0x82, 0x8D, 0x13, 0xB2, 0xB0, 0xBD, 0xC3, 0x25, 0xA7, 0x62, 0x36, 0xDF, 0x93, 0xCC, 0x6B }; static const uint8_t nonce_22[] = { 0x00, 0x5B, 0x8C, 0xCB, 0xCD, 0x9A, 0xF8, 0x3C, 0x96, 0x96, 0x76, 0x6C, 0xFA }; static const uint8_t packet_in_22[] = { 0xEC, 0x46, 0xBB, 0x63, 0xB0, 0x25, 0x20, 0xC3, 0x3C, 0x49, 0xFD, 0x70, 0xB9, 0x6B, 0x49, 0xE2, 0x1D, 0x62, 0x17, 0x41, 0x63, 0x28, 0x75, 0xDB, 0x7F, 0x6C, 0x92, 0x43, 0xD2, 0xD7, 0xC2 }; static const uint8_t packet_out_22[] = { 0xEC, 0x46, 0xBB, 0x63, 0xB0, 0x25, 0x20, 0xC3, 0x3C, 0x49, 0xFD, 0x70, 0x31, 0xD7, 0x50, 0xA0, 0x9D, 0xA3, 0xED, 0x7F, 0xDD, 0xD4, 0x9A, 0x20, 0x32, 0xAA, 0xBF, 0x17, 0xEC, 0x8E, 0xBF, 0x7D, 0x22, 0xC8, 0x08, 0x8C, 0x66, 0x6B, 0xE5, 0xC1, 0x97 }; #define clear_len_22 12 #define auth_len_22 10 /* * =============== Packet Vector #23 ================== * AES Key = D7 82 8D 13 B2 B0 BD C3 25 A7 62 36 DF 93 CC 6B * Nonce = 00 3E BE 94 04 4B 9A 3C 96 96 76 6C FA * Total packet length = 32. [Input with 12 cleartext header octets] * 47 A6 5A C7 8B 3D 59 42 27 E8 5E 71 E2 FC FB B8 * 80 44 2C 73 1B F9 51 67 C8 FF D7 89 5E 33 70 76 * CBC IV in: 61 00 3E BE 94 04 4B 9A 3C 96 96 76 6C FA 00 14 * CBC IV out:0F 70 3F 5A 54 2C 44 6E 8B 74 A3 73 9B 48 B9 61 * After xor: 0F 7C 78 FC 0E EB CF 53 D2 36 84 9B C5 39 B9 61 [hdr] * After AES: 40 5B ED 29 D0 98 AE 91 DB 68 78 F3 68 B8 73 85 * After xor: A2 A7 16 91 50 DC 82 E2 C0 91 29 94 A0 47 A4 0C [msg] * After AES: 3D 03 29 3C FD 81 1B 37 01 51 FB C7 85 6B 7A 74 * After xor: 63 30 59 4A FD 81 1B 37 01 51 FB C7 85 6B 7A 74 [msg] * After AES: 66 4F 27 16 3E 36 0F 72 62 0D 4E 67 7C E0 61 DE * CBC-MAC : 66 4F 27 16 3E 36 0F 72 62 0D * CTR Start: 01 00 3E BE 94 04 4B 9A 3C 96 96 76 6C FA 00 01 * CTR[0001]: 0A 7E 0A 63 53 C8 CF 9E BC 3B 6E 63 15 9A D0 97 * CTR[0002]: EA 20 32 DA 27 82 6E 13 9E 1E 72 5C 5B 0D 3E BF * CTR[MAC ]: B9 31 27 CA F0 F1 A1 20 FA 70 * Total packet length = 42. 
[Authenticated and Encrypted Output] * 47 A6 5A C7 8B 3D 59 42 27 E8 5E 71 E8 82 F1 DB * D3 8C E3 ED A7 C2 3F 04 DD 65 07 1E B4 13 42 AC * DF 7E 00 DC CE C7 AE 52 98 7D */ static const uint8_t keys_23[] = { 0xD7, 0x82, 0x8D, 0x13, 0xB2, 0xB0, 0xBD, 0xC3, 0x25, 0xA7, 0x62, 0x36, 0xDF, 0x93, 0xCC, 0x6B }; static const uint8_t nonce_23[] = { 0x00, 0x3E, 0xBE, 0x94, 0x04, 0x4B, 0x9A, 0x3C, 0x96, 0x96, 0x76, 0x6C, 0xFA }; static const uint8_t packet_in_23[] = { 0x47, 0xA6, 0x5A, 0xC7, 0x8B, 0x3D, 0x59, 0x42, 0x27, 0xE8, 0x5E, 0x71, 0xE2, 0xFC, 0xFB, 0xB8, 0x80, 0x44, 0x2C, 0x73, 0x1B, 0xF9, 0x51, 0x67, 0xC8, 0xFF, 0xD7, 0x89, 0x5E, 0x33, 0x70, 0x76 }; static const uint8_t packet_out_23[] = { 0x47, 0xA6, 0x5A, 0xC7, 0x8B, 0x3D, 0x59, 0x42, 0x27, 0xE8, 0x5E, 0x71, 0xE8, 0x82, 0xF1, 0xDB, 0xD3, 0x8C, 0xE3, 0xED, 0xA7, 0xC2, 0x3F, 0x04, 0xDD, 0x65, 0x07, 0x1E, 0xB4, 0x13, 0x42, 0xAC, 0xDF, 0x7E, 0x00, 0xDC, 0xCE, 0xC7, 0xAE, 0x52, 0x98, 0x7D }; #define clear_len_23 12 #define auth_len_23 10 /* * =============== Packet Vector #24 ================== * AES Key = D7 82 8D 13 B2 B0 BD C3 25 A7 62 36 DF 93 CC 6B * Nonce = 00 8D 49 3B 30 AE 8B 3C 96 96 76 6C FA * Total packet length = 33. [Input with 12 cleartext header octets] * 6E 37 A6 EF 54 6D 95 5D 34 AB 60 59 AB F2 1C 0B * 02 FE B8 8F 85 6D F4 A3 73 81 BC E3 CC 12 85 17 * D4 * CBC IV in: 61 00 8D 49 3B 30 AE 8B 3C 96 96 76 6C FA 00 15 * CBC IV out:67 AC E4 E8 06 77 7A D3 27 1D 0B 93 4C 67 98 15 * After xor: 67 A0 8A DF A0 98 2E BE B2 40 3F 38 2C 3E 98 15 [hdr] * After AES: 35 58 F8 7E CA C2 B4 39 B6 7E 75 BB F1 5E 69 08 * After xor: 9E AA E4 75 C8 3C 0C B6 33 13 81 18 82 DF D5 EB [msg] * After AES: 54 E4 7B 62 22 F0 BB 87 17 D0 71 6A EB AF 19 9E * After xor: 98 F6 FE 75 F6 F0 BB 87 17 D0 71 6A EB AF 19 9E [msg] * After AES: 23 E3 30 50 BC 57 DC 2C 3D 3E 7C 94 77 D1 49 71 * CBC-MAC : 23 E3 30 50 BC 57 DC 2C 3D 3E * CTR Start: 01 00 8D 49 3B 30 AE 8B 3C 96 96 76 6C FA 00 01 * CTR[0001]: 58 DB 19 B3 88 9A A3 8B 3C A4 0B 16 FF 42 2C 73 * CTR[0002]: C3 2F 24 3D 65 DC 7E 9F 4B 02 16 AB 7F B9 6B 4D * CTR[MAC ]: 4E 2D AE D2 53 F6 B1 8A 1D 67 * Total packet length = 43. 
[Authenticated and Encrypted Output] * 6E 37 A6 EF 54 6D 95 5D 34 AB 60 59 F3 29 05 B8 * 8A 64 1B 04 B9 C9 FF B5 8C C3 90 90 0F 3D A1 2A * B1 6D CE 9E 82 EF A1 6D A6 20 59 */ static const uint8_t keys_24[] = { 0xD7, 0x82, 0x8D, 0x13, 0xB2, 0xB0, 0xBD, 0xC3, 0x25, 0xA7, 0x62, 0x36, 0xDF, 0x93, 0xCC, 0x6B }; static const uint8_t nonce_24[] = { 0x00, 0x8D, 0x49, 0x3B, 0x30, 0xAE, 0x8B, 0x3C, 0x96, 0x96, 0x76, 0x6C, 0xFA }; static const uint8_t packet_in_24[] = { 0x6E, 0x37, 0xA6, 0xEF, 0x54, 0x6D, 0x95, 0x5D, 0x34, 0xAB, 0x60, 0x59, 0xAB, 0xF2, 0x1C, 0x0B, 0x02, 0xFE, 0xB8, 0x8F, 0x85, 0x6D, 0xF4, 0xA3, 0x73, 0x81, 0xBC, 0xE3, 0xCC, 0x12, 0x85, 0x17, 0xD4 }; static const uint8_t packet_out_24[] = { 0x6E, 0x37, 0xA6, 0xEF, 0x54, 0x6D, 0x95, 0x5D, 0x34, 0xAB, 0x60, 0x59, 0xF3, 0x29, 0x05, 0xB8, 0x8A, 0x64, 0x1B, 0x04, 0xB9, 0xC9, 0xFF, 0xB5, 0x8C, 0xC3, 0x90, 0x90, 0x0F, 0x3D, 0xA1, 0x2A, 0xB1, 0x6D, 0xCE, 0x9E, 0x82, 0xEF, 0xA1, 0x6D, 0xA6, 0x20, 0x59 }; #define clear_len_24 12 #define auth_len_24 10 /** Additional AES-CCM-128 test vectors */ static const uint8_t keys_90[] = { 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F }; static const uint8_t nonce_90[] = { 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16 }; static const uint8_t packet_in_90[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x20, 0x21, 0x22, 0x23 }; static const uint8_t packet_out_90[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x71, 0x62, 0x01, 0x5B, 0x4D, 0xAC, 0x25, 0x5D }; #define clear_len_90 8 #define auth_len_90 4 static const uint8_t keys_91[] = { 0xC9, 0x7C, 0x1F, 0x67, 0xCE, 0x37, 0x11, 0x85, 0x51, 0x4A, 0x8A, 0x19, 0xF2, 0xBD, 0xD5, 0x2F }; static const uint8_t nonce_91[] = { 0x00, 0x50, 0x30, 0xF1, 0x84, 0x44, 0x08, 0xB5, 0x03, 0x97, 0x76, 0xE7, 0x0C }; static const uint8_t packet_in_91[] = { 0x08, 0x40, 0x0F, 0xD2, 0xE1, 0x28, 0xA5, 0x7C, 0x50, 0x30, 0xF1, 0x84, 0x44, 0x08, 0xAB, 0xAE, 0xA5, 0xB8, 0xFC, 0xBA, 0x00, 0x00, 0xF8, 0xBA, 0x1A, 0x55, 0xD0, 0x2F, 0x85, 0xAE, 0x96, 0x7B, 0xB6, 0x2F, 0xB6, 0xCD, 0xA8, 0xEB, 0x7E, 0x78, 0xA0, 0x50 }; static const uint8_t packet_out_91[] = { 0x08, 0x40, 0x0F, 0xD2, 0xE1, 0x28, 0xA5, 0x7C, 0x50, 0x30, 0xF1, 0x84, 0x44, 0x08, 0xAB, 0xAE, 0xA5, 0xB8, 0xFC, 0xBA, 0x00, 0x00, 0xF3, 0xD0, 0xA2, 0xFE, 0x9A, 0x3D, 0xBF, 0x23, 0x42, 0xA6, 0x43, 0xE4, 0x32, 0x46, 0xE8, 0x0C, 0x3C, 0x04, 0xD0, 0x19, 0x78, 0x45, 0xCE, 0x0B, 0x16, 0xF9, 0x76, 0x23 }; #define clear_len_91 22 #define auth_len_91 8 static const uint8_t keys_92[] = { 0xC9, 0x7C, 0x1F, 0x67, 0xCE, 0x37, 0x11, 0x85, 0x51, 0x4A, 0x8A, 0x19, 0xF2, 0xBD, 0xD5, 0x2F }; static const uint8_t nonce_92[] = { 0x00, 0x50, 0x30, 0xF1, 0x84, 0x44, 0x08, 0xB5, 0x03, 0x97, 0x76, 0xE7, 0x0C }; static const uint8_t packet_in_92[] = { 0xF8, 0xBA, 0x1A, 0x55, 0xD0, 0x2F, 0x85, 0xAE, 0x96, 0x7B, 0xB6, 0x2F, 0xB6, 0xCD, 0xA8, 0xEB, 0x7E, 0x78, 0xA0, 0x50 }; static const uint8_t packet_out_92[] = { 0xF3, 0xD0, 0xA2, 0xFE, 0x9A, 0x3D, 0xBF, 0x23, 0x42, 0xA6, 0x43, 0xE4, 0x32, 0x46, 0xE8, 0x0C, 0x3C, 0x04, 0xD0, 0x19, 0x41, 0x83, 0x21, 0x89, 0xA3, 0xD3, 0x1B, 0x43 }; #define clear_len_92 0 #define auth_len_92 8 static const uint8_t keys_100[] = { 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F }; static const uint8_t nonce_100[] = { 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16 }; static const uint8_t packet_in_100[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x20, 0x21, 0x22, 0x23, }; static const uint8_t packet_out_100[] = { 0x00, 0x01, 
0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x71, 0x62, 0x01, 0x5B, 0xB0, 0xC9, 0x5E, 0x58, 0x03, 0x6E }; #define clear_len_100 8 #define auth_len_100 6 static const uint8_t keys_101[] = { 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F }; static const uint8_t nonce_101[] = { 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16 }; static const uint8_t packet_in_101[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x20, 0x21, 0x22, 0x23, }; static const uint8_t packet_out_101[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x71, 0x62, 0x01, 0x5B, 0xD0, 0xAD, 0x86, 0xFD, 0x33, 0xC2, 0x69, 0x86 }; #define clear_len_101 8 #define auth_len_101 8 static const uint8_t keys_102[] = { 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F }; static const uint8_t nonce_102[] = { 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16 }; static const uint8_t packet_in_102[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x20, 0x21, 0x22, 0x23, }; static const uint8_t packet_out_102[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x71, 0x62, 0x01, 0x5B, 0x05, 0x12, 0xDA, 0xBF, 0xD9, 0x72, 0xA6, 0x68, 0x53, 0xC1 }; #define clear_len_102 8 #define auth_len_102 10 static const uint8_t keys_103[] = { 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F }; static const uint8_t nonce_103[] = { 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16 }; static const uint8_t packet_in_103[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x20, 0x21, 0x22, 0x23, }; static const uint8_t packet_out_103[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x71, 0x62, 0x01, 0x5B, 0xBA, 0x03, 0xBF, 0x8C, 0xE0, 0xD6, 0x00, 0xA4, 0x48, 0x6F, 0xCC, 0xB3 }; #define clear_len_103 8 #define auth_len_103 12 static const uint8_t keys_104[] = { 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F }; static const uint8_t nonce_104[] = { 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16 }; static const uint8_t packet_in_104[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x20, 0x21, 0x22, 0x23, }; static const uint8_t packet_out_104[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x71, 0x62, 0x01, 0x5B, 0x6B, 0x9B, 0xFB, 0xFE, 0xA8, 0x2C, 0x04, 0x77, 0x8E, 0x67, 0xF5, 0x18, 0x46, 0xC6 }; #define clear_len_104 8 #define auth_len_104 14 static const uint8_t keys_105[] = { 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F }; static const uint8_t nonce_105[] = { 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16 }; static const uint8_t packet_in_105[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x20, 0x21, 0x22, 0x23, }; static const uint8_t packet_out_105[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x71, 0x62, 0x01, 0x5B, 0x2B, 0xB5, 0x7C, 0x0A, 0xF4, 0x5E, 0x4D, 0x83, 0x04, 0xF0, 0x5F, 0x45, 0x99, 0x3F, 0x15, 0x17 }; #define clear_len_105 8 #define auth_len_105 16 #define CCM_TEST_VEC(num) \ { keys_##num, nonce_##num, sizeof(nonce_##num), \ packet_in_##num, sizeof(packet_in_##num), \ clear_len_##num, packet_out_##num, \ auth_len_##num } static const struct ccm_rfc3610_vector { const uint8_t *keys; const uint8_t *nonce; const size_t nonce_len; /* packet in = [ AAD | plain text ] */ const uint8_t *packet_in; const size_t packet_len; const size_t clear_len; /* packet out = [ AAD | cipher text | authentication tag ] */ const uint8_t *packet_out; const size_t auth_len; } ccm_vectors[] = { CCM_TEST_VEC(01), CCM_TEST_VEC(02), 
	CCM_TEST_VEC(03), CCM_TEST_VEC(04), CCM_TEST_VEC(05),
	CCM_TEST_VEC(06), CCM_TEST_VEC(07), CCM_TEST_VEC(08),
	CCM_TEST_VEC(09), CCM_TEST_VEC(10), CCM_TEST_VEC(11),
	CCM_TEST_VEC(12), CCM_TEST_VEC(13), CCM_TEST_VEC(14),
	CCM_TEST_VEC(15), CCM_TEST_VEC(16), CCM_TEST_VEC(17),
	CCM_TEST_VEC(18), CCM_TEST_VEC(19), CCM_TEST_VEC(20),
	CCM_TEST_VEC(21), CCM_TEST_VEC(22), CCM_TEST_VEC(23),
	CCM_TEST_VEC(24),
	CCM_TEST_VEC(90), CCM_TEST_VEC(91), CCM_TEST_VEC(92),
	CCM_TEST_VEC(100), CCM_TEST_VEC(101), CCM_TEST_VEC(102),
	CCM_TEST_VEC(103), CCM_TEST_VEC(104), CCM_TEST_VEC(105),
};

#ifdef _WIN32
#define snprintf _snprintf
#endif

static void
hexdump(FILE *fp, const char *msg, const void *p, size_t len)
{
	unsigned int i, out, ofs;
	const unsigned char *data = p;

	fprintf(fp, "%s\n", msg);

	ofs = 0;
	while (ofs < len) {
		char line[120];

		out = snprintf(line, sizeof(line), "%08x:", ofs);
		for (i = 0; ((ofs + i) < len) && (i < 16); i++)
			out += snprintf(line + out, sizeof(line) - out,
					" %02x", (data[ofs + i] & 0xff));
		for (; i <= 16; i++)
			out += snprintf(line + out, sizeof(line) - out, " | ");
		for (i = 0; (ofs < len) && (i < 16); i++, ofs++) {
			unsigned char c = data[ofs];

			if ((c < ' ') || (c > '~'))
				c = '.';
			out += snprintf(line + out, sizeof(line) - out,
					"%c", c);
		}
		fprintf(fp, "%s\n", line);
	}
}
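/*
 * Illustrative sketch only (not part of the library API, not used by the
 * tests below): shows how the CCM B_0 block and the initial CTR block are
 * derived from the nonce, tag length and payload length, i.e. how the
 * "CBC IV in:" and "CTR Start:" rows in the vector comments above come
 * about.  Per RFC 3610 the B_0 flags byte is
 * 0x40 (AAD present) | ((tag_len - 2) / 2) << 3 | (L - 1), which is why
 * the 8-byte-tag vectors start with 0x59 and the 10-byte-tag vectors with
 * 0x61.  The function name is made up for this example.
 */
static void
ccm_block0_sketch(uint8_t b0[16], uint8_t ctr0[16],
                  const uint8_t *nonce, const size_t nonce_len,
                  const size_t aad_len, const size_t tag_len,
                  const size_t msg_len)
{
        /* L = number of bytes used to encode the payload length */
        const size_t L = 15 - nonce_len;
        size_t i;

        memset(b0, 0, 16);
        memset(ctr0, 0, 16);

        /* B_0 flags: AAD bit, encoded tag length, encoded L */
        b0[0] = (uint8_t) (((aad_len > 0) ? 0x40 : 0x00) |
                           (((tag_len - 2) / 2) << 3) |
                           (L - 1));
        /* A_i flags only carry the encoded L */
        ctr0[0] = (uint8_t) (L - 1);

        memcpy(&b0[1], nonce, nonce_len);
        memcpy(&ctr0[1], nonce, nonce_len);

        /* big-endian payload length in the last L bytes of B_0 */
        for (i = 0; i < L; i++)
                b0[15 - i] = (uint8_t) (msg_len >> (8 * i));

        /*
         * Counter value 0 encrypts the CBC-MAC tag; counter value 1 is the
         * first payload key stream block, which is what the vector comments
         * list as "CTR Start".
         */
        ctr0[15] = 1;
}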
static int
ccm_job_ok(const struct ccm_rfc3610_vector *vec,
           const struct JOB_AES_HMAC *job,
           const uint8_t *target,
           const uint8_t *padding,
           const uint8_t *auth,
           const size_t sizeof_padding,
           const int dir,
           const int in_place)
{
        if (job->status != STS_COMPLETED) {
                printf("%d Error status:%d", __LINE__, job->status);
                return 0;
        }

        /* cipher checks */
        if (in_place) {
                if (dir == ENCRYPT) {
                        if (memcmp(vec->packet_out, target + sizeof_padding,
                                   vec->packet_len)) {
                                printf("cipher mismatched\n");
                                hexdump(stderr, "Received",
                                        target + sizeof_padding,
                                        vec->packet_len);
                                hexdump(stderr, "Expected", vec->packet_out,
                                        vec->packet_len);
                                return 0;
                        }
                } else {
                        if (memcmp(vec->packet_in, target + sizeof_padding,
                                   vec->packet_len)) {
                                printf("cipher mismatched\n");
                                hexdump(stderr, "Received",
                                        target + sizeof_padding,
                                        vec->packet_len);
                                hexdump(stderr, "Expected", vec->packet_in,
                                        vec->packet_len);
                                return 0;
                        }
                }
        } else { /* out-of-place */
                if (dir == ENCRYPT) {
                        if (memcmp(vec->packet_out + vec->clear_len,
                                   target + sizeof_padding,
                                   vec->packet_len - vec->clear_len)) {
                                printf("cipher mismatched\n");
                                hexdump(stderr, "Received",
                                        target + sizeof_padding,
                                        vec->packet_len - vec->clear_len);
                                hexdump(stderr, "Expected",
                                        vec->packet_out + vec->clear_len,
                                        vec->packet_len - vec->clear_len);
                                return 0;
                        }
                } else {
                        if (memcmp(vec->packet_in + vec->clear_len,
                                   target + sizeof_padding,
                                   vec->packet_len - vec->clear_len)) {
                                printf("cipher mismatched\n");
                                hexdump(stderr, "Received",
                                        target + sizeof_padding,
                                        vec->packet_len - vec->clear_len);
                                hexdump(stderr, "Expected",
                                        vec->packet_in + vec->clear_len,
                                        vec->packet_len - vec->clear_len);
                                return 0;
                        }
                }
        }

        if (memcmp(padding, target, sizeof_padding)) {
                printf("cipher overwrite head\n");
                hexdump(stderr, "Target", target, sizeof_padding);
                return 0;
        }

        if (in_place) {
                if (memcmp(padding, target + sizeof_padding + vec->packet_len,
                           sizeof_padding)) {
                        printf("cipher overwrite tail\n");
                        hexdump(stderr, "Target",
                                target + sizeof_padding + vec->packet_len,
                                sizeof_padding);
                        return 0;
                }
        } else {
                if (memcmp(padding, target + sizeof_padding +
                           vec->packet_len - vec->clear_len,
                           sizeof_padding)) {
                        printf("cipher overwrite tail\n");
                        hexdump(stderr, "Target", target + sizeof_padding +
                                vec->packet_len - vec->clear_len,
                                sizeof_padding);
                        return 0;
                }
        }

        /* hash checks */
        if (memcmp(padding, &auth[sizeof_padding + vec->auth_len],
                   sizeof_padding)) {
                printf("hash overwrite tail\n");
                hexdump(stderr, "Target",
                        &auth[sizeof_padding + vec->auth_len],
                        sizeof_padding);
                return 0;
        }

        if (memcmp(padding, &auth[0], sizeof_padding)) {
                printf("hash overwrite head\n");
                hexdump(stderr, "Target", &auth[0], sizeof_padding);
                return 0;
        }

        if (memcmp(vec->packet_out + vec->packet_len, &auth[sizeof_padding],
                   vec->auth_len)) {
                printf("hash mismatched\n");
                hexdump(stderr, "Received", &auth[sizeof_padding],
                        vec->auth_len);
                hexdump(stderr, "Expected",
                        vec->packet_out + vec->packet_len,
                        vec->auth_len);
                return 0;
        }

        return 1;
}
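/*
 * Illustrative sketch only (not part of the library API, not used by the
 * tests below): the AAD (the "clear_len" header bytes of each vector) is
 * mixed into the CBC-MAC by prefixing it with its length.  For the AAD
 * sizes used in these vectors the prefix is a 2-byte big-endian length,
 * followed by the AAD and zero padding up to the 16-byte block boundary;
 * XOR-ing this block into E(K, B_0) reproduces the "[hdr]" rows of the
 * vector comments above.  Only the first such block is built here and the
 * function name is made up for this example.
 */
static size_t
ccm_format_aad_sketch(uint8_t block[16], const uint8_t *aad,
                      const size_t aad_len)
{
        /* at most 14 AAD bytes fit next to the 2-byte length prefix */
        const size_t copy_len = (aad_len > 14) ? 14 : aad_len;

        memset(block, 0, 16);
        block[0] = (uint8_t) (aad_len >> 8);
        block[1] = (uint8_t) (aad_len & 0xFF);
        memcpy(&block[2], aad, copy_len);

        /* number of AAD bytes consumed by this first block */
        return copy_len;
}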
static int
test_ccm(struct MB_MGR *mb_mgr,
         const struct ccm_rfc3610_vector *vec,
         const int dir, const int in_place, const int num_jobs)
{
        DECLARE_ALIGNED(uint32_t expkey[4*15], 16);
        DECLARE_ALIGNED(uint32_t dust[4*15], 16);
        struct JOB_AES_HMAC *job;
        uint8_t padding[16];
        uint8_t **targets = malloc(num_jobs * sizeof(void *));
        uint8_t **auths = malloc(num_jobs * sizeof(void *));
        int i = 0, jobs_rx = 0, ret = -1;
        const int order = (dir == ENCRYPT) ? CIPHER_HASH : HASH_CIPHER;

        if (targets == NULL || auths == NULL) {
                fprintf(stderr, "Can't allocate buffer memory\n");
                goto end2;
        }

        memset(padding, -1, sizeof(padding));
        memset(targets, 0, num_jobs * sizeof(void *));
        memset(auths, 0, num_jobs * sizeof(void *));

        for (i = 0; i < num_jobs; i++) {
                targets[i] = malloc(vec->packet_len + (sizeof(padding) * 2));
                auths[i] = malloc(16 + (sizeof(padding) * 2));
                if (targets[i] == NULL || auths[i] == NULL) {
                        fprintf(stderr, "Can't allocate buffer memory\n");
                        goto end;
                }

                memset(targets[i], -1,
                       vec->packet_len + (sizeof(padding) * 2));
                memset(auths[i], -1, 16 + (sizeof(padding) * 2));

                if (in_place) {
                        if (dir == ENCRYPT)
                                memcpy(targets[i] + sizeof(padding),
                                       vec->packet_in, vec->packet_len);
                        else
                                memcpy(targets[i] + sizeof(padding),
                                       vec->packet_out, vec->packet_len);
                }
        }

        IMB_AES_KEYEXP_128(mb_mgr, vec->keys, expkey, dust);

        while ((job = IMB_FLUSH_JOB(mb_mgr)) != NULL)
                ;

        for (i = 0; i < num_jobs; i++) {
                job = IMB_GET_NEXT_JOB(mb_mgr);
                job->cipher_direction = dir;
                job->chain_order = order;

                if (in_place) {
                        job->dst =
                                targets[i] + sizeof(padding) + vec->clear_len;
                        job->src = targets[i] + sizeof(padding);
                } else {
                        if (dir == ENCRYPT) {
                                job->dst = targets[i] + sizeof(padding);
                                job->src = vec->packet_in;
                        } else {
                                job->dst = targets[i] + sizeof(padding);
                                job->src = vec->packet_out;
                        }
                }

                job->cipher_mode = CCM;
                job->aes_enc_key_expanded = expkey;
                job->aes_dec_key_expanded = expkey;
                job->aes_key_len_in_bytes = 16; /* AES-CCM-128 for now */
                job->iv = vec->nonce;
                job->iv_len_in_bytes = vec->nonce_len;
                job->cipher_start_src_offset_in_bytes = vec->clear_len;
                job->msg_len_to_cipher_in_bytes =
                        vec->packet_len - vec->clear_len;

                job->hash_alg = AES_CCM;
                job->hash_start_src_offset_in_bytes = vec->clear_len;
                job->msg_len_to_hash_in_bytes =
                        vec->packet_len - vec->clear_len;
                job->auth_tag_output = auths[i] + sizeof(padding);
                job->auth_tag_output_len_in_bytes = vec->auth_len;

                job->u.CCM.aad_len_in_bytes = vec->clear_len;
                job->u.CCM.aad = job->src;

                job->user_data = targets[i];
                job->user_data2 = auths[i];

                job = IMB_SUBMIT_JOB(mb_mgr);
                if (job) {
                        jobs_rx++;
                        if (num_jobs < 4) {
                                printf("%d Unexpected return from "
                                       "submit_job\n", __LINE__);
                                goto end;
                        }
                        if (!ccm_job_ok(vec, job, job->user_data, padding,
                                        job->user_data2, sizeof(padding),
                                        dir, in_place))
                                goto end;
                }
        }

        while ((job = IMB_FLUSH_JOB(mb_mgr)) != NULL) {
                jobs_rx++;
                if (!ccm_job_ok(vec, job, job->user_data, padding,
                                job->user_data2, sizeof(padding),
                                dir, in_place))
                        goto end;
        }

        if (jobs_rx != num_jobs) {
                printf("Expected %d jobs, received %d\n", num_jobs, jobs_rx);
                goto end;
        }
        ret = 0;

 end:
        for (i = 0; i < num_jobs; i++) {
                if (targets[i] != NULL)
                        free(targets[i]);
                if (auths[i] != NULL)
                        free(auths[i]);
        }

 end2:
        if (targets != NULL)
                free(targets);
        if (auths != NULL)
                free(auths);
        return ret;
}

static int
test_ccm_std_vectors(struct MB_MGR *mb_mgr, const int num_jobs)
{
        const int vectors_cnt = sizeof(ccm_vectors) / sizeof(ccm_vectors[0]);
        int vect;
        int errors = 0;

        printf("AES-CCM standard test vectors (N jobs = %d):\n", num_jobs);
        for (vect = 1; vect <= vectors_cnt; vect++) {
                const int idx = vect - 1;
#ifdef DEBUG
                printf("Standard vector [%d/%d] NONCELen:%d PktLen:%d "
                       "AADLen:%d AUTHlen:%d\n",
                       vect, vectors_cnt,
                       (int) ccm_vectors[idx].nonce_len,
                       (int) ccm_vectors[idx].packet_len,
                       (int) ccm_vectors[idx].clear_len,
                       (int) ccm_vectors[idx].auth_len);
#else
                printf(".");
#endif

                if (test_ccm(mb_mgr, &ccm_vectors[idx], ENCRYPT,
                             1, num_jobs)) {
                        printf("error #%d encrypt in-place\n", vect);
                        errors++;
                }

                if (test_ccm(mb_mgr, &ccm_vectors[idx], DECRYPT,
                             1, num_jobs)) {
                        printf("error #%d decrypt in-place\n", vect);
                        errors++;
                }

                if (test_ccm(mb_mgr, &ccm_vectors[idx], ENCRYPT,
                             0, num_jobs)) {
                        printf("error #%d encrypt out-of-place\n", vect);
                        errors++;
                }

                if (test_ccm(mb_mgr, &ccm_vectors[idx], DECRYPT,
                             0, num_jobs)) {
                        printf("error #%d decrypt out-of-place\n", vect);
                        errors++;
                }
        }
        printf("\n");
        return errors;
}

int
ccm_test(const enum arch_type arch, struct MB_MGR *mb_mgr)
{
        int errors = 0;

        errors += test_ccm_std_vectors(mb_mgr, 1);
        errors += test_ccm_std_vectors(mb_mgr, 3);
        errors += test_ccm_std_vectors(mb_mgr, 4);
        errors += test_ccm_std_vectors(mb_mgr, 5);
        errors += test_ccm_std_vectors(mb_mgr, 7);
        errors += test_ccm_std_vectors(mb_mgr, 8);
        errors += test_ccm_std_vectors(mb_mgr, 9);

        if (0 == errors)
                printf("...Pass\n");
        else
                printf("...Fail\n");

        return errors;
}
intel-ipsec-mb-0.48/LibTestApp/ctr_test.c000066400000000000000000000622221321406316400202630ustar00rootroot00000000000000/*****************************************************************************
 Copyright (c) 2017, Intel Corporation

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:

     * Redistributions of source code must retain the above copyright notice,
       this list of conditions and the following disclaimer.
     * Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
     * Neither the name of Intel Corporation nor the names of its contributors
       may be used to endorse or promote products derived from this software
       without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include #include #include #include #include #include #include "gcm_ctr_vectors_test.h" /* * Test Vector from * https://tools.ietf.org/html/rfc3686 * */ /* Test Vector #1: Encrypting 16 octets using AES-CTR with 128-bit key AES Key : AE 68 52 F8 12 10 67 CC 4B F7 A5 76 55 77 F3 9E AES-CTR IV : 00 00 00 00 00 00 00 00 Nonce : 00 00 00 30 Plaintext String : 'Single block msg' Plaintext : 53 69 6E 67 6C 65 20 62 6C 6F 63 6B 20 6D 73 67 Counter Block (1): 00 00 00 30 00 00 00 00 00 00 00 00 00 00 00 01 Key Stream (1): B7 60 33 28 DB C2 93 1B 41 0E 16 C8 06 7E 62 DF Ciphertext : E4 09 5D 4F B7 A7 B3 79 2D 61 75 A3 26 13 11 B8 */ static uint8_t K1_CTR[] = { 0xAE, 0x68, 0x52, 0xF8, 0x12, 0x10, 0x67, 0xCC, 0x4B, 0xF7, 0xA5, 0x76, 0x55, 0x77, 0xF3, 0x9E, }; static uint8_t IV1_CTR[] = { 0x00, 0x00, 0x00, 0x30, /* nonce */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; static uint8_t P1_CTR[] = { 0x53, 0x69, 0x6E, 0x67, 0x6C, 0x65, 0x20, 0x62, 0x6C, 0x6F, 0x63, 0x6B, 0x20, 0x6D, 0x73, 0x67, }; static uint8_t C1_CTR[] = { 0xE4, 0x09, 0x5D, 0x4F, 0xB7, 0xA7, 0xB3, 0x79, 0x2D, 0x61, 0x75, 0xA3, 0x26, 0x13, 0x11, 0xB8, }; static uint8_t T1_CTR[] = { 0 }; static uint8_t A1_CTR[] = { 0 }; #define A1_CTR_len 0 /* Test Vector #2: Encrypting 32 octets using AES-CTR with 128-bit key AES Key : 7E 24 06 78 17 FA E0 D7 43 D6 CE 1F 32 53 91 63 AES-CTR IV : C0 54 3B 59 DA 48 D9 0B Nonce : 00 6C B6 DB Plaintext : 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F : 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F Counter Block (1): 00 6C B6 DB C0 54 3B 59 DA 48 D9 0B 00 00 00 01 Key Stream (1): 51 05 A3 05 12 8F 74 DE 71 04 4B E5 82 D7 DD 87 Counter Block (2): 00 6C B6 DB C0 54 3B 59 DA 48 D9 0B 00 00 00 02 Key Stream (2): FB 3F 0C EF 52 CF 41 DF E4 FF 2A C4 8D 5C A0 37 Ciphertext : 51 04 A1 06 16 8A 72 D9 79 0D 41 EE 8E DA D3 88 : EB 2E 1E FC 46 DA 57 C8 FC E6 30 DF 91 41 BE 28 */ static uint8_t K2_CTR[] = { 0x7E, 0x24, 0x06, 0x78, 0x17, 0xFA, 0xE0, 0xD7, 0x43, 0xD6, 0xCE, 0x1F, 0x32, 0x53, 0x91, 0x63, }; static uint8_t IV2_CTR[] = { 0x00, 0x6C, 0xB6, 0xDB, /* nonce */ 0xC0, 0x54, 0x3B, 0x59, 0xDA, 0x48, 0xD9, 0x0B, }; static uint8_t P2_CTR[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, }; static uint8_t C2_CTR[] = { 0x51, 0x04, 0xA1, 0x06, 0x16, 0x8A, 0x72, 0xD9, 0x79, 0x0D, 0x41, 0xEE, 0x8E, 0xDA, 0xD3, 0x88, 0xEB, 0x2E, 0x1E, 0xFC, 0x46, 0xDA, 0x57, 0xC8, 0xFC, 0xE6, 0x30, 0xDF, 0x91, 0x41, 0xBE, 0x28, }; static uint8_t T2_CTR[] = { 0 }; static uint8_t A2_CTR[] = { 0 }; #define A2_CTR_len 0 /* Test Vector #3: Encrypting 36 octets using AES-CTR with 128-bit key AES Key : 76 91 BE 03 5E 50 20 A8 AC 6E 61 85 29 F9 A0 DC AES-CTR IV : 27 77 7F 3F 4A 17 86 F0 Nonce : 00 E0 01 7B Plaintext : 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F : 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F : 
20 21 22 23 Counter Block (1): 00 E0 01 7B 27 77 7F 3F 4A 17 86 F0 00 00 00 01 Key Stream (1): C1 CE 4A AB 9B 2A FB DE C7 4F 58 E2 E3 D6 7C D8 Counter Block (2): 00 E0 01 7B 27 77 7F 3F 4A 17 86 F0 00 00 00 02 Key Stream (2): 55 51 B6 38 CA 78 6E 21 CD 83 46 F1 B2 EE 0E 4C Counter Block (3): 00 E0 01 7B 27 77 7F 3F 4A 17 86 F0 00 00 00 03 Key Stream (3): 05 93 25 0C 17 55 36 00 A6 3D FE CF 56 23 87 E9 Ciphertext : C1 CF 48 A8 9F 2F FD D9 CF 46 52 E9 EF DB 72 D7 : 45 40 A4 2B DE 6D 78 36 D5 9A 5C EA AE F3 10 53 : 25 B2 07 2F */ static uint8_t K3_CTR[] = { 0x76, 0x91, 0xBE, 0x03, 0x5E, 0x50, 0x20, 0xA8, 0xAC, 0x6E, 0x61, 0x85, 0x29, 0xF9, 0xA0, 0xDC, }; static uint8_t IV3_CTR[] = { 0x00, 0xE0, 0x01, 0x7B, /* nonce */ 0x27, 0x77, 0x7F, 0x3F, 0x4A, 0x17, 0x86, 0xF0, }; static uint8_t P3_CTR[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23, }; static uint8_t C3_CTR[] = { 0xC1, 0xCF, 0x48, 0xA8, 0x9F, 0x2F, 0xFD, 0xD9, 0xCF, 0x46, 0x52, 0xE9, 0xEF, 0xDB, 0x72, 0xD7, 0x45, 0x40, 0xA4, 0x2B, 0xDE, 0x6D, 0x78, 0x36, 0xD5, 0x9A, 0x5C, 0xEA, 0xAE, 0xF3, 0x10, 0x53, 0x25, 0xB2, 0x07, 0x2F, }; static uint8_t T3_CTR[] = { 0 }; static uint8_t A3_CTR[] = { 0 }; #define A3_CTR_len 0 /* Test Vector #4: Encrypting 16 octets using AES-CTR with 192-bit key AES Key : 16 AF 5B 14 5F C9 F5 79 C1 75 F9 3E 3B FB 0E ED : 86 3D 06 CC FD B7 85 15 AES-CTR IV : 36 73 3C 14 7D 6D 93 CB Nonce : 00 00 00 48 Plaintext String : 'Single block msg' Plaintext : 53 69 6E 67 6C 65 20 62 6C 6F 63 6B 20 6D 73 67 Counter Block (1): 00 00 00 48 36 73 3C 14 7D 6D 93 CB 00 00 00 01 Key Stream (1): 18 3C 56 28 8E 3C E9 AA 22 16 56 CB 23 A6 9A 4F Ciphertext : 4B 55 38 4F E2 59 C9 C8 4E 79 35 A0 03 CB E9 28 */ static uint8_t K4_CTR[] = { 0x16, 0xAF, 0x5B, 0x14, 0x5F, 0xC9, 0xF5, 0x79, 0xC1, 0x75, 0xF9, 0x3E, 0x3B, 0xFB, 0x0E, 0xED, 0x86, 0x3D, 0x06, 0xCC, 0xFD, 0xB7, 0x85, 0x15, }; static uint8_t IV4_CTR[] = { 0x00, 0x00, 0x00, 0x48, /* nonce */ 0x36, 0x73, 0x3C, 0x14, 0x7D, 0x6D, 0x93, 0xCB, }; static uint8_t P4_CTR[] = { 0x53, 0x69, 0x6E, 0x67, 0x6C, 0x65, 0x20, 0x62, 0x6C, 0x6F, 0x63, 0x6B, 0x20, 0x6D, 0x73, 0x67, }; static uint8_t C4_CTR[] = { 0x4B, 0x55, 0x38, 0x4F, 0xE2, 0x59, 0xC9, 0xC8, 0x4E, 0x79, 0x35, 0xA0, 0x03, 0xCB, 0xE9, 0x28, }; static uint8_t T4_CTR[] = { 0 }; static uint8_t A4_CTR[] = { 0 }; #define A4_CTR_len 0 /* Test Vector #5: Encrypting 32 octets using AES-CTR with 192-bit key AES Key : 7C 5C B2 40 1B 3D C3 3C 19 E7 34 08 19 E0 F6 9C : 67 8C 3D B8 E6 F6 A9 1A AES-CTR IV : 02 0C 6E AD C2 CB 50 0D Nonce : 00 96 B0 3B Plaintext : 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F : 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F Counter Block (1): 00 96 B0 3B 02 0C 6E AD C2 CB 50 0D 00 00 00 01 Key Stream (1): 45 33 41 FF 64 9E 25 35 76 D6 A0 F1 7D 3C C3 90 Counter Block (2): 00 96 B0 3B 02 0C 6E AD C2 CB 50 0D 00 00 00 02 Key Stream (2): 94 81 62 0F 4E C1 B1 8B E4 06 FA E4 5E E9 E5 1F Ciphertext : 45 32 43 FC 60 9B 23 32 7E DF AA FA 71 31 CD 9F : 84 90 70 1C 5A D4 A7 9C FC 1F E0 FF 42 F4 FB 00 */ static uint8_t K5_CTR[] = { 0x7C, 0x5C, 0xB2, 0x40, 0x1B, 0x3D, 0xC3, 0x3C, 0x19, 0xE7, 0x34, 0x08, 0x19, 0xE0, 0xF6, 0x9C, 0x67, 0x8C, 0x3D, 0xB8, 0xE6, 0xF6, 0xA9, 0x1A, }; static uint8_t IV5_CTR[] = { 0x00, 0x96, 0xB0, 0x3B, /* nonce */ 0x02, 0x0C, 0x6E, 0xAD, 0xC2, 0xCB, 0x50, 0x0D, }; static uint8_t P5_CTR[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 
0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, }; static uint8_t C5_CTR[] = { 0x45, 0x32, 0x43, 0xFC, 0x60, 0x9B, 0x23, 0x32, 0x7E, 0xDF, 0xAA, 0xFA, 0x71, 0x31, 0xCD, 0x9F, 0x84, 0x90, 0x70, 0x1C, 0x5A, 0xD4, 0xA7, 0x9C, 0xFC, 0x1F, 0xE0, 0xFF, 0x42, 0xF4, 0xFB, 0x00, }; static uint8_t T5_CTR[] = { 0 }; static uint8_t A5_CTR[] = { 0 }; #define A5_CTR_len 0 /* Test Vector #6: Encrypting 36 octets using AES-CTR with 192-bit key AES Key : 02 BF 39 1E E8 EC B1 59 B9 59 61 7B 09 65 27 9B : F5 9B 60 A7 86 D3 E0 FE AES-CTR IV : 5C BD 60 27 8D CC 09 12 Nonce : 00 07 BD FD Plaintext : 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F : 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F : 20 21 22 23 Counter Block (1): 00 07 BD FD 5C BD 60 27 8D CC 09 12 00 00 00 01 Key Stream (1): 96 88 3D C6 5A 59 74 28 5C 02 77 DA D1 FA E9 57 Counter Block (2): 00 07 BD FD 5C BD 60 27 8D CC 09 12 00 00 00 02 Key Stream (2): C2 99 AE 86 D2 84 73 9F 5D 2F D2 0A 7A 32 3F 97 Counter Block (3): 00 07 BD FD 5C BD 60 27 8D CC 09 12 00 00 00 03 Key Stream (3): 8B CF 2B 16 39 99 B2 26 15 B4 9C D4 FE 57 39 98 Ciphertext : 96 89 3F C5 5E 5C 72 2F 54 0B 7D D1 DD F7 E7 58 : D2 88 BC 95 C6 91 65 88 45 36 C8 11 66 2F 21 88 : AB EE 09 35 */ static uint8_t K6_CTR[] = { 0x02, 0xBF, 0x39, 0x1E, 0xE8, 0xEC, 0xB1, 0x59, 0xB9, 0x59, 0x61, 0x7B, 0x09, 0x65, 0x27, 0x9B, 0xF5, 0x9B, 0x60, 0xA7, 0x86, 0xD3, 0xE0, 0xFE, }; static uint8_t IV6_CTR[] = { 0x00, 0x07, 0xBD, 0xFD, /* nonce */ 0x5C, 0xBD, 0x60, 0x27, 0x8D, 0xCC, 0x09, 0x12, }; static uint8_t P6_CTR[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23, }; static uint8_t C6_CTR[] = { 0x96, 0x89, 0x3F, 0xC5, 0x5E, 0x5C, 0x72, 0x2F, 0x54, 0x0B, 0x7D, 0xD1, 0xDD, 0xF7, 0xE7, 0x58, 0xD2, 0x88, 0xBC, 0x95, 0xC6, 0x91, 0x65, 0x88, 0x45, 0x36, 0xC8, 0x11, 0x66, 0x2F, 0x21, 0x88, 0xAB, 0xEE, 0x09, 0x35, }; static uint8_t T6_CTR[] = { 0 }; static uint8_t A6_CTR[] = { 0 }; #define A6_CTR_len 0 /* Test Vector #7: Encrypting 16 octets using AES-CTR with 256-bit key AES Key : 77 6B EF F2 85 1D B0 6F 4C 8A 05 42 C8 69 6F 6C : 6A 81 AF 1E EC 96 B4 D3 7F C1 D6 89 E6 C1 C1 04 AES-CTR IV : DB 56 72 C9 7A A8 F0 B2 Nonce : 00 00 00 60 Plaintext String : 'Single block msg' Plaintext : 53 69 6E 67 6C 65 20 62 6C 6F 63 6B 20 6D 73 67 Counter Block (1): 00 00 00 60 DB 56 72 C9 7A A8 F0 B2 00 00 00 01 Key Stream (1): 47 33 BE 7A D3 E7 6E A5 3A 67 00 B7 51 8E 93 A7 Ciphertext : 14 5A D0 1D BF 82 4E C7 56 08 63 DC 71 E3 E0 C0 */ static uint8_t K7_CTR[] = { 0x77, 0x6B, 0xEF, 0xF2, 0x85, 0x1D, 0xB0, 0x6F, 0x4C, 0x8A, 0x05, 0x42, 0xC8, 0x69, 0x6F, 0x6C, 0x6A, 0x81, 0xAF, 0x1E, 0xEC, 0x96, 0xB4, 0xD3, 0x7F, 0xC1, 0xD6, 0x89, 0xE6, 0xC1, 0xC1, 0x04, }; static uint8_t IV7_CTR[] = { 0x00, 0x00, 0x00, 0x60, /* nonce */ 0xDB, 0x56, 0x72, 0xC9, 0x7A, 0xA8, 0xF0, 0xB2, }; static uint8_t P7_CTR[] = { 0x53, 0x69, 0x6E, 0x67, 0x6C, 0x65, 0x20, 0x62, 0x6C, 0x6F, 0x63, 0x6B, 0x20, 0x6D, 0x73, 0x67, }; static uint8_t C7_CTR[] = { 0x14, 0x5A, 0xD0, 0x1D, 0xBF, 0x82, 0x4E, 0xC7, 0x56, 0x08, 0x63, 0xDC, 0x71, 0xE3, 0xE0, 0xC0, }; static uint8_t T7_CTR[] = { 0 }; static uint8_t A7_CTR[] = { 0 }; #define A7_CTR_len 0 /* Test Vector #8: Encrypting 32 octets using AES-CTR with 256-bit key AES Key : F6 D6 6D 6B D5 2D 59 BB 07 96 36 58 79 EF F8 86 : C6 6D D5 1A 5B 6A 99 
74 4B 50 59 0C 87 A2 38 84 AES-CTR IV : C1 58 5E F1 5A 43 D8 75 Nonce : 00 FA AC 24 Plaintext : 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F : 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F Counter block (1): 00 FA AC 24 C1 58 5E F1 5A 43 D8 75 00 00 00 01 Key stream (1): F0 5F 21 18 3C 91 67 2B 41 E7 0A 00 8C 43 BC A6 Counter block (2): 00 FA AC 24 C1 58 5E F1 5A 43 D8 75 00 00 00 02 Key stream (2): A8 21 79 43 9B 96 8B 7D 4D 29 99 06 8F 59 B1 03 Ciphertext : F0 5E 23 1B 38 94 61 2C 49 EE 00 0B 80 4E B2 A9 : B8 30 6B 50 8F 83 9D 6A 55 30 83 1D 93 44 AF 1C */ static uint8_t K8_CTR[] = { 0xF6, 0xD6, 0x6D, 0x6B, 0xD5, 0x2D, 0x59, 0xBB, 0x07, 0x96, 0x36, 0x58, 0x79, 0xEF, 0xF8, 0x86, 0xC6, 0x6D, 0xD5, 0x1A, 0x5B, 0x6A, 0x99, 0x74, 0x4B, 0x50, 0x59, 0x0C, 0x87, 0xA2, 0x38, 0x84, }; static uint8_t IV8_CTR[] = { 0x00, 0xFA, 0xAC, 0x24, /* nonce */ 0xC1, 0x58, 0x5E, 0xF1, 0x5A, 0x43, 0xD8, 0x75, }; static uint8_t P8_CTR[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, }; static uint8_t C8_CTR[] = { 0xF0, 0x5E, 0x23, 0x1B, 0x38, 0x94, 0x61, 0x2C, 0x49, 0xEE, 0x00, 0x0B, 0x80, 0x4E, 0xB2, 0xA9, 0xB8, 0x30, 0x6B, 0x50, 0x8F, 0x83, 0x9D, 0x6A, 0x55, 0x30, 0x83, 0x1D, 0x93, 0x44, 0xAF, 0x1C, }; static uint8_t T8_CTR[] = { 0 }; static uint8_t A8_CTR[] = { 0 }; #define A8_CTR_len 0 /* Test Vector #9: Encrypting 36 octets using AES-CTR with 256-bit key AES Key : FF 7A 61 7C E6 91 48 E4 F1 72 6E 2F 43 58 1D E2 : AA 62 D9 F8 05 53 2E DF F1 EE D6 87 FB 54 15 3D AES-CTR IV : 51 A5 1D 70 A1 C1 11 48 Nonce : 00 1C C5 B7 Plaintext : 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F : 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F : 20 21 22 23 Counter block (1): 00 1C C5 B7 51 A5 1D 70 A1 C1 11 48 00 00 00 01 Key stream (1): EB 6D 50 81 19 0E BD F0 C6 7C 9E 4D 26 C7 41 A5 Counter block (2): 00 1C C5 B7 51 A5 1D 70 A1 C1 11 48 00 00 00 02 Key stream (2): A4 16 CD 95 71 7C EB 10 EC 95 DA AE 9F CB 19 00 Counter block (3): 00 1C C5 B7 51 A5 1D 70 A1 C1 11 48 00 00 00 03 Key stream (3): 3E E1 C4 9B C6 B9 CA 21 3F 6E E2 71 D0 A9 33 39 Ciphertext : EB 6C 52 82 1D 0B BB F7 CE 75 94 46 2A CA 4F AA : B4 07 DF 86 65 69 FD 07 F4 8C C0 B5 83 D6 07 1F : 1E C0 E6 B8 */ static uint8_t K9_CTR[] = { 0xFF, 0x7A, 0x61, 0x7C, 0xE6, 0x91, 0x48, 0xE4, 0xF1, 0x72, 0x6E, 0x2F, 0x43, 0x58, 0x1D, 0xE2, 0xAA, 0x62, 0xD9, 0xF8, 0x05, 0x53, 0x2E, 0xDF, 0xF1, 0xEE, 0xD6, 0x87, 0xFB, 0x54, 0x15, 0x3D, }; static uint8_t IV9_CTR[] = { 0x00, 0x1C, 0xC5, 0xB7, /* nonce */ 0x51, 0xA5, 0x1D, 0x70, 0xA1, 0xC1, 0x11, 0x48, }; static uint8_t P9_CTR[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23, }; static uint8_t C9_CTR[] = { 0xEB, 0x6C, 0x52, 0x82, 0x1D, 0x0B, 0xBB, 0xF7, 0xCE, 0x75, 0x94, 0x46, 0x2A, 0xCA, 0x4F, 0xAA, 0xB4, 0x07, 0xDF, 0x86, 0x65, 0x69, 0xFD, 0x07, 0xF4, 0x8C, 0xC0, 0xB5, 0x83, 0xD6, 0x07, 0x1F, 0x1E, 0xC0, 0xE6, 0xB8, }; static uint8_t T9_CTR[] = { 0 }; static uint8_t A9_CTR[] = { 0 }; #define A9_CTR_len 0 static const struct gcm_ctr_vector ctr_vectors[] = { /* * field order {K, Klen, IV, IVlen, A, Alen, P, Plen, C, T, Tlen}; * original vector does not have a valid sub hash key */ vector(1_CTR), vector(2_CTR), vector(3_CTR), vector(4_CTR), vector(5_CTR), vector(6_CTR), vector(7_CTR), vector(8_CTR), 
vector(9_CTR), }; #ifdef _WIN32 #define snprintf _snprintf #endif static void hexdump(FILE *fp, const char *msg, const void *p, size_t len) { unsigned int i, out, ofs; const unsigned char *data = p; fprintf(fp, "%s\n", msg); ofs = 0; while (ofs < len) { char line[120]; out = snprintf(line, sizeof(line), "%08x:", ofs); for (i = 0; ((ofs + i) < len) && (i < 16); i++) out += snprintf(line + out, sizeof(line) - out, " %02x", (data[ofs + i] & 0xff)); for (; i <= 16; i++) out += snprintf(line + out, sizeof(line) - out, " | "); for (i = 0; (ofs < len) && (i < 16); i++, ofs++) { unsigned char c = data[ofs]; if ((c < ' ') || (c > '~')) c = '.'; out += snprintf(line + out, sizeof(line) - out, "%c", c); } fprintf(fp, "%s\n", line); } } static int test_ctr(struct MB_MGR *mb_mgr, const void *expkey, unsigned key_len, const void *iv, unsigned iv_len, const uint8_t *in_text, const uint8_t *out_text, unsigned text_len, int dir, int order) { struct JOB_AES_HMAC *job; uint8_t padding[16]; uint8_t *target = malloc(text_len + (sizeof(padding) * 2)); int ret = -1; if (target == NULL) { fprintf(stderr, "Can't allocate buffer memory\n"); goto end; } memset(target, -1, text_len + (sizeof(padding) * 2)); memset(padding, -1, sizeof(padding)); while ((job = IMB_FLUSH_JOB(mb_mgr)) != NULL) ; job = IMB_GET_NEXT_JOB(mb_mgr); job->cipher_direction = dir; job->chain_order = order; job->dst = target + 16; job->src = in_text; job->cipher_mode = CNTR; job->aes_enc_key_expanded = expkey; job->aes_dec_key_expanded = expkey; job->aes_key_len_in_bytes = key_len; job->iv = iv; job->iv_len_in_bytes = iv_len; job->cipher_start_src_offset_in_bytes = 0; job->msg_len_to_cipher_in_bytes = text_len; job->hash_alg = NULL_HASH; job->hashed_auth_key_xor_ipad = NULL; job->hashed_auth_key_xor_opad = NULL; job->hash_start_src_offset_in_bytes = 0; job->msg_len_to_hash_in_bytes = 0; job->auth_tag_output = NULL; job->auth_tag_output_len_in_bytes = 0; job = IMB_SUBMIT_JOB(mb_mgr); if (job) { printf("%d Unexpected return from submit_job\n", __LINE__); goto end; } job = IMB_FLUSH_JOB(mb_mgr); if (!job) { printf("%d Unexpected null return from flush_job\n", __LINE__); goto end; } if (job->status != STS_COMPLETED) { printf("%d Error status:%d", __LINE__, job->status); goto end; } if (memcmp(out_text, target + 16, text_len)) { printf("mismatched\n"); hexdump(stderr, "Target", target, text_len + 32); goto end; } if (memcmp(padding, target, sizeof(padding))) { printf("overwrite head\n"); hexdump(stderr, "Target", target, text_len + 32); goto end; } if (memcmp(padding, target + sizeof(padding) + text_len, sizeof(padding))) { printf("overwrite tail\n"); hexdump(stderr, "Target", target, text_len + 32); goto end; } ret = 0; while ((job = IMB_FLUSH_JOB(mb_mgr)) != NULL) ; end: if (target != NULL) free(target); return ret; } static int test_ctr_std_vectors(struct MB_MGR *mb_mgr) { int const vectors_cnt = sizeof(ctr_vectors) / sizeof(ctr_vectors[0]); int vect; int errors = 0; DECLARE_ALIGNED(uint32_t expkey[4*15], 16); DECLARE_ALIGNED(uint32_t dust[4*15], 16); printf("AES-CTR standard test vectors:\n"); for (vect = 0; vect < vectors_cnt; vect++) { #ifdef DEBUG printf("Standard vector %d/%d Keylen:%d IVlen:%d PTLen:%d " "AADlen:%d Tlen:%d\n", vect, vectors_cnt - 1, (int) ctr_vectors[vect].Klen, (int) ctr_vectors[vect].IVlen, (int) ctr_vectors[vect].Plen, (int) ctr_vectors[vect].Alen, (int) ctr_vectors[vect].Tlen); #else printf("."); #endif switch (ctr_vectors[vect].Klen) { case BITS_128: IMB_AES_KEYEXP_128(mb_mgr, ctr_vectors[vect].K, expkey, dust); break; 
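/*
 * Illustrative note (added commentary, not part of the original test
 * source): the 12-byte IVs in the vectors above are the RFC 3686
 * nonce (4 bytes) concatenated with the per-packet IV (8 bytes); the
 * library appends a big-endian block counter starting at 1 to form
 * the 16-byte counter block. Later in this function the same vectors
 * are replayed with an explicit 16-byte IV that already carries the
 * counter. A minimal sketch of that construction (build_ctr_block is
 * a hypothetical helper name, not a library API):
 *
 *     static void build_ctr_block(uint8_t out[16],
 *                                 const uint8_t nonce_iv[12])
 *     {
 *             memcpy(out, nonce_iv, 12);      // nonce || per-packet IV
 *             out[12] = 0x00;                 // 32-bit big-endian
 *             out[13] = 0x00;                 // block counter,
 *             out[14] = 0x00;                 // initial value 1
 *             out[15] = 0x01;
 *     }
 */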
case BITS_192: IMB_AES_KEYEXP_192(mb_mgr, ctr_vectors[vect].K, expkey, dust); break; case BITS_256: IMB_AES_KEYEXP_256(mb_mgr, ctr_vectors[vect].K, expkey, dust); break; default: return -1; } if (test_ctr(mb_mgr, expkey, ctr_vectors[vect].Klen, ctr_vectors[vect].IV, (unsigned) ctr_vectors[vect].IVlen, ctr_vectors[vect].P, ctr_vectors[vect].C, (unsigned) ctr_vectors[vect].Plen, ENCRYPT, CIPHER_HASH)) { printf("error #%d encrypt\n", vect + 1); errors++; } if (test_ctr(mb_mgr, expkey, ctr_vectors[vect].Klen, ctr_vectors[vect].IV, (unsigned) ctr_vectors[vect].IVlen, ctr_vectors[vect].C, ctr_vectors[vect].P, (unsigned) ctr_vectors[vect].Plen, DECRYPT, HASH_CIPHER)) { printf("error #%d decrypt\n", vect + 1); errors++; } if (ctr_vectors[vect].IVlen == 12) { /* IV in the table didn't include block counter (12 bytes). * Let's encrypt & decrypt the same but * with 16 byte IV that includes block counter. */ const unsigned new_iv_len = 16; const unsigned orig_iv_len = 12; uint8_t local_iv[16]; memcpy(local_iv, ctr_vectors[vect].IV, orig_iv_len); *((uint32_t *)&local_iv[orig_iv_len]) = 0x01000000; if (test_ctr(mb_mgr, expkey, ctr_vectors[vect].Klen, local_iv, new_iv_len, ctr_vectors[vect].P, ctr_vectors[vect].C, (unsigned) ctr_vectors[vect].Plen, ENCRYPT, CIPHER_HASH)) { printf("error #%d encrypt\n", vect + 1); errors++; } if (test_ctr(mb_mgr, expkey, ctr_vectors[vect].Klen, local_iv, new_iv_len, ctr_vectors[vect].C, ctr_vectors[vect].P, (unsigned) ctr_vectors[vect].Plen, DECRYPT, HASH_CIPHER)) { printf("error #%d decrypt\n", vect + 1); errors++; } } } printf("\n"); return errors; } int ctr_test(const enum arch_type arch, struct MB_MGR *mb_mgr) { int errors; errors = test_ctr_std_vectors(mb_mgr); if (0 == errors) printf("...Pass\n"); else printf("...Fail\n"); return errors; } intel-ipsec-mb-0.48/LibTestApp/customop_test.c000077500000000000000000000245561321406316400213570ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2017, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include #include #include #include #include #include "customop_test.h" #define DIM(_a) (sizeof(_a) / sizeof(_a[0])) #if defined(DEBUG) #if _WIN32 #define TRACE(fmt, ...) fprintf(stderr, "%s:%d "fmt, __FUNCTION__, __LINE__, __VA_ARGS__) #else #define TRACE(fmt, ...) fprintf(stderr, "%s:%d "fmt, __func__, __LINE__, __VA_ARGS__) #endif #else # define TRACE(fmt, ...) #endif struct cipher_attr_s { const char *name; JOB_CIPHER_MODE mode; unsigned key_len; unsigned iv_len; }; struct auth_attr_s { const char *name; JOB_HASH_ALG hash; unsigned tag_len; }; struct test_vec_s { uint8_t iv[16]; uint8_t txt[64]; uint8_t tag[32]; uint8_t verify[32]; DECLARE_ALIGNED(uint8_t enc_key[16*16], 64); DECLARE_ALIGNED(uint8_t dec_key[16*16], 64); uint8_t ipad[256]; uint8_t opad[256]; const struct cipher_attr_s *cipher; const struct auth_attr_s *auth; unsigned seq; }; /* * addon cipher function */ static int cipher_addon(struct JOB_AES_HMAC *job) { struct test_vec_s *node = job->user_data; TRACE("Seq:%u Cipher Addon cipher:%s auth:%s\n", node->seq, node->cipher->name, node->auth->name); if (job->cipher_direction == ENCRYPT) memset(job->dst, 1, job->msg_len_to_cipher_in_bytes); else memset(job->dst, 2, job->msg_len_to_cipher_in_bytes); return 0; /* success */ } /* * addon hash function */ static int hash_addon(struct JOB_AES_HMAC *job) { struct test_vec_s *node = job->user_data; TRACE("Seq:%u Auth Addon cipher:%s auth:%s\n", node->seq, node->cipher->name, node->auth->name); memset(job->auth_tag_output, 3, job->auth_tag_output_len_in_bytes); return 0; /* success */ } /* * test cipher functions */ static const struct auth_attr_s auth_attr_tab[] = { { "SHA1", SHA1, 12 }, { "SHA224", SHA_224, 14 }, { "SHA256", SHA_256, 16 }, { "SHA384", SHA_384, 24 }, { "SHA512", SHA_512, 32 }, { "MD5", MD5, 12 }, { "CUSTOM_HASH", CUSTOM_HASH, 16 } }; /* * test hash functions */ static const struct cipher_attr_s cipher_attr_tab[] = { { "CBC128", CBC, 16, 16 }, { "CBC192", CBC, 24, 16 }, { "CBC256", CBC, 32, 16 }, { "CUSTOM_CIPHER", CUSTOM_CIPHER, 32, 12 }, { "CTR128", CNTR, 16, 12 }, { "CTR192", CNTR, 24, 12 }, { "CTR256", CNTR, 32, 12 } }; static int job_check(const struct JOB_AES_HMAC *job) { struct test_vec_s *done = job->user_data; TRACE("done Seq:%u Cipher:%s Auth:%s\n", done->seq, done->cipher->name, done->auth->name); if (job->status != STS_COMPLETED) { TRACE("failed job status:%d\n", job->status); return -1; } if (job->cipher_mode == CUSTOM_CIPHER) { if (job->cipher_direction == ENCRYPT) { unsigned i; for (i = 0; i < job->msg_len_to_cipher_in_bytes; i++) { if (job->dst[i] != 1) { TRACE("NG add-on encryption %u\n", i); return -1; } } TRACE("Addon encryption passes Seq:%u\n", done->seq); } else { unsigned i; for (i = 0; i < job->msg_len_to_cipher_in_bytes; i++) { if (job->dst[i] != 2) { TRACE("NG add-on decryption %u\n", i); return -1; } } TRACE("Addon decryption passes Seq:%u\n", done->seq); } } if (job->hash_alg == CUSTOM_HASH) { unsigned i; for (i = 0; i < job->auth_tag_output_len_in_bytes; i++) { if (job->auth_tag_output[i] != 3) { TRACE("NG add-on hashing %u\n", i); return -1; } } TRACE("Addon hashing passes Seq:%u\n", done->seq); } return 0; } void customop_test(struct MB_MGR *mgr) { struct test_vec_s test_tab[DIM(cipher_attr_tab) * DIM(auth_attr_tab)]; struct JOB_AES_HMAC *job; unsigned i, j, seq; int result = 0; for (i = 0, seq = 0; i < DIM(cipher_attr_tab); i++) { for (j = 0; j < DIM(auth_attr_tab); j++) { assert(seq < DIM(test_tab)); 
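/*
 * Added commentary (not in the original source): these nested loops
 * enumerate every cipher/auth pairing from cipher_attr_tab and
 * auth_attr_tab, so each test_tab[] entry receives a unique sequence
 * number and one combination to exercise. Entries using CUSTOM_CIPHER
 * or CUSTOM_HASH are the ones this test targets: such jobs are handed
 * to application-supplied callbacks instead of the library kernels.
 * A minimal sketch of that wiring, assuming hypothetical my_cipher()
 * and my_hash() callbacks with the same prototype as cipher_addon()
 * and hash_addon() defined above:
 *
 *     job->cipher_mode = CUSTOM_CIPHER;   // custom cipher path
 *     job->hash_alg    = CUSTOM_HASH;     // custom hash path
 *     job->cipher_func = my_cipher;       // called in place of AES code
 *     job->hash_func   = my_hash;         // called in place of HMAC code
 *     job->user_data   = my_context;      // returned untouched with the job
 */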
test_tab[seq].seq = seq; test_tab[seq].cipher = &cipher_attr_tab[i]; test_tab[seq].auth = &auth_attr_tab[j]; seq++; } } /* encryption */ for (i = 0; i < seq; i++) { struct test_vec_s *node = &test_tab[i]; while ((job = IMB_GET_NEXT_JOB(mgr)) == NULL) { job = IMB_FLUSH_JOB(mgr); result |= job_check(job); } job->cipher_func = cipher_addon; job->hash_func = hash_addon; job->aes_enc_key_expanded = node->enc_key; job->aes_dec_key_expanded = node->dec_key; job->aes_key_len_in_bytes = node->cipher->key_len; job->src = node->txt; job->dst = node->txt; job->cipher_start_src_offset_in_bytes = 16; job->msg_len_to_cipher_in_bytes = sizeof(node->txt); job->hash_start_src_offset_in_bytes = 0; job->msg_len_to_hash_in_bytes = sizeof(node->txt) + sizeof(node->iv); job->iv = node->iv; job->iv_len_in_bytes = node->cipher->iv_len; job->auth_tag_output = node->tag; job->auth_tag_output_len_in_bytes = node->auth->tag_len; job->u.HMAC._hashed_auth_key_xor_ipad = node->ipad; job->u.HMAC._hashed_auth_key_xor_opad = node->opad; job->cipher_mode = node->cipher->mode; job->cipher_direction = ENCRYPT; job->chain_order = CIPHER_HASH; job->hash_alg = node->auth->hash; job->user_data = node; job = IMB_SUBMIT_JOB(mgr); while (job) { result |= job_check(job); job = IMB_GET_COMPLETED_JOB(mgr); } } while ((job = IMB_FLUSH_JOB(mgr)) != NULL) result |= job_check(job); /* decryption */ for (i = 0; i < seq; i++) { struct test_vec_s *node = &test_tab[i]; while ((job = IMB_GET_NEXT_JOB(mgr)) == NULL) { job = IMB_FLUSH_JOB(mgr); result |= job_check(job); } job->cipher_func = cipher_addon; job->hash_func = hash_addon; job->aes_enc_key_expanded = node->enc_key; job->aes_dec_key_expanded = node->dec_key; job->aes_key_len_in_bytes = node->cipher->key_len; job->src = node->txt; job->dst = node->txt; job->cipher_start_src_offset_in_bytes = 16; job->msg_len_to_cipher_in_bytes = sizeof(node->txt); job->hash_start_src_offset_in_bytes = 0; job->msg_len_to_hash_in_bytes = sizeof(node->txt) + sizeof(node->iv); job->iv = node->iv; job->iv_len_in_bytes = node->cipher->iv_len; job->auth_tag_output = node->tag; job->auth_tag_output_len_in_bytes = node->auth->tag_len; job->u.HMAC._hashed_auth_key_xor_ipad = node->ipad; job->u.HMAC._hashed_auth_key_xor_opad = node->opad; job->cipher_mode = node->cipher->mode; job->cipher_direction = DECRYPT; job->chain_order = HASH_CIPHER; job->hash_alg = node->auth->hash; job->user_data = node; job = IMB_SUBMIT_JOB(mgr); while (job) { result |= job_check(job); job = IMB_GET_COMPLETED_JOB(mgr); } } while ((job = IMB_FLUSH_JOB(mgr)) != NULL) result |= job_check(job); if (result) fprintf(stdout, "Custom cipher/auth test failed!\n"); else fprintf(stdout, "Custom cipher/auth test passed\n"); } intel-ipsec-mb-0.48/LibTestApp/customop_test.h000077500000000000000000000032771321406316400213610ustar00rootroot00000000000000/* * Copyright (c) 2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * Neither the name of Intel Corporation nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef _CUSTOMOP_TEST_H_ #define _CUSTOMOP_TEST_H_ struct MB_MGR; extern void customop_test(struct MB_MGR *state); #endif /* !_CUSTOMOP_TEST_H_ */ intel-ipsec-mb-0.48/LibTestApp/des_test.c000066400000000000000000000401341321406316400202440ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2017, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include #include #include #include #include #include #include #include "gcm_ctr_vectors_test.h" #ifndef DIM #define DIM(x) (sizeof(x) / sizeof(x[0])) #endif struct des_vector { const uint8_t *K; /* key */ const uint8_t *IV; /* initialization vector */ const uint8_t *P; /* plain text */ uint64_t Plen; /* plain text length */ const uint8_t *C; /* cipher text - same length as plain text */ }; /* CM-SP-SECv3.1-I07-170111 I.7 */ static const uint8_t K1[] = { 0xe6, 0x60, 0x0f, 0xd8, 0x85, 0x2e, 0xf5, 0xab }; static const uint8_t IV1[] = { 0x81, 0x0e, 0x52, 0x8e, 0x1c, 0x5f, 0xda, 0x1a }; static const uint8_t P1[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x88, 0x41, 0x65, 0x06 }; static const uint8_t C1[] = { 0x0d, 0xda, 0x5a, 0xcb, 0xd0, 0x5e, 0x55, 0x67, 0x9f, 0x04, 0xd1, 0xb6, 0x41, 0x3d, 0x4e, 0xed }; static const uint8_t K2[] = { 0x3b, 0x38, 0x98, 0x37, 0x15, 0x20, 0xf7, 0x5e }; static const uint8_t IV2[] = { 0x02, 0xa8, 0x11, 0x77, 0x4d, 0xcd, 0xe1, 0x3b }; static const uint8_t P2[] = { 0x05, 0xef, 0xf7, 0x00, 0xe9, 0xa1, 0x3a, 0xe5, 0xca, 0x0b, 0xcb, 0xd0, 0x48, 0x47, 0x64, 0xbd, 0x1f, 0x23, 0x1e, 0xa8, 0x1c, 0x7b, 0x64, 0xc5, 0x14, 0x73, 0x5a, 0xc5, 0x5e, 0x4b, 0x79, 0x63, 0x3b, 0x70, 0x64, 0x24, 0x11, 0x9e, 0x09, 0xdc, 0xaa, 0xd4, 0xac, 0xf2, 0x1b, 0x10, 0xaf, 0x3b, 0x33, 0xcd, 0xe3, 0x50, 0x48, 0x47, 0x15, 0x5c, 0xbb, 0x6f, 0x22, 0x19, 0xba, 0x9b, 0x7d, 0xf5 }; static const uint8_t C2[] = { 0xf3, 0x31, 0x8d, 0x01, 0x19, 0x4d, 0xa8, 0x00, 0xa4, 0x2c, 0x10, 0xb5, 0x33, 0xd6, 0xbc, 0x11, 0x97, 0x59, 0x2d, 0xcc, 0x9b, 0x5d, 0x35, 0x9a, 0xc3, 0x04, 0x5d, 0x07, 0x4c, 0x86, 0xbf, 0x72, 0xe5, 0x1a, 0x72, 0x25, 0x82, 0x22, 0x54, 0x03, 0xde, 0x8b, 0x7a, 0x58, 0x5c, 0x6c, 0x28, 0xdf, 0x41, 0x0e, 0x38, 0xd6, 0x2a, 0x86, 0xe3, 0x4f, 0xa2, 0x7c, 0x22, 0x39, 0x60, 0x06, 0x03, 0x6f }; static struct des_vector vectors[] = { {K1, IV1, P1, sizeof(P1), C1}, {K2, IV2, P2, sizeof(P2), C2}, }; /* CM-SP-SECv3.1-I07-170111 I.7 */ static const uint8_t DK1[] = { 0xe6, 0x60, 0x0f, 0xd8, 0x85, 0x2e, 0xf5, 0xab }; static const uint8_t DIV1[] = { 0x81, 0x0e, 0x52, 0x8e, 0x1c, 0x5f, 0xda, 0x1a }; static const uint8_t DP1[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x88, 0x41, 0x65, 0x06 }; static const uint8_t DC1[] = { 0x0d, 0xda, 0x5a, 0xcb, 0xd0, 0x5e, 0x55, 0x67, 0x9f, 0x04, 0xd1, 0xb6, 0x41, 0x3d, 0x4e, 0xed }; static const uint8_t DK2[] = { 0xe6, 0x60, 0x0f, 0xd8, 0x85, 0x2e, 0xf5, 0xab }; static const uint8_t DIV2[] = { 0x81, 0x0e, 0x52, 0x8e, 0x1c, 0x5f, 0xda, 0x1a }; static const uint8_t DP2[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x91, 0xd2, 0xd1, 0x9f }; static const uint8_t DC2[] = { 0x0d, 0xda, 0x5a, 0xcb, 0xd0, 0x5e, 0x55, 0x67, 0x51, 0x47, 0x46, 0x86, 0x8a, 0x71, 0xe5, 0x77, 0xef, 0xac, 0x88 }; static const uint8_t DK3[] = { 0xe6, 0x60, 0x0f, 0xd8, 0x85, 0x2e, 0xf5, 0xab }; static const uint8_t DIV3[] = { 0x51, 0x47, 0x46, 0x86, 0x8a, 0x71, 0xe5, 0x77 }; static const uint8_t DP3[] = { 0xd2, 0xd1, 0x9f }; static const uint8_t DC3[] = { 0xef, 0xac, 0x88 }; static struct des_vector docsis_vectors[] = { {DK1, DIV1, DP1, sizeof(DP1), DC1}, {DK2, DIV2, DP2, sizeof(DP2), DC2}, {DK3, DIV3, DP3, sizeof(DP3), DC3}, }; static int test_des_many(struct MB_MGR *mb_mgr, const uint64_t *ks, const void *iv, const uint8_t *in_text, const uint8_t *out_text, unsigned text_len, int dir, int order, JOB_CIPHER_MODE cipher, 
const int in_place, const int num_jobs) { struct JOB_AES_HMAC *job; uint8_t padding[16]; uint8_t **targets = malloc(num_jobs * sizeof(void *)); int i, jobs_rx = 0, ret = -1; assert(targets != NULL); memset(padding, -1, sizeof(padding)); for (i = 0; i < num_jobs; i++) { targets[i] = malloc(text_len + (sizeof(padding) * 2)); memset(targets[i], -1, text_len + (sizeof(padding) * 2)); if (in_place) { /* copy input text to the allocated buffer */ memcpy(targets[i] + sizeof(padding), in_text, text_len); } } /* flush the scheduler */ while ((job = IMB_FLUSH_JOB(mb_mgr)) != NULL) ; for (i = 0; i < num_jobs; i++) { job = IMB_GET_NEXT_JOB(mb_mgr); job->cipher_direction = dir; job->chain_order = order; if (!in_place) { job->dst = targets[i] + sizeof(padding); job->src = in_text; } else { job->dst = targets[i] + sizeof(padding); job->src = targets[i] + sizeof(padding); } job->cipher_mode = cipher; job->aes_enc_key_expanded = ks; job->aes_dec_key_expanded = ks; job->aes_key_len_in_bytes = 8; job->iv = iv; job->iv_len_in_bytes = 8; job->cipher_start_src_offset_in_bytes = 0; job->msg_len_to_cipher_in_bytes = text_len; job->user_data = (void *)((uint64_t)i); job->hash_alg = NULL_HASH; job->hashed_auth_key_xor_ipad = NULL; job->hashed_auth_key_xor_opad = NULL; job->hash_start_src_offset_in_bytes = 0; job->msg_len_to_hash_in_bytes = 0; job->auth_tag_output = NULL; job->auth_tag_output_len_in_bytes = 0; job = IMB_SUBMIT_JOB(mb_mgr); if (job != NULL) { const int num = (const int)((uint64_t)job->user_data); jobs_rx++; if (job->status != STS_COMPLETED) { printf("%d error status:%d, job %d", __LINE__, job->status, num); goto end; } if (memcmp(out_text, targets[num] + sizeof(padding), text_len)) { printf("%d mismatched\n", num); goto end; } if (memcmp(padding, targets[num], sizeof(padding))) { printf("%d overwrite head\n", num); goto end; } if (memcmp(padding, targets[num] + sizeof(padding) + text_len, sizeof(padding))) { printf("%d overwrite tail\n", num); goto end; } } } while ((job = IMB_FLUSH_JOB(mb_mgr)) != NULL) { const int num = (const int)((uint64_t)job->user_data); jobs_rx++; if (job->status != STS_COMPLETED) { printf("%d Error status:%d, job %d", __LINE__, job->status, num); goto end; } if (memcmp(out_text, targets[num] + sizeof(padding), text_len)) { printf("%d mismatched\n", num); goto end; } if (memcmp(padding, targets[num], sizeof(padding))) { printf("%d overwrite head\n", num); goto end; } if (memcmp(padding, targets[num] + sizeof(padding) + text_len, sizeof(padding))) { printf("%d overwrite tail\n", num); goto end; } } if (jobs_rx != num_jobs) { printf("Expected %d jobs, received %d\n", num_jobs, jobs_rx); goto end; } ret = 0; end: for (i = 0; i < num_jobs; i++) free(targets[i]); free(targets); return ret; } static int test_des_one(struct MB_MGR *mb_mgr, const uint64_t *ks, const void *iv, const uint8_t *in_text, const uint8_t *out_text, unsigned text_len, int dir, int order, JOB_CIPHER_MODE cipher, const int in_place) { struct JOB_AES_HMAC *job; uint8_t padding[16]; uint8_t *target = malloc(text_len + (sizeof(padding) * 2)); int ret = -1; assert(target != NULL); memset(target, -1, text_len + (sizeof(padding) * 2)); memset(padding, -1, sizeof(padding)); if (in_place) { /* copy input text to the allocated buffer */ memcpy(target + sizeof(padding), in_text, text_len); } while ((job = IMB_FLUSH_JOB(mb_mgr)) != NULL) ; job = IMB_GET_NEXT_JOB(mb_mgr); job->cipher_direction = dir; job->chain_order = order; if (!in_place) { job->dst = target + sizeof(padding); job->src = in_text; } else { job->dst = 
target + sizeof(padding); job->src = target + sizeof(padding); } job->cipher_mode = cipher; job->aes_enc_key_expanded = ks; job->aes_dec_key_expanded = ks; job->aes_key_len_in_bytes = 8; job->iv = iv; job->iv_len_in_bytes = 8; job->cipher_start_src_offset_in_bytes = 0; job->msg_len_to_cipher_in_bytes = text_len; job->hash_alg = NULL_HASH; job->hashed_auth_key_xor_ipad = NULL; job->hashed_auth_key_xor_opad = NULL; job->hash_start_src_offset_in_bytes = 0; job->msg_len_to_hash_in_bytes = 0; job->auth_tag_output = NULL; job->auth_tag_output_len_in_bytes = 0; job = IMB_SUBMIT_JOB(mb_mgr); if (job) { printf("%d Unexpected return from submit_job\n", __LINE__); goto end; } job = IMB_FLUSH_JOB(mb_mgr); if (!job) { printf("%d Unexpected null return from flush_job\n", __LINE__); goto end; } if (job->status != STS_COMPLETED) { printf("%d Error status:%d", __LINE__, job->status); goto end; } if (memcmp(out_text, target + sizeof(padding), text_len)) { printf("mismatched\n"); goto end; } if (memcmp(padding, target, sizeof(padding))) { printf("overwrite head\n"); goto end; } if (memcmp(padding, target + sizeof(padding) + text_len, sizeof(padding))) { printf("overwrite tail\n"); goto end; } ret = 0; while ((job = IMB_FLUSH_JOB(mb_mgr)) != NULL) ; end: free(target); return ret; } static int test_des(struct MB_MGR *mb_mgr, const uint64_t *ks, const void *iv, const uint8_t *in_text, const uint8_t *out_text, unsigned text_len, int dir, int order, JOB_CIPHER_MODE cipher, const int in_place) { int ret = 0; ret |= test_des_one(mb_mgr, ks, iv, in_text, out_text, text_len, dir, order, cipher, in_place); ret |= test_des_many(mb_mgr, ks, iv, in_text, out_text, text_len, dir, order, cipher, in_place, 32); return ret; } static int test_des_vectors(struct MB_MGR *mb_mgr, const int vec_cnt, const struct des_vector *vec_tab, const char *banner, const JOB_CIPHER_MODE cipher) { int vect, errors = 0; uint64_t ks[16]; printf("%s:\n", banner); for (vect = 0; vect < vec_cnt; vect++) { #ifdef DEBUG printf("Standard vector %d/%d PTLen:%d\n", vect + 1, vec_cnt, (int) vec_tab[vect].Plen); #else printf("."); #endif des_key_schedule(ks, vec_tab[vect].K); if (test_des(mb_mgr, ks, vec_tab[vect].IV, vec_tab[vect].P, vec_tab[vect].C, (unsigned) vec_tab[vect].Plen, ENCRYPT, CIPHER_HASH, cipher, 0)) { printf("error #%d encrypt\n", vect + 1); errors++; } if (test_des(mb_mgr, ks, vec_tab[vect].IV, vec_tab[vect].C, vec_tab[vect].P, (unsigned) vec_tab[vect].Plen, DECRYPT, HASH_CIPHER, cipher, 0)) { printf("error #%d decrypt\n", vect + 1); errors++; } if (test_des(mb_mgr, ks, vec_tab[vect].IV, vec_tab[vect].P, vec_tab[vect].C, (unsigned) vec_tab[vect].Plen, ENCRYPT, CIPHER_HASH, cipher, 1)) { printf("error #%d encrypt in-place\n", vect + 1); errors++; } if (test_des(mb_mgr, ks, vec_tab[vect].IV, vec_tab[vect].C, vec_tab[vect].P, (unsigned) vec_tab[vect].Plen, DECRYPT, HASH_CIPHER, cipher, 1)) { printf("error #%d decrypt in-place\n", vect + 1); errors++; } } printf("\n"); return errors; } int des_test(const enum arch_type arch, struct MB_MGR *mb_mgr) { int errors; errors = test_des_vectors(mb_mgr, DIM(vectors), vectors, "DES standard test vectors", DES); errors += test_des_vectors(mb_mgr, DIM(docsis_vectors), docsis_vectors, "DOCSIS DES standard test vectors", DOCSIS_DES); if (0 == errors) printf("...Pass\n"); else printf("...Fail\n"); return errors; } intel-ipsec-mb-0.48/LibTestApp/do_test.h000066400000000000000000000310611321406316400200770ustar00rootroot00000000000000/* * Copyright (c) 2012-2017, Intel Corporation * * Redistribution and use 
in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Intel Corporation nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #undef init_mb_mgr #undef sha1_one_block #undef sha224_one_block #undef sha256_one_block #undef sha384_one_block #undef sha512_one_block #undef md5_one_block #undef aes_keyexp_128 #undef aes_keyexp_192 #undef aes_keyexp_256 #undef aes_xcbc_expand_key #undef aes_keyexp_128_enc #undef KNOWN_ANSWER_TEST #undef DO_TEST #undef TEST_AUX_FUNC #if (TEST == TEST_AVX) #define init_mb_mgr init_mb_mgr_avx #define sha1_one_block sha1_one_block_avx #define sha224_one_block sha224_one_block_avx #define sha256_one_block sha256_one_block_avx #define sha384_one_block sha384_one_block_avx #define sha512_one_block sha512_one_block_avx #define md5_one_block md5_one_block_avx #define aes_keyexp_128 aes_keyexp_128_avx #define aes_keyexp_192 aes_keyexp_192_avx #define aes_keyexp_256 aes_keyexp_256_avx #define aes_xcbc_expand_key aes_xcbc_expand_key_avx #define aes_keyexp_128_enc aes_keyexp_128_enc_avx #define KNOWN_ANSWER_TEST known_answer_test_avx #define DO_TEST do_test_avx #define TEST_AUX_FUNC test_aux_func_avx #elif (TEST == TEST_AVX2) #define init_mb_mgr init_mb_mgr_avx2 #define sha1_one_block sha1_one_block_avx2 #define sha224_one_block sha224_one_block_avx2 #define sha256_one_block sha256_one_block_avx2 #define sha384_one_block sha384_one_block_avx2 #define sha512_one_block sha512_one_block_avx2 #define md5_one_block md5_one_block_avx2 #define aes_keyexp_128 aes_keyexp_128_avx2 #define aes_keyexp_192 aes_keyexp_192_avx2 #define aes_keyexp_256 aes_keyexp_256_avx2 #define aes_xcbc_expand_key aes_xcbc_expand_key_avx2 #define aes_keyexp_128_enc aes_keyexp_128_enc_avx2 #define KNOWN_ANSWER_TEST known_answer_test_avx2 #define DO_TEST do_test_avx2 #define TEST_AUX_FUNC test_aux_func_avx2 #elif (TEST == TEST_AVX512) #define init_mb_mgr init_mb_mgr_avx512 #define sha1_one_block sha1_one_block_avx512 #define sha224_one_block sha224_one_block_avx512 #define sha256_one_block sha256_one_block_avx512 #define sha384_one_block sha384_one_block_avx512 #define sha512_one_block sha512_one_block_avx512 #define md5_one_block md5_one_block_avx512 #define aes_keyexp_128 
aes_keyexp_128_avx512 #define aes_keyexp_192 aes_keyexp_192_avx512 #define aes_keyexp_256 aes_keyexp_256_avx512 #define aes_xcbc_expand_key aes_xcbc_expand_key_avx512 #define aes_keyexp_128_enc aes_keyexp_128_enc_avx512 #define KNOWN_ANSWER_TEST known_answer_test_avx512 #define DO_TEST do_test_avx512 #define TEST_AUX_FUNC test_aux_func_avx512 #else #define init_mb_mgr init_mb_mgr_sse #define sha1_one_block sha1_one_block_sse #define sha224_one_block sha224_one_block_sse #define sha256_one_block sha256_one_block_sse #define sha384_one_block sha384_one_block_sse #define sha512_one_block sha512_one_block_sse #define md5_one_block md5_one_block_sse #define aes_keyexp_128 aes_keyexp_128_sse #define aes_keyexp_192 aes_keyexp_192_sse #define aes_keyexp_256 aes_keyexp_256_sse #define aes_xcbc_expand_key aes_xcbc_expand_key_sse #define aes_keyexp_128_enc aes_keyexp_128_enc_sse #define KNOWN_ANSWER_TEST known_answer_test_sse #define DO_TEST do_test_sse #define TEST_AUX_FUNC test_aux_func_sse #endif #ifndef DO_TEST_DATA_H_DECLARED #define DO_TEST_DATA_H_DECLARED static unsigned char key[] = { 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f, 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87, 0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f, 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97, 0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f, 0xa0 }; static unsigned char text[] = { 0x53,0x61,0x6d,0x70,0x6c,0x65,0x20,0x23,0x34 }; static unsigned char hmac12[] = { 0x9e,0xa8,0x86,0xef,0xe2,0x68,0xdb,0xec,0xce,0x42,0x0c,0x75 }; #define KEYSIZE sizeof(key) #define TEXTSIZE sizeof(text); static unsigned char plain[] = { 0x6b,0xc1,0xbe,0xe2,0x2e,0x40,0x9f,0x96, 0xe9,0x3d,0x7e,0x11,0x73,0x93,0x17,0x2a, 0xae,0x2d,0x8a,0x57,0x1e,0x03,0xac,0x9c, 0x9e,0xb7,0x6f,0xac,0x45,0xaf,0x8e,0x51, 0x30,0xc8,0x1c,0x46,0xa3,0x5c,0xe4,0x11, 0xe5,0xfb,0xc1,0x19,0x1a,0x0a,0x52,0xef, 0xf6,0x9f,0x24,0x45,0xdf,0x4f,0x9b,0x17, 0xad,0x2b,0x41,0x7b,0xe6,0x6c,0x37,0x10 }; static unsigned char key128[] = { 0x2b,0x7e,0x15,0x16,0x28,0xae,0xd2,0xa6, 0xab,0xf7,0x15,0x88,0x09,0xcf,0x4f,0x3c }; static unsigned char key256[] = { 0x60,0x3d,0xeb,0x10,0x15,0xca,0x71,0xbe, 0x2b,0x73,0xae,0xf0,0x85,0x7d,0x77,0x81, 0x1f,0x35,0x2c,0x07,0x3b,0x61,0x08,0xd7, 0x2d,0x98,0x10,0xa3,0x09,0x14,0xdf,0xf4 }; static unsigned char ic[] = { 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f }; static unsigned char cipherCBC128[] = { 0x76,0x49,0xab,0xac,0x81,0x19,0xb2,0x46, 0xce,0xe9,0x8e,0x9b,0x12,0xe9,0x19,0x7d, 0x50,0x86,0xcb,0x9b,0x50,0x72,0x19,0xee, 0x95,0xdb,0x11,0x3a,0x91,0x76,0x78,0xb2, 0x73,0xbe,0xd6,0xb8,0xe3,0xc1,0x74,0x3b, 0x71,0x16,0xe6,0x9e,0x22,0x22,0x95,0x16, 0x3f,0xf1,0xca,0xa1,0x68,0x1f,0xac,0x09, 0x12,0x0e,0xca,0x30,0x75,0x86,0xe1,0xa7 }; static unsigned char cipherCBC256[] = { 0xf5,0x8c,0x4c,0x04,0xd6,0xe5,0xf1,0xba, 0x77,0x9e,0xab,0xfb,0x5f,0x7b,0xfb,0xd6, 0x9c,0xfc,0x4e,0x96,0x7e,0xdb,0x80,0x8d, 0x67,0x9f,0x77,0x7b,0xc6,0x70,0x2c,0x7d, 0x39,0xf2,0x33,0x69,0xa9,0xd9,0xba,0xcf, 0xa5,0x30,0xe2,0x63,0x04,0x23,0x14,0x61, 0xb2,0xeb,0x05,0xe2,0xc3,0x9b,0xe9,0xfc, 0xda,0x6c,0x19,0x07,0x8c,0x6a,0x9d,0x1b }; #define NUMBLOCKS 4 #define NUMBYTES (NUMBLOCKS * 16) #endif /* DO_TEST_DATA_H_DECLARED */ void KNOWN_ANSWER_TEST(MB_MGR *mb_mgr) { uint8_t test_buf[NUMBYTES]; uint8_t buf[64]; DECLARE_ALIGNED(uint32_t enc_keys[15*4], 16); DECLARE_ALIGNED(uint32_t dec_keys[15*4], 16); DECLARE_ALIGNED(uint8_t ipad_hash[5*4], 16); DECLARE_ALIGNED(uint8_t opad_hash[5*4], 16); JOB_AES_HMAC *job; uint8_t iv[16]; uint8_t digest[12]; uint32_t i; // 
compute ipad hash for (i=0; i<64; i++) buf[i] = 0x36; for (i=0; iaes_enc_key_expanded = enc_keys; job->aes_dec_key_expanded = dec_keys; job->cipher_direction = DECRYPT; job->chain_order = HASH_CIPHER; job->dst = test_buf; job->aes_key_len_in_bytes = 16; job->auth_tag_output = digest; job->auth_tag_output_len_in_bytes = 12; memcpy(iv, ic, sizeof(iv)); job->iv = iv; job->iv_len_in_bytes = 16; job->src = cipherCBC128; job->cipher_start_src_offset_in_bytes = 0; job->msg_len_to_cipher_in_bytes = NUMBYTES; job->hash_start_src_offset_in_bytes = text - job->src; job->msg_len_to_hash_in_bytes = TEXTSIZE; job->hashed_auth_key_xor_ipad = ipad_hash; job->hashed_auth_key_xor_opad = opad_hash; job->cipher_mode = CBC; job->hash_alg = SHA1; job = IMB_SUBMIT_JOB(mb_mgr); if (job) { printf("Unexpected return from submit_job\n"); return; } job = IMB_FLUSH_JOB(mb_mgr); if (!job) { printf("Unexpected null return from flush_job\n"); return; } for (i=0; imsg_len_to_cipher_in_bytes = size; job->msg_len_to_hash_in_bytes = size + 20; job->hash_start_src_offset_in_bytes = 0; job->cipher_start_src_offset_in_bytes = 20; job->auth_tag_output = (uint8_t*) digest; job->auth_tag_output_len_in_bytes = 12; job->hashed_auth_key_xor_ipad = (uint8_t*)ipad; job->hashed_auth_key_xor_opad = (uint8_t*)opad; job->aes_enc_key_expanded = job->aes_dec_key_expanded = (uint32_t*) keys; job->src = buf; job->dst = buf + 20; job->iv = (uint8_t *) &IV; job->iv_len_in_bytes = 16; job->cipher_mode = CBC; job->hash_alg = SHA1; if (rand() & 1) job->aes_key_len_in_bytes = 16; else job->aes_key_len_in_bytes = 32; if (rand() & 1) { job->cipher_direction = ENCRYPT; job->chain_order = CIPHER_HASH; } else { job->cipher_direction = DECRYPT; job->chain_order = HASH_CIPHER; } job = IMB_SUBMIT_JOB(mb_mgr); while (job) { job = IMB_GET_COMPLETED_JOB(mb_mgr); } // end while (job) } // end for i while ((job = IMB_FLUSH_JOB(mb_mgr)) != NULL) { } TEST_AUX_FUNC(); } intel-ipsec-mb-0.48/LibTestApp/gcm_ctr_vectors_test.h000066400000000000000000000056371321406316400226720ustar00rootroot00000000000000/* * Copyright (c) 2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Intel Corporation nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef GCM_CTR_VECTORS_TEST_H_ #define GCM_CTR_VECTORS_TEST_H_ #include enum arch_type { ARCH_SSE = 0, ARCH_AVX, ARCH_AVX2, ARCH_AVX512, ARCH_NUMOF }; enum key_size { BITS_128 = 16, BITS_192 = 24, BITS_256 = 32, }; #define KBITS(K) (sizeof(K)) // struct to hold pointers to the key, plaintext and ciphertext vectors struct gcm_ctr_vector { const uint8_t* K; // AES Key enum key_size Klen; // length of key in bits const uint8_t* IV; // initial value used by GCM uint64_t IVlen; // length of IV in bytes const uint8_t* A; // additional authenticated data uint64_t Alen; // length of AAD in bytes const uint8_t* P; // Plain text uint64_t Plen; // length of our plaintext //outputs of encryption const uint8_t* C; // same length as PT const uint8_t* T; // Authenication tag uint8_t Tlen; // AT length can be 0 to 128bits }; #define vector(N) \ {K##N, (KBITS(K##N)), IV##N, sizeof(IV##N), A##N, A##N##_len, \ P##N, sizeof(P##N), C##N, T##N, sizeof(T##N)} int gcm_test(const enum arch_type arch); struct MB_MGR; int ctr_test(const enum arch_type arch, struct MB_MGR *); #endif /* GCM_CTR_VECTORS_TEST_H_ */ intel-ipsec-mb-0.48/LibTestApp/gcm_test.c000066400000000000000000001570471321406316400202530ustar00rootroot00000000000000/********************************************************************** Copyright(c) 2011-2017 Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************/ #include #include #include #include /* for memcmp() */ #include #include "gcm_ctr_vectors_test.h" #include "mb_mgr.h" /* * 60-Byte Packet Encryption Using GCM-AES-128 * http://www.ieee802.org/1/files/public/docs2011/bn-randall-test-vectors-0511-v1.pdf * K: AD7A2BD03EAC835A6F620FDCB506B345 * IV: 12153524C0895E81B2C28465 * AAD: D609B1F056637A0D46DF998D88E52E00 * B2C2846512153524C0895E81 * P: 08000F101112131415161718191A1B1C * 1D1E1F202122232425262728292A2B2C * 2D2E2F303132333435363738393A0002 * C: 701AFA1CC039C0D765128A665DAB6924 * 3899BF7318CCDC81C9931DA17FBE8EDD * 7D17CB8B4C26FC81E3284F2B7FBA713D * AT: 4F8D55E7D3F06FD5A13C0C29B9D5B880 * H: 73A23D80121DE2D5A850253FCF43120E */ static uint8_t K1[] = { 0xAD, 0x7A, 0x2B, 0xD0, 0x3E, 0xAC, 0x83, 0x5A, 0x6F, 0x62, 0x0F, 0xDC, 0xB5, 0x06, 0xB3, 0x45 }; static uint8_t P1[] = { 0x08, 0x00, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x00, 0x02 }; static uint8_t IV1[] = { 0x12, 0x15, 0x35, 0x24, 0xC0, 0x89, 0x5E, 0x81, 0xB2, 0xC2, 0x84, 0x65 }; static uint8_t A1[] = { 0xD6, 0x09, 0xB1, 0xF0, 0x56, 0x63, 0x7A, 0x0D, 0x46, 0xDF, 0x99, 0x8D, 0x88, 0xE5, 0x2E, 0x00, 0xB2, 0xC2, 0x84, 0x65, 0x12, 0x15, 0x35, 0x24, 0xC0, 0x89, 0x5E, 0x81 }; #define A1_len sizeof(A1) static uint8_t C1[] = { 0x70, 0x1A, 0xFA, 0x1C, 0xC0, 0x39, 0xC0, 0xD7, 0x65, 0x12, 0x8A, 0x66, 0x5D, 0xAB, 0x69, 0x24, 0x38, 0x99, 0xBF, 0x73, 0x18, 0xCC, 0xDC, 0x81, 0xC9, 0x93, 0x1D, 0xA1, 0x7F, 0xBE, 0x8E, 0xDD, 0x7D, 0x17, 0xCB, 0x8B, 0x4C, 0x26, 0xFC, 0x81, 0xE3, 0x28, 0x4F, 0x2B, 0x7F, 0xBA, 0x71, 0x3D }; static uint8_t T1[] = { 0x4F, 0x8D, 0x55, 0xE7, 0xD3, 0xF0, 0x6F, 0xD5, 0xA1, 0x3C, 0x0C, 0x29, 0xB9, 0xD5, 0xB8, 0x80 }; /* * 54-Byte Packet Encryption Using GCM-AES-128 * http://www.ieee802.org/1/files/public/docs2011/bn-randall-test-vectors-0511-v1.pdf * K: 071B113B0CA743FECCCF3D051F737382 * IV: F0761E8DCD3D000176D457ED * AAD: E20106D7CD0DF0761E8DCD3D88E54C2A * 76D457ED * P: 08000F101112131415161718191A1B1C * 1D1E1F202122232425262728292A2B2C * 2D2E2F30313233340004 * C: 13B4C72B389DC5018E72A171DD85A5D3 * 752274D3A019FBCAED09A425CD9B2E1C * 9B72EEE7C9DE7D52B3F3 * AT: D6A5284F4A6D3FE22A5D6C2B960494C3 * H: E4E01725D724C1215C7309AD34539257 */ static uint8_t K2[] = { 0x07, 0x1B, 0x11, 0x3B, 0x0C, 0xA7, 0x43, 0xFE, 0xCC, 0xCF, 0x3D, 0x05, 0x1F, 0x73, 0x73, 0x82 }; static uint8_t P2[] = { 0x08, 0x00, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x00, 0x04 }; static uint8_t IV2[] = { 0xF0, 0x76, 0x1E, 0x8D, 0xCD, 0x3D, 0x00, 0x01, 0x76, 0xD4, 0x57, 0xED }; /* static uint8_t IV1p[] = {0, 0, 0, 1}; */ static uint8_t A2[] = { 0xE2, 0x01, 0x06, 0xD7, 0xCD, 0x0D, 0xF0, 0x76, 0x1E, 0x8D, 0xCD, 0x3D, 0x88, 0xE5, 0x4C, 0x2A, 0x76, 0xD4, 0x57, 0xED }; #define A2_len sizeof(A2) static uint8_t C2[] = { 0x13, 0xB4, 0xC7, 0x2B, 0x38, 0x9D, 0xC5, 0x01, 0x8E, 0x72, 0xA1, 0x71, 0xDD, 0x85, 0xA5, 0xD3, 0x75, 0x22, 0x74, 0xD3, 0xA0, 0x19, 0xFB, 0xCA, 0xED, 0x09, 0xA4, 0x25, 0xCD, 0x9B, 0x2E, 0x1C, 0x9B, 0x72, 0xEE, 0xE7, 0xC9, 0xDE, 0x7D, 0x52, 0xB3, 0xF3 }; static uint8_t T2[] = { 0xD6, 0xA5, 0x28, 0x4F, 0x4A, 0x6D, 0x3F, 0xE2, 0x2A, 0x5D, 
0x6C, 0x2B, 0x96, 0x04, 0x94, 0xC3 }; /* * http://csrc.nist.gov/groups/STM/cavp/gcmtestvectors.zip gcmEncryptExtIV128.rsp * [Keylen = 128] * [IVlen = 96] * [PTlen = 128] * [AADlen = 128] * [Taglen = 128] * Count = 0 * K: c939cc13397c1d37de6ae0e1cb7c423c * IV: b3d8cc017cbb89b39e0f67e2 * P: c3b3c41f113a31b73d9a5cd432103069 * AAD: 24825602bd12a984e0092d3e448eda5f * C: 93fe7d9e9bfd10348a5606e5cafa7354 * AT: 0032a1dc85f1c9786925a2e71d8272dd */ static uint8_t K3[] = { 0xc9, 0x39, 0xcc, 0x13, 0x39, 0x7c, 0x1d, 0x37, 0xde, 0x6a, 0xe0, 0xe1, 0xcb, 0x7c, 0x42, 0x3c }; static uint8_t IV3[] = { 0xb3, 0xd8, 0xcc, 0x01, 0x7c, 0xbb, 0x89, 0xb3, 0x9e, 0x0f, 0x67, 0xe2 }; static uint8_t P3[] = { 0xc3, 0xb3, 0xc4, 0x1f, 0x11, 0x3a, 0x31, 0xb7, 0x3d, 0x9a, 0x5c, 0xd4, 0x32, 0x10, 0x30, 0x69 }; static uint8_t A3[] = { 0x24, 0x82, 0x56, 0x02, 0xbd, 0x12, 0xa9, 0x84, 0xe0, 0x09, 0x2d, 0x3e, 0x44, 0x8e, 0xda, 0x5f }; #define A3_len sizeof(A3) static uint8_t C3[] = { 0x93, 0xfe, 0x7d, 0x9e, 0x9b, 0xfd, 0x10, 0x34, 0x8a, 0x56, 0x06, 0xe5, 0xca, 0xfa, 0x73, 0x54 }; static uint8_t T3[] = { 0x00, 0x32, 0xa1, 0xdc, 0x85, 0xf1, 0xc9, 0x78, 0x69, 0x25, 0xa2, 0xe7, 0x1d, 0x82, 0x72, 0xdd }; /* * http://csrc.nist.gov/groups/STM/cavp/gcmtestvectors.zip gcmEncryptExtIV128.rsp * [Keylen = 128] * [IVlen = 96] * [PTlen = 256] * [AADlen = 128] * [Taglen = 128] * Count = 0 * K = 298efa1ccf29cf62ae6824bfc19557fc * IV = 6f58a93fe1d207fae4ed2f6d * P = cc38bccd6bc536ad919b1395f5d63801f99f8068d65ca5ac63872daf16b93901 * AAD = 021fafd238463973ffe80256e5b1c6b1 * C = dfce4e9cd291103d7fe4e63351d9e79d3dfd391e3267104658212da96521b7db * T = 542465ef599316f73a7a560509a2d9f2 */ static uint8_t K4[] = { 0x29, 0x8e, 0xfa, 0x1c, 0xcf, 0x29, 0xcf, 0x62, 0xae, 0x68, 0x24, 0xbf, 0xc1, 0x95, 0x57, 0xfc }; static uint8_t IV4[] = { 0x6f, 0x58, 0xa9, 0x3f, 0xe1, 0xd2, 0x07, 0xfa, 0xe4, 0xed, 0x2f, 0x6d }; static uint8_t P4[] = { 0xcc, 0x38, 0xbc, 0xcd, 0x6b, 0xc5, 0x36, 0xad, 0x91, 0x9b, 0x13, 0x95, 0xf5, 0xd6, 0x38, 0x01, 0xf9, 0x9f, 0x80, 0x68, 0xd6, 0x5c, 0xa5, 0xac, 0x63, 0x87, 0x2d, 0xaf, 0x16, 0xb9, 0x39, 0x01 }; static uint8_t A4[] = { 0x02, 0x1f, 0xaf, 0xd2, 0x38, 0x46, 0x39, 0x73, 0xff, 0xe8, 0x02, 0x56, 0xe5, 0xb1, 0xc6, 0xb1 }; #define A4_len sizeof(A4) static uint8_t C4[] = { 0xdf, 0xce, 0x4e, 0x9c, 0xd2, 0x91, 0x10, 0x3d, 0x7f, 0xe4, 0xe6, 0x33, 0x51, 0xd9, 0xe7, 0x9d, 0x3d, 0xfd, 0x39, 0x1e, 0x32, 0x67, 0x10, 0x46, 0x58, 0x21, 0x2d, 0xa9, 0x65, 0x21, 0xb7, 0xdb }; static uint8_t T4[] = { 0x54, 0x24, 0x65, 0xef, 0x59, 0x93, 0x16, 0xf7, 0x3a, 0x7a, 0x56, 0x05, 0x09, 0xa2, 0xd9, 0xf2 }; /* * http://csrc.nist.gov/groups/STM/cavp/gcmtestvectors.zip gcmEncryptExtIV128.rsp * [Keylen = 128] * [IVlen = 96] * [PTlen = 256] * [AADlen = 128] * [Taglen = 128] * Count = 0 * K = 298efa1ccf29cf62ae6824bfc19557fc * IV = 6f58a93fe1d207fae4ed2f6d * P = cc38bccd6bc536ad919b1395f5d63801f99f8068d65ca5ac63872daf16b93901 * AAD = 021fafd238463973ffe80256e5b1c6b1 * C = dfce4e9cd291103d7fe4e63351d9e79d3dfd391e3267104658212da96521b7db * T = 542465ef599316f73a7a560509a2d9f2 */ static uint8_t K5[] = { 0x29, 0x8e, 0xfa, 0x1c, 0xcf, 0x29, 0xcf, 0x62, 0xae, 0x68, 0x24, 0xbf, 0xc1, 0x95, 0x57, 0xfc }; static uint8_t IV5[] = { 0x6f, 0x58, 0xa9, 0x3f, 0xe1, 0xd2, 0x07, 0xfa, 0xe4, 0xed, 0x2f, 0x6d }; static uint8_t P5[] = { 0xcc, 0x38, 0xbc, 0xcd, 0x6b, 0xc5, 0x36, 0xad, 0x91, 0x9b, 0x13, 0x95, 0xf5, 0xd6, 0x38, 0x01, 0xf9, 0x9f, 0x80, 0x68, 0xd6, 0x5c, 0xa5, 0xac, 0x63, 0x87, 0x2d, 0xaf, 0x16, 0xb9, 0x39, 0x01 }; static uint8_t A5[] = { 0x02, 0x1f, 0xaf, 0xd2, 0x38, 0x46, 
0x39, 0x73, 0xff, 0xe8, 0x02, 0x56, 0xe5, 0xb1, 0xc6, 0xb1 }; #define A5_len sizeof(A5) static uint8_t C5[] = { 0xdf, 0xce, 0x4e, 0x9c, 0xd2, 0x91, 0x10, 0x3d, 0x7f, 0xe4, 0xe6, 0x33, 0x51, 0xd9, 0xe7, 0x9d, 0x3d, 0xfd, 0x39, 0x1e, 0x32, 0x67, 0x10, 0x46, 0x58, 0x21, 0x2d, 0xa9, 0x65, 0x21, 0xb7, 0xdb }; static uint8_t T5[] = { 0x54, 0x24, 0x65, 0xef, 0x59, 0x93, 0x16, 0xf7, 0x3a, 0x7a, 0x56, 0x05, 0x09, 0xa2, 0xd9, 0xf2 }; /* * http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf * Test Case 2 * K: 00000000000000000000000000000000 * P: 00000000000000000000000000000000 * IV: 000000000000000000000000 * C: 0388dace60b6a392f328c2b971b2fe78 * T: ab6e47d42cec13bdf53a67b21257bddf * H: 66e94bd4ef8a2c3b884cfa59ca342b2e */ static uint8_t K6[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; static uint8_t P6[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; static uint8_t IV6[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; static uint8_t A6[] = {0}; #define A6_len 0 static uint8_t C6[] = { 0x03, 0x88, 0xda, 0xce, 0x60, 0xb6, 0xa3, 0x92, 0xf3, 0x28, 0xc2, 0xb9, 0x71, 0xb2, 0xfe, 0x78 }; static uint8_t T6[] = { 0xab, 0x6e, 0x47, 0xd4, 0x2c, 0xec, 0x13, 0xbd, 0xf5, 0x3a, 0x67, 0xb2, 0x12, 0x57, 0xbd, 0xdf }; /* * http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf * Test Case 3 * K: feffe9928665731c6d6a8f9467308308 * P: d9313225f88406e5a55909c5aff5269a * 86a7a9531534f7da2e4c303d8a318a72 * 1c3c0c95956809532fcf0e2449a6b525 * b16aedf5aa0de657ba637b391aafd255 * IV: cafebabefacedbaddecaf888 * H: b83b533708bf535d0aa6e52980d53b78 * C: 42831ec2217774244b7221b784d0d49c * e3aa212f2c02a4e035c17e2329aca12e * 21d514b25466931c7d8f6a5aac84aa05 * 1ba30b396a0aac973d58e091473f5985 * T: 4d5c2af327cd64a62cf35abd2ba6fab4 */ static uint8_t K7[] = { 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08 }; static uint8_t P7[] = { 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a, 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72, 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25, 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55 }; static uint8_t IV7[] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88 }; static uint8_t A7[] = {0}; #define A7_len 0 static uint8_t C7[] = { 0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24, 0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4, 0x9c, 0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0, 0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac, 0xa1, 0x2e, 0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c, 0x7d, 0x8f, 0x6a, 0x5a, 0xac, 0x84, 0xaa, 0x05, 0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97, 0x3d, 0x58, 0xe0, 0x91, 0x47, 0x3f, 0x59, 0x85 }; static uint8_t T7[] = { 0x4d, 0x5c, 0x2a, 0xf3, 0x27, 0xcd, 0x64, 0xa6, 0x2c, 0xf3, 0x5a, 0xbd, 0x2b, 0xa6, 0xfa, 0xb4 }; /* * http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf * Test Case 4 * K: feffe9928665731c6d6a8f9467308308 * P: d9313225f88406e5a55909c5aff5269a * 86a7a9531534f7da2e4c303d8a318a72 * 1c3c0c95956809532fcf0e2449a6b525 * b16aedf5aa0de657ba637b39 * A: feedfacedeadbeeffeedfacedeadbeef * abaddad2 * IV: cafebabefacedbaddecaf888 * H: b83b533708bf535d0aa6e52980d53b78 * C: 
42831ec2217774244b7221b784d0d49c * e3aa212f2c02a4e035c17e2329aca12e * 21d514b25466931c7d8f6a5aac84aa05 * 1ba30b396a0aac973d58e091 * T: 5bc94fbc3221a5db94fae95ae7121a47 */ static uint8_t K8[] = { 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08 }; static uint8_t P8[] = { 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a, 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72, 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25, 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39 }; static uint8_t A8[] = { 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xab, 0xad, 0xda, 0xd2 }; #define A8_len sizeof(A8) static uint8_t IV8[] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88 }; static uint8_t C8[] = { 0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24, 0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4, 0x9c, 0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0, 0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac, 0xa1, 0x2e, 0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c, 0x7d, 0x8f, 0x6a, 0x5a, 0xac, 0x84, 0xaa, 0x05, 0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97, 0x3d, 0x58, 0xe0, 0x91, 0x47, 0x3f, 0x59, 0x85 }; static uint8_t T8[] = { 0x5b, 0xc9, 0x4f, 0xbc, 0x32, 0x21, 0xa5, 0xdb, 0x94, 0xfa, 0xe9, 0x5a, 0xe7, 0x12, 0x1a, 0x47 }; /* * http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf * Test Case 14 * K: 00000000000000000000000000000000 * 00000000000000000000000000000000 * P: 00000000000000000000000000000000 * A: * IV: 000000000000000000000000 * H: dc95c078a2408989ad48a21492842087 * C: cea7403d4d606b6e074ec5d3baf39d18 * T: d0d1c8a799996bf0265b98b5d48ab919 */ static uint8_t K9[] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; static uint8_t P9[] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; static uint8_t A9[] = {0}; #define A9_len 0 static uint8_t IV9[] = { 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }; static uint8_t C9[] = { 0xce, 0xa7, 0x40, 0x3d, 0x4d, 0x60, 0x6b, 0x6e, 0x07, 0x4e, 0xc5, 0xd3, 0xba, 0xf3, 0x9d, 0x18 }; static uint8_t T9[] = { 0xd0, 0xd1, 0xc8, 0xa7, 0x99, 0x99, 0x6b, 0xf0, 0x26, 0x5b, 0x98, 0xb5, 0xd4, 0x8a, 0xb9, 0x19 }; /* * http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf * Test Case 15 * K: feffe9928665731c6d6a8f9467308308 * feffe9928665731c6d6a8f9467308308 * P: d9313225f88406e5a55909c5aff5269a * 86a7a9531534f7da2e4c303d8a318a72 * 1c3c0c95956809532fcf0e2449a6b525 * b16aedf5aa0de657ba637b391aafd255 * A: * IV: cafebabefacedbaddecaf888 * H: acbef20579b4b8ebce889bac8732dad7 * C: 522dc1f099567d07f47f37a32a84427d * 643a8cdcbfe5c0c97598a2bd2555d1aa * 8cb08e48590dbb3da7b08b1056828838 * c5f61e6393ba7a0abcc9f662898015ad * T: b094dac5d93471bdec1a502270e3cc6c */ static uint8_t K10[] = { 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08, 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08 }; static uint8_t P10[] = { 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a, 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72, 0x1c, 
0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25, 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55 }; static uint8_t A10[] = {0}; #define A10_len 0 static uint8_t IV10[] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88 }; static uint8_t C10[] = { 0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07, 0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d, 0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9, 0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa, 0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d, 0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38, 0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a, 0xbc, 0xc9, 0xf6, 0x62, 0x89, 0x80, 0x15, 0xad }; static uint8_t T10[] = { 0xb0, 0x94, 0xda, 0xc5, 0xd9, 0x34, 0x71, 0xbd, 0xec, 0x1a, 0x50, 0x22, 0x70, 0xe3, 0xcc, 0x6c }; /* * http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf * Test Case 16 * K: feffe9928665731c6d6a8f9467308308 * feffe9928665731c6d6a8f9467308308 * P: d9313225f88406e5a55909c5aff5269a * 86a7a9531534f7da2e4c303d8a318a72 * 1c3c0c95956809532fcf0e2449a6b525 * b16aedf5aa0de657ba637b39 * A: feedfacedeadbeeffeedfacedeadbeef * abaddad2 * IV: cafebabefacedbaddecaf888 * H: acbef20579b4b8ebce889bac8732dad7 * C: 522dc1f099567d07f47f37a32a84427d * 643a8cdcbfe5c0c97598a2bd2555d1aa * 8cb08e48590dbb3da7b08b1056828838 * c5f61e6393ba7a0abcc9f662 * T: 76fc6ece0f4e1768cddf8853bb2d551b */ static uint8_t K11[] = { 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08, 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08 }; static uint8_t P11[] = { 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a, 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72, 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25, 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39 }; static uint8_t A11[] = { 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xab, 0xad, 0xda, 0xd2 }; #define A11_len sizeof(A11) static uint8_t IV11[] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88 }; static uint8_t C11[] = { 0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07, 0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d, 0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9, 0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa, 0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d, 0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38, 0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a, 0xbc, 0xc9, 0xf6, 0x62 }; static uint8_t T11[] = { 0x76, 0xfc, 0x6e, 0xce, 0x0f, 0x4e, 0x17, 0x68, 0xcd, 0xdf, 0x88, 0x53, 0xbb, 0x2d, 0x55, 0x1b }; /* * http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf * Test Case 17 -- Not supported IV length less than 12 bytes * K: feffe9928665731c6d6a8f9467308308 * feffe9928665731c6d6a8f9467308308 * P: d9313225f88406e5a55909c5aff5269a * 86a7a9531534f7da2e4c303d8a318a72 * 1c3c0c95956809532fcf0e2449a6b525 * b16aedf5aa0de657ba637b39 * A: feedfacedeadbeeffeedfacedeadbeef * abaddad2 * IV: cafebabefacedbad * H: acbef20579b4b8ebce889bac8732dad7 * C: c3762df1ca787d32ae47c13bf19844cb * af1ae14d0b976afac52ff7d79bba9de0 * feb582d33934a4f0954cc2363bc73f78 * 62ac430e64abe499f47c9b1f * T: 3a337dbf46a792c45e454913fe2ea8f2 */ /* static uint8_t K12[] = { 
*/ /* 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, */ /* 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08, */ /* 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, */ /* 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08 */ /* }; */ /* static uint8_t P12[] = { */ /* 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, */ /* 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a, */ /* 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, */ /* 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72, */ /* 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, */ /* 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25, */ /* 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, */ /* 0xba, 0x63, 0x7b, 0x39 */ /* }; */ /* static uint8_t A12[] = { */ /* 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, */ /* 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, */ /* 0xab, 0xad, 0xda, 0xd2 */ /* }; */ /* static uint8_t IV12[] = { */ /* 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad */ /* }; */ /* static uint8_t H12[] = { */ /* 0xac, 0xbe, 0xf2, 0x05, 0x79, 0xb4, 0xb8, 0xeb, */ /* 0xce, 0x88, 0x9b, 0xac, 0x87, 0x32, 0xda, 0xd7 */ /* }; */ /* static uint8_t C12[] = { */ /* 0xc3, 0x76, 0x2d, 0xf1, 0xca, 0x78, 0x7d, 0x32, */ /* 0xae, 0x47, 0xc1, 0x3b, 0xf1, 0x98, 0x44, 0xcb, */ /* 0xaf, 0x1a, 0xe1, 0x4d, 0x0b, 0x97, 0x6a, 0xfa, */ /* 0xc5, 0x2f, 0xf7, 0xd7, 0x9b, 0xba, 0x9d, 0xe0, */ /* 0xfe, 0xb5, 0x82, 0xd3, 0x39, 0x34, 0xa4, 0xf0, */ /* 0x95, 0x4c, 0xc2, 0x36, 0x3b, 0xc7, 0x3f, 0x78, */ /* 0x62, 0xac, 0x43, 0x0e, 0x64, 0xab, 0xe4, 0x99, */ /* 0xf4, 0x7c, 0x9b, 0x1f */ /* }; */ /* static uint8_t T12[] = { */ /* 0x3a, 0x33, 0x7d, 0xbf, 0x46, 0xa7, 0x92, 0xc4, */ /* 0x5e, 0x45, 0x49, 0x13, 0xfe, 0x2e, 0xa8, 0xf2 */ /* }; */ /* * http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf * Test Case 18 -- Not supported IV length greater than 12 bytes * K: feffe9928665731c6d6a8f9467308308 * feffe9928665731c6d6a8f9467308308 * P: d9313225f88406e5a55909c5aff5269a * 86a7a9531534f7da2e4c303d8a318a72 * 1c3c0c95956809532fcf0e2449a6b525 * b16aedf5aa0de657ba637b39 * A: feedfacedeadbeeffeedfacedeadbeef * abaddad2 * IV: 9313225df88406e555909c5aff5269aa * 6a7a9538534f7da1e4c303d2a318a728 * c3c0c95156809539fcf0e2429a6b5254 * 16aedbf5a0de6a57a637b39b * H: acbef20579b4b8ebce889bac8732dad7 * C: 5a8def2f0c9e53f1f75d7853659e2a20 * eeb2b22aafde6419a058ab4f6f746bf4 * 0fc0c3b780f244452da3ebf1c5d82cde * a2418997200ef82e44ae7e3f * T: a44a8266ee1c8eb0c8b5d4cf5ae9f19a */ /* * https://tools.ietf.org/html/draft-mcgrew-gcm-test-01 * case #7 */ /******************************************************** key = feffe9928665731c6d6a8f9467308308 feffe9928665731c (24 octets) spi = 0000a5f8 seq = 0000000a (4 octets) nonce = cafebabefacedbaddecaf888 plaintext = 45000028a4ad4000400678800a01038f 0a010612802306b8cb712602dd6bb03e 501016d075680001 (40 octets) aad = 0000a5f80000000a (8 octets) ctext+tag = a5b1f8066029aea40e598b8122de0242 0938b3ab33f828e687b8858b5bfbdbd0 315b27452144cc7795457b9652037f53 18027b5b4cd7a636 (56 octets) ********************************************************/ static uint8_t K13[] = { 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08, 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, }; static uint8_t IV13[] = { 0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88, }; static uint8_t A13[] = { 0x00, 0x00, 0xa5, 0xf8, 0x00, 0x00, 0x00, 0x0a, }; #define A13_len sizeof(A13) static uint8_t P13[] = { 0x45, 0x00, 0x00, 0x28, 0xa4, 0xad, 0x40, 0x00, 0x40, 0x06, 0x78, 0x80, 0x0a, 0x01, 
0x03, 0x8f, 0x0a, 0x01, 0x06, 0x12, 0x80, 0x23, 0x06, 0xb8, 0xcb, 0x71, 0x26, 0x02, 0xdd, 0x6b, 0xb0, 0x3e, 0x50, 0x10, 0x16, 0xd0, 0x75, 0x68, 0x00, 0x01, }; static uint8_t T13[] = { 0x95, 0x45, 0x7b, 0x96, 0x52, 0x03, 0x7f, 0x53, 0x18, 0x02, 0x7b, 0x5b, 0x4c, 0xd7, 0xa6, 0x36, }; static uint8_t C13[] = { 0xa5, 0xb1, 0xf8, 0x06, 0x60, 0x29, 0xae, 0xa4, 0x0e, 0x59, 0x8b, 0x81, 0x22, 0xde, 0x02, 0x42, 0x09, 0x38, 0xb3, 0xab, 0x33, 0xf8, 0x28, 0xe6, 0x87, 0xb8, 0x85, 0x8b, 0x5b, 0xfb, 0xdb, 0xd0, 0x31, 0x5b, 0x27, 0x45, 0x21, 0x44, 0xcc, 0x77, }; static const struct gcm_ctr_vector gcm_vectors[] = { /* * field order {K, Klen, IV, IVlen, A, Alen, P, Plen, C, T, Tlen}; * original vector does not have a valid sub hash key */ vector(1), vector(2), vector(3), vector(4), vector(5), vector(6), vector(7), vector(8), vector(9), vector(10), vector(11), /* vector(12), -- IV of less than 16bytes are not supported */ vector(13), }; typedef void (*gcm_enc_dec_fn_t)(const struct gcm_key_data *, struct gcm_context_data *, uint8_t *, const uint8_t *, uint64_t, const uint8_t *, const uint8_t *, uint64_t, uint8_t *, uint64_t); typedef void (*gcm_pre_fn_t)(const void *, struct gcm_key_data *); static gcm_pre_fn_t aesni_gcm128_pre = NULL; static gcm_enc_dec_fn_t aesni_gcm128_enc = NULL; static gcm_enc_dec_fn_t aesni_gcm128_dec = NULL; static gcm_enc_dec_fn_t aesni_gcm128_enc_2 = NULL; static gcm_enc_dec_fn_t aesni_gcm128_dec_2 = NULL; static gcm_pre_fn_t aesni_gcm192_pre = NULL; static gcm_enc_dec_fn_t aesni_gcm192_enc = NULL; static gcm_enc_dec_fn_t aesni_gcm192_dec = NULL; static gcm_enc_dec_fn_t aesni_gcm192_enc_2 = NULL; static gcm_enc_dec_fn_t aesni_gcm192_dec_2 = NULL; static gcm_pre_fn_t aesni_gcm256_pre = NULL; static gcm_enc_dec_fn_t aesni_gcm256_enc = NULL; static gcm_enc_dec_fn_t aesni_gcm256_dec = NULL; static gcm_enc_dec_fn_t aesni_gcm256_enc_2 = NULL; static gcm_enc_dec_fn_t aesni_gcm256_dec_2 = NULL; static MB_MGR gcm_mgr; static int check_data(const uint8_t *test, const uint8_t *expected, uint64_t len, const char *data_name) { int mismatch; int is_error = 0; mismatch = memcmp(test, expected, len); if (mismatch) { uint64_t a; is_error = 1; printf(" expected results don't match %s \t\t", data_name); for (a = 0; a < len; a++) { if (test[a] != expected[a]) { printf(" '%x' != '%x' at %llx of %llx\n", test[a], expected[a], (unsigned long long) a, (unsigned long long) len); break; } } } return is_error; } /***************************************************************************** * RAW SGL API *****************************************************************************/ static void sgl_aes_gcm_enc_128_sse(const struct gcm_key_data *key, struct gcm_context_data *ctx, uint8_t *out, const uint8_t *in, uint64_t len, const uint8_t *iv, const uint8_t *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len) { aes_gcm_init_128_sse(key, ctx, iv, aad, aad_len); aes_gcm_enc_128_update_sse(key, ctx, out, in, len); aes_gcm_enc_128_finalize_sse(key, ctx, auth_tag, auth_tag_len); } static void sgl_aes_gcm_dec_128_sse(const struct gcm_key_data *key, struct gcm_context_data *ctx, uint8_t *out, const uint8_t *in, uint64_t len, const uint8_t *iv, const uint8_t *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len) { aes_gcm_init_128_sse(key, ctx, iv, aad, aad_len); aes_gcm_dec_128_update_sse(key, ctx, out, in, len); aes_gcm_dec_128_finalize_sse(key, ctx, auth_tag, auth_tag_len); } static void sgl_aes_gcm_enc_192_sse(const struct gcm_key_data *key, struct gcm_context_data *ctx, uint8_t *out, const 
uint8_t *in, uint64_t len, const uint8_t *iv, const uint8_t *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len) { aes_gcm_init_192_sse(key, ctx, iv, aad, aad_len); aes_gcm_enc_192_update_sse(key, ctx, out, in, len); aes_gcm_enc_192_finalize_sse(key, ctx, auth_tag, auth_tag_len); } static void sgl_aes_gcm_dec_192_sse(const struct gcm_key_data *key, struct gcm_context_data *ctx, uint8_t *out, const uint8_t *in, uint64_t len, const uint8_t *iv, const uint8_t *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len) { aes_gcm_init_192_sse(key, ctx, iv, aad, aad_len); aes_gcm_dec_192_update_sse(key, ctx, out, in, len); aes_gcm_dec_192_finalize_sse(key, ctx, auth_tag, auth_tag_len); } static void sgl_aes_gcm_enc_256_sse(const struct gcm_key_data *key, struct gcm_context_data *ctx, uint8_t *out, const uint8_t *in, uint64_t len, const uint8_t *iv, const uint8_t *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len) { aes_gcm_init_256_sse(key, ctx, iv, aad, aad_len); aes_gcm_enc_256_update_sse(key, ctx, out, in, len); aes_gcm_enc_256_finalize_sse(key, ctx, auth_tag, auth_tag_len); } static void sgl_aes_gcm_dec_256_sse(const struct gcm_key_data *key, struct gcm_context_data *ctx, uint8_t *out, const uint8_t *in, uint64_t len, const uint8_t *iv, const uint8_t *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len) { aes_gcm_init_256_sse(key, ctx, iv, aad, aad_len); aes_gcm_dec_256_update_sse(key, ctx, out, in, len); aes_gcm_dec_256_finalize_sse(key, ctx, auth_tag, auth_tag_len); } static void sgl_aes_gcm_enc_128_avx_gen2(const struct gcm_key_data *key, struct gcm_context_data *ctx, uint8_t *out, const uint8_t *in, uint64_t len, const uint8_t *iv, const uint8_t *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len) { aes_gcm_init_128_avx_gen2(key, ctx, iv, aad, aad_len); aes_gcm_enc_128_update_avx_gen2(key, ctx, out, in, len); aes_gcm_enc_128_finalize_avx_gen2(key, ctx, auth_tag, auth_tag_len); } static void sgl_aes_gcm_dec_128_avx_gen2(const struct gcm_key_data *key, struct gcm_context_data *ctx, uint8_t *out, const uint8_t *in, uint64_t len, const uint8_t *iv, const uint8_t *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len) { aes_gcm_init_128_avx_gen2(key, ctx, iv, aad, aad_len); aes_gcm_dec_128_update_avx_gen2(key, ctx, out, in, len); aes_gcm_dec_128_finalize_avx_gen2(key, ctx, auth_tag, auth_tag_len); } static void sgl_aes_gcm_enc_192_avx_gen2(const struct gcm_key_data *key, struct gcm_context_data *ctx, uint8_t *out, const uint8_t *in, uint64_t len, const uint8_t *iv, const uint8_t *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len) { aes_gcm_init_192_avx_gen2(key, ctx, iv, aad, aad_len); aes_gcm_enc_192_update_avx_gen2(key, ctx, out, in, len); aes_gcm_enc_192_finalize_avx_gen2(key, ctx, auth_tag, auth_tag_len); } static void sgl_aes_gcm_dec_192_avx_gen2(const struct gcm_key_data *key, struct gcm_context_data *ctx, uint8_t *out, const uint8_t *in, uint64_t len, const uint8_t *iv, const uint8_t *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len) { aes_gcm_init_192_avx_gen2(key, ctx, iv, aad, aad_len); aes_gcm_dec_192_update_avx_gen2(key, ctx, out, in, len); aes_gcm_dec_192_finalize_avx_gen2(key, ctx, auth_tag, auth_tag_len); } static void sgl_aes_gcm_enc_256_avx_gen2(const struct gcm_key_data *key, struct gcm_context_data *ctx, uint8_t *out, const uint8_t *in, uint64_t len, const uint8_t *iv, const uint8_t *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len) { 
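/* Note (summary added for clarity, not part of the original source): each sgl_* wrapper in this file drives the scatter-gather friendly GCM API in single-shot form: init consumes the IV and AAD, a single update call processes the whole buffer, and finalize emits the authentication tag. With genuinely scattered data the update step would typically be repeated once per buffer segment before finalize. */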
aes_gcm_init_256_avx_gen2(key, ctx, iv, aad, aad_len); aes_gcm_enc_256_update_avx_gen2(key, ctx, out, in, len); aes_gcm_enc_256_finalize_avx_gen2(key, ctx, auth_tag, auth_tag_len); } static void sgl_aes_gcm_dec_256_avx_gen2(const struct gcm_key_data *key, struct gcm_context_data *ctx, uint8_t *out, const uint8_t *in, uint64_t len, const uint8_t *iv, const uint8_t *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len) { aes_gcm_init_256_avx_gen2(key, ctx, iv, aad, aad_len); aes_gcm_dec_256_update_avx_gen2(key, ctx, out, in, len); aes_gcm_dec_256_finalize_avx_gen2(key, ctx, auth_tag, auth_tag_len); } static void sgl_aes_gcm_enc_128_avx_gen4(const struct gcm_key_data *key, struct gcm_context_data *ctx, uint8_t *out, const uint8_t *in, uint64_t len, const uint8_t *iv, const uint8_t *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len) { aes_gcm_init_128_avx_gen4(key, ctx, iv, aad, aad_len); aes_gcm_enc_128_update_avx_gen4(key, ctx, out, in, len); aes_gcm_enc_128_finalize_avx_gen4(key, ctx, auth_tag, auth_tag_len); } static void sgl_aes_gcm_dec_128_avx_gen4(const struct gcm_key_data *key, struct gcm_context_data *ctx, uint8_t *out, const uint8_t *in, uint64_t len, const uint8_t *iv, const uint8_t *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len) { aes_gcm_init_128_avx_gen4(key, ctx, iv, aad, aad_len); aes_gcm_dec_128_update_avx_gen4(key, ctx, out, in, len); aes_gcm_dec_128_finalize_avx_gen4(key, ctx, auth_tag, auth_tag_len); } static void sgl_aes_gcm_enc_192_avx_gen4(const struct gcm_key_data *key, struct gcm_context_data *ctx, uint8_t *out, const uint8_t *in, uint64_t len, const uint8_t *iv, const uint8_t *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len) { aes_gcm_init_192_avx_gen4(key, ctx, iv, aad, aad_len); aes_gcm_enc_192_update_avx_gen4(key, ctx, out, in, len); aes_gcm_enc_192_finalize_avx_gen4(key, ctx, auth_tag, auth_tag_len); } static void sgl_aes_gcm_dec_192_avx_gen4(const struct gcm_key_data *key, struct gcm_context_data *ctx, uint8_t *out, const uint8_t *in, uint64_t len, const uint8_t *iv, const uint8_t *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len) { aes_gcm_init_192_avx_gen4(key, ctx, iv, aad, aad_len); aes_gcm_dec_192_update_avx_gen4(key, ctx, out, in, len); aes_gcm_dec_192_finalize_avx_gen4(key, ctx, auth_tag, auth_tag_len); } static void sgl_aes_gcm_enc_256_avx_gen4(const struct gcm_key_data *key, struct gcm_context_data *ctx, uint8_t *out, const uint8_t *in, uint64_t len, const uint8_t *iv, const uint8_t *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len) { aes_gcm_init_256_avx_gen4(key, ctx, iv, aad, aad_len); aes_gcm_enc_256_update_avx_gen4(key, ctx, out, in, len); aes_gcm_enc_256_finalize_avx_gen4(key, ctx, auth_tag, auth_tag_len); } static void sgl_aes_gcm_dec_256_avx_gen4(const struct gcm_key_data *key, struct gcm_context_data *ctx, uint8_t *out, const uint8_t *in, uint64_t len, const uint8_t *iv, const uint8_t *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len) { aes_gcm_init_256_avx_gen4(key, ctx, iv, aad, aad_len); aes_gcm_dec_256_update_avx_gen4(key, ctx, out, in, len); aes_gcm_dec_256_finalize_avx_gen4(key, ctx, auth_tag, auth_tag_len); } /***************************************************************************** * job API *****************************************************************************/ static void aes_gcm_job(MB_MGR *mb_mgr, JOB_CHAIN_ORDER order, const struct gcm_key_data *key, uint64_t key_len, uint8_t *out, const uint8_t *in, uint64_t len, const 
uint8_t *iv, const uint8_t *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len) { JOB_AES_HMAC *job; job = IMB_GET_NEXT_JOB(mb_mgr); if (!job) { fprintf(stderr, "failed to get job\n"); return; } job->cipher_mode = GCM; job->hash_alg = AES_GMAC; job->chain_order = order; job->aes_enc_key_expanded = key; job->aes_dec_key_expanded = key; job->aes_key_len_in_bytes = key_len; job->src = in; job->dst = out; job->msg_len_to_cipher_in_bytes = len; job->cipher_start_src_offset_in_bytes = UINT64_C(0); job->iv = iv; job->iv_len_in_bytes = 12; job->u.GCM.aad = aad; job->u.GCM.aad_len_in_bytes = aad_len; job->auth_tag_output = auth_tag; job->auth_tag_output_len_in_bytes = auth_tag_len; job->cipher_direction = (order == CIPHER_HASH) ? ENCRYPT : DECRYPT; job = IMB_SUBMIT_JOB(mb_mgr); while (job) { if (job->status != STS_COMPLETED) fprintf(stderr, "failed job, status:%d\n", job->status); job = IMB_GET_COMPLETED_JOB(mb_mgr); } while ((job = IMB_FLUSH_JOB(mb_mgr)) != NULL) { if (job->status != STS_COMPLETED) fprintf(stderr, "failed job, status:%d\n", job->status); } } typedef void (*gcm_enc_dec_fn_t)(const struct gcm_key_data *, struct gcm_context_data *, uint8_t *, const uint8_t *, uint64_t, const uint8_t *, const uint8_t *, uint64_t, uint8_t *, uint64_t); static void job_aes_gcm_enc_128(const struct gcm_key_data *key, struct gcm_context_data *ctx, uint8_t *out, const uint8_t *in, uint64_t len, const uint8_t *iv, const uint8_t *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len) { aes_gcm_job(&gcm_mgr, CIPHER_HASH, key, AES_128_BYTES, out, in, len, iv, aad, aad_len, auth_tag, auth_tag_len); } static void job_aes_gcm_dec_128(const struct gcm_key_data *key, struct gcm_context_data *ctx, uint8_t *out, const uint8_t *in, uint64_t len, const uint8_t *iv, const uint8_t *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len) { aes_gcm_job(&gcm_mgr, HASH_CIPHER, key, AES_128_BYTES, out, in, len, iv, aad, aad_len, auth_tag, auth_tag_len); } static void job_aes_gcm_enc_192(const struct gcm_key_data *key, struct gcm_context_data *ctx, uint8_t *out, const uint8_t *in, uint64_t len, const uint8_t *iv, const uint8_t *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len) { aes_gcm_job(&gcm_mgr, CIPHER_HASH, key, AES_192_BYTES, out, in, len, iv, aad, aad_len, auth_tag, auth_tag_len); } static void job_aes_gcm_dec_192(const struct gcm_key_data *key, struct gcm_context_data *ctx, uint8_t *out, const uint8_t *in, uint64_t len, const uint8_t *iv, const uint8_t *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len) { aes_gcm_job(&gcm_mgr, HASH_CIPHER, key, AES_192_BYTES, out, in, len, iv, aad, aad_len, auth_tag, auth_tag_len); } static void job_aes_gcm_enc_256(const struct gcm_key_data *key, struct gcm_context_data *ctx, uint8_t *out, const uint8_t *in, uint64_t len, const uint8_t *iv, const uint8_t *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len) { aes_gcm_job(&gcm_mgr, CIPHER_HASH, key, AES_256_BYTES, out, in, len, iv, aad, aad_len, auth_tag, auth_tag_len); } static void job_aes_gcm_dec_256(const struct gcm_key_data *key, struct gcm_context_data *ctx, uint8_t *out, const uint8_t *in, uint64_t len, const uint8_t *iv, const uint8_t *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len) { aes_gcm_job(&gcm_mgr, HASH_CIPHER, key, AES_256_BYTES, out, in, len, iv, aad, aad_len, auth_tag, auth_tag_len); } /*****************************************************************************/ static int test_gcm_vectors(struct gcm_ctr_vector const *vector, 
gcm_pre_fn_t prefn, gcm_enc_dec_fn_t encfn, gcm_enc_dec_fn_t decfn) { struct gcm_key_data gdata_key; struct gcm_context_data gdata_ctx; int is_error = 0; /* Temporary array for the calculated vectors */ uint8_t *ct_test = NULL; uint8_t *pt_test = NULL; uint8_t *T_test = NULL; uint8_t *T2_test = NULL; #ifdef DEBUG printf("Testing GCM std vectors\n"); #endif /* Allocate space for the calculated ciphertext */ ct_test = malloc(vector->Plen); if (ct_test == NULL) { fprintf(stderr, "Can't allocate ciphertext memory\n"); is_error = 1; goto test_gcm_vectors_exit; } /* Allocate space for the calculated plaintext */ pt_test = malloc(vector->Plen); if (pt_test == NULL) { fprintf(stderr, "Can't allocate plaintext memory\n"); is_error = 1; goto test_gcm_vectors_exit; } T_test = malloc(vector->Tlen); if (T_test == NULL) { fprintf(stderr, "Can't allocate tag memory\n"); is_error = 1; goto test_gcm_vectors_exit; } T2_test = malloc(vector->Tlen); if (T2_test == NULL) { fprintf(stderr, "Can't allocate tag(2) memory\n"); is_error = 1; goto test_gcm_vectors_exit; } /* This is only required once for a given key */ prefn(vector->K, &gdata_key); /* * Encrypt */ encfn(&gdata_key, &gdata_ctx, ct_test, vector->P, vector->Plen, vector->IV, vector->A, vector->Alen, T_test, vector->Tlen); is_error |= check_data(ct_test, vector->C, vector->Plen, "encrypted cipher text (C)"); is_error |= check_data(T_test, vector->T, vector->Tlen, "tag (T)"); /* test of in-place encrypt */ memcpy(pt_test, vector->P, vector->Plen); encfn(&gdata_key, &gdata_ctx, pt_test, pt_test, vector->Plen, vector->IV, vector->A, vector->Alen, T_test, vector->Tlen); is_error |= check_data(pt_test, vector->C, vector->Plen, "encrypted cipher text (in-place)"); memset(ct_test, 0, vector->Plen); memset(T_test, 0, vector->Tlen); /* * Decrypt */ decfn(&gdata_key, &gdata_ctx, pt_test, vector->C, vector->Plen, vector->IV, vector->A, vector->Alen, T_test, vector->Tlen); is_error |= check_data(pt_test, vector->P, vector->Plen, "decrypted plain text (P)"); /* * GCM decryption outputs a 16 byte tag value * that must be verified against the expected tag value */ is_error |= check_data(T_test, vector->T, vector->Tlen, "decrypted tag (T)"); /* test of in-place decrypt */ memcpy(ct_test, vector->C, vector->Plen); decfn(&gdata_key, &gdata_ctx, ct_test, ct_test, vector->Plen, vector->IV, vector->A, vector->Alen, T_test, vector->Tlen); is_error |= check_data(ct_test, vector->P, vector->Plen, "plain text (P) - in-place"); is_error |= check_data(T_test, vector->T, vector->Tlen, "decrypted tag (T) - in-place"); /* enc -> dec */ encfn(&gdata_key, &gdata_ctx, ct_test, vector->P, vector->Plen, vector->IV, vector->A, vector->Alen, T_test, vector->Tlen); memset(pt_test, 0, vector->Plen); decfn(&gdata_key, &gdata_ctx, pt_test, ct_test, vector->Plen, vector->IV, vector->A, vector->Alen, T2_test, vector->Tlen); is_error |= check_data(pt_test, vector->P, vector->Plen, "self decrypted plain text (P)"); is_error |= check_data(T_test, T2_test, vector->Tlen, "self decrypted tag (T)"); memset(pt_test, 0, vector->Plen); test_gcm_vectors_exit: if (NULL != ct_test) free(ct_test); if (NULL != pt_test) free(pt_test); if (NULL != T_test) free(T_test); if (NULL != T2_test) free(T2_test); return is_error; } static int test_gcm_std_vectors(void) { int const vectors_cnt = sizeof(gcm_vectors) / sizeof(gcm_vectors[0]); int vect; int is_error = 0; printf("AES-GCM standard test vectors:\n"); for (vect = 0; vect < vectors_cnt; vect++) { #ifdef DEBUG printf("Standard vector %d/%d Keylen:%d IVlen:%d
PTLen:%d " "AADlen:%d Tlen:%d\n", vect, vectors_cnt - 1, (int) gcm_vectors[vect].Klen, (int) gcm_vectors[vect].IVlen, (int) gcm_vectors[vect].Plen, (int) gcm_vectors[vect].Alen, (int) gcm_vectors[vect].Tlen); #else printf("."); #endif switch (gcm_vectors[vect].Klen) { case BITS_128: is_error |= test_gcm_vectors(&gcm_vectors[vect], aesni_gcm128_pre, aesni_gcm128_enc, aesni_gcm128_dec); is_error |= test_gcm_vectors(&gcm_vectors[vect], aesni_gcm128_pre, aesni_gcm128_enc_2, aesni_gcm128_dec_2); is_error |= test_gcm_vectors(&gcm_vectors[vect], aesni_gcm128_pre, job_aes_gcm_enc_128, job_aes_gcm_dec_128); break; case BITS_192: is_error |= test_gcm_vectors(&gcm_vectors[vect], aesni_gcm192_pre, aesni_gcm192_enc, aesni_gcm192_dec); is_error |= test_gcm_vectors(&gcm_vectors[vect], aesni_gcm192_pre, aesni_gcm192_enc_2, aesni_gcm192_dec_2); is_error |= test_gcm_vectors(&gcm_vectors[vect], aesni_gcm192_pre, job_aes_gcm_enc_192, job_aes_gcm_dec_192); break; case BITS_256: is_error |= test_gcm_vectors(&gcm_vectors[vect], aesni_gcm256_pre, aesni_gcm256_enc, aesni_gcm256_dec); is_error |= test_gcm_vectors(&gcm_vectors[vect], aesni_gcm256_pre, aesni_gcm256_enc_2, aesni_gcm256_dec_2); is_error |= test_gcm_vectors(&gcm_vectors[vect], aesni_gcm256_pre, job_aes_gcm_enc_256, job_aes_gcm_dec_256); break; default: is_error = -1; break; } if (0 != is_error) return is_error; } printf("\n"); return is_error; } int gcm_test(const enum arch_type arch) { int errors = 0; switch (arch) { case ARCH_SSE: aesni_gcm128_pre = aes_gcm_pre_128_sse; aesni_gcm128_enc = aes_gcm_enc_128_sse; aesni_gcm128_dec = aes_gcm_dec_128_sse; aesni_gcm128_enc_2 = sgl_aes_gcm_enc_128_sse; aesni_gcm128_dec_2 = sgl_aes_gcm_dec_128_sse; aesni_gcm192_pre = aes_gcm_pre_192_sse; aesni_gcm192_enc = aes_gcm_enc_192_sse; aesni_gcm192_dec = aes_gcm_dec_192_sse; aesni_gcm192_enc_2 = sgl_aes_gcm_enc_192_sse; aesni_gcm192_dec_2 = sgl_aes_gcm_dec_192_sse; aesni_gcm256_pre = aes_gcm_pre_256_sse; aesni_gcm256_enc = aes_gcm_enc_256_sse; aesni_gcm256_dec = aes_gcm_dec_256_sse; aesni_gcm256_enc_2 = sgl_aes_gcm_enc_256_sse; aesni_gcm256_dec_2 = sgl_aes_gcm_dec_256_sse; init_mb_mgr_sse(&gcm_mgr); break; case ARCH_AVX: aesni_gcm128_pre = aes_gcm_pre_128_avx_gen2; aesni_gcm128_enc = aes_gcm_enc_128_avx_gen2; aesni_gcm128_dec = aes_gcm_dec_128_avx_gen2; aesni_gcm128_enc_2 = sgl_aes_gcm_enc_128_avx_gen2; aesni_gcm128_dec_2 = sgl_aes_gcm_dec_128_avx_gen2; aesni_gcm192_pre = aes_gcm_pre_192_avx_gen2; aesni_gcm192_enc = aes_gcm_enc_192_avx_gen2; aesni_gcm192_dec = aes_gcm_dec_192_avx_gen2; aesni_gcm192_enc_2 = sgl_aes_gcm_enc_192_avx_gen2; aesni_gcm192_dec_2 = sgl_aes_gcm_dec_192_avx_gen2; aesni_gcm256_pre = aes_gcm_pre_256_avx_gen2; aesni_gcm256_enc = aes_gcm_enc_256_avx_gen2; aesni_gcm256_dec = aes_gcm_dec_256_avx_gen2; aesni_gcm256_enc_2 = sgl_aes_gcm_enc_256_avx_gen2; aesni_gcm256_dec_2 = sgl_aes_gcm_dec_256_avx_gen2; init_mb_mgr_avx(&gcm_mgr); break; case ARCH_AVX2: aesni_gcm128_pre = aes_gcm_pre_128_avx_gen4; aesni_gcm128_enc = aes_gcm_enc_128_avx_gen4; aesni_gcm128_dec = aes_gcm_dec_128_avx_gen4; aesni_gcm128_enc_2 = sgl_aes_gcm_enc_128_avx_gen4; aesni_gcm128_dec_2 = sgl_aes_gcm_dec_128_avx_gen4; aesni_gcm192_pre = aes_gcm_pre_192_avx_gen4; aesni_gcm192_enc = aes_gcm_enc_192_avx_gen4; aesni_gcm192_dec = aes_gcm_dec_192_avx_gen4; aesni_gcm192_enc_2 = sgl_aes_gcm_enc_192_avx_gen4; aesni_gcm192_dec_2 = sgl_aes_gcm_dec_192_avx_gen4; aesni_gcm256_pre = aes_gcm_pre_256_avx_gen4; aesni_gcm256_enc = aes_gcm_enc_256_avx_gen4; aesni_gcm256_dec = aes_gcm_dec_256_avx_gen4; 
aesni_gcm256_enc_2 = sgl_aes_gcm_enc_256_avx_gen4; aesni_gcm256_dec_2 = sgl_aes_gcm_dec_256_avx_gen4; init_mb_mgr_avx2(&gcm_mgr); break; case ARCH_AVX512: aesni_gcm128_pre = aes_gcm_pre_128_avx_gen4; aesni_gcm128_enc = aes_gcm_enc_128_avx_gen4; aesni_gcm128_dec = aes_gcm_dec_128_avx_gen4; aesni_gcm128_enc_2 = sgl_aes_gcm_enc_128_avx_gen4; aesni_gcm128_dec_2 = sgl_aes_gcm_dec_128_avx_gen4; aesni_gcm192_pre = aes_gcm_pre_192_avx_gen4; aesni_gcm192_enc = aes_gcm_enc_192_avx_gen4; aesni_gcm192_dec = aes_gcm_dec_192_avx_gen4; aesni_gcm192_enc_2 = sgl_aes_gcm_enc_192_avx_gen4; aesni_gcm192_dec_2 = sgl_aes_gcm_dec_192_avx_gen4; aesni_gcm256_pre = aes_gcm_pre_256_avx_gen4; aesni_gcm256_enc = aes_gcm_enc_256_avx_gen4; aesni_gcm256_dec = aes_gcm_dec_256_avx_gen4; aesni_gcm256_enc_2 = sgl_aes_gcm_enc_256_avx_gen4; aesni_gcm256_dec_2 = sgl_aes_gcm_dec_256_avx_gen4; init_mb_mgr_avx512(&gcm_mgr); break; default: printf("Invalid architecture type %d selected!\n", arch); return 1; } errors = test_gcm_std_vectors(); if (0 == errors) printf("...Pass\n"); else printf("...Fail\n"); return errors; } intel-ipsec-mb-0.48/LibTestApp/gcm_vectors.h000066400000000000000000000034711321406316400207550ustar00rootroot00000000000000/********************************************************************** Copyright(c) 2011-2017 Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************/ #ifndef AES_GCM_VECTORS_H_ #define AES_GCM_VECTORS_H_ #include #include "gcm_std_vectors_test.h" #endif /* AES_GCM_VECTORS_H_ */ intel-ipsec-mb-0.48/LibTestApp/main.c000066400000000000000000000120621321406316400173550ustar00rootroot00000000000000/***************************************************************************** Copyright (c) 2012-2017, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include #include #include #include #include #include "gcm_ctr_vectors_test.h" #include "customop_test.h" extern int des_test(const enum arch_type arch, struct MB_MGR *mb_mgr); extern int ccm_test(const enum arch_type arch, struct MB_MGR *mb_mgr); #define TEST_SSE 1 #define TEST_AVX 2 #define TEST_AVX2 3 #define TEST TEST_SSE #include "do_test.h" #undef TEST #define TEST TEST_AVX #include "do_test.h" #undef TEST #define TEST TEST_AVX2 #include "do_test.h" #undef TEST #define TEST TEST_AVX512 #include "do_test.h" static void usage(const char *name) { fprintf(stderr, "Usage: %s [args], where args are zero or more\n" "--no-avx512: Don't do AVX512\n" "--no-avx2: Don't do AVX2\n" "--no-avx: Don't do AVX\n" "--no-sse: Don't do SSE\n" "--shani-on: use SHA extensions, default: auto-detect\n" "--shani-off: don't use SHA extensions\n", name); } int main(int argc, char **argv) { int i, do_sse = 1, do_avx = 1, do_avx2 = 1, do_avx512 = 1; MB_MGR mb_mgr; for (i = 1; i < argc; i++) { if (strcmp(argv[i], "-h") == 0) { usage(argv[0]); return EXIT_SUCCESS; } else if (strcmp(argv[i], "--no-avx512") == 0) { do_avx512 = 0; } else if (strcmp(argv[i], "--no-avx2") == 0) { do_avx2 = 0; } else if (strcmp(argv[i], "--no-avx") == 0) { do_avx = 0; } else if (strcmp(argv[i], "--no-sse") == 0) { do_sse = 0; } else if (strcmp(argv[i], "--shani-on") == 0) { sse_sha_ext_usage = SHA_EXT_PRESENT; } else if (strcmp(argv[i], "--shani-off") == 0) { sse_sha_ext_usage = SHA_EXT_NOT_PRESENT; } else { usage(argv[0]); return EXIT_FAILURE; } } if (do_sse) { printf("Testing SSE interface\n"); init_mb_mgr_sse(&mb_mgr); known_answer_test_sse(&mb_mgr); do_test_sse(&mb_mgr); ctr_test(ARCH_SSE, &mb_mgr); gcm_test(ARCH_SSE); customop_test(&mb_mgr); des_test(ARCH_SSE, &mb_mgr); ccm_test(ARCH_SSE, &mb_mgr); } if (do_avx) { printf("Testing AVX interface\n"); init_mb_mgr_avx(&mb_mgr); known_answer_test_avx(&mb_mgr); do_test_avx(&mb_mgr); ctr_test(ARCH_AVX, &mb_mgr); gcm_test(ARCH_AVX); customop_test(&mb_mgr); des_test(ARCH_AVX, &mb_mgr); ccm_test(ARCH_AVX, &mb_mgr); } if (do_avx2) { printf("Testing AVX2 interface\n"); init_mb_mgr_avx2(&mb_mgr); known_answer_test_avx2(&mb_mgr); do_test_avx2(&mb_mgr); ctr_test(ARCH_AVX2, &mb_mgr); gcm_test(ARCH_AVX2); customop_test(&mb_mgr); des_test(ARCH_AVX2, &mb_mgr); ccm_test(ARCH_AVX2, &mb_mgr); } if 
(do_avx512) { printf("Testing AVX512 interface\n"); init_mb_mgr_avx512(&mb_mgr); known_answer_test_avx512(&mb_mgr); do_test_avx512(&mb_mgr); ctr_test(ARCH_AVX512, &mb_mgr); gcm_test(ARCH_AVX512); customop_test(&mb_mgr); des_test(ARCH_AVX512, &mb_mgr); ccm_test(ARCH_AVX512, &mb_mgr); } printf("Test completed\n"); return EXIT_SUCCESS; } intel-ipsec-mb-0.48/LibTestApp/win_x64.mak000066400000000000000000000054141321406316400202600ustar00rootroot00000000000000# # Copyright (c) 2017, Intel Corporation # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of Intel Corporation nor the names of its contributors # may be used to endorse or promote products derived from this software # without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # APP = ipsec_MB_testapp IPSECLIB = ..\libIPSec_MB.lib !ifdef DEBUG DCFLAGS = /Od /DDEBUG /Z7 DLFLAGS = /debug !else DCFLAGS = /O2 /Oi DLFLAGS = !endif CC = cl # _CRT_SECURE_NO_WARNINGS disables warning C4996 about unsecure snprintf() being used CFLAGS = /nologo /D_CRT_SECURE_NO_WARNINGS $(DCFLAGS) /Y- /W3 /WX- /Gm- /fp:precise /EHsc /I.. 
/I..\include LNK = link LFLAGS = /out:$(APP).exe $(DLFLAGS) all: $(APP).exe $(APP).exe: main.obj gcm_test.obj ctr_test.obj customop_test.obj des_test.obj ccm_test.obj $(IPSECLIB) $(LNK) $(LFLAGS) main.obj gcm_test.obj ctr_test.obj customop_test.obj des_test.obj ccm_test.obj $(IPSECLIB) main.obj: main.c do_test.h $(CC) /c $(CFLAGS) main.c gcm_test.obj: gcm_test.c gcm_ctr_vectors_test.h $(CC) /c $(CFLAGS) gcm_test.c ctr_test.obj: ctr_test.c gcm_ctr_vectors_test.h $(CC) /c $(CFLAGS) ctr_test.c customop_test.obj: customop_test.c customop_test.h $(CC) /c $(CFLAGS) customop_test.c des_test.obj: des_test.c gcm_ctr_vectors_test.h $(CC) /c $(CFLAGS) des_test.c ccm_test.obj: ccm_test.c gcm_ctr_vectors_test.h $(CC) /c $(CFLAGS) ccm_test.c clean: del /q main.obj ctr_test.obj gcm_test.obj customop_test.obj des_test.obj ccm_test.obj $(APP).* intel-ipsec-mb-0.48/Makefile000066400000000000000000000243761321406316400157310ustar00rootroot00000000000000# # Copyright (c) 2012-2017, Intel Corporation # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of Intel Corporation nor the names of its contributors # may be used to endorse or promote products derived from this software # without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # LIB = libIPSec_MB VERSION = 0.48 SO_VERSION = 0 SHARED ?= n USE_YASM ?= n YASM ?= yasm NASM ?= nasm OBJ_DIR = obj INCLUDE_DIRS := include . INCLUDES := $(foreach i,$(INCLUDE_DIRS),-I $i) CC ?= gcc CFLAGS := -DLINUX $(EXTRA_CFLAGS) $(INCLUDES) \ -W -Wall -Wextra -Wmissing-declarations -Wpointer-arith \ -Wcast-qual -Wundef -Wwrite-strings \ -Wformat -Wformat-security \ -Wunreachable-code -Wmissing-noreturn -Wsign-compare -Wno-endif-labels \ -Wstrict-prototypes -Wmissing-prototypes -Wold-style-definition ifeq ($(DEBUG),y) CFLAGS += -g -O0 -DDEBUG LDFLAGS += -g else CFLAGS += -O3 -fstack-protector -D_FORTIFY_SOURCE=2 endif # so or static build ifeq ($(SHARED),y) CFLAGS += -fPIC LIBNAME = $(LIB).so.$(VERSION) LIBPERM = 0755 LDFLAGS += -z noexecstack -z relro -z now else CFLAGS += -fPIE LIBNAME = $(LIB).a LIBPERM = 0644 LDFLAGS += -g endif ASM_INCLUDE_DIRS := include . 
avx avx2 avx512 sse YASM_INCLUDES := $(foreach i,$(ASM_INCLUDE_DIRS),-I $i) YASM_FLAGS := -f x64 -f elf64 -X gnu -g dwarf2 -DLINUX -D__linux__ $(YASM_INCLUDES) NASM_INCLUDES := $(foreach i,$(ASM_INCLUDE_DIRS),-I$i/) NASM_FLAGS := -felf64 -Xgnu -gdwarf -DLINUX -D__linux__ $(NASM_INCLUDES) lib_objs := \ aes128_cbc_dec_by4_sse.o \ aes128_cbc_dec_by8_avx.o \ aes128_cntr_by4_sse.o \ aes128_cntr_by8_avx.o \ aes128_ecbenc_x3.o \ aes192_cbc_dec_by4_sse.o \ aes192_cbc_dec_by8_avx.o \ aes192_cntr_by4_sse.o \ aes192_cntr_by8_avx.o \ aes256_cbc_dec_by4_sse.o \ aes256_cbc_dec_by8_avx.o \ aes256_cntr_by4_sse.o \ aes256_cntr_by8_avx.o \ aes_cfb_128_sse.o \ aes_cfb_128_avx.o \ aes128_cbc_mac_x4.o \ aes128_cbc_mac_x8.o \ aes_cbc_enc_128_x4.o \ aes_cbc_enc_128_x8.o \ aes_cbc_enc_192_x4.o \ aes_cbc_enc_192_x8.o \ aes_cbc_enc_256_x4.o \ aes_cbc_enc_256_x8.o \ aes_keyexp_128.o \ aes_keyexp_192.o \ aes_keyexp_256.o \ aes_xcbc_mac_128_x4.o \ aes_xcbc_mac_128_x8.o \ mb_mgr_aes192_flush_avx.o \ mb_mgr_aes192_flush_sse.o \ mb_mgr_aes192_submit_avx.o \ mb_mgr_aes192_submit_sse.o \ mb_mgr_aes256_flush_avx.o \ mb_mgr_aes256_flush_sse.o \ mb_mgr_aes256_submit_avx.o \ mb_mgr_aes256_submit_sse.o \ mb_mgr_aes_flush_avx.o \ mb_mgr_aes_flush_sse.o \ mb_mgr_aes_submit_avx.o \ mb_mgr_aes_submit_sse.o \ mb_mgr_aes_xcbc_flush_avx.o \ mb_mgr_aes_xcbc_flush_sse.o \ mb_mgr_aes_xcbc_submit_avx.o \ mb_mgr_aes_xcbc_submit_sse.o \ mb_mgr_hmac_flush_avx.o \ mb_mgr_hmac_flush_avx2.o \ mb_mgr_hmac_flush_sse.o \ mb_mgr_hmac_flush_ni_sse.o \ mb_mgr_hmac_flush_avx512.o \ mb_mgr_hmac_md5_flush_avx.o \ mb_mgr_hmac_md5_flush_avx2.o \ mb_mgr_hmac_md5_flush_sse.o \ mb_mgr_hmac_md5_submit_avx.o \ mb_mgr_hmac_md5_submit_avx2.o \ mb_mgr_hmac_md5_submit_sse.o \ mb_mgr_hmac_sha_224_flush_avx.o \ mb_mgr_hmac_sha_224_flush_avx2.o \ mb_mgr_hmac_sha_224_flush_avx512.o \ mb_mgr_hmac_sha_224_flush_sse.o \ mb_mgr_hmac_sha_224_flush_ni_sse.o \ mb_mgr_hmac_sha_224_submit_avx.o \ mb_mgr_hmac_sha_224_submit_avx2.o \ mb_mgr_hmac_sha_224_submit_avx512.o \ mb_mgr_hmac_sha_224_submit_sse.o \ mb_mgr_hmac_sha_224_submit_ni_sse.o \ mb_mgr_hmac_sha_256_flush_avx.o \ mb_mgr_hmac_sha_256_flush_avx2.o \ mb_mgr_hmac_sha_256_flush_sse.o \ mb_mgr_hmac_sha_256_flush_ni_sse.o \ mb_mgr_hmac_sha_256_flush_avx512.o \ mb_mgr_hmac_sha_256_submit_avx.o \ mb_mgr_hmac_sha_256_submit_avx2.o \ mb_mgr_hmac_sha_256_submit_sse.o \ mb_mgr_hmac_sha_256_submit_ni_sse.o \ mb_mgr_hmac_sha_256_submit_avx512.o \ mb_mgr_hmac_sha_384_flush_avx.o \ mb_mgr_hmac_sha_384_flush_avx2.o \ mb_mgr_hmac_sha_384_flush_avx512.o \ mb_mgr_hmac_sha_384_flush_sse.o \ mb_mgr_hmac_sha_384_submit_avx.o \ mb_mgr_hmac_sha_384_submit_avx2.o \ mb_mgr_hmac_sha_384_submit_avx512.o \ mb_mgr_hmac_sha_384_submit_sse.o \ mb_mgr_hmac_sha_512_flush_avx.o \ mb_mgr_hmac_sha_512_flush_avx2.o \ mb_mgr_hmac_sha_512_flush_avx512.o \ mb_mgr_hmac_sha_512_flush_sse.o \ mb_mgr_hmac_sha_512_submit_avx.o \ mb_mgr_hmac_sha_512_submit_avx2.o \ mb_mgr_hmac_sha_512_submit_avx512.o \ mb_mgr_hmac_sha_512_submit_sse.o \ mb_mgr_hmac_submit_avx.o \ mb_mgr_hmac_submit_avx2.o \ mb_mgr_hmac_submit_sse.o \ mb_mgr_hmac_submit_ni_sse.o \ mb_mgr_hmac_submit_avx512.o \ mb_mgr_des_avx512.o \ md5_x4x2_avx.o \ md5_x4x2_sse.o \ md5_x8x2_avx2.o \ save_xmms.o \ sha1_mult_avx.o \ sha1_mult_sse.o \ sha1_ni_x2_sse.o \ sha1_one_block_avx.o \ sha1_one_block_sse.o \ sha1_x8_avx2.o \ sha1_x16_avx512.o \ sha224_one_block_avx.o \ sha224_one_block_sse.o \ sha256_oct_avx2.o \ sha256_one_block_avx.o \ sha256_one_block_sse.o \ sha256_ni_x2_sse.o \ sha256_x16_avx512.o \ 
sha384_one_block_avx.o \ sha384_one_block_sse.o \ sha512_one_block_avx.o \ sha512_one_block_sse.o \ sha512_x2_avx.o \ sha512_x2_sse.o \ sha512_x4_avx2.o \ sha512_x8_avx512.o \ sha_256_mult_avx.o \ sha_256_mult_sse.o \ \ aes_xcbc_expand_key.o \ mb_mgr_avx.o \ mb_mgr_avx2.o \ mb_mgr_avx512.o \ mb_mgr_sse.o \ md5_one_block.o \ des_key.o \ des_basic.o \ des_x16_avx512.o gcm_objs := gcm128_sse.o gcm192_sse.o gcm256_sse.o \ gcm128_avx_gen2.o gcm192_avx_gen2.o gcm256_avx_gen2.o \ gcm128_avx_gen4.o gcm192_avx_gen4.o gcm256_avx_gen4.o ifeq ($(NO_GCM), y) obj2_files := $(lib_objs:%=$(OBJ_DIR)/%) CFLAGS += -DNO_GCM else obj2_files := $(lib_objs:%=$(OBJ_DIR)/%) $(gcm_objs:%=$(OBJ_DIR)/%) endif all: $(LIBNAME) $(LIBNAME): $(obj2_files) ifeq ($(SHARED),y) $(CC) -shared -Wl,-soname,$(LIB).so.$(SO_VERSION) -o $(LIBNAME) $^ -lc ln -f -s $(LIBNAME) $(LIB).so.$(SO_VERSION) ln -f -s $(LIB).so.$(SO_VERSION) $(LIB).so else ar -qcs $@ $^ endif $(obj2_files): | $(OBJ_DIR) $(OBJ_DIR)/%.o:%.c @ echo "Making object file $@ " $(CC) -c $(CFLAGS) $< -o $@ @ echo "--------------------------------------------------------------" $(OBJ_DIR)/%.o:%.asm @ echo "Making object file $@ " ifeq ($(USE_YASM),y) $(YASM) $(YASM_FLAGS) $< -o $@ else $(NASM) -o $@ $(NASM_FLAGS) $< endif @ echo "--------------------------------------------------------------" $(OBJ_DIR)/%.o:sse/%.c @ echo "Making object file $@ " $(CC) -c $(CFLAGS) $< -o $@ @ echo "--------------------------------------------------------------" $(OBJ_DIR)/%.o:sse/%.asm @ echo "Making object file $@ " ifeq ($(USE_YASM),y) $(YASM) $(YASM_FLAGS) $< -o $@ else $(NASM) -o $@ $(NASM_FLAGS) $< endif @ echo "--------------------------------------------------------------" $(OBJ_DIR)/%.o:avx/%.c @ echo "Making object file $@ " $(CC) -c $(CFLAGS) $< -o $@ @ echo "--------------------------------------------------------------" $(OBJ_DIR)/%.o:avx/%.asm @ echo "Making object file $@ " ifeq ($(USE_YASM),y) $(YASM) $(YASM_FLAGS) $< -o $@ else $(NASM) -o $@ $(NASM_FLAGS) $< endif @ echo "--------------------------------------------------------------" $(OBJ_DIR)/%.o:avx2/%.c @ echo "Making object file $@ " $(CC) -c $(CFLAGS) $< -o $@ @ echo "--------------------------------------------------------------" $(OBJ_DIR)/%.o:avx2/%.asm @ echo "Making object file $@ " ifeq ($(USE_YASM),y) $(YASM) $(YASM_FLAGS) $< -o $@ else $(NASM) -o $@ $(NASM_FLAGS) $< endif @ echo "--------------------------------------------------------------" $(OBJ_DIR)/%.o:avx512/%.c @ echo "Making object file $@ " $(CC) -c $(CFLAGS) $< -o $@ @ echo "--------------------------------------------------------------" $(OBJ_DIR)/%.o:avx512/%.asm @ echo "Making object file $@ " ifeq ($(USE_YASM),y) $(YASM) $(YASM_FLAGS) $< -o $@ else $(NASM) -o $@ $(NASM_FLAGS) $< endif @ echo "--------------------------------------------------------------" $(OBJ_DIR)/%.o:include/%.asm @ echo "Making object file $@ " ifeq ($(USE_YASM),y) $(YASM) $(YASM_FLAGS) $< -o $@ else $(NASM) -o $@ $(NASM_FLAGS) $< endif @ echo "--------------------------------------------------------------" $(OBJ_DIR): mkdir $(OBJ_DIR) .PHONY: TAGS TAGS: find ./ -name '*.[ch]' | etags - find ./ -name '*.asm' | etags -a - find ./ -name '*.inc' | etags -a - .PHONY: clean clean: rm -Rf $(obj2_files) rm -f $(LIB).a $(LIB).so* SOURCES_DIRS := . 
sse avx avx2 avx512 include SOURCES := $(foreach dir,$(SOURCES_DIRS),$(wildcard $(dir)/*.[ch]) $(wildcard $(dir)/*.asm) $(wildcard $(dir)/*.inc)) SOURCES_STYLE := $(foreach infile,$(SOURCES),-f $(infile)) CHECKPATCH?=checkpatch.pl # SPACING - produces false positives with tyepdefs and * # CONSTANT_COMPARISON - forbids defensive programming technique # USE_FUNC - produces false positives for Windows target # INITIALISED_STATIC, LEADING_SPACE, SPLIT_STRING, CODE_INDENT, # PREFER_ALIGNED, UNSPECIFIED_INT, ARRAY_SIZE, GLOBAL_INITIALISERS, # NEW_TYPEDEFS, AVOID_EXTERNS, COMPLEX_MACRO, BLOCK_COMMENT_STYLE # - found obsolete in this project .PHONY: style style: $(CHECKPATCH) --no-tree --no-signoff --emacs --no-color \ --ignore CODE_INDENT,INITIALISED_STATIC,LEADING_SPACE,SPLIT_STRING,\ UNSPECIFIED_INT,ARRAY_SIZE,BLOCK_COMMENT_STYLE,GLOBAL_INITIALISERS,\ NEW_TYPEDEFS,AVOID_EXTERNS,COMPLEX_MACRO,PREFER_ALIGNED,USE_FUNC,\ CONSTANT_COMPARISON,SPACING $(SOURCES_STYLE) intel-ipsec-mb-0.48/README000066400000000000000000000052301321406316400151350ustar00rootroot00000000000000======================================================================== README for Intel(R) Multi-Buffer Crypto for IPsec Library November 2016 ======================================================================== Contents ======== - Overview - Package Content - Compilation - Legal Disclaimer Overview ======== Intel Multi-Buffer Crypto for IPsec Library is highly-optimized software implementations of the core cryptographic processing for IPsec, which provides industry-leading performance on a range of Intel(R) Processors. For information on how to build and use this library, see the Intel White Paper: "Fast Multi-buffer IPsec Implementations on Intel Architecture Processors". Jim Guilford, Sean Gulley, et. al. The easiest way to find it is to search the Internet for the title and Intel White Paper. Package Content =============== LibTestApp - sample application using the library interface sse - Intel(R) SSE optimized routines avx - Intel(R) AVX optimized routines avx2 - Intel(R) AVX2 optimized routines avx512 - Intel(R) AVX512 optimized routines Compilation =========== Linux (64-bit only) ------------------- Required tools: - GNU make - NASM version 2.12.02 (or newer) - gcc (GCC) 4.8.3 (or newer) NOTE: Current DES and DOCSIS DES AVX512 cipher implementation requires GCC version with AVX512F and AVX512BW support. gcc (GCC) 6.3.1 has been tested to work correctly. Simply run "make" or "make all" at the top level directory to compile the library. To clean the library build please run "make clean". Windows (x64 only) ------------------ Required tools: - Microsoft (R) Visual Studio 2010: - NMAKE: Microsoft (R) Program Maintenance Utility Version 10.00.30319.01 - CL: Microsoft (R) C/C++ Optimizing Compiler Version 16.00.30319.01 for x64 - LIB: Microsoft (R) Library Manager Version 10.00.30319.01 - LINK: Microsoft (R) Incremental Linker Version 10.00.30319.01 - NASM version 2.12.02 (or newer) Simply run "nmake /f win_x64.mak" or "nmake /f win_x64.mak all" at the top level directory to compile the library. To clean the library build please run "nmake /f win_x64.mak clean". Legal Disclaimer ================ THIS SOFTWARE IS PROVIDED BY INTEL"AS IS". NO LICENSE, EXPRESS OR IMPLIED, BY ESTOPPEL OR OTHERWISE, TO ANY INTELLECTUAL PROPERTY RIGHTS ARE GRANTED THROUGH USE. 
EXCEPT AS PROVIDED IN INTEL'S TERMS AND CONDITIONS OF SALE, INTEL ASSUMES NO LIABILITY WHATSOEVER AND INTEL DISCLAIMS ANY EXPRESS OR IMPLIED WARRANTY, RELATING TO SALE AND/OR USE OF INTEL PRODUCTS INCLUDING LIABILITY OR WARRANTIES RELATING TO FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABILITY, OR INFRINGEMENT OF ANY PATENT, COPYRIGHT OR OTHER INTELLECTUAL PROPERTY RIGHT. intel-ipsec-mb-0.48/ReleaseNotes.txt000066400000000000000000000404031321406316400174100ustar00rootroot00000000000000======================================================================== Release Notes for Intel(R) Multi-Buffer Crypto for IPsec Library v0.48 December 2017 ======================================================================== 12 Dec, 2017 General - Linux SO compilation option added - Windows DLL compilation option added - AES CCM 128 support added - Multithread command line option added to LibPerfApp - Coding style fixes - Coding style target added to Makefile v0.47 October 2017 ======================================================================== Oct 5, 2017 Intel(R) AVX-512 Instructions - DES CBC AVX512 implementation - DOCSIS DES AVX512 implementation General - DES CBC cipher added (generic x86 implementation) - DOCSIS DES cipher added (generic x86 implementation) - DES and DOCSIS DES tests added - RPM SPEC file created v0.46 June 2017 ======================================================================== Jun 27, 2017 General - AES GCM optimizations for AVX2 - Change of AES GCM API: renamed and expanded keys separated from the context - New AES GCM API via job structure and API's - use of the interface may simplify application design at the expense of slightly lower performance vs direct AES GCM API's - AES GCM IV automatically padded with block counter (no need for application to do it) - IV in AES CTR mode can be 12 bytes (no block counter); 16 byte format still allowed - Macros added to ease access to job API for specific architecture - use of these macros can simplify application design but it may produce worse performance than calling architecture job API's directly - Submit_job_nocheck() API added to gain some cycles by not validating job structure - Result stability improvements in LibPerfApp v0.45 March 2017 ======================================================================== Mar 29, 2017 Intel(R) AVX-512 Instructions - Added optimized HMAC-SHA224 and HMAC-SHA256 - Added optimized HMAC-SHA384 and HMAC-SHA512 General - Windows x64 compilation target - New DOCSIS SEC BPI V3.1 cipher - GCM128 and GCM256 updates (with new API that is scatter gather list friendly) - GCM192 added - Added library API benchmark tool 'ipsec_perf' and script to compare results 'ipsec_diff_tool.py' Bug Fixes (vs v0.44) - AES CTR mode fix to allow message size not to be multiple of AES block size - RSI and RDI registers clobbered when running HMAC-SHA224 or HMAC-SHA256 on Windows using SHA extensions v0.44 November 2016 ======================================================================== Nov 21, 2016 Intel(R) AVX-512 Instructions - AVX512 multi buffer manager added (uses AVX2 implementations by default) - Optimized SHA1 implementation added Intel(R) SHA Extensions - SHA1, SHA224 and SHA256 implementations added for Intel(R) SSE General - NULL cipher added - NULL hash added - NASM tool chain compilation added (default) ======================================= Feb 11, 2015 Fixed, so that the job auth_tag_output_len_in_bytes takes a different value for different MAC types. 
In particular, the valid values are(in bytes): SHA1 - 12 sha224 - 14 SHA256 - 16 sha384 - 24 SHA512 - 32 XCBC - 12 MD5 - 12 ======================================= Oct 24, 2011 SHA_256 added to multibuffer ------------------------ 12 Aug 2011 API The GCM API is distinct from the Multi-buffer API. This is because the GCM code is an optimized single-buffer implementation. By packaging them separately, the application has the option of where, when, and how to call the GCM code, independent of how it is calling the multi-buffer code. For example, the application might be enqueing multi-buffer requests for a separate thread to process. In this scenario, if a particular packet used GCM, then the application could choose whether to call the GCM routines directly, or whether to enqueue those requests and have the compute thread call the GCM routines. GCM API The GCM functions are defined as described the the header files. They are simple computational routines, with no state associated with them. Multi-Buffer API: Two Sets of Functions There are two parallel interfaces, one suffixed with "_sse" and one suffixed with "_avx". These are functionally equivalent. The "_sse" functions work on WSM and later processors. The "_avx" functions offer better performance, but they only run on processors after WSM. The same interface object structures are used for both sets of interfaces, although one cannot mix the two interfaces on the same initialized object (e.g. it would be wrong to initialize with init_mb_mgr_sse() and then to pass that to submit_job_avx() ). After the MB_MGR structure has been initialized with one of the two initialization functions (init_mb_mgr_sse() or init_mb_mgr_avx()), only the corresponding functions should be used on it. There are several ways in which an application could use these interfaces. 1) Direct If an application is only going to be run on a post-WSM machine, it can just call the "_avx" functions directly. Conversely, if it is just going to be run on WSM machines, it can call the "_sse" functions directly. 2) Via Branches If an application can run on both WSM and SNB and wants the improved performance on SNB, then it can use some method to determine if it is on SNB, and then use a conditional branch to determine which function to call. E.g. this could be wrapped in a macro along the lines of: #define submit_job(mb_mgr) \ if (_use_avx) submit_job_avx(mb_mgr); \ else submit_job_sse(mb_mgr) 3) Via a Function Table One can embed the function addresses into a structure, call them through this structure, and change the structure based on which set of functions one wishes to use, e.g. struct funcs_t { init_mb_mgr_t init_mb_mgr; get_next_job_t get_next_job; submit_job_t submit_job; get_completed_job_t get_completed_job; flush_job_t flush_job; }; funcs_t funcs_sse = { init_mb_mgr_sse, get_next_job_sse, submit_job_sse, get_completed_job_sse, flush_job_sse }; funcs_t funcs_avx = { init_mb_mgr_avx, get_next_job_avx, submit_job_avx, get_completed_job_avx, flush_job_avx }; funcs_t *funcs = &funcs_sse; ... if (do_avx) funcs = &funcs_avx; ... funcs->init_mb_mgr(&mb_mgr); For simplicity in the rest of this document, the functions will be refered to no suffix. API: Overview The basic unit of work is a "job". It is represented by a JOB_AES_HMAC structure. It contains all of the information needed to perform encryption/decryption and SHA1/HMAC authentication on one buffer for IPSec processing. 
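Returning briefly to dispatch option 3 above, the following is a minimal
sketch of how an application might build and select the function table at
start-up. It is an illustration only, not part of the library: the CPU check
via the GCC builtin __builtin_cpu_supports() and the "mb_mgr.h" header name
are assumptions made for the example; the *_sse/*_avx function names and the
function-pointer typedefs are the ones quoted above.

    /* Example only: one-time selection of the SSE or AVX function set.
     * The AVX check is used here as a stand-in for "post-WSM processor". */
    #include "mb_mgr.h"  /* assumed library header with MB_MGR and the _sse/_avx APIs */

    struct funcs_t {
            init_mb_mgr_t       init_mb_mgr;
            get_next_job_t      get_next_job;
            submit_job_t        submit_job;
            get_completed_job_t get_completed_job;
            flush_job_t         flush_job;
    };

    static const struct funcs_t funcs_sse = {
            init_mb_mgr_sse, get_next_job_sse, submit_job_sse,
            get_completed_job_sse, flush_job_sse
    };
    static const struct funcs_t funcs_avx = {
            init_mb_mgr_avx, get_next_job_avx, submit_job_avx,
            get_completed_job_avx, flush_job_avx
    };

    static const struct funcs_t *select_funcs(void)
    {
            return __builtin_cpu_supports("avx") ? &funcs_avx : &funcs_sse;
    }

Once the table has been selected, all later calls go through it
(funcs->init_mb_mgr(&mb_mgr), funcs->submit_job(&mb_mgr), and so on), so the
two interfaces are never mixed on the same initialized MB_MGR object.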
The basic paradigm is that the application needs to be able to provide new jobs before old jobs have completed processing. One might call this an "asynchronous" interface. The basic interface is that the application "submits" a job to the multi-buffer manager (MB_MGR), and it may receive a completed job back, or it may receive NULL. The returned job, if there is one, will not be the same as the submitted job, but the jobs will be returned in the same order in which they are submitted. Since there can be a semi-arbitrary number of outstanding jobs, management of the job object is handled by the MB_MGR. The application gets a pointer to a new job object by calling get_next_job(). It then fills in the data fields and submits it by calling submit_job(). If a job is returned, then that job has been completed, and the application should do whatever it needs to do in order to further process that buffer. The job object is not explicitly returned to the MB_MGR. Rather it is implicitly returned by the next call to get_next_job(). Another way to put this is that the data within the job object is guaranteed to be valid until the next call to get_next_job(). In order to reduce latency, there is an optional function that may be called, get_completed_job(). This returns the next job if that job has previously been completed. But if that job has not been completed, no processing is done, and the function returns NULL. This may be used to reduce the number of outstanding jobs within the MB_MGR. At times, it may be necessary to process the jobs currently within the MB_MGR without providing new jobs as input. This process is called "flushing", and it is invoked by calling flush_job(). If there are any jobs within the MB_MGR, this will complete processing on the earliest job and return it. It will only return NULL if there are no jobs within the MB_MGR. Flushing will be described in more detail below. The presumption is that the same AES key will apply to a number of buffers. For increased efficiency, it requires that the AES key expansion happens as a distinct step apart from buffer encryption/decryption. The expanded keys are stored in a data structure (array), and this expanded key structure is used by the job object. There are two variants provided, MB_MGR and MB_MGR2. They are functionally equivalent. The reason that two are provided is that they differ slightly in their implementation, and so they may have slightly different characteristics in terms of latency and overhead. API: Usage Skeleton The basic usage is illustrated in the following pseudo_code: init_mb_mgr(&mb_mgr); ... aes_keyexp_128(key, enc_exp_keys, dec_exp_keys); ... while (work_to_be_done) { job = get_next_job(&mb_mgr); // TODO: Fill in job fields job = submit_job(&mb_mgr); while (job) { // TODO: Complete processing on job job = get_completed_job(&mb_mgr); } } API: Job Fields The mode is determined by the fields "cipher_direction" and "chain_order". The first specifies encrypt or decrypt, and the second specifies whether whether the hash should be done before or after the cipher operation. In the current implementation, only two combinations of these are supported. For encryption, these should be set to "ENCRYPT" and "CIPHER_HASH", and for decryption, these should be set to "DECRYPT" and "HASH_CIPHER". The expanded keys are pointed to by "aes_enc_key_expanded" and "aes_dec_key_expanded". These arrays must be aligned on a 16-byte boundary. Only one of these is necessary (as determined by "cipher_direction"). 
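To make the usage skeleton above more concrete, here is a hedged sketch of
submitting one AES128-CBC encrypt with SHA1-HMAC. The field names are the
ones described in this section and the paragraphs that follow; the enum
constants (ENCRYPT, CIPHER_HASH, CBC, SHA1, STS_COMPLETED) and the
placeholder variables (key, buf, iv, tag, hdr_len, payload_len) should be
checked against the library headers of your version, as should the ipad/opad
member names, which are not spelled out here.

    DECLARE_ALIGNED(UINT32 enc_keys[11*4], 16);   /* AES128: 11 x 128-bit round keys */
    DECLARE_ALIGNED(UINT32 dec_keys[11*4], 16);
    MB_MGR mb_mgr;
    JOB_AES_HMAC *job;

    init_mb_mgr_sse(&mb_mgr);
    aes_keyexp_128_sse(key, enc_keys, dec_keys);  /* expand once, reuse for many buffers */

    job = get_next_job_sse(&mb_mgr);
    job->cipher_direction = ENCRYPT;
    job->chain_order      = CIPHER_HASH;          /* hash after cipher when encrypting */
    job->cipher_mode      = CBC;
    job->hash_alg         = SHA1;                 /* SHA1-HMAC */
    job->aes_enc_key_expanded = enc_keys;         /* 16-byte aligned */
    job->aes_key_len_in_bytes = 16;               /* 16 = AES128, 32 = AES256 */
    job->src = buf;
    job->dst = buf + hdr_len;                     /* may equal src + cipher offset (in place) */
    job->cipher_start_src_offset_in_bytes = hdr_len;
    job->msg_len_to_cipher_in_bytes = payload_len;  /* must be a multiple of 16 */
    job->iv = iv;
    job->iv_len_in_bytes = 16;
    job->hash_start_src_offset_in_bytes = 0;
    job->msg_len_to_hash_in_bytes = hdr_len + payload_len;
    job->auth_tag_output = tag;
    job->auth_tag_output_len_in_bytes = 12;
    /* also set the precomputed ipad/opad digests here; see the
       discussion of ipad and opad below for what they contain */

    job = submit_job_sse(&mb_mgr);
    while (job != NULL) {
            /* job->status should be STS_COMPLETED; this buffer is done */
            job = get_completed_job_sse(&mb_mgr);
    }

The same flow applies to the _avx functions; only the suffix changes.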
One selects AES128 vs AES256 by using the "aes_key_len_in_bytes" field. The only valid values are 16 (AES128) and 32 (AES256). One selects the AES mode (CBC versus counter-mode) using "cipher_mode". One selects the hash algorith (SHA1-HMAC, AES-XCBC, or MD5-HMAC) using "hash_alg". The data to be encrypted/decrypted is defined by "src + cipher_start_src_offset_in_bytes". The length of data is given by "msg_len_to_cipher_in_bytes". It must be a multiple of 16 bytes. The destination for the cipher operation is given by "dst" (NOT by "dst + cipher_start_src_offset_in_bytes". In many/most applications, the destination pointer may overlap the source pointer. That is, "dst" may be equal to "src + cipher_start_src_offset_in_bytes". The IV for the cipher operation is given by "iv". The "iv_len_in_bytes" should be 16. This pointer does not need to be aligned. The data to be hashed is defined by "src + hash_start_src_offset_in_bytes". The length of data is given by "msg_len_to_hash_in_bytes". The output of the hash operation is defined by "auth_tag_output". The number of bytes written is given by "auth_tag_output_len_in_bytes". Currently the only valid value for this parameter is 12. The ipad and opad are given as the result of hashing the HMAC key xor'ed with the appropriate value. That is, rather than passing in the HMAC key and rehashing the initial block for every buffer, the hashing of the initial block is done separately, and the results of this hash are used as input in the job structure. Similar to the expanded AES keys, the premise here is that one HMAC key will apply to many buffers, so we want to do that hashing once and not for each buffer. The "status" reflects the status of the returned job. It should be "STS_COMPLETED". The "user_data" field is ignored. It can be used to attach application data to the job object. Flushing Concerns As long as jobs are coming in at a reasonable rate, jobs should be returned at a reasonable rate. However, if there is a lull in the arrival of new jobs, the last few jobs that were submitted tend to stay in the MB_MGR until new jobs arrive. This might result in there being an unreasonable latency for these jobs. In this case, flush_job() should be used to complete processing on these outstanding jobs and prevent them from having excessive latency. Exactly when and how to use flush_job() is up to the application, and is a balancing act. The processing of flush_job() is less efficient than that of submit_job(), so calling flush_job() too often will lower the system efficiency. Conversely, calling flush_job() too rarely may result in some jobs seeing excessive latency. There are several strategies that the application may employ for flushing. One usage model is that there is a (thread-safe) queue containing work items. One or more threads puts work onto this queue, and one or more processing threads removes items from this queue and processes them through the MB_MGR. In this usage, a simple flushing strategy is that when the processing thread wants to do more work, but the queue is empty, it then proceeds to flush jobs until either the queue contains more work, or the MB_MGR no longer contains jobs (i.e. that flush_job() returns NULL). A variation on this is that when the work queue is empty, the processing thread might pause for a short time to see if any new work appears, before it starts flushing. In other usage models, there may be no such queue. An alternate flushing strategy is that have a separate "flush thread" hanging around. 
It wakes up periodically and checks to see if any work has been requested since the last time it woke up. If some period of time has gone by with no new work appearing, it would proceed to flush the MB_MGR. AES Key Usage If the AES mode is CBC, then the fields aes_enc_key_expanded or aes_dec_key_expanded are using depending on whether the data is being encrypted or decrypted. However, if the AES mode is CNTR (counter mode), then only aes_enc_key_expanded is used, even for a decrypt operation. The application can handle this dichotomy, or it might choose to simply set both fields in all cases. Thread Safety The MB_MGR and the associated functions ARE NOT thread safe. If there are multiple threads that may be calling these functions (e.g. a processing thread and a flushing thread), it is the responsibility of the application to put in place sufficient locking so that no two threads will make calls to the same MB_MGR object at the same time. XMM Register Usage The current implementation is designed for integration in the Linux Kernel. All of the functions satisfy the Linux ABI with respect to general purpose registers. However, the submit_job() and flush_job() functions use XMM registers without saving/restoring any of them. It is up to the application to manage the saving/restoring of XMM registers itself. Auxiliary Functions There are several auxiliary functions packed with MB_MGR. These may be used, or the application may choose to use their own version. Two of these, aes_keyexp_128() and aes_keyexp_256() expand AES keys into a form that is acceptable for reference in the job structure. In the case of AES128, the expanded key structure should be an array of 11 128-bit words, aligned on a 16-byte boundary. In the case of AES256, it should be an array of 15 128-bit words, aligned on a 16-byte boundary. There is also a function, sha1(), which will compute the SHA1 digest of a single 64-byte block. It can be used to compute the ipad and opad digests. There is a similar function, md5(), which can be used when using MD5-HMAC. For further details on the usage of these functions, see the sample test application. intel-ipsec-mb-0.48/aes128_ecbenc_x3.asm000066400000000000000000000140151321406316400176740ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ; Routines to do simple AES ECB Enc on one stream with 3 blocks ;void ; aes128_ecbenc_x3_sse(void *in, void *keys, void *out1, void *out2, void *out3); ;void ; aes128_ecbenc_x3_avx(void *in, void *keys, void *out1, void *out2, void *out3); %include "os.asm" %ifdef LINUX %define IN rdi ; arg 1 %define KEYS rsi ; arg 2 %define OUT0 rdx ; arg 3 %define OUT1 rcx ; arg 4 %define OUT2 r8 ; arg 5 %else %define IN rcx ; arg 1 %define KEYS rdx ; arg 2 %define OUT0 r8 ; arg 3 %define OUT1 r9 ; arg 4 %define OUT2 rax ; %endif %define XDATA0 xmm0 %define XDATA1 xmm1 %define XDATA2 xmm2 %define XKEYA xmm3 %define XKEYB xmm4 section .text MKGLOBAL(aes128_ecbenc_x3_sse,function,internal) aes128_ecbenc_x3_sse: %ifndef LINUX mov OUT2, [rsp + 5*8] %endif movdqu XDATA0, [IN + 0*16] ; load first block of plain text movdqu XDATA1, [IN + 1*16] ; load second block of plain text movdqu XDATA2, [IN + 2*16] ; load third block of plain text movdqa XKEYA, [KEYS + 16*0] movdqa XKEYB, [KEYS + 16*1] pxor XDATA0, XKEYA ; 0. ARK pxor XDATA1, XKEYA ; 0. ARK pxor XDATA2, XKEYA ; 0. ARK movdqa XKEYA, [KEYS + 16*2] aesenc XDATA0, XKEYB ; 1. ENC aesenc XDATA1, XKEYB ; 1. ENC aesenc XDATA2, XKEYB ; 1. ENC movdqa XKEYB, [KEYS + 16*3] aesenc XDATA0, XKEYA ; 2. ENC aesenc XDATA1, XKEYA ; 2. ENC aesenc XDATA2, XKEYA ; 2. ENC movdqa XKEYA, [KEYS + 16*4] aesenc XDATA0, XKEYB ; 3. ENC aesenc XDATA1, XKEYB ; 3. ENC aesenc XDATA2, XKEYB ; 3. ENC movdqa XKEYB, [KEYS + 16*5] aesenc XDATA0, XKEYA ; 4. ENC aesenc XDATA1, XKEYA ; 4. ENC aesenc XDATA2, XKEYA ; 4. ENC movdqa XKEYA, [KEYS + 16*6] aesenc XDATA0, XKEYB ; 5. ENC aesenc XDATA1, XKEYB ; 5. ENC aesenc XDATA2, XKEYB ; 5. ENC movdqa XKEYB, [KEYS + 16*7] aesenc XDATA0, XKEYA ; 6. ENC aesenc XDATA1, XKEYA ; 6. ENC aesenc XDATA2, XKEYA ; 6. ENC movdqa XKEYA, [KEYS + 16*8] aesenc XDATA0, XKEYB ; 7. ENC aesenc XDATA1, XKEYB ; 7. ENC aesenc XDATA2, XKEYB ; 7. ENC movdqa XKEYB, [KEYS + 16*9] aesenc XDATA0, XKEYA ; 8. ENC aesenc XDATA1, XKEYA ; 8. ENC aesenc XDATA2, XKEYA ; 8. ENC movdqa XKEYA, [KEYS + 16*10] aesenc XDATA0, XKEYB ; 9. ENC aesenc XDATA1, XKEYB ; 9. ENC aesenc XDATA2, XKEYB ; 9. ENC aesenclast XDATA0, XKEYA ; 10. ENC aesenclast XDATA1, XKEYA ; 10. ENC aesenclast XDATA2, XKEYA ; 10. ENC movdqu [OUT0], XDATA0 ; write back ciphertext movdqu [OUT1], XDATA1 ; write back ciphertext movdqu [OUT2], XDATA2 ; write back ciphertext ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MKGLOBAL(aes128_ecbenc_x3_avx,function,internal) aes128_ecbenc_x3_avx: %ifndef LINUX mov OUT2, [rsp + 5*8] %endif vmovdqu XDATA0, [IN + 0*16] ; load first block of plain text vmovdqu XDATA1, [IN + 1*16] ; load second block of plain text vmovdqu XDATA2, [IN + 2*16] ; load third block of plain text vmovdqa XKEYA, [KEYS + 16*0] vmovdqa XKEYB, [KEYS + 16*1] vpxor XDATA0, XDATA0, XKEYA ; 0. ARK vpxor XDATA1, XDATA1, XKEYA ; 0. ARK vpxor XDATA2, XDATA2, XKEYA ; 0. 
ARK vmovdqa XKEYA, [KEYS + 16*2] vaesenc XDATA0, XKEYB ; 1. ENC vaesenc XDATA1, XKEYB ; 1. ENC vaesenc XDATA2, XKEYB ; 1. ENC vmovdqa XKEYB, [KEYS + 16*3] vaesenc XDATA0, XKEYA ; 2. ENC vaesenc XDATA1, XKEYA ; 2. ENC vaesenc XDATA2, XKEYA ; 2. ENC vmovdqa XKEYA, [KEYS + 16*4] vaesenc XDATA0, XKEYB ; 3. ENC vaesenc XDATA1, XKEYB ; 3. ENC vaesenc XDATA2, XKEYB ; 3. ENC vmovdqa XKEYB, [KEYS + 16*5] vaesenc XDATA0, XKEYA ; 4. ENC vaesenc XDATA1, XKEYA ; 4. ENC vaesenc XDATA2, XKEYA ; 4. ENC vmovdqa XKEYA, [KEYS + 16*6] vaesenc XDATA0, XKEYB ; 5. ENC vaesenc XDATA1, XKEYB ; 5. ENC vaesenc XDATA2, XKEYB ; 5. ENC vmovdqa XKEYB, [KEYS + 16*7] vaesenc XDATA0, XKEYA ; 6. ENC vaesenc XDATA1, XKEYA ; 6. ENC vaesenc XDATA2, XKEYA ; 6. ENC vmovdqa XKEYA, [KEYS + 16*8] vaesenc XDATA0, XKEYB ; 7. ENC vaesenc XDATA1, XKEYB ; 7. ENC vaesenc XDATA2, XKEYB ; 7. ENC vmovdqa XKEYB, [KEYS + 16*9] vaesenc XDATA0, XKEYA ; 8. ENC vaesenc XDATA1, XKEYA ; 8. ENC vaesenc XDATA2, XKEYA ; 8. ENC vmovdqa XKEYA, [KEYS + 16*10] vaesenc XDATA0, XKEYB ; 9. ENC vaesenc XDATA1, XKEYB ; 9. ENC vaesenc XDATA2, XKEYB ; 9. ENC vaesenclast XDATA0, XKEYA ; 10. ENC vaesenclast XDATA1, XKEYA ; 10. ENC vaesenclast XDATA2, XKEYA ; 10. ENC vmovdqu [OUT0], XDATA0 ; write back ciphertext vmovdqu [OUT1], XDATA1 ; write back ciphertext vmovdqu [OUT2], XDATA2 ; write back ciphertext ret intel-ipsec-mb-0.48/aes_keyexp_128.asm000066400000000000000000000255531321406316400175200ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ; Routine to do AES key expansion %include "os.asm" %macro key_expansion_128_sse 0 ;; Assumes the xmm3 includes all zeros at this point. pshufd xmm2, xmm2, 11111111b shufps xmm3, xmm1, 00010000b pxor xmm1, xmm3 shufps xmm3, xmm1, 10001100b pxor xmm1, xmm3 pxor xmm1, xmm2 %endmacro %macro key_expansion_128_avx 0 ;; Assumes the xmm3 includes all zeros at this point. 
vpshufd xmm2, xmm2, 11111111b vshufps xmm3, xmm3, xmm1, 00010000b vpxor xmm1, xmm1, xmm3 vshufps xmm3, xmm3, xmm1, 10001100b vpxor xmm1, xmm1, xmm3 vpxor xmm1, xmm1, xmm2 %endmacro %ifdef LINUX %define KEY rdi %define EXP_ENC_KEYS rsi %define EXP_DEC_KEYS rdx %else %define KEY rcx %define EXP_ENC_KEYS rdx %define EXP_DEC_KEYS r8 %endif section .text ; void aes_keyexp_128(UINT128 *key, ; UINT128 *enc_exp_keys, ; UINT128 *dec_exp_keys); ; ; arg 1: rcx: pointer to key ; arg 2: rdx: pointer to expanded key array for encrypt ; arg 3: r8: pointer to expanded key array for decrypt ; MKGLOBAL(aes_keyexp_128_sse,function,) aes_keyexp_128_sse: movdqu xmm1, [KEY] ; loading the AES key movdqa [EXP_ENC_KEYS + 16*0], xmm1 movdqa [EXP_DEC_KEYS + 16*10], xmm1 ; Storing key in memory pxor xmm3, xmm3 aeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1 key_expansion_128_sse movdqa [EXP_ENC_KEYS + 16*1], xmm1 aesimc xmm4, xmm1 movdqa [EXP_DEC_KEYS + 16*9], xmm4 aeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2 key_expansion_128_sse movdqa [EXP_ENC_KEYS + 16*2], xmm1 aesimc xmm5, xmm1 movdqa [EXP_DEC_KEYS + 16*8], xmm5 aeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3 key_expansion_128_sse movdqa [EXP_ENC_KEYS + 16*3], xmm1 aesimc xmm4, xmm1 movdqa [EXP_DEC_KEYS + 16*7], xmm4 aeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4 key_expansion_128_sse movdqa [EXP_ENC_KEYS + 16*4], xmm1 aesimc xmm5, xmm1 movdqa [EXP_DEC_KEYS + 16*6], xmm5 aeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5 key_expansion_128_sse movdqa [EXP_ENC_KEYS + 16*5], xmm1 aesimc xmm4, xmm1 movdqa [EXP_DEC_KEYS + 16*5], xmm4 aeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6 key_expansion_128_sse movdqa [EXP_ENC_KEYS + 16*6], xmm1 aesimc xmm5, xmm1 movdqa [EXP_DEC_KEYS + 16*4], xmm5 aeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7 key_expansion_128_sse movdqa [EXP_ENC_KEYS + 16*7], xmm1 aesimc xmm4, xmm1 movdqa [EXP_DEC_KEYS + 16*3], xmm4 aeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8 key_expansion_128_sse movdqa [EXP_ENC_KEYS + 16*8], xmm1 aesimc xmm5, xmm1 movdqa [EXP_DEC_KEYS + 16*2], xmm5 aeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9 key_expansion_128_sse movdqa [EXP_ENC_KEYS + 16*9], xmm1 aesimc xmm4, xmm1 movdqa [EXP_DEC_KEYS + 16*1], xmm4 aeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10 key_expansion_128_sse movdqa [EXP_ENC_KEYS + 16*10], xmm1 movdqa [EXP_DEC_KEYS + 16*0], xmm1 ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MKGLOBAL(aes_keyexp_128_avx,function,) aes_keyexp_128_avx: vmovdqu xmm1, [KEY] ; loading the AES key vmovdqa [EXP_ENC_KEYS + 16*0], xmm1 vmovdqa [EXP_DEC_KEYS + 16*10], xmm1 ; Storing key in memory vpxor xmm3, xmm3, xmm3 vaeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1 key_expansion_128_avx vmovdqa [EXP_ENC_KEYS + 16*1], xmm1 vaesimc xmm4, xmm1 vmovdqa [EXP_DEC_KEYS + 16*9], xmm4 vaeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2 key_expansion_128_avx vmovdqa [EXP_ENC_KEYS + 16*2], xmm1 vaesimc xmm5, xmm1 vmovdqa [EXP_DEC_KEYS + 16*8], xmm5 vaeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3 key_expansion_128_avx vmovdqa [EXP_ENC_KEYS + 16*3], xmm1 vaesimc xmm4, xmm1 vmovdqa [EXP_DEC_KEYS + 16*7], xmm4 vaeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4 
key_expansion_128_avx vmovdqa [EXP_ENC_KEYS + 16*4], xmm1 vaesimc xmm5, xmm1 vmovdqa [EXP_DEC_KEYS + 16*6], xmm5 vaeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5 key_expansion_128_avx vmovdqa [EXP_ENC_KEYS + 16*5], xmm1 vaesimc xmm4, xmm1 vmovdqa [EXP_DEC_KEYS + 16*5], xmm4 vaeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6 key_expansion_128_avx vmovdqa [EXP_ENC_KEYS + 16*6], xmm1 vaesimc xmm5, xmm1 vmovdqa [EXP_DEC_KEYS + 16*4], xmm5 vaeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7 key_expansion_128_avx vmovdqa [EXP_ENC_KEYS + 16*7], xmm1 vaesimc xmm4, xmm1 vmovdqa [EXP_DEC_KEYS + 16*3], xmm4 vaeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8 key_expansion_128_avx vmovdqa [EXP_ENC_KEYS + 16*8], xmm1 vaesimc xmm5, xmm1 vmovdqa [EXP_DEC_KEYS + 16*2], xmm5 vaeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9 key_expansion_128_avx vmovdqa [EXP_ENC_KEYS + 16*9], xmm1 vaesimc xmm4, xmm1 vmovdqa [EXP_DEC_KEYS + 16*1], xmm4 vaeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10 key_expansion_128_avx vmovdqa [EXP_ENC_KEYS + 16*10], xmm1 vmovdqa [EXP_DEC_KEYS + 16*0], xmm1 ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; void aes_keyexp_128_enc_sse(UINT128 *key, ; UINT128 *enc_exp_keys); ; ; arg 1: rcx: pointer to key ; arg 2: rdx: pointer to expanded key array for encrypt ; MKGLOBAL(aes_keyexp_128_enc_sse,function,) aes_keyexp_128_enc_sse: movdqu xmm1, [KEY] ; loading the AES key movdqa [EXP_ENC_KEYS + 16*0], xmm1 pxor xmm3, xmm3 aeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1 key_expansion_128_sse movdqa [EXP_ENC_KEYS + 16*1], xmm1 aeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2 key_expansion_128_sse movdqa [EXP_ENC_KEYS + 16*2], xmm1 aeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3 key_expansion_128_sse movdqa [EXP_ENC_KEYS + 16*3], xmm1 aeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4 key_expansion_128_sse movdqa [EXP_ENC_KEYS + 16*4], xmm1 aeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5 key_expansion_128_sse movdqa [EXP_ENC_KEYS + 16*5], xmm1 aeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6 key_expansion_128_sse movdqa [EXP_ENC_KEYS + 16*6], xmm1 aeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7 key_expansion_128_sse movdqa [EXP_ENC_KEYS + 16*7], xmm1 aeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8 key_expansion_128_sse movdqa [EXP_ENC_KEYS + 16*8], xmm1 aeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9 key_expansion_128_sse movdqa [EXP_ENC_KEYS + 16*9], xmm1 aeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10 key_expansion_128_sse movdqa [EXP_ENC_KEYS + 16*10], xmm1 ret MKGLOBAL(aes_keyexp_128_enc_avx,function,) aes_keyexp_128_enc_avx: vmovdqu xmm1, [KEY] ; loading the AES key vmovdqa [EXP_ENC_KEYS + 16*0], xmm1 vpxor xmm3, xmm3, xmm3 vaeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1 key_expansion_128_avx vmovdqa [EXP_ENC_KEYS + 16*1], xmm1 vaeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2 key_expansion_128_avx vmovdqa [EXP_ENC_KEYS + 16*2], xmm1 vaeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3 key_expansion_128_avx vmovdqa [EXP_ENC_KEYS + 16*3], xmm1 vaeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4 key_expansion_128_avx vmovdqa [EXP_ENC_KEYS + 16*4], xmm1 vaeskeygenassist xmm2, xmm1, 
0x10 ; Generating round key 5 key_expansion_128_avx vmovdqa [EXP_ENC_KEYS + 16*5], xmm1 vaeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6 key_expansion_128_avx vmovdqa [EXP_ENC_KEYS + 16*6], xmm1 vaeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7 key_expansion_128_avx vmovdqa [EXP_ENC_KEYS + 16*7], xmm1 vaeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8 key_expansion_128_avx vmovdqa [EXP_ENC_KEYS + 16*8], xmm1 vaeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9 key_expansion_128_avx vmovdqa [EXP_ENC_KEYS + 16*9], xmm1 vaeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10 key_expansion_128_avx vmovdqa [EXP_ENC_KEYS + 16*10], xmm1 ret intel-ipsec-mb-0.48/aes_keyexp_192.asm000066400000000000000000000307111321406316400175110ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %ifdef LINUX %define KEY rdi %define EXP_ENC_KEYS rsi %define EXP_DEC_KEYS rdx %else %define KEY rcx %define EXP_ENC_KEYS rdx %define EXP_DEC_KEYS r8 %endif %macro key_expansion_1_192_sse 1 ;; Assumes the xmm3 includes all zeros at this point. pshufd xmm2, xmm2, 11111111b shufps xmm3, xmm1, 00010000b pxor xmm1, xmm3 shufps xmm3, xmm1, 10001100b pxor xmm1, xmm3 pxor xmm1, xmm2 movdqu [EXP_ENC_KEYS + %1], xmm1 %endmacro ; Calculate w10 and w11 using calculated w9 and known w4-w5 %macro key_expansion_2_192_sse 1 movdqa xmm5, xmm4 pslldq xmm5, 4 shufps xmm6, xmm1, 11110000b pxor xmm6, xmm5 pxor xmm4, xmm6 pshufd xmm7, xmm4, 00001110b movdqu [EXP_ENC_KEYS + %1], xmm7 %endmacro %macro key_dec_192_sse 1 movdqa xmm0, [EXP_ENC_KEYS + 16 * %1] aesimc xmm1, xmm0 movdqa [EXP_DEC_KEYS + 16 * (12 - %1)], xmm1 %endmacro %macro key_expansion_1_192_avx 1 ;; Assumes the xmm3 includes all zeros at this point. 
vpshufd xmm2, xmm2, 11111111b vshufps xmm3, xmm3, xmm1, 00010000b vpxor xmm1, xmm1, xmm3 vshufps xmm3, xmm3, xmm1, 10001100b vpxor xmm1, xmm1, xmm3 vpxor xmm1, xmm1, xmm2 vmovdqu [EXP_ENC_KEYS + %1], xmm1 %endmacro ; Calculate w10 and w11 using calculated w9 and known w4-w5 %macro key_expansion_2_192_avx 1 vmovdqa xmm5, xmm4 vpslldq xmm5, xmm5, 4 vshufps xmm6, xmm6, xmm1, 11110000b vpxor xmm6, xmm6, xmm5 vpxor xmm4, xmm4, xmm6 vpshufd xmm7, xmm4, 00001110b vmovdqu [EXP_ENC_KEYS + %1], xmm7 %endmacro %macro key_dec_192_avx 1 vmovdqa xmm0, [EXP_ENC_KEYS + 16 * %1] vaesimc xmm1, xmm0 vmovdqa [EXP_DEC_KEYS + 16 * (12 - %1)], xmm1 %endmacro section .text ; void aes_keyexp_192(UINT128 *key, ; UINT128 *enc_exp_keys, ; UINT128 *dec_exp_keys); ; ; arg 1: rcx: pointer to key ; arg 2: rdx: pointer to expanded key array for encrypt ; arg 3: r8: pointer to expanded key array for decrypt ; MKGLOBAL(aes_keyexp_192_sse,function,) aes_keyexp_192_sse: %ifndef LINUX sub rsp, 16*2 + 8 movdqa [rsp + 0*16], xmm6 movdqa [rsp + 1*16], xmm7 %endif movq xmm7, [KEY + 16] ; loading the AES key, 64 bits movq [EXP_ENC_KEYS + 16], xmm7 ; Storing key in memory where all key expansion pshufd xmm4, xmm7, 01001111b movdqu xmm1, [KEY] ; loading the AES key, 128 bits movdqu [EXP_ENC_KEYS], xmm1 ; Storing key in memory where all key expansion movdqa [EXP_DEC_KEYS + 16*0], xmm1 movdqa [EXP_DEC_KEYS + 16*12], xmm1 pxor xmm3, xmm3 ; Set xmm3 to be all zeros. Required for the key_expansion pxor xmm6, xmm6 ; Set xmm3 to be all zeros. Required for the key_expansion aeskeygenassist xmm2, xmm4, 0x1 ; Complete round key 1 and generate round key 2 key_expansion_1_192_sse 24 key_expansion_2_192_sse 40 aeskeygenassist xmm2, xmm4, 0x2 ; Generate round key 3 and part of round key 4 key_expansion_1_192_sse 48 key_expansion_2_192_sse 64 aeskeygenassist xmm2, xmm4, 0x4 ; Complete round key 4 and generate round key 5 key_expansion_1_192_sse 72 key_expansion_2_192_sse 88 aeskeygenassist xmm2, xmm4, 0x8 ; Generate round key 6 and part of round key 7 key_expansion_1_192_sse 96 key_expansion_2_192_sse 112 aeskeygenassist xmm2, xmm4, 0x10 ; Complete round key 7 and generate round key 8 key_expansion_1_192_sse 120 key_expansion_2_192_sse 136 aeskeygenassist xmm2, xmm4, 0x20 ; Generate round key 9 and part of round key 10 key_expansion_1_192_sse 144 key_expansion_2_192_sse 160 aeskeygenassist xmm2, xmm4, 0x40 ; Complete round key 10 and generate round key 11 key_expansion_1_192_sse 168 key_expansion_2_192_sse 184 aeskeygenassist xmm2, xmm4, 0x80 ; Generate round key 12 key_expansion_1_192_sse 192 ;;; we have already saved the 12 th key, which is pure input on the ;;; ENC key path movdqa xmm0, [EXP_ENC_KEYS + 16 * 12] movdqa [EXP_DEC_KEYS + 16*0], xmm0 ;;; generate remaining decrypt keys key_dec_192_sse 1 key_dec_192_sse 2 key_dec_192_sse 3 key_dec_192_sse 4 key_dec_192_sse 5 key_dec_192_sse 6 key_dec_192_sse 7 key_dec_192_sse 8 key_dec_192_sse 9 key_dec_192_sse 10 key_dec_192_sse 11 %ifndef LINUX movdqa xmm6, [rsp + 0*16] movdqa xmm7, [rsp + 1*16] add rsp, 16*2 + 8 %endif ret MKGLOBAL(aes_keyexp_192_avx,function,) aes_keyexp_192_avx: %ifndef LINUX sub rsp, 16*2 + 8 vmovdqa [rsp + 0*16], xmm6 vmovdqa [rsp + 1*16], xmm7 %endif vmovq xmm7, [KEY + 16] ; loading the AES key, 64 bits vmovq [EXP_ENC_KEYS + 16], xmm7 ; Storing key in memory where all key expansion vpshufd xmm4, xmm7, 01001111b vmovdqu xmm1, [KEY] ; loading the AES key, 128 bits vmovdqu [EXP_ENC_KEYS], xmm1 ; Storing key in memory where all key expansion vmovdqa [EXP_DEC_KEYS + 16*0], xmm1 
vmovdqa [EXP_DEC_KEYS + 16*12], xmm1 vpxor xmm3, xmm3, xmm3 vpxor xmm6, xmm6, xmm6 vaeskeygenassist xmm2, xmm4, 0x1 ; Complete round key 1 and generate round key 2 key_expansion_1_192_avx 24 key_expansion_2_192_avx 40 vaeskeygenassist xmm2, xmm4, 0x2 ; Generate round key 3 and part of round key 4 key_expansion_1_192_avx 48 key_expansion_2_192_avx 64 vaeskeygenassist xmm2, xmm4, 0x4 ; Complete round key 4 and generate round key 5 key_expansion_1_192_avx 72 key_expansion_2_192_avx 88 vaeskeygenassist xmm2, xmm4, 0x8 ; Generate round key 6 and part of round key 7 key_expansion_1_192_avx 96 key_expansion_2_192_avx 112 vaeskeygenassist xmm2, xmm4, 0x10 ; Complete round key 7 and generate round key 8 key_expansion_1_192_avx 120 key_expansion_2_192_avx 136 vaeskeygenassist xmm2, xmm4, 0x20 ; Generate round key 9 and part of round key 10 key_expansion_1_192_avx 144 key_expansion_2_192_avx 160 vaeskeygenassist xmm2, xmm4, 0x40 ; Complete round key 10 and generate round key 11 key_expansion_1_192_avx 168 key_expansion_2_192_avx 184 vaeskeygenassist xmm2, xmm4, 0x80 ; Generate round key 12 key_expansion_1_192_avx 192 ;;; we have already saved the 12 th key, which is pure input on the ;;; ENC key path vmovdqa xmm0, [EXP_ENC_KEYS + 16 * 12] vmovdqa [EXP_DEC_KEYS + 16*0], xmm0 ;;; generate remaining decrypt keys key_dec_192_avx 1 key_dec_192_avx 2 key_dec_192_avx 3 key_dec_192_avx 4 key_dec_192_avx 5 key_dec_192_avx 6 key_dec_192_avx 7 key_dec_192_avx 8 key_dec_192_avx 9 key_dec_192_avx 10 key_dec_192_avx 11 %ifndef LINUX vmovdqa xmm6, [rsp + 0*16] vmovdqa xmm7, [rsp + 1*16] add rsp, 16*2 + 8 %endif ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; void aes_keyexp_192_enc_sse(UINT128 *key, ; UINT128 *enc_exp_keys); ; ; arg 1: rcx: pointer to key ; arg 2: rdx: pointer to expanded key array for encrypt ; MKGLOBAL(aes_keyexp_192_enc_sse,function,) aes_keyexp_192_enc_sse: %ifndef LINUX sub rsp, 16*2 + 8 movdqa [rsp + 0*16], xmm6 movdqa [rsp + 1*16], xmm7 %endif movq xmm7, [KEY + 16] ; loading the AES key, 64 bits movq [EXP_ENC_KEYS + 16], xmm7 ; Storing key in memory where all key expansion pshufd xmm4, xmm7, 01001111b movdqu xmm1, [KEY] ; loading the AES key, 128 bits movdqu [EXP_ENC_KEYS], xmm1 ; Storing key in memory where all key expansion pxor xmm3, xmm3 ; Set xmm3 to be all zeros. Required for the key_expansion. pxor xmm6, xmm6 ; Set xmm3 to be all zeros. Required for the key_expansion. 
aeskeygenassist xmm2, xmm4, 0x1 ; Complete round key 1 and generate round key 2 key_expansion_1_192_sse 24 key_expansion_2_192_sse 40 aeskeygenassist xmm2, xmm4, 0x2 ; Generate round key 3 and part of round key 4 key_expansion_1_192_sse 48 key_expansion_2_192_sse 64 aeskeygenassist xmm2, xmm4, 0x4 ; Complete round key 4 and generate round key 5 key_expansion_1_192_sse 72 key_expansion_2_192_sse 88 aeskeygenassist xmm2, xmm4, 0x8 ; Generate round key 6 and part of round key 7 key_expansion_1_192_sse 96 key_expansion_2_192_sse 112 aeskeygenassist xmm2, xmm4, 0x10 ; Complete round key 7 and generate round key 8 key_expansion_1_192_sse 120 key_expansion_2_192_sse 136 aeskeygenassist xmm2, xmm4, 0x20 ; Generate round key 9 and part of round key 10 key_expansion_1_192_sse 144 key_expansion_2_192_sse 160 aeskeygenassist xmm2, xmm4, 0x40 ; Complete round key 10 and generate round key 11 key_expansion_1_192_sse 168 key_expansion_2_192_sse 184 aeskeygenassist xmm2, xmm4, 0x80 ; Generate round key 12 key_expansion_1_192_sse 192 %ifndef LINUX movdqa xmm6, [rsp + 0*16] movdqa xmm7, [rsp + 1*16] add rsp, 16*2 + 8 %endif ret MKGLOBAL(aes_keyexp_192_enc_avx,function,) aes_keyexp_192_enc_avx: %ifndef LINUX sub rsp, 16*2 + 8 vmovdqa [rsp + 0*16], xmm6 vmovdqa [rsp + 1*16], xmm7 %endif vmovq xmm7, [KEY + 16] ; loading the AES key, 64 bits vmovq [EXP_ENC_KEYS + 16], xmm7 ; Storing key in memory where all key expansion vpshufd xmm4, xmm7, 01001111b vmovdqu xmm1, [KEY] ; loading the AES key, 128 bits vmovdqu [EXP_ENC_KEYS], xmm1 ; Storing key in memory where all key expansion vpxor xmm3, xmm3, xmm3 vpxor xmm6, xmm6, xmm6 vaeskeygenassist xmm2, xmm4, 0x1 ; Complete round key 1 and generate round key 2 key_expansion_1_192_avx 24 key_expansion_2_192_avx 40 vaeskeygenassist xmm2, xmm4, 0x2 ; Generate round key 3 and part of round key 4 key_expansion_1_192_avx 48 key_expansion_2_192_avx 64 vaeskeygenassist xmm2, xmm4, 0x4 ; Complete round key 4 and generate round key 5 key_expansion_1_192_avx 72 key_expansion_2_192_avx 88 vaeskeygenassist xmm2, xmm4, 0x8 ; Generate round key 6 and part of round key 7 key_expansion_1_192_avx 96 key_expansion_2_192_avx 112 vaeskeygenassist xmm2, xmm4, 0x10 ; Complete round key 7 and generate round key 8 key_expansion_1_192_avx 120 key_expansion_2_192_avx 136 vaeskeygenassist xmm2, xmm4, 0x20 ; Generate round key 9 and part of round key 10 key_expansion_1_192_avx 144 key_expansion_2_192_avx 160 vaeskeygenassist xmm2, xmm4, 0x40 ; Complete round key 10 and generate round key 11 key_expansion_1_192_avx 168 key_expansion_2_192_avx 184 vaeskeygenassist xmm2, xmm4, 0x80 ; Generate round key 12 key_expansion_1_192_avx 192 %ifndef LINUX vmovdqa xmm6, [rsp + 0*16] vmovdqa xmm7, [rsp + 1*16] add rsp, 16*2 + 8 %endif ret intel-ipsec-mb-0.48/aes_keyexp_256.asm000066400000000000000000000335051321406316400175160ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. 
;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ; Routine to do AES key expansion %include "os.asm" ; Uses the f() function of the aeskeygenassist result %macro key_expansion_256_sse 0 ;; Assumes the xmm3 includes all zeros at this point. pshufd xmm2, xmm2, 11111111b shufps xmm3, xmm1, 00010000b pxor xmm1, xmm3 shufps xmm3, xmm1, 10001100b pxor xmm1, xmm3 pxor xmm1, xmm2 %endmacro ; Uses the SubWord function of the aeskeygenassist result %macro key_expansion_256_sse_2 0 ;; Assumes the xmm3 includes all zeros at this point. pshufd xmm2, xmm2, 10101010b shufps xmm3, xmm4, 00010000b pxor xmm4, xmm3 shufps xmm3, xmm4, 10001100b pxor xmm4, xmm3 pxor xmm4, xmm2 %endmacro ; Uses the f() function of the aeskeygenassist result %macro key_expansion_256_avx 0 ;; Assumes the xmm3 includes all zeros at this point. vpshufd xmm2, xmm2, 11111111b vshufps xmm3, xmm3, xmm1, 00010000b vpxor xmm1, xmm1, xmm3 vshufps xmm3, xmm3, xmm1, 10001100b vpxor xmm1, xmm1, xmm3 vpxor xmm1, xmm1, xmm2 %endmacro ; Uses the SubWord function of the aeskeygenassist result %macro key_expansion_256_avx_2 0 ;; Assumes the xmm3 includes all zeros at this point. vpshufd xmm2, xmm2, 10101010b vshufps xmm3, xmm3, xmm4, 00010000b vpxor xmm4, xmm4, xmm3 vshufps xmm3, xmm3, xmm4, 10001100b vpxor xmm4, xmm4, xmm3 vpxor xmm4, xmm4, xmm2 %endmacro %ifdef LINUX %define KEY rdi %define EXP_ENC_KEYS rsi %define EXP_DEC_KEYS rdx %else %define KEY rcx %define EXP_ENC_KEYS rdx %define EXP_DEC_KEYS r8 %endif section .text ; void aes_keyexp_256(UINT128 *key, ; UINT128 *enc_exp_keys, ; UINT128 *dec_exp_keys); ; ; arg 1: rcx: pointer to key ; arg 2: rdx: pointer to expanded key array for encrypt ; arg 3: r8: pointer to expanded key array for decrypt ; MKGLOBAL(aes_keyexp_256_sse,function,) aes_keyexp_256_sse: movdqu xmm1, [KEY] ; loading the AES key movdqa [EXP_ENC_KEYS + 16*0], xmm1 movdqa [EXP_DEC_KEYS + 16*14], xmm1 ; Storing key in memory movdqu xmm4, [KEY+16] ; loading the AES key movdqa [EXP_ENC_KEYS + 16*1], xmm4 aesimc xmm0, xmm4 movdqa [EXP_DEC_KEYS + 16*13], xmm0 ; Storing key in memory pxor xmm3, xmm3 ; Required for the key_expansion. 
aeskeygenassist xmm2, xmm4, 0x1 ; Generating round key 2 key_expansion_256_sse movdqa [EXP_ENC_KEYS + 16*2], xmm1 aesimc xmm5, xmm1 movdqa [EXP_DEC_KEYS + 16*12], xmm5 aeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 3 key_expansion_256_sse_2 movdqa [EXP_ENC_KEYS + 16*3], xmm4 aesimc xmm0, xmm4 movdqa [EXP_DEC_KEYS + 16*11], xmm0 aeskeygenassist xmm2, xmm4, 0x2 ; Generating round key 4 key_expansion_256_sse movdqa [EXP_ENC_KEYS + 16*4], xmm1 aesimc xmm5, xmm1 movdqa [EXP_DEC_KEYS + 16*10], xmm5 aeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 5 key_expansion_256_sse_2 movdqa [EXP_ENC_KEYS + 16*5], xmm4 aesimc xmm0, xmm4 movdqa [EXP_DEC_KEYS + 16*9], xmm0 aeskeygenassist xmm2, xmm4, 0x4 ; Generating round key 6 key_expansion_256_sse movdqa [EXP_ENC_KEYS + 16*6], xmm1 aesimc xmm5, xmm1 movdqa [EXP_DEC_KEYS + 16*8], xmm5 aeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 7 key_expansion_256_sse_2 movdqa [EXP_ENC_KEYS + 16*7], xmm4 aesimc xmm0, xmm4 movdqa [EXP_DEC_KEYS + 16*7], xmm0 aeskeygenassist xmm2, xmm4, 0x8 ; Generating round key 8 key_expansion_256_sse movdqa [EXP_ENC_KEYS + 16*8], xmm1 aesimc xmm5, xmm1 movdqa [EXP_DEC_KEYS + 16*6], xmm5 aeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 9 key_expansion_256_sse_2 movdqa [EXP_ENC_KEYS + 16*9], xmm4 aesimc xmm0, xmm4 movdqa [EXP_DEC_KEYS + 16*5], xmm0 aeskeygenassist xmm2, xmm4, 0x10 ; Generating round key 10 key_expansion_256_sse movdqa [EXP_ENC_KEYS + 16*10], xmm1 aesimc xmm5, xmm1 movdqa [EXP_DEC_KEYS + 16*4], xmm5 aeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 11 key_expansion_256_sse_2 movdqa [EXP_ENC_KEYS + 16*11], xmm4 aesimc xmm0, xmm4 movdqa [EXP_DEC_KEYS + 16*3], xmm0 aeskeygenassist xmm2, xmm4, 0x20 ; Generating round key 12 key_expansion_256_sse movdqa [EXP_ENC_KEYS + 16*12], xmm1 aesimc xmm5, xmm1 movdqa [EXP_DEC_KEYS + 16*2], xmm5 aeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 13 key_expansion_256_sse_2 movdqa [EXP_ENC_KEYS + 16*13], xmm4 aesimc xmm0, xmm4 movdqa [EXP_DEC_KEYS + 16*1], xmm0 aeskeygenassist xmm2, xmm4, 0x40 ; Generating round key 14 key_expansion_256_sse movdqa [EXP_ENC_KEYS + 16*14], xmm1 movdqa [EXP_DEC_KEYS + 16*0], xmm1 ret MKGLOBAL(aes_keyexp_256_avx,function,) aes_keyexp_256_avx: vmovdqu xmm1, [KEY] ; loading the AES key vmovdqa [EXP_ENC_KEYS + 16*0], xmm1 vmovdqa [EXP_DEC_KEYS + 16*14], xmm1 ; Storing key in memory vmovdqu xmm4, [KEY+16] ; loading the AES key vmovdqa [EXP_ENC_KEYS + 16*1], xmm4 vaesimc xmm0, xmm4 vmovdqa [EXP_DEC_KEYS + 16*13], xmm0 ; Storing key in memory vpxor xmm3, xmm3, xmm3 ; Required for the key_expansion. 
vaeskeygenassist xmm2, xmm4, 0x1 ; Generating round key 2 key_expansion_256_avx vmovdqa [EXP_ENC_KEYS + 16*2], xmm1 vaesimc xmm5, xmm1 vmovdqa [EXP_DEC_KEYS + 16*12], xmm5 vaeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 3 key_expansion_256_avx_2 vmovdqa [EXP_ENC_KEYS + 16*3], xmm4 vaesimc xmm0, xmm4 vmovdqa [EXP_DEC_KEYS + 16*11], xmm0 vaeskeygenassist xmm2, xmm4, 0x2 ; Generating round key 4 key_expansion_256_avx vmovdqa [EXP_ENC_KEYS + 16*4], xmm1 vaesimc xmm5, xmm1 vmovdqa [EXP_DEC_KEYS + 16*10], xmm5 vaeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 5 key_expansion_256_avx_2 vmovdqa [EXP_ENC_KEYS + 16*5], xmm4 vaesimc xmm0, xmm4 vmovdqa [EXP_DEC_KEYS + 16*9], xmm0 vaeskeygenassist xmm2, xmm4, 0x4 ; Generating round key 6 key_expansion_256_avx vmovdqa [EXP_ENC_KEYS + 16*6], xmm1 vaesimc xmm5, xmm1 vmovdqa [EXP_DEC_KEYS + 16*8], xmm5 vaeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 7 key_expansion_256_avx_2 vmovdqa [EXP_ENC_KEYS + 16*7], xmm4 vaesimc xmm0, xmm4 vmovdqa [EXP_DEC_KEYS + 16*7], xmm0 vaeskeygenassist xmm2, xmm4, 0x8 ; Generating round key 8 key_expansion_256_avx vmovdqa [EXP_ENC_KEYS + 16*8], xmm1 vaesimc xmm5, xmm1 vmovdqa [EXP_DEC_KEYS + 16*6], xmm5 vaeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 9 key_expansion_256_avx_2 vmovdqa [EXP_ENC_KEYS + 16*9], xmm4 vaesimc xmm0, xmm4 vmovdqa [EXP_DEC_KEYS + 16*5], xmm0 vaeskeygenassist xmm2, xmm4, 0x10 ; Generating round key 10 key_expansion_256_avx vmovdqa [EXP_ENC_KEYS + 16*10], xmm1 vaesimc xmm5, xmm1 vmovdqa [EXP_DEC_KEYS + 16*4], xmm5 vaeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 11 key_expansion_256_avx_2 vmovdqa [EXP_ENC_KEYS + 16*11], xmm4 vaesimc xmm0, xmm4 vmovdqa [EXP_DEC_KEYS + 16*3], xmm0 vaeskeygenassist xmm2, xmm4, 0x20 ; Generating round key 12 key_expansion_256_avx vmovdqa [EXP_ENC_KEYS + 16*12], xmm1 vaesimc xmm5, xmm1 vmovdqa [EXP_DEC_KEYS + 16*2], xmm5 vaeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 13 key_expansion_256_avx_2 vmovdqa [EXP_ENC_KEYS + 16*13], xmm4 vaesimc xmm0, xmm4 vmovdqa [EXP_DEC_KEYS + 16*1], xmm0 vaeskeygenassist xmm2, xmm4, 0x40 ; Generating round key 14 key_expansion_256_avx vmovdqa [EXP_ENC_KEYS + 16*14], xmm1 vmovdqa [EXP_DEC_KEYS + 16*0], xmm1 ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; void aes_keyexp_256_enc_sse(UINT128 *key, ; UINT128 *enc_exp_keys); ; ; arg 1: rcx: pointer to key ; arg 2: rdx: pointer to expanded key array for encrypt ; MKGLOBAL(aes_keyexp_256_enc_sse,function,) aes_keyexp_256_enc_sse: movdqu xmm1, [KEY] ; loading the AES key movdqa [EXP_ENC_KEYS + 16*0], xmm1 movdqu xmm4, [KEY+16] ; loading the AES key movdqa [EXP_ENC_KEYS + 16*1], xmm4 pxor xmm3, xmm3 ; Required for the key_expansion. 
aeskeygenassist xmm2, xmm4, 0x1 ; Generating round key 2 key_expansion_256_sse movdqa [EXP_ENC_KEYS + 16*2], xmm1 aeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 3 key_expansion_256_sse_2 movdqa [EXP_ENC_KEYS + 16*3], xmm4 aeskeygenassist xmm2, xmm4, 0x2 ; Generating round key 4 key_expansion_256_sse movdqa [EXP_ENC_KEYS + 16*4], xmm1 aeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 5 key_expansion_256_sse_2 movdqa [EXP_ENC_KEYS + 16*5], xmm4 aeskeygenassist xmm2, xmm4, 0x4 ; Generating round key 6 key_expansion_256_sse movdqa [EXP_ENC_KEYS + 16*6], xmm1 aeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 7 key_expansion_256_sse_2 movdqa [EXP_ENC_KEYS + 16*7], xmm4 aeskeygenassist xmm2, xmm4, 0x8 ; Generating round key 8 key_expansion_256_sse movdqa [EXP_ENC_KEYS + 16*8], xmm1 aeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 9 key_expansion_256_sse_2 movdqa [EXP_ENC_KEYS + 16*9], xmm4 aeskeygenassist xmm2, xmm4, 0x10 ; Generating round key 10 key_expansion_256_sse movdqa [EXP_ENC_KEYS + 16*10], xmm1 aeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 11 key_expansion_256_sse_2 movdqa [EXP_ENC_KEYS + 16*11], xmm4 aeskeygenassist xmm2, xmm4, 0x20 ; Generating round key 12 key_expansion_256_sse movdqa [EXP_ENC_KEYS + 16*12], xmm1 aeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 13 key_expansion_256_sse_2 movdqa [EXP_ENC_KEYS + 16*13], xmm4 aeskeygenassist xmm2, xmm4, 0x40 ; Generating round key 14 key_expansion_256_sse movdqa [EXP_ENC_KEYS + 16*14], xmm1 ret MKGLOBAL(aes_keyexp_256_enc_avx,function,) aes_keyexp_256_enc_avx: vmovdqu xmm1, [KEY] ; loading the AES key vmovdqa [EXP_ENC_KEYS + 16*0], xmm1 vmovdqu xmm4, [KEY+16] ; loading the AES key vmovdqa [EXP_ENC_KEYS + 16*1], xmm4 vpxor xmm3, xmm3, xmm3 ; Required for the key_expansion. 
vaeskeygenassist xmm2, xmm4, 0x1 ; Generating round key 2 key_expansion_256_avx vmovdqa [EXP_ENC_KEYS + 16*2], xmm1 vaeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 3 key_expansion_256_avx_2 vmovdqa [EXP_ENC_KEYS + 16*3], xmm4 vaeskeygenassist xmm2, xmm4, 0x2 ; Generating round key 4 key_expansion_256_avx vmovdqa [EXP_ENC_KEYS + 16*4], xmm1 vaeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 5 key_expansion_256_avx_2 vmovdqa [EXP_ENC_KEYS + 16*5], xmm4 vaeskeygenassist xmm2, xmm4, 0x4 ; Generating round key 6 key_expansion_256_avx vmovdqa [EXP_ENC_KEYS + 16*6], xmm1 vaeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 7 key_expansion_256_avx_2 vmovdqa [EXP_ENC_KEYS + 16*7], xmm4 vaeskeygenassist xmm2, xmm4, 0x8 ; Generating round key 8 key_expansion_256_avx vmovdqa [EXP_ENC_KEYS + 16*8], xmm1 vaeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 9 key_expansion_256_avx_2 vmovdqa [EXP_ENC_KEYS + 16*9], xmm4 vaeskeygenassist xmm2, xmm4, 0x10 ; Generating round key 10 key_expansion_256_avx vmovdqa [EXP_ENC_KEYS + 16*10], xmm1 vaeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 11 key_expansion_256_avx_2 vmovdqa [EXP_ENC_KEYS + 16*11], xmm4 vaeskeygenassist xmm2, xmm4, 0x20 ; Generating round key 12 key_expansion_256_avx vmovdqa [EXP_ENC_KEYS + 16*12], xmm1 vaeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 13 key_expansion_256_avx_2 vmovdqa [EXP_ENC_KEYS + 16*13], xmm4 vaeskeygenassist xmm2, xmm4, 0x40 ; Generating round key 14 key_expansion_256_avx vmovdqa [EXP_ENC_KEYS + 16*14], xmm1 ret intel-ipsec-mb-0.48/aes_xcbc_expand_key.c000066400000000000000000000056271321406316400204110ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2012-2017, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #include #include "types.h" #include "aux_funcs.h" #include "os.h" void aes_keyexp_128_enc_sse(const void *key, void *enc_exp_keys); void aes_keyexp_128_enc_avx(const void *key, void *enc_exp_keys); void aes128_ecbenc_x3_sse(const void *in, void *keys, void *out1, void *out2, void *out3); void aes128_ecbenc_x3_avx(const void *in, void *keys, void *out1, void *out2, void *out3); static UINT32 in[4*3] = { 0x01010101, 0x01010101, 0x01010101, 0x01010101, 0x02020202, 0x02020202, 0x02020202, 0x02020202, 0x03030303, 0x03030303, 0x03030303, 0x03030303 }; void aes_xcbc_expand_key_sse(const void *key, void *k1_exp, void *k2, void *k3) { DECLARE_ALIGNED(UINT32 keys_exp_enc[11*4], 16); aes_keyexp_128_enc_sse(key, keys_exp_enc); aes128_ecbenc_x3_sse(in, keys_exp_enc, k1_exp, k2, k3); aes_keyexp_128_enc_sse(k1_exp, k1_exp); } void aes_xcbc_expand_key_avx(const void *key, void *k1_exp, void *k2, void *k3) { DECLARE_ALIGNED(UINT32 keys_exp_enc[11*4], 16); aes_keyexp_128_enc_avx(key, keys_exp_enc); aes128_ecbenc_x3_avx(in, keys_exp_enc, k1_exp, k2, k3); aes_keyexp_128_enc_avx(k1_exp, k1_exp); } intel-ipsec-mb-0.48/asm.h000066400000000000000000000115231321406316400152100ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2012-2017, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
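A caller-side sketch may help here: the three outputs of the expansion wrapper just shown map onto the K1/K2/K3 values of AES-XCBC-MAC-96 (RFC 3566). Buffer sizes are inferred from the code above (K1 is re-expanded in place into an 11-round-key schedule, K2 and K3 stay single blocks); the 16-byte alignment and the use of DECLARE_ALIGNED/UINT32 from the library headers are assumptions carried over from the file itself. This is an illustration, not library code.

#include "types.h"
#include "aux_funcs.h"
#include "os.h"

static void xcbc_derive_keys_example(const void *key /* 16-byte XCBC key */)
{
        /* K1 expanded schedule: 11 round keys x 16 bytes */
        DECLARE_ALIGNED(UINT32 k1_exp[11 * 4], 16);
        /* K2 and K3 remain single 16-byte blocks */
        DECLARE_ALIGNED(UINT32 k2[4], 16);
        DECLARE_ALIGNED(UINT32 k3[4], 16);

        aes_xcbc_expand_key_sse(key, k1_exp, k2, k3);
        /* k1_exp/k2/k3 are the key material an AES-XCBC job consumes;
         * use the _avx variant on AVX-capable parts. */
        (void)k1_exp; (void)k2; (void)k3;
}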
*******************************************************************************/ /* interface to asm routines */ #ifndef _ASM_H #define _ASM_H #include "asm_types.h" /* Define interface to AES base asm code */ #define AES_BLOCK_SIZE 16 void aes_cbc_enc_128_x8(AES_ARGS_x8 *args, UINT64 len_in_bytes); void aes_cbc_enc_192_x8(AES_ARGS_x8 *args, UINT64 len_in_bytes); void aes_cbc_enc_256_x8(AES_ARGS_x8 *args, UINT64 len_in_bytes); void aes_cbc_dec_128_avx(const void *in, const UINT8 *IV, const void *keys, void *out, UINT64 len_bytes); void aes_cbc_dec_192_avx(const void *in, const UINT8 *IV, const void *keys, void *out, UINT64 len_bytes); void aes_cbc_dec_256_avx(const void *in, const UINT8 *IV, const void *keys, void *out, UINT64 len_bytes); void aes_cbc_dec_128_sse(const void *in, const UINT8 *IV, const void *keys, void *out, UINT64 len_bytes); void aes_cbc_dec_192_sse(const void *in, const UINT8 *IV, const void *keys, void *out, UINT64 len_bytes); void aes_cbc_dec_256_sse(const void *in, const UINT8 *IV, const void *keys, void *out, UINT64 len_bytes); void aes_cntr_256_sse(const void *in, const void *IV, const void *keys, void *out, UINT64 len_bytes, UINT64 IV_len); void aes_cntr_192_sse(const void *in, const void *IV, const void *keys, void *out, UINT64 len_bytes, UINT64 IV_len); void aes_cntr_128_sse(const void *in, const void *IV, const void *keys, void *out, UINT64 len_bytes, UINT64 IV_len); void aes_cntr_256_avx(const void *in, const void *IV, const void *keys, void *out, UINT64 len_bytes, UINT64 IV_len); void aes_cntr_192_avx(const void *in, const void *IV, const void *keys, void *out, UINT64 len_bytes, UINT64 IV_len); void aes_cntr_128_avx(const void *in, const void *IV, const void *keys, void *out, UINT64 len_bytes, UINT64 IV_len); #endif /* _ASM_H */ intel-ipsec-mb-0.48/asm_types.h000066400000000000000000000106141321406316400164340ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2012-2017, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
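The declarations above are the raw single-buffer entry points. A minimal sketch of driving the 128-bit CBC decryptor directly, outside the multi-buffer job API, could look like the code below; the "whole number of blocks" requirement is an assumption based on the block-only dispatch in the decrypt assembly, and the header set mirrors the library's own C sources.

#include "types.h"
#include "asm.h"        /* aes_cbc_dec_128_avx() */
#include "aux_funcs.h"  /* aes_keyexp_128_avx() */
#include "os.h"

static void cbc128_decrypt(const void *key, const UINT8 *iv,
                           const void *in, void *out, UINT64 len_bytes)
{
        DECLARE_ALIGNED(UINT32 enc_keys[11 * 4], 16);
        DECLARE_ALIGNED(UINT32 dec_keys[11 * 4], 16);

        /* len_bytes is assumed to be a whole number of AES_BLOCK_SIZE
         * blocks; the "by8" decryptor only dispatches on full blocks. */
        aes_keyexp_128_avx(key, enc_keys, dec_keys);
        aes_cbc_dec_128_avx(in, iv, dec_keys, out, len_bytes);
}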
*******************************************************************************/ #ifndef IMB_ASM_TYPES_H #define IMB_ASM_TYPES_H #include "os.h" #include "constants.h" typedef struct { const uint8_t *in[8]; uint8_t *out[8]; const uint32_t *keys[8]; DECLARE_ALIGNED(UINT128 IV[8], 32); } AES_ARGS_x8; /* AVX512 with its larger register sizes (vs AVX2) supports more parallelism * It will be used to size objects */ #define AVX512_NUM_SHA1_LANES 16 #define AVX512_NUM_SHA256_LANES 16 #define AVX512_NUM_SHA512_LANES 8 #define AVX512_NUM_MD5_LANES 32 #define AVX512_NUM_DES_LANES 16 /* AVX2 with its larger register sizes (vs SSE) supports more parallelism */ #define AVX2_NUM_SHA1_LANES 8 #define AVX2_NUM_SHA256_LANES 8 #define AVX2_NUM_SHA512_LANES 4 #define AVX2_NUM_MD5_LANES 16 #define AVX_NUM_SHA1_LANES 4 #define AVX_NUM_SHA256_LANES 4 #define AVX_NUM_SHA512_LANES 2 #define AVX_NUM_MD5_LANES 8 #define SSE_NUM_SHA1_LANES AVX_NUM_SHA1_LANES #define SSE_NUM_SHA256_LANES AVX_NUM_SHA256_LANES #define SSE_NUM_SHA512_LANES AVX_NUM_SHA512_LANES #define SSE_NUM_MD5_LANES AVX_NUM_MD5_LANES /* Each row is sized to hold enough lanes for AVX2, AVX1 and SSE use a subset * of each row. Thus one row is not adjacent in memory to its neighboring rows * in the case of SSE and AVX1. */ #define MD5_DIGEST_SZ (NUM_MD5_DIGEST_WORDS * AVX512_NUM_MD5_LANES) #define SHA1_DIGEST_SZ (NUM_SHA_DIGEST_WORDS * AVX512_NUM_SHA1_LANES) #define SHA256_DIGEST_SZ (NUM_SHA_256_DIGEST_WORDS * AVX512_NUM_SHA256_LANES) #define SHA512_DIGEST_SZ (NUM_SHA_512_DIGEST_WORDS * AVX512_NUM_SHA512_LANES) typedef struct { DECLARE_ALIGNED(uint32_t digest[SHA1_DIGEST_SZ], 32); uint8_t *data_ptr[AVX512_NUM_SHA1_LANES]; } SHA1_ARGS; typedef struct { DECLARE_ALIGNED(uint32_t digest[SHA256_DIGEST_SZ], 32); uint8_t *data_ptr[AVX512_NUM_SHA256_LANES]; } SHA256_ARGS; typedef struct { DECLARE_ALIGNED(uint64_t digest[SHA512_DIGEST_SZ], 32); uint8_t *data_ptr[AVX512_NUM_SHA512_LANES]; } SHA512_ARGS; typedef struct { DECLARE_ALIGNED(uint32_t digest[MD5_DIGEST_SZ], 32); uint8_t *data_ptr[AVX512_NUM_MD5_LANES]; } MD5_ARGS; typedef struct { const uint8_t *in[8]; const uint32_t *keys[8]; DECLARE_ALIGNED(UINT128 ICV[8], 32); } AES_XCBC_ARGS_x8; typedef struct { const uint8_t *in[AVX512_NUM_DES_LANES]; uint8_t *out[AVX512_NUM_DES_LANES]; const uint8_t *keys[AVX512_NUM_DES_LANES]; uint32_t IV[AVX512_NUM_DES_LANES * 2]; /* uint32_t is more handy here */ uint32_t partial_len[AVX512_NUM_DES_LANES]; uint32_t block_len[AVX512_NUM_DES_LANES]; const uint8_t *last_in[AVX512_NUM_DES_LANES]; uint8_t *last_out[AVX512_NUM_DES_LANES]; } DES_ARGS_x16; #endif /* ifdef IMB_ASM_TYPES_H */ intel-ipsec-mb-0.48/aux_funcs.h000066400000000000000000000127171321406316400164310ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2012-2017, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
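AES_ARGS_x8 above is the argument block consumed by the 8-lane CBC encrypt kernels declared in asm.h. In the library it is populated by the multi-buffer scheduler, so the following is only a shape illustration under that caveat: every lane must hold valid pointers and all lanes advance by the same byte count.

#include "types.h"
#include "asm.h"        /* AES_ARGS_x8, aes_cbc_enc_128_x8() */
#include "os.h"

static void cbc128_encrypt_x8_sketch(const UINT8 *in[8], UINT8 *out[8],
                                     const UINT32 *keys[8],
                                     const UINT128 iv[8], UINT64 len_bytes)
{
        AES_ARGS_x8 args;
        int lane;

        for (lane = 0; lane < 8; lane++) {
                args.in[lane]   = in[lane];     /* source buffer per lane    */
                args.out[lane]  = out[lane];    /* destination per lane      */
                args.keys[lane] = keys[lane];   /* expanded encrypt schedule */
                args.IV[lane]   = iv[lane];     /* per-lane CBC IV           */
        }
        /* len_bytes applies to every lane and is assumed to be a multiple
         * of 16; idle lanes still need valid pointers. */
        aes_cbc_enc_128_x8(&args, len_bytes);
}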
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #ifndef _AUX_FUNCS_H_ #define _AUX_FUNCS_H_ /* SSE */ void sha1_one_block_sse(const void *data, void *digest); void sha224_one_block_sse(const void *data, void *digest); void sha256_one_block_sse(const void *data, void *digest); void sha384_one_block_sse(const void *data, void *digest); void sha512_one_block_sse(const void *data, void *digest); void md5_one_block_sse(const void *data, void *digest); void aes_keyexp_128_sse(const void *key, void *enc_exp_keys, void *dec_exp_keys); void aes_keyexp_192_sse(const void *key, void *enc_exp_keys, void *dec_exp_keys); void aes_keyexp_256_sse(const void *key, void *enc_exp_keys, void *dec_exp_keys); void aes_xcbc_expand_key_sse(const void *key, void *k1_exp, void *k2, void *k3); void aes_keyexp_128_enc_sse(const void *key, void *enc_exp_keys); void aes_keyexp_192_enc_sse(const void *key, void *enc_exp_keys); void aes_keyexp_256_enc_sse(const void *key, void *enc_exp_keys); void aes_cfb_128_one_sse(void *out, const void *in, const void *iv, const void *keys, uint64_t len); /* AVX */ void sha1_one_block_avx(const void *data, void *digest); void sha224_one_block_avx(const void *data, void *digest); void sha256_one_block_avx(const void *data, void *digest); void sha384_one_block_avx(const void *data, void *digest); void sha512_one_block_avx(const void *data, void *digest); #define md5_one_block_avx md5_one_block_sse void aes_keyexp_128_avx(const void *key, void *enc_exp_keys, void *dec_exp_keys); void aes_keyexp_192_avx(const void *key, void *enc_exp_keys, void *dec_exp_keys); void aes_keyexp_256_avx(const void *key, void *enc_exp_keys, void *dec_exp_keys); void aes_xcbc_expand_key_avx(const void *key, void *k1_exp, void *k2, void *k3); void aes_keyexp_128_enc_avx(const void *key, void *enc_exp_keys); void aes_keyexp_192_enc_avx(const void *key, void *enc_exp_keys); void aes_keyexp_256_enc_avx(const void *key, void *enc_exp_keys); void aes_cfb_128_one_avx(void *out, const void *in, const void *iv, const void *keys, uint64_t len); /* AVX2 */ #define sha1_one_block_avx2 sha1_one_block_avx #define sha224_one_block_avx2 sha224_one_block_avx #define sha256_one_block_avx2 sha256_one_block_avx #define sha384_one_block_avx2 sha384_one_block_avx #define sha512_one_block_avx2 sha512_one_block_avx #define md5_one_block_avx2 md5_one_block_avx #define aes_keyexp_128_avx2 aes_keyexp_128_avx #define aes_keyexp_192_avx2 aes_keyexp_192_avx #define aes_keyexp_256_avx2 aes_keyexp_256_avx #define aes_xcbc_expand_key_avx2 aes_xcbc_expand_key_avx #define aes_keyexp_128_enc_avx2 aes_keyexp_128_enc_avx #define aes_keyexp_192_enc_avx2 aes_keyexp_192_enc_avx #define aes_keyexp_256_enc_avx2 aes_keyexp_256_enc_avx #define aes_cfb_128_one_avx2 
aes_cfb_128_one_avx /* AVX512 */ #define sha1_one_block_avx512 sha1_one_block_avx2 #define sha224_one_block_avx512 sha224_one_block_avx2 #define sha256_one_block_avx512 sha256_one_block_avx2 #define sha384_one_block_avx512 sha384_one_block_avx2 #define sha512_one_block_avx512 sha512_one_block_avx2 #define md5_one_block_avx512 md5_one_block_avx2 #define aes_keyexp_128_avx512 aes_keyexp_128_avx2 #define aes_keyexp_192_avx512 aes_keyexp_192_avx2 #define aes_keyexp_256_avx512 aes_keyexp_256_avx2 #define aes_xcbc_expand_key_avx512 aes_xcbc_expand_key_avx2 #define aes_keyexp_128_enc_avx512 aes_keyexp_128_enc_avx2 #define aes_keyexp_192_enc_avx512 aes_keyexp_192_enc_avx2 #define aes_keyexp_256_enc_avx512 aes_keyexp_256_enc_avx2 #define aes_cfb_128_one_avx512 aes_cfb_128_one_avx2 #endif /* !_AUX_FUNCS_H_ */ intel-ipsec-mb-0.48/avx/000077500000000000000000000000001321406316400150535ustar00rootroot00000000000000intel-ipsec-mb-0.48/avx/aes128_cbc_dec_by8_avx.asm000066400000000000000000000140421321406316400216430ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
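The one-block hash helpers declared above are typically used to pre-hash the HMAC ipad/opad blocks; the sketch below assumes that usage and a key no longer than one block. Note that the output of sha1_one_block_* is the intermediate SHA-1 state (five 32-bit words) after a single compression, not a finalized digest.

#include <string.h>
#include "types.h"
#include "aux_funcs.h"
#include "os.h"

static void hmac_sha1_ipad_opad(const UINT8 *key, UINT64 key_len,
                                UINT32 ipad_state[5], UINT32 opad_state[5])
{
        UINT8 block[64];
        UINT64 i;

        /* Assumes key_len <= 64; a longer key would first be reduced with
         * a full SHA-1 pass, which is outside this sketch. */
        memset(block, 0x36, sizeof(block));
        for (i = 0; i < key_len; i++)
                block[i] = key[i] ^ 0x36;       /* key byte XOR ipad */
        sha1_one_block_sse(block, ipad_state);

        memset(block, 0x5c, sizeof(block));
        for (i = 0; i < key_len; i++)
                block[i] = key[i] ^ 0x5c;       /* key byte XOR opad */
        sha1_one_block_sse(block, opad_state);
}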
;; ; routine to do AES128 CBC decrypt "by8" ;; clobbers xmm0-15 %include "os.asm" %define CONCAT(a,b) a %+ b %define VMOVDQ vmovdqu %define xdata0 xmm0 %define xdata1 xmm1 %define xdata2 xmm2 %define xdata3 xmm3 %define xdata4 xmm4 %define xdata5 xmm5 %define xdata6 xmm6 %define xdata7 xmm7 %define xIV xmm8 %define xkey0 xmm9 %define xkey2 xmm10 %define xkey4 xmm11 %define xkey6 xmm12 %define xkey8 xmm13 %define xkey10 xmm14 %define xkeytmp xmm15 %ifdef LINUX %define p_in rdi %define p_IV rsi %define p_keys rdx %define p_out rcx %define num_bytes r8 %else %define p_in rcx %define p_IV rdx %define p_keys r8 %define p_out r9 %define num_bytes rax %endif %define tmp r10 %macro do_aes_load 1 do_aes %1, 1 %endmacro %macro do_aes_noload 1 do_aes %1, 0 %endmacro ; do_aes num_in_par load_keys ; This increments p_in, but not p_out %macro do_aes 2 %define %%by %1 %define %%load_keys %2 %if (%%load_keys) vmovdqa xkey0, [p_keys + 0*16] %endif %assign i 0 %rep %%by VMOVDQ CONCAT(xdata,i), [p_in + i*16] %assign i (i+1) %endrep %if (%%load_keys) vmovdqa xkey2, [p_keys + 2*16] %endif %assign i 0 %rep %%by vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkey0 %assign i (i+1) %endrep add p_in, 16*%%by vmovdqa xkeytmp, [p_keys + 1*16] %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeytmp %assign i (i+1) %endrep %if (%%load_keys) vmovdqa xkey4, [p_keys + 4*16] %endif %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey2 %assign i (i+1) %endrep vmovdqa xkeytmp, [p_keys + 3*16] %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeytmp %assign i (i+1) %endrep %if (%%load_keys) vmovdqa xkey6, [p_keys + 6*16] %endif %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey4 %assign i (i+1) %endrep vmovdqa xkeytmp, [p_keys + 5*16] %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeytmp %assign i (i+1) %endrep %if (%%load_keys) vmovdqa xkey8, [p_keys + 8*16] %endif %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey6 %assign i (i+1) %endrep vmovdqa xkeytmp, [p_keys + 7*16] %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeytmp %assign i (i+1) %endrep %if (%%load_keys) vmovdqa xkey10, [p_keys + 10*16] %endif %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey8 %assign i (i+1) %endrep vmovdqa xkeytmp, [p_keys + 9*16] %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeytmp %assign i (i+1) %endrep %assign i 0 %rep %%by vaesdeclast CONCAT(xdata,i), CONCAT(xdata,i), xkey10 %assign i (i+1) %endrep vpxor xdata0, xdata0, xIV %assign i 1 %if (%%by > 1) %rep (%%by - 1) VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by] vpxor CONCAT(xdata,i), CONCAT(xdata,i), xIV %assign i (i+1) %endrep %endif VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by] %assign i 0 %rep %%by VMOVDQ [p_out + i*16], CONCAT(xdata,i) %assign i (i+1) %endrep %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; section .text ;; aes_cbc_dec_128_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes) MKGLOBAL(aes_cbc_dec_128_avx,function,internal) aes_cbc_dec_128_avx: %ifndef LINUX mov num_bytes, [rsp + 8*5] %endif vmovdqu xIV, [p_IV] mov tmp, num_bytes and tmp, 7*16 jz mult_of_8_blks ; 1 <= tmp <= 7 cmp tmp, 4*16 jg gt4 je eq4 lt4: cmp tmp, 2*16 jg eq3 je eq2 eq1: do_aes_load 1 add p_out, 1*16 and num_bytes, ~7*16 jz do_return2 jmp main_loop2 eq2: do_aes_load 2 add 
p_out, 2*16 and num_bytes, ~7*16 jz do_return2 jmp main_loop2 eq3: do_aes_load 3 add p_out, 3*16 and num_bytes, ~7*16 jz do_return2 jmp main_loop2 eq4: do_aes_load 4 add p_out, 4*16 and num_bytes, ~7*16 jz do_return2 jmp main_loop2 gt4: cmp tmp, 6*16 jg eq7 je eq6 eq5: do_aes_load 5 add p_out, 5*16 and num_bytes, ~7*16 jz do_return2 jmp main_loop2 eq6: do_aes_load 6 add p_out, 6*16 and num_bytes, ~7*16 jz do_return2 jmp main_loop2 eq7: do_aes_load 7 add p_out, 7*16 and num_bytes, ~7*16 jz do_return2 jmp main_loop2 mult_of_8_blks: vmovdqa xkey0, [p_keys + 0*16] vmovdqa xkey2, [p_keys + 2*16] vmovdqa xkey4, [p_keys + 4*16] vmovdqa xkey6, [p_keys + 6*16] vmovdqa xkey8, [p_keys + 8*16] vmovdqa xkey10, [p_keys + 10*16] main_loop2: ; num_bytes is a multiple of 8 and >0 do_aes_noload 8 add p_out, 8*16 sub num_bytes, 8*16 jne main_loop2 do_return2: ; Don't write back IV ; vmovdqu [p_IV], xIV ret intel-ipsec-mb-0.48/avx/aes128_cbc_mac_x8.asm000066400000000000000000000032521321406316400206300ustar00rootroot00000000000000;; ;; Copyright (c) 2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ;;; Routine to compute CBC-MAC. It is based on 128 bit CBC AES encrypt code. %define CBC_MAC 1 %include "aes_cbc_enc_128_x8.asm" intel-ipsec-mb-0.48/avx/aes128_cntr_by8_avx.asm000066400000000000000000000221101321406316400212420ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. 
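The decrypt routine that just ended follows a fixed "by8" dispatch pattern: the 1-7 leftover blocks are handled first with keys loaded on the fly, then the bulk runs eight blocks per iteration with the round keys held in registers. Restated in C it reads roughly as below; dec_n_blocks is a hypothetical stand-in for the do_aes_load/do_aes_noload macro expansions, introduced only for the sketch.

#include <stdint.h>

static void cbc_dec_by8_outline(uint64_t num_bytes)
{
        /* hypothetical helper standing in for do_aes_load/do_aes_noload */
        extern void dec_n_blocks(unsigned int n);
        uint64_t head = num_bytes & (7 * 16);   /* 0..7 leftover blocks */

        if (head != 0)
                dec_n_blocks((unsigned int)(head / 16)); /* keys loaded per call */

        num_bytes &= ~(uint64_t)(7 * 16);
        while (num_bytes != 0) {
                dec_n_blocks(8);        /* round keys stay resident in xmm */
                num_bytes -= 8 * 16;
        }
}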
;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "memcpy.asm" ; routine to do AES128 CNTR enc/decrypt "by8" ; XMM registers are clobbered. Saving/restoring must be done at a higher level section .data default rel MKGLOBAL(byteswap_const,data,internal) MKGLOBAL(ddq_add_1,data,internal) MKGLOBAL(ddq_add_2,data,internal) MKGLOBAL(ddq_add_3,data,internal) MKGLOBAL(ddq_add_4,data,internal) MKGLOBAL(ddq_add_5,data,internal) MKGLOBAL(ddq_add_6,data,internal) MKGLOBAL(ddq_add_7,data,internal) MKGLOBAL(ddq_add_8,data,internal) align 16 byteswap_const: ;DDQ 0x000102030405060708090A0B0C0D0E0F DQ 0x08090A0B0C0D0E0F, 0x0001020304050607 ddq_add_1: ;DDQ 0x00000000000000000000000000000001 DQ 0x0000000000000001, 0x0000000000000000 ddq_add_2: ;DDQ 0x00000000000000000000000000000002 DQ 0x0000000000000002, 0x0000000000000000 ddq_add_3: ;DDQ 0x00000000000000000000000000000003 DQ 0x0000000000000003, 0x0000000000000000 ddq_add_4: ;DDQ 0x00000000000000000000000000000004 DQ 0x0000000000000004, 0x0000000000000000 ddq_add_5: ;DDQ 0x00000000000000000000000000000005 DQ 0x0000000000000005, 0x0000000000000000 ddq_add_6: ;DDQ 0x00000000000000000000000000000006 DQ 0x0000000000000006, 0x0000000000000000 ddq_add_7: ;DDQ 0x00000000000000000000000000000007 DQ 0x0000000000000007, 0x0000000000000000 ddq_add_8: ;DDQ 0x00000000000000000000000000000008 DQ 0x0000000000000008, 0x0000000000000000 section .text %define CONCAT(a,b) a %+ b %define VMOVDQ vmovdqu %define xdata0 xmm0 %define xdata1 xmm1 %define xdata2 xmm2 %define xdata3 xmm3 %define xdata4 xmm4 %define xdata5 xmm5 %define xdata6 xmm6 %define xdata7 xmm7 %define xcounter xmm8 %define xbyteswap xmm9 %define xkey0 xmm10 %define xkey3 xmm11 %define xkey6 xmm12 %define xkey9 xmm13 %define xkeyA xmm14 %define xkeyB xmm15 %ifdef LINUX %define p_in rdi %define p_IV rsi %define p_keys rdx %define p_out rcx %define num_bytes r8 %define p_ivlen r9 %else %define p_in rcx %define p_IV rdx %define p_keys r8 %define p_out r9 %define num_bytes r10 %define p_ivlen qword [rsp + 8*6] %endif %define tmp r11 %define p_tmp rsp + _buffer %macro do_aes_load 1 do_aes %1, 1 %endmacro %macro do_aes_noload 1 do_aes %1, 0 %endmacro ; do_aes num_in_par load_keys ; This increments p_in, but not p_out %macro do_aes 2 %define %%by %1 %define %%load_keys %2 %if (%%load_keys) vmovdqa xkey0, [p_keys + 0*16] %endif vpshufb xdata0, xcounter, xbyteswap %assign i 1 %rep (%%by - 1) vpaddd CONCAT(xdata,i), xcounter, [rel CONCAT(ddq_add_,i)] vpshufb CONCAT(xdata,i), CONCAT(xdata,i), xbyteswap %assign i (i + 1) %endrep vmovdqa xkeyA, [p_keys + 1*16] vpxor xdata0, 
xkey0 vpaddd xcounter, xcounter, [rel CONCAT(ddq_add_,%%by)] %assign i 1 %rep (%%by - 1) vpxor CONCAT(xdata,i), xkey0 %assign i (i + 1) %endrep vmovdqa xkeyB, [p_keys + 2*16] %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 1 %assign i (i+1) %endrep %if (%%load_keys) vmovdqa xkey3, [p_keys + 3*16] %endif %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 2 %assign i (i+1) %endrep add p_in, 16*%%by vmovdqa xkeyB, [p_keys + 4*16] %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey3 ; key 3 %assign i (i+1) %endrep vmovdqa xkeyA, [p_keys + 5*16] %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 4 %assign i (i+1) %endrep %if (%%load_keys) vmovdqa xkey6, [p_keys + 6*16] %endif %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 5 %assign i (i+1) %endrep vmovdqa xkeyA, [p_keys + 7*16] %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey6 ; key 6 %assign i (i+1) %endrep vmovdqa xkeyB, [p_keys + 8*16] %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 7 %assign i (i+1) %endrep %if (%%load_keys) vmovdqa xkey9, [p_keys + 9*16] %endif %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 8 %assign i (i+1) %endrep vmovdqa xkeyB, [p_keys + 10*16] %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey9 ; key 9 %assign i (i+1) %endrep %assign i 0 %rep %%by vaesenclast CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 10 %assign i (i+1) %endrep %assign i 0 %rep (%%by / 2) %assign j (i+1) VMOVDQ xkeyA, [p_in + i*16 - 16*%%by] VMOVDQ xkeyB, [p_in + j*16 - 16*%%by] vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA vpxor CONCAT(xdata,j), CONCAT(xdata,j), xkeyB %assign i (i+2) %endrep %if (i < %%by) VMOVDQ xkeyA, [p_in + i*16 - 16*%%by] vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA %endif %assign i 0 %rep %%by VMOVDQ [p_out + i*16], CONCAT(xdata,i) %assign i (i+1) %endrep %endmacro struc STACK _buffer: resq 2 _rsp_save: resq 1 endstruc ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; aes_cntr_128_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes) align 32 MKGLOBAL(aes_cntr_128_avx,function,internal) aes_cntr_128_avx: %ifndef LINUX mov num_bytes, [rsp + 8*5] ; arg5 %endif vmovdqa xbyteswap, [rel byteswap_const] test p_ivlen, 16 jnz iv_is_16_bytes ; Read 12 bytes: Nonce + ESP IV. 
Then pad with block counter 0x00000001 mov DWORD(tmp), 0x01000000 vpinsrq xcounter, [p_IV], 0 vpinsrd xcounter, [p_IV + 8], 2 vpinsrd xcounter, DWORD(tmp), 3 bswap_iv: vpshufb xcounter, xbyteswap mov tmp, num_bytes and tmp, 7*16 jz chk ; x8 > or < 15 (not 7 lines) ; 1 <= tmp <= 7 cmp tmp, 4*16 jg gt4 je eq4 lt4: cmp tmp, 2*16 jg eq3 je eq2 eq1: do_aes_load 1 add p_out, 1*16 jmp chk eq2: do_aes_load 2 add p_out, 2*16 jmp chk eq3: do_aes_load 3 add p_out, 3*16 jmp chk eq4: do_aes_load 4 add p_out, 4*16 jmp chk gt4: cmp tmp, 6*16 jg eq7 je eq6 eq5: do_aes_load 5 add p_out, 5*16 jmp chk eq6: do_aes_load 6 add p_out, 6*16 jmp chk eq7: do_aes_load 7 add p_out, 7*16 ; fall through to chk chk: and num_bytes, ~(7*16) jz do_return2 cmp num_bytes, 16 jb last ; process multiples of 8 blocks vmovdqa xkey0, [p_keys + 0*16] vmovdqa xkey3, [p_keys + 3*16] vmovdqa xkey6, [p_keys + 6*16] vmovdqa xkey9, [p_keys + 9*16] jmp main_loop2 align 32 main_loop2: ; num_bytes is a multiple of 8 blocks + partial bytes do_aes_noload 8 add p_out, 8*16 sub num_bytes, 8*16 cmp num_bytes, 8*16 jae main_loop2 test num_bytes, 15 ; partial bytes to be processed? jnz last do_return2: ; don't return updated IV ; vpshufb xcounter, xcounter, xbyteswap ; vmovdqu [p_IV], xcounter ret last: ;; Code dealing with the partial block cases ; reserve 16 byte aligned buffer on stack mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _rsp_save], rax ; save SP ; copy input bytes into scratch buffer memcpy_avx_16_1 p_tmp, p_in, num_bytes, tmp, rax ; Encryption of a single partial block (p_tmp) vpshufb xcounter, xbyteswap vmovdqa xdata0, xcounter vpxor xdata0, [p_keys + 16*0] %assign i 1 %rep 9 vaesenc xdata0, [p_keys + 16*i] %assign i (i+1) %endrep ; created keystream vaesenclast xdata0, [p_keys + 16*i] ; xor keystream with the message (scratch) vpxor xdata0, [p_tmp] vmovdqa [p_tmp], xdata0 ; copy result into the output buffer memcpy_avx_16_1 p_out, p_tmp, num_bytes, tmp, rax ; remove the stack frame mov rsp, [rsp + _rsp_save] ; original SP jmp do_return2 iv_is_16_bytes: ; Read 16 byte IV: Nonce + ESP IV + block counter (BE) vmovdqu xcounter, [p_IV] jmp bswap_iv intel-ipsec-mb-0.48/avx/aes192_cbc_dec_by8_avx.asm000066400000000000000000000144321321406316400216470ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. 
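A caller-side sketch for the counter-mode routine that just ended: iv_len is expected to be 12 (nonce + ESP IV, with the 0x00000001 block counter appended as the code above does) or 16 (caller supplies the full counter block), the byte length may include a partial final block, and only the encrypt key schedule is needed because CTR encryption and decryption are the same operation. The header set is assumed from the library's own sources.

#include "types.h"
#include "asm.h"        /* aes_cntr_128_avx() */
#include "aux_funcs.h"  /* aes_keyexp_128_enc_avx() */
#include "os.h"

static void ctr128_crypt(const void *key, const void *iv, UINT64 iv_len,
                         const void *in, void *out, UINT64 len_bytes)
{
        DECLARE_ALIGNED(UINT32 enc_keys[11 * 4], 16);

        /* iv_len is assumed to be 12 or 16, matching the p_ivlen test
         * in the asm above. */
        aes_keyexp_128_enc_avx(key, enc_keys);
        aes_cntr_128_avx(in, iv, enc_keys, out, len_bytes, iv_len);
}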
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ; routine to do AES192 CBC decrypt "by8" ; XMM registers are clobbered. Saving/restoring must be done at a higher level %include "os.asm" %define CONCAT(a,b) a %+ b %define VMOVDQ vmovdqu %define xdata0 xmm0 %define xdata1 xmm1 %define xdata2 xmm2 %define xdata3 xmm3 %define xdata4 xmm4 %define xdata5 xmm5 %define xdata6 xmm6 %define xdata7 xmm7 %define xIV xmm8 %define xkey0 xmm9 %define xkey3 xmm10 %define xkey6 xmm11 %define xkey9 xmm12 %define xkey12 xmm13 %define xkeyA xmm14 %define xkeyB xmm15 %ifdef LINUX %define p_in rdi %define p_IV rsi %define p_keys rdx %define p_out rcx %define num_bytes r8 %else %define p_in rcx %define p_IV rdx %define p_keys r8 %define p_out r9 %define num_bytes rax %endif %define tmp r10 %macro do_aes_load 1 do_aes %1, 1 %endmacro %macro do_aes_noload 1 do_aes %1, 0 %endmacro ; do_aes num_in_par load_keys ; This increments p_in, but not p_out %macro do_aes 2 %define %%by %1 %define %%load_keys %2 %if (%%load_keys) vmovdqa xkey0, [p_keys + 0*16] %endif %assign i 0 %rep %%by VMOVDQ CONCAT(xdata,i), [p_in + i*16] %assign i (i+1) %endrep vmovdqa xkeyA, [p_keys + 1*16] %assign i 0 %rep %%by vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkey0 %assign i (i+1) %endrep vmovdqa xkeyB, [p_keys + 2*16] %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA %assign i (i+1) %endrep add p_in, 16*%%by %if (%%load_keys) vmovdqa xkey3, [p_keys + 3*16] %endif %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB %assign i (i+1) %endrep vmovdqa xkeyA, [p_keys + 4*16] %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey3 %assign i (i+1) %endrep vmovdqa xkeyB, [p_keys + 5*16] %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA %assign i (i+1) %endrep %if (%%load_keys) vmovdqa xkey6, [p_keys + 6*16] %endif %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB %assign i (i+1) %endrep vmovdqa xkeyA, [p_keys + 7*16] %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey6 %assign i (i+1) %endrep vmovdqa xkeyB, [p_keys + 8*16] %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA %assign i (i+1) %endrep %if (%%load_keys) vmovdqa xkey9, [p_keys + 9*16] %endif %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB %assign i (i+1) %endrep vmovdqa xkeyA, [p_keys + 10*16] %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey9 %assign i (i+1) %endrep vmovdqa xkeyB, [p_keys + 11*16] %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA %assign i (i+1) %endrep %if (%%load_keys) vmovdqa xkey12, [p_keys + 12*16] %endif %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB %assign i (i+1) %endrep %assign i 0 %rep %%by vaesdeclast CONCAT(xdata,i), CONCAT(xdata,i), xkey12 %assign i (i+1) %endrep vpxor xdata0, xdata0, xIV %assign i 1 %if (%%by > 1) %rep (%%by - 1) VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by] vpxor CONCAT(xdata,i), CONCAT(xdata,i), xIV %assign i (i+1) %endrep %endif VMOVDQ xIV, [p_in + (i-1)*16 - 
16*%%by] %assign i 0 %rep %%by VMOVDQ [p_out + i*16], CONCAT(xdata,i) %assign i (i+1) %endrep %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; section .text ;; aes_cbc_dec_192_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes) MKGLOBAL(aes_cbc_dec_192_avx,function,internal) aes_cbc_dec_192_avx: %ifndef LINUX mov num_bytes, [rsp + 8*5] %endif vmovdqu xIV, [p_IV] mov tmp, num_bytes and tmp, 7*16 jz mult_of_8_blks ; 1 <= tmp <= 7 cmp tmp, 4*16 jg gt4 je eq4 lt4: cmp tmp, 2*16 jg eq3 je eq2 eq1: do_aes_load 1 add p_out, 1*16 and num_bytes, ~7*16 jz do_return2 jmp main_loop2 eq2: do_aes_load 2 add p_out, 2*16 and num_bytes, ~7*16 jz do_return2 jmp main_loop2 eq3: do_aes_load 3 add p_out, 3*16 and num_bytes, ~7*16 jz do_return2 jmp main_loop2 eq4: do_aes_load 4 add p_out, 4*16 and num_bytes, ~7*16 jz do_return2 jmp main_loop2 gt4: cmp tmp, 6*16 jg eq7 je eq6 eq5: do_aes_load 5 add p_out, 5*16 and num_bytes, ~7*16 jz do_return2 jmp main_loop2 eq6: do_aes_load 6 add p_out, 6*16 and num_bytes, ~7*16 jz do_return2 jmp main_loop2 eq7: do_aes_load 7 add p_out, 7*16 and num_bytes, ~7*16 jz do_return2 jmp main_loop2 mult_of_8_blks: vmovdqa xkey0, [p_keys + 0*16] vmovdqa xkey3, [p_keys + 3*16] vmovdqa xkey6, [p_keys + 6*16] vmovdqa xkey9, [p_keys + 9*16] vmovdqa xkey12, [p_keys + 12*16] main_loop2: ; num_bytes is a multiple of 8 and >0 do_aes_noload 8 add p_out, 8*16 sub num_bytes, 8*16 jne main_loop2 do_return2: ; Don't write back IV ; vmovdqu [p_IV], xIV ret intel-ipsec-mb-0.48/avx/aes192_cntr_by8_avx.asm000066400000000000000000000204561321406316400212560ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "memcpy.asm" ; routine to do AES192 CNTR enc/decrypt "by8" ; XMM registers are clobbered. 
Saving/restoring must be done at a higher level extern byteswap_const extern ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4 extern ddq_add_5, ddq_add_6, ddq_add_7, ddq_add_8 %define CONCAT(a,b) a %+ b %define VMOVDQ vmovdqu %define xdata0 xmm0 %define xdata1 xmm1 %define xdata2 xmm2 %define xdata3 xmm3 %define xdata4 xmm4 %define xdata5 xmm5 %define xdata6 xmm6 %define xdata7 xmm7 %define xcounter xmm8 %define xbyteswap xmm9 %define xkey0 xmm10 %define xkey4 xmm11 %define xkey8 xmm12 %define xkey12 xmm13 %define xkeyA xmm14 %define xkeyB xmm15 %ifdef LINUX %define p_in rdi %define p_IV rsi %define p_keys rdx %define p_out rcx %define num_bytes r8 %define p_ivlen r9 %else %define p_in rcx %define p_IV rdx %define p_keys r8 %define p_out r9 %define num_bytes r10 %define p_ivlen qword [rsp + 8*6] %endif %define tmp r11 %define p_tmp rsp + _buffer %macro do_aes_load 1 do_aes %1, 1 %endmacro %macro do_aes_noload 1 do_aes %1, 0 %endmacro ; do_aes num_in_par load_keys ; This increments p_in, but not p_out %macro do_aes 2 %define %%by %1 %define %%load_keys %2 %if (%%load_keys) vmovdqa xkey0, [p_keys + 0*16] %endif vpshufb xdata0, xcounter, xbyteswap %assign i 1 %rep (%%by - 1) vpaddd CONCAT(xdata,i), xcounter, [rel CONCAT(ddq_add_,i)] vpshufb CONCAT(xdata,i), CONCAT(xdata,i), xbyteswap %assign i (i + 1) %endrep vmovdqa xkeyA, [p_keys + 1*16] vpxor xdata0, xkey0 vpaddd xcounter, xcounter, [rel CONCAT(ddq_add_,%%by)] %assign i 1 %rep (%%by - 1) vpxor CONCAT(xdata,i), xkey0 %assign i (i + 1) %endrep vmovdqa xkeyB, [p_keys + 2*16] %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 1 %assign i (i+1) %endrep vmovdqa xkeyA, [p_keys + 3*16] %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 2 %assign i (i+1) %endrep add p_in, 16*%%by %if (%%load_keys) vmovdqa xkey4, [p_keys + 4*16] %endif %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 3 %assign i (i+1) %endrep vmovdqa xkeyA, [p_keys + 5*16] %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey4 ; key 4 %assign i (i+1) %endrep vmovdqa xkeyB, [p_keys + 6*16] %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 5 %assign i (i+1) %endrep vmovdqa xkeyA, [p_keys + 7*16] %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 6 %assign i (i+1) %endrep %if (%%load_keys) vmovdqa xkey8, [p_keys + 8*16] %endif %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 7 %assign i (i+1) %endrep vmovdqa xkeyA, [p_keys + 9*16] %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey8 ; key 8 %assign i (i+1) %endrep vmovdqa xkeyB, [p_keys + 10*16] %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 9 %assign i (i+1) %endrep vmovdqa xkeyA, [p_keys + 11*16] %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 10 %assign i (i+1) %endrep %if (%%load_keys) vmovdqa xkey12, [p_keys + 12*16] %endif %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 11 %assign i (i+1) %endrep %assign i 0 %rep %%by vaesenclast CONCAT(xdata,i), CONCAT(xdata,i), xkey12 ; key 12 %assign i (i+1) %endrep %assign i 0 %rep (%%by / 2) %assign j (i+1) VMOVDQ xkeyA, [p_in + i*16 - 16*%%by] VMOVDQ xkeyB, [p_in + j*16 - 16*%%by] vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA vpxor CONCAT(xdata,j), CONCAT(xdata,j), xkeyB %assign i (i+2) %endrep %if (i < %%by) VMOVDQ xkeyA, [p_in + i*16 - 16*%%by] vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA %endif %assign i 0 %rep %%by VMOVDQ 
[p_out + i*16], CONCAT(xdata,i) %assign i (i+1) %endrep %endmacro struc STACK _buffer: resq 2 _rsp_save: resq 1 endstruc ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; section .text ;; aes_cntr_192_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len) align 32 MKGLOBAL(aes_cntr_192_avx,function,internal) aes_cntr_192_avx: %ifndef LINUX mov num_bytes, [rsp + 8*5] %endif vmovdqa xbyteswap, [rel byteswap_const] test p_ivlen, 16 jnz iv_is_16_bytes ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001 mov DWORD(tmp), 0x01000000 vpinsrq xcounter, [p_IV], 0 vpinsrd xcounter, [p_IV + 8], 2 vpinsrd xcounter, DWORD(tmp), 3 bswap_iv: vpshufb xcounter, xbyteswap mov tmp, num_bytes and tmp, 7*16 jz chk ; x8 > or < 15 (not 7 lines) ; 1 <= tmp <= 7 cmp tmp, 4*16 jg gt4 je eq4 lt4: cmp tmp, 2*16 jg eq3 je eq2 eq1: do_aes_load 1 add p_out, 1*16 jmp chk eq2: do_aes_load 2 add p_out, 2*16 jmp chk eq3: do_aes_load 3 add p_out, 3*16 jmp chk eq4: do_aes_load 4 add p_out, 4*16 jmp chk gt4: cmp tmp, 6*16 jg eq7 je eq6 eq5: do_aes_load 5 add p_out, 5*16 jmp chk eq6: do_aes_load 6 add p_out, 6*16 jmp chk eq7: do_aes_load 7 add p_out, 7*16 ; fall through to chk chk: and num_bytes, ~(7*16) jz do_return2 cmp num_bytes, 16 jb last ; process multiples of 8 blocks vmovdqa xkey0, [p_keys + 0*16] vmovdqa xkey4, [p_keys + 4*16] vmovdqa xkey8, [p_keys + 8*16] vmovdqa xkey12, [p_keys + 12*16] jmp main_loop2 align 32 main_loop2: ; num_bytes is a multiple of 8 blocks + partial bytes do_aes_noload 8 add p_out, 8*16 sub num_bytes, 8*16 cmp num_bytes, 8*16 jae main_loop2 test num_bytes, 15 ; partial bytes to be processed? jnz last do_return2: ; don't return updated IV ; vpshufb xcounter, xcounter, xbyteswap ; vmovdqu [p_IV], xcounter ret last: ;; Code dealing with the partial block cases ; reserve 16 byte aligned buffer on stack mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _rsp_save], rax ; save SP ; copy input bytes into scratch buffer memcpy_avx_16_1 p_tmp, p_in, num_bytes, tmp, rax ; Encryption of a single partial block (p_tmp) vpshufb xcounter, xbyteswap vmovdqa xdata0, xcounter vpxor xdata0, [p_keys + 16*0] %assign i 1 %rep 11 vaesenc xdata0, [p_keys + 16*i] %assign i (i+1) %endrep ; created keystream vaesenclast xdata0, [p_keys + 16*i] ; xor keystream with the message (scratch) vpxor xdata0, [p_tmp] vmovdqa [p_tmp], xdata0 ; copy result into the output buffer memcpy_avx_16_1 p_out, p_tmp, num_bytes, tmp, rax ; remove the stack frame mov rsp, [rsp + _rsp_save] ; original SP jmp do_return2 iv_is_16_bytes: ; Read 16 byte IV: Nonce + ESP IV + block counter (BE) vmovdqu xcounter, [p_IV] jmp bswap_iv intel-ipsec-mb-0.48/avx/aes256_cbc_dec_by8_avx.asm000066400000000000000000000150361321406316400216510ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. 
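For reference, the initial counter block that the CTR routines assemble from a 12-byte IV follows the RFC 3686 layout; the sketch below is a byte-level restatement inferred from the vpinsrq/vpinsrd sequence above.

#include <stdint.h>
#include <string.h>

static void build_initial_ctr_block(const uint8_t nonce_and_iv[12],
                                    uint8_t block[16])
{
        memcpy(block, nonce_and_iv, 12); /* 4-byte nonce + 8-byte ESP IV */
        block[12] = 0x00;                /* 32-bit block counter,        */
        block[13] = 0x00;                /* big endian, starting at 1    */
        block[14] = 0x00;
        block[15] = 0x01;
}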
;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ; routine to do AES256 CBC decrypt "by8" ; XMM registers are clobbered. Saving/restoring must be done at a higher level %include "os.asm" %define CONCAT(a,b) a %+ b %define VMOVDQ vmovdqu %define xdata0 xmm0 %define xdata1 xmm1 %define xdata2 xmm2 %define xdata3 xmm3 %define xdata4 xmm4 %define xdata5 xmm5 %define xdata6 xmm6 %define xdata7 xmm7 %define xIV xmm8 %define xkey0 xmm9 %define xkey3 xmm10 %define xkey6 xmm11 %define xkey9 xmm12 %define xkey12 xmm13 %define xkeyA xmm14 %define xkeyB xmm15 %ifdef LINUX %define p_in rdi %define p_IV rsi %define p_keys rdx %define p_out rcx %define num_bytes r8 %else %define p_in rcx %define p_IV rdx %define p_keys r8 %define p_out r9 %define num_bytes rax %endif %define tmp r10 %macro do_aes_load 1 do_aes %1, 1 %endmacro %macro do_aes_noload 1 do_aes %1, 0 %endmacro ; do_aes num_in_par load_keys ; This increments p_in, but not p_out %macro do_aes 2 %define %%by %1 %define %%load_keys %2 %if (%%load_keys) vmovdqa xkey0, [p_keys + 0*16] %endif %assign i 0 %rep %%by VMOVDQ CONCAT(xdata,i), [p_in + i*16] %assign i (i+1) %endrep vmovdqa xkeyA, [p_keys + 1*16] %assign i 0 %rep %%by vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkey0 %assign i (i+1) %endrep vmovdqa xkeyB, [p_keys + 2*16] %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA %assign i (i+1) %endrep add p_in, 16*%%by %if (%%load_keys) vmovdqa xkey3, [p_keys + 3*16] %endif %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB %assign i (i+1) %endrep vmovdqa xkeyA, [p_keys + 4*16] %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey3 %assign i (i+1) %endrep vmovdqa xkeyB, [p_keys + 5*16] %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA %assign i (i+1) %endrep %if (%%load_keys) vmovdqa xkey6, [p_keys + 6*16] %endif %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB %assign i (i+1) %endrep vmovdqa xkeyA, [p_keys + 7*16] %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey6 %assign i (i+1) %endrep vmovdqa xkeyB, [p_keys + 8*16] %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA %assign i (i+1) %endrep %if (%%load_keys) vmovdqa xkey9, [p_keys + 9*16] %endif %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB %assign i (i+1) %endrep vmovdqa xkeyA, [p_keys + 10*16] %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey9 %assign i (i+1) %endrep vmovdqa xkeyB, [p_keys + 11*16] %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA %assign i (i+1) %endrep %if 
(%%load_keys) vmovdqa xkey12, [p_keys + 12*16] %endif %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB %assign i (i+1) %endrep vmovdqa xkeyA, [p_keys + 13*16] %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey12 %assign i (i+1) %endrep vmovdqa xkeyB, [p_keys + 14*16] %assign i 0 %rep %%by vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA %assign i (i+1) %endrep %assign i 0 %rep %%by vaesdeclast CONCAT(xdata,i), CONCAT(xdata,i), xkeyB %assign i (i+1) %endrep vpxor xdata0, xdata0, xIV %assign i 1 %if (%%by > 1) %rep (%%by - 1) VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by] vpxor CONCAT(xdata,i), CONCAT(xdata,i), xIV %assign i (i+1) %endrep %endif VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by] %assign i 0 %rep %%by VMOVDQ [p_out + i*16], CONCAT(xdata,i) %assign i (i+1) %endrep %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; section .text ;; aes_cbc_dec_256_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes) MKGLOBAL(aes_cbc_dec_256_avx,function,internal) aes_cbc_dec_256_avx: %ifndef LINUX mov num_bytes, [rsp + 8*5] %endif vmovdqu xIV, [p_IV] mov tmp, num_bytes and tmp, 7*16 jz mult_of_8_blks ; 1 <= tmp <= 7 cmp tmp, 4*16 jg gt4 je eq4 lt4: cmp tmp, 2*16 jg eq3 je eq2 eq1: do_aes_load 1 add p_out, 1*16 and num_bytes, ~7*16 jz do_return2 jmp main_loop2 eq2: do_aes_load 2 add p_out, 2*16 and num_bytes, ~7*16 jz do_return2 jmp main_loop2 eq3: do_aes_load 3 add p_out, 3*16 and num_bytes, ~7*16 jz do_return2 jmp main_loop2 eq4: do_aes_load 4 add p_out, 4*16 and num_bytes, ~7*16 jz do_return2 jmp main_loop2 gt4: cmp tmp, 6*16 jg eq7 je eq6 eq5: do_aes_load 5 add p_out, 5*16 and num_bytes, ~7*16 jz do_return2 jmp main_loop2 eq6: do_aes_load 6 add p_out, 6*16 and num_bytes, ~7*16 jz do_return2 jmp main_loop2 eq7: do_aes_load 7 add p_out, 7*16 and num_bytes, ~7*16 jz do_return2 jmp main_loop2 mult_of_8_blks: vmovdqa xkey0, [p_keys + 0*16] vmovdqa xkey3, [p_keys + 3*16] vmovdqa xkey6, [p_keys + 6*16] vmovdqa xkey9, [p_keys + 9*16] vmovdqa xkey12, [p_keys + 12*16] main_loop2: ; num_bytes is a multiple of 8 and >0 do_aes_noload 8 add p_out, 8*16 sub num_bytes, 8*16 jne main_loop2 do_return2: ; Don't write back IV ; vmovdqu [p_IV], xIV ret intel-ipsec-mb-0.48/avx/aes256_cntr_by8_avx.asm000066400000000000000000000210601321406316400212470ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "memcpy.asm" ; routine to do AES256 CNTR enc/decrypt "by8" ; XMM registers are clobbered. Saving/restoring must be done at a higher level extern byteswap_const extern ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4 extern ddq_add_5, ddq_add_6, ddq_add_7, ddq_add_8 %define CONCAT(a,b) a %+ b %define VMOVDQ vmovdqu %define xdata0 xmm0 %define xdata1 xmm1 %define xdata2 xmm2 %define xdata3 xmm3 %define xdata4 xmm4 %define xdata5 xmm5 %define xdata6 xmm6 %define xdata7 xmm7 %define xcounter xmm8 %define xbyteswap xmm9 %define xkey0 xmm10 %define xkey4 xmm11 %define xkey8 xmm12 %define xkey12 xmm13 %define xkeyA xmm14 %define xkeyB xmm15 %ifdef LINUX %define p_in rdi %define p_IV rsi %define p_keys rdx %define p_out rcx %define num_bytes r8 %define p_ivlen r9 %else %define p_in rcx %define p_IV rdx %define p_keys r8 %define p_out r9 %define num_bytes r10 %define p_ivlen qword [rsp + 8*6] %endif %define tmp r11 %define p_tmp rsp + _buffer %macro do_aes_load 1 do_aes %1, 1 %endmacro %macro do_aes_noload 1 do_aes %1, 0 %endmacro ; do_aes num_in_par load_keys ; This increments p_in, but not p_out %macro do_aes 2 %define %%by %1 %define %%load_keys %2 %if (%%load_keys) vmovdqa xkey0, [p_keys + 0*16] %endif vpshufb xdata0, xcounter, xbyteswap %assign i 1 %rep (%%by - 1) vpaddd CONCAT(xdata,i), xcounter, [rel CONCAT(ddq_add_,i)] vpshufb CONCAT(xdata,i), CONCAT(xdata,i), xbyteswap %assign i (i + 1) %endrep vmovdqa xkeyA, [p_keys + 1*16] vpxor xdata0, xkey0 vpaddd xcounter, xcounter, [rel CONCAT(ddq_add_,%%by)] %assign i 1 %rep (%%by - 1) vpxor CONCAT(xdata,i), xkey0 %assign i (i + 1) %endrep vmovdqa xkeyB, [p_keys + 2*16] %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 1 %assign i (i+1) %endrep vmovdqa xkeyA, [p_keys + 3*16] %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 2 %assign i (i+1) %endrep add p_in, 16*%%by %if (%%load_keys) vmovdqa xkey4, [p_keys + 4*16] %endif %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 3 %assign i (i+1) %endrep vmovdqa xkeyA, [p_keys + 5*16] %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey4 ; key 4 %assign i (i+1) %endrep vmovdqa xkeyB, [p_keys + 6*16] %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 5 %assign i (i+1) %endrep vmovdqa xkeyA, [p_keys + 7*16] %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 6 %assign i (i+1) %endrep %if (%%load_keys) vmovdqa xkey8, [p_keys + 8*16] %endif %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 7 %assign i (i+1) %endrep vmovdqa xkeyA, [p_keys + 9*16] %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey8 ; key 8 %assign i (i+1) %endrep vmovdqa xkeyB, [p_keys + 10*16] %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 9 %assign i (i+1) %endrep vmovdqa xkeyA, [p_keys + 11*16] %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 10 %assign i 
(i+1) %endrep %if (%%load_keys) vmovdqa xkey12, [p_keys + 12*16] %endif %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 11 %assign i (i+1) %endrep vmovdqa xkeyA, [p_keys + 13*16] %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey12 ; key 12 %assign i (i+1) %endrep vmovdqa xkeyB, [p_keys + 14*16] %assign i 0 %rep %%by vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 13 %assign i (i+1) %endrep %assign i 0 %rep %%by vaesenclast CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 14 %assign i (i+1) %endrep %assign i 0 %rep (%%by / 2) %assign j (i+1) VMOVDQ xkeyA, [p_in + i*16 - 16*%%by] VMOVDQ xkeyB, [p_in + j*16 - 16*%%by] vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA vpxor CONCAT(xdata,j), CONCAT(xdata,j), xkeyB %assign i (i+2) %endrep %if (i < %%by) VMOVDQ xkeyA, [p_in + i*16 - 16*%%by] vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA %endif %assign i 0 %rep %%by VMOVDQ [p_out + i*16], CONCAT(xdata,i) %assign i (i+1) %endrep %endmacro struc STACK _buffer: resq 2 _rsp_save: resq 1 endstruc ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; section .text ;; aes_cntr_256_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len) align 32 MKGLOBAL(aes_cntr_256_avx,function,internal) aes_cntr_256_avx: %ifndef LINUX mov num_bytes, [rsp + 8*5] %endif vmovdqa xbyteswap, [rel byteswap_const] test p_ivlen, 16 jnz iv_is_16_bytes ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001 mov DWORD(tmp), 0x01000000 vpinsrq xcounter, [p_IV], 0 vpinsrd xcounter, [p_IV + 8], 2 vpinsrd xcounter, DWORD(tmp), 3 bswap_iv: vpshufb xcounter, xbyteswap mov tmp, num_bytes and tmp, 7*16 jz chk ; x8 > or < 15 (not 7 lines) ; 1 <= tmp <= 7 cmp tmp, 4*16 jg gt4 je eq4 lt4: cmp tmp, 2*16 jg eq3 je eq2 eq1: do_aes_load 1 add p_out, 1*16 jmp chk eq2: do_aes_load 2 add p_out, 2*16 jmp chk eq3: do_aes_load 3 add p_out, 3*16 jmp chk eq4: do_aes_load 4 add p_out, 4*16 jmp chk gt4: cmp tmp, 6*16 jg eq7 je eq6 eq5: do_aes_load 5 add p_out, 5*16 jmp chk eq6: do_aes_load 6 add p_out, 6*16 jmp chk eq7: do_aes_load 7 add p_out, 7*16 ; fall through to chk chk: and num_bytes, ~(7*16) jz do_return2 cmp num_bytes, 16 jb last ; process multiples of 8 blocks vmovdqa xkey0, [p_keys + 0*16] vmovdqa xkey4, [p_keys + 4*16] vmovdqa xkey8, [p_keys + 8*16] vmovdqa xkey12, [p_keys + 12*16] jmp main_loop2 align 32 main_loop2: ; num_bytes is a multiple of 8 and >0 do_aes_noload 8 add p_out, 8*16 sub num_bytes, 8*16 cmp num_bytes, 8*16 jae main_loop2 test num_bytes, 15 ; partial bytes to be processed? 
jnz last do_return2: ; don't return updated IV ; vpshufb xcounter, xcounter, xbyteswap ; vmovdqu [p_IV], xcounter ret last: ;; Code dealing with the partial block cases ; reserve 16 byte aligned buffer on stack mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _rsp_save], rax ; save SP ; copy input bytes into scratch buffer memcpy_avx_16_1 p_tmp, p_in, num_bytes, tmp, rax ; Encryption of a single partial block (p_tmp) vpshufb xcounter, xbyteswap vmovdqa xdata0, xcounter vpxor xdata0, [p_keys + 16*0] %assign i 1 %rep 13 vaesenc xdata0, [p_keys + 16*i] %assign i (i+1) %endrep ; created keystream vaesenclast xdata0, [p_keys + 16*i] ; xor keystream with the message (scratch) vpxor xdata0, [p_tmp] vmovdqa [p_tmp], xdata0 ; copy result into the output buffer memcpy_avx_16_1 p_out, p_tmp, num_bytes, tmp, rax ; remove the stack frame mov rsp, [rsp + _rsp_save] ; original SP jmp do_return2 iv_is_16_bytes: ; Read 16 byte IV: Nonce + ESP IV + block counter (BE) vmovdqu xcounter, [p_IV] jmp bswap_iv intel-ipsec-mb-0.48/avx/aes_cbc_enc_128_x8.asm000066400000000000000000000412061321406316400207750ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
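;; For reference, the counter-mode routine above (aes_cntr_256_avx) is
;; functionally equivalent to the plain-C sketch below: the first 12 bytes
;; of the counter block come straight from p_IV (nonce + ESP IV), the last
;; 4 bytes are a big-endian block counter starting at 1, and each block of
;; output is the input XORed with the encrypted counter block.  When iv_len
;; is 16 the whole counter block is simply loaded from p_IV instead.  The
;; aes256_encrypt_block() helper is a placeholder for a single-block AES-256
;; encryption and is not a symbol exported by this library; the sketch also
;; increments only the low 32-bit counter field, which is sufficient for
;; realistic buffer sizes.
%comment
#include <stdint.h>
#include <string.h>

/* placeholder: encrypt one 16-byte block with an expanded AES-256 key */
extern void aes256_encrypt_block(const void *expkeys, const uint8_t in[16],
                                 uint8_t out[16]);

static void
aes_cntr_256_ref(const uint8_t *in, const uint8_t *iv, const void *expkeys,
                 uint8_t *out, uint64_t num_bytes)
{
        uint8_t cblk[16], ks[16];
        uint32_t ctr = 1;

        memcpy(cblk, iv, 12);           /* nonce + ESP IV */

        while (num_bytes != 0) {
                const uint64_t n = num_bytes < 16 ? num_bytes : 16;
                uint64_t i;

                /* 32-bit big-endian block counter in bytes 12..15 */
                cblk[12] = (uint8_t)(ctr >> 24);
                cblk[13] = (uint8_t)(ctr >> 16);
                cblk[14] = (uint8_t)(ctr >> 8);
                cblk[15] = (uint8_t)(ctr);

                aes256_encrypt_block(expkeys, cblk, ks);

                for (i = 0; i < n; i++)
                        out[i] = in[i] ^ ks[i];   /* partial last block OK */

                in += n;
                out += n;
                num_bytes -= n;
                ctr++;
        }
}
%endcomment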
;; ;;; routine to do a 128 bit CBC AES encrypt and CBC MAC ;; clobbers all registers except for ARG1 and rbp %include "os.asm" %include "mb_mgr_datastruct.asm" %define VMOVDQ vmovdqu ;; assume buffers not aligned %macro VPXOR2 2 vpxor %1, %1, %2 %endm ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; struct AES_ARGS_x8 { ;; void* in[8]; ;; void* out[8]; ;; UINT128* keys[8]; ;; UINT128 IV[8]; ;; } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; void aes_cbc_enc_128_x8(AES_ARGS_x8 *args, UINT64 len); ;; arg 1: ARG : addr of AES_ARGS_x8 structure ;; arg 2: LEN : len (in units of bytes) struc STACK _gpr_save: resq 8 _len: resq 1 endstruc %define GPR_SAVE_AREA rsp + _gpr_save %define LEN_AREA rsp + _len %ifdef LINUX %define arg1 rdi %define arg2 rsi %define arg3 rcx %define arg4 rdx %else %define arg1 rcx %define arg2 rdx %define arg3 rdi %define arg4 rsi %endif %define ARG arg1 %define LEN arg2 %define IDX rax %define TMP rbx %define KEYS0 arg3 %define KEYS1 arg4 %define KEYS2 rbp %define KEYS3 r8 %define KEYS4 r9 %define KEYS5 r10 %define KEYS6 r11 %define KEYS7 r12 %define IN0 r13 %define IN2 r14 %define IN4 r15 %define IN6 LEN %define XDATA0 xmm0 %define XDATA1 xmm1 %define XDATA2 xmm2 %define XDATA3 xmm3 %define XDATA4 xmm4 %define XDATA5 xmm5 %define XDATA6 xmm6 %define XDATA7 xmm7 %define XKEY0_3 xmm8 %define XKEY1_4 xmm9 %define XKEY2_5 xmm10 %define XKEY3_6 xmm11 %define XKEY4_7 xmm12 %define XKEY5_8 xmm13 %define XKEY6_9 xmm14 %define XTMP xmm15 %ifdef CBC_MAC MKGLOBAL(aes128_cbc_mac_x8,function,internal) aes128_cbc_mac_x8: %else MKGLOBAL(aes_cbc_enc_128_x8,function,internal) aes_cbc_enc_128_x8: %endif sub rsp, STACK_size mov [GPR_SAVE_AREA + 8*0], rbp %ifdef CBC_MAC mov [GPR_SAVE_AREA + 8*1], rbx mov [GPR_SAVE_AREA + 8*2], r12 mov [GPR_SAVE_AREA + 8*3], r13 mov [GPR_SAVE_AREA + 8*4], r14 mov [GPR_SAVE_AREA + 8*5], r15 %ifndef LINUX mov [GPR_SAVE_AREA + 8*6], rsi mov [GPR_SAVE_AREA + 8*7], rdi %endif %endif mov IDX, 16 mov [LEN_AREA], LEN ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; mov IN0, [ARG + _aesarg_in + 8*0] mov IN2, [ARG + _aesarg_in + 8*2] mov IN4, [ARG + _aesarg_in + 8*4] mov IN6, [ARG + _aesarg_in + 8*6] mov TMP, [ARG + _aesarg_in + 8*1] VMOVDQ XDATA0, [IN0] ; load first block of plain text VMOVDQ XDATA1, [TMP] ; load first block of plain text mov TMP, [ARG + _aesarg_in + 8*3] VMOVDQ XDATA2, [IN2] ; load first block of plain text VMOVDQ XDATA3, [TMP] ; load first block of plain text mov TMP, [ARG + _aesarg_in + 8*5] VMOVDQ XDATA4, [IN4] ; load first block of plain text VMOVDQ XDATA5, [TMP] ; load first block of plain text mov TMP, [ARG + _aesarg_in + 8*7] VMOVDQ XDATA6, [IN6] ; load first block of plain text VMOVDQ XDATA7, [TMP] ; load first block of plain text VPXOR2 XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV VPXOR2 XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV VPXOR2 XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV VPXOR2 XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV VPXOR2 XDATA4, [ARG + _aesarg_IV + 16*4] ; plaintext XOR IV VPXOR2 XDATA5, [ARG + _aesarg_IV + 16*5] ; plaintext XOR IV VPXOR2 XDATA6, [ARG + _aesarg_IV + 16*6] ; plaintext XOR IV VPXOR2 XDATA7, [ARG + _aesarg_IV + 16*7] ; plaintext XOR IV mov KEYS0, [ARG + _aesarg_keys + 8*0] mov KEYS1, [ARG + _aesarg_keys + 8*1] mov KEYS2, [ARG + _aesarg_keys + 8*2] mov KEYS3, [ARG + _aesarg_keys + 8*3] mov KEYS4, [ARG + _aesarg_keys + 8*4] mov KEYS5, [ARG + _aesarg_keys + 8*5] mov KEYS6, [ARG + _aesarg_keys + 8*6] mov 
KEYS7, [ARG + _aesarg_keys + 8*7] VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC vmovdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC vmovdqa XKEY1_4, [KEYS1 + 16*4] ; load round 4 key vaesenc XDATA0, XKEY0_3 ; 3. ENC vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC vmovdqa XKEY2_5, [KEYS2 + 16*5] ; load round 5 key vaesenc XDATA1, XKEY1_4 ; 4. ENC vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC vmovdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key vaesenc XDATA2, XKEY2_5 ; 5. ENC vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC vmovdqa XKEY4_7, [KEYS4 + 16*7] ; load round 7 key vaesenc XDATA3, XKEY3_6 ; 6. ENC vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC vmovdqa XKEY5_8, [KEYS5 + 16*8] ; load round 8 key vaesenc XDATA4, XKEY4_7 ; 7. ENC vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC vmovdqa XKEY6_9, [KEYS6 + 16*9] ; load round 9 key vaesenc XDATA5, XKEY5_8 ; 8. ENC vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC mov TMP, [ARG + _aesarg_out + 8*0] vaesenc XDATA6, XKEY6_9 ; 9. 
ENC vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC vaesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC vaesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC vaesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC vaesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC vaesenclast XDATA4, [KEYS4 + 16*10] ; 10. ENC vaesenclast XDATA5, [KEYS5 + 16*10] ; 10. ENC vaesenclast XDATA6, [KEYS6 + 16*10] ; 10. ENC vaesenclast XDATA7, [KEYS7 + 16*10] ; 10. ENC %ifndef CBC_MAC VMOVDQ [TMP], XDATA0 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*1] VMOVDQ [TMP], XDATA1 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*2] VMOVDQ [TMP], XDATA2 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*3] VMOVDQ [TMP], XDATA3 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*4] VMOVDQ [TMP], XDATA4 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*5] VMOVDQ [TMP], XDATA5 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*6] VMOVDQ [TMP], XDATA6 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*7] VMOVDQ [TMP], XDATA7 ; write back ciphertext %endif cmp [LEN_AREA], IDX je done main_loop: mov TMP, [ARG + _aesarg_in + 8*1] VPXOR2 XDATA0, [IN0 + IDX] ; load next block of plain text VPXOR2 XDATA1, [TMP + IDX] ; load next block of plain text mov TMP, [ARG + _aesarg_in + 8*3] VPXOR2 XDATA2, [IN2 + IDX] ; load next block of plain text VPXOR2 XDATA3, [TMP + IDX] ; load next block of plain text mov TMP, [ARG + _aesarg_in + 8*5] VPXOR2 XDATA4, [IN4 + IDX] ; load next block of plain text VPXOR2 XDATA5, [TMP + IDX] ; load next block of plain text mov TMP, [ARG + _aesarg_in + 8*7] VPXOR2 XDATA6, [IN6 + IDX] ; load next block of plain text VPXOR2 XDATA7, [TMP + IDX] ; load next block of plain text VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC vaesenc XDATA0, XKEY0_3 ; 3. ENC vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC vaesenc XDATA1, XKEY1_4 ; 4. ENC vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC vaesenc XDATA2, XKEY2_5 ; 5. ENC vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC vaesenc XDATA6, [KEYS6 + 16*5] ; 5. 
ENC vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC vaesenc XDATA3, XKEY3_6 ; 6. ENC vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC vaesenc XDATA4, XKEY4_7 ; 7. ENC vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC vaesenc XDATA5, XKEY5_8 ; 8. ENC vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC mov TMP, [ARG + _aesarg_out + 8*0] vaesenc XDATA6, XKEY6_9 ; 9. ENC vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC vaesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC vaesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC vaesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC vaesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC vaesenclast XDATA4, [KEYS4 + 16*10] ; 10. ENC vaesenclast XDATA5, [KEYS5 + 16*10] ; 10. ENC vaesenclast XDATA6, [KEYS6 + 16*10] ; 10. ENC vaesenclast XDATA7, [KEYS7 + 16*10] ; 10. ENC %ifndef CBC_MAC ;; no ciphertext write back for CBC-MAC VMOVDQ [TMP + IDX], XDATA0 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*1] VMOVDQ [TMP + IDX], XDATA1 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*2] VMOVDQ [TMP + IDX], XDATA2 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*3] VMOVDQ [TMP + IDX], XDATA3 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*4] VMOVDQ [TMP + IDX], XDATA4 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*5] VMOVDQ [TMP + IDX], XDATA5 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*6] VMOVDQ [TMP + IDX], XDATA6 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*7] VMOVDQ [TMP + IDX], XDATA7 ; write back ciphertext %endif add IDX, 16 cmp [LEN_AREA], IDX jne main_loop done: ;; update IV for AES128-CBC / store digest for CBC-MAC vmovdqa [ARG + _aesarg_IV + 16*0], XDATA0 vmovdqa [ARG + _aesarg_IV + 16*1], XDATA1 vmovdqa [ARG + _aesarg_IV + 16*2], XDATA2 vmovdqa [ARG + _aesarg_IV + 16*3], XDATA3 vmovdqa [ARG + _aesarg_IV + 16*4], XDATA4 vmovdqa [ARG + _aesarg_IV + 16*5], XDATA5 vmovdqa [ARG + _aesarg_IV + 16*6], XDATA6 vmovdqa [ARG + _aesarg_IV + 16*7], XDATA7 ;; update IN and OUT vmovd xmm0, [LEN_AREA] vpshufd xmm0, xmm0, 0x44 vpaddq xmm1, xmm0, [ARG + _aesarg_in + 16*0] vpaddq xmm2, xmm0, [ARG + _aesarg_in + 16*1] vpaddq xmm3, xmm0, [ARG + _aesarg_in + 16*2] vpaddq xmm4, xmm0, [ARG + _aesarg_in + 16*3] vmovdqa [ARG + _aesarg_in + 16*0], xmm1 vmovdqa [ARG + _aesarg_in + 16*1], xmm2 vmovdqa [ARG + _aesarg_in + 16*2], xmm3 vmovdqa [ARG + _aesarg_in + 16*3], xmm4 %ifndef CBC_MAC vpaddq xmm5, xmm0, [ARG + _aesarg_out + 16*0] vpaddq xmm6, xmm0, [ARG + _aesarg_out + 16*1] vpaddq xmm7, xmm0, [ARG + _aesarg_out + 16*2] vpaddq xmm8, xmm0, [ARG + _aesarg_out + 16*3] vmovdqa [ARG + _aesarg_out + 16*0], xmm5 vmovdqa [ARG + _aesarg_out + 16*1], xmm6 
vmovdqa [ARG + _aesarg_out + 16*2], xmm7 vmovdqa [ARG + _aesarg_out + 16*3], xmm8 %endif ;; XMMs are saved at a higher level mov rbp, [GPR_SAVE_AREA + 8*0] %ifdef CBC_MAC mov rbx, [GPR_SAVE_AREA + 8*1] mov r12, [GPR_SAVE_AREA + 8*2] mov r13, [GPR_SAVE_AREA + 8*3] mov r14, [GPR_SAVE_AREA + 8*4] mov r15, [GPR_SAVE_AREA + 8*5] %ifndef LINUX mov rsi, [GPR_SAVE_AREA + 8*6] mov rdi, [GPR_SAVE_AREA + 8*7] %endif %endif add rsp, STACK_size ret intel-ipsec-mb-0.48/avx/aes_cbc_enc_192_x8.asm000066400000000000000000000422351321406316400210010ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
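;; The x8 CBC encrypt routines in this directory (128, 192 and 256 bit) all
;; consume the same AES_ARGS_x8 layout: eight independent buffers ("lanes"),
;; each with its own in/out pointer, key schedule and chaining value, all
;; advanced in lock-step by the same length.  Per lane the computation is
;; ordinary CBC encryption; the C sketch below models a single lane (the asm
;; interleaves eight of them, one AES round across all lanes at a time, and
;; the CBC_MAC build of the 128-bit version simply skips the ciphertext
;; stores and keeps only the final block as the digest).  The struct and
;; helper below are illustrative only - the real layout is defined in
;; mb_mgr_datastruct.asm and the AES block helper is not part of this
;; library's exported API.
%comment
#include <stdint.h>
#include <string.h>

/* placeholder: encrypt one 16-byte block with an expanded key schedule */
extern void aes_encrypt_block(const void *expkeys, const uint8_t in[16],
                              uint8_t out[16]);

struct cbc_lane {                       /* illustrative, one of eight lanes */
        const uint8_t *in;
        uint8_t *out;
        const void *keys;               /* expanded key schedule */
        uint8_t iv[16];                 /* chaining value, updated in place */
};

static void
cbc_enc_one_lane(struct cbc_lane *lane, uint64_t len)  /* len % 16 == 0 */
{
        uint64_t off;

        for (off = 0; off < len; off += 16) {
                uint8_t blk[16];
                int i;

                for (i = 0; i < 16; i++)
                        blk[i] = lane->in[off + i] ^ lane->iv[i];

                aes_encrypt_block(lane->keys, blk, lane->iv);
                memcpy(lane->out + off, lane->iv, 16);
        }

        /* like the asm epilogue: advance the pointers past the data */
        lane->in += len;
        lane->out += len;
}
%endcomment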
;; ;;; routine to do a 192 bit CBC AES encrypt ;; clobbers all registers except for ARG1 and rbp %include "os.asm" %include "mb_mgr_datastruct.asm" %define VMOVDQ vmovdqu ;; assume buffers not aligned %macro VPXOR2 2 vpxor %1, %1, %2 %endm ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; struct AES_ARGS_x8 { ;; void* in[8]; ;; void* out[8]; ;; UINT128* keys[8]; ;; UINT128 IV[8]; ;; } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; void aes_cbc_enc_192_x8(AES_ARGS_x8 *args, UINT64 len); ;; arg 1: ARG : addr of AES_ARGS_x8 structure ;; arg 2: LEN : len (in units of bytes) struc STACK _gpr_save: resq 1 _len: resq 1 endstruc %define GPR_SAVE_AREA rsp + _gpr_save %define LEN_AREA rsp + _len %ifdef LINUX %define ARG rdi %define LEN rsi %define REG3 rcx %define REG4 rdx %else %define ARG rcx %define LEN rdx %define REG3 rsi %define REG4 rdi %endif %define IDX rax %define TMP rbx %define KEYS0 REG3 %define KEYS1 REG4 %define KEYS2 rbp %define KEYS3 r8 %define KEYS4 r9 %define KEYS5 r10 %define KEYS6 r11 %define KEYS7 r12 %define IN0 r13 %define IN2 r14 %define IN4 r15 %define IN6 LEN %define XDATA0 xmm0 %define XDATA1 xmm1 %define XDATA2 xmm2 %define XDATA3 xmm3 %define XDATA4 xmm4 %define XDATA5 xmm5 %define XDATA6 xmm6 %define XDATA7 xmm7 %define XKEY0_3 xmm8 %define XKEY1_4 xmm9 %define XKEY2_5 xmm10 %define XKEY3_6 xmm11 %define XKEY4_7 xmm12 %define XKEY5_8 xmm13 %define XKEY6_9 xmm14 %define XTMP xmm15 MKGLOBAL(aes_cbc_enc_192_x8,function,internal) aes_cbc_enc_192_x8: sub rsp, STACK_size mov [GPR_SAVE_AREA + 8*0], rbp mov IDX, 16 mov [LEN_AREA], LEN mov IN0, [ARG + _aesarg_in + 8*0] mov IN2, [ARG + _aesarg_in + 8*2] mov IN4, [ARG + _aesarg_in + 8*4] mov IN6, [ARG + _aesarg_in + 8*6] ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; mov TMP, [ARG + _aesarg_in + 8*1] VMOVDQ XDATA0, [IN0] ; load first block of plain text VMOVDQ XDATA1, [TMP] ; load first block of plain text mov TMP, [ARG + _aesarg_in + 8*3] VMOVDQ XDATA2, [IN2] ; load first block of plain text VMOVDQ XDATA3, [TMP] ; load first block of plain text mov TMP, [ARG + _aesarg_in + 8*5] VMOVDQ XDATA4, [IN4] ; load first block of plain text VMOVDQ XDATA5, [TMP] ; load first block of plain text mov TMP, [ARG + _aesarg_in + 8*7] VMOVDQ XDATA6, [IN6] ; load first block of plain text VMOVDQ XDATA7, [TMP] ; load first block of plain text VPXOR2 XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV VPXOR2 XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV VPXOR2 XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV VPXOR2 XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV VPXOR2 XDATA4, [ARG + _aesarg_IV + 16*4] ; plaintext XOR IV VPXOR2 XDATA5, [ARG + _aesarg_IV + 16*5] ; plaintext XOR IV VPXOR2 XDATA6, [ARG + _aesarg_IV + 16*6] ; plaintext XOR IV VPXOR2 XDATA7, [ARG + _aesarg_IV + 16*7] ; plaintext XOR IV mov KEYS0, [ARG + _aesarg_keys + 8*0] mov KEYS1, [ARG + _aesarg_keys + 8*1] mov KEYS2, [ARG + _aesarg_keys + 8*2] mov KEYS3, [ARG + _aesarg_keys + 8*3] mov KEYS4, [ARG + _aesarg_keys + 8*4] mov KEYS5, [ARG + _aesarg_keys + 8*5] mov KEYS6, [ARG + _aesarg_keys + 8*6] mov KEYS7, [ARG + _aesarg_keys + 8*7] VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK vaesenc XDATA0, [KEYS0 + 16*1] ; 1. 
ENC vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC vmovdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC vmovdqa XKEY1_4, [KEYS1 + 16*4] ; load round 4 key vaesenc XDATA0, XKEY0_3 ; 3. ENC vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC vmovdqa XKEY2_5, [KEYS2 + 16*5] ; load round 5 key vaesenc XDATA1, XKEY1_4 ; 4. ENC vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC vmovdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key vaesenc XDATA2, XKEY2_5 ; 5. ENC vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC vmovdqa XKEY4_7, [KEYS4 + 16*7] ; load round 7 key vaesenc XDATA3, XKEY3_6 ; 6. ENC vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC vmovdqa XKEY5_8, [KEYS5 + 16*8] ; load round 8 key vaesenc XDATA4, XKEY4_7 ; 7. ENC vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC vmovdqa XKEY6_9, [KEYS6 + 16*9] ; load round 9 key vaesenc XDATA5, XKEY5_8 ; 8. ENC vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC mov TMP, [ARG + _aesarg_out + 8*0] vaesenc XDATA6, XKEY6_9 ; 9. ENC vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC vaesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC vaesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC vaesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC vaesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC vaesenc XDATA4, [KEYS4 + 16*10] ; 10. ENC vaesenc XDATA5, [KEYS5 + 16*10] ; 10. ENC vaesenc XDATA6, [KEYS6 + 16*10] ; 10. ENC vaesenc XDATA7, [KEYS7 + 16*10] ; 10. 
ENC vaesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC vaesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC vaesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC vaesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC vaesenc XDATA4, [KEYS4 + 16*11] ; 11. ENC vaesenc XDATA5, [KEYS5 + 16*11] ; 11. ENC vaesenc XDATA6, [KEYS6 + 16*11] ; 11. ENC vaesenc XDATA7, [KEYS7 + 16*11] ; 11. ENC vaesenclast XDATA0, [KEYS0 + 16*12] ; 12. ENC vaesenclast XDATA1, [KEYS1 + 16*12] ; 12. ENC vaesenclast XDATA2, [KEYS2 + 16*12] ; 12. ENC vaesenclast XDATA3, [KEYS3 + 16*12] ; 12. ENC vaesenclast XDATA4, [KEYS4 + 16*12] ; 12. ENC vaesenclast XDATA5, [KEYS5 + 16*12] ; 12. ENC vaesenclast XDATA6, [KEYS6 + 16*12] ; 12. ENC vaesenclast XDATA7, [KEYS7 + 16*12] ; 12. ENC VMOVDQ [TMP], XDATA0 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*1] VMOVDQ [TMP], XDATA1 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*2] VMOVDQ [TMP], XDATA2 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*3] VMOVDQ [TMP], XDATA3 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*4] VMOVDQ [TMP], XDATA4 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*5] VMOVDQ [TMP], XDATA5 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*6] VMOVDQ [TMP], XDATA6 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*7] VMOVDQ [TMP], XDATA7 ; write back ciphertext cmp [LEN_AREA], IDX je done main_loop: mov TMP, [ARG + _aesarg_in + 8*1] VPXOR2 XDATA0, [IN0 + IDX] ; load next block of plain text VPXOR2 XDATA1, [TMP + IDX] ; load next block of plain text mov TMP, [ARG + _aesarg_in + 8*3] VPXOR2 XDATA2, [IN2 + IDX] ; load next block of plain text VPXOR2 XDATA3, [TMP + IDX] ; load next block of plain text mov TMP, [ARG + _aesarg_in + 8*5] VPXOR2 XDATA4, [IN4 + IDX] ; load next block of plain text VPXOR2 XDATA5, [TMP + IDX] ; load next block of plain text mov TMP, [ARG + _aesarg_in + 8*7] VPXOR2 XDATA6, [IN6 + IDX] ; load next block of plain text VPXOR2 XDATA7, [TMP + IDX] ; load next block of plain text VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC vaesenc XDATA0, XKEY0_3 ; 3. ENC vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC vaesenc XDATA1, XKEY1_4 ; 4. ENC vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC vaesenc XDATA7, [KEYS7 + 16*4] ; 4. 
ENC vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC vaesenc XDATA2, XKEY2_5 ; 5. ENC vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC vaesenc XDATA3, XKEY3_6 ; 6. ENC vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC vaesenc XDATA4, XKEY4_7 ; 7. ENC vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC vaesenc XDATA5, XKEY5_8 ; 8. ENC vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC mov TMP, [ARG + _aesarg_out + 8*0] vaesenc XDATA6, XKEY6_9 ; 9. ENC vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC vaesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC vaesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC vaesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC vaesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC vaesenc XDATA4, [KEYS4 + 16*10] ; 10. ENC vaesenc XDATA5, [KEYS5 + 16*10] ; 10. ENC vaesenc XDATA6, [KEYS6 + 16*10] ; 10. ENC vaesenc XDATA7, [KEYS7 + 16*10] ; 10. ENC vaesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC vaesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC vaesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC vaesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC vaesenc XDATA4, [KEYS4 + 16*11] ; 11. ENC vaesenc XDATA5, [KEYS5 + 16*11] ; 11. ENC vaesenc XDATA6, [KEYS6 + 16*11] ; 11. ENC vaesenc XDATA7, [KEYS7 + 16*11] ; 11. ENC vaesenclast XDATA0, [KEYS0 + 16*12] ; 12. ENC vaesenclast XDATA1, [KEYS1 + 16*12] ; 12. ENC vaesenclast XDATA2, [KEYS2 + 16*12] ; 12. ENC vaesenclast XDATA3, [KEYS3 + 16*12] ; 12. ENC vaesenclast XDATA4, [KEYS4 + 16*12] ; 12. ENC vaesenclast XDATA5, [KEYS5 + 16*12] ; 12. ENC vaesenclast XDATA6, [KEYS6 + 16*12] ; 12. ENC vaesenclast XDATA7, [KEYS7 + 16*12] ; 12. 
ENC VMOVDQ [TMP + IDX], XDATA0 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*1] VMOVDQ [TMP + IDX], XDATA1 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*2] VMOVDQ [TMP + IDX], XDATA2 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*3] VMOVDQ [TMP + IDX], XDATA3 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*4] VMOVDQ [TMP + IDX], XDATA4 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*5] VMOVDQ [TMP + IDX], XDATA5 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*6] VMOVDQ [TMP + IDX], XDATA6 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*7] VMOVDQ [TMP + IDX], XDATA7 ; write back ciphertext add IDX, 16 cmp [LEN_AREA], IDX jne main_loop done: ;; update IV vmovdqa [ARG + _aesarg_IV + 16*0], XDATA0 vmovdqa [ARG + _aesarg_IV + 16*1], XDATA1 vmovdqa [ARG + _aesarg_IV + 16*2], XDATA2 vmovdqa [ARG + _aesarg_IV + 16*3], XDATA3 vmovdqa [ARG + _aesarg_IV + 16*4], XDATA4 vmovdqa [ARG + _aesarg_IV + 16*5], XDATA5 vmovdqa [ARG + _aesarg_IV + 16*6], XDATA6 vmovdqa [ARG + _aesarg_IV + 16*7], XDATA7 ;; update IN and OUT vmovd xmm0, [LEN_AREA] vpshufd xmm0, xmm0, 0x44 vpaddq xmm1, xmm0, [ARG + _aesarg_in + 16*0] vpaddq xmm2, xmm0, [ARG + _aesarg_in + 16*1] vpaddq xmm3, xmm0, [ARG + _aesarg_in + 16*2] vpaddq xmm4, xmm0, [ARG + _aesarg_in + 16*3] vmovdqa [ARG + _aesarg_in + 16*0], xmm1 vmovdqa [ARG + _aesarg_in + 16*1], xmm2 vmovdqa [ARG + _aesarg_in + 16*2], xmm3 vmovdqa [ARG + _aesarg_in + 16*3], xmm4 vpaddq xmm5, xmm0, [ARG + _aesarg_out + 16*0] vpaddq xmm6, xmm0, [ARG + _aesarg_out + 16*1] vpaddq xmm7, xmm0, [ARG + _aesarg_out + 16*2] vpaddq xmm8, xmm0, [ARG + _aesarg_out + 16*3] vmovdqa [ARG + _aesarg_out + 16*0], xmm5 vmovdqa [ARG + _aesarg_out + 16*1], xmm6 vmovdqa [ARG + _aesarg_out + 16*2], xmm7 vmovdqa [ARG + _aesarg_out + 16*3], xmm8 ;; XMMs are saved at a higher level mov rbp, [GPR_SAVE_AREA + 8*0] add rsp, STACK_size ret intel-ipsec-mb-0.48/avx/aes_cbc_enc_256_x8.asm000066400000000000000000000450411321406316400210000ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ;;; routine to do a 256 bit CBC AES encrypt ;; clobbers all registers except for ARG1 and rbp %include "os.asm" %include "mb_mgr_datastruct.asm" %define VMOVDQ vmovdqu ;; assume buffers not aligned %macro VPXOR2 2 vpxor %1, %1, %2 %endm ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; struct AES_ARGS_x8 { ;; void* in[8]; ;; void* out[8]; ;; UINT128* keys[8]; ;; UINT128 IV[8]; ;; } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; void aes_cbc_enc_256_x8(AES_ARGS_x8 *args, UINT64 len); ;; arg 1: ARG : addr of AES_ARGS_x8 structure ;; arg 2: LEN : len (in units of bytes) struc STACK _gpr_save: resq 1 _len: resq 1 endstruc %define GPR_SAVE_AREA rsp + _gpr_save %define LEN_AREA rsp + _len %ifdef LINUX %define ARG rdi %define LEN rsi %define REG3 rcx %define REG4 rdx %else %define ARG rcx %define LEN rdx %define REG3 rsi %define REG4 rdi %endif %define IDX rax %define TMP rbx %define KEYS0 REG3 %define KEYS1 REG4 %define KEYS2 rbp %define KEYS3 r8 %define KEYS4 r9 %define KEYS5 r10 %define KEYS6 r11 %define KEYS7 r12 %define IN0 r13 %define IN2 r14 %define IN4 r15 %define IN6 LEN %define XDATA0 xmm0 %define XDATA1 xmm1 %define XDATA2 xmm2 %define XDATA3 xmm3 %define XDATA4 xmm4 %define XDATA5 xmm5 %define XDATA6 xmm6 %define XDATA7 xmm7 %define XKEY0_3 xmm8 %define XKEY1_4 xmm9 %define XKEY2_5 xmm10 %define XKEY3_6 xmm11 %define XKEY4_7 xmm12 %define XKEY5_8 xmm13 %define XKEY6_9 xmm14 %define XTMP xmm15 MKGLOBAL(aes_cbc_enc_256_x8,function,internal) aes_cbc_enc_256_x8: sub rsp, STACK_size mov [GPR_SAVE_AREA + 8*0], rbp mov IDX, 16 mov [LEN_AREA], LEN mov IN0, [ARG + _aesarg_in + 8*0] mov IN2, [ARG + _aesarg_in + 8*2] mov IN4, [ARG + _aesarg_in + 8*4] mov IN6, [ARG + _aesarg_in + 8*6] ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; mov TMP, [ARG + _aesarg_in + 8*1] VMOVDQ XDATA0, [IN0] ; load first block of plain text VMOVDQ XDATA1, [TMP] ; load first block of plain text mov TMP, [ARG + _aesarg_in + 8*3] VMOVDQ XDATA2, [IN2] ; load first block of plain text VMOVDQ XDATA3, [TMP] ; load first block of plain text mov TMP, [ARG + _aesarg_in + 8*5] VMOVDQ XDATA4, [IN4] ; load first block of plain text VMOVDQ XDATA5, [TMP] ; load first block of plain text mov TMP, [ARG + _aesarg_in + 8*7] VMOVDQ XDATA6, [IN6] ; load first block of plain text VMOVDQ XDATA7, [TMP] ; load first block of plain text VPXOR2 XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV VPXOR2 XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV VPXOR2 XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV VPXOR2 XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV VPXOR2 XDATA4, [ARG + _aesarg_IV + 16*4] ; plaintext XOR IV VPXOR2 XDATA5, [ARG + _aesarg_IV + 16*5] ; plaintext XOR IV VPXOR2 XDATA6, [ARG + _aesarg_IV + 16*6] ; plaintext XOR IV VPXOR2 XDATA7, [ARG + _aesarg_IV + 16*7] ; plaintext XOR IV mov KEYS0, [ARG + _aesarg_keys + 8*0] mov KEYS1, [ARG + _aesarg_keys + 8*1] mov KEYS2, [ARG + _aesarg_keys + 8*2] mov KEYS3, [ARG + _aesarg_keys 
+ 8*3] mov KEYS4, [ARG + _aesarg_keys + 8*4] mov KEYS5, [ARG + _aesarg_keys + 8*5] mov KEYS6, [ARG + _aesarg_keys + 8*6] mov KEYS7, [ARG + _aesarg_keys + 8*7] VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC vmovdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC vmovdqa XKEY1_4, [KEYS1 + 16*4] ; load round 4 key vaesenc XDATA0, XKEY0_3 ; 3. ENC vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC vmovdqa XKEY2_5, [KEYS2 + 16*5] ; load round 5 key vaesenc XDATA1, XKEY1_4 ; 4. ENC vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC vmovdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key vaesenc XDATA2, XKEY2_5 ; 5. ENC vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC vmovdqa XKEY4_7, [KEYS4 + 16*7] ; load round 7 key vaesenc XDATA3, XKEY3_6 ; 6. ENC vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC vmovdqa XKEY5_8, [KEYS5 + 16*8] ; load round 8 key vaesenc XDATA4, XKEY4_7 ; 7. ENC vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC vmovdqa XKEY6_9, [KEYS6 + 16*9] ; load round 9 key vaesenc XDATA5, XKEY5_8 ; 8. ENC vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC vaesenc XDATA4, [KEYS4 + 16*9] ; 9. 
ENC vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC mov TMP, [ARG + _aesarg_out + 8*0] vaesenc XDATA6, XKEY6_9 ; 9. ENC vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC vaesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC vaesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC vaesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC vaesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC vaesenc XDATA4, [KEYS4 + 16*10] ; 10. ENC vaesenc XDATA5, [KEYS5 + 16*10] ; 10. ENC vaesenc XDATA6, [KEYS6 + 16*10] ; 10. ENC vaesenc XDATA7, [KEYS7 + 16*10] ; 10. ENC vaesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC vaesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC vaesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC vaesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC vaesenc XDATA4, [KEYS4 + 16*11] ; 11. ENC vaesenc XDATA5, [KEYS5 + 16*11] ; 11. ENC vaesenc XDATA6, [KEYS6 + 16*11] ; 11. ENC vaesenc XDATA7, [KEYS7 + 16*11] ; 11. ENC vaesenc XDATA0, [KEYS0 + 16*12] ; 12. ENC vaesenc XDATA1, [KEYS1 + 16*12] ; 12. ENC vaesenc XDATA2, [KEYS2 + 16*12] ; 12. ENC vaesenc XDATA3, [KEYS3 + 16*12] ; 12. ENC vaesenc XDATA4, [KEYS4 + 16*12] ; 12. ENC vaesenc XDATA5, [KEYS5 + 16*12] ; 12. ENC vaesenc XDATA6, [KEYS6 + 16*12] ; 12. ENC vaesenc XDATA7, [KEYS7 + 16*12] ; 12. ENC vaesenc XDATA0, [KEYS0 + 16*13] ; 13. ENC vaesenc XDATA1, [KEYS1 + 16*13] ; 13. ENC vaesenc XDATA2, [KEYS2 + 16*13] ; 13. ENC vaesenc XDATA3, [KEYS3 + 16*13] ; 13. ENC vaesenc XDATA4, [KEYS4 + 16*13] ; 13. ENC vaesenc XDATA5, [KEYS5 + 16*13] ; 13. ENC vaesenc XDATA6, [KEYS6 + 16*13] ; 13. ENC vaesenc XDATA7, [KEYS7 + 16*13] ; 13. ENC vaesenclast XDATA0, [KEYS0 + 16*14] ; 14. ENC vaesenclast XDATA1, [KEYS1 + 16*14] ; 14. ENC vaesenclast XDATA2, [KEYS2 + 16*14] ; 14. ENC vaesenclast XDATA3, [KEYS3 + 16*14] ; 14. ENC vaesenclast XDATA4, [KEYS4 + 16*14] ; 14. ENC vaesenclast XDATA5, [KEYS5 + 16*14] ; 14. ENC vaesenclast XDATA6, [KEYS6 + 16*14] ; 14. ENC vaesenclast XDATA7, [KEYS7 + 16*14] ; 14. ENC VMOVDQ [TMP], XDATA0 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*1] VMOVDQ [TMP], XDATA1 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*2] VMOVDQ [TMP], XDATA2 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*3] VMOVDQ [TMP], XDATA3 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*4] VMOVDQ [TMP], XDATA4 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*5] VMOVDQ [TMP], XDATA5 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*6] VMOVDQ [TMP], XDATA6 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*7] VMOVDQ [TMP], XDATA7 ; write back ciphertext cmp [LEN_AREA], IDX je done main_loop: mov TMP, [ARG + _aesarg_in + 8*1] VPXOR2 XDATA0, [IN0 + IDX] ; load next block of plain text VPXOR2 XDATA1, [TMP + IDX] ; load next block of plain text mov TMP, [ARG + _aesarg_in + 8*3] VPXOR2 XDATA2, [IN2 + IDX] ; load next block of plain text VPXOR2 XDATA3, [TMP + IDX] ; load next block of plain text mov TMP, [ARG + _aesarg_in + 8*5] VPXOR2 XDATA4, [IN4 + IDX] ; load next block of plain text VPXOR2 XDATA5, [TMP + IDX] ; load next block of plain text mov TMP, [ARG + _aesarg_in + 8*7] VPXOR2 XDATA6, [IN6 + IDX] ; load next block of plain text VPXOR2 XDATA7, [TMP + IDX] ; load next block of plain text VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC vaesenc XDATA2, [KEYS2 + 16*1] ; 1. 
ENC vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC vaesenc XDATA0, XKEY0_3 ; 3. ENC vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC vaesenc XDATA1, XKEY1_4 ; 4. ENC vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC vaesenc XDATA2, XKEY2_5 ; 5. ENC vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC vaesenc XDATA3, XKEY3_6 ; 6. ENC vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC vaesenc XDATA4, XKEY4_7 ; 7. ENC vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC vaesenc XDATA5, XKEY5_8 ; 8. ENC vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC mov TMP, [ARG + _aesarg_out + 8*0] vaesenc XDATA6, XKEY6_9 ; 9. ENC vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC vaesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC vaesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC vaesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC vaesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC vaesenc XDATA4, [KEYS4 + 16*10] ; 10. ENC vaesenc XDATA5, [KEYS5 + 16*10] ; 10. ENC vaesenc XDATA6, [KEYS6 + 16*10] ; 10. ENC vaesenc XDATA7, [KEYS7 + 16*10] ; 10. ENC vaesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC vaesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC vaesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC vaesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC vaesenc XDATA4, [KEYS4 + 16*11] ; 11. ENC vaesenc XDATA5, [KEYS5 + 16*11] ; 11. ENC vaesenc XDATA6, [KEYS6 + 16*11] ; 11. ENC vaesenc XDATA7, [KEYS7 + 16*11] ; 11. ENC vaesenc XDATA0, [KEYS0 + 16*12] ; 12. ENC vaesenc XDATA1, [KEYS1 + 16*12] ; 12. ENC vaesenc XDATA2, [KEYS2 + 16*12] ; 12. 
ENC vaesenc XDATA3, [KEYS3 + 16*12] ; 12. ENC vaesenc XDATA4, [KEYS4 + 16*12] ; 12. ENC vaesenc XDATA5, [KEYS5 + 16*12] ; 12. ENC vaesenc XDATA6, [KEYS6 + 16*12] ; 12. ENC vaesenc XDATA7, [KEYS7 + 16*12] ; 12. ENC vaesenc XDATA0, [KEYS0 + 16*13] ; 13. ENC vaesenc XDATA1, [KEYS1 + 16*13] ; 13. ENC vaesenc XDATA2, [KEYS2 + 16*13] ; 13. ENC vaesenc XDATA3, [KEYS3 + 16*13] ; 13. ENC vaesenc XDATA4, [KEYS4 + 16*13] ; 13. ENC vaesenc XDATA5, [KEYS5 + 16*13] ; 13. ENC vaesenc XDATA6, [KEYS6 + 16*13] ; 13. ENC vaesenc XDATA7, [KEYS7 + 16*13] ; 13. ENC vaesenclast XDATA0, [KEYS0 + 16*14] ; 14. ENC vaesenclast XDATA1, [KEYS1 + 16*14] ; 14. ENC vaesenclast XDATA2, [KEYS2 + 16*14] ; 14. ENC vaesenclast XDATA3, [KEYS3 + 16*14] ; 14. ENC vaesenclast XDATA4, [KEYS4 + 16*14] ; 14. ENC vaesenclast XDATA5, [KEYS5 + 16*14] ; 14. ENC vaesenclast XDATA6, [KEYS6 + 16*14] ; 14. ENC vaesenclast XDATA7, [KEYS7 + 16*14] ; 14. ENC VMOVDQ [TMP + IDX], XDATA0 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*1] VMOVDQ [TMP + IDX], XDATA1 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*2] VMOVDQ [TMP + IDX], XDATA2 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*3] VMOVDQ [TMP + IDX], XDATA3 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*4] VMOVDQ [TMP + IDX], XDATA4 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*5] VMOVDQ [TMP + IDX], XDATA5 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*6] VMOVDQ [TMP + IDX], XDATA6 ; write back ciphertext mov TMP, [ARG + _aesarg_out + 8*7] VMOVDQ [TMP + IDX], XDATA7 ; write back ciphertext add IDX, 16 cmp [LEN_AREA], IDX jne main_loop done: ;; update IV vmovdqa [ARG + _aesarg_IV + 16*0], XDATA0 vmovdqa [ARG + _aesarg_IV + 16*1], XDATA1 vmovdqa [ARG + _aesarg_IV + 16*2], XDATA2 vmovdqa [ARG + _aesarg_IV + 16*3], XDATA3 vmovdqa [ARG + _aesarg_IV + 16*4], XDATA4 vmovdqa [ARG + _aesarg_IV + 16*5], XDATA5 vmovdqa [ARG + _aesarg_IV + 16*6], XDATA6 vmovdqa [ARG + _aesarg_IV + 16*7], XDATA7 ;; update IN and OUT vmovd xmm0, [LEN_AREA] vpshufd xmm0, xmm0, 0x44 vpaddq xmm1, xmm0, [ARG + _aesarg_in + 16*0] vpaddq xmm2, xmm0, [ARG + _aesarg_in + 16*1] vpaddq xmm3, xmm0, [ARG + _aesarg_in + 16*2] vpaddq xmm4, xmm0, [ARG + _aesarg_in + 16*3] vmovdqa [ARG + _aesarg_in + 16*0], xmm1 vmovdqa [ARG + _aesarg_in + 16*1], xmm2 vmovdqa [ARG + _aesarg_in + 16*2], xmm3 vmovdqa [ARG + _aesarg_in + 16*3], xmm4 vpaddq xmm5, xmm0, [ARG + _aesarg_out + 16*0] vpaddq xmm6, xmm0, [ARG + _aesarg_out + 16*1] vpaddq xmm7, xmm0, [ARG + _aesarg_out + 16*2] vpaddq xmm8, xmm0, [ARG + _aesarg_out + 16*3] vmovdqa [ARG + _aesarg_out + 16*0], xmm5 vmovdqa [ARG + _aesarg_out + 16*1], xmm6 vmovdqa [ARG + _aesarg_out + 16*2], xmm7 vmovdqa [ARG + _aesarg_out + 16*3], xmm8 ;; XMMs are saved at a higher level mov rbp, [GPR_SAVE_AREA + 8*0] add rsp, STACK_size ret intel-ipsec-mb-0.48/avx/aes_cfb_128_avx.asm000066400000000000000000000122201321406316400204040ustar00rootroot00000000000000;; ;; Copyright (c) 2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. 
;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "memcpy.asm" ;;; Routine to do 128 bit CFB AES encrypt/decrypt operations on one block only. ;;; It processes only one buffer at a time. ;;; It is designed to manage partial blocks of DOCSIS 3.1 SEC BPI ;; In System V AMD64 ABI ;; calle saves: RBX, RBP, R12-R15 ;; Windows x64 ABI ;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15 ;; ;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Windows clobbers: RAX R9 R10 R11 ;; Windows preserves: RBX RCX RDX RBP RSI RDI R8 R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Linux clobbers: RAX R9 R10 ;; Linux preserves: RBX RCX RDX RBP RSI RDI R8 R11 R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; ;; Linux/Windows clobbers: xmm0 ;; %ifdef LINUX %define arg1 rdi %define arg2 rsi %define arg3 rdx %define arg4 rcx %define arg5 r8 %else %define arg1 rcx %define arg2 rdx %define arg3 r8 %define arg4 r9 %define arg5 [rsp + 5*8] %endif %define OUT arg1 %define IN arg2 %define IV arg3 %define KEYS arg4 %ifdef LINUX %define LEN arg5 %else %define LEN2 arg5 %define LEN r11 %endif %define TMP0 rax %define TMP1 r10 %define PTR0 rsp + _buffer %define XDATA xmm0 section .text struc STACK _buffer: resq 2 _rsp_save: resq 1 endstruc ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; void aes_cfb_128_one(void *out, void *in, void *iv, void *keys) ;; arg 1: OUT : addr to put clear/cipher text out ;; arg 2: IN : addr to take cipher/clear text from ;; arg 3: IV : initialization vector ;; arg 4: KEYS: pointer to expanded keys structure (16 byte aligned) ;; arg 5: LEN: length of the text to encrypt/decrypt (valid range is 0 to 16) ;; ;; AES CFB128 one block encrypt/decrypt implementation. ;; The function doesn't update IV. The result of operation can be found in OUT. ;; ;; It is primarly designed to process partial block of ;; DOCSIS 3.1 AES Packet PDU Encryption (I.10) ;; ;; It process up to one block only (up to 16 bytes). ;; ;; It makes sure not to read more than LEN bytes from IN and ;; not to store more than LEN bytes to OUT. 
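;; In C terms the operation below reduces to: encrypt the IV once with
;; AES-128 and XOR the first LEN bytes of that keystream into the input.
;; A minimal sketch follows; aes128_encrypt_block() is a placeholder for a
;; single-block AES-128 encryption and is not a symbol exported by this
;; library.  The same routine serves encryption and decryption - only the
;; caller's choice of input (plaintext vs ciphertext) differs - and, like
;; the asm, the sketch does not update the IV.
%comment
#include <stdint.h>

/* placeholder: encrypt one 16-byte block with an expanded AES-128 key */
extern void aes128_encrypt_block(const void *expkeys, const uint8_t in[16],
                                 uint8_t out[16]);

static void
aes_cfb_128_one_ref(uint8_t *out, const uint8_t *in, const uint8_t iv[16],
                    const void *expkeys, uint64_t len)  /* len is 0..16 */
{
        uint8_t ks[16];
        uint64_t i;

        aes128_encrypt_block(expkeys, iv, ks);

        for (i = 0; i < len; i++)
                out[i] = in[i] ^ ks[i];
}
%endcomment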
MKGLOBAL(aes_cfb_128_one_avx,function,internal) align 32 aes_cfb_128_one_avx: %ifndef LINUX mov LEN, LEN2 %endif mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _rsp_save], rax test LEN, 16 jz copy_in_lt16 vmovdqu XDATA, [IN] vmovdqa [PTR0], XDATA jmp copy_in_end copy_in_lt16: memcpy_avx_16 PTR0, IN, LEN, TMP0, TMP1 copy_in_end: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; vmovdqu XDATA, [IV] ; IV (or next to last block) vpxor XDATA, XDATA, [KEYS + 16*0] ; 0. ARK vaesenc XDATA, XDATA, [KEYS + 16*1] ; 1. ENC vaesenc XDATA, XDATA, [KEYS + 16*2] ; 2. ENC vaesenc XDATA, XDATA, [KEYS + 16*3] ; 3. ENC vaesenc XDATA, XDATA, [KEYS + 16*4] ; 4. ENC vaesenc XDATA, XDATA, [KEYS + 16*5] ; 5. ENC vaesenc XDATA, XDATA, [KEYS + 16*6] ; 6. ENC vaesenc XDATA, XDATA, [KEYS + 16*7] ; 7. ENC vaesenc XDATA, XDATA, [KEYS + 16*8] ; 8. ENC vaesenc XDATA, XDATA, [KEYS + 16*9] ; 9. ENC vaesenclast XDATA, XDATA, [KEYS + 16*10] ; 10. ENC vpxor XDATA, XDATA, [PTR0] ; plaintext/ciphertext XOR block cipher encryption test LEN, 16 jz copy_out_lt16 vmovdqu [OUT], XDATA jmp copy_out_end copy_out_lt16: vmovdqa [PTR0], XDATA memcpy_avx_16 OUT, PTR0, LEN, TMP0, TMP1 copy_out_end: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/avx/aes_xcbc_mac_128_x8.asm000066400000000000000000000343431321406316400211640ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
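;;
;; A rough reference model of the 8-lane AES-XCBC MAC update implemented by this
;; file, as illustrative C-style pseudocode (helper names such as xor_block() and
;; aes128_encrypt_block() are descriptive placeholders, not library APIs). Each
;; lane runs an independent CBC-MAC style chain over len bytes (a multiple of 16);
;; the assembly below additionally advances each in[] pointer by len on exit:
;;
;;      for (int lane = 0; lane < 8; lane++) {
;;              uint8_t icv[16];
;;              memcpy(icv, args->ICV[lane], 16);
;;              for (uint64_t off = 0; off < len; off += 16) {
;;                      const uint8_t *block = (const uint8_t *) args->in[lane] + off;
;;                      xor_block(icv, block);                             /* icv ^= M[i]      */
;;                      aes128_encrypt_block(icv, icv, args->keys[lane]);  /* icv = E(K1, icv) */
;;              }
;;              memcpy(args->ICV[lane], icv, 16);
;;      }
;;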
;; ;;; routine to do 128 bit AES XCBC ;; clobbers all registers except for ARG1 and rbp %include "os.asm" %include "mb_mgr_datastruct.asm" %define VMOVDQ vmovdqu ;; assume buffers not aligned %macro VPXOR2 2 vpxor %1, %1, %2 %endm ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; struct AES_XCBC_ARGS_x8 { ;; void* in[8]; ;; UINT128* keys[8]; ;; UINT128 ICV[8]; ;; } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; void aes_xcbc_mac_128_x8(AES_XCBC_ARGS_x8 *args, UINT64 len); ;; arg 1: ARG : addr of AES_XCBC_ARGS_x8 structure ;; arg 2: LEN : len (in units of bytes) struc STACK _gpr_save: resq 1 _len: resq 1 endstruc %define GPR_SAVE_AREA rsp + _gpr_save %define LEN_AREA rsp + _len %ifdef LINUX %define ARG rdi %define LEN rsi %define REG3 rcx %define REG4 rdx %else %define ARG rcx %define LEN rdx %define REG3 rsi %define REG4 rdi %endif %define IDX rax %define TMP rbx %define KEYS0 REG3 %define KEYS1 REG4 %define KEYS2 rbp %define KEYS3 r8 %define KEYS4 r9 %define KEYS5 r10 %define KEYS6 r11 %define KEYS7 r12 %define IN0 r13 %define IN2 r14 %define IN4 r15 %define IN6 LEN %define XDATA0 xmm0 %define XDATA1 xmm1 %define XDATA2 xmm2 %define XDATA3 xmm3 %define XDATA4 xmm4 %define XDATA5 xmm5 %define XDATA6 xmm6 %define XDATA7 xmm7 %define XKEY0_3 xmm8 %define XKEY1_4 xmm9 %define XKEY2_5 xmm10 %define XKEY3_6 xmm11 %define XKEY4_7 xmm12 %define XKEY5_8 xmm13 %define XKEY6_9 xmm14 %define XTMP xmm15 MKGLOBAL(aes_xcbc_mac_128_x8,function,internal) aes_xcbc_mac_128_x8: sub rsp, STACK_size mov [GPR_SAVE_AREA + 8*0], rbp mov IDX, 16 mov [LEN_AREA], LEN mov IN0, [ARG + _aesxcbcarg_in + 8*0] mov IN2, [ARG + _aesxcbcarg_in + 8*2] mov IN4, [ARG + _aesxcbcarg_in + 8*4] mov IN6, [ARG + _aesxcbcarg_in + 8*6] ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; mov TMP, [ARG + _aesxcbcarg_in + 8*1] VMOVDQ XDATA0, [IN0] ; load first block of plain text VMOVDQ XDATA1, [TMP] ; load first block of plain text mov TMP, [ARG + _aesxcbcarg_in + 8*3] VMOVDQ XDATA2, [IN2] ; load first block of plain text VMOVDQ XDATA3, [TMP] ; load first block of plain text mov TMP, [ARG + _aesxcbcarg_in + 8*5] VMOVDQ XDATA4, [IN4] ; load first block of plain text VMOVDQ XDATA5, [TMP] ; load first block of plain text mov TMP, [ARG + _aesxcbcarg_in + 8*7] VMOVDQ XDATA6, [IN6] ; load first block of plain text VMOVDQ XDATA7, [TMP] ; load first block of plain text VPXOR2 XDATA0, [ARG + _aesxcbcarg_ICV + 16*0] ; plaintext XOR ICV VPXOR2 XDATA1, [ARG + _aesxcbcarg_ICV + 16*1] ; plaintext XOR ICV VPXOR2 XDATA2, [ARG + _aesxcbcarg_ICV + 16*2] ; plaintext XOR ICV VPXOR2 XDATA3, [ARG + _aesxcbcarg_ICV + 16*3] ; plaintext XOR ICV VPXOR2 XDATA4, [ARG + _aesxcbcarg_ICV + 16*4] ; plaintext XOR ICV VPXOR2 XDATA5, [ARG + _aesxcbcarg_ICV + 16*5] ; plaintext XOR ICV VPXOR2 XDATA6, [ARG + _aesxcbcarg_ICV + 16*6] ; plaintext XOR ICV VPXOR2 XDATA7, [ARG + _aesxcbcarg_ICV + 16*7] ; plaintext XOR ICV mov KEYS0, [ARG + _aesxcbcarg_keys + 8*0] mov KEYS1, [ARG + _aesxcbcarg_keys + 8*1] mov KEYS2, [ARG + _aesxcbcarg_keys + 8*2] mov KEYS3, [ARG + _aesxcbcarg_keys + 8*3] mov KEYS4, [ARG + _aesxcbcarg_keys + 8*4] mov KEYS5, [ARG + _aesxcbcarg_keys + 8*5] mov KEYS6, [ARG + _aesxcbcarg_keys + 8*6] mov KEYS7, [ARG + _aesxcbcarg_keys + 8*7] VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. 
ARK VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC vmovdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC vmovdqa XKEY1_4, [KEYS1 + 16*4] ; load round 4 key vaesenc XDATA0, XKEY0_3 ; 3. ENC vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC vmovdqa XKEY2_5, [KEYS2 + 16*5] ; load round 5 key vaesenc XDATA1, XKEY1_4 ; 4. ENC vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC vmovdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key vaesenc XDATA2, XKEY2_5 ; 5. ENC vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC vmovdqa XKEY4_7, [KEYS4 + 16*7] ; load round 7 key vaesenc XDATA3, XKEY3_6 ; 6. ENC vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC vmovdqa XKEY5_8, [KEYS5 + 16*8] ; load round 8 key vaesenc XDATA4, XKEY4_7 ; 7. ENC vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC vmovdqa XKEY6_9, [KEYS6 + 16*9] ; load round 9 key vaesenc XDATA5, XKEY5_8 ; 8. ENC vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC vaesenc XDATA6, XKEY6_9 ; 9. ENC vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC vaesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC vaesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC vaesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC vaesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC vaesenclast XDATA4, [KEYS4 + 16*10] ; 10. ENC vaesenclast XDATA5, [KEYS5 + 16*10] ; 10. 
ENC vaesenclast XDATA6, [KEYS6 + 16*10] ; 10. ENC vaesenclast XDATA7, [KEYS7 + 16*10] ; 10. ENC cmp [LEN_AREA], IDX je done main_loop: mov TMP, [ARG + _aesxcbcarg_in + 8*1] VPXOR2 XDATA0, [IN0 + IDX] ; load next block of plain text VPXOR2 XDATA1, [TMP + IDX] ; load next block of plain text mov TMP, [ARG + _aesxcbcarg_in + 8*3] VPXOR2 XDATA2, [IN2 + IDX] ; load next block of plain text VPXOR2 XDATA3, [TMP + IDX] ; load next block of plain text mov TMP, [ARG + _aesxcbcarg_in + 8*5] VPXOR2 XDATA4, [IN4 + IDX] ; load next block of plain text VPXOR2 XDATA5, [TMP + IDX] ; load next block of plain text mov TMP, [ARG + _aesxcbcarg_in + 8*7] VPXOR2 XDATA6, [IN6 + IDX] ; load next block of plain text VPXOR2 XDATA7, [TMP + IDX] ; load next block of plain text VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC vaesenc XDATA0, XKEY0_3 ; 3. ENC vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC vaesenc XDATA1, XKEY1_4 ; 4. ENC vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC vaesenc XDATA2, XKEY2_5 ; 5. ENC vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC vaesenc XDATA3, XKEY3_6 ; 6. ENC vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC vaesenc XDATA4, XKEY4_7 ; 7. ENC vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC vaesenc XDATA5, XKEY5_8 ; 8. ENC vaesenc XDATA6, [KEYS6 + 16*8] ; 8. 
ENC vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC vaesenc XDATA6, XKEY6_9 ; 9. ENC vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC vaesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC vaesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC vaesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC vaesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC vaesenclast XDATA4, [KEYS4 + 16*10] ; 10. ENC vaesenclast XDATA5, [KEYS5 + 16*10] ; 10. ENC vaesenclast XDATA6, [KEYS6 + 16*10] ; 10. ENC vaesenclast XDATA7, [KEYS7 + 16*10] ; 10. ENC add IDX, 16 cmp [LEN_AREA], IDX jne main_loop done: ;; update ICV vmovdqa [ARG + _aesxcbcarg_ICV + 16*0], XDATA0 vmovdqa [ARG + _aesxcbcarg_ICV + 16*1], XDATA1 vmovdqa [ARG + _aesxcbcarg_ICV + 16*2], XDATA2 vmovdqa [ARG + _aesxcbcarg_ICV + 16*3], XDATA3 vmovdqa [ARG + _aesxcbcarg_ICV + 16*4], XDATA4 vmovdqa [ARG + _aesxcbcarg_ICV + 16*5], XDATA5 vmovdqa [ARG + _aesxcbcarg_ICV + 16*6], XDATA6 vmovdqa [ARG + _aesxcbcarg_ICV + 16*7], XDATA7 ;; update IN vmovd xmm0, [LEN_AREA] vpshufd xmm0, xmm0, 0x44 vpaddq xmm1, xmm0, [ARG + _aesxcbcarg_in + 16*0] vpaddq xmm2, xmm0, [ARG + _aesxcbcarg_in + 16*1] vpaddq xmm3, xmm0, [ARG + _aesxcbcarg_in + 16*2] vpaddq xmm4, xmm0, [ARG + _aesxcbcarg_in + 16*3] vmovdqa [ARG + _aesxcbcarg_in + 16*0], xmm1 vmovdqa [ARG + _aesxcbcarg_in + 16*1], xmm2 vmovdqa [ARG + _aesxcbcarg_in + 16*2], xmm3 vmovdqa [ARG + _aesxcbcarg_in + 16*3], xmm4 ;; XMMs are saved at a higher level mov rbp, [GPR_SAVE_AREA + 8*0] add rsp, STACK_size ret intel-ipsec-mb-0.48/avx/gcm128_avx_gen2.asm000066400000000000000000000033661321406316400203570ustar00rootroot00000000000000;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright(c) 2011-2017 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions ; are met: ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in ; the documentation and/or other materials provided with the ; distribution. ; * Neither the name of Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived ; from this software without specific prior written permission. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %define GCM128_MODE 1 %include "gcm_avx_gen2.asm" intel-ipsec-mb-0.48/avx/gcm192_avx_gen2.asm000066400000000000000000000033611321406316400203530ustar00rootroot00000000000000;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright(c) 2017 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions ; are met: ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in ; the documentation and/or other materials provided with the ; distribution. ; * Neither the name of Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived ; from this software without specific prior written permission. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %define GCM192_MODE 1 %include "gcm_avx_gen2.asm" intel-ipsec-mb-0.48/avx/gcm256_avx_gen2.asm000066400000000000000000000033651321406316400203600ustar00rootroot00000000000000;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright(c) 2011-2017 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions ; are met: ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in ; the documentation and/or other materials provided with the ; distribution. ; * Neither the name of Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived ; from this software without specific prior written permission. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %define GCM256_MODE 1 %include "gcm_avx_gen2.asm" intel-ipsec-mb-0.48/avx/gcm_avx_gen2.asm000066400000000000000000002201101321406316400201100ustar00rootroot00000000000000;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright(c) 2011-2017 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions ; are met: ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in ; the documentation and/or other materials provided with the ; distribution. ; * Neither the name of Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived ; from this software without specific prior written permission. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; ; Authors: ; Erdinc Ozturk ; Vinodh Gopal ; James Guilford ; ; ; References: ; This code was derived and highly optimized from the code described in paper: ; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010 ; ; For the shift-based reductions used in this code, we used the method described in paper: ; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010. 
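;
; For reference, the GHASH computation performed throughout this file can be
; summarized by the following recurrence (informal notation, not code): with the
; hash key H (HashKey, i.e. the encryption of the all-zero block under the AES key)
; and 128-bit input blocks X_1..X_n (AAD, then ciphertext, then the length block),
; the running hash is
;
;       Y_0 = 0
;       Y_i = (Y_(i-1) XOR X_i) * H     ; carry-less multiplication in GF(2^128),
;                                       ; reduced modulo the polynomial given below
;
; The GHASH_MUL macro implements one such multiply with vpclmulqdq, using the
; shift-based reduction from the Gueron/Kounavis paper referenced above.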
; ; ; ; ; Assumptions: ; ; ; ; iv: ; 0 1 2 3 ; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; | Salt (From the SA) | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; | Initialization Vector | ; | (This is the sequence number from IPSec header) | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; | 0x1 | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; ; ; ; AAD: ; AAD will be padded with 0 to the next 16byte multiple ; for example, assume AAD is a u32 vector ; ; if AAD is 8 bytes: ; AAD[3] = {A0, A1}; ; padded AAD in xmm register = {A1 A0 0 0} ; ; 0 1 2 3 ; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; | SPI (A1) | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; | 32-bit Sequence Number (A0) | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; | 0x0 | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; ; AAD Format with 32-bit Sequence Number ; ; if AAD is 12 bytes: ; AAD[3] = {A0, A1, A2}; ; padded AAD in xmm register = {A2 A1 A0 0} ; ; 0 1 2 3 ; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; | SPI (A2) | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; | 64-bit Extended Sequence Number {A1,A0} | ; | | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; | 0x0 | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; ; AAD Format with 64-bit Extended Sequence Number ; ; ; aadLen: ; Must be a multiple of 4 bytes and from the definition of the spec. ; The code additionally supports any aadLen length. ; ; TLen: ; from the definition of the spec, TLen can only be 8, 12 or 16 bytes. ; ; poly = x^128 + x^127 + x^126 + x^121 + 1 ; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part. ; %include "os.asm" %include "reg_sizes.asm" %include "gcm_defines.asm" %ifndef GCM128_MODE %ifndef GCM192_MODE %ifndef GCM256_MODE %error "No GCM mode selected for gcm_avx_gen2.asm!" 
%endif %endif %endif %ifdef GCM128_MODE %define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ avx_gen2 %define NROUNDS 9 %endif %ifdef GCM192_MODE %define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ avx_gen2 %define NROUNDS 11 %endif %ifdef GCM256_MODE %define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ avx_gen2 %define NROUNDS 13 %endif default rel ; need to push 4 registers into stack to maintain %define STACK_OFFSET 8*4 %define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register) %define TMP3 16*1 ; Temporary storage for AES State 3 %define TMP4 16*2 ; Temporary storage for AES State 4 %define TMP5 16*3 ; Temporary storage for AES State 5 %define TMP6 16*4 ; Temporary storage for AES State 6 %define TMP7 16*5 ; Temporary storage for AES State 7 %define TMP8 16*6 ; Temporary storage for AES State 8 %define LOCAL_STORAGE 16*7 %ifidn __OUTPUT_FORMAT__, win64 %define XMM_STORAGE 16*10 %else %define XMM_STORAGE 0 %endif %define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Utility Macros ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) ; Input: A and B (128-bits each, bit-reflected) ; Output: C = A*B*x mod poly, (i.e. >>1 ) ; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input ; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro GHASH_MUL 7 %define %%GH %1 ; 16 Bytes %define %%HK %2 ; 16 Bytes %define %%T1 %3 %define %%T2 %4 %define %%T3 %5 %define %%T4 %6 %define %%T5 %7 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Karatsuba vpshufd %%T2, %%GH, 01001110b vpshufd %%T3, %%HK, 01001110b vpxor %%T2, %%T2, %%GH ; %%T2 = (a1+a0) vpxor %%T3, %%T3, %%HK ; %%T3 = (b1+b0) vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1 vpclmulqdq %%GH, %%HK, 0x00 ; %%GH = a0*b0 vpclmulqdq %%T2, %%T3, 0x00 ; %%T2 = (a1+a0)*(b1+b0) vpxor %%T2, %%T2, %%GH vpxor %%T2, %%T2, %%T1 ; %%T2 = a0*b1+a1*b0 vpslldq %%T3, %%T2, 8 ; shift-L %%T3 2 DWs vpsrldq %%T2, %%T2, 8 ; shift-R %%T2 2 DWs vpxor %%GH, %%GH, %%T3 vpxor %%T1, %%T1, %%T2 ; <%%T1:%%GH> = %%GH x %%HK ;first phase of the reduction vpslld %%T2, %%GH, 31 ; packed right shifting << 31 vpslld %%T3, %%GH, 30 ; packed right shifting shift << 30 vpslld %%T4, %%GH, 25 ; packed right shifting shift << 25 vpxor %%T2, %%T2, %%T3 ; xor the shifted versions vpxor %%T2, %%T2, %%T4 vpsrldq %%T5, %%T2, 4 ; shift-R %%T5 1 DW vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;second phase of the reduction vpsrld %%T2,%%GH,1 ; packed left shifting >> 1 vpsrld %%T3,%%GH,2 ; packed left shifting >> 2 vpsrld %%T4,%%GH,7 ; packed left shifting >> 7 vpxor %%T2, %%T2, %%T3 ; xor the shifted versions vpxor %%T2, %%T2, %%T4 vpxor %%T2, %%T2, %%T5 vpxor %%GH, %%GH, %%T2 vpxor %%GH, %%GH, %%T1 ; the result is in %%GH %endmacro %macro PRECOMPUTE 8 %define %%GDATA %1 %define %%HK %2 %define %%T1 %3 %define %%T2 %4 %define %%T3 %5 %define %%T4 %6 %define %%T5 %7 %define %%T6 %8 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Haskey_i_k holds XORed values of the low and high parts of the 
Haskey_i vmovdqa %%T5, %%HK vpshufd %%T1, %%T5, 01001110b vpxor %%T1, %%T5 vmovdqu [%%GDATA + HashKey_k], %%T1 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly vpshufd %%T1, %%T5, 01001110b vpxor %%T1, %%T5 vmovdqu [%%GDATA + HashKey_2_k], %%T1 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly vmovdqu [%%GDATA + HashKey_3], %%T5 vpshufd %%T1, %%T5, 01001110b vpxor %%T1, %%T5 vmovdqu [%%GDATA + HashKey_3_k], %%T1 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly vmovdqu [%%GDATA + HashKey_4], %%T5 vpshufd %%T1, %%T5, 01001110b vpxor %%T1, %%T5 vmovdqu [%%GDATA + HashKey_4_k], %%T1 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly vmovdqu [%%GDATA + HashKey_5], %%T5 vpshufd %%T1, %%T5, 01001110b vpxor %%T1, %%T5 vmovdqu [%%GDATA + HashKey_5_k], %%T1 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly vmovdqu [%%GDATA + HashKey_6], %%T5 vpshufd %%T1, %%T5, 01001110b vpxor %%T1, %%T5 vmovdqu [%%GDATA + HashKey_6_k], %%T1 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly vmovdqu [%%GDATA + HashKey_7], %%T5 vpshufd %%T1, %%T5, 01001110b vpxor %%T1, %%T5 vmovdqu [%%GDATA + HashKey_7_k], %%T1 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly vmovdqu [%%GDATA + HashKey_8], %%T5 vpshufd %%T1, %%T5, 01001110b vpxor %%T1, %%T5 vmovdqu [%%GDATA + HashKey_8_k], %%T1 %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes. ; Returns 0 if data has length 0. ; Input: The input data (INPUT), that data's length (LENGTH). ; Output: The packed xmm register (OUTPUT). ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro READ_SMALL_DATA_INPUT 6 %define %%OUTPUT %1 ; %%OUTPUT is an xmm register %define %%INPUT %2 %define %%LENGTH %3 %define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers %define %%COUNTER %5 %define %%TMP1 %6 vpxor %%OUTPUT, %%OUTPUT mov %%COUNTER, %%LENGTH mov %%END_READ_LOCATION, %%INPUT add %%END_READ_LOCATION, %%LENGTH xor %%TMP1, %%TMP1 cmp %%COUNTER, 8 jl %%_byte_loop_2 vpinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exists je %%_done sub %%COUNTER, 8 %%_byte_loop_1: ;Read in data 1 byte at a time while data is left shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in dec %%END_READ_LOCATION mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] dec %%COUNTER jg %%_byte_loop_1 vpinsrq %%OUTPUT, %%TMP1, 1 jmp %%_done %%_byte_loop_2: ;Read in data 1 byte at a time while data is left cmp %%COUNTER, 0 je %%_done shl %%TMP1, 8 ;This loop handles when no bytes were already read in dec %%END_READ_LOCATION mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] dec %%COUNTER jg %%_byte_loop_2 vpinsrq %%OUTPUT, %%TMP1, 0 %%_done: %endmacro ; READ_SMALL_DATA_INPUT ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. ; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY). ; Output: The hash of the data (AAD_HASH). 
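;
; As a rough reference model (illustrative C-style pseudocode; ghash_mul(),
; byte_reflect(), load_16() and load_partial() are descriptive placeholders for
; the GHASH_MUL macro, the SHUF_MASK byte swap and the block loads, not APIs):
;
;       aad_hash = 0;
;       while (aad_len >= 16) {
;               aad_hash = ghash_mul(aad_hash ^ byte_reflect(load_16(aad)), hash_key);
;               aad += 16;
;               aad_len -= 16;
;       }
;       if (aad_len > 0)        /* trailing partial block, zero-padded to 16 bytes */
;               aad_hash = ghash_mul(aad_hash ^ byte_reflect(load_partial(aad, aad_len)),
;                                    hash_key);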
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro CALC_AAD_HASH 14 %define %%A_IN %1 %define %%A_LEN %2 %define %%AAD_HASH %3 %define %%HASH_KEY %4 %define %%XTMP1 %5 ; xmm temp reg 5 %define %%XTMP2 %6 %define %%XTMP3 %7 %define %%XTMP4 %8 %define %%XTMP5 %9 ; xmm temp reg 5 %define %%T1 %10 ; temp reg 1 %define %%T2 %11 %define %%T3 %12 %define %%T4 %13 %define %%T5 %14 ; temp reg 5 mov %%T1, %%A_IN ; T1 = AAD mov %%T2, %%A_LEN ; T2 = aadLen vpxor %%AAD_HASH, %%AAD_HASH cmp %%T2, 16 jl %%_get_small_AAD_block %%_get_AAD_loop16: vmovdqu %%XTMP1, [%%T1] ;byte-reflect the AAD data vpshufb %%XTMP1, [SHUF_MASK] vpxor %%AAD_HASH, %%XTMP1 GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 sub %%T2, 16 je %%_CALC_AAD_done add %%T1, 16 cmp %%T2, 16 jge %%_get_AAD_loop16 %%_get_small_AAD_block: READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5 ;byte-reflect the AAD data vpshufb %%XTMP1, [SHUF_MASK] vpxor %%AAD_HASH, %%XTMP1 GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 %%_CALC_AAD_done: %endmacro ; CALC_AAD_HASH ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls. ; Requires the input data be at least 1 byte long. ; Input: ; GDATA_KEY - struct gcm_key_data * ; GDATA_CTX - struct gcm_context_data * ; PLAIN_CYPH_IN - input text ; PLAIN_CYPH_LEN - input text length ; DATA_OFFSET - the current data offset ; ENC_DEC - whether encoding or decoding ; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX ; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro PARTIAL_BLOCK 8 %define %%GDATA_KEY %1 %define %%GDATA_CTX %2 %define %%CYPH_PLAIN_OUT %3 %define %%PLAIN_CYPH_IN %4 %define %%PLAIN_CYPH_LEN %5 %define %%DATA_OFFSET %6 %define %%AAD_HASH %7 %define %%ENC_DEC %8 mov r13, [%%GDATA_CTX + PBlockLen] cmp r13, 0 je %%_partial_block_done ;Leave Macro if no partial blocks cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading jl %%_fewer_than_16_bytes VXLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register jmp %%_data_read %%_fewer_than_16_bytes: lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15 %%_data_read: ;Finished reading in data vmovdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key vmovdqu xmm13, [%%GDATA_KEY + HashKey] lea r12, [SHIFT_MASK] cmp r13, rax add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16) vmovdqu xmm2, [r12] ; get the appropriate shuffle mask vpshufb xmm9, xmm2 ;shift right r13 bytes %ifidn %%ENC_DEC, DEC vmovdqa xmm3, xmm1 vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn) mov r15, %%PLAIN_CYPH_LEN add r15, r13 sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block jge %%_no_extra_mask_1 ;Determine if if partial block is not being filled and shift mask accordingly sub r12, r15 %%_no_extra_mask_1: vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9 vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 vpand xmm3, xmm1 vpshufb xmm3, [SHUF_MASK] vpshufb xmm3, xmm2 vpxor %%AAD_HASH, xmm3 cmp 
r15,0 jl %%_partial_incomplete_1 GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block xor rax,rax mov [%%GDATA_CTX + PBlockLen], rax jmp %%_dec_done %%_partial_incomplete_1: %ifidn __OUTPUT_FORMAT__, win64 mov rax, %%PLAIN_CYPH_LEN add [%%GDATA_CTX + PBlockLen], rax %else add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN %endif %%_dec_done: vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH %else vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) mov r15, %%PLAIN_CYPH_LEN add r15, r13 sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block jge %%_no_extra_mask_2 ;Determine if if partial block is not being filled and shift mask accordingly sub r12, r15 %%_no_extra_mask_2: vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9 vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 vpshufb xmm9, [SHUF_MASK] vpshufb xmm9, xmm2 vpxor %%AAD_HASH, xmm9 cmp r15,0 jl %%_partial_incomplete_2 GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block xor rax,rax mov [%%GDATA_CTX + PBlockLen], rax jmp %%_encode_done %%_partial_incomplete_2: %ifidn __OUTPUT_FORMAT__, win64 mov rax, %%PLAIN_CYPH_LEN add [%%GDATA_CTX + PBlockLen], rax %else add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN %endif %%_encode_done: vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext vpshufb xmm9, xmm2 %endif ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; output encrypted Bytes cmp r15,0 jl %%_partial_fill mov r12, r13 mov r13, 16 sub r13, r12 ; Set r13 to be the number of bytes to write out jmp %%_count_set %%_partial_fill: mov r13, %%PLAIN_CYPH_LEN %%_count_set: vmovq rax, xmm9 cmp r13, 8 jle %%_less_than_8_bytes_left mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax add %%DATA_OFFSET, 8 vpsrldq xmm9, xmm9, 8 vmovq rax, xmm9 sub r13, 8 %%_less_than_8_bytes_left: mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al add %%DATA_OFFSET, 1 shr rax, 8 sub r13, 1 jne %%_less_than_8_bytes_left ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %%_partial_block_done: %endmacro ; PARTIAL_BLOCK ; if a = number of total plaintext bytes ; b = floor(a/16) ; %%num_initial_blocks = b mod 8; ; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext ; %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified. 
; Updated AAD_HASH is returned in %%T3 %macro INITIAL_BLOCKS 24 %define %%GDATA_KEY %1 %define %%GDATA_CTX %2 %define %%CYPH_PLAIN_OUT %3 %define %%PLAIN_CYPH_IN %4 %define %%LENGTH %5 %define %%DATA_OFFSET %6 %define %%num_initial_blocks %7 ; can be 0, 1, 2, 3, 4, 5, 6 or 7 %define %%T1 %8 %define %%HASH_KEY %9 %define %%T3 %10 %define %%T4 %11 %define %%T5 %12 %define %%CTR %13 %define %%XMM1 %14 %define %%XMM2 %15 %define %%XMM3 %16 %define %%XMM4 %17 %define %%XMM5 %18 %define %%XMM6 %19 %define %%XMM7 %20 %define %%XMM8 %21 %define %%T6 %22 %define %%T_key %23 %define %%ENC_DEC %24 %assign i (8-%%num_initial_blocks) vmovdqu reg(i), %%XMM8 ; move AAD_HASH to temp reg ; start AES for %%num_initial_blocks blocks vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0 %assign i (9-%%num_initial_blocks) %rep %%num_initial_blocks vpaddd %%CTR, [ONE] ; INCR Y0 vmovdqa reg(i), %%CTR vpshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap %assign i (i+1) %endrep vmovdqu %%T_key, [%%GDATA_KEY+16*0] %assign i (9-%%num_initial_blocks) %rep %%num_initial_blocks vpxor reg(i),%%T_key %assign i (i+1) %endrep %assign j 1 %rep NROUNDS vmovdqu %%T_key, [%%GDATA_KEY+16*j] %assign i (9-%%num_initial_blocks) %rep %%num_initial_blocks vaesenc reg(i),%%T_key %assign i (i+1) %endrep %assign j (j+1) %endrep ; NROUNDS vmovdqu %%T_key, [%%GDATA_KEY+16*j] %assign i (9-%%num_initial_blocks) %rep %%num_initial_blocks vaesenclast reg(i),%%T_key %assign i (i+1) %endrep %assign i (9-%%num_initial_blocks) %rep %%num_initial_blocks VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] vpxor reg(i), %%T1 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks add %%DATA_OFFSET, 16 %ifidn %%ENC_DEC, DEC vmovdqa reg(i), %%T1 %endif vpshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations %assign i (i+1) %endrep %assign i (8-%%num_initial_blocks) %assign j (9-%%num_initial_blocks) %rep %%num_initial_blocks vpxor reg(j), reg(i) GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks %assign i (i+1) %assign j (j+1) %endrep ; %%XMM8 has the current Hash Value vmovdqa %%T3, %%XMM8 cmp %%LENGTH, 128 jl %%_initial_blocks_done ; no need for precomputed constants ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Haskey_i_k holds XORed values of the low and high parts of the Haskey_i vpaddd %%CTR, [ONE] ; INCR Y0 vmovdqa %%XMM1, %%CTR vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap vpaddd %%CTR, [ONE] ; INCR Y0 vmovdqa %%XMM2, %%CTR vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap vpaddd %%CTR, [ONE] ; INCR Y0 vmovdqa %%XMM3, %%CTR vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap vpaddd %%CTR, [ONE] ; INCR Y0 vmovdqa %%XMM4, %%CTR vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap vpaddd %%CTR, [ONE] ; INCR Y0 vmovdqa %%XMM5, %%CTR vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap vpaddd %%CTR, [ONE] ; INCR Y0 vmovdqa %%XMM6, %%CTR vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap vpaddd %%CTR, [ONE] ; INCR Y0 vmovdqa %%XMM7, %%CTR vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap vpaddd %%CTR, [ONE] ; INCR Y0 vmovdqa %%XMM8, %%CTR vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap vmovdqu %%T_key, [%%GDATA_KEY+16*0] vpxor %%XMM1, %%T_key vpxor %%XMM2, %%T_key vpxor %%XMM3, %%T_key vpxor %%XMM4, %%T_key vpxor %%XMM5, %%T_key vpxor %%XMM6, %%T_key vpxor %%XMM7, %%T_key vpxor %%XMM8, %%T_key %assign i 1 %rep NROUNDS vmovdqu %%T_key, 
[%%GDATA_KEY+16*i] vaesenc %%XMM1, %%T_key vaesenc %%XMM2, %%T_key vaesenc %%XMM3, %%T_key vaesenc %%XMM4, %%T_key vaesenc %%XMM5, %%T_key vaesenc %%XMM6, %%T_key vaesenc %%XMM7, %%T_key vaesenc %%XMM8, %%T_key %assign i (i+1) %endrep vmovdqu %%T_key, [%%GDATA_KEY+16*i] vaesenclast %%XMM1, %%T_key vaesenclast %%XMM2, %%T_key vaesenclast %%XMM3, %%T_key vaesenclast %%XMM4, %%T_key vaesenclast %%XMM5, %%T_key vaesenclast %%XMM6, %%T_key vaesenclast %%XMM7, %%T_key vaesenclast %%XMM8, %%T_key VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0] vpxor %%XMM1, %%T1 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1 %ifidn %%ENC_DEC, DEC vmovdqa %%XMM1, %%T1 %endif VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1] vpxor %%XMM2, %%T1 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2 %ifidn %%ENC_DEC, DEC vmovdqa %%XMM2, %%T1 %endif VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2] vpxor %%XMM3, %%T1 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3 %ifidn %%ENC_DEC, DEC vmovdqa %%XMM3, %%T1 %endif VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3] vpxor %%XMM4, %%T1 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4 %ifidn %%ENC_DEC, DEC vmovdqa %%XMM4, %%T1 %endif VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4] vpxor %%XMM5, %%T1 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5 %ifidn %%ENC_DEC, DEC vmovdqa %%XMM5, %%T1 %endif VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5] vpxor %%XMM6, %%T1 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6 %ifidn %%ENC_DEC, DEC vmovdqa %%XMM6, %%T1 %endif VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6] vpxor %%XMM7, %%T1 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7 %ifidn %%ENC_DEC, DEC vmovdqa %%XMM7, %%T1 %endif VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7] vpxor %%XMM8, %%T1 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8 %ifidn %%ENC_DEC, DEC vmovdqa %%XMM8, %%T1 %endif add %%DATA_OFFSET, 128 vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap vpxor %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %%_initial_blocks_done: %endmacro ; encrypt 8 blocks at a time ; ghash the 8 previously encrypted ciphertext blocks ; %%GDATA - (GCM key data), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified ; r11 is the data offset value %macro GHASH_8_ENCRYPT_8_PARALLEL 22 %define %%GDATA %1 %define %%CYPH_PLAIN_OUT %2 %define %%PLAIN_CYPH_IN %3 %define %%DATA_OFFSET %4 %define %%T1 %5 %define %%T2 %6 %define %%T3 %7 %define %%T4 %8 %define %%T5 %9 %define %%T6 %10 %define %%CTR %11 %define %%XMM1 %12 %define %%XMM2 %13 %define %%XMM3 %14 %define %%XMM4 %15 %define %%XMM5 %16 %define %%XMM6 %17 %define %%XMM7 %18 %define %%XMM8 %19 %define %%T7 %20 %define %%loop_idx %21 %define %%ENC_DEC %22 vmovdqa %%T2, %%XMM1 vmovdqu [rsp + TMP2], %%XMM2 vmovdqu [rsp + TMP3], %%XMM3 vmovdqu [rsp + TMP4], %%XMM4 vmovdqu [rsp + TMP5], %%XMM5 vmovdqu [rsp + TMP6], %%XMM6 vmovdqu [rsp + TMP7], %%XMM7 vmovdqu [rsp + TMP8], %%XMM8 %ifidn %%loop_idx, in_order vpaddd %%XMM1, %%CTR, 
[ONE] ; INCR CNT vpaddd %%XMM2, %%XMM1, [ONE] vpaddd %%XMM3, %%XMM2, [ONE] vpaddd %%XMM4, %%XMM3, [ONE] vpaddd %%XMM5, %%XMM4, [ONE] vpaddd %%XMM6, %%XMM5, [ONE] vpaddd %%XMM7, %%XMM6, [ONE] vpaddd %%XMM8, %%XMM7, [ONE] vmovdqa %%CTR, %%XMM8 vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap %else vpaddd %%XMM1, %%CTR, [ONEf] ; INCR CNT vpaddd %%XMM2, %%XMM1, [ONEf] vpaddd %%XMM3, %%XMM2, [ONEf] vpaddd %%XMM4, %%XMM3, [ONEf] vpaddd %%XMM5, %%XMM4, [ONEf] vpaddd %%XMM6, %%XMM5, [ONEf] vpaddd %%XMM7, %%XMM6, [ONEf] vpaddd %%XMM8, %%XMM7, [ONEf] vmovdqa %%CTR, %%XMM8 %endif ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; vmovdqu %%T1, [%%GDATA + 16*0] vpxor %%XMM1, %%T1 vpxor %%XMM2, %%T1 vpxor %%XMM3, %%T1 vpxor %%XMM4, %%T1 vpxor %%XMM5, %%T1 vpxor %%XMM6, %%T1 vpxor %%XMM7, %%T1 vpxor %%XMM8, %%T1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; vmovdqu %%T1, [%%GDATA + 16*1] vaesenc %%XMM1, %%T1 vaesenc %%XMM2, %%T1 vaesenc %%XMM3, %%T1 vaesenc %%XMM4, %%T1 vaesenc %%XMM5, %%T1 vaesenc %%XMM6, %%T1 vaesenc %%XMM7, %%T1 vaesenc %%XMM8, %%T1 vmovdqu %%T1, [%%GDATA + 16*2] vaesenc %%XMM1, %%T1 vaesenc %%XMM2, %%T1 vaesenc %%XMM3, %%T1 vaesenc %%XMM4, %%T1 vaesenc %%XMM5, %%T1 vaesenc %%XMM6, %%T1 vaesenc %%XMM7, %%T1 vaesenc %%XMM8, %%T1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; vmovdqu %%T5, [%%GDATA + HashKey_8] vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1 vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0 vpshufd %%T6, %%T2, 01001110b vpxor %%T6, %%T2 vmovdqu %%T5, [%%GDATA + HashKey_8_k] vpclmulqdq %%T6, %%T6, %%T5, 0x00 ; vmovdqu %%T1, [%%GDATA + 16*3] vaesenc %%XMM1, %%T1 vaesenc %%XMM2, %%T1 vaesenc %%XMM3, %%T1 vaesenc %%XMM4, %%T1 vaesenc %%XMM5, %%T1 vaesenc %%XMM6, %%T1 vaesenc %%XMM7, %%T1 vaesenc %%XMM8, %%T1 vmovdqu %%T1, [rsp + TMP2] vmovdqu %%T5, [%%GDATA + HashKey_7] vpclmulqdq %%T3, %%T1, %%T5, 0x11 vpxor %%T4, %%T4, %%T3 vpclmulqdq %%T3, %%T1, %%T5, 0x00 vpxor %%T7, %%T7, %%T3 vpshufd %%T3, %%T1, 01001110b vpxor %%T3, %%T1 vmovdqu %%T5, [%%GDATA + HashKey_7_k] vpclmulqdq %%T3, %%T3, %%T5, 0x10 vpxor %%T6, %%T6, %%T3 vmovdqu %%T1, [%%GDATA + 16*4] vaesenc %%XMM1, %%T1 vaesenc %%XMM2, %%T1 vaesenc %%XMM3, %%T1 vaesenc %%XMM4, %%T1 vaesenc %%XMM5, %%T1 vaesenc %%XMM6, %%T1 vaesenc %%XMM7, %%T1 vaesenc %%XMM8, %%T1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; vmovdqu %%T1, [rsp + TMP3] vmovdqu %%T5, [%%GDATA + HashKey_6] vpclmulqdq %%T3, %%T1, %%T5, 0x11 vpxor %%T4, %%T4, %%T3 vpclmulqdq %%T3, %%T1, %%T5, 0x00 vpxor %%T7, %%T7, %%T3 vpshufd %%T3, %%T1, 01001110b vpxor %%T3, %%T1 vmovdqu %%T5, [%%GDATA + HashKey_6_k] vpclmulqdq %%T3, %%T3, %%T5, 0x10 vpxor %%T6, %%T6, %%T3 vmovdqu %%T1, [%%GDATA + 16*5] vaesenc %%XMM1, %%T1 vaesenc %%XMM2, %%T1 vaesenc %%XMM3, %%T1 vaesenc %%XMM4, %%T1 vaesenc %%XMM5, %%T1 vaesenc %%XMM6, %%T1 vaesenc %%XMM7, %%T1 vaesenc %%XMM8, %%T1 vmovdqu %%T1, [rsp + TMP4] vmovdqu %%T5, [%%GDATA + HashKey_5] vpclmulqdq %%T3, %%T1, %%T5, 0x11 vpxor %%T4, %%T4, %%T3 vpclmulqdq %%T3, %%T1, %%T5, 0x00 
vpxor %%T7, %%T7, %%T3 vpshufd %%T3, %%T1, 01001110b vpxor %%T3, %%T1 vmovdqu %%T5, [%%GDATA + HashKey_5_k] vpclmulqdq %%T3, %%T3, %%T5, 0x10 vpxor %%T6, %%T6, %%T3 vmovdqu %%T1, [%%GDATA + 16*6] vaesenc %%XMM1, %%T1 vaesenc %%XMM2, %%T1 vaesenc %%XMM3, %%T1 vaesenc %%XMM4, %%T1 vaesenc %%XMM5, %%T1 vaesenc %%XMM6, %%T1 vaesenc %%XMM7, %%T1 vaesenc %%XMM8, %%T1 vmovdqu %%T1, [rsp + TMP5] vmovdqu %%T5, [%%GDATA + HashKey_4] vpclmulqdq %%T3, %%T1, %%T5, 0x11 vpxor %%T4, %%T4, %%T3 vpclmulqdq %%T3, %%T1, %%T5, 0x00 vpxor %%T7, %%T7, %%T3 vpshufd %%T3, %%T1, 01001110b vpxor %%T3, %%T1 vmovdqu %%T5, [%%GDATA + HashKey_4_k] vpclmulqdq %%T3, %%T3, %%T5, 0x10 vpxor %%T6, %%T6, %%T3 vmovdqu %%T1, [%%GDATA + 16*7] vaesenc %%XMM1, %%T1 vaesenc %%XMM2, %%T1 vaesenc %%XMM3, %%T1 vaesenc %%XMM4, %%T1 vaesenc %%XMM5, %%T1 vaesenc %%XMM6, %%T1 vaesenc %%XMM7, %%T1 vaesenc %%XMM8, %%T1 vmovdqu %%T1, [rsp + TMP6] vmovdqu %%T5, [%%GDATA + HashKey_3] vpclmulqdq %%T3, %%T1, %%T5, 0x11 vpxor %%T4, %%T4, %%T3 vpclmulqdq %%T3, %%T1, %%T5, 0x00 vpxor %%T7, %%T7, %%T3 vpshufd %%T3, %%T1, 01001110b vpxor %%T3, %%T1 vmovdqu %%T5, [%%GDATA + HashKey_3_k] vpclmulqdq %%T3, %%T3, %%T5, 0x10 vpxor %%T6, %%T6, %%T3 vmovdqu %%T1, [%%GDATA + 16*8] vaesenc %%XMM1, %%T1 vaesenc %%XMM2, %%T1 vaesenc %%XMM3, %%T1 vaesenc %%XMM4, %%T1 vaesenc %%XMM5, %%T1 vaesenc %%XMM6, %%T1 vaesenc %%XMM7, %%T1 vaesenc %%XMM8, %%T1 vmovdqu %%T1, [rsp + TMP7] vmovdqu %%T5, [%%GDATA + HashKey_2] vpclmulqdq %%T3, %%T1, %%T5, 0x11 vpxor %%T4, %%T4, %%T3 vpclmulqdq %%T3, %%T1, %%T5, 0x00 vpxor %%T7, %%T7, %%T3 vpshufd %%T3, %%T1, 01001110b vpxor %%T3, %%T1 vmovdqu %%T5, [%%GDATA + HashKey_2_k] vpclmulqdq %%T3, %%T3, %%T5, 0x10 vpxor %%T6, %%T6, %%T3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; vmovdqu %%T5, [%%GDATA + 16*9] vaesenc %%XMM1, %%T5 vaesenc %%XMM2, %%T5 vaesenc %%XMM3, %%T5 vaesenc %%XMM4, %%T5 vaesenc %%XMM5, %%T5 vaesenc %%XMM6, %%T5 vaesenc %%XMM7, %%T5 vaesenc %%XMM8, %%T5 vmovdqu %%T1, [rsp + TMP8] vmovdqu %%T5, [%%GDATA + HashKey] vpclmulqdq %%T3, %%T1, %%T5, 0x11 vpxor %%T4, %%T4, %%T3 vpclmulqdq %%T3, %%T1, %%T5, 0x00 vpxor %%T7, %%T7, %%T3 vpshufd %%T3, %%T1, 01001110b vpxor %%T3, %%T1 vmovdqu %%T5, [%%GDATA + HashKey_k] vpclmulqdq %%T3, %%T3, %%T5, 0x10 vpxor %%T6, %%T6, %%T3 vpxor %%T6, %%T4 vpxor %%T6, %%T7 %ifdef GCM128_MODE vmovdqu %%T5, [%%GDATA + 16*10] %endif %ifdef GCM192_MODE vmovdqu %%T5, [%%GDATA + 16*10] vaesenc %%XMM1, %%T5 vaesenc %%XMM2, %%T5 vaesenc %%XMM3, %%T5 vaesenc %%XMM4, %%T5 vaesenc %%XMM5, %%T5 vaesenc %%XMM6, %%T5 vaesenc %%XMM7, %%T5 vaesenc %%XMM8, %%T5 vmovdqu %%T5, [%%GDATA + 16*11] vaesenc %%XMM1, %%T5 vaesenc %%XMM2, %%T5 vaesenc %%XMM3, %%T5 vaesenc %%XMM4, %%T5 vaesenc %%XMM5, %%T5 vaesenc %%XMM6, %%T5 vaesenc %%XMM7, %%T5 vaesenc %%XMM8, %%T5 vmovdqu %%T5, [%%GDATA + 16*12] %endif %ifdef GCM256_MODE vmovdqu %%T5, [%%GDATA + 16*10] vaesenc %%XMM1, %%T5 vaesenc %%XMM2, %%T5 vaesenc %%XMM3, %%T5 vaesenc %%XMM4, %%T5 vaesenc %%XMM5, %%T5 vaesenc %%XMM6, %%T5 vaesenc %%XMM7, %%T5 vaesenc %%XMM8, %%T5 vmovdqu %%T5, [%%GDATA + 16*11] vaesenc %%XMM1, %%T5 vaesenc %%XMM2, %%T5 vaesenc %%XMM3, %%T5 vaesenc %%XMM4, %%T5 vaesenc %%XMM5, %%T5 vaesenc %%XMM6, %%T5 vaesenc %%XMM7, %%T5 vaesenc %%XMM8, %%T5 vmovdqu %%T5, [%%GDATA + 16*12] vaesenc %%XMM1, %%T5 vaesenc %%XMM2, %%T5 vaesenc %%XMM3, %%T5 vaesenc %%XMM4, %%T5 vaesenc %%XMM5, %%T5 vaesenc %%XMM6, %%T5 vaesenc %%XMM7, %%T5 vaesenc %%XMM8, %%T5 vmovdqu %%T5, [%%GDATA + 16*13] vaesenc %%XMM1, %%T5 vaesenc 
%%XMM2, %%T5 vaesenc %%XMM3, %%T5 vaesenc %%XMM4, %%T5 vaesenc %%XMM5, %%T5 vaesenc %%XMM6, %%T5 vaesenc %%XMM7, %%T5 vaesenc %%XMM8, %%T5 vmovdqu %%T5, [%%GDATA + 16*14] %endif %assign i 0 %assign j 1 %rep 8 %ifidn %%ENC_DEC, ENC %ifdef NT_LD VXLDR %%T2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i] vpxor %%T2, %%T2, %%T5 %else vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i] %endif ; NT_LD vaesenclast reg(j), reg(j), %%T2 %else VXLDR %%T2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i] vpxor %%T2, %%T2, %%T5 vaesenclast %%T3, reg(j), %%T2 vpxor reg(j), %%T2, %%T5 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*i], %%T3 %endif ; %%ENC_DEC %assign i (i+1) %assign j (j+1) %endrep vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs vpsrldq %%T6, %%T6, 8 ; shift-R %%T2 2 DWs vpxor %%T7, %%T3 vpxor %%T6, %%T4 ; accumulate the results in %%T6:%%T7 ;first phase of the reduction vpslld %%T2, %%T7, 31 ; packed right shifting << 31 vpslld %%T3, %%T7, 30 ; packed right shifting shift << 30 vpslld %%T4, %%T7, 25 ; packed right shifting shift << 25 vpxor %%T2, %%T2, %%T3 ; xor the shifted versions vpxor %%T2, %%T2, %%T4 vpsrldq %%T1, %%T2, 4 ; shift-R %%T1 1 DW vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs vpxor %%T7, %%T2 ; first phase of the reduction complete ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %ifidn %%ENC_DEC, ENC VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1 ; Write to the Ciphertext buffer VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2 ; Write to the Ciphertext buffer VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3 ; Write to the Ciphertext buffer VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4 ; Write to the Ciphertext buffer VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5 ; Write to the Ciphertext buffer VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6 ; Write to the Ciphertext buffer VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7 ; Write to the Ciphertext buffer VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8 ; Write to the Ciphertext buffer %endif ;second phase of the reduction vpsrld %%T2,%%T7,1 ; packed left shifting >> 1 vpsrld %%T3,%%T7,2 ; packed left shifting >> 2 vpsrld %%T4,%%T7,7 ; packed left shifting >> 7 vpxor %%T2, %%T2,%%T3 ; xor the shifted versions vpxor %%T2, %%T2,%%T4 vpxor %%T2, %%T2, %%T1 vpxor %%T7, %%T7, %%T2 vpxor %%T6, %%T6, %%T7 ; the result is in %%T6 vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM2, [SHUF_MASK] vpshufb %%XMM3, [SHUF_MASK] vpshufb %%XMM4, [SHUF_MASK] vpshufb %%XMM5, [SHUF_MASK] vpshufb %%XMM6, [SHUF_MASK] vpshufb %%XMM7, [SHUF_MASK] vpshufb %%XMM8, [SHUF_MASK] vpxor %%XMM1, %%T6 %endmacro ; GHASH the last 4 ciphertext blocks. 
; %%GDATA is GCM key data %macro GHASH_LAST_8 16 %define %%GDATA %1 %define %%T1 %2 %define %%T2 %3 %define %%T3 %4 %define %%T4 %5 %define %%T5 %6 %define %%T6 %7 %define %%T7 %8 %define %%XMM1 %9 %define %%XMM2 %10 %define %%XMM3 %11 %define %%XMM4 %12 %define %%XMM5 %13 %define %%XMM6 %14 %define %%XMM7 %15 %define %%XMM8 %16 ;; Karatsuba Method vpshufd %%T2, %%XMM1, 01001110b vpxor %%T2, %%XMM1 vmovdqu %%T5, [%%GDATA + HashKey_8] vpclmulqdq %%T6, %%XMM1, %%T5, 0x11 vpclmulqdq %%T7, %%XMM1, %%T5, 0x00 vmovdqu %%T3, [%%GDATA + HashKey_8_k] vpclmulqdq %%XMM1, %%T2, %%T3, 0x00 ;;;;;;;;;;;;;;;;;;;;;; vpshufd %%T2, %%XMM2, 01001110b vpxor %%T2, %%XMM2 vmovdqu %%T5, [%%GDATA + HashKey_7] vpclmulqdq %%T4, %%XMM2, %%T5, 0x11 vpxor %%T6, %%T6, %%T4 vpclmulqdq %%T4, %%XMM2, %%T5, 0x00 vpxor %%T7, %%T7, %%T4 vmovdqu %%T3, [%%GDATA + HashKey_7_k] vpclmulqdq %%T2, %%T2, %%T3, 0x00 vpxor %%XMM1, %%XMM1, %%T2 ;;;;;;;;;;;;;;;;;;;;;; vpshufd %%T2, %%XMM3, 01001110b vpxor %%T2, %%XMM3 vmovdqu %%T5, [%%GDATA + HashKey_6] vpclmulqdq %%T4, %%XMM3, %%T5, 0x11 vpxor %%T6, %%T6, %%T4 vpclmulqdq %%T4, %%XMM3, %%T5, 0x00 vpxor %%T7, %%T7, %%T4 vmovdqu %%T3, [%%GDATA + HashKey_6_k] vpclmulqdq %%T2, %%T2, %%T3, 0x00 vpxor %%XMM1, %%XMM1, %%T2 ;;;;;;;;;;;;;;;;;;;;;; vpshufd %%T2, %%XMM4, 01001110b vpxor %%T2, %%XMM4 vmovdqu %%T5, [%%GDATA + HashKey_5] vpclmulqdq %%T4, %%XMM4, %%T5, 0x11 vpxor %%T6, %%T6, %%T4 vpclmulqdq %%T4, %%XMM4, %%T5, 0x00 vpxor %%T7, %%T7, %%T4 vmovdqu %%T3, [%%GDATA + HashKey_5_k] vpclmulqdq %%T2, %%T2, %%T3, 0x00 vpxor %%XMM1, %%XMM1, %%T2 ;;;;;;;;;;;;;;;;;;;;;; vpshufd %%T2, %%XMM5, 01001110b vpxor %%T2, %%XMM5 vmovdqu %%T5, [%%GDATA + HashKey_4] vpclmulqdq %%T4, %%XMM5, %%T5, 0x11 vpxor %%T6, %%T6, %%T4 vpclmulqdq %%T4, %%XMM5, %%T5, 0x00 vpxor %%T7, %%T7, %%T4 vmovdqu %%T3, [%%GDATA + HashKey_4_k] vpclmulqdq %%T2, %%T2, %%T3, 0x00 vpxor %%XMM1, %%XMM1, %%T2 ;;;;;;;;;;;;;;;;;;;;;; vpshufd %%T2, %%XMM6, 01001110b vpxor %%T2, %%XMM6 vmovdqu %%T5, [%%GDATA + HashKey_3] vpclmulqdq %%T4, %%XMM6, %%T5, 0x11 vpxor %%T6, %%T6, %%T4 vpclmulqdq %%T4, %%XMM6, %%T5, 0x00 vpxor %%T7, %%T7, %%T4 vmovdqu %%T3, [%%GDATA + HashKey_3_k] vpclmulqdq %%T2, %%T2, %%T3, 0x00 vpxor %%XMM1, %%XMM1, %%T2 ;;;;;;;;;;;;;;;;;;;;;; vpshufd %%T2, %%XMM7, 01001110b vpxor %%T2, %%XMM7 vmovdqu %%T5, [%%GDATA + HashKey_2] vpclmulqdq %%T4, %%XMM7, %%T5, 0x11 vpxor %%T6, %%T6, %%T4 vpclmulqdq %%T4, %%XMM7, %%T5, 0x00 vpxor %%T7, %%T7, %%T4 vmovdqu %%T3, [%%GDATA + HashKey_2_k] vpclmulqdq %%T2, %%T2, %%T3, 0x00 vpxor %%XMM1, %%XMM1, %%T2 ;;;;;;;;;;;;;;;;;;;;;; vpshufd %%T2, %%XMM8, 01001110b vpxor %%T2, %%XMM8 vmovdqu %%T5, [%%GDATA + HashKey] vpclmulqdq %%T4, %%XMM8, %%T5, 0x11 vpxor %%T6, %%T6, %%T4 vpclmulqdq %%T4, %%XMM8, %%T5, 0x00 vpxor %%T7, %%T7, %%T4 vmovdqu %%T3, [%%GDATA + HashKey_k] vpclmulqdq %%T2, %%T2, %%T3, 0x00 vpxor %%XMM1, %%XMM1, %%T2 vpxor %%XMM1, %%XMM1, %%T6 vpxor %%T2, %%XMM1, %%T7 vpslldq %%T4, %%T2, 8 vpsrldq %%T2, %%T2, 8 vpxor %%T7, %%T4 vpxor %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications ;first phase of the reduction vpslld %%T2, %%T7, 31 ; packed right shifting << 31 vpslld %%T3, %%T7, 30 ; packed right shifting shift << 30 vpslld %%T4, %%T7, 25 ; packed right shifting shift << 25 vpxor %%T2, %%T2, %%T3 ; xor the shifted versions vpxor %%T2, %%T2, %%T4 vpsrldq %%T1, %%T2, 4 ; shift-R %%T1 1 DW vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs vpxor %%T7, %%T2 ; first phase of the reduction complete 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;second phase of the reduction vpsrld %%T2,%%T7,1 ; packed left shifting >> 1 vpsrld %%T3,%%T7,2 ; packed left shifting >> 2 vpsrld %%T4,%%T7,7 ; packed left shifting >> 7 vpxor %%T2, %%T2,%%T3 ; xor the shifted versions vpxor %%T2, %%T2,%%T4 vpxor %%T2, %%T2, %%T1 vpxor %%T7, %%T7, %%T2 vpxor %%T6, %%T6, %%T7 ; the result is in %%T6 %endmacro ; Encryption of a single block ; %%GDATA is GCM key data %macro ENCRYPT_SINGLE_BLOCK 2 %define %%GDATA %1 %define %%XMM0 %2 vpxor %%XMM0, [%%GDATA+16*0] %assign i 1 %rep NROUNDS vaesenc %%XMM0, [%%GDATA+16*i] %assign i (i+1) %endrep ; NROUNDS vaesenclast %%XMM0, [%%GDATA+16*i] %endmacro ;; Start of Stack Setup %macro FUNC_SAVE 0 ;; Required for Update/GMC_ENC ;the number of pushes must equal STACK_OFFSET push r12 push r13 push r14 push r15 mov r14, rsp sub rsp, VARIABLE_OFFSET and rsp, ~63 %ifidn __OUTPUT_FORMAT__, win64 ; xmm6:xmm15 need to be maintained for Windows vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6 vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7 vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8 vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9 vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10 vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11 vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12 vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13 vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14 vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15 %endif %endmacro %macro FUNC_RESTORE 0 %ifidn __OUTPUT_FORMAT__, win64 vmovdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16] vmovdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16] vmovdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16] vmovdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16] vmovdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16] vmovdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16] vmovdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16] vmovdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16] vmovdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16] vmovdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16] %endif ;; Required for Update/GMC_ENC mov rsp, r14 pop r15 pop r14 pop r13 pop r12 %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding. ; Input: struct gcm_key_data *(GDATA_KEY), struct gcm_context_data *(GDATA_CTX), ; IV, Additional Authentication data (A_IN), Additional ; Data length (A_LEN) ; Output: Updated GDATA with the hash of A_IN (AadHash) and initialized other parts of GDATA. 
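; Note: the IV is read as 12 bytes and padded with 0x00000001 to form the
; initial counter block (the 96-bit IV construction).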
; Clobbers rax, r10-r13, and xmm0-xmm6 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro GCM_INIT 5 %define %%GDATA_KEY %1 %define %%GDATA_CTX %2 %define %%IV %3 %define %%A_IN %4 %define %%A_LEN %5 %define %%AAD_HASH xmm0 %define %%SUBHASH xmm1 vmovdqu %%SUBHASH, [%%GDATA_KEY + HashKey] CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax vpxor xmm2, xmm3 mov r10, %%A_LEN vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash mov [%%GDATA_CTX + AadLen], r10 ; ctx_data.aad_length = aad_length xor r10, r10 mov [%%GDATA_CTX + InLen], r10 ; ctx_data.in_length = 0 mov [%%GDATA_CTX + PBlockLen], r10 ; ctx_data.partial_block_length = 0 vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm2 ; ctx_data.partial_block_enc_key = 0 mov r10, %%IV vmovdqa xmm2, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001 vpinsrq xmm2, [r10], 0 vpinsrd xmm2, [r10+8], 2 vmovdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv vpshufb xmm2, [SHUF_MASK] vmovdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct ; has been initialized by GCM_INIT ; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA. ; Input: struct gcm_key_data* (GDATA_KEY), struct gcm_context_data * (GDATA_CTX), ; input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN), ; and whether encoding or decoding (ENC_DEC) ; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX ; Clobbers rax, r10-r15, and xmm0-xmm15 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro GCM_ENC_DEC 6 %define %%GDATA_KEY %1 %define %%GDATA_CTX %2 %define %%CYPH_PLAIN_OUT %3 %define %%PLAIN_CYPH_IN %4 %define %%PLAIN_CYPH_LEN %5 %define %%ENC_DEC %6 %define %%DATA_OFFSET r11 ; Macro flow: ; calculate the number of 16byte blocks in the message ; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted' ; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left' ; if there is a block of less tahn 16 bytes process it '%%_zero_cipher_left .. 
%%_multiple_of_16_bytes' cmp %%PLAIN_CYPH_LEN, 0 je %%_multiple_of_16_bytes xor %%DATA_OFFSET, %%DATA_OFFSET %ifidn __OUTPUT_FORMAT__, win64 mov rax, %%PLAIN_CYPH_LEN add [%%GDATA_CTX + InLen], rax ; Update length of data processed %else add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN ; Update length of data processed %endif vmovdqu xmm13, [%%GDATA_KEY + HashKey] ; xmm13 = HashKey vmovdqu xmm8, [%%GDATA_CTX + AadHash] PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC mov r13, %%PLAIN_CYPH_LEN sub r13, %%DATA_OFFSET mov r10, r13 ; save the amount of data left to process in r10 and r13, -16 ; r13 = r13 - (r13 mod 16) mov r12, r13 shr r12, 4 and r12, 7 jz %%_initial_num_blocks_is_0 cmp r12, 7 je %%_initial_num_blocks_is_7 cmp r12, 6 je %%_initial_num_blocks_is_6 cmp r12, 5 je %%_initial_num_blocks_is_5 cmp r12, 4 je %%_initial_num_blocks_is_4 cmp r12, 3 je %%_initial_num_blocks_is_3 cmp r12, 2 je %%_initial_num_blocks_is_2 jmp %%_initial_num_blocks_is_1 %%_initial_num_blocks_is_7: INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC sub r13, 16*7 jmp %%_initial_blocks_encrypted %%_initial_num_blocks_is_6: INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC sub r13, 16*6 jmp %%_initial_blocks_encrypted %%_initial_num_blocks_is_5: INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC sub r13, 16*5 jmp %%_initial_blocks_encrypted %%_initial_num_blocks_is_4: INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC sub r13, 16*4 jmp %%_initial_blocks_encrypted %%_initial_num_blocks_is_3: INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC sub r13, 16*3 jmp %%_initial_blocks_encrypted %%_initial_num_blocks_is_2: INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC sub r13, 16*2 jmp %%_initial_blocks_encrypted %%_initial_num_blocks_is_1: INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC sub r13, 16 jmp %%_initial_blocks_encrypted %%_initial_num_blocks_is_0: INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC %%_initial_blocks_encrypted: cmp r13, 0 je %%_zero_cipher_left sub r13, 128 je %%_eight_cipher_left vmovd r15d, xmm9 and r15d, 255 vpshufb xmm9, [SHUF_MASK] %%_encrypt_by_8_new: cmp r15d, 255-8 jg %%_encrypt_by_8 add r15b, 8 GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, 
%%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC add %%DATA_OFFSET, 128 sub r13, 128 jne %%_encrypt_by_8_new vpshufb xmm9, [SHUF_MASK] jmp %%_eight_cipher_left %%_encrypt_by_8: vpshufb xmm9, [SHUF_MASK] add r15b, 8 GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN,%%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC vpshufb xmm9, [SHUF_MASK] add %%DATA_OFFSET, 128 sub r13, 128 jne %%_encrypt_by_8_new vpshufb xmm9, [SHUF_MASK] %%_eight_cipher_left: GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8 %%_zero_cipher_left: vmovdqu [%%GDATA_CTX + AadHash], xmm14 ; ctx_data.aad hash = xmm14 vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; ctx_data.current_counter = xmm9 mov r13, r10 and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16) je %%_multiple_of_16_bytes mov [%%GDATA_CTX + PBlockLen], r13 ; ctx_data.partial_blck_length = r13 ; handle the last <16 Byte block seperately vpaddd xmm9, [ONE] ; INCR CNT to get Yn vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9 vpshufb xmm9, [SHUF_MASK] ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Yn) vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm9 ; ctx_data.partial_block_enc_key = xmm9 cmp %%PLAIN_CYPH_LEN, 16 jge %%_large_enough_update lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax lea r12, [SHIFT_MASK + 16] sub r12, r13 jmp %%_data_read %%_large_enough_update: sub %%DATA_OFFSET, 16 add %%DATA_OFFSET, r13 vmovdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block sub %%DATA_OFFSET, r13 add %%DATA_OFFSET, 16 lea r12, [SHIFT_MASK + 16] sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16) vmovdqu xmm2, [r12] ; get the appropriate shuffle mask vpshufb xmm1, xmm2 ; shift right 16-r13 bytes %%_data_read: %ifidn %%ENC_DEC, DEC vmovdqa xmm2, xmm1 vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9 vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9 vpand xmm2, xmm1 vpshufb xmm2, [SHUF_MASK] vpxor xmm14, xmm2 vmovdqu [%%GDATA_CTX + AadHash], xmm14 %else vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9 vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9 vpshufb xmm9, [SHUF_MASK] vpxor xmm14, xmm9 vmovdqu [%%GDATA_CTX + AadHash], xmm14 vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext %endif ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; output r13 Bytes vmovq rax, xmm9 cmp r13, 8 jle %%_less_than_8_bytes_left mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax add %%DATA_OFFSET, 8 vpsrldq xmm9, xmm9, 8 vmovq rax, xmm9 sub r13, 8 %%_less_than_8_bytes_left: mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al add %%DATA_OFFSET, 1 shr rax, 8 sub r13, 1 jne %%_less_than_8_bytes_left ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %%_multiple_of_16_bytes: %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; GCM_COMPLETE Finishes Encyrption/Decryption of last partial block after GCM_UPDATE finishes. ; Input: struct gcm_key_data* (GDATA_KEY), struct gcm_context_data *(GDATA_CTX) and ; whether encoding or decoding (ENC_DEC). 
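; The tag is computed as E(K, Y0) XOR GHASH(A, C), with len(A)||len(C)
; folded into the hash last.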
; Output: Authorization Tag (AUTH_TAG) and Authorization Tag length (AUTH_TAG_LEN) ; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro GCM_COMPLETE 5 %define %%GDATA_KEY %1 %define %%GDATA_CTX %2 %define %%AUTH_TAG %3 %define %%AUTH_TAG_LEN %4 %define %%ENC_DEC %5 %define %%PLAIN_CYPH_LEN rax mov r12, [%%GDATA_CTX + PBlockLen] vmovdqu xmm14, [%%GDATA_CTX + AadHash] vmovdqu xmm13, [%%GDATA_KEY + HashKey] cmp r12, 0 je %%_partial_done GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block vmovdqu [%%GDATA_CTX + AadHash], xmm14 %%_partial_done: mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes) mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen] shl r12, 3 ; convert into number of bits vmovd xmm15, r12d ; len(A) in xmm15 shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*128) vmovq xmm1, %%PLAIN_CYPH_LEN vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000 vpxor xmm15, xmm1 ; xmm15 = len(A)||len(C) vpxor xmm14, xmm15 GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation vpshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0 ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0) vpxor xmm9, xmm14 %%_return_T: mov r10, %%AUTH_TAG ; r10 = authTag mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len cmp r11, 16 je %%_T_16 cmp r11, 12 je %%_T_12 %%_T_8: vmovq rax, xmm9 mov [r10], rax jmp %%_return_T_done %%_T_12: vmovq rax, xmm9 mov [r10], rax vpsrldq xmm9, xmm9, 8 vmovd eax, xmm9 mov [r10 + 8], eax jmp %%_return_T_done %%_T_16: vmovdqu [r10], xmm9 %%_return_T_done: %endmacro ; GCM_COMPLETE ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void aes_gcm_precomp_128_avx_gen2 ; (struct gcm_key_data *key_data); ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MKGLOBAL(FN_NAME(precomp,_),function,) FN_NAME(precomp,_): push r12 push r13 push r14 push r15 mov r14, rsp sub rsp, VARIABLE_OFFSET and rsp, ~63 ; align rsp to 64 bytes %ifidn __OUTPUT_FORMAT__, win64 ; only xmm6 needs to be maintained vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6 %endif vpxor xmm6, xmm6 ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey vpshufb xmm6, [SHUF_MASK] ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;; vmovdqa xmm2, xmm6 vpsllq xmm6, 1 vpsrlq xmm2, 63 vmovdqa xmm1, xmm2 vpslldq xmm2, xmm2, 8 vpsrldq xmm1, xmm1, 8 vpor xmm6, xmm2 ;reduction vpshufd xmm2, xmm1, 00100100b vpcmpeqd xmm2, [TWOONE] vpand xmm2, [POLY] vpxor xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 %ifidn __OUTPUT_FORMAT__, win64 vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16] %endif mov rsp, r14 pop r15 pop r14 pop r13 pop r12 ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void aes_gcm_init_128_avx_gen2( ; const struct gcm_key_data *key_data, ; struct gcm_context_data *context_data, ; u8 *iv, ; const u8 *aad, ; u64 aad_len); ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MKGLOBAL(FN_NAME(init,_),function,) FN_NAME(init,_): push r12 push r13 %ifidn __OUTPUT_FORMAT__, win64 push r14 push r15 mov r14, rsp ; xmm6:xmm15 need to be maintained for Windows sub rsp, 1*16 
movdqu [rsp + 0*16], xmm6 %endif GCM_INIT arg1, arg2, arg3, arg4, arg5 %ifidn __OUTPUT_FORMAT__, win64 movdqu xmm6 , [rsp + 0*16] mov rsp, r14 pop r15 pop r14 %endif pop r13 pop r12 ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void aes_gcm_enc_128_update_avx_gen2( ; const struct gcm_key_data *key_data, ; struct gcm_context_data *context_data, ; u8 *out, ; const u8 *in, ; u64 plaintext_len); ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MKGLOBAL(FN_NAME(enc,_update_),function,) FN_NAME(enc,_update_): FUNC_SAVE GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC FUNC_RESTORE ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void aes_gcm_dec_128_update_avx_gen2( ; const struct gcm_key_data *key_data, ; struct gcm_context_data *context_data, ; u8 *out, ; const u8 *in, ; u64 plaintext_len); ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MKGLOBAL(FN_NAME(dec,_update_),function,) FN_NAME(dec,_update_): FUNC_SAVE GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC FUNC_RESTORE ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void aes_gcm_enc_128_finalize_avx_gen2( ; const struct gcm_key_data *key_data, ; struct gcm_context_data *context_data, ; u8 *auth_tag, ; u64 auth_tag_len); ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MKGLOBAL(FN_NAME(enc,_finalize_),function,) FN_NAME(enc,_finalize_): push r12 %ifidn __OUTPUT_FORMAT__, win64 ; xmm6:xmm15 need to be maintained for Windows sub rsp, 5*16 vmovdqu [rsp + 0*16],xmm6 vmovdqu [rsp + 1*16],xmm9 vmovdqu [rsp + 2*16],xmm11 vmovdqu [rsp + 3*16],xmm14 vmovdqu [rsp + 4*16],xmm15 %endif GCM_COMPLETE arg1, arg2, arg3, arg4, ENC %ifidn __OUTPUT_FORMAT__, win64 vmovdqu xmm15 , [rsp + 4*16] vmovdqu xmm14 , [rsp + 3*16] vmovdqu xmm11 , [rsp + 2*16] vmovdqu xmm9 , [rsp + 1*16] vmovdqu xmm6 , [rsp + 0*16] add rsp, 5*16 %endif pop r12 ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void aes_gcm_dec_128_finalize_avx_gen2( ; const struct gcm_key_data *key_data, ; struct gcm_context_data *context_data, ; u8 *auth_tag, ; u64 auth_tag_len); ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MKGLOBAL(FN_NAME(dec,_finalize_),function,) FN_NAME(dec,_finalize_): push r12 %ifidn __OUTPUT_FORMAT__, win64 ; xmm6:xmm15 need to be maintained for Windows sub rsp, 5*16 vmovdqu [rsp + 0*16],xmm6 vmovdqu [rsp + 1*16],xmm9 vmovdqu [rsp + 2*16],xmm11 vmovdqu [rsp + 3*16],xmm14 vmovdqu [rsp + 4*16],xmm15 %endif GCM_COMPLETE arg1, arg2, arg3, arg4, DEC %ifidn __OUTPUT_FORMAT__, win64 vmovdqu xmm15 , [rsp + 4*16] vmovdqu xmm14 , [rsp + 3*16] vmovdqu xmm11 , [rsp + 2*16] vmovdqu xmm9 , [rsp + 1*16] vmovdqu xmm6 , [rsp + 0*16] add rsp, 5*16 %endif pop r12 ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void aes_gcm_enc_128_avx_gen2( ; const struct gcm_key_data *key_data, ; struct gcm_context_data *context_data, ; u8 *out, ; const u8 *in, ; u64 plaintext_len, ; u8 *iv, ; const u8 *aad, ; u64 aad_len, ; u8 *auth_tag, ; u64 auth_tag_len); ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MKGLOBAL(FN_NAME(enc,_),function,) FN_NAME(enc,_): FUNC_SAVE GCM_INIT arg1, arg2, 
arg6, arg7, arg8 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC GCM_COMPLETE arg1, arg2, arg9, arg10, ENC FUNC_RESTORE ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void aes_gcm_dec_128_avx_gen2( ; const struct gcm_key_data *key_data, ; struct gcm_context_data *context_data, ; u8 *out, ; const u8 *in, ; u64 plaintext_len, ; u8 *iv, ; const u8 *aad, ; u64 aad_len, ; u8 *auth_tag, ; u64 auth_tag_len); ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MKGLOBAL(FN_NAME(dec,_),function,) FN_NAME(dec,_): FUNC_SAVE GCM_INIT arg1, arg2, arg6, arg7, arg8 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC GCM_COMPLETE arg1, arg2, arg9, arg10, DEC FUNC_RESTORE ret intel-ipsec-mb-0.48/avx/mb_mgr_aes192_flush_avx.asm000066400000000000000000000032561321406316400221710ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %define AES_CBC_ENC_X8 aes_cbc_enc_192_x8 %define FLUSH_JOB_AES_ENC flush_job_aes192_enc_avx %include "mb_mgr_aes_flush_avx.asm" intel-ipsec-mb-0.48/avx/mb_mgr_aes192_submit_avx.asm000066400000000000000000000032611321406316400223470ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. 
;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %define AES_CBC_ENC_X8 aes_cbc_enc_192_x8 %define SUBMIT_JOB_AES_ENC submit_job_aes192_enc_avx %include "mb_mgr_aes_submit_avx.asm" intel-ipsec-mb-0.48/avx/mb_mgr_aes256_flush_avx.asm000066400000000000000000000032561321406316400221720ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %define AES_CBC_ENC_X8 aes_cbc_enc_256_x8 %define FLUSH_JOB_AES_ENC flush_job_aes256_enc_avx %include "mb_mgr_aes_flush_avx.asm" intel-ipsec-mb-0.48/avx/mb_mgr_aes256_submit_avx.asm000066400000000000000000000032611321406316400223500ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. 
;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %define AES_CBC_ENC_X8 aes_cbc_enc_256_x8 %define SUBMIT_JOB_AES_ENC submit_job_aes256_enc_avx %include "mb_mgr_aes_submit_avx.asm" intel-ipsec-mb-0.48/avx/mb_mgr_aes_flush_avx.asm000066400000000000000000000147531321406316400217410ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
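;;
;; FLUSH_JOB_AES_ENC (below): when no further jobs will be submitted, pick a
;; lane that still holds a job, duplicate its in/out/key/IV arguments into the
;; empty lanes, run AES_CBC_ENC_X8 across all 8 lanes for the minimum
;; remaining length, and return the completed job from that minimum-length
;; lane (or NULL if all lanes are empty).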
;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" %ifndef AES_CBC_ENC_X8 %define AES_CBC_ENC_X8 aes_cbc_enc_128_x8 %define FLUSH_JOB_AES_ENC flush_job_aes128_enc_avx %endif ; void AES_CBC_ENC_X8(AES_ARGS_x8 *args, UINT64 len_in_bytes); extern AES_CBC_ENC_X8 section .data default rel align 16 len_masks: ;ddq 0x0000000000000000000000000000FFFF dq 0x000000000000FFFF, 0x0000000000000000 ;ddq 0x000000000000000000000000FFFF0000 dq 0x00000000FFFF0000, 0x0000000000000000 ;ddq 0x00000000000000000000FFFF00000000 dq 0x0000FFFF00000000, 0x0000000000000000 ;ddq 0x0000000000000000FFFF000000000000 dq 0xFFFF000000000000, 0x0000000000000000 ;ddq 0x000000000000FFFF0000000000000000 dq 0x0000000000000000, 0x000000000000FFFF ;ddq 0x00000000FFFF00000000000000000000 dq 0x0000000000000000, 0x00000000FFFF0000 ;ddq 0x0000FFFF000000000000000000000000 dq 0x0000000000000000, 0x0000FFFF00000000 ;ddq 0xFFFF0000000000000000000000000000 dq 0x0000000000000000, 0xFFFF000000000000 dupw: ;ddq 0x01000100010001000100010001000100 dq 0x0100010001000100, 0x0100010001000100 one: dq 1 two: dq 2 three: dq 3 four: dq 4 five: dq 5 six: dq 6 seven: dq 7 section .text %define APPEND(a,b) a %+ b %ifdef LINUX %define arg1 rdi %define arg2 rsi %else %define arg1 rcx %define arg2 rdx %endif %define state arg1 %define job arg2 %define len2 arg2 %define job_rax rax %if 1 %define unused_lanes rbx %define tmp1 rbx %define good_lane rdx %define iv rdx %define tmp2 rax ; idx needs to be in rbp %define tmp rbp %define idx rbp %define tmp3 r8 %endif ; STACK_SPACE needs to be an odd multiple of 8 ; This routine and its callee clobbers all GPRs struc STACK _gpr_save: resq 8 _rsp_save: resq 1 endstruc ; JOB* FLUSH_JOB_AES_ENC(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job) ; arg 1 : state ; arg 2 : job MKGLOBAL(FLUSH_JOB_AES_ENC,function,internal) FLUSH_JOB_AES_ENC: mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _gpr_save + 8*2], r12 mov [rsp + _gpr_save + 8*3], r13 mov [rsp + _gpr_save + 8*4], r14 mov [rsp + _gpr_save + 8*5], r15 %ifndef LINUX mov [rsp + _gpr_save + 8*6], rsi mov [rsp + _gpr_save + 8*7], rdi %endif mov [rsp + _rsp_save], rax ; original SP ; check for empty mov unused_lanes, [state + _aes_unused_lanes] bt unused_lanes, 32+3 jc return_null ; find a lane with a non-null job xor good_lane, good_lane cmp qword [state + _aes_job_in_lane + 1*8], 0 cmovne good_lane, [rel one] cmp qword [state + _aes_job_in_lane + 2*8], 0 cmovne good_lane, [rel two] cmp qword [state + _aes_job_in_lane + 3*8], 0 cmovne good_lane, [rel three] cmp qword [state + _aes_job_in_lane + 4*8], 0 cmovne good_lane, [rel four] cmp qword [state + _aes_job_in_lane + 5*8], 0 cmovne good_lane, [rel five] cmp qword [state + _aes_job_in_lane + 6*8], 0 cmovne good_lane, [rel six] cmp qword [state + _aes_job_in_lane + 7*8], 0 cmovne good_lane, [rel seven] ; copy good_lane to empty lanes mov tmp1, [state + _aes_args_in + good_lane*8] mov tmp2, [state + _aes_args_out + good_lane*8] mov tmp3, [state + _aes_args_keys + good_lane*8] shl good_lane, 4 ; multiply by 16 vmovdqa xmm2, [state + _aes_args_IV + good_lane] vmovdqa xmm0, [state + _aes_lens] %assign I 0 %rep 8 cmp qword [state + _aes_job_in_lane + I*8], 0 jne APPEND(skip_,I) mov [state + _aes_args_in + I*8], tmp1 mov [state + _aes_args_out + I*8], tmp2 mov [state + _aes_args_keys + I*8], tmp3 vmovdqa [state + _aes_args_IV + I*16], xmm2 vpor xmm0, xmm0, [rel len_masks + 16*I] APPEND(skip_,I): %assign I 
(I+1) %endrep ; Find min length vphminposuw xmm1, xmm0 vpextrw DWORD(len2), xmm1, 0 ; min value vpextrw DWORD(idx), xmm1, 1 ; min index (0...3) cmp len2, 0 je len_is_0 vpshufb xmm1, xmm1, [rel dupw] ; duplicate words across all lanes vpsubw xmm0, xmm0, xmm1 vmovdqa [state + _aes_lens], xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call AES_CBC_ENC_X8 ; state and idx are intact len_is_0: ; process completed job "idx" mov job_rax, [state + _aes_job_in_lane + idx*8] ; Don't write back IV ; mov iv, [job_rax + _iv] mov unused_lanes, [state + _aes_unused_lanes] mov qword [state + _aes_job_in_lane + idx*8], 0 or dword [job_rax + _status], STS_COMPLETED_AES shl unused_lanes, 4 or unused_lanes, idx ; shl idx, 4 ; multiply by 16 mov [state + _aes_unused_lanes], unused_lanes ; vmovdqa xmm0, [state + _aes_args_IV + idx] ; vmovdqu [iv], xmm0 return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov r12, [rsp + _gpr_save + 8*2] mov r13, [rsp + _gpr_save + 8*3] mov r14, [rsp + _gpr_save + 8*4] mov r15, [rsp + _gpr_save + 8*5] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*6] mov rdi, [rsp + _gpr_save + 8*7] %endif mov rsp, [rsp + _rsp_save] ; original SP ret return_null: xor job_rax, job_rax jmp return intel-ipsec-mb-0.48/avx/mb_mgr_aes_submit_avx.asm000066400000000000000000000122021321406316400221060ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
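;;
;; SUBMIT_JOB_AES_ENC (below): take a free lane from unused_lanes, store the
;; job's src/dst/expanded-key/IV and its length (rounded down to a 16-byte
;; multiple, since DOCSIS may pass a length not aligned to the block size),
;; and return NULL until all 8 lanes are occupied; only then run
;; AES_CBC_ENC_X8 for the minimum lane length and return the job completed
;; in that lane.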
;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" %ifndef AES_CBC_ENC_X8 %define AES_CBC_ENC_X8 aes_cbc_enc_128_x8 %define SUBMIT_JOB_AES_ENC submit_job_aes128_enc_avx %endif ; void AES_CBC_ENC_X8(AES_ARGS_x8 *args, UINT64 len_in_bytes); extern AES_CBC_ENC_X8 section .data default rel align 16 dupw: ;ddq 0x01000100010001000100010001000100 dq 0x0100010001000100, 0x0100010001000100 section .text %ifdef LINUX %define arg1 rdi %define arg2 rsi %else %define arg1 rcx %define arg2 rdx %endif %define state arg1 %define job arg2 %define len2 arg2 %define job_rax rax %if 1 ; idx needs to be in rbp %define len rbp %define idx rbp %define tmp rbp %define lane r8 %define iv r9 %define unused_lanes rbx %endif ; STACK_SPACE needs to be an odd multiple of 8 ; This routine and its callee clobbers all GPRs struc STACK _gpr_save: resq 8 _rsp_save: resq 1 endstruc ; JOB* SUBMIT_JOB_AES_ENC(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job) ; arg 1 : state ; arg 2 : job MKGLOBAL(SUBMIT_JOB_AES_ENC,function,internal) SUBMIT_JOB_AES_ENC: mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _gpr_save + 8*2], r12 mov [rsp + _gpr_save + 8*3], r13 mov [rsp + _gpr_save + 8*4], r14 mov [rsp + _gpr_save + 8*5], r15 %ifndef LINUX mov [rsp + _gpr_save + 8*6], rsi mov [rsp + _gpr_save + 8*7], rdi %endif mov [rsp + _rsp_save], rax ; original SP mov unused_lanes, [state + _aes_unused_lanes] mov lane, unused_lanes and lane, 0xF shr unused_lanes, 4 mov len, [job + _msg_len_to_cipher_in_bytes] and len, -16 ; DOCSIS may pass size unaligned to block size mov iv, [job + _iv] mov [state + _aes_unused_lanes], unused_lanes mov [state + _aes_job_in_lane + lane*8], job mov [state + _aes_lens + 2*lane], WORD(len) mov tmp, [job + _src] add tmp, [job + _cipher_start_src_offset_in_bytes] vmovdqu xmm0, [iv] mov [state + _aes_args_in + lane*8], tmp mov tmp, [job + _aes_enc_key_expanded] mov [state + _aes_args_keys + lane*8], tmp mov tmp, [job + _dst] mov [state + _aes_args_out + lane*8], tmp shl lane, 4 ; multiply by 16 vmovdqa [state + _aes_args_IV + lane], xmm0 cmp unused_lanes, 0xf jne return_null ; Find min length vmovdqa xmm0, [state + _aes_lens] vphminposuw xmm1, xmm0 vpextrw DWORD(len2), xmm1, 0 ; min value vpextrw DWORD(idx), xmm1, 1 ; min index (0...7) cmp len2, 0 je len_is_0 vpshufb xmm1, xmm1, [rel dupw] ; duplicate words across all lanes vpsubw xmm0, xmm0, xmm1 vmovdqa [state + _aes_lens], xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call AES_CBC_ENC_X8 ; state and idx are intact len_is_0: ; process completed job "idx" mov job_rax, [state + _aes_job_in_lane + idx*8] ; Don't write back IV ; mov iv, [job_rax + _iv] mov unused_lanes, [state + _aes_unused_lanes] mov qword [state + _aes_job_in_lane + idx*8], 0 or dword [job_rax + _status], STS_COMPLETED_AES shl unused_lanes, 4 or unused_lanes, idx ; shl idx, 4 ; multiply by 16 mov [state + _aes_unused_lanes], unused_lanes ; vmovdqa xmm0, [state + _aes_args_IV + idx] ; vmovdqu [iv], xmm0 return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov r12, [rsp + _gpr_save + 8*2] mov r13, [rsp + _gpr_save + 8*3] mov r14, [rsp + _gpr_save + 8*4] mov r15, [rsp + _gpr_save + 8*5] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*6] mov rdi, [rsp + _gpr_save + 8*7] %endif mov rsp, [rsp + _rsp_save] ; original SP ret return_null: xor job_rax, job_rax jmp return 
intel-ipsec-mb-0.48/avx/mb_mgr_aes_xcbc_flush_avx.asm000066400000000000000000000163501321406316400227330ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" %ifndef AES_XCBC_X8 %define AES_XCBC_X8 aes_xcbc_mac_128_x8 %define FLUSH_JOB_AES_XCBC flush_job_aes_xcbc_avx %endif ; void AES_XCBC_X8(AES_XCBC_ARGS_x8 *args, UINT64 len_in_bytes); extern AES_XCBC_X8 section .data default rel align 16 len_masks: ;ddq 0x0000000000000000000000000000FFFF dq 0x000000000000FFFF, 0x0000000000000000 ;ddq 0x000000000000000000000000FFFF0000 dq 0x00000000FFFF0000, 0x0000000000000000 ;ddq 0x00000000000000000000FFFF00000000 dq 0x0000FFFF00000000, 0x0000000000000000 ;ddq 0x0000000000000000FFFF000000000000 dq 0xFFFF000000000000, 0x0000000000000000 ;ddq 0x000000000000FFFF0000000000000000 dq 0x0000000000000000, 0x000000000000FFFF ;ddq 0x00000000FFFF00000000000000000000 dq 0x0000000000000000, 0x00000000FFFF0000 ;ddq 0x0000FFFF000000000000000000000000 dq 0x0000000000000000, 0x0000FFFF00000000 ;ddq 0xFFFF0000000000000000000000000000 dq 0x0000000000000000, 0xFFFF000000000000 dupw: ;ddq 0x01000100010001000100010001000100 dq 0x0100010001000100, 0x0100010001000100 one: dq 1 two: dq 2 three: dq 3 four: dq 4 five: dq 5 six: dq 6 seven: dq 7 section .text %define APPEND(a,b) a %+ b %ifdef LINUX %define arg1 rdi %define arg2 rsi %else %define arg1 rcx %define arg2 rdx %endif %define state arg1 %define job arg2 %define len2 arg2 %define job_rax rax %if 1 %define unused_lanes rbx %define tmp1 rbx %define icv rdx %define tmp2 rax ; idx needs to be in rbp %define tmp r10 %define idx rbp %define tmp3 r8 %define lane_data r9 %endif ; STACK_SPACE needs to be an odd multiple of 8 ; This routine and its callee clobbers all GPRs struc STACK _gpr_save: resq 8 _rsp_save: resq 1 endstruc ; JOB* FLUSH_JOB_AES_XCBC(MB_MGR_AES_XCBC_OOO *state, JOB_AES_HMAC *job) ; arg 1 : state ; arg 2 : job MKGLOBAL(FLUSH_JOB_AES_XCBC,function,internal) FLUSH_JOB_AES_XCBC: 
mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _gpr_save + 8*2], r12 mov [rsp + _gpr_save + 8*3], r13 mov [rsp + _gpr_save + 8*4], r14 mov [rsp + _gpr_save + 8*5], r15 %ifndef LINUX mov [rsp + _gpr_save + 8*6], rsi mov [rsp + _gpr_save + 8*7], rdi %endif mov [rsp + _rsp_save], rax ; original SP ; check for empty mov unused_lanes, [state + _aes_xcbc_unused_lanes] bt unused_lanes, 32+3 jc return_null ; find a lane with a non-null job xor idx, idx cmp qword [state + _aes_xcbc_ldata + 1 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 cmovne idx, [rel one] cmp qword [state + _aes_xcbc_ldata + 2 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 cmovne idx, [rel two] cmp qword [state + _aes_xcbc_ldata + 3 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 cmovne idx, [rel three] cmp qword [state + _aes_xcbc_ldata + 4 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 cmovne idx, [rel four] cmp qword [state + _aes_xcbc_ldata + 5 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 cmovne idx, [rel five] cmp qword [state + _aes_xcbc_ldata + 6 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 cmovne idx, [rel six] cmp qword [state + _aes_xcbc_ldata + 7 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 cmovne idx, [rel seven] copy_lane_data: ; copy idx to empty lanes mov tmp1, [state + _aes_xcbc_args_in + idx*8] mov tmp3, [state + _aes_xcbc_args_keys + idx*8] shl idx, 4 ; multiply by 16 vmovdqa xmm2, [state + _aes_xcbc_args_ICV + idx] vmovdqa xmm0, [state + _aes_xcbc_lens] %assign I 0 %rep 8 cmp qword [state + _aes_xcbc_ldata + I * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 jne APPEND(skip_,I) mov [state + _aes_xcbc_args_in + I*8], tmp1 mov [state + _aes_xcbc_args_keys + I*8], tmp3 vmovdqa [state + _aes_xcbc_args_ICV + I*16], xmm2 vpor xmm0, xmm0, [rel len_masks + 16*I] APPEND(skip_,I): %assign I (I+1) %endrep vmovdqa [state + _aes_xcbc_lens], xmm0 ; Find min length vphminposuw xmm1, xmm0 vpextrw DWORD(len2), xmm1, 0 ; min value vpextrw DWORD(idx), xmm1, 1 ; min index (0...3) cmp len2, 0 je len_is_0 vpshufb xmm1, xmm1, [rel dupw] ; duplicate words across all lanes vpsubw xmm0, xmm0, xmm1 vmovdqa [state + _aes_xcbc_lens], xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call AES_XCBC_X8 ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _XCBC_LANE_DATA_size lea lane_data, [state + _aes_xcbc_ldata + lane_data] cmp dword [lane_data + _xcbc_final_done], 0 jne end_loop mov dword [lane_data + _xcbc_final_done], 1 mov word [state + _aes_xcbc_lens + 2*idx], 16 lea tmp, [lane_data + _xcbc_final_block] mov [state + _aes_xcbc_args_in + 8*idx], tmp jmp copy_lane_data end_loop: mov job_rax, [lane_data + _xcbc_job_in_lane] mov icv, [job_rax + _auth_tag_output] mov unused_lanes, [state + _aes_xcbc_unused_lanes] mov qword [lane_data + _xcbc_job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC shl unused_lanes, 4 or unused_lanes, idx shl idx, 4 ; multiply by 16 mov [state + _aes_xcbc_unused_lanes], unused_lanes ; copy 12 bytes vmovdqa xmm0, [state + _aes_xcbc_args_ICV + idx] vmovq [icv], xmm0 vpextrd [icv + 8], xmm0, 2 return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov r12, [rsp + _gpr_save + 8*2] mov r13, [rsp + _gpr_save + 8*3] mov r14, [rsp + _gpr_save + 8*4] mov r15, [rsp + _gpr_save + 8*5] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*6] mov rdi, [rsp + _gpr_save + 8*7] %endif mov rsp, [rsp + _rsp_save] ; original SP ret return_null: xor job_rax, job_rax jmp return 
intel-ipsec-mb-0.48/avx/mb_mgr_aes_xcbc_submit_avx.asm000066400000000000000000000162721321406316400231200ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" %include "memcpy.asm" %ifndef AES_XCBC_X8 %define AES_XCBC_X8 aes_xcbc_mac_128_x8 %define SUBMIT_JOB_AES_XCBC submit_job_aes_xcbc_avx %endif ; void AES_XCBC_X8(AES_XCBC_ARGS_x8 *args, UINT64 len_in_bytes); extern AES_XCBC_X8 section .data default rel align 16 dupw: ;ddq 0x01000100010001000100010001000100 dq 0x0100010001000100, 0x0100010001000100 x80: ;ddq 0x00000000000000000000000000000080 dq 0x0000000000000080, 0x0000000000000000 section .text %ifdef LINUX %define arg1 rdi %define arg2 rsi %else %define arg1 rcx %define arg2 rdx %endif %define state arg1 %define job arg2 %define len2 arg2 %define job_rax rax %if 1 ; idx needs to be in rbp %define len r11 %define idx rbp %define tmp2 rbp %define tmp r14 %define lane r8 %define icv r9 %define p2 r9 %define last_len r10 %define lane_data r12 %define p r13 %define unused_lanes rbx %endif ; STACK_SPACE needs to be an odd multiple of 8 ; This routine and its callee clobbers all GPRs struc STACK _gpr_save: resq 8 _rsp_save: resq 1 endstruc ; JOB* SUBMIT_JOB_AES_XCBC(MB_MGR_AES_XCBC_OOO *state, JOB_AES_HMAC *job) ; arg 1 : state ; arg 2 : job MKGLOBAL(SUBMIT_JOB_AES_XCBC,function,internal) SUBMIT_JOB_AES_XCBC: mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _gpr_save + 8*2], r12 mov [rsp + _gpr_save + 8*3], r13 mov [rsp + _gpr_save + 8*4], r14 mov [rsp + _gpr_save + 8*5], r15 %ifndef LINUX mov [rsp + _gpr_save + 8*6], rsi mov [rsp + _gpr_save + 8*7], rdi %endif mov [rsp + _rsp_save], rax ; original SP mov unused_lanes, [state + _aes_xcbc_unused_lanes] mov lane, unused_lanes and lane, 0xF shr unused_lanes, 4 imul lane_data, lane, _XCBC_LANE_DATA_size lea lane_data, [state + _aes_xcbc_ldata + lane_data] mov len, [job + 
_msg_len_to_hash_in_bytes] mov [state + _aes_xcbc_unused_lanes], unused_lanes mov [lane_data + _xcbc_job_in_lane], job mov dword [lane_data + _xcbc_final_done], 0 mov tmp, [job + _k1_expanded] mov [state + _aes_xcbc_args_keys + lane*8], tmp mov p, [job + _src] add p, [job + _hash_start_src_offset_in_bytes] mov last_len, len cmp len, 16 jle small_buffer mov [state + _aes_xcbc_args_in + lane*8], p add p, len ; set point to end of data and last_len, 15 ; Check lsbs of msg len jnz slow_copy ; if not 16B mult, do slow copy fast_copy: vmovdqu xmm0, [p - 16] ; load last block M[n] mov tmp, [job + _k2] ; load K2 address vmovdqu xmm1, [tmp] ; load K2 vpxor xmm0, xmm0, xmm1 ; M[n] XOR K2 vmovdqa [lane_data + _xcbc_final_block], xmm0 sub len, 16 ; take last block off length end_fast_copy: mov [state + _aes_xcbc_lens + 2*lane], WORD(len) vpxor xmm0, xmm0, xmm0 shl lane, 4 ; multiply by 16 vmovdqa [state + _aes_xcbc_args_ICV + lane], xmm0 cmp unused_lanes, 0xf jne return_null start_loop: ; Find min length vmovdqa xmm0, [state + _aes_xcbc_lens] vphminposuw xmm1, xmm0 vpextrw DWORD(len2), xmm1, 0 ; min value vpextrw DWORD(idx), xmm1, 1 ; min index (0...7) cmp len2, 0 je len_is_0 vpshufb xmm1, xmm1, [rel dupw] ; duplicate words across all lanes vpsubw xmm0, xmm0, xmm1 vmovdqa [state + _aes_xcbc_lens], xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call AES_XCBC_X8 ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _XCBC_LANE_DATA_size lea lane_data, [state + _aes_xcbc_ldata + lane_data] cmp dword [lane_data + _xcbc_final_done], 0 jne end_loop mov dword [lane_data + _xcbc_final_done], 1 mov word [state + _aes_xcbc_lens + 2*idx], 16 lea tmp, [lane_data + _xcbc_final_block] mov [state + _aes_xcbc_args_in + 8*idx], tmp jmp start_loop end_loop: ; process completed job "idx" mov job_rax, [lane_data + _xcbc_job_in_lane] mov icv, [job_rax + _auth_tag_output] mov unused_lanes, [state + _aes_xcbc_unused_lanes] mov qword [lane_data + _xcbc_job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC shl unused_lanes, 4 or unused_lanes, idx shl idx, 4 ; multiply by 16 mov [state + _aes_xcbc_unused_lanes], unused_lanes ; copy 12 bytes vmovdqa xmm0, [state + _aes_xcbc_args_ICV + idx] vmovq [icv], xmm0 vpextrd [icv + 8], xmm0, 2 return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov r12, [rsp + _gpr_save + 8*2] mov r13, [rsp + _gpr_save + 8*3] mov r14, [rsp + _gpr_save + 8*4] mov r15, [rsp + _gpr_save + 8*5] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*6] mov rdi, [rsp + _gpr_save + 8*7] %endif mov rsp, [rsp + _rsp_save] ; original SP ret small_buffer: ; For buffers <= 16 Bytes ; The input data is set to final block lea tmp, [lane_data + _xcbc_final_block] ; final block mov [state + _aes_xcbc_args_in + lane*8], tmp add p, len ; set point to end of data cmp len, 16 je fast_copy slow_copy: and len, ~15 ; take final block off len sub p, last_len ; adjust data pointer lea p2, [lane_data + _xcbc_final_block + 16] ; upper part of final sub p2, last_len ; adjust data pointer backwards memcpy_avx_16_1 p2, p, last_len, tmp, tmp2 vmovdqa xmm0, [rel x80] ; fill reg with padding vmovdqu [lane_data + _xcbc_final_block + 16], xmm0 ; add padding vmovdqu xmm0, [p2] ; load final block to process mov tmp, [job + _k3] ; load K3 address vmovdqu xmm1, [tmp] ; load K3 vpxor xmm0, xmm0, xmm1 ; M[n] XOR K3 vmovdqu [lane_data + _xcbc_final_block], xmm0 ; write final block jmp end_fast_copy return_null: xor job_rax, job_rax jmp return 
intel-ipsec-mb-0.48/avx/mb_mgr_avx.c000066400000000000000000000503431321406316400173450ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2012-2017, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include #include #include #include "os.h" #include "mb_mgr.h" #include "save_xmms.h" #include "asm.h" #ifndef NO_GCM #include "gcm_defines.h" #endif #include "des.h" JOB_AES_HMAC *submit_job_aes128_enc_avx(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_aes128_enc_avx(MB_MGR_AES_OOO *state); JOB_AES_HMAC *submit_job_aes192_enc_avx(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_aes192_enc_avx(MB_MGR_AES_OOO *state); JOB_AES_HMAC *submit_job_aes256_enc_avx(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_aes256_enc_avx(MB_MGR_AES_OOO *state); JOB_AES_HMAC *submit_job_aes_xcbc_avx(MB_MGR_AES_XCBC_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_aes_xcbc_avx(MB_MGR_AES_XCBC_OOO *state); #define SAVE_XMMS save_xmms_avx #define RESTORE_XMMS restore_xmms_avx #define SUBMIT_JOB_AES128_ENC submit_job_aes128_enc_avx #define SUBMIT_JOB_AES128_DEC submit_job_aes128_dec_avx #define FLUSH_JOB_AES128_ENC flush_job_aes128_enc_avx #define SUBMIT_JOB_AES192_ENC submit_job_aes192_enc_avx #define SUBMIT_JOB_AES192_DEC submit_job_aes192_dec_avx #define FLUSH_JOB_AES192_ENC flush_job_aes192_enc_avx #define SUBMIT_JOB_AES256_ENC submit_job_aes256_enc_avx #define SUBMIT_JOB_AES256_DEC submit_job_aes256_dec_avx #define FLUSH_JOB_AES256_ENC flush_job_aes256_enc_avx #define SUBMIT_JOB_AES128_CNTR submit_job_aes128_cntr_avx #define SUBMIT_JOB_AES192_CNTR submit_job_aes192_cntr_avx #define SUBMIT_JOB_AES256_CNTR submit_job_aes256_cntr_avx #define AES_CBC_DEC_128 aes_cbc_dec_128_avx #define AES_CBC_DEC_192 aes_cbc_dec_192_avx #define AES_CBC_DEC_256 aes_cbc_dec_256_avx #define AES_CNTR_128 aes_cntr_128_avx #define AES_CNTR_192 aes_cntr_192_avx #define AES_CNTR_256 aes_cntr_256_avx #ifndef NO_GCM #define AES_GCM_DEC_128 aes_gcm_dec_128_avx_gen2 #define 
AES_GCM_ENC_128 aes_gcm_enc_128_avx_gen2 #define AES_GCM_DEC_192 aes_gcm_dec_192_avx_gen2 #define AES_GCM_ENC_192 aes_gcm_enc_192_avx_gen2 #define AES_GCM_DEC_256 aes_gcm_dec_256_avx_gen2 #define AES_GCM_ENC_256 aes_gcm_enc_256_avx_gen2 #endif #define SUBMIT_JOB_AES_XCBC submit_job_aes_xcbc_avx #define FLUSH_JOB_AES_XCBC flush_job_aes_xcbc_avx #define SUBMIT_JOB_AES128_DEC submit_job_aes128_dec_avx #define SUBMIT_JOB_AES192_DEC submit_job_aes192_dec_avx #define SUBMIT_JOB_AES256_DEC submit_job_aes256_dec_avx #define QUEUE_SIZE queue_size_avx #define SUBMIT_JOB_AES_ENC SUBMIT_JOB_AES_ENC_AVX #define FLUSH_JOB_AES_ENC FLUSH_JOB_AES_ENC_AVX #define SUBMIT_JOB_AES_DEC SUBMIT_JOB_AES_DEC_AVX #define FLUSH_JOB_AES_DEC FLUSH_JOB_AES_DEC_AVX JOB_AES_HMAC *submit_job_hmac_avx(MB_MGR_HMAC_SHA_1_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_hmac_avx(MB_MGR_HMAC_SHA_1_OOO *state); JOB_AES_HMAC *submit_job_hmac_sha_224_avx(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_hmac_sha_224_avx(MB_MGR_HMAC_SHA_256_OOO *state); JOB_AES_HMAC *submit_job_hmac_sha_256_avx(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_hmac_sha_256_avx(MB_MGR_HMAC_SHA_256_OOO *state); JOB_AES_HMAC *submit_job_hmac_sha_384_avx(MB_MGR_HMAC_SHA_512_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_hmac_sha_384_avx(MB_MGR_HMAC_SHA_512_OOO *state); JOB_AES_HMAC *submit_job_hmac_sha_512_avx(MB_MGR_HMAC_SHA_512_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_hmac_sha_512_avx(MB_MGR_HMAC_SHA_512_OOO *state); JOB_AES_HMAC *submit_job_hmac_md5_avx(MB_MGR_HMAC_MD5_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_hmac_md5_avx(MB_MGR_HMAC_MD5_OOO *state); #define SUBMIT_JOB_HMAC submit_job_hmac_avx #define FLUSH_JOB_HMAC flush_job_hmac_avx #define SUBMIT_JOB_HMAC_SHA_224 submit_job_hmac_sha_224_avx #define FLUSH_JOB_HMAC_SHA_224 flush_job_hmac_sha_224_avx #define SUBMIT_JOB_HMAC_SHA_256 submit_job_hmac_sha_256_avx #define FLUSH_JOB_HMAC_SHA_256 flush_job_hmac_sha_256_avx #define SUBMIT_JOB_HMAC_SHA_384 submit_job_hmac_sha_384_avx #define FLUSH_JOB_HMAC_SHA_384 flush_job_hmac_sha_384_avx #define SUBMIT_JOB_HMAC_SHA_512 submit_job_hmac_sha_512_avx #define FLUSH_JOB_HMAC_SHA_512 flush_job_hmac_sha_512_avx #define SUBMIT_JOB_HMAC_MD5 submit_job_hmac_md5_avx #define FLUSH_JOB_HMAC_MD5 flush_job_hmac_md5_avx /* ====================================================================== */ #define SUBMIT_JOB submit_job_avx #define FLUSH_JOB flush_job_avx #define SUBMIT_JOB_NOCHECK submit_job_nocheck_avx /* ====================================================================== */ #define SUBMIT_JOB_HASH SUBMIT_JOB_HASH_AVX #define FLUSH_JOB_HASH FLUSH_JOB_HASH_AVX /* ====================================================================== */ #define AES_CFB_128_ONE aes_cfb_128_one_avx void aes128_cbc_mac_x8(AES_ARGS_x8 *args, uint64_t len); #define AES128_CBC_MAC aes128_cbc_mac_x8 #define FLUSH_JOB_AES_CCM_AUTH flush_job_aes_ccm_auth_arch #define SUBMIT_JOB_AES_CCM_AUTH submit_job_aes_ccm_auth_arch #define AES_CCM_MAX_JOBS 8 /* ====================================================================== */ void init_mb_mgr_avx(MB_MGR *state) { unsigned int j; UINT8 *p; /* Init AES out-of-order fields */ state->aes128_ooo.lens[0] = 0; state->aes128_ooo.lens[1] = 0; state->aes128_ooo.lens[2] = 0; state->aes128_ooo.lens[3] = 0; state->aes128_ooo.lens[4] = 0; state->aes128_ooo.lens[5] = 0; state->aes128_ooo.lens[6] = 0; state->aes128_ooo.lens[7] = 0; state->aes128_ooo.unused_lanes = 
0xF76543210; state->aes128_ooo.job_in_lane[0] = NULL; state->aes128_ooo.job_in_lane[1] = NULL; state->aes128_ooo.job_in_lane[2] = NULL; state->aes128_ooo.job_in_lane[3] = NULL; state->aes128_ooo.job_in_lane[4] = NULL; state->aes128_ooo.job_in_lane[5] = NULL; state->aes128_ooo.job_in_lane[6] = NULL; state->aes128_ooo.job_in_lane[7] = NULL; state->aes192_ooo.lens[0] = 0; state->aes192_ooo.lens[1] = 0; state->aes192_ooo.lens[2] = 0; state->aes192_ooo.lens[3] = 0; state->aes192_ooo.lens[4] = 0; state->aes192_ooo.lens[5] = 0; state->aes192_ooo.lens[6] = 0; state->aes192_ooo.lens[7] = 0; state->aes192_ooo.unused_lanes = 0xF76543210; state->aes192_ooo.job_in_lane[0] = NULL; state->aes192_ooo.job_in_lane[1] = NULL; state->aes192_ooo.job_in_lane[2] = NULL; state->aes192_ooo.job_in_lane[3] = NULL; state->aes192_ooo.job_in_lane[4] = NULL; state->aes192_ooo.job_in_lane[5] = NULL; state->aes192_ooo.job_in_lane[6] = NULL; state->aes192_ooo.job_in_lane[7] = NULL; state->aes256_ooo.lens[0] = 0; state->aes256_ooo.lens[1] = 0; state->aes256_ooo.lens[2] = 0; state->aes256_ooo.lens[3] = 0; state->aes256_ooo.lens[4] = 0; state->aes256_ooo.lens[5] = 0; state->aes256_ooo.lens[6] = 0; state->aes256_ooo.lens[7] = 0; state->aes256_ooo.unused_lanes = 0xF76543210; state->aes256_ooo.job_in_lane[0] = NULL; state->aes256_ooo.job_in_lane[1] = NULL; state->aes256_ooo.job_in_lane[2] = NULL; state->aes256_ooo.job_in_lane[3] = NULL; state->aes256_ooo.job_in_lane[4] = NULL; state->aes256_ooo.job_in_lane[5] = NULL; state->aes256_ooo.job_in_lane[6] = NULL; state->aes256_ooo.job_in_lane[7] = NULL; /* DOCSIS SEC BPI uses same settings as AES128 CBC */ state->docsis_sec_ooo.lens[0] = 0; state->docsis_sec_ooo.lens[1] = 0; state->docsis_sec_ooo.lens[2] = 0; state->docsis_sec_ooo.lens[3] = 0; state->docsis_sec_ooo.lens[4] = 0; state->docsis_sec_ooo.lens[5] = 0; state->docsis_sec_ooo.lens[6] = 0; state->docsis_sec_ooo.lens[7] = 0; state->docsis_sec_ooo.unused_lanes = 0xF76543210; state->docsis_sec_ooo.job_in_lane[0] = NULL; state->docsis_sec_ooo.job_in_lane[1] = NULL; state->docsis_sec_ooo.job_in_lane[2] = NULL; state->docsis_sec_ooo.job_in_lane[3] = NULL; state->docsis_sec_ooo.job_in_lane[4] = NULL; state->docsis_sec_ooo.job_in_lane[5] = NULL; state->docsis_sec_ooo.job_in_lane[6] = NULL; state->docsis_sec_ooo.job_in_lane[7] = NULL; /* Init HMAC/SHA1 out-of-order fields */ state->hmac_sha_1_ooo.lens[0] = 0; state->hmac_sha_1_ooo.lens[1] = 0; state->hmac_sha_1_ooo.lens[2] = 0; state->hmac_sha_1_ooo.lens[3] = 0; state->hmac_sha_1_ooo.lens[4] = 0xFFFF; state->hmac_sha_1_ooo.lens[5] = 0xFFFF; state->hmac_sha_1_ooo.lens[6] = 0xFFFF; state->hmac_sha_1_ooo.lens[7] = 0xFFFF; state->hmac_sha_1_ooo.unused_lanes = 0xFF03020100; for (j = 0; j < AVX_NUM_SHA1_LANES; j++) { state->hmac_sha_1_ooo.ldata[j].job_in_lane = NULL; state->hmac_sha_1_ooo.ldata[j].extra_block[64] = 0x80; memset(state->hmac_sha_1_ooo.ldata[j].extra_block + 65, 0x00, 64+7); p = state->hmac_sha_1_ooo.ldata[j].outer_block; memset(p + 5*4 + 1, 0x00, 64 - 5*4 - 1 - 2); p[5*4] = 0x80; p[64-2] = 0x02; p[64-1] = 0xA0; } /* Init HMAC/SHA224 out-of-order fields */ state->hmac_sha_224_ooo.lens[0] = 0; state->hmac_sha_224_ooo.lens[1] = 0; state->hmac_sha_224_ooo.lens[2] = 0; state->hmac_sha_224_ooo.lens[3] = 0; state->hmac_sha_224_ooo.lens[4] = 0xFFFF; state->hmac_sha_224_ooo.lens[5] = 0xFFFF; state->hmac_sha_224_ooo.lens[6] = 0xFFFF; state->hmac_sha_224_ooo.lens[7] = 0xFFFF; state->hmac_sha_224_ooo.unused_lanes = 0xFF03020100; for (j = 0; j < AVX_NUM_SHA256_LANES; j++) { 
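		/*
		 * As for SHA1 above, each lane carries pre-built padding so the
		 * submit/flush paths only patch lengths in: extra_block gets the
		 * 0x80 end-of-message marker, and outer_block is laid out for the
		 * fixed-size outer HMAC hash, i.e. one 64-byte block plus the
		 * 28-byte SHA-224 inner digest (92 bytes = 736 bits = 0x2E0
		 * length field below).
		 */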
state->hmac_sha_224_ooo.ldata[j].job_in_lane = NULL; state->hmac_sha_224_ooo.ldata[j].extra_block[64] = 0x80; memset(state->hmac_sha_224_ooo.ldata[j].extra_block + 65, 0x00, 64+7); p = state->hmac_sha_224_ooo.ldata[j].outer_block; memset(p + 8*4 + 1, 0x00, 64 - 8*4 - 1 - 2); p[7 * 4] = 0x80; /* digest 7 words long */ p[64 - 2] = 0x02; /* length in little endian = 0x02E0 */ p[64 - 1] = 0xE0; } /* Init HMAC/SHA256 out-of-order fields */ state->hmac_sha_256_ooo.lens[0] = 0; state->hmac_sha_256_ooo.lens[1] = 0; state->hmac_sha_256_ooo.lens[2] = 0; state->hmac_sha_256_ooo.lens[3] = 0; state->hmac_sha_256_ooo.lens[4] = 0xFFFF; state->hmac_sha_256_ooo.lens[5] = 0xFFFF; state->hmac_sha_256_ooo.lens[6] = 0xFFFF; state->hmac_sha_256_ooo.lens[7] = 0xFFFF; state->hmac_sha_256_ooo.unused_lanes = 0xFF03020100; for (j = 0; j < AVX_NUM_SHA256_LANES; j++) { state->hmac_sha_256_ooo.ldata[j].job_in_lane = NULL; state->hmac_sha_256_ooo.ldata[j].extra_block[64] = 0x80; memset(state->hmac_sha_256_ooo.ldata[j].extra_block + 65, 0x00, 64+7); p = state->hmac_sha_256_ooo.ldata[j].outer_block; memset(p + 8*4 + 1, 0x00, 64 - 8*4 - 1 - 2); p[8 * 4] = 0x80; /* 8 digest words */ p[64 - 2] = 0x03; /* length */ p[64 - 1] = 0x00; } /* Init HMAC/SHA384 out-of-order fields */ state->hmac_sha_384_ooo.lens[0] = 0; state->hmac_sha_384_ooo.lens[1] = 0; state->hmac_sha_384_ooo.lens[2] = 0xFFFF; state->hmac_sha_384_ooo.lens[3] = 0xFFFF; state->hmac_sha_384_ooo.lens[4] = 0xFFFF; state->hmac_sha_384_ooo.lens[5] = 0xFFFF; state->hmac_sha_384_ooo.lens[6] = 0xFFFF; state->hmac_sha_384_ooo.lens[7] = 0xFFFF; state->hmac_sha_384_ooo.unused_lanes = 0xFF0100; for (j = 0; j < AVX_NUM_SHA512_LANES; j++) { MB_MGR_HMAC_SHA_512_OOO *ctx = &state->hmac_sha_384_ooo; ctx->ldata[j].job_in_lane = NULL; ctx->ldata[j].extra_block[SHA_384_BLOCK_SIZE] = 0x80; memset(ctx->ldata[j].extra_block + (SHA_384_BLOCK_SIZE + 1), 0x00, SHA_384_BLOCK_SIZE + 7); p = ctx->ldata[j].outer_block; memset(p + SHA384_DIGEST_SIZE_IN_BYTES + 1, 0x00, /* special end point because this length is constant */ SHA_384_BLOCK_SIZE - SHA384_DIGEST_SIZE_IN_BYTES - 1 - 2); /* mark the end */ p[SHA384_DIGEST_SIZE_IN_BYTES] = 0x80; /* hmac outer block length always of fixed size, * it is OKey length, a whole message block length, 1024 bits, * with padding plus the length of the inner digest, * which is 384 bits, 1408 bits == 0x0580. * The input message block needs to be converted to big endian * within the sha implementation before use. 
*/ p[SHA_384_BLOCK_SIZE - 2] = 0x05; p[SHA_384_BLOCK_SIZE - 1] = 0x80; } /* Init HMAC/SHA512 out-of-order fields */ state->hmac_sha_512_ooo.lens[0] = 0; state->hmac_sha_512_ooo.lens[1] = 0; state->hmac_sha_512_ooo.lens[2] = 0xFFFF; state->hmac_sha_512_ooo.lens[3] = 0xFFFF; state->hmac_sha_512_ooo.lens[4] = 0xFFFF; state->hmac_sha_512_ooo.lens[5] = 0xFFFF; state->hmac_sha_512_ooo.lens[6] = 0xFFFF; state->hmac_sha_512_ooo.lens[7] = 0xFFFF; state->hmac_sha_512_ooo.unused_lanes = 0xFF0100; for (j = 0; j < AVX_NUM_SHA512_LANES; j++) { MB_MGR_HMAC_SHA_512_OOO *ctx = &state->hmac_sha_512_ooo; ctx->ldata[j].job_in_lane = NULL; ctx->ldata[j].extra_block[SHA_512_BLOCK_SIZE] = 0x80; memset(ctx->ldata[j].extra_block + (SHA_512_BLOCK_SIZE + 1), 0x00, SHA_512_BLOCK_SIZE + 7); p = ctx->ldata[j].outer_block; memset(p + SHA512_DIGEST_SIZE_IN_BYTES + 1, 0x00, /* special end point because this length is constant */ SHA_512_BLOCK_SIZE - SHA512_DIGEST_SIZE_IN_BYTES - 1 - 2); /* mark the end */ p[SHA512_DIGEST_SIZE_IN_BYTES] = 0x80; /* * hmac outer block length always of fixed size, * it is OKey length, a whole message block length, 1024 bits, * with padding plus the length of the inner digest, * which is 512 bits, 1536 bits == 0x600. * The input message block needs to be converted to big endian * within the sha implementation before use. */ p[SHA_512_BLOCK_SIZE - 2] = 0x06; p[SHA_512_BLOCK_SIZE - 1] = 0x00; } /* Init HMAC/MD5 out-of-order fields */ state->hmac_md5_ooo.lens[0] = 0; state->hmac_md5_ooo.lens[1] = 0; state->hmac_md5_ooo.lens[2] = 0; state->hmac_md5_ooo.lens[3] = 0; state->hmac_md5_ooo.lens[4] = 0; state->hmac_md5_ooo.lens[5] = 0; state->hmac_md5_ooo.lens[6] = 0; state->hmac_md5_ooo.lens[7] = 0; state->hmac_md5_ooo.lens[8] = 0xFFFF; state->hmac_md5_ooo.lens[9] = 0xFFFF; state->hmac_md5_ooo.lens[10] = 0xFFFF; state->hmac_md5_ooo.lens[11] = 0xFFFF; state->hmac_md5_ooo.lens[12] = 0xFFFF; state->hmac_md5_ooo.lens[13] = 0xFFFF; state->hmac_md5_ooo.lens[14] = 0xFFFF; state->hmac_md5_ooo.lens[15] = 0xFFFF; state->hmac_md5_ooo.unused_lanes = 0xF76543210; for (j = 0; j < AVX_NUM_MD5_LANES; j++) { state->hmac_md5_ooo.ldata[j].job_in_lane = NULL; state->hmac_md5_ooo.ldata[j].extra_block[64] = 0x80; memset(state->hmac_md5_ooo.ldata[j].extra_block + 65, 0x00, 64 + 7); p = state->hmac_md5_ooo.ldata[j].outer_block; memset(p + 5*4 + 1, 0x00, 64 - 5*4 - 1 - 2); p[4 * 4] = 0x80; p[64 - 7] = 0x02; p[64 - 8] = 0x80; } /* Init AES/XCBC OOO fields */ state->aes_xcbc_ooo.lens[0] = 0; state->aes_xcbc_ooo.lens[1] = 0; state->aes_xcbc_ooo.lens[2] = 0; state->aes_xcbc_ooo.lens[3] = 0; state->aes_xcbc_ooo.lens[4] = 0; state->aes_xcbc_ooo.lens[5] = 0; state->aes_xcbc_ooo.lens[6] = 0; state->aes_xcbc_ooo.lens[7] = 0; state->aes_xcbc_ooo.unused_lanes = 0xF76543210; for (j = 0; j < 8; j++) { state->aes_xcbc_ooo.ldata[j].job_in_lane = NULL; state->aes_xcbc_ooo.ldata[j].final_block[16] = 0x80; memset(state->aes_xcbc_ooo.ldata[j].final_block + 17, 0x00, 15); } /* Init AES-CCM auth out-of-order fields */ for (j = 0; j < 8; j++) { state->aes_ccm_ooo.init_done[j] = 0; state->aes_ccm_ooo.lens[j] = 0; state->aes_ccm_ooo.job_in_lane[j] = NULL; } state->aes_ccm_ooo.unused_lanes = 0xF76543210; /* Init "in order" components */ state->next_job = 0; state->earliest_job = -1; /* set AVX handlers */ state->get_next_job = get_next_job_avx; state->submit_job = submit_job_avx; state->submit_job_nocheck = submit_job_nocheck_avx; state->get_completed_job = get_completed_job_avx; state->flush_job = flush_job_avx; state->queue_size = queue_size_avx; 
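	/*
	 * The handlers above bind the generic MB_MGR interface to the AVX
	 * code paths. Illustrative usage sketch only (job field setup and
	 * error handling omitted; see the library test/perf applications for
	 * complete examples):
	 *
	 *   MB_MGR mgr;
	 *   JOB_AES_HMAC *job;
	 *
	 *   init_mb_mgr_avx(&mgr);
	 *   job = mgr.get_next_job(&mgr);
	 *   // ... fill in cipher/hash mode, keys, src/dst and lengths ...
	 *   job = mgr.submit_job(&mgr);   // may return a completed job or NULL
	 *   while (job != NULL) {
	 *           // check job->status before consuming the output
	 *           job = mgr.get_completed_job(&mgr);
	 *   }
	 *   while ((job = mgr.flush_job(&mgr)) != NULL)
	 *           ;   // drain jobs still held in partially filled lanes
	 */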
state->keyexp_128 = aes_keyexp_128_avx; state->keyexp_192 = aes_keyexp_192_avx; state->keyexp_256 = aes_keyexp_256_avx; } #include "mb_mgr_code.h" intel-ipsec-mb-0.48/avx/mb_mgr_hmac_flush_avx.asm000066400000000000000000000170211321406316400220700ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
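;;
;; flush_job_hmac_avx() completes HMAC-SHA1 jobs still pending in the
;; out-of-order manager: it picks a lane that holds a job, copies that lane's
;; data pointer into every empty lane and forces the empty lanes' lengths to
;; 0xFFFF so they are never selected as the minimum, then repeatedly runs
;; sha1_mult_avx on the shortest lane until the chosen job finishes and can
;; be returned.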
;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" extern sha1_mult_avx section .data default rel align 16 byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 dq 0x0405060700010203, 0x0c0d0e0f08090a0b x80: ;ddq 0x00000000000000000000000000000080 dq 0x0000000000000080, 0x0000000000000000 x00: ;ddq 0x00000000000000000000000000000000 dq 0x0000000000000000, 0x0000000000000000 len_masks: ;ddq 0x0000000000000000000000000000FFFF dq 0x000000000000FFFF, 0x0000000000000000 ;ddq 0x000000000000000000000000FFFF0000 dq 0x00000000FFFF0000, 0x0000000000000000 ;ddq 0x00000000000000000000FFFF00000000 dq 0x0000FFFF00000000, 0x0000000000000000 ;ddq 0x0000000000000000FFFF000000000000 dq 0xFFFF000000000000, 0x0000000000000000 one: dq 1 two: dq 2 three: dq 3 section .text %if 1 %ifdef LINUX %define arg1 rdi %define arg2 rsi %else %define arg1 rcx %define arg2 rdx %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbx, rbp, r12-r15 %define idx rbp %define unused_lanes rbx %define lane_data rbx %define tmp2 rbx %define job_rax rax %define tmp1 rax %define size_offset rax %define tmp rax %define start_offset rax %define tmp3 arg1 %define extra_blocks arg2 %define p arg2 %define tmp4 r8 %endif ; This routine clobbers rbx, rbp struc STACK _gpr_save: resq 2 _rsp_save: resq 1 endstruc %define APPEND(a,b) a %+ b ; JOB* flush_job_hmac_avx(MB_MGR_HMAC_SHA_1_OOO *state) ; arg 1 : rcx : state MKGLOBAL(flush_job_hmac_avx,function,internal) flush_job_hmac_avx: mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _rsp_save], rax ; original SP mov unused_lanes, [state + _unused_lanes] bt unused_lanes, 32+7 jc return_null ; find a lane with a non-null job xor idx, idx cmp qword [state + _ldata + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 cmovne idx, [rel one] cmp qword [state + _ldata + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 cmovne idx, [rel two] cmp qword [state + _ldata + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 cmovne idx, [rel three] copy_lane_data: ; copy valid lane (idx) to empty lanes vmovdqa xmm0, [state + _lens] mov tmp, [state + _args_data_ptr + PTR_SZ*idx] %assign I 0 %rep 4 cmp qword [state + _ldata + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 jne APPEND(skip_,I) mov [state + _args_data_ptr + PTR_SZ*I], tmp vpor xmm0, xmm0, [rel len_masks + 16*I] APPEND(skip_,I): %assign I (I+1) %endrep vmovdqa [state + _lens], xmm0 vphminposuw xmm1, xmm0 vpextrw DWORD(len2), xmm1, 0 ; min value vpextrw DWORD(idx), xmm1, 1 ; min index (0...3) cmp len2, 0 je len_is_0 vpshuflw xmm1, xmm1, 0 vpsubw xmm0, xmm0, xmm1 vmovdqa [state + _lens], xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call sha1_mult_avx ; state is intact len_is_0: ; process completed job "idx" imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done], 1 mov DWORD(size_offset), [lane_data + _size_offset] mov qword [lane_data + _extra_block + size_offset], 0 mov word [state + _lens + 2*idx], 1 lea tmp, [lane_data + _outer_block] mov job, [lane_data + _job_in_lane] mov [state + _args_data_ptr + PTR_SZ*idx], tmp ;; idx determines which column ;; read off from consecutive rows vmovd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 
0*SHA1_DIGEST_ROW_SIZE] vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1 vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2 vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3 vpshufb xmm0, xmm0, [rel byteswap] mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE] bswap DWORD(tmp) vmovdqa [lane_data + _outer_block], xmm0 mov [lane_data + _outer_block + 4*4], DWORD(tmp) mov tmp, [job + _auth_key_xor_opad] vmovdqu xmm0, [tmp] mov DWORD(tmp), [tmp + 4*4] vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3 mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp) jmp copy_lane_data align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset] mov [state + _lens + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr + PTR_SZ*idx], tmp mov dword [lane_data + _extra_blocks], 0 jmp copy_lane_data return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane] mov qword [lane_data + _job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC mov unused_lanes, [state + _unused_lanes] shl unused_lanes, 8 or unused_lanes, idx mov [state + _unused_lanes], unused_lanes mov p, [job_rax + _auth_tag_output] ; copy 12 bytes mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] mov DWORD(tmp4), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE] mov DWORD(tmp3), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE] bswap DWORD(tmp2) bswap DWORD(tmp4) bswap DWORD(tmp3) mov [p + 0*4], DWORD(tmp2) mov [p + 1*4], DWORD(tmp4) mov [p + 2*4], DWORD(tmp3) return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/avx/mb_mgr_hmac_md5_flush_avx.asm000066400000000000000000000210341321406316400226340ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" extern md5_x4x2_avx section .data default rel align 16 dupw: ;ddq 0x01000100010001000100010001000100 dq 0x0100010001000100, 0x0100010001000100 x80: ;ddq 0x00000000000000000000000000000080 dq 0x0000000000000080, 0x0000000000000000 x00: ;ddq 0x00000000000000000000000000000000 dq 0x0000000000000000, 0x0000000000000000 len_masks: ;ddq 0x0000000000000000000000000000FFFF dq 0x000000000000FFFF, 0x0000000000000000 ;ddq 0x000000000000000000000000FFFF0000 dq 0x00000000FFFF0000, 0x0000000000000000 ;ddq 0x00000000000000000000FFFF00000000 dq 0x0000FFFF00000000, 0x0000000000000000 ;ddq 0x0000000000000000FFFF000000000000 dq 0xFFFF000000000000, 0x0000000000000000 ;ddq 0x000000000000FFFF0000000000000000 dq 0x0000000000000000, 0x000000000000FFFF ;ddq 0x00000000FFFF00000000000000000000 dq 0x0000000000000000, 0x00000000FFFF0000 ;ddq 0x0000FFFF000000000000000000000000 dq 0x0000000000000000, 0x0000FFFF00000000 ;ddq 0xFFFF0000000000000000000000000000 dq 0x0000000000000000, 0xFFFF000000000000 one: dq 1 two: dq 2 three: dq 3 four: dq 4 five: dq 5 six: dq 6 seven: dq 7 section .text %if 1 %ifdef LINUX %define arg1 rdi %define arg2 rsi %else %define arg1 rcx %define arg2 rdx %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbp %define idx rbp ; unused_lanes must be in rax-rdx %define unused_lanes rbx %define lane_data rbx %define tmp2 rbx %define job_rax rax %define tmp1 rax %define size_offset rax %define tmp rax %define start_offset rax %define tmp3 arg1 %define extra_blocks arg2 %define p arg2 %define tmp4 r8 %endif ; This routine and/or the called routine clobbers all GPRs struc STACK _gpr_save: resq 8 _rsp_save: resq 1 endstruc %define APPEND(a,b) a %+ b ; JOB* flush_job_hmac_md5_avx(MB_MGR_HMAC_MD5_OOO *state) ; arg 1 : rcx : state MKGLOBAL(flush_job_hmac_md5_avx,function,internal) flush_job_hmac_md5_avx: mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _gpr_save + 8*2], r12 mov [rsp + _gpr_save + 8*3], r13 mov [rsp + _gpr_save + 8*4], r14 mov [rsp + _gpr_save + 8*5], r15 %ifndef LINUX mov [rsp + _gpr_save + 8*6], rsi mov [rsp + _gpr_save + 8*7], rdi %endif mov [rsp + _rsp_save], rax ; original SP mov unused_lanes, [state + _unused_lanes_md5] bt unused_lanes, 32+3 jc return_null ; find a lane with a non-null job xor idx, idx cmp qword [state + _ldata_md5 + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 cmovne idx, [rel one] cmp qword [state + _ldata_md5 + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 cmovne idx, [rel two] cmp qword [state + _ldata_md5 + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 cmovne idx, [rel three] cmp qword [state + _ldata_md5 + 4 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 cmovne idx, [rel four] cmp qword [state + _ldata_md5 + 5 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 cmovne idx, [rel five] cmp qword [state + _ldata_md5 + 6 * 
_HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 cmovne idx, [rel six] cmp qword [state + _ldata_md5 + 7 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 cmovne idx, [rel seven] copy_lane_data: ; copy good lane (idx) to empty lanes vmovdqa xmm0, [state + _lens_md5] mov tmp, [state + _args_data_ptr_md5 + PTR_SZ*idx] %assign I 0 %rep 8 cmp qword [state + _ldata_md5 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 jne APPEND(skip_,I) mov [state + _args_data_ptr_md5 + PTR_SZ*I], tmp vpor xmm0, xmm0, [rel len_masks + 16*I] APPEND(skip_,I): %assign I (I+1) %endrep vmovdqa [state + _lens_md5], xmm0 vphminposuw xmm1, xmm0 vpextrw DWORD(len2), xmm1, 0 ; min value vpextrw DWORD(idx), xmm1, 1 ; min index (0...3) cmp len2, 0 je len_is_0 vpshufb xmm1, [rel dupw] ; duplicate words across all lanes vpsubw xmm0, xmm0, xmm1 vmovdqa [state + _lens_md5], xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call md5_x4x2_avx ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata_md5 + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done], 1 mov DWORD(size_offset), [lane_data + _size_offset] mov qword [lane_data + _extra_block + size_offset], 0 mov word [state + _lens_md5 + 2*idx], 1 lea tmp, [lane_data + _outer_block] mov job, [lane_data + _job_in_lane] mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp vmovd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1 vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2 vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3 vmovdqa [lane_data + _outer_block], xmm0 mov tmp, [job + _auth_key_xor_opad] vmovdqu xmm0, [tmp] vmovd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0 vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1 vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2 vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3 jmp copy_lane_data align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset] mov [state + _lens_md5 + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp mov dword [lane_data + _extra_blocks], 0 jmp copy_lane_data return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane] mov qword [lane_data + _job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC mov unused_lanes, [state + _unused_lanes_md5] shl unused_lanes, 4 or unused_lanes, idx mov [state + _unused_lanes_md5], unused_lanes mov p, [job_rax + _auth_tag_output] ; copy 12 bytes mov DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] mov DWORD(tmp4), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE] mov DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE] ; bswap DWORD(tmp2) ; bswap DWORD(tmp4) ; bswap DWORD(tmp3) mov [p + 0*4], DWORD(tmp2) mov [p + 1*4], DWORD(tmp4) mov [p + 2*4], DWORD(tmp3) return: mov rbx, [rsp + _gpr_save 
+ 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov r12, [rsp + _gpr_save + 8*2] mov r13, [rsp + _gpr_save + 8*3] mov r14, [rsp + _gpr_save + 8*4] mov r15, [rsp + _gpr_save + 8*5] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*6] mov rdi, [rsp + _gpr_save + 8*7] %endif mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/avx/mb_mgr_hmac_md5_submit_avx.asm000066400000000000000000000240661321406316400230260ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
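;;
;; submit_job_hmac_md5_avx() queues a job into one of the eight MD5 lanes:
;; it takes the lowest free lane index from unused_lanes, copies the tail of
;; the message into the lane's extra_block for padding and loads the
;; key-XOR-ipad state as the starting digest.  md5_x4x2_avx is only invoked
;; once every lane is occupied (unused_lanes == 0xF); otherwise NULL is
;; returned and the job completes during a later submit or flush call.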
;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "memcpy.asm" %include "reg_sizes.asm" extern md5_x4x2_avx section .data default rel align 16 dupw: ;ddq 0x01000100010001000100010001000100 dq 0x0100010001000100, 0x0100010001000100 section .text %if 1 %ifdef LINUX %define arg1 rdi %define arg2 rsi %define reg3 rcx %define reg4 rdx %else %define arg1 rcx %define arg2 rdx %define reg3 rdi %define reg4 rsi %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbp %define last_len rbp %define idx rbp %define p r11 %define start_offset r11 %define unused_lanes rbx %define tmp4 rbx %define job_rax rax %define len rax %define size_offset reg3 %define tmp2 reg3 %define lane reg4 %define tmp3 reg4 %define extra_blocks r8 %define tmp r9 %define p2 r9 %define lane_data r10 %endif ; This routine and/or the called routine clobbers all GPRs struc STACK _gpr_save: resq 8 _rsp_save: resq 1 endstruc ; JOB* submit_job_hmac_md5_avx(MB_MGR_HMAC_MD5_OOO *state, JOB_AES_HMAC *job) ; arg 1 : rcx : state ; arg 2 : rdx : job MKGLOBAL(submit_job_hmac_md5_avx,function,internal) submit_job_hmac_md5_avx: mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _gpr_save + 8*2], r12 mov [rsp + _gpr_save + 8*3], r13 mov [rsp + _gpr_save + 8*4], r14 mov [rsp + _gpr_save + 8*5], r15 %ifndef LINUX mov [rsp + _gpr_save + 8*6], rsi mov [rsp + _gpr_save + 8*7], rdi %endif mov [rsp + _rsp_save], rax ; original SP mov unused_lanes, [state + _unused_lanes_md5] mov lane, unused_lanes and lane, 0xF shr unused_lanes, 4 imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata_md5 + lane_data] mov [state + _unused_lanes_md5], unused_lanes mov len, [job + _msg_len_to_hash_in_bytes] mov tmp, len shr tmp, 6 ; divide by 64, len in terms of blocks mov [lane_data + _job_in_lane], job mov dword [lane_data + _outer_done], 0 mov [state + _lens_md5 + 2*lane], WORD(tmp) mov last_len, len and last_len, 63 lea extra_blocks, [last_len + 9 + 63] shr extra_blocks, 6 mov [lane_data + _extra_blocks], DWORD(extra_blocks) mov p, [job + _src] add p, [job + _hash_start_src_offset_in_bytes] mov [state + _args_data_ptr_md5 + PTR_SZ*lane], p cmp len, 64 jb copy_lt64 fast_copy: add p, len vmovdqu xmm0, [p - 64 + 0*16] vmovdqu xmm1, [p - 64 + 1*16] vmovdqu xmm2, [p - 64 + 2*16] vmovdqu xmm3, [p - 64 + 3*16] vmovdqa [lane_data + _extra_block + 0*16], xmm0 vmovdqa [lane_data + _extra_block + 1*16], xmm1 vmovdqa [lane_data + _extra_block + 2*16], xmm2 vmovdqa [lane_data + _extra_block + 3*16], xmm3 end_fast_copy: mov size_offset, extra_blocks shl size_offset, 6 sub size_offset, last_len add size_offset, 64-8 mov [lane_data + _size_offset], DWORD(size_offset) mov start_offset, 64 sub start_offset, last_len mov [lane_data + _start_offset], DWORD(start_offset) lea tmp, [8*64 + 8*len] ; bswap tmp mov [lane_data + _extra_block + size_offset], tmp mov tmp, [job + _auth_key_xor_ipad] vmovdqu xmm0, [tmp] vmovd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 0*MD5_DIGEST_ROW_SIZE], xmm0 vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1 vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2 vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3 test len, ~63 jnz ge64_bytes lt64_bytes: mov [state + _lens_md5 + 2*lane], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] 
mov [state + _args_data_ptr_md5 + PTR_SZ*lane], tmp mov dword [lane_data + _extra_blocks], 0 ge64_bytes: cmp unused_lanes, 0xf jne return_null jmp start_loop align 16 start_loop: ; Find min length vmovdqa xmm0, [state + _lens_md5] vphminposuw xmm1, xmm0 vpextrw DWORD(len2), xmm1, 0 ; min value vpextrw DWORD(idx), xmm1, 1 ; min index (0...3) cmp len2, 0 je len_is_0 vpshufb xmm1, xmm1, [rel dupw] ; duplicate words across all lanes vpsubw xmm0, xmm0, xmm1 vmovdqa [state + _lens_md5], xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call md5_x4x2_avx ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata_md5 + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done], 1 mov DWORD(size_offset), [lane_data + _size_offset] mov qword [lane_data + _extra_block + size_offset], 0 mov word [state + _lens_md5 + 2*idx], 1 lea tmp, [lane_data + _outer_block] mov job, [lane_data + _job_in_lane] mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp vmovd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1 vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2 vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3 ; vpshufb xmm0, [byteswap wrt rip] vmovdqa [lane_data + _outer_block], xmm0 mov tmp, [job + _auth_key_xor_opad] vmovdqu xmm0, [tmp] vmovd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0 vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1 vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2 vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3 jmp start_loop align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset] mov [state + _lens_md5 + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp mov dword [lane_data + _extra_blocks], 0 jmp start_loop align 16 copy_lt64: ;; less than one message block of data ;; beginning of source block ;; destination extrablock but backwards by len from where 0x80 pre-populated ;; p2 clobbers unused_lanes, undo before exiting lea p2, [lane_data + _extra_block + 64] sub p2, len memcpy_avx_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 mov unused_lanes, [state + _unused_lanes_md5] jmp end_fast_copy return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane] mov unused_lanes, [state + _unused_lanes_md5] mov qword [lane_data + _job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC shl unused_lanes, 4 or unused_lanes, idx mov [state + _unused_lanes_md5], unused_lanes mov p, [job_rax + _auth_tag_output] ; copy 12 bytes mov DWORD(tmp), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] mov DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE] mov DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE] mov [p + 0*4], DWORD(tmp) mov [p + 1*4], DWORD(tmp2) mov [p + 2*4], DWORD(tmp3) return: mov rbx, [rsp + 
_gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov r12, [rsp + _gpr_save + 8*2] mov r13, [rsp + _gpr_save + 8*3] mov r14, [rsp + _gpr_save + 8*4] mov r15, [rsp + _gpr_save + 8*5] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*6] mov rdi, [rsp + _gpr_save + 8*7] %endif mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/avx/mb_mgr_hmac_sha_224_flush_avx.asm000066400000000000000000000032221321406316400233100ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %define FUNC flush_job_hmac_sha_224_avx %define SHA224 %include "mb_mgr_hmac_sha_256_flush_avx.asm" intel-ipsec-mb-0.48/avx/mb_mgr_hmac_sha_224_submit_avx.asm000066400000000000000000000032241321406316400234740ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %define FUNC submit_job_hmac_sha_224_avx %define SHA224 %include "mb_mgr_hmac_sha_256_submit_avx.asm" intel-ipsec-mb-0.48/avx/mb_mgr_hmac_sha_256_flush_avx.asm000066400000000000000000000203261321406316400233210ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
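;;
;; This file implements flush_job_hmac_sha_256_avx and, when included from
;; mb_mgr_hmac_sha_224_flush_avx.asm with FUNC and SHA224 pre-defined,
;; flush_job_hmac_sha_224_avx.  Under SHA224 only seven digest words are
;; byte-swapped into the outer block (the eighth position gets the 0x80
;; padding marker) and 14 rather than 16 tag bytes are copied out.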
;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" extern sha_256_mult_avx section .data default rel align 16 byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 dq 0x0405060700010203, 0x0c0d0e0f08090a0b len_masks: ;ddq 0x0000000000000000000000000000FFFF dq 0x000000000000FFFF, 0x0000000000000000 ;ddq 0x000000000000000000000000FFFF0000 dq 0x00000000FFFF0000, 0x0000000000000000 ;ddq 0x00000000000000000000FFFF00000000 dq 0x0000FFFF00000000, 0x0000000000000000 ;ddq 0x0000000000000000FFFF000000000000 dq 0xFFFF000000000000, 0x0000000000000000 one: dq 1 two: dq 2 three: dq 3 section .text %ifndef FUNC %define FUNC flush_job_hmac_sha_256_avx %endif %if 1 %ifdef LINUX %define arg1 rdi %define arg2 rsi %else %define arg1 rcx %define arg2 rdx %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbx, rbp, r13-r15 %define idx rbp %define unused_lanes rbx %define lane_data rbx %define tmp2 rbx %define job_rax rax %define tmp1 rax %define size_offset rax %define tmp rax %define start_offset rax %define tmp3 arg1 %define extra_blocks arg2 %define p arg2 %define tmp4 r8 %define tmp5 r9 %define tmp6 r10 %endif ; This routine clobbers rbx, rbp; called routine also clobbers r12 struc STACK _gpr_save: resq 3 _rsp_save: resq 1 endstruc %define APPEND(a,b) a %+ b ; JOB* FUNC(MB_MGR_HMAC_SHA_256_OOO *state) ; arg 1 : rcx : state MKGLOBAL(FUNC,function,internal) FUNC: mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _gpr_save + 8*2], r12 mov [rsp + _rsp_save], rax ; original SP mov unused_lanes, [state + _unused_lanes_sha256] bt unused_lanes, 32+7 jc return_null ; find a lane with a non-null job xor idx, idx cmp qword [state + _ldata_sha256 + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 cmovne idx, [rel one] cmp qword [state + _ldata_sha256 + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 cmovne idx, [rel two] cmp qword [state + _ldata_sha256 + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 cmovne idx, [rel three] copy_lane_data: ; copy idx to empty lanes vmovdqa xmm0, [state + _lens_sha256] mov tmp, [state + _args_data_ptr_sha256 + 8*idx] %assign I 0 %rep 4 cmp qword [state + _ldata_sha256 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 jne APPEND(skip_,I) mov [state + _args_data_ptr_sha256 + 8*I], tmp vpor xmm0, xmm0, [rel len_masks + 16*I] APPEND(skip_,I): %assign I (I+1) %endrep vmovdqa [state + _lens_sha256], xmm0 vphminposuw xmm1, xmm0 vpextrw DWORD(len2), xmm1, 0 ; min value vpextrw DWORD(idx), xmm1, 1 ; min index (0...3) cmp len2, 0 je len_is_0 vpshuflw xmm1, xmm1, 0 vpsubw xmm0, xmm0, xmm1 vmovdqa [state + _lens_sha256], xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call sha_256_mult_avx ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata_sha256 + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done], 1 mov DWORD(size_offset), [lane_data + _size_offset] mov qword [lane_data + _extra_block + size_offset], 0 mov word [state + _lens_sha256 + 2*idx], 1 lea tmp, [lane_data + _outer_block] mov job, [lane_data + _job_in_lane] mov [state + _args_data_ptr_sha256 + 8*idx], tmp vmovd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] vpinsrd xmm0, xmm0, [state + 
_args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1 vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2 vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3 vpshufb xmm0, xmm0, [rel byteswap] vmovd xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE] vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1 vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2 %ifndef SHA224 vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3 %endif vpshufb xmm1, xmm1, [rel byteswap] vmovdqa [lane_data + _outer_block], xmm0 vmovdqa [lane_data + _outer_block + 4*4], xmm1 %ifdef SHA224 mov dword [lane_data + _outer_block + 7*4], 0x80 %endif mov tmp, [job + _auth_key_xor_opad] vmovdqu xmm0, [tmp] vmovdqu xmm1, [tmp + 4*4] vmovd [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0 vpextrd [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1 vpextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2 vpextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3 vmovd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1 vpextrd [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1 vpextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2 vpextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3 jmp copy_lane_data align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset] mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr_sha256 + 8*idx], tmp mov dword [lane_data + _extra_blocks], 0 jmp copy_lane_data return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane] mov qword [lane_data + _job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC mov unused_lanes, [state + _unused_lanes_sha256] shl unused_lanes, 8 or unused_lanes, idx mov [state + _unused_lanes_sha256], unused_lanes mov p, [job_rax + _auth_tag_output] ; copy 14 bytes for SHA224 and 16 bytes for SHA256 mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE] mov DWORD(tmp6), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE] mov DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE] bswap DWORD(tmp2) bswap DWORD(tmp4) bswap DWORD(tmp6) bswap DWORD(tmp5) mov [p + 0*4], DWORD(tmp2) mov [p + 1*4], DWORD(tmp4) mov [p + 2*4], DWORD(tmp6) %ifdef SHA224 mov [p + 3*4], WORD(tmp5) %else mov [p + 3*4], DWORD(tmp5) %endif return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov r12, [rsp + _gpr_save + 8*2] mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/avx/mb_mgr_hmac_sha_256_submit_avx.asm000066400000000000000000000242121321406316400235010ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. 
;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" %include "memcpy.asm" extern sha_256_mult_avx section .data default rel align 16 byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 dq 0x0405060700010203, 0x0c0d0e0f08090a0b section .text %ifndef FUNC %define FUNC submit_job_hmac_sha_256_avx %endif %if 1 %ifdef LINUX %define arg1 rdi %define arg2 rsi %define reg3 rcx %define reg4 rdx %else %define arg1 rcx %define arg2 rdx %define reg3 rdi %define reg4 rsi %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbx, rbp, r13-r15 %define last_len rbp %define idx rbp %define p r11 %define start_offset r11 %define unused_lanes rbx %define tmp4 rbx %define job_rax rax %define len rax %define size_offset reg3 %define tmp2 reg3 %define lane reg4 %define tmp3 reg4 %define extra_blocks r8 %define tmp r9 %define p2 r9 %define lane_data r10 %endif ; This routine clobbers rbx, rbp, rsi, rdi; called routine also clobbers r12 struc STACK _gpr_save: resq 5 _rsp_save: resq 1 endstruc ; JOB* FUNC(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job) ; arg 1 : rcx : state ; arg 2 : rdx : job MKGLOBAL(FUNC,function,internal) FUNC: mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _gpr_save + 8*2], r12 %ifndef LINUX mov [rsp + _gpr_save + 8*3], rsi mov [rsp + _gpr_save + 8*4], rdi %endif mov [rsp + _rsp_save], rax ; original SP mov unused_lanes, [state + _unused_lanes_sha256] movzx lane, BYTE(unused_lanes) shr unused_lanes, 8 imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata_sha256 + lane_data] mov [state + _unused_lanes_sha256], unused_lanes mov len, [job + _msg_len_to_hash_in_bytes] mov tmp, len shr tmp, 6 ; divide by 64, len in terms of blocks mov [lane_data + _job_in_lane], job mov dword [lane_data + _outer_done], 0 mov [state + _lens_sha256 + 2*lane], WORD(tmp) mov last_len, len and last_len, 63 lea extra_blocks, [last_len + 9 + 63] shr extra_blocks, 6 mov [lane_data + _extra_blocks], DWORD(extra_blocks) mov p, [job + _src] add p, [job + _hash_start_src_offset_in_bytes] mov [state + _args_data_ptr_sha256 + 8*lane], p cmp len, 64 jb copy_lt64 fast_copy: add p, len vmovdqu xmm0, [p - 64 + 0*16] vmovdqu xmm1, [p - 64 + 1*16] vmovdqu xmm2, [p - 64 + 2*16] 
vmovdqu xmm3, [p - 64 + 3*16] vmovdqa [lane_data + _extra_block + 0*16], xmm0 vmovdqa [lane_data + _extra_block + 1*16], xmm1 vmovdqa [lane_data + _extra_block + 2*16], xmm2 vmovdqa [lane_data + _extra_block + 3*16], xmm3 end_fast_copy: mov size_offset, extra_blocks shl size_offset, 6 sub size_offset, last_len add size_offset, 64-8 mov [lane_data + _size_offset], DWORD(size_offset) mov start_offset, 64 sub start_offset, last_len mov [lane_data + _start_offset], DWORD(start_offset) lea tmp, [8*64 + 8*len] bswap tmp mov [lane_data + _extra_block + size_offset], tmp mov tmp, [job + _auth_key_xor_ipad] vmovdqu xmm0, [tmp] vmovdqu xmm1, [tmp + 4*4] vmovd [state + _args_digest_sha256 + 4*lane + 0*SHA256_DIGEST_ROW_SIZE], xmm0 vpextrd [state + _args_digest_sha256 + 4*lane + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1 vpextrd [state + _args_digest_sha256 + 4*lane + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2 vpextrd [state + _args_digest_sha256 + 4*lane + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3 vmovd [state + _args_digest_sha256 + 4*lane + 4*SHA256_DIGEST_ROW_SIZE], xmm1 vpextrd [state + _args_digest_sha256 + 4*lane + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1 vpextrd [state + _args_digest_sha256 + 4*lane + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2 vpextrd [state + _args_digest_sha256 + 4*lane + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3 test len, ~63 jnz ge64_bytes lt64_bytes: mov [state + _lens_sha256 + 2*lane], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr_sha256 + 8*lane], tmp mov dword [lane_data + _extra_blocks], 0 ge64_bytes: cmp unused_lanes, 0xff jne return_null jmp start_loop align 16 start_loop: ; Find min length vmovdqa xmm0, [state + _lens_sha256] vphminposuw xmm1, xmm0 vpextrw DWORD(len2), xmm1, 0 ; min value vpextrw DWORD(idx), xmm1, 1 ; min index (0...3) cmp len2, 0 je len_is_0 vpshuflw xmm1, xmm1, 0 vpsubw xmm0, xmm0, xmm1 vmovdqa [state + _lens_sha256], xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call sha_256_mult_avx ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata_sha256 + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done], 1 mov DWORD(size_offset), [lane_data + _size_offset] mov qword [lane_data + _extra_block + size_offset], 0 mov word [state + _lens_sha256 + 2*idx], 1 lea tmp, [lane_data + _outer_block] mov job, [lane_data + _job_in_lane] mov [state + _args_data_ptr_sha256 + 8*idx], tmp vmovd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1 vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2 vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3 vpshufb xmm0, xmm0, [rel byteswap] vmovd xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE] vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1 vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2 %ifndef SHA224 vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3 %endif vpshufb xmm1, xmm1, [rel byteswap] vmovdqa [lane_data + _outer_block], xmm0 vmovdqa [lane_data + _outer_block + 4*4], xmm1 %ifdef SHA224 mov dword [lane_data + _outer_block + 7*4], 0x80 %endif 
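	;; The inner digest stored in outer_block above becomes the message
	;; for the outer hash: reload the lane digest with the pre-computed
	;; key-XOR-opad state so the next sha_256_mult_avx pass produces the
	;; final HMAC value.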
mov tmp, [job + _auth_key_xor_opad] vmovdqu xmm0, [tmp] vmovdqu xmm1, [tmp + 4*4] vmovd [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0 vpextrd [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1 vpextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2 vpextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3 vmovd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1 vpextrd [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1 vpextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2 vpextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3 jmp start_loop align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset] mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr_sha256 + 8*idx], tmp mov dword [lane_data + _extra_blocks], 0 jmp start_loop align 16 copy_lt64: ;; less than one message block of data ;; beginning of source block ;; destination extrablock but backwards by len from where 0x80 pre-populated ;; p2 clobbers unused_lanes, undo before exit lea p2, [lane_data + _extra_block + 64] sub p2, len memcpy_avx_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 mov unused_lanes, [state + _unused_lanes_sha256] jmp end_fast_copy return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane] mov unused_lanes, [state + _unused_lanes_sha256] mov qword [lane_data + _job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC shl unused_lanes, 8 or unused_lanes, idx mov [state + _unused_lanes_sha256], unused_lanes mov p, [job_rax + _auth_tag_output] ; copy 14 bytes for SHA224 and 16 bytes for SHA256 mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE] mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE] mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE] bswap DWORD(tmp) bswap DWORD(tmp2) bswap DWORD(tmp3) bswap DWORD(tmp4) mov [p + 0*4], DWORD(tmp) mov [p + 1*4], DWORD(tmp2) mov [p + 2*4], DWORD(tmp3) %ifdef SHA224 mov [p + 3*4], WORD(tmp4) %else mov [p + 3*4], DWORD(tmp4) %endif return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov r12, [rsp + _gpr_save + 8*2] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*3] mov rdi, [rsp + _gpr_save + 8*4] %endif mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/avx/mb_mgr_hmac_sha_384_flush_avx.asm000066400000000000000000000032411321406316400233200ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. 
;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %define FUNC flush_job_hmac_sha_384_avx %define SHA_X_DIGEST_SIZE 384 %include "mb_mgr_hmac_sha_512_flush_avx.asm" intel-ipsec-mb-0.48/avx/mb_mgr_hmac_sha_384_submit_avx.asm000066400000000000000000000032431321406316400235040ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %define FUNC submit_job_hmac_sha_384_avx %define SHA_X_DIGEST_SIZE 384 %include "mb_mgr_hmac_sha_512_submit_avx.asm" intel-ipsec-mb-0.48/avx/mb_mgr_hmac_sha_512_flush_avx.asm000066400000000000000000000161021321406316400233110ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. 
;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" extern sha512_x2_avx section .data default rel align 16 byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607 dq 0x0001020304050607, 0x08090a0b0c0d0e0f len_masks: ;ddq 0x0000000000000000000000000000FFFF dq 0x000000000000FFFF, 0x0000000000000000 ;ddq 0x000000000000000000000000FFFF0000 dq 0x00000000FFFF0000, 0x0000000000000000 one: dq 1 section .text %ifndef FUNC %define FUNC flush_job_hmac_sha_512_avx %define SHA_X_DIGEST_SIZE 512 %endif %if 1 %ifdef LINUX %define arg1 rdi %define arg2 rsi %else %define arg1 rcx %define arg2 rdx %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbx, rbp, r12-r15 %define idx rbp %define unused_lanes rbx %define lane_data rbx %define tmp2 rbx %define job_rax rax %define tmp1 rax %define size_offset rax %define tmp rax %define start_offset rax %define tmp3 arg1 %define extra_blocks arg2 %define p arg2 %define tmp4 r8 %define tmp5 r9 %define tmp6 r10 %endif ; This routine clobbers rbx, rbp struc STACK _gpr_save: resq 2 _rsp_save: resq 1 endstruc %define APPEND(a,b) a %+ b ; JOB* FUNC(MB_MGR_HMAC_SHA_512_OOO *state) ; arg 1 : rcx : state MKGLOBAL(FUNC,function,internal) FUNC: mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _rsp_save], rax ; original SP mov unused_lanes, [state + _unused_lanes_sha512] bt unused_lanes, 16+7 jc return_null ; find a lane with a non-null job xor idx, idx cmp qword [state + _ldata_sha512 + 1 * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0 cmovne idx, [rel one] copy_lane_data: ; copy good lane (idx) to empty lanes vmovdqa xmm0, [state + _lens_sha512] mov tmp, [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*idx] %assign I 0 %rep 2 cmp qword [state + _ldata_sha512 + I * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0 jne APPEND(skip_,I) mov [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*I], tmp vpor xmm0, xmm0, [rel len_masks + 16*I] APPEND(skip_,I): %assign I (I+1) %endrep vmovdqa [state + _lens_sha512], xmm0 vphminposuw xmm1, xmm0 vpextrw DWORD(len2), xmm1, 0 ; min value vpextrw DWORD(idx), xmm1, 1 ; min index (0...3) cmp len2, 0 je len_is_0 vpshuflw xmm1, xmm1, 0xA0 vpsubw xmm0, xmm0, xmm1 vmovdqa [state + _lens_sha512], xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call sha512_x2_avx ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _SHA512_LANE_DATA_size lea lane_data, [state + _ldata_sha512 + 
lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done_sha512], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done_sha512], 1 mov DWORD(size_offset), [lane_data + _size_offset_sha512] mov qword [lane_data + _extra_block_sha512 + size_offset], 0 mov word [state + _lens_sha512 + 2*idx], 1 lea tmp, [lane_data + _outer_block_sha512] mov job, [lane_data + _job_in_lane_sha512] mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp ; move digest into data location %assign I 0 %rep (SHA_X_DIGEST_SIZE / (8*16)) vmovq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE] vpinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], 1 vpshufb xmm0, [rel byteswap] vmovdqa [lane_data + _outer_block_sha512 + I * 16], xmm0 %assign I (I+1) %endrep ; move the opad key into digest mov tmp, [job + _auth_key_xor_opad] %assign I 0 %rep 4 vmovdqu xmm0, [tmp + I * 16] vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE], xmm0 vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1 %assign I (I+1) %endrep jmp copy_lane_data align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset_sha512] mov [state + _lens_sha512 + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block_sha512 + start_offset] mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp mov dword [lane_data + _extra_blocks_sha512], 0 jmp copy_lane_data return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane_sha512] mov qword [lane_data + _job_in_lane_sha512], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC mov unused_lanes, [state + _unused_lanes_sha512] shl unused_lanes, 8 or unused_lanes, idx mov [state + _unused_lanes_sha512], unused_lanes mov p, [job_rax + _auth_tag_output] ; below is the code for both SHA512 & SHA384. copy SHA512=32 bytes and SHA384=24 bytes mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE] mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE] mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE] %if (SHA_X_DIGEST_SIZE != 384) mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE] %endif bswap QWORD(tmp2) bswap QWORD(tmp4) bswap QWORD(tmp6) %if (SHA_X_DIGEST_SIZE != 384) bswap QWORD(tmp5) %endif mov [p + 0*8], QWORD(tmp2) mov [p + 1*8], QWORD(tmp4) mov [p + 2*8], QWORD(tmp6) %if (SHA_X_DIGEST_SIZE != 384) mov [p + 3*8], QWORD(tmp5) %endif return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/avx/mb_mgr_hmac_sha_512_submit_avx.asm000066400000000000000000000223171321406316400235000ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. 
;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" %include "memcpy.asm" extern sha512_x2_avx section .data default rel align 16 byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607 dq 0x0001020304050607, 0x08090a0b0c0d0e0f section .text %ifndef FUNC %define FUNC submit_job_hmac_sha_512_avx %define SHA_X_DIGEST_SIZE 512 %endif %if 1 %ifdef LINUX %define arg1 rdi %define arg2 rsi %define reg3 rcx %define reg4 rdx %else %define arg1 rcx %define arg2 rdx %define reg3 rdi %define reg4 rsi %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbx, rbp, r12-r15 %define last_len rbp %define idx rbp %define p r11 %define start_offset r11 %define unused_lanes rbx %define tmp4 rbx %define job_rax rax %define len rax %define size_offset reg3 %define tmp2 reg3 %define lane reg4 %define tmp3 reg4 %define extra_blocks r8 %define tmp r9 %define p2 r9 %define lane_data r10 %endif ; This routine clobbers rbx, rbp, rsi, rdi struc STACK _gpr_save: resq 4 _rsp_save: resq 1 endstruc ; JOB* FUNC(MB_MGR_HMAC_sha_512_OOO *state, JOB_AES_HMAC *job) ; arg 1 : rcx : state ; arg 2 : rdx : job MKGLOBAL(FUNC,function,internal) FUNC: mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp %ifndef LINUX mov [rsp + _gpr_save + 8*2], rsi mov [rsp + _gpr_save + 8*3], rdi %endif mov [rsp + _rsp_save], rax ; original SP mov unused_lanes, [state + _unused_lanes_sha512] movzx lane, BYTE(unused_lanes) shr unused_lanes, 8 imul lane_data, lane, _SHA512_LANE_DATA_size lea lane_data, [state + _ldata_sha512 + lane_data] mov [state + _unused_lanes_sha512], unused_lanes mov len, [job + _msg_len_to_hash_in_bytes] mov tmp, len shr tmp, 7 ; divide by 128, len in terms of blocks mov [lane_data + _job_in_lane_sha512], job mov dword [lane_data + _outer_done_sha512], 0 mov [state + _lens_sha512 + 2*lane], WORD(tmp) ; 2 is word size in bytes mov last_len, len and last_len, 127 lea extra_blocks, [last_len + 17 + 127] shr extra_blocks, 7 mov [lane_data + _extra_blocks_sha512], DWORD(extra_blocks) mov p, [job + _src] add p, [job + _hash_start_src_offset_in_bytes] mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], p cmp len, 128 jb copy_lt128 fast_copy: add p, len %assign I 0 %rep 2 vmovdqu xmm0, [p - 128 + I*4*16 + 0*16] vmovdqu xmm1, [p 
- 128 + I*4*16 + 1*16] vmovdqu xmm2, [p - 128 + I*4*16 + 2*16] vmovdqu xmm3, [p - 128 + I*4*16 + 3*16] vmovdqa [lane_data + _extra_block_sha512 + I*4*16 + 0*16], xmm0 vmovdqa [lane_data + _extra_block_sha512 + I*4*16 + 1*16], xmm1 vmovdqa [lane_data + _extra_block_sha512 + I*4*16 + 2*16], xmm2 vmovdqa [lane_data + _extra_block_sha512 + I*4*16 + 3*16], xmm3 %assign I (I+1) %endrep end_fast_copy: mov size_offset, extra_blocks shl size_offset, 7 sub size_offset, last_len add size_offset, 128-8 mov [lane_data + _size_offset_sha512], DWORD(size_offset) mov start_offset, 128 sub start_offset, last_len mov [lane_data + _start_offset_sha512], DWORD(start_offset) lea tmp, [8*128 + 8*len] bswap tmp mov [lane_data + _extra_block_sha512 + size_offset], tmp mov tmp, [job + _auth_key_xor_ipad] %assign I 0 %rep 4 vmovdqu xmm0, [tmp + I * 2 * 8] vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I)*SHA512_DIGEST_ROW_SIZE], xmm0 vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1 %assign I (I+1) %endrep test len, ~127 jnz ge128_bytes lt128_bytes: mov [state + _lens_sha512 + 2*lane], WORD(extra_blocks) lea tmp, [lane_data + _extra_block_sha512 + start_offset] mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], tmp ;; 8 to hold a UINT8 mov dword [lane_data + _extra_blocks_sha512], 0 ge128_bytes: cmp unused_lanes, 0xff jne return_null jmp start_loop align 16 start_loop: ; Find min length vmovdqa xmm0, [state + _lens_sha512] vphminposuw xmm1, xmm0 vpextrw DWORD(len2), xmm1, 0 ; min value vpextrw DWORD(idx), xmm1, 1 ; min index (0...1) cmp len2, 0 je len_is_0 vpshuflw xmm1, xmm1, 0xA0 vpsubw xmm0, xmm0, xmm1 vmovdqa [state + _lens_sha512], xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call sha512_x2_avx ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _SHA512_LANE_DATA_size lea lane_data, [state + _ldata_sha512 + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done_sha512], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done_sha512], 1 mov DWORD(size_offset), [lane_data + _size_offset_sha512] mov qword [lane_data + _extra_block_sha512 + size_offset], 0 mov word [state + _lens_sha512 + 2*idx], 1 lea tmp, [lane_data + _outer_block_sha512] mov job, [lane_data + _job_in_lane_sha512] mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp %assign I 0 %rep (SHA_X_DIGEST_SIZE / (8 * 16)) vmovq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE] vpinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], 1 vpshufb xmm0, [rel byteswap] vmovdqa [lane_data + _outer_block_sha512 + I * 16], xmm0 %assign I (I+1) %endrep mov tmp, [job + _auth_key_xor_opad] %assign I 0 %rep 4 vmovdqu xmm0, [tmp + I * 16] vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE], xmm0 vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1 %assign I (I+1) %endrep jmp start_loop align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset_sha512] mov [state + _lens_sha512 + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block_sha512 + start_offset] mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp ;; idx is index of shortest length message mov dword [lane_data + _extra_blocks_sha512], 0 jmp 
start_loop align 16 copy_lt128: ;; less than one message block of data ;; destination extra block but backwards by len from where 0x80 pre-populated lea p2, [lane_data + _extra_block + 128] sub p2, len memcpy_avx_128_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 mov unused_lanes, [state + _unused_lanes_sha512] jmp end_fast_copy return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane_sha512] mov unused_lanes, [state + _unused_lanes_sha512] mov qword [lane_data + _job_in_lane_sha512], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC shl unused_lanes, 8 or unused_lanes, idx mov [state + _unused_lanes_sha512], unused_lanes mov p, [job_rax + _auth_tag_output] ; below is the code for both SHA512 & SHA384. SHA512=32 bytes and SHA384=24 bytes mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE] mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE] mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE] %if (SHA_X_DIGEST_SIZE != 384) mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE] %endif bswap QWORD(tmp) bswap QWORD(tmp2) bswap QWORD(tmp3) %if (SHA_X_DIGEST_SIZE != 384) bswap QWORD(tmp4) %endif mov [p + 0*8], QWORD(tmp) mov [p + 1*8], QWORD(tmp2) mov [p + 2*8], QWORD(tmp3) %if (SHA_X_DIGEST_SIZE != 384) mov [p + 3*8], QWORD(tmp4) %endif return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*2] mov rdi, [rsp + _gpr_save + 8*3] %endif mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/avx/mb_mgr_hmac_submit_avx.asm000066400000000000000000000243211321406316400222530ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
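;; Editorial sketch (not part of the original source): the submit routine below
;; pads the message tail into the lane's extra block area.  With 64-byte SHA-1
;; blocks the number of extra blocks is
;;     extra_blocks = (last_len + 1 + 8 + 63) / 64
;; i.e. the leftover bytes plus the 0x80 pad byte plus the 8-byte big-endian
;; bit count, rounded up to a whole block.  For example, a 70-byte message
;; gives last_len = 6, extra_blocks = 1, start_offset = 58, and the bit count
;; 8*(64 + 70) is stored 8 bytes before the end of the extra region (the extra
;; 64 accounts for the K^ipad block already folded into the precomputed
;; digest).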
;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" %include "memcpy.asm" extern sha1_mult_avx section .data default rel align 16 byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 dq 0x0405060700010203, 0x0c0d0e0f08090a0b section .text %if 1 %ifdef LINUX %define arg1 rdi %define arg2 rsi %define reg3 rcx %define reg4 rdx %else %define arg1 rcx %define arg2 rdx %define reg3 rdi %define reg4 rsi %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbx, rbp, r12-r15 %define last_len rbp %define idx rbp %define p r11 %define start_offset r11 %define unused_lanes rbx %define tmp4 rbx %define job_rax rax %define len rax %define size_offset reg3 %define tmp2 reg3 %define lane reg4 %define tmp3 reg4 %define extra_blocks r8 %define tmp r9 %define p2 r9 %define lane_data r10 %endif ; This routine clobbers rdi, rsi, rbx, rbp struc STACK _gpr_save: resq 4 _rsp_save: resq 1 endstruc ; JOB* submit_job_hmac_avx(MB_MGR_HMAC_SHA_1_OOO *state, JOB_AES_HMAC *job) ; arg 1 : rcx : state ; arg 2 : rdx : job MKGLOBAL(submit_job_hmac_avx,function,internal) submit_job_hmac_avx: mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp %ifndef LINUX mov [rsp + _gpr_save + 8*2], rsi mov [rsp + _gpr_save + 8*3], rdi %endif mov [rsp + _rsp_save], rax ; original SP mov unused_lanes, [state + _unused_lanes] movzx lane, BYTE(unused_lanes) shr unused_lanes, 8 imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata + lane_data] mov [state + _unused_lanes], unused_lanes mov len, [job + _msg_len_to_hash_in_bytes] mov tmp, len shr tmp, 6 ; divide by 64, len in terms of blocks mov [lane_data + _job_in_lane], job mov dword [lane_data + _outer_done], 0 mov [state + _lens + 2*lane], WORD(tmp) mov last_len, len and last_len, 63 lea extra_blocks, [last_len + 9 + 63] shr extra_blocks, 6 mov [lane_data + _extra_blocks], DWORD(extra_blocks) mov p, [job + _src] add p, [job + _hash_start_src_offset_in_bytes] mov [state + _args_data_ptr + PTR_SZ*lane], p cmp len, 64 jb copy_lt64 fast_copy: add p, len vmovdqu xmm0, [p - 64 + 0*16] vmovdqu xmm1, [p - 64 + 1*16] vmovdqu xmm2, [p - 64 + 2*16] vmovdqu xmm3, [p - 64 + 3*16] vmovdqa [lane_data + _extra_block + 0*16], xmm0 vmovdqa [lane_data + _extra_block + 1*16], xmm1 vmovdqa [lane_data + _extra_block + 2*16], xmm2 vmovdqa [lane_data + _extra_block + 3*16], xmm3 end_fast_copy: mov size_offset, extra_blocks shl size_offset, 6 sub size_offset, last_len add size_offset, 64-8 mov [lane_data + _size_offset], DWORD(size_offset) mov start_offset, 64 sub start_offset, last_len mov [lane_data + _start_offset], DWORD(start_offset) lea tmp, [8*64 + 8*len] bswap tmp mov [lane_data + _extra_block + size_offset], tmp mov tmp, [job + _auth_key_xor_ipad] vmovdqu xmm0, [tmp] mov DWORD(tmp), [tmp + 4*4] vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 0*SHA1_DIGEST_ROW_SIZE], xmm0 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3 mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp) test len, ~63 jnz ge64_bytes lt64_bytes: mov [state + _lens + 2*lane], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr + PTR_SZ*lane], tmp mov dword 
[lane_data + _extra_blocks], 0 ge64_bytes: cmp unused_lanes, 0xff jne return_null jmp start_loop align 16 start_loop: ; Find min length vmovdqa xmm0, [state + _lens] vphminposuw xmm1, xmm0 vpextrw DWORD(len2), xmm1, 0 ; min value vpextrw DWORD(idx), xmm1, 1 ; min index (0...3) cmp len2, 0 je len_is_0 vpshuflw xmm1, xmm1, 0 vpsubw xmm0, xmm0, xmm1 vmovdqa [state + _lens], xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call sha1_mult_avx ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done], 1 mov DWORD(size_offset), [lane_data + _size_offset] mov qword [lane_data + _extra_block + size_offset], 0 mov word [state + _lens + 2*idx], 1 lea tmp, [lane_data + _outer_block] mov job, [lane_data + _job_in_lane] mov [state + _args_data_ptr + PTR_SZ*idx], tmp vmovd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1 vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2 vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3 vpshufb xmm0, xmm0, [rel byteswap] mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE] bswap DWORD(tmp) vmovdqa [lane_data + _outer_block], xmm0 mov [lane_data + _outer_block + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) mov tmp, [job + _auth_key_xor_opad] vmovdqu xmm0, [tmp] mov DWORD(tmp), [tmp + 4*4] vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3 mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp) jmp start_loop align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset] mov [state + _lens + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr + PTR_SZ*idx], tmp mov dword [lane_data + _extra_blocks], 0 jmp start_loop align 16 copy_lt64: ;; less than one message block of data ;; beginning of source block ;; destination extrablock but backwards by len from where 0x80 pre-populated lea p2, [lane_data + _extra_block + 64] sub p2, len memcpy_avx_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 mov unused_lanes, [state + _unused_lanes] jmp end_fast_copy return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane] mov unused_lanes, [state + _unused_lanes] mov qword [lane_data + _job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC shl unused_lanes, 8 or unused_lanes, idx mov [state + _unused_lanes], unused_lanes mov p, [job_rax + _auth_tag_output] ; copy 12 bytes mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE] mov DWORD(tmp3), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE] bswap 
DWORD(tmp) bswap DWORD(tmp2) bswap DWORD(tmp3) mov [p + 0*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) mov [p + 1*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2) mov [p + 2*SHA1_DIGEST_WORD_SIZE], DWORD(tmp3) return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*2] mov rdi, [rsp + _gpr_save + 8*3] %endif mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/avx/md5_x4x2_avx.asm000066400000000000000000000664101321406316400200140ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ;; code to compute octal MD5 using AVX ;; Stack must be aligned to 16 bytes before call ;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15 ;; Windows preserves: rcx rbp ;; ;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15 ;; Linux preserves: rdi rbp ;; ;; clobbers xmm0-15 %include "os.asm" %include "mb_mgr_datastruct.asm" extern MD5_TABLE section .data default rel align 64 ONES: dd 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff section .text %ifdef LINUX ;; Linux Registers %define arg1 rdi %define arg2 rsi %define mem1 rcx %define mem2 rdx %else %define arg1 rcx %define arg2 rdx %define mem1 rdi %define mem2 rsi %endif ;; rbp is not clobbered %define state arg1 %define num_blks arg2 %define inp0 r8 %define inp1 r9 %define inp2 r10 %define inp3 r11 %define inp4 r12 %define inp5 r13 %define inp6 r14 %define inp7 r15 %define TBL rax %define IDX rbx %define A xmm0 %define B xmm1 %define C xmm2 %define D xmm3 %define E xmm4 ; tmp %define F xmm5 ; tmp %define A2 xmm6 %define B2 xmm7 %define C2 xmm8 %define D2 xmm9 %define FUN E %define TMP F %define FUN2 xmm10 %define TMP2 xmm11 %define T0 xmm10 %define T1 xmm11 %define T2 xmm12 %define T3 xmm13 %define T4 xmm14 %define T5 xmm15 ; Stack Layout ; ; 470 DD2 ; 460 CC2 ; 450 BB2 ; 440 AA2 ; 430 DD ; 420 CC ; 410 BB ; 400 AA ; ; 3F0 data2[15] for lanes 7...4 \ ; ... \ ; 300 data2[0] for lanes 7...4 \ ; 2F0 data2[15] for lanes 3...0 > mem block 2 ; ... 
/ ; 210 data2[1] for lanes 3...0 / ; 200 data2[0] for lanes 3...0 / ; ; 1F0 data1[15] for lanes 7...4 \ ; ... \ ; 100 data1[0] for lanes 7...4 \ ; F0 data1[15] for lanes 3...0 > mem block 1 ; ... / ; 10 data1[1] for lanes 3...0 / ; 0 data1[0] for lanes 3...0 / ; stack size must be an odd multiple of 8 bytes in size struc STACK _DATA: reso 2*2*16 ; 2 blocks * 2 sets of lanes * 16 regs _DIGEST: reso 8 ; stores AA-DD, AA2-DD2 resb 8 ; for alignment endstruc %define STACK_SIZE STACK_size %define AA rsp + _DIGEST + 16*0 %define BB rsp + _DIGEST + 16*1 %define CC rsp + _DIGEST + 16*2 %define DD rsp + _DIGEST + 16*3 %define AA2 rsp + _DIGEST + 16*4 %define BB2 rsp + _DIGEST + 16*5 %define CC2 rsp + _DIGEST + 16*6 %define DD2 rsp + _DIGEST + 16*7 ;; ;; MD5 left rotations (number of bits) ;; rot11 equ 7 rot12 equ 12 rot13 equ 17 rot14 equ 22 rot21 equ 5 rot22 equ 9 rot23 equ 14 rot24 equ 20 rot31 equ 4 rot32 equ 11 rot33 equ 16 rot34 equ 23 rot41 equ 6 rot42 equ 10 rot43 equ 15 rot44 equ 21 ; transpose r0, r1, r2, r3, t0, t1 ; "transpose" data in {r0..r3} using temps {t0..t3} ; Input looks like: {r0 r1 r2 r3} ; r0 = {a3 a2 a1 a0} ; r1 = {b3 b2 b1 b0} ; r2 = {c3 c2 c1 c0} ; r3 = {d3 d2 d1 d0} ; ; output looks like: {t0 r1 r0 r3} ; t0 = {d0 c0 b0 a0} ; r1 = {d1 c1 b1 a1} ; r0 = {d2 c2 b2 a2} ; r3 = {d3 c3 b3 a3} ; %macro TRANSPOSE 6 %define %%r0 %1 %define %%r1 %2 %define %%r2 %3 %define %%r3 %4 %define %%t0 %5 %define %%t1 %6 vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2} vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} %endmacro ;; ;; Magic functions defined in RFC 1321 ;; ; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z)))) %macro MAGIC_F 4 %define %%F %1 %define %%X %2 %define %%Y %3 %define %%Z %4 vpxor %%F,%%Z, %%Y vpand %%F,%%F,%%X vpxor %%F,%%F,%%Z %endmacro ; macro MAGIC_G F,X,Y,Z ;; F = F((Z),(X),(Y)) %macro MAGIC_G 4 %define %%F %1 %define %%X %2 %define %%Y %3 %define %%Z %4 MAGIC_F %%F,%%Z,%%X,%%Y %endmacro ; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z)) %macro MAGIC_H 4 %define %%F %1 %define %%X %2 %define %%Y %3 %define %%Z %4 vpxor %%F,%%Z, %%Y vpxor %%F,%%F, %%X %endmacro ; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z))) %macro MAGIC_I 4 %define %%F %1 %define %%X %2 %define %%Y %3 %define %%Z %4 vpxor %%F,%%Z,[rel ONES] ; pnot %%F vpor %%F,%%F,%%X vpxor %%F,%%F,%%Y %endmacro ; PROLD reg, imm, tmp %macro PROLD 3 %define %%reg %1 %define %%imm %2 %define %%tmp %3 vpsrld %%tmp, %%reg, (32-%%imm) vpslld %%reg, %%reg, %%imm vpor %%reg, %%reg, %%tmp %endmacro ;; ;; single MD5 step ;; ;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot) ;; ; macro MD5_STEP1 MAGIC_FUN, A,B,C,D, A2,B2,C3,D2, FUN, TMP, data, MD5const, nrot %macro MD5_STEP1 14 %define %%MAGIC_FUN %1 %define %%A %2 %define %%B %3 %define %%C %4 %define %%D %5 %define %%A2 %6 %define %%B2 %7 %define %%C2 %8 %define %%D2 %9 %define %%FUN %10 %define %%TMP %11 %define %%data %12 %define %%MD5const %13 %define %%nrot %14 vpaddd %%A, %%A, %%MD5const vpaddd %%A2, %%A2, %%MD5const vpaddd %%A, %%A, [%%data] vpaddd %%A2, %%A2, [%%data + 16*16] %%MAGIC_FUN %%FUN, %%B,%%C,%%D vpaddd %%A, %%A, %%FUN %%MAGIC_FUN %%FUN, %%B2,%%C2,%%D2 vpaddd %%A2, %%A2, %%FUN PROLD %%A,%%nrot, %%TMP PROLD %%A2,%%nrot, %%TMP vpaddd 
%%A, %%A, %%B vpaddd %%A2, %%A2, %%B2 %endmacro ;; ;; single MD5 step ;; ;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot) ;; ; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C3,D2, FUN, TMP, FUN2, TMP2, data, ; MD5const, nrot %macro MD5_STEP 16 %define %%MAGIC_FUN %1 %define %%A %2 %define %%B %3 %define %%C %4 %define %%D %5 %define %%A2 %6 %define %%B2 %7 %define %%C2 %8 %define %%D2 %9 %define %%FUN %10 %define %%TMP %11 %define %%FUN2 %12 %define %%TMP2 %13 %define %%data %14 %define %%MD5const %15 %define %%nrot %16 vmovdqa %%TMP,[%%data] vmovdqa %%TMP2,[%%data + 16*16] vpaddd %%A, %%A, %%MD5const vpaddd %%A2, %%A2, %%MD5const vpaddd %%A, %%A, %%TMP vpaddd %%A2, %%A2, %%TMP2 %%MAGIC_FUN %%FUN, %%B,%%C,%%D %%MAGIC_FUN %%FUN2, %%B2,%%C2,%%D2 vpaddd %%A, %%A, %%FUN vpaddd %%A2, %%A2, %%FUN2 PROLD %%A,%%nrot, %%TMP PROLD %%A2,%%nrot, %%TMP2 vpaddd %%A, %%A, %%B vpaddd %%A2, %%A2, %%B2 %endmacro ; void md5_x4x2_avx(MD5_ARGS *args, UINT64 num_blks) ; arg 1 : pointer to MD5_ARGS structure ; arg 2 : number of blocks (>=1) ; align 32 MKGLOBAL(md5_x4x2_avx,function,internal) md5_x4x2_avx: sub rsp, STACK_SIZE ;; each row of transposed digests is split into 2 parts, the right half stored in A, and left half in A2 ;; Initialize digests vmovdqa A,[state + 0*16 + 0*MD5_DIGEST_ROW_SIZE] vmovdqa B,[state + 0*16 + 1*MD5_DIGEST_ROW_SIZE] vmovdqa C,[state + 0*16 + 2*MD5_DIGEST_ROW_SIZE] vmovdqa D,[state + 0*16 + 3*MD5_DIGEST_ROW_SIZE] vmovdqa A2,[state + 1*16 + 0*MD5_DIGEST_ROW_SIZE] vmovdqa B2,[state + 1*16 + 1*MD5_DIGEST_ROW_SIZE] vmovdqa C2,[state + 1*16 + 2*MD5_DIGEST_ROW_SIZE] vmovdqa D2,[state + 1*16 + 3*MD5_DIGEST_ROW_SIZE] lea TBL, [rel MD5_TABLE] ;; load input pointers mov inp0,[state+_data_ptr_md5 +0*PTR_SZ] mov inp1,[state+_data_ptr_md5 +1*PTR_SZ] mov inp2,[state+_data_ptr_md5 +2*PTR_SZ] mov inp3,[state+_data_ptr_md5 +3*PTR_SZ] mov inp4,[state+_data_ptr_md5 +4*PTR_SZ] mov inp5,[state+_data_ptr_md5 +5*PTR_SZ] mov inp6,[state+_data_ptr_md5 +6*PTR_SZ] mov inp7,[state+_data_ptr_md5 +7*PTR_SZ] xor IDX, IDX ; Make ping-pong pointers to the two memory blocks mov mem1, rsp lea mem2, [rsp + 16*16*2] ;; Load first block of data and save back to stack %assign I 0 %rep 4 vmovdqu T2,[inp0+IDX+I*16] vmovdqu T1,[inp1+IDX+I*16] vmovdqu T4,[inp2+IDX+I*16] vmovdqu T3,[inp3+IDX+I*16] TRANSPOSE T2, T1, T4, T3, T0, T5 vmovdqa [mem1+(I*4+0)*16],T0 vmovdqa [mem1+(I*4+1)*16],T1 vmovdqa [mem1+(I*4+2)*16],T2 vmovdqa [mem1+(I*4+3)*16],T3 vmovdqu T2,[inp4+IDX+I*16] vmovdqu T1,[inp5+IDX+I*16] vmovdqu T4,[inp6+IDX+I*16] vmovdqu T3,[inp7+IDX+I*16] TRANSPOSE T2, T1, T4, T3, T0, T5 vmovdqa [mem1+(I*4+0)*16 + 16*16],T0 vmovdqa [mem1+(I*4+1)*16 + 16*16],T1 vmovdqa [mem1+(I*4+2)*16 + 16*16],T2 vmovdqa [mem1+(I*4+3)*16 + 16*16],T3 %assign I (I+1) %endrep lloop: ; save old digests vmovdqa [AA], A vmovdqa [BB], B vmovdqa [CC], C vmovdqa [DD], D ; save old digests vmovdqa [AA2], A2 vmovdqa [BB2], B2 vmovdqa [CC2], C2 vmovdqa [DD2], D2 add IDX, 4*16 sub num_blks, 1 je lastblock MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+ 0*16], rot11 MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 1*16, [TBL+ 1*16], rot12 MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+ 2*16], rot13 MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 3*16, [TBL+ 3*16], rot14 MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+ 4*16], rot11 MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 5*16, [TBL+ 5*16], rot12 MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+ 6*16], 
rot13 MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 7*16, [TBL+ 7*16], rot14 %assign I 0 vmovdqu T2,[inp0+IDX+I*16] vmovdqu T1,[inp1+IDX+I*16] vmovdqu T4,[inp2+IDX+I*16] vmovdqu T3,[inp3+IDX+I*16] TRANSPOSE T2, T1, T4, T3, T0, T5 vmovdqa [mem2+(I*4+0)*16],T0 vmovdqa [mem2+(I*4+1)*16],T1 vmovdqa [mem2+(I*4+2)*16],T2 vmovdqa [mem2+(I*4+3)*16],T3 MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+ 8*16], rot11 MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 9*16, [TBL+ 9*16], rot12 MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+10*16], rot13 MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +11*16, [TBL+11*16], rot14 MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+12*16], rot11 MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +13*16, [TBL+13*16], rot12 MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+14*16], rot13 MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +15*16, [TBL+15*16], rot14 vmovdqu T2,[inp4+IDX+I*16] vmovdqu T1,[inp5+IDX+I*16] vmovdqu T4,[inp6+IDX+I*16] vmovdqu T3,[inp7+IDX+I*16] TRANSPOSE T2, T1, T4, T3, T0, T5 vmovdqa [mem2+(I*4+0)*16 + 16*16],T0 vmovdqa [mem2+(I*4+1)*16 + 16*16],T1 vmovdqa [mem2+(I*4+2)*16 + 16*16],T2 vmovdqa [mem2+(I*4+3)*16 + 16*16],T3 %assign I (I+1) MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+16*16], rot21 MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 6*16, [TBL+17*16], rot22 MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+18*16], rot23 MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 0*16, [TBL+19*16], rot24 MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+20*16], rot21 MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +10*16, [TBL+21*16], rot22 MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+22*16], rot23 MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 4*16, [TBL+23*16], rot24 vmovdqu T2,[inp0+IDX+I*16] vmovdqu T1,[inp1+IDX+I*16] vmovdqu T4,[inp2+IDX+I*16] vmovdqu T3,[inp3+IDX+I*16] TRANSPOSE T2, T1, T4, T3, T0, T5 vmovdqa [mem2+(I*4+0)*16],T0 vmovdqa [mem2+(I*4+1)*16],T1 vmovdqa [mem2+(I*4+2)*16],T2 vmovdqa [mem2+(I*4+3)*16],T3 MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+24*16], rot21 MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +14*16, [TBL+25*16], rot22 MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+26*16], rot23 MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 8*16, [TBL+27*16], rot24 MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+28*16], rot21 MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 2*16, [TBL+29*16], rot22 MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+30*16], rot23 MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +12*16, [TBL+31*16], rot24 vmovdqu T2,[inp4+IDX+I*16] vmovdqu T1,[inp5+IDX+I*16] vmovdqu T4,[inp6+IDX+I*16] vmovdqu T3,[inp7+IDX+I*16] TRANSPOSE T2, T1, T4, T3, T0, T5 vmovdqa [mem2+(I*4+0)*16 + 16*16],T0 vmovdqa [mem2+(I*4+1)*16 + 16*16],T1 vmovdqa [mem2+(I*4+2)*16 + 16*16],T2 vmovdqa [mem2+(I*4+3)*16 + 16*16],T3 %assign I (I+1) MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+32*16], rot31 MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 8*16, [TBL+33*16], rot32 MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+34*16], rot33 MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +14*16, [TBL+35*16], rot34 MD5_STEP1 MAGIC_H, A,B,C,D, 
A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+36*16], rot31 MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 4*16, [TBL+37*16], rot32 MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+38*16], rot33 MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +10*16, [TBL+39*16], rot34 vmovdqu T2,[inp0+IDX+I*16] vmovdqu T1,[inp1+IDX+I*16] vmovdqu T4,[inp2+IDX+I*16] vmovdqu T3,[inp3+IDX+I*16] TRANSPOSE T2, T1, T4, T3, T0, T5 vmovdqa [mem2+(I*4+0)*16],T0 vmovdqa [mem2+(I*4+1)*16],T1 vmovdqa [mem2+(I*4+2)*16],T2 vmovdqa [mem2+(I*4+3)*16],T3 MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+40*16], rot31 MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 0*16, [TBL+41*16], rot32 MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+42*16], rot33 MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 6*16, [TBL+43*16], rot34 MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+44*16], rot31 MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +12*16, [TBL+45*16], rot32 MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+46*16], rot33 MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 2*16, [TBL+47*16], rot34 vmovdqu T2,[inp4+IDX+I*16] vmovdqu T1,[inp5+IDX+I*16] vmovdqu T4,[inp6+IDX+I*16] vmovdqu T3,[inp7+IDX+I*16] TRANSPOSE T2, T1, T4, T3, T0, T5 vmovdqa [mem2+(I*4+0)*16 + 16*16],T0 vmovdqa [mem2+(I*4+1)*16 + 16*16],T1 vmovdqa [mem2+(I*4+2)*16 + 16*16],T2 vmovdqa [mem2+(I*4+3)*16 + 16*16],T3 %assign I (I+1) MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+48*16], rot41 MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 7*16, [TBL+49*16], rot42 MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+50*16], rot43 MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 5*16, [TBL+51*16], rot44 MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+52*16], rot41 MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 3*16, [TBL+53*16], rot42 MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+54*16], rot43 MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 1*16, [TBL+55*16], rot44 vmovdqu T2,[inp0+IDX+I*16] vmovdqu T1,[inp1+IDX+I*16] vmovdqu T4,[inp2+IDX+I*16] vmovdqu T3,[inp3+IDX+I*16] TRANSPOSE T2, T1, T4, T3, T0, T5 vmovdqa [mem2+(I*4+0)*16],T0 vmovdqa [mem2+(I*4+1)*16],T1 vmovdqa [mem2+(I*4+2)*16],T2 vmovdqa [mem2+(I*4+3)*16],T3 MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+56*16], rot41 MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +15*16, [TBL+57*16], rot42 MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+58*16], rot43 MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +13*16, [TBL+59*16], rot44 MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+60*16], rot41 MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +11*16, [TBL+61*16], rot42 MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+62*16], rot43 MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 9*16, [TBL+63*16], rot44 vmovdqu T2,[inp4+IDX+I*16] vmovdqu T1,[inp5+IDX+I*16] vmovdqu T4,[inp6+IDX+I*16] vmovdqu T3,[inp7+IDX+I*16] TRANSPOSE T2, T1, T4, T3, T0, T5 vmovdqa [mem2+(I*4+0)*16 + 16*16],T0 vmovdqa [mem2+(I*4+1)*16 + 16*16],T1 vmovdqa [mem2+(I*4+2)*16 + 16*16],T2 vmovdqa [mem2+(I*4+3)*16 + 16*16],T3 %assign I (I+1) vpaddd A,A,[AA] vpaddd B,B,[BB] vpaddd C,C,[CC] vpaddd D,D,[DD] vpaddd A2,A2,[AA2] vpaddd B2,B2,[BB2] vpaddd C2,C2,[CC2] vpaddd D2,D2,[DD2] ; swap mem1 and mem2 
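	;; Editorial note (not part of the original source): mem1 and mem2 form a
	;; ping-pong pair on the stack.  The rounds above consumed the block held
	;; in mem1 while the interleaved TRANSPOSE/vmovdqa sequences stored the
	;; next 64-byte block of every lane into mem2; exchanging the two pointers
	;; makes the freshly loaded data the active block for the next iteration:
	;;     xchg mem1, mem2    ; mem1 <- next block, mem2 <- buffer to refill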
xchg mem1, mem2 jmp lloop lastblock: MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+ 0*16], rot11 MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+ 1*16], rot12 MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+ 2*16], rot13 MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+ 3*16], rot14 MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+ 4*16], rot11 MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+ 5*16], rot12 MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+ 6*16], rot13 MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+ 7*16], rot14 MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+ 8*16], rot11 MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+ 9*16], rot12 MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+10*16], rot13 MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+11*16], rot14 MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+12*16], rot11 MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+13*16], rot12 MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+14*16], rot13 MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+15*16], rot14 MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+16*16], rot21 MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+17*16], rot22 MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+18*16], rot23 MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+19*16], rot24 MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+20*16], rot21 MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+21*16], rot22 MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+22*16], rot23 MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+23*16], rot24 MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+24*16], rot21 MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+25*16], rot22 MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+26*16], rot23 MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+27*16], rot24 MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+28*16], rot21 MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+29*16], rot22 MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+30*16], rot23 MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+31*16], rot24 MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+32*16], rot31 MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+33*16], rot32 MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+34*16], rot33 MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+35*16], rot34 MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+36*16], rot31 MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+37*16], rot32 MD5_STEP MAGIC_H, 
C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+38*16], rot33 MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+39*16], rot34 MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+40*16], rot31 MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+41*16], rot32 MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+42*16], rot33 MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+43*16], rot34 MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+44*16], rot31 MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+45*16], rot32 MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+46*16], rot33 MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+47*16], rot34 MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+48*16], rot41 MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+49*16], rot42 MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+50*16], rot43 MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+51*16], rot44 MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+52*16], rot41 MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+53*16], rot42 MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+54*16], rot43 MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+55*16], rot44 MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+56*16], rot41 MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+57*16], rot42 MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+58*16], rot43 MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+59*16], rot44 MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+60*16], rot41 MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+61*16], rot42 MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+62*16], rot43 MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+63*16], rot44 vpaddd A,A,[AA] vpaddd B,B,[BB] vpaddd C,C,[CC] vpaddd D,D,[DD] vpaddd A2,A2,[AA2] vpaddd B2,B2,[BB2] vpaddd C2,C2,[CC2] vpaddd D2,D2,[DD2] ; write out digests vmovdqu [state + 0*16 + 0*MD5_DIGEST_ROW_SIZE ], A vmovdqu [state + 0*16 + 1*MD5_DIGEST_ROW_SIZE ], B vmovdqu [state + 0*16 + 2*MD5_DIGEST_ROW_SIZE ], C vmovdqu [state + 0*16 + 3*MD5_DIGEST_ROW_SIZE ], D vmovdqu [state + 1*16 + 0*MD5_DIGEST_ROW_SIZE], A2 vmovdqu [state + 1*16 + 1*MD5_DIGEST_ROW_SIZE], B2 vmovdqu [state + 1*16 + 2*MD5_DIGEST_ROW_SIZE], C2 vmovdqu [state + 1*16 + 3*MD5_DIGEST_ROW_SIZE], D2 ;; update input pointers add inp0, IDX add inp1, IDX add inp2, IDX add inp3, IDX add inp4, IDX add inp5, IDX add inp6, IDX add inp7, IDX mov [state +_data_ptr_md5 + 0*PTR_SZ], inp0 mov [state +_data_ptr_md5 + 1*PTR_SZ], inp1 mov [state +_data_ptr_md5 + 2*PTR_SZ], inp2 mov [state +_data_ptr_md5 + 3*PTR_SZ], inp3 mov [state +_data_ptr_md5 + 4*PTR_SZ], inp4 mov [state +_data_ptr_md5 + 5*PTR_SZ], inp5 mov [state +_data_ptr_md5 + 6*PTR_SZ], inp6 mov [state +_data_ptr_md5 + 7*PTR_SZ], inp7 ;;;;;;;;;;;;;;;; ;; Postamble add rsp, STACK_SIZE ret 
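;; Editorial usage sketch (illustrative only, not in the original source):
;; callers are expected to have laid out eight equal-length lanes before
;; invoking md5_x4x2_avx, roughly:
;;     ; state->args digest rows hold A/B/C/D for lanes 0-3 and 4-7
;;     ; state->_data_ptr_md5[0..7] point at the eight lane buffers
;;     mov     arg2, num_64byte_blocks   ; hypothetical count, >= 1, same for
;;                                       ; every lane
;;     call    md5_x4x2_avx
;; On return the digests are updated in place and every data pointer has been
;; advanced by the number of bytes consumed.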
intel-ipsec-mb-0.48/avx/sha1_mult_avx.asm000066400000000000000000000242101321406316400203270ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "mb_mgr_datastruct.asm" section .data default rel align 16 PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 dq 0x0405060700010203, 0x0c0d0e0f08090a0b K00_19: ;ddq 0x5A8279995A8279995A8279995A827999 dq 0x5A8279995A827999, 0x5A8279995A827999 K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 section .text ;; code to compute quad SHA1 using AVX ;; derived from ...\sha1_multiple\sha1_quad4.asm ;; variation of sha1_mult2.asm : clobbers all xmm regs, rcx left intact ;; rbx, rsi, rdi, rbp, r12-r15 left intact ;; This version is not safe to call from C/C++ ;; Stack must be aligned to 16 bytes before call ;; Windows clobbers: rax rdx r8 r9 r10 r11 ;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15 ;; ;; Linux clobbers: rax rsi r8 r9 r10 r11 ;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15 ;; ;; clobbers xmm0-15 ; transpose r0, r1, r2, r3, t0, t1 ; "transpose" data in {r0..r3} using temps {t0..t3} ; Input looks like: {r0 r1 r2 r3} ; r0 = {a3 a2 a1 a0} ; r1 = {b3 b2 b1 b0} ; r2 = {c3 c2 c1 c0} ; r3 = {d3 d2 d1 d0} ; ; output looks like: {t0 r1 r0 r3} ; t0 = {d0 c0 b0 a0} ; r1 = {d1 c1 b1 a1} ; r0 = {d2 c2 b2 a2} ; r3 = {d3 c3 b3 a3} ; %macro TRANSPOSE 6 %define %%r0 %1 %define %%r1 %2 %define %%r2 %3 %define %%r3 %4 %define %%t0 %5 %define %%t1 %6 vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 
a2} vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} %endmacro ;; ;; Magic functions defined in FIPS 180-1 ;; ; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D))) %macro MAGIC_F0 5 %define %%regF %1 %define %%regB %2 %define %%regC %3 %define %%regD %4 %define %%regT %5 vpxor %%regF, %%regC,%%regD vpand %%regF, %%regF,%%regB vpxor %%regF, %%regF,%%regD %endmacro ; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D) %macro MAGIC_F1 5 %define %%regF %1 %define %%regB %2 %define %%regC %3 %define %%regD %4 %define %%regT %5 vpxor %%regF,%%regD,%%regC vpxor %%regF,%%regF,%%regB %endmacro ; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D)) %macro MAGIC_F2 5 %define %%regF %1 %define %%regB %2 %define %%regC %3 %define %%regD %4 %define %%regT %5 vpor %%regF,%%regB,%%regC vpand %%regT,%%regB,%%regC vpand %%regF,%%regF,%%regD vpor %%regF,%%regF,%%regT %endmacro ; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D) %macro MAGIC_F3 5 %define %%regF %1 %define %%regB %2 %define %%regC %3 %define %%regD %4 %define %%regT %5 MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT %endmacro ; PROLD reg, imm, tmp %macro PROLD 3 %define %%reg %1 %define %%imm %2 %define %%tmp %3 vpsrld %%tmp, %%reg, (32-(%%imm)) vpslld %%reg, %%reg, %%imm vpor %%reg, %%reg, %%tmp %endmacro ; non-destructive ; PROLD_nd reg, imm, tmp, src %macro PROLD_nd 4 %define %%reg %1 %define %%imm %2 %define %%tmp %3 %define %%src %4 vpsrld %%tmp, %%src, (32-(%%imm)) vpslld %%reg, %%src, %%imm vpor %%reg, %%reg, %%tmp %endmacro %macro SHA1_STEP_00_15 10 %define %%regA %1 %define %%regB %2 %define %%regC %3 %define %%regD %4 %define %%regE %5 %define %%regT %6 %define %%regF %7 %define %%memW %8 %define %%immCNT %9 %define %%MAGIC %10 vpaddd %%regE, %%regE,%%immCNT vpaddd %%regE, %%regE,[rsp + (%%memW * 16)] PROLD_nd %%regT,5, %%regF,%%regA vpaddd %%regE, %%regE,%%regT %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) PROLD %%regB,30, %%regT vpaddd %%regE, %%regE,%%regF %endmacro %macro SHA1_STEP_16_79 10 %define %%regA %1 %define %%regB %2 %define %%regC %3 %define %%regD %4 %define %%regE %5 %define %%regT %6 %define %%regF %7 %define %%memW %8 %define %%immCNT %9 %define %%MAGIC %10 vpaddd %%regE, %%regE,%%immCNT vmovdqa W14, [rsp + ((%%memW - 14) & 15) * 16] vpxor W16, W16, W14 vpxor W16, W16, [rsp + ((%%memW - 8) & 15) * 16] vpxor W16, W16, [rsp + ((%%memW - 3) & 15) * 16] vpsrld %%regF, W16, (32-1) vpslld W16, W16, 1 vpor %%regF, %%regF, W16 ROTATE_W vmovdqa [rsp + ((%%memW - 0) & 15) * 16],%%regF vpaddd %%regE, %%regE,%%regF PROLD_nd %%regT,5, %%regF, %%regA vpaddd %%regE, %%regE,%%regT %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) PROLD %%regB,30, %%regT vpaddd %%regE,%%regE,%%regF %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; FRAMESZ must be an odd multiple of 8 %define FRAMESZ 16*16 + 8 %define VMOVPS vmovdqu %ifdef LINUX %define arg1 rdi %define arg2 rsi %else %define arg1 rcx %define arg2 rdx %endif %define inp0 r8 %define inp1 r9 %define inp2 r10 %define inp3 r11 %define IDX rax %define A xmm0 %define B xmm1 %define C xmm2 %define D xmm3 %define E xmm4 %define F xmm5 ; tmp %define G xmm6 ; tmp %define TMP G %define FUN F %define K xmm7 %define AA xmm8 %define BB xmm9 %define CC xmm10 %define DD xmm11 %define EE xmm12 %define T0 xmm6 %define T1 xmm7 %define T2 xmm8 %define T3 xmm9 %define T4 xmm10 %define T5 
xmm11 %define W14 xmm13 %define W15 xmm14 %define W16 xmm15 %macro ROTATE_ARGS 0 %xdefine TMP_ E %xdefine E D %xdefine D C %xdefine C B %xdefine B A %xdefine A TMP_ %endm %macro ROTATE_W 0 %xdefine TMP_ W16 %xdefine W16 W15 %xdefine W15 W14 %xdefine W14 TMP_ %endm align 32 ; XMM registers are clobbered. Saving/restoring must be done at a higher level ; void sha1_mult_avx(SHA1_ARGS *args, UINT32 size_in_blocks); ; arg 1 : rcx : pointer to args ; arg 2 : rdx : size (in blocks) ;; assumed to be >= 1 MKGLOBAL(sha1_mult_avx,function,internal) sha1_mult_avx: sub rsp, FRAMESZ ;; Initialize digests vmovdqa A, [arg1 + 0*SHA1_DIGEST_ROW_SIZE] vmovdqa B, [arg1 + 1*SHA1_DIGEST_ROW_SIZE] vmovdqa C, [arg1 + 2*SHA1_DIGEST_ROW_SIZE] vmovdqa D, [arg1 + 3*SHA1_DIGEST_ROW_SIZE] vmovdqa E, [arg1 + 4*SHA1_DIGEST_ROW_SIZE] ;; transpose input onto stack mov inp0,[arg1 + _data_ptr_sha1 + 0*PTR_SZ] mov inp1,[arg1 + _data_ptr_sha1 + 1*PTR_SZ] mov inp2,[arg1 + _data_ptr_sha1 + 2*PTR_SZ] mov inp3,[arg1 + _data_ptr_sha1 + 3*PTR_SZ] xor IDX, IDX lloop: vmovdqa F, [rel PSHUFFLE_BYTE_FLIP_MASK] %assign I 0 %rep 4 VMOVPS T2,[inp0+IDX] VMOVPS T1,[inp1+IDX] VMOVPS T4,[inp2+IDX] VMOVPS T3,[inp3+IDX] TRANSPOSE T2, T1, T4, T3, T0, T5 vpshufb T0, T0, F vmovdqa [rsp+(I*4+0)*16],T0 vpshufb T1, T1, F vmovdqa [rsp+(I*4+1)*16],T1 vpshufb T2, T2, F vmovdqa [rsp+(I*4+2)*16],T2 vpshufb T3, T3, F vmovdqa [rsp+(I*4+3)*16],T3 add IDX, 4*4 %assign I (I+1) %endrep ; save old digests vmovdqa AA, A vmovdqa BB, B vmovdqa CC, C vmovdqa DD, D vmovdqa EE, E ;; ;; perform 0-79 steps ;; vmovdqa K, [rel K00_19] ;; do rounds 0...15 %assign I 0 %rep 16 SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 ROTATE_ARGS %assign I (I+1) %endrep ;; do rounds 16...19 vmovdqa W16, [rsp + ((16 - 16) & 15) * 16] vmovdqa W15, [rsp + ((16 - 15) & 15) * 16] %rep 4 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 ROTATE_ARGS %assign I (I+1) %endrep ;; do rounds 20...39 vmovdqa K, [rel K20_39] %rep 20 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1 ROTATE_ARGS %assign I (I+1) %endrep ;; do rounds 40...59 vmovdqa K, [rel K40_59] %rep 20 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2 ROTATE_ARGS %assign I (I+1) %endrep ;; do rounds 60...79 vmovdqa K, [rel K60_79] %rep 20 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3 ROTATE_ARGS %assign I (I+1) %endrep vpaddd A,A,AA vpaddd B,B,BB vpaddd C,C,CC vpaddd D,D,DD vpaddd E,E,EE sub arg2, 1 jne lloop ; write out digests vmovdqa [arg1 + 0*SHA1_DIGEST_ROW_SIZE], A vmovdqa [arg1 + 1*SHA1_DIGEST_ROW_SIZE], B vmovdqa [arg1 + 2*SHA1_DIGEST_ROW_SIZE], C vmovdqa [arg1 + 3*SHA1_DIGEST_ROW_SIZE], D vmovdqa [arg1 + 4*SHA1_DIGEST_ROW_SIZE], E ; update input pointers add inp0, IDX mov [arg1 + _data_ptr_sha1 + 0*PTR_SZ], inp0 add inp1, IDX mov [arg1 + _data_ptr_sha1 + 1*PTR_SZ], inp1 add inp2, IDX mov [arg1 + _data_ptr_sha1 + 2*PTR_SZ], inp2 add inp3, IDX mov [arg1 + _data_ptr_sha1 + 3*PTR_SZ], inp3 ;;;;;;;;;;;;;;;; ;; Postamble add rsp, FRAMESZ ret intel-ipsec-mb-0.48/avx/sha1_one_block_avx.asm000066400000000000000000000234471321406316400213140ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. 
;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ; SHA1 code, hybrid, rolled, interleaved ; Uses AVX instructions %include "os.asm" section .data default rel align 16 PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 dq 0x0405060700010203, 0x0c0d0e0f08090a0b K00_19: ;ddq 0x5A8279995A8279995A8279995A827999 dq 0x5A8279995A827999, 0x5A8279995A827999 K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 section .text %define VMOVDQ vmovdqu ;; assume buffers not aligned %ifdef LINUX %define INP rdi ; 1st arg %define CTX rsi ; 2nd arg %define REG3 ecx %define REG4 edx %else %define INP rcx ; 1st arg %define CTX rdx ; 2nd arg %define REG3 edi %define REG4 esi %endif %define FRAMESZ 3*16 + 1*8 %define _RSP FRAMESZ-1*8 + rsp %define a eax %define b ebx %define c REG3 %define d REG4 %define e r8d %define T1 r9d %define f r10d %define RND r11d %define g r12d %define h r13d %define XTMP0 xmm0 %define XTMP1 xmm1 %define XK xmm2 %xdefine X0 xmm3 %xdefine X1 xmm4 %xdefine X2 xmm5 %xdefine X3 xmm6 %xdefine X4 xmm7 %define XFER xmm8 %define SZ 4 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros %macro rotate_Xs 0 %xdefine X_ X0 %xdefine X0 X1 %xdefine X1 X2 %xdefine X2 X3 %xdefine X3 X4 %xdefine X4 X_ %endmacro %macro ROTATE_ARGS 0 %xdefine TMP_ h %xdefine h g %xdefine g f %xdefine f e %xdefine e d %xdefine d c %xdefine c b %xdefine b a %xdefine a TMP_ %endm ;; Magic functions defined in FIPS 180-1 ;; ; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D))) %macro MAGIC_F0 5 %define %%regF %1 %define %%regB %2 %define %%regC %3 %define %%regD %4 %define %%regT %5 mov %%regF,%%regC xor %%regF,%%regD and %%regF,%%regB xor %%regF,%%regD %endmacro ; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D) %macro MAGIC_F1 5 %define %%regF %1 %define %%regB %2 %define %%regC %3 %define %%regD %4 %define %%regT %5 mov %%regF,%%regD xor %%regF,%%regC xor %%regF,%%regB %endmacro ; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D)) %macro MAGIC_F2 5 %define %%regF %1 %define %%regB %2 %define %%regC %3 %define %%regD %4 %define %%regT %5 mov %%regF,%%regB mov %%regT,%%regB or %%regF,%%regC and %%regT,%%regC and %%regF,%%regD or %%regF,%%regT %endmacro ; macro MAGIC_F3 
F,B,C,D,T ;; F = (B ^ C ^ D) %macro MAGIC_F3 5 %define %%regF %1 %define %%regB %2 %define %%regC %3 %define %%regD %4 %define %%regT %5 MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT %endmacro ;; input is T1 %macro ROUND 1 %define %%MAGIC %1 add e,T1 mov T1,a rol T1,5 add e,T1 %%MAGIC h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) rol b,30 add h,e ROTATE_ARGS %endmacro %macro do_4i 1 vpaddd XFER, XK, X0 vpextrd T1, XFER, 0 ;ROUND %1 add e,T1 ;SCHEDULE_4 vpalignr XTMP0, X1, X0, 8 ; XTMP0 = W[-14] mov T1,a rol T1,5 vpxor XTMP1, X2, X0 ; XTMP1 = W[-8] ^ W[-16] add e,T1 vpxor XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-8] ^ W[-14] ^ W[-16] %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) ;; Finish low half rol b,30 vpsrldq X4, X3, 4 ; X4 = W[-3] {xxBA} add h,e ROTATE_ARGS vpextrd T1, XFER, 1 ;ROUND %1 add e,T1 vpxor X4, X4, XTMP0 mov T1,a rol T1,5 ;; rotate X4 left 1 vpsrld XTMP1, X4, (32-1) add e,T1 vpslld X4, X4, 1 %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) vpxor X4, X4, XTMP1 ; X4 = W[0] {xxBA} rol b,30 add h,e ROTATE_ARGS vpextrd T1, XFER, 2 ;ROUND %1 add e,T1 mov T1,a ;; Finish high half vpalignr XTMP1, X4, X3, 4 ; XTMP1 = w[-3] {DCxx} rol T1,5 add e,T1 vpxor XTMP0, XTMP0, XTMP1 %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) ;; rotate XTMP0 left 1 vpsrld XTMP1, XTMP0, (32-1) rol b,30 add h,e ROTATE_ARGS vpextrd T1, XFER, 3 ;ROUND %1 add e,T1 mov T1,a vpslld XTMP0, XTMP0, 1 rol T1,5 add e,T1 vpxor XTMP0, XTMP0, XTMP1 ; XTMP0 = W[0] {DCxx} %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) ;; COMBINE HALVES vshufps X4, X4, XTMP0, 11100100b ; X4 = X[0] {DCBA} rol b,30 add h,e rotate_Xs ROTATE_ARGS %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; void sha1_one_block_avx(void *input_data, UINT32 digest[8] ;; arg 1 : rcx : pointer to input data ;; arg 2 : rdx : pointer to digest MKGLOBAL(sha1_one_block_avx,function,) align 32 sha1_one_block_avx: push rbx push rsi push rdi push r12 push r13 ;; byte swap first 16 dwords vmovdqa XTMP0, [rel PSHUFFLE_BYTE_FLIP_MASK] mov rax,rsp ; copy rsp VMOVDQ X0, [INP + 0*16] sub rsp,FRAMESZ VMOVDQ X1, [INP + 1*16] and rsp,-64 ; align stack frame vmovdqa [rsp + 0 * 16], xmm6 vmovdqa [rsp + 1 * 16], xmm7 vmovdqa [rsp + 2 * 16], xmm8 VMOVDQ X2, [INP + 2*16] mov [_RSP],rax ; save copy of rsp VMOVDQ X3, [INP + 3*16] ;; load initial digest mov a,0x67452301 vpshufb X0, XTMP0 mov b,0xefcdab89 vpshufb X1, XTMP0 mov c,0x98badcfe vpshufb X2, XTMP0 mov d,0x10325476 vpshufb X3, XTMP0 mov e,0xc3d2e1f0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; do rounds 00-19 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; vmovdqa XK, [rel K00_19] mov RND, 3 ROTATE_ARGS ROTATE_ARGS ROTATE_ARGS ROTATE_ARGS rotate_Xs rotate_Xs rotate_Xs rotate_Xs jmp loop1_5 align 16 loop1: do_4i MAGIC_F0 loop1_5: do_4i MAGIC_F0 rotate_Xs rotate_Xs rotate_Xs rotate_Xs vmovdqa X0, X2 vmovdqa X2, X4 vmovdqa X4, X1 vmovdqa X1, X3 sub RND, 1 jne loop1 rotate_Xs rotate_Xs rotate_Xs rotate_Xs ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; end rounds 00-19 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; do rounds 20-39 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; vmovdqa XK, [rel K20_39] mov RND, 3 ROTATE_ARGS ROTATE_ARGS ROTATE_ARGS ROTATE_ARGS rotate_Xs rotate_Xs rotate_Xs rotate_Xs jmp loop2_5 align 16 loop2: do_4i MAGIC_F1 loop2_5: do_4i MAGIC_F1 rotate_Xs rotate_Xs rotate_Xs rotate_Xs vmovdqa X0, 
X2 vmovdqa X2, X4 vmovdqa X4, X1 vmovdqa X1, X3 sub RND, 1 jne loop2 rotate_Xs rotate_Xs rotate_Xs rotate_Xs ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; end rounds 20-39 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; do rounds 40-59 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; vmovdqa XK, [rel K40_59] mov RND, 3 ROTATE_ARGS ROTATE_ARGS ROTATE_ARGS ROTATE_ARGS rotate_Xs rotate_Xs rotate_Xs rotate_Xs jmp loop3_5 align 16 loop3: do_4i MAGIC_F2 loop3_5: do_4i MAGIC_F2 rotate_Xs rotate_Xs rotate_Xs rotate_Xs vmovdqa X0, X2 vmovdqa X2, X4 vmovdqa X4, X1 vmovdqa X1, X3 sub RND, 1 jne loop3 rotate_Xs rotate_Xs rotate_Xs rotate_Xs ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; end rounds 40-59 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; do rounds 60-79 vmovdqa XK, [rel K60_79] do_4i MAGIC_F3 vpaddd XFER, XK, X0 vpextrd T1, XFER, 0 ROUND MAGIC_F3 vpextrd T1, XFER, 1 ROUND MAGIC_F3 vpextrd T1, XFER, 2 ROUND MAGIC_F3 vpextrd T1, XFER, 3 ROUND MAGIC_F3 vpaddd XFER, XK, X1 vpextrd T1, XFER, 0 ROUND MAGIC_F3 vpextrd T1, XFER, 1 ROUND MAGIC_F3 vpextrd T1, XFER, 2 ROUND MAGIC_F3 vpextrd T1, XFER, 3 ROUND MAGIC_F3 vpaddd XFER, XK, X2 vpextrd T1, XFER, 0 ROUND MAGIC_F3 vpextrd T1, XFER, 1 ROUND MAGIC_F3 vpextrd T1, XFER, 2 ROUND MAGIC_F3 vpextrd T1, XFER, 3 ROUND MAGIC_F3 vpaddd XFER, XK, X3 vpextrd T1, XFER, 0 ROUND MAGIC_F3 vpextrd T1, XFER, 1 ROUND MAGIC_F3 vpextrd T1, XFER, 2 ROUND MAGIC_F3 vpextrd T1, XFER, 3 ROUND MAGIC_F3 add a,0x67452301 mov [SZ*0 + CTX], a add b,0xefcdab89 mov [SZ*1 + CTX], b add c,0x98badcfe mov [SZ*2 + CTX], c add d,0x10325476 mov [SZ*3 + CTX], d add e,0xc3d2e1f0 mov [SZ*4 + CTX], e vmovdqa xmm8, [rsp + 2 * 16] vmovdqa xmm7, [rsp + 1 * 16] vmovdqa xmm6, [rsp + 0 * 16] mov rsp,[_RSP] pop r13 pop r12 pop rdi pop rsi pop rbx ret intel-ipsec-mb-0.48/avx/sha224_one_block_avx.asm000066400000000000000000000036571321406316400214640ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ; This code schedules 1 blocks at a time, with 4 lanes per block ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %define H0 0xc1059ed8 %define H1 0x367cd507 %define H2 0x3070dd17 %define H3 0xf70e5939 %define H4 0xffc00b31 %define H5 0x68581511 %define H6 0x64f98fa7 %define H7 0xbefa4fa4 %define FUNC sha224_one_block_avx %include "sha256_one_block_avx.asm" intel-ipsec-mb-0.48/avx/sha256_one_block_avx.asm000066400000000000000000000344111321406316400214610ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
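;;
;; Note: this file doubles as the SHA-224 implementation.
;; sha224_one_block_avx.asm pre-defines H0..H7 and FUNC and then
;; %include's this file; the %ifndef H0 block below supplies the
;; SHA-256 initial digest values and function name otherwise.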
;; ; This code schedules 1 blocks at a time, with 4 lanes per block ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %include "os.asm" section .data default rel align 64 K256: dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 dq 0x0405060700010203, 0x0c0d0e0f08090a0b ; shuffle xBxA -> 00BA _SHUF_00BA: ;ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100 dq 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF ; shuffle xDxC -> DC00 _SHUF_DC00: ;ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF dq 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 section .text %define VMOVDQ vmovdqu ;; assume buffers not aligned ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros %macro MY_ROR 2 shld %1,%1,(32-(%2)) %endm ; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask ; Load xmm with mem and byte swap each dword %macro COPY_XMM_AND_BSWAP 3 VMOVDQ %1, %2 vpshufb %1, %1, %3 %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %define X0 xmm4 %define X1 xmm5 %define X2 xmm6 %define X3 xmm7 %define XTMP0 xmm0 %define XTMP1 xmm1 %define XTMP2 xmm2 %define XTMP3 xmm3 %define XTMP4 xmm8 %define XFER xmm9 %define XTMP5 xmm11 %define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA %define SHUF_DC00 xmm12 ; shuffle xDxC -> DC00 %define BYTE_FLIP_MASK xmm13 %ifdef LINUX %define CTX rsi ; 2nd arg %define INP rdi ; 1st arg %define SRND rdi ; clobbers INP %define c ecx %define d r8d %define e edx %else %define CTX rdx ; 2nd arg %define INP rcx ; 1st arg %define SRND rcx ; clobbers INP %define c edi %define d esi %define e r8d %endif %define TBL rbp %define a eax %define b ebx %define f r9d %define g r10d %define h r11d %define y0 r13d %define y1 r14d %define y2 r15d struc STACK %ifndef LINUX _XMM_SAVE: reso 7 %endif _XFER: reso 1 endstruc %ifndef H0 %define H0 0x6a09e667 %define H1 0xbb67ae85 %define H2 0x3c6ef372 %define H3 0xa54ff53a %define H4 0x510e527f %define H5 0x9b05688c %define H6 0x1f83d9ab %define H7 0x5be0cd19 %define FUNC sha256_one_block_avx %endif ; rotate_Xs ; Rotate values of symbols X0...X3 %macro rotate_Xs 0 %xdefine X_ X0 %xdefine X0 X1 %xdefine X1 X2 %xdefine X2 X3 %xdefine X3 X_ %endm ; ROTATE_ARGS ; Rotate values of symbols a...h %macro ROTATE_ARGS 0 %xdefine TMP_ h %xdefine h g %xdefine g f %xdefine f e %xdefine e d %xdefine d c %xdefine c b %xdefine b a %xdefine a TMP_ %endm %macro FOUR_ROUNDS_AND_SCHED 0 ;; compute s0 four at a time and s1 two at a time ;; compute W[-16] + W[-7] 4 at a time ;vmovdqa XTMP0, X3 mov y0, e ; y0 = e MY_ROR y0, (25-11) ; y0 = e >> (25-11) mov y1, a ; y1 = a vpalignr XTMP0, X3, X2, 4 ; XTMP0 = W[-7] MY_ROR y1, (22-13) ; y1 = a >> (22-13) xor y0, e ; y0 = e ^ (e >> (25-11)) mov y2, f ; y2 = f MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) ;vmovdqa XTMP1, X1 xor y1, a ; y1 = a ^ (a >> (22-13) xor y2, g ; y2 = f^g vpaddd XTMP0, XTMP0, X0 ; XTMP0 = W[-7] + 
W[-16] xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) and y2, e ; y2 = (f^g)&e MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) ;; compute s0 vpalignr XTMP1, X1, X0, 4 ; XTMP1 = W[-15] xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) xor y2, g ; y2 = CH = ((f^g)&e)^g MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) add y2, y0 ; y2 = S1 + CH add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH mov y0, a ; y0 = a add h, y2 ; h = h + S1 + CH + k + w mov y2, a ; y2 = a vpsrld XTMP2, XTMP1, 7 or y0, c ; y0 = a|c add d, h ; d = d + h + S1 + CH + k + w and y2, c ; y2 = a&c vpslld XTMP3, XTMP1, (32-7) and y0, b ; y0 = (a|c)&b add h, y1 ; h = h + S1 + CH + k + w + S0 vpor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ ROTATE_ARGS mov y0, e ; y0 = e mov y1, a ; y1 = a MY_ROR y0, (25-11) ; y0 = e >> (25-11) xor y0, e ; y0 = e ^ (e >> (25-11)) mov y2, f ; y2 = f MY_ROR y1, (22-13) ; y1 = a >> (22-13) vpsrld XTMP2, XTMP1,18 xor y1, a ; y1 = a ^ (a >> (22-13) MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) xor y2, g ; y2 = f^g vpsrld XTMP4, XTMP1, 3 ; XTMP4 = W[-15] >> 3 MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) and y2, e ; y2 = (f^g)&e MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) vpslld XTMP1, XTMP1, (32-18) xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) xor y2, g ; y2 = CH = ((f^g)&e)^g vpxor XTMP3, XTMP3, XTMP1 add y2, y0 ; y2 = S1 + CH add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) vpxor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 mov y0, a ; y0 = a add h, y2 ; h = h + S1 + CH + k + w mov y2, a ; y2 = a vpxor XTMP1, XTMP3, XTMP4 ; XTMP1 = s0 or y0, c ; y0 = a|c add d, h ; d = d + h + S1 + CH + k + w and y2, c ; y2 = a&c ;; compute low s1 vpshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA} and y0, b ; y0 = (a|c)&b add h, y1 ; h = h + S1 + CH + k + w + S0 vpaddd XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ ROTATE_ARGS ;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA} mov y0, e ; y0 = e mov y1, a ; y1 = a MY_ROR y0, (25-11) ; y0 = e >> (25-11) ;vmovdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA} xor y0, e ; y0 = e ^ (e >> (25-11)) MY_ROR y1, (22-13) ; y1 = a >> (22-13) mov y2, f ; y2 = f xor y1, a ; y1 = a ^ (a >> (22-13) MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) vpsrld XTMP4, XTMP2, 10 ; XTMP4 = W[-2] >> 10 {BBAA} xor y2, g ; y2 = f^g vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xBxA} xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) and y2, e ; y2 = (f^g)&e vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xBxA} MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) xor y2, g ; y2 = CH = ((f^g)&e)^g MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) vpxor XTMP2, XTMP2, XTMP3 add y2, y0 ; y2 = S1 + CH MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH vpxor XTMP4, XTMP4, XTMP2 ; XTMP4 = s1 {xBxA} mov y0, a ; y0 = a add h, y2 ; h = h + S1 + CH + k + w mov y2, a ; y2 = a vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA} or y0, c ; y0 = a|c add d, h ; d = d + h + S1 + CH + k + w and y2, c ; y2 = a&c vpaddd XTMP0, XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]} and y0, b ; y0 = (a|c)&b add h, y1 ; h = h + 
S1 + CH + k + w + S0 ;; compute high s1 vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC} or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ ROTATE_ARGS ;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC} mov y0, e ; y0 = e MY_ROR y0, (25-11) ; y0 = e >> (25-11) mov y1, a ; y1 = a ;vmovdqa XTMP5, XTMP2 ; XTMP5 = W[-2] {DDCC} MY_ROR y1, (22-13) ; y1 = a >> (22-13) xor y0, e ; y0 = e ^ (e >> (25-11)) mov y2, f ; y2 = f MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) vpsrld XTMP5, XTMP2, 10 ; XTMP5 = W[-2] >> 10 {DDCC} xor y1, a ; y1 = a ^ (a >> (22-13) xor y2, g ; y2 = f^g vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xDxC} xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) and y2, e ; y2 = (f^g)&e MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xDxC} xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) xor y2, g ; y2 = CH = ((f^g)&e)^g vpxor XTMP2, XTMP2, XTMP3 MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) add y2, y0 ; y2 = S1 + CH add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH vpxor XTMP5, XTMP5, XTMP2 ; XTMP5 = s1 {xDxC} mov y0, a ; y0 = a add h, y2 ; h = h + S1 + CH + k + w mov y2, a ; y2 = a vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00} or y0, c ; y0 = a|c add d, h ; d = d + h + S1 + CH + k + w and y2, c ; y2 = a&c vpaddd X0, XTMP5, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]} and y0, b ; y0 = (a|c)&b add h, y1 ; h = h + S1 + CH + k + w + S0 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ ROTATE_ARGS rotate_Xs %endm ;; input is [rsp + _XFER + %1 * 4] %macro DO_ROUND 1 mov y0, e ; y0 = e MY_ROR y0, (25-11) ; y0 = e >> (25-11) mov y1, a ; y1 = a xor y0, e ; y0 = e ^ (e >> (25-11)) MY_ROR y1, (22-13) ; y1 = a >> (22-13) mov y2, f ; y2 = f xor y1, a ; y1 = a ^ (a >> (22-13) MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) xor y2, g ; y2 = f^g xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) and y2, e ; y2 = (f^g)&e xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) xor y2, g ; y2 = CH = ((f^g)&e)^g add y2, y0 ; y2 = S1 + CH MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH mov y0, a ; y0 = a add h, y2 ; h = h + S1 + CH + k + w mov y2, a ; y2 = a or y0, c ; y0 = a|c add d, h ; d = d + h + S1 + CH + k + w and y2, c ; y2 = a&c and y0, b ; y0 = (a|c)&b add h, y1 ; h = h + S1 + CH + k + w + S0 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ ROTATE_ARGS %endm ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; void FUNC(void *input_data, UINT32 digest[8], UINT64 num_blks) ;; arg 1 : pointer to input data ;; arg 2 : pointer to digest section .text MKGLOBAL(FUNC,function,) align 32 FUNC: push rbx %ifndef LINUX push rsi push rdi %endif push rbp push r13 push r14 push r15 sub rsp,STACK_size %ifndef LINUX vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6 vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7 vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8 vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9 vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10 vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11 vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12 vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13 %endif ;; load initial digest mov a,H0 mov b,H1 mov c,H2 mov d,H3 mov e,H4 mov f,H5 mov g,H6 mov h,H7 
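	;; a..h now hold the initial digest state H0..H7: the standard
	;; SHA-256 initial hash values by default, or the SHA-224 values
	;; when this file is built via sha224_one_block_avx.asm.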
vmovdqa BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK] vmovdqa SHUF_00BA, [rel _SHUF_00BA] vmovdqa SHUF_DC00, [rel _SHUF_DC00] lea TBL,[rel K256] ;; byte swap first 16 dwords COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK ;; schedule 48 input dwords, by doing 3 rounds of 16 each mov SRND, 3 align 16 loop1: vpaddd XFER, X0, [TBL + 0*16] vmovdqa [rsp + _XFER], XFER FOUR_ROUNDS_AND_SCHED vpaddd XFER, X0, [TBL + 1*16] vmovdqa [rsp + _XFER], XFER FOUR_ROUNDS_AND_SCHED vpaddd XFER, X0, [TBL + 2*16] vmovdqa [rsp + _XFER], XFER FOUR_ROUNDS_AND_SCHED vpaddd XFER, X0, [TBL + 3*16] vmovdqa [rsp + _XFER], XFER add TBL, 4*16 FOUR_ROUNDS_AND_SCHED sub SRND, 1 jne loop1 mov SRND, 2 loop2: vpaddd XFER, X0, [TBL + 0*16] vmovdqa [rsp + _XFER], XFER DO_ROUND 0 DO_ROUND 1 DO_ROUND 2 DO_ROUND 3 vpaddd XFER, X1, [TBL + 1*16] vmovdqa [rsp + _XFER], XFER add TBL, 2*16 DO_ROUND 0 DO_ROUND 1 DO_ROUND 2 DO_ROUND 3 vmovdqa X0, X2 vmovdqa X1, X3 sub SRND, 1 jne loop2 add a,H0 add b,H1 add c,H2 add d,H3 add e,H4 add f,H5 add g,H6 mov [4*0 + CTX],a mov [4*1 + CTX],b mov [4*2 + CTX],c mov [4*3 + CTX],d mov [4*4 + CTX],e mov [4*5 + CTX],f mov [4*6 + CTX],g add h,H7 mov [4*7 + CTX],h done_hash: %ifndef LINUX vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16] vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16] vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16] vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16] vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16] vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16] vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16] vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16] %endif add rsp, STACK_size pop r15 pop r14 pop r13 pop rbp %ifndef LINUX pop rdi pop rsi %endif pop rbx ret intel-ipsec-mb-0.48/avx/sha384_one_block_avx.asm000066400000000000000000000037571321406316400214740ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
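;;
;; SHA-384 is a thin wrapper: the H0..H7 and FUNC defines below select
;; the SHA-384 initial digest values and function name, while the shared
;; round code is pulled in by %include "sha512_one_block_avx.asm".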
;; ; This code schedules 1 blocks at a time, with 4 lanes per block ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %define H0 0xcbbb9d5dc1059ed8 %define H1 0x629a292a367cd507 %define H2 0x9159015a3070dd17 %define H3 0x152fecd8f70e5939 %define H4 0x67332667ffc00b31 %define H5 0x8eb44a8768581511 %define H6 0xdb0c2e0d64f98fa7 %define H7 0x47b5481dbefa4fa4 %define FUNC sha384_one_block_avx %include "sha512_one_block_avx.asm" intel-ipsec-mb-0.48/avx/sha512_one_block_avx.asm000066400000000000000000000305711321406316400214570ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;; ; This code schedules 1 blocks at a time, with 4 lanes per block ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %include "os.asm" %define VMOVDQ vmovdqu ;; assume buffers not aligned %ifndef FUNC %define FUNC sha512_one_block_avx %endif ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros %macro MY_ROR 2 shld %1,%1,(64-(%2)) %endm ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask ; Load xmm with mem and byte swap each dword %macro COPY_XMM_AND_BSWAP 3 VMOVDQ %1, %2 vpshufb %1, %3 %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %define X0 xmm4 %define X1 xmm5 %define X2 xmm6 %define X3 xmm7 %define X4 xmm8 %define X5 xmm9 %define X6 xmm10 %define X7 xmm11 %define XTMP0 xmm0 %define XTMP1 xmm1 %define XTMP2 xmm2 %define XTMP3 xmm3 %define XFER xmm13 %define BYTE_FLIP_MASK xmm12 %ifdef LINUX %define CTX rsi ; 2nd arg %define INP rdi ; 1st arg %define SRND rdi ; clobbers INP %define c rcx %define d r8 %define e rdx %else %define CTX rdx ; 2nd arg %define INP rcx ; 1st arg %define SRND rcx ; clobbers INP %define c rdi %define d rsi %define e r8 %endif %define TBL rbp %define a rax %define b rbx %define f r9 %define g r10 %define h r11 %define y0 r13 %define y1 r14 %define y2 r15 %ifndef H0 %define H0 0x6a09e667f3bcc908 %define H1 0xbb67ae8584caa73b %define H2 0x3c6ef372fe94f82b %define H3 0xa54ff53a5f1d36f1 %define H4 0x510e527fade682d1 %define H5 0x9b05688c2b3e6c1f %define H6 0x1f83d9abfb41bd6b %define H7 0x5be0cd19137e2179 %endif struc STACK %ifndef LINUX _XMM_SAVE: reso 8 %endif _XFER: reso 1 endstruc ; rotate_Xs ; Rotate values of symbols X0...X7 %macro rotate_Xs 0 %xdefine X_ X0 %xdefine X0 X1 %xdefine X1 X2 %xdefine X2 X3 %xdefine X3 X4 %xdefine X4 X5 %xdefine X5 X6 %xdefine X6 X7 %xdefine X7 X_ %endm ; ROTATE_ARGS ; Rotate values of symbols a...h %macro ROTATE_ARGS 0 %xdefine TMP_ h %xdefine h g %xdefine g f %xdefine f e %xdefine e d %xdefine d c %xdefine c b %xdefine b a %xdefine a TMP_ %endm %macro TWO_ROUNDS_AND_SCHED 0 vpalignr XTMP0, X5, X4, 8 ; XTMP0 = W[-7] ;; compute s0 four at a time and s1 two at a time ;; compute W[-16] + W[-7] 4 at a time mov y0, e ; y0 = e mov y1, a ; y1 = a MY_ROR y0, (41-18) ; y0 = e >> (41-18) vpaddq XTMP0, XTMP0, X0 ; XTMP0 = W[-7] + W[-16] xor y0, e ; y0 = e ^ (e >> (41-18)) mov y2, f ; y2 = f MY_ROR y1, (39-34) ; y1 = a >> (39-34) ;; compute s0 vpalignr XTMP1, X1, X0, 8 ; XTMP1 = W[-15] xor y1, a ; y1 = a ^ (a >> (39-34) MY_ROR y0, (18-14) ; y0 = (e >> (18-14)) ^ (e >> (41-14)) vpsllq XTMP2, XTMP1, (64-1) xor y2, g ; y2 = f^g MY_ROR y1, (34-28) ; y1 = (a >> (34-28)) ^ (a >> (39-28)) vpsrlq XTMP3, XTMP1, 1 xor y0, e ; y0 = e ^ (e >> (18-14)) ^ (e >> (41-14)) and y2, e ; y2 = (f^g)&e MY_ROR y0, 14 ; y0 = S1 = (e>>14) & (e>>18) ^ (e>>41) vpor XTMP2, XTMP2, XTMP3 ; XTMP2 = W[-15] ror 1 xor y1, a ; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28)) xor y2, g ; y2 = CH = ((f^g)&e)^g add y2, y0 ; y2 = S1 + CH vpsrlq XTMP3, XTMP1, 8 add y2, [rsp + _XFER + 0*8] ; y2 = k + w + S1 + CH MY_ROR y1, 28 ; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39) mov y0, a ; y0 = a vpsllq X0, XTMP1, (64-8) add h, y2 ; h = h + S1 + CH + k + w mov y2, a ; y2 = a or y0, c ; y0 = a|c vpor X0, X0, XTMP3 add d, h ; d = d + t1 and y2, c ; y2 = a&c and y0, b ; y0 = (a|c)&b vpsrlq XTMP1, XTMP1, 7 ; X0 = W[-15] >> 7 add h, y1 ; h = t1 + S0 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) vpxor XTMP1, XTMP1, XTMP2 ; XTMP1 = W[-15] ror 1 ^ W[-15] ror 8 add h, y0 ; h = t1 + S0 + MAJ vpxor XTMP1, XTMP1, X0 ; XTMP1 = s0 ROTATE_ARGS ;; compute s1 vpaddq XTMP0, XTMP0, XTMP1 ; 
XTMP0 = W[-16] + W[-7] + s0 mov y0, e ; y0 = e mov y1, a ; y1 = a MY_ROR y0, (41-18) ; y0 = e >> (41-18) vpsllq XTMP3, X7, (64-19) xor y0, e ; y0 = e ^ (e >> (41-18)) mov y2, f ; y2 = f MY_ROR y1, (39-34) ; y1 = a >> (39-34) vpsrlq X0, X7, 19 xor y1, a ; y1 = a ^ (a >> (39-34) MY_ROR y0, (18-14) ; y0 = (e >> (18-14)) ^ (e >> (41-14)) vpor XTMP3, XTMP3, X0 ; XTMP3 = W[-2] ror 19 xor y2, g ; y2 = f^g MY_ROR y1, (34-28) ; y1 = (a >> (34-28)) ^ (a >> (39-28)) vpsllq XTMP2, X7, (64-61) xor y0, e ; y0 = e ^ (e >> (18-14)) ^ (e >> (41-14)) and y2, e ; y2 = (f^g)&e MY_ROR y0, 14 ; y0 = S1 = (e>>14) & (e>>18) ^ (e>>41) vpsrlq XTMP1, X7, 61 xor y1, a ; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28)) xor y2, g ; y2 = CH = ((f^g)&e)^g add y2, y0 ; y2 = S1 + CH vpor XTMP2, XTMP2, XTMP1 ; XTMP2 = W[-2] ror 61 add y2, [rsp + _XFER + 1*8] ; y2 = k + w + S1 + CH MY_ROR y1, 28 ; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39) mov y0, a ; y0 = a vpsrlq XTMP1, X7, 6 ; XTMP1 = W[-2] >> 6 add h, y2 ; h = h + S1 + CH + k + w mov y2, a ; y2 = a or y0, c ; y0 = a|c vpxor XTMP1, XTMP1, XTMP2 add d, h ; d = d + t1 and y2, c ; y2 = a&c and y0, b ; y0 = (a|c)&b vpxor X0, XTMP3, XTMP1 ; X0 = s1 add h, y1 ; h = t1 + S0 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) add h, y0 ; h = t1 + S0 + MAJ vpaddq X0, X0, XTMP0 ; X0 = {W[1], W[0]} ROTATE_ARGS rotate_Xs %endm ;; input is [rsp + _XFER + %1 * 8] %macro DO_ROUND 1 mov y0, e ; y0 = e MY_ROR y0, (41-18) ; y0 = e >> (41-18) mov y1, a ; y1 = a xor y0, e ; y0 = e ^ (e >> (41-18)) MY_ROR y1, (39-34) ; y1 = a >> (39-34) mov y2, f ; y2 = f xor y1, a ; y1 = a ^ (a >> (39-34) MY_ROR y0, (18-14) ; y0 = (e >> (18-14)) ^ (e >> (41-14)) xor y2, g ; y2 = f^g xor y0, e ; y0 = e ^ (e >> (18-14)) ^ (e >> (25-6)) MY_ROR y1, (34-28) ; y1 = (a >> (34-28)) ^ (a >> (39-28)) and y2, e ; y2 = (f^g)&e xor y1, a ; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28)) MY_ROR y0, 14 ; y0 = S1 = (e>>14) & (e>>18) ^ (e>>41) xor y2, g ; y2 = CH = ((f^g)&e)^g add y2, y0 ; y2 = S1 + CH MY_ROR y1, 28 ; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39) add y2, [rsp + _XFER + %1*8] ; y2 = k + w + S1 + CH mov y0, a ; y0 = a add h, y2 ; h = h + S1 + CH + k + w mov y2, a ; y2 = a or y0, c ; y0 = a|c add d, h ; d = d + t1 and y2, c ; y2 = a&c and y0, b ; y0 = (a|c)&b add h, y1 ; h = t1 + S0 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) add h, y0 ; h = t1 + S0 + MAJ ROTATE_ARGS %endm section .data default rel align 64 K512: dq 0x428a2f98d728ae22,0x7137449123ef65cd dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc dq 0x3956c25bf348b538,0x59f111f1b605d019 dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118 dq 0xd807aa98a3030242,0x12835b0145706fbe dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1 dq 0x9bdc06a725c71235,0xc19bf174cf692694 dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3 dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483 dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 dq 0x983e5152ee66dfab,0xa831c66d2db43210 dq 0xb00327c898fb213f,0xbf597fc7beef0ee4 dq 0xc6e00bf33da88fc2,0xd5a79147930aa725 dq 0x06ca6351e003826f,0x142929670a0e6e70 dq 0x27b70a8546d22ffc,0x2e1b21385c26c926 dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df dq 0x650a73548baf63de,0x766a0abb3c77b2a8 dq 0x81c2c92e47edaee6,0x92722c851482353b dq 0xa2bfe8a14cf10364,0xa81a664bbc423001 dq 0xc24b8b70d0f89791,0xc76c51a30654be30 dq 0xd192e819d6ef5218,0xd69906245565a910 dq 0xf40e35855771202a,0x106aa07032bbd1b8 dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53 dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 dq 
0x748f82ee5defb2fc,0x78a5636f43172f60 dq 0x84c87814a1f0ab72,0x8cc702081a6439ec dq 0x90befffa23631e28,0xa4506cebde82bde9 dq 0xbef9a3f7b2c67915,0xc67178f2e372532b dq 0xca273eceea26619c,0xd186b8c721c0c207 dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 dq 0x06f067aa72176fba,0x0a637dc5a2c898a6 dq 0x113f9804bef90dae,0x1b710b35131c471b dq 0x28db77f523047d84,0x32caab7b40c72493 dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817 h0: dq H0 h1: dq H1 h2: dq H2 h3: dq H3 h4: dq H4 h5: dq H5 h6: dq H6 h7: dq H7 align 16 PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x08090a0b0c0d0e0f0001020304050607 dq 0x0001020304050607, 0x08090a0b0c0d0e0f ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; void FUNC(void *input_data, UINT64 digest[8]) ;; arg 1 : pointer to input data ;; arg 2 : pointer to digest section .text MKGLOBAL(FUNC,function,) align 32 FUNC: push rbx %ifndef LINUX push rsi push rdi %endif push rbp push r13 push r14 push r15 sub rsp,STACK_size %ifndef LINUX vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6 vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7 vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8 vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9 vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10 vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11 vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12 vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13 %endif ;; load initial digest mov a,[rel h0] mov b,[rel h1] mov c,[rel h2] mov d,[rel h3] mov e,[rel h4] mov f,[rel h5] mov g,[rel h6] mov h,[rel h7] vmovdqa BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK] lea TBL,[rel K512] ;; byte swap first 16 qwords COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK COPY_XMM_AND_BSWAP X4, [INP + 4*16], BYTE_FLIP_MASK COPY_XMM_AND_BSWAP X5, [INP + 5*16], BYTE_FLIP_MASK COPY_XMM_AND_BSWAP X6, [INP + 6*16], BYTE_FLIP_MASK COPY_XMM_AND_BSWAP X7, [INP + 7*16], BYTE_FLIP_MASK ;; schedule 64 input qwords, by doing 4 iterations of 16 rounds mov SRND, 4 align 16 loop1: %assign i 0 %rep 7 vpaddq XFER, X0, [TBL + i*16] vmovdqa [rsp + _XFER], XFER TWO_ROUNDS_AND_SCHED %assign i (i+1) %endrep vpaddq XFER, X0, [TBL + 7*16] vmovdqa [rsp + _XFER], XFER add TBL, 8*16 TWO_ROUNDS_AND_SCHED sub SRND, 1 jne loop1 mov SRND, 2 jmp loop2a loop2: vmovdqa X0, X4 vmovdqa X1, X5 vmovdqa X2, X6 vmovdqa X3, X7 loop2a: vpaddq X0, X0, [TBL + 0*16] vmovdqa [rsp + _XFER], X0 DO_ROUND 0 DO_ROUND 1 vpaddq X1, X1, [TBL + 1*16] vmovdqa [rsp + _XFER], X1 DO_ROUND 0 DO_ROUND 1 vpaddq X2, X2, [TBL + 2*16] vmovdqa [rsp + _XFER], X2 DO_ROUND 0 DO_ROUND 1 vpaddq X3, X3, [TBL + 3*16] vmovdqa [rsp + _XFER], X3 add TBL, 4*16 DO_ROUND 0 DO_ROUND 1 sub SRND, 1 jne loop2 add a,[rel h0] add b,[rel h1] add c,[rel h2] add d,[rel h3] add e,[rel h4] add f,[rel h5] add g,[rel h6] mov [8*0 + CTX],a mov [8*1 + CTX],b mov [8*2 + CTX],c mov [8*3 + CTX],d mov [8*4 + CTX],e mov [8*5 + CTX],f mov [8*6 + CTX],g add h,[rel h7] mov [8*7 + CTX],h done_hash: %ifndef LINUX vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16] vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16] vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16] vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16] vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16] vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16] vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16] vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16] %endif add rsp, STACK_size pop r15 pop r14 pop r13 pop rbp %ifndef LINUX pop rdi pop rsi %endif pop 
rbx ret intel-ipsec-mb-0.48/avx/sha512_x2_avx.asm000066400000000000000000000222641321406316400200550ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ;; code to compute SHA512 by-2 using AVX ;; outer calling routine takes care of save and restore of XMM registers ;; Logic designed/laid out by JDG ;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15 ;; Stack must be aligned to 16 bytes before call ;; Windows clobbers: rax rdx r8 r9 r10 r11 ;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15 ;; ;; Linux clobbers: rax rsi r8 r9 r10 r11 ;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15 ;; ;; clobbers xmm0-15 %include "os.asm" %include "mb_mgr_datastruct.asm" extern K512_2 section .data default rel align 32 ; one from sha512_rorx ; this does the big endian to little endian conversion ; over a quad word PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x08090a0b0c0d0e0f0001020304050607 dq 0x0001020304050607, 0x08090a0b0c0d0e0f ;ddq 0x18191a1b1c1d1e1f1011121314151617 dq 0x1011121314151617, 0x18191a1b1c1d1e1f section .text %ifdef LINUX ; Linux definitions %define arg1 rdi %define arg2 rsi %else ; Windows definitions %define arg1 rcx %define arg2 rdx %endif ; Common definitions %define STATE arg1 %define INP_SIZE arg2 %define IDX rax %define ROUND r8 %define TBL r11 %define inp0 r9 %define inp1 r10 %define a xmm0 %define b xmm1 %define c xmm2 %define d xmm3 %define e xmm4 %define f xmm5 %define g xmm6 %define h xmm7 %define a0 xmm8 %define a1 xmm9 %define a2 xmm10 %define TT0 xmm14 %define TT1 xmm13 %define TT2 xmm12 %define TT3 xmm11 %define TT4 xmm10 %define TT5 xmm9 %define T1 xmm14 %define TMP xmm15 %define SZ2 2*SHA512_DIGEST_WORD_SIZE ; Size of one vector register %define ROUNDS 80*SZ2 ; Define stack usage struc STACK _DATA: resb SZ2 * 16 _DIGEST: resb SZ2 * NUM_SHA512_DIGEST_WORDS resb 8 ; for alignment, must be odd multiple of 8 endstruc %define VMOVPD vmovupd ; transpose r0, r1, t0 ; Input looks like {r0 r1} ; r0 = {a1 a0} ; r1 = {b1 b0} ; ; output looks like ; r0 = {b0, a0} ; t0 = {b1, a1} 
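; i.e. after the two vshufpd in the macro below, each output register holds
; the same message qword from both lanes side by side (lane 1 in the high
; qword, lane 0 in the low qword), which is the layout the round macros
; expect for processing the two lanes in parallel.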
%macro TRANSPOSE 3 %define %%r0 %1 %define %%r1 %2 %define %%t0 %3 vshufpd %%t0, %%r0, %%r1, 11b ; t0 = b1 a1 vshufpd %%r0, %%r0, %%r1, 00b ; r0 = b0 a0 %endm %macro ROTATE_ARGS 0 %xdefine TMP_ h %xdefine h g %xdefine g f %xdefine f e %xdefine e d %xdefine d c %xdefine c b %xdefine b a %xdefine a TMP_ %endm ; PRORQ reg, imm, tmp ; packed-rotate-right-double ; does a rotate by doing two shifts and an or %macro PRORQ 3 %define %%reg %1 %define %%imm %2 %define %%tmp %3 vpsllq %%tmp, %%reg, (64-(%%imm)) vpsrlq %%reg, %%reg, %%imm vpor %%reg, %%reg, %%tmp %endmacro ; non-destructive ; PRORQ_nd reg, imm, tmp, src %macro PRORQ_nd 4 %define %%reg %1 %define %%imm %2 %define %%tmp %3 %define %%src %4 vpsllq %%tmp, %%src, (64-(%%imm)) vpsrlq %%reg, %%src, %%imm vpor %%reg, %%reg, %%tmp %endmacro ; PRORQ dst/src, amt %macro PRORQ 2 PRORQ %1, %2, TMP %endmacro ; PRORQ_nd dst, src, amt %macro PRORQ_nd 3 PRORQ_nd %1, %3, TMP, %2 %endmacro ;; arguments passed implicitly in preprocessor symbols i, a...h %macro ROUND_00_15 2 %define %%T1 %1 %define %%i %2 PRORQ_nd a0, e, (18-14) ; sig1: a0 = (e >> 4) vpxor a2, f, g ; ch: a2 = f^g vpand a2, a2, e ; ch: a2 = (f^g)&e vpxor a2, a2, g ; a2 = ch PRORQ_nd a1, e, 41 ; sig1: a1 = (e >> 41) vmovdqa [SZ2*(%%i&0xf) + rsp + _DATA],%%T1 vpaddq %%T1,%%T1,[TBL + ROUND] ; T1 = W + K vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5) PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18) vpaddq h, h, a2 ; h = h + ch PRORQ_nd a2, a, (34-28) ; sig0: a2 = (a >> 6) vpaddq h, h, %%T1 ; h = h + ch + W + K vpxor a0, a0, a1 ; a0 = sigma1 vmovdqa %%T1, a ; maj: T1 = a PRORQ_nd a1, a, 39 ; sig0: a1 = (a >> 39) vpxor %%T1, %%T1, c ; maj: T1 = a^c add ROUND, SZ2 ; ROUND++ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b vpaddq h, h, a0 vpaddq d, d, h vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11) PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34) vpxor a2, a2, a1 ; a2 = sig0 vpand a1, a, c ; maj: a1 = a&c vpor a1, a1, %%T1 ; a1 = maj vpaddq h, h, a1 ; h = h + ch + W + K + maj vpaddq h, h, a2 ; h = h + ch + W + K + maj + sigma0 ROTATE_ARGS %endm ;; arguments passed implicitly in preprocessor symbols i, a...h %macro ROUND_16_XX 2 %define %%T1 %1 %define %%i %2 vmovdqa %%T1, [SZ2*((%%i-15)&0xf) + rsp + _DATA] vmovdqa a1, [SZ2*((%%i-2)&0xf) + rsp + _DATA] vmovdqa a0, %%T1 PRORQ %%T1, 8-1 vmovdqa a2, a1 PRORQ a1, 61-19 vpxor %%T1, %%T1, a0 PRORQ %%T1, 1 vpxor a1, a1, a2 PRORQ a1, 19 vpsrlq a0, a0, 7 vpxor %%T1, %%T1, a0 vpsrlq a2, a2, 6 vpxor a1, a1, a2 vpaddq %%T1, %%T1, [SZ2*((%%i-16)&0xf) + rsp + _DATA] vpaddq a1, a1, [SZ2*((%%i-7)&0xf) + rsp + _DATA] vpaddq %%T1, %%T1, a1 ROUND_00_15 %%T1, %%i %endm ;; SHA512_ARGS: ;; UINT128 digest[8]; // transposed digests ;; UINT8 *data_ptr[2]; ;; ;; void sha512_x2_avx(SHA512_ARGS *args, UINT64 msg_size_in_blocks) ;; arg 1 : STATE : pointer args ;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1) ;; MKGLOBAL(sha512_x2_avx,function,internal) align 32 sha512_x2_avx: ; general registers preserved in outer calling routine ; outer calling routine saves all the XMM registers sub rsp, STACK_size ;; Load the pre-transposed incoming digest. 
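	;; The digest in SHA512_ARGS is stored transposed: each row, at a
	;; stride of SHA512_DIGEST_ROW_SIZE bytes, holds digest word i for
	;; both lanes, so one XMM load per row gives a..h the state of the
	;; two lanes in parallel.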
vmovdqa a,[STATE + 0 * SHA512_DIGEST_ROW_SIZE] vmovdqa b,[STATE + 1 * SHA512_DIGEST_ROW_SIZE] vmovdqa c,[STATE + 2 * SHA512_DIGEST_ROW_SIZE] vmovdqa d,[STATE + 3 * SHA512_DIGEST_ROW_SIZE] vmovdqa e,[STATE + 4 * SHA512_DIGEST_ROW_SIZE] vmovdqa f,[STATE + 5 * SHA512_DIGEST_ROW_SIZE] vmovdqa g,[STATE + 6 * SHA512_DIGEST_ROW_SIZE] vmovdqa h,[STATE + 7 * SHA512_DIGEST_ROW_SIZE] lea TBL,[rel K512_2] ;; load the address of each of the 2 message lanes ;; getting ready to transpose input onto stack mov inp0,[STATE + _data_ptr_sha512 +0*PTR_SZ] mov inp1,[STATE + _data_ptr_sha512 +1*PTR_SZ] xor IDX, IDX lloop: xor ROUND, ROUND ;; save old digest vmovdqa [rsp + _DIGEST + 0*SZ2], a vmovdqa [rsp + _DIGEST + 1*SZ2], b vmovdqa [rsp + _DIGEST + 2*SZ2], c vmovdqa [rsp + _DIGEST + 3*SZ2], d vmovdqa [rsp + _DIGEST + 4*SZ2], e vmovdqa [rsp + _DIGEST + 5*SZ2], f vmovdqa [rsp + _DIGEST + 6*SZ2], g vmovdqa [rsp + _DIGEST + 7*SZ2], h %assign i 0 %rep 8 ;; load up the shuffler for little-endian to big-endian format vmovdqa TMP, [rel PSHUFFLE_BYTE_FLIP_MASK] VMOVPD TT0,[inp0+IDX+i*16] ;; double precision is 64 bits VMOVPD TT2,[inp1+IDX+i*16] TRANSPOSE TT0, TT2, TT1 vpshufb TT0, TT0, TMP vpshufb TT1, TT1, TMP ROUND_00_15 TT0,(i*2+0) ROUND_00_15 TT1,(i*2+1) %assign i (i+1) %endrep ;; Increment IDX by message block size == 8 (loop) * 16 (XMM width in bytes) add IDX, 8 * 16 %assign i (i*4) jmp Lrounds_16_xx align 16 Lrounds_16_xx: %rep 16 ROUND_16_XX T1, i %assign i (i+1) %endrep cmp ROUND,ROUNDS jb Lrounds_16_xx ;; add old digest vpaddq a, a, [rsp + _DIGEST + 0*SZ2] vpaddq b, b, [rsp + _DIGEST + 1*SZ2] vpaddq c, c, [rsp + _DIGEST + 2*SZ2] vpaddq d, d, [rsp + _DIGEST + 3*SZ2] vpaddq e, e, [rsp + _DIGEST + 4*SZ2] vpaddq f, f, [rsp + _DIGEST + 5*SZ2] vpaddq g, g, [rsp + _DIGEST + 6*SZ2] vpaddq h, h, [rsp + _DIGEST + 7*SZ2] sub INP_SIZE, 1 ;; consumed one message block jne lloop ; write back to memory (state object) the transposed digest vmovdqa [STATE+0*SHA512_DIGEST_ROW_SIZE],a vmovdqa [STATE+1*SHA512_DIGEST_ROW_SIZE],b vmovdqa [STATE+2*SHA512_DIGEST_ROW_SIZE],c vmovdqa [STATE+3*SHA512_DIGEST_ROW_SIZE],d vmovdqa [STATE+4*SHA512_DIGEST_ROW_SIZE],e vmovdqa [STATE+5*SHA512_DIGEST_ROW_SIZE],f vmovdqa [STATE+6*SHA512_DIGEST_ROW_SIZE],g vmovdqa [STATE+7*SHA512_DIGEST_ROW_SIZE],h ; update input pointers add inp0, IDX mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0 add inp1, IDX mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1 ;;;;;;;;;;;;;;;; ;; Postamble add rsp, STACK_size ; outer calling routine restores XMM and other GP registers ret intel-ipsec-mb-0.48/avx/sha_256_mult_avx.asm000066400000000000000000000231741321406316400206520ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. 
;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ;; code to compute quad SHA256 using AVX ;; outer calling routine takes care of save and restore of XMM registers ;; Logic designed/laid out by JDG ;; Stack must be aligned to 16 bytes before call ;; Windows clobbers: rax rbx rdx r8 r9 r10 r11 r12 ;; Windows preserves: rcx rsi rdi rbp r12 r14 r15 ;; ;; Linux clobbers: rax rbx rsi r8 r9 r10 r11 r12 ;; Linux preserves: rcx rdx rdi rbp r13 r14 r15 ;; ;; clobbers xmm0-15 %include "os.asm" %include "mb_mgr_datastruct.asm" extern K256_4 %ifdef LINUX %define arg1 rdi %define arg2 rsi %else ; Windows definitions %define arg1 rcx %define arg2 rdx %endif ; Common definitions %define STATE arg1 %define INP_SIZE arg2 %define IDX rax %define ROUND rbx %define TBL r12 %define inp0 r8 %define inp1 r9 %define inp2 r10 %define inp3 r11 %define a xmm0 %define b xmm1 %define c xmm2 %define d xmm3 %define e xmm4 %define f xmm5 %define g xmm6 %define h xmm7 %define a0 xmm8 %define a1 xmm9 %define a2 xmm10 %define TT0 xmm14 %define TT1 xmm13 %define TT2 xmm12 %define TT3 xmm11 %define TT4 xmm10 %define TT5 xmm9 %define T1 xmm14 %define TMP xmm15 %define SZ4 4*SHA256_DIGEST_WORD_SIZE ; Size of one vector register %define ROUNDS 64*SZ4 ; Define stack usage struc STACK _DATA: resb SZ4 * 16 _DIGEST: resb SZ4 * NUM_SHA256_DIGEST_WORDS resb 8 ; for alignment, must be odd multiple of 8 endstruc %define VMOVPS vmovups ; transpose r0, r1, r2, r3, t0, t1 ; "transpose" data in {r0..r3} using temps {t0..t3} ; Input looks like: {r0 r1 r2 r3} ; r0 = {a3 a2 a1 a0} ; r1 = {b3 b2 b1 b0} ; r2 = {c3 c2 c1 c0} ; r3 = {d3 d2 d1 d0} ; ; output looks like: {t0 r1 r0 r3} ; t0 = {d0 c0 b0 a0} ; r1 = {d1 c1 b1 a1} ; r0 = {d2 c2 b2 a2} ; r3 = {d3 c3 b3 a3} ; %macro TRANSPOSE 6 %define %%r0 %1 %define %%r1 %2 %define %%r2 %3 %define %%r3 %4 %define %%t0 %5 %define %%t1 %6 vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2} vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} %endmacro %macro ROTATE_ARGS 0 %xdefine TMP_ h %xdefine h g %xdefine g f %xdefine f e %xdefine e d %xdefine d c %xdefine c b %xdefine b a %xdefine a TMP_ %endm ; PRORD reg, imm, tmp %macro PRORD 3 %define %%reg %1 %define %%imm %2 %define %%tmp %3 vpslld %%tmp, %%reg, (32-(%%imm)) vpsrld %%reg, %%reg, %%imm vpor %%reg, %%reg, %%tmp %endmacro ; non-destructive ; PRORD_nd reg, imm, tmp, src %macro PRORD_nd 4 %define %%reg %1 %define %%imm %2 %define %%tmp %3 %define %%src %4 ;vmovdqa %%tmp, %%reg vpslld %%tmp, %%src, (32-(%%imm)) vpsrld 
%%reg, %%src, %%imm vpor %%reg, %%reg, %%tmp %endmacro ; PRORD dst/src, amt %macro PRORD 2 PRORD %1, %2, TMP %endmacro ; PRORD_nd dst, src, amt %macro PRORD_nd 3 PRORD_nd %1, %3, TMP, %2 %endmacro ;; arguments passed implicitly in preprocessor symbols i, a...h %macro ROUND_00_15 2 %define %%T1 %1 %define %%i %2 PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5) vpxor a2, f, g ; ch: a2 = f^g vpand a2, a2, e ; ch: a2 = (f^g)&e vpxor a2, a2, g ; a2 = ch PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25) vmovdqa [SZ4*(%%i&0xf) + rsp + _DATA], %%T1 vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5) PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11) vpaddd h, h, a2 ; h = h + ch PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11) vpaddd h, h, %%T1 ; h = h + ch + W + K vpxor a0, a0, a1 ; a0 = sigma1 PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22) vpxor %%T1, a, c ; maj: T1 = a^c add ROUND, SZ4 ; ROUND++ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b vpaddd h, h, a0 vpaddd d, d, h vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11) PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13) vpxor a2, a2, a1 ; a2 = sig0 vpand a1, a, c ; maj: a1 = a&c vpor a1, a1, %%T1 ; a1 = maj vpaddd h, h, a1 ; h = h + ch + W + K + maj vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0 ROTATE_ARGS %endm ;; arguments passed implicitly in preprocessor symbols i, a...h %macro ROUND_16_XX 2 %define %%T1 %1 %define %%i %2 vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp + _DATA] vmovdqa a1, [SZ4*((%%i-2)&0xf) + rsp + _DATA] vmovdqa a0, %%T1 PRORD %%T1, 18-7 vmovdqa a2, a1 PRORD a1, 19-17 vpxor %%T1, %%T1, a0 PRORD %%T1, 7 vpxor a1, a1, a2 PRORD a1, 17 vpsrld a0, a0, 3 vpxor %%T1, %%T1, a0 vpsrld a2, a2, 10 vpxor a1, a1, a2 vpaddd %%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp + _DATA] vpaddd a1, a1, [SZ4*((%%i-7)&0xf) + rsp + _DATA] vpaddd %%T1, %%T1, a1 ROUND_00_15 %%T1, %%i %endm section .data default rel align 16 PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 dq 0x0405060700010203, 0x0c0d0e0f08090a0b section .text ;; SHA256_ARGS: ;; UINT128 digest[8]; // transposed digests ;; UINT8 *data_ptr[4]; ;; ;; void sha_256_mult_avx(SHA256_ARGS *args, UINT64 num_blocks); ;; arg 1 : STATE : pointer args ;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1) ;; MKGLOBAL(sha_256_mult_avx,function,internal) align 16 sha_256_mult_avx: ; general registers preserved in outer calling routine ; outer calling routine saves all the XMM registers sub rsp, STACK_size ;; Load the pre-transposed incoming digest. 
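	;; As in the SHA-512 x2 variant, the SHA256_ARGS digest is stored
	;; transposed: each SHA256_DIGEST_ROW_SIZE row holds one digest word
	;; for all four lanes, so a..h process the four message lanes in
	;; parallel.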
vmovdqa a,[STATE+0*SHA256_DIGEST_ROW_SIZE] vmovdqa b,[STATE+1*SHA256_DIGEST_ROW_SIZE] vmovdqa c,[STATE+2*SHA256_DIGEST_ROW_SIZE] vmovdqa d,[STATE+3*SHA256_DIGEST_ROW_SIZE] vmovdqa e,[STATE+4*SHA256_DIGEST_ROW_SIZE] vmovdqa f,[STATE+5*SHA256_DIGEST_ROW_SIZE] vmovdqa g,[STATE+6*SHA256_DIGEST_ROW_SIZE] vmovdqa h,[STATE+7*SHA256_DIGEST_ROW_SIZE] lea TBL,[rel K256_4] ;; load the address of each of the 4 message lanes ;; getting ready to transpose input onto stack mov inp0,[STATE + _data_ptr_sha256 + 0*PTR_SZ] mov inp1,[STATE + _data_ptr_sha256 + 1*PTR_SZ] mov inp2,[STATE + _data_ptr_sha256 + 2*PTR_SZ] mov inp3,[STATE + _data_ptr_sha256 + 3*PTR_SZ] xor IDX, IDX lloop: xor ROUND, ROUND ;; save old digest vmovdqa [rsp + _DIGEST + 0*SZ4], a vmovdqa [rsp + _DIGEST + 1*SZ4], b vmovdqa [rsp + _DIGEST + 2*SZ4], c vmovdqa [rsp + _DIGEST + 3*SZ4], d vmovdqa [rsp + _DIGEST + 4*SZ4], e vmovdqa [rsp + _DIGEST + 5*SZ4], f vmovdqa [rsp + _DIGEST + 6*SZ4], g vmovdqa [rsp + _DIGEST + 7*SZ4], h %assign i 0 %rep 4 vmovdqa TMP, [rel PSHUFFLE_BYTE_FLIP_MASK] VMOVPS TT2,[inp0+IDX+i*16] VMOVPS TT1,[inp1+IDX+i*16] VMOVPS TT4,[inp2+IDX+i*16] VMOVPS TT3,[inp3+IDX+i*16] TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5 vpshufb TT0, TT0, TMP vpshufb TT1, TT1, TMP vpshufb TT2, TT2, TMP vpshufb TT3, TT3, TMP ROUND_00_15 TT0,(i*4+0) ROUND_00_15 TT1,(i*4+1) ROUND_00_15 TT2,(i*4+2) ROUND_00_15 TT3,(i*4+3) %assign i (i+1) %endrep add IDX, 4*4*4 %assign i (i*4) jmp Lrounds_16_xx align 16 Lrounds_16_xx: %rep 16 ROUND_16_XX T1, i %assign i (i+1) %endrep cmp ROUND,ROUNDS jb Lrounds_16_xx ;; add old digest vpaddd a, a, [rsp + _DIGEST + 0*SZ4] vpaddd b, b, [rsp + _DIGEST + 1*SZ4] vpaddd c, c, [rsp + _DIGEST + 2*SZ4] vpaddd d, d, [rsp + _DIGEST + 3*SZ4] vpaddd e, e, [rsp + _DIGEST + 4*SZ4] vpaddd f, f, [rsp + _DIGEST + 5*SZ4] vpaddd g, g, [rsp + _DIGEST + 6*SZ4] vpaddd h, h, [rsp + _DIGEST + 7*SZ4] sub INP_SIZE, 1 ;; unit is blocks jne lloop ; write back to memory (state object) the transposed digest vmovdqa [STATE+0*SHA256_DIGEST_ROW_SIZE],a vmovdqa [STATE+1*SHA256_DIGEST_ROW_SIZE],b vmovdqa [STATE+2*SHA256_DIGEST_ROW_SIZE],c vmovdqa [STATE+3*SHA256_DIGEST_ROW_SIZE],d vmovdqa [STATE+4*SHA256_DIGEST_ROW_SIZE],e vmovdqa [STATE+5*SHA256_DIGEST_ROW_SIZE],f vmovdqa [STATE+6*SHA256_DIGEST_ROW_SIZE],g vmovdqa [STATE+7*SHA256_DIGEST_ROW_SIZE],h ; update input pointers add inp0, IDX mov [STATE + _data_ptr_sha256 + 0*8], inp0 add inp1, IDX mov [STATE + _data_ptr_sha256 + 1*8], inp1 add inp2, IDX mov [STATE + _data_ptr_sha256 + 2*8], inp2 add inp3, IDX mov [STATE + _data_ptr_sha256 + 3*8], inp3 ;;;;;;;;;;;;;;;; ;; Postamble add rsp, STACK_size ; outer calling routine restores XMM and other GP registers ret intel-ipsec-mb-0.48/avx2/000077500000000000000000000000001321406316400151355ustar00rootroot00000000000000intel-ipsec-mb-0.48/avx2/gcm128_avx_gen4.asm000066400000000000000000000033611321406316400204360ustar00rootroot00000000000000;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright(c) 2017 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions ; are met: ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in ; the documentation and/or other materials provided with the ; distribution. 
; * Neither the name of Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived ; from this software without specific prior written permission. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %define GCM128_MODE 1 %include "gcm_avx_gen4.asm" intel-ipsec-mb-0.48/avx2/gcm192_avx_gen4.asm000066400000000000000000000033611321406316400204370ustar00rootroot00000000000000;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright(c) 2017 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions ; are met: ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in ; the documentation and/or other materials provided with the ; distribution. ; * Neither the name of Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived ; from this software without specific prior written permission. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %define GCM192_MODE 1 %include "gcm_avx_gen4.asm" intel-ipsec-mb-0.48/avx2/gcm256_avx_gen4.asm000066400000000000000000000033611321406316400204400ustar00rootroot00000000000000;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright(c) 2017 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions ; are met: ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. 
; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in ; the documentation and/or other materials provided with the ; distribution. ; * Neither the name of Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived ; from this software without specific prior written permission. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %define GCM256_MODE 1 %include "gcm_avx_gen4.asm" intel-ipsec-mb-0.48/avx2/gcm_avx_gen4.asm000066400000000000000000003513021321406316400202040ustar00rootroot00000000000000;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright(c) 2011-2017 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions ; are met: ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in ; the documentation and/or other materials provided with the ; distribution. ; * Neither the name of Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived ; from this software without specific prior written permission. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; ; Authors: ; Erdinc Ozturk ; Vinodh Gopal ; James Guilford ; ; ; References: ; This code was derived and highly optimized from the code described in paper: ; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010 ; The details of the implementation is explained in: ; Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on Intel Architecture Processors. October, 2012. 
; ; ; ; ; Assumptions: ; ; ; ; iv: ; 0 1 2 3 ; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; | Salt (From the SA) | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; | Initialization Vector | ; | (This is the sequence number from IPSec header) | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; | 0x1 | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; ; ; ; AAD: ; AAD will be padded with 0 to the next 16byte multiple ; for example, assume AAD is a u32 vector ; ; if AAD is 8 bytes: ; AAD[3] = {A0, A1}; ; padded AAD in xmm register = {A1 A0 0 0} ; ; 0 1 2 3 ; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; | SPI (A1) | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; | 32-bit Sequence Number (A0) | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; | 0x0 | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; ; AAD Format with 32-bit Sequence Number ; ; if AAD is 12 bytes: ; AAD[3] = {A0, A1, A2}; ; padded AAD in xmm register = {A2 A1 A0 0} ; ; 0 1 2 3 ; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; | SPI (A2) | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; | 64-bit Extended Sequence Number {A1,A0} | ; | | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; | 0x0 | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; ; AAD Format with 64-bit Extended Sequence Number ; ; ; aadLen: ; Must be a multiple of 4 bytes and from the definition of the spec. ; The code additionally supports any aadLen length. ; ; TLen: ; from the definition of the spec, TLen can only be 8, 12 or 16 bytes. ; ; poly = x^128 + x^127 + x^126 + x^121 + 1 ; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part. ; %include "os.asm" %include "reg_sizes.asm" %include "gcm_defines.asm" %ifndef GCM128_MODE %ifndef GCM192_MODE %ifndef GCM256_MODE %error "No GCM mode selected for gcm_avx_gen4.asm!" 
%endif %endif %endif ;; Decide on AES-GCM key size to compile for %ifdef GCM128_MODE %define NROUNDS 9 %define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ avx_gen4 %endif %ifdef GCM192_MODE %define NROUNDS 11 %define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ avx_gen4 %endif %ifdef GCM256_MODE %define NROUNDS 13 %define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ avx_gen4 %endif section .text default rel ; need to push 4 registers into stack to maintain %define STACK_OFFSET 8*4 %define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register) %define TMP3 16*1 ; Temporary storage for AES State 3 %define TMP4 16*2 ; Temporary storage for AES State 4 %define TMP5 16*3 ; Temporary storage for AES State 5 %define TMP6 16*4 ; Temporary storage for AES State 6 %define TMP7 16*5 ; Temporary storage for AES State 7 %define TMP8 16*6 ; Temporary storage for AES State 8 %define LOCAL_STORAGE 16*7 %ifidn __OUTPUT_FORMAT__, win64 %define XMM_STORAGE 16*10 %else %define XMM_STORAGE 0 %endif %define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Utility Macros ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) ; Input: A and B (128-bits each, bit-reflected) ; Output: C = A*B*x mod poly, (i.e. >>1 ) ; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input ; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro GHASH_MUL 7 %define %%GH %1 ; 16 Bytes %define %%HK %2 ; 16 Bytes %define %%T1 %3 %define %%T2 %4 %define %%T3 %5 %define %%T4 %6 %define %%T5 %7 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1 vpclmulqdq %%T2, %%GH, %%HK, 0x00 ; %%T2 = a0*b0 vpclmulqdq %%T3, %%GH, %%HK, 0x01 ; %%T3 = a1*b0 vpclmulqdq %%GH, %%GH, %%HK, 0x10 ; %%GH = a0*b1 vpxor %%GH, %%GH, %%T3 vpsrldq %%T3, %%GH, 8 ; shift-R %%GH 2 DWs vpslldq %%GH, %%GH, 8 ; shift-L %%GH 2 DWs vpxor %%T1, %%T1, %%T3 vpxor %%GH, %%GH, %%T2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;first phase of the reduction vmovdqu %%T3, [POLY2] vpclmulqdq %%T2, %%T3, %%GH, 0x01 vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;second phase of the reduction vpclmulqdq %%T2, %%T3, %%GH, 0x00 vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) vpclmulqdq %%GH, %%T3, %%GH, 0x10 vpslldq %%GH, %%GH, 4 ; shift-L %%GH 1 DW (Shift-L 1-DW to obtain result with no shifts) vpxor %%GH, %%GH, %%T2 ; second phase of the reduction complete ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; vpxor %%GH, %%GH, %%T1 ; the result is in %%GH %endmacro ; In PRECOMPUTE, the commands filling Hashkey_i_k are not required for avx_gen4 ; functions, but are kept to allow users to switch cpu architectures between calls ; of pre, init, update, and finalize. 
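; PRECOMPUTE also stores HashKey^2 .. HashKey^8 (each <<1 mod poly) alongside HashKey
; so that up to eight ciphertext blocks can be hashed with one deferred reduction:
;   Yout = (Yin ^ C1)*H^8 ^ C2*H^7 ^ ... ^ C7*H^2 ^ C8*H
; instead of eight serial GHASH_MUL invocations.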
%macro PRECOMPUTE 8 %define %%GDATA %1 %define %%HK %2 %define %%T1 %3 %define %%T2 %4 %define %%T3 %5 %define %%T4 %6 %define %%T5 %7 %define %%T6 %8 ; Haskey_i_k holds XORed values of the low and high parts of the Haskey_i vmovdqa %%T5, %%HK vpshufd %%T1, %%T5, 01001110b vpxor %%T1, %%T5 vmovdqu [%%GDATA + HashKey_k], %%T1 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly vpshufd %%T1, %%T5, 01001110b vpxor %%T1, %%T5 vmovdqu [%%GDATA + HashKey_2_k], %%T1 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly vmovdqu [%%GDATA + HashKey_3], %%T5 vpshufd %%T1, %%T5, 01001110b vpxor %%T1, %%T5 vmovdqu [%%GDATA + HashKey_3_k], %%T1 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly vmovdqu [%%GDATA + HashKey_4], %%T5 vpshufd %%T1, %%T5, 01001110b vpxor %%T1, %%T5 vmovdqu [%%GDATA + HashKey_4_k], %%T1 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly vmovdqu [%%GDATA + HashKey_5], %%T5 vpshufd %%T1, %%T5, 01001110b vpxor %%T1, %%T5 vmovdqu [%%GDATA + HashKey_5_k], %%T1 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly vmovdqu [%%GDATA + HashKey_6], %%T5 vpshufd %%T1, %%T5, 01001110b vpxor %%T1, %%T5 vmovdqu [%%GDATA + HashKey_6_k], %%T1 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly vmovdqu [%%GDATA + HashKey_7], %%T5 vpshufd %%T1, %%T5, 01001110b vpxor %%T1, %%T5 vmovdqu [%%GDATA + HashKey_7_k], %%T1 GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly vmovdqu [%%GDATA + HashKey_8], %%T5 vpshufd %%T1, %%T5, 01001110b vpxor %%T1, %%T5 vmovdqu [%%GDATA + HashKey_8_k], %%T1 %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes. ; Returns 0 if data has length 0. ; Input: The input data (INPUT), that data's length (LENGTH). ; Output: The packed xmm register (OUTPUT). ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro READ_SMALL_DATA_INPUT 6 %define %%OUTPUT %1 ; %%OUTPUT is an xmm register %define %%INPUT %2 %define %%LENGTH %3 %define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers %define %%COUNTER %5 %define %%TMP1 %6 vpxor %%OUTPUT, %%OUTPUT mov %%COUNTER, %%LENGTH mov %%END_READ_LOCATION, %%INPUT add %%END_READ_LOCATION, %%LENGTH xor %%TMP1, %%TMP1 cmp %%COUNTER, 8 jl %%_byte_loop_2 vpinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exists je %%_done sub %%COUNTER, 8 %%_byte_loop_1: ;Read in data 1 byte at a time while data is left shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in dec %%END_READ_LOCATION mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] dec %%COUNTER jg %%_byte_loop_1 vpinsrq %%OUTPUT, %%TMP1, 1 jmp %%_done %%_byte_loop_2: ;Read in data 1 byte at a time while data is left ;; NOTE: in current implementation check for zero length is obsolete here. ;; The adequate checks are done by callers of this macro. 
;; cmp %%COUNTER, 0 ;; je %%_done shl %%TMP1, 8 ;This loop handles when no bytes were already read in dec %%END_READ_LOCATION mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] dec %%COUNTER jg %%_byte_loop_2 vpinsrq %%OUTPUT, %%TMP1, 0 %%_done: %endmacro ; READ_SMALL_DATA_INPUT ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. ; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY). ; Output: The hash of the data (AAD_HASH). ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro CALC_AAD_HASH 14 %define %%A_IN %1 %define %%A_LEN %2 %define %%AAD_HASH %3 %define %%HASH_KEY %4 %define %%XTMP1 %5 ; xmm temp reg 5 %define %%XTMP2 %6 %define %%XTMP3 %7 %define %%XTMP4 %8 %define %%XTMP5 %9 ; xmm temp reg 5 %define %%T1 %10 ; temp reg 1 %define %%T2 %11 %define %%T3 %12 %define %%T4 %13 %define %%T5 %14 ; temp reg 5 mov %%T1, %%A_IN ; T1 = AAD mov %%T2, %%A_LEN ; T2 = aadLen vpxor %%AAD_HASH, %%AAD_HASH cmp %%T2, 16 jl %%_get_small_AAD_block %%_get_AAD_loop16: vmovdqu %%XTMP1, [%%T1] ;byte-reflect the AAD data vpshufb %%XTMP1, [SHUF_MASK] vpxor %%AAD_HASH, %%XTMP1 GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 sub %%T2, 16 je %%_CALC_AAD_done add %%T1, 16 cmp %%T2, 16 jge %%_get_AAD_loop16 %%_get_small_AAD_block: READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5 ;byte-reflect the AAD data vpshufb %%XTMP1, [SHUF_MASK] vpxor %%AAD_HASH, %%XTMP1 GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 %%_CALC_AAD_done: %endmacro ; CALC_AAD_HASH ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls. ; Requires the input data be at least 1 byte long. 
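; A partial block left over from a previous update call (PBlockLen bytes, with its
; encrypted counter block saved in PBlockEncKey) is completed here before the caller
; resumes processing on 16-byte block boundaries.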
; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN), ; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET), ; and whether encoding or decoding (ENC_DEC) ; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX ; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro PARTIAL_BLOCK 8 %define %%GDATA_KEY %1 %define %%GDATA_CTX %2 %define %%CYPH_PLAIN_OUT %3 %define %%PLAIN_CYPH_IN %4 %define %%PLAIN_CYPH_LEN %5 %define %%DATA_OFFSET %6 %define %%AAD_HASH %7 %define %%ENC_DEC %8 mov r13, [%%GDATA_CTX + PBlockLen] cmp r13, 0 je %%_partial_block_done ;Leave Macro if no partial blocks cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading jl %%_fewer_than_16_bytes VXLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register jmp %%_data_read %%_fewer_than_16_bytes: lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15 %%_data_read: ;Finished reading in data vmovdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key vmovdqu xmm13, [%%GDATA_KEY + HashKey] lea r12, [SHIFT_MASK] add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16) vmovdqu xmm2, [r12] ; get the appropriate shuffle mask vpshufb xmm9, xmm2 ;shift right r13 bytes %ifidn %%ENC_DEC, DEC vmovdqa xmm3, xmm1 vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn) mov r15, %%PLAIN_CYPH_LEN add r15, r13 sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block jge %%_no_extra_mask_1 ;Determine if if partial block is not being filled and shift mask accordingly sub r12, r15 %%_no_extra_mask_1: vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK]; get the appropriate mask to mask out bottom r13 bytes of xmm9 vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 vpand xmm3, xmm1 vpshufb xmm3, [SHUF_MASK] vpshufb xmm3, xmm2 vpxor %%AAD_HASH, xmm3 cmp r15,0 jl %%_partial_incomplete_1 GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block xor rax,rax mov [%%GDATA_CTX + PBlockLen], rax jmp %%_dec_done %%_partial_incomplete_1: %ifidn __OUTPUT_FORMAT__, win64 mov rax, %%PLAIN_CYPH_LEN add [%%GDATA_CTX + PBlockLen], rax %else add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN %endif %%_dec_done: vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH %else vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) mov r15, %%PLAIN_CYPH_LEN add r15, r13 sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block jge %%_no_extra_mask_2 ;Determine if if partial block is not being filled and shift mask accordingly sub r12, r15 %%_no_extra_mask_2: vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9 vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 vpshufb xmm9, [SHUF_MASK] vpshufb xmm9, xmm2 vpxor %%AAD_HASH, xmm9 cmp r15,0 jl %%_partial_incomplete_2 GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block xor rax,rax mov [%%GDATA_CTX + PBlockLen], rax jmp %%_encode_done %%_partial_incomplete_2: %ifidn __OUTPUT_FORMAT__, win64 mov rax, %%PLAIN_CYPH_LEN add [%%GDATA_CTX + PBlockLen], rax %else add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN %endif %%_encode_done: vmovdqu 
[%%GDATA_CTX + AadHash], %%AAD_HASH vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext vpshufb xmm9, xmm2 %endif ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; output encrypted Bytes cmp r15,0 jl %%_partial_fill mov r12, r13 mov r13, 16 sub r13, r12 ; Set r13 to be the number of bytes to write out jmp %%_count_set %%_partial_fill: mov r13, %%PLAIN_CYPH_LEN %%_count_set: vmovq rax, xmm9 cmp r13, 8 jle %%_less_than_8_bytes_left mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax add %%DATA_OFFSET, 8 vpsrldq xmm9, xmm9, 8 vmovq rax, xmm9 sub r13, 8 %%_less_than_8_bytes_left: mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al add %%DATA_OFFSET, 1 shr rax, 8 sub r13, 1 jne %%_less_than_8_bytes_left ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %%_partial_block_done: %endmacro ; PARTIAL_BLOCK %macro GHASH_SINGLE_MUL 9 %define %%GDATA %1 %define %%HASHKEY %2 %define %%CIPHER %3 %define %%STATE_11 %4 %define %%STATE_00 %5 %define %%STATE_MID %6 %define %%T1 %7 %define %%T2 %8 %define %%FIRST %9 vmovdqu %%T1, [%%GDATA + %%HASHKEY] %ifidn %%FIRST, first vpclmulqdq %%STATE_11, %%CIPHER, %%T1, 0x11 ; %%T4 = a1*b1 vpclmulqdq %%STATE_00, %%CIPHER, %%T1, 0x00 ; %%T4_2 = a0*b0 vpclmulqdq %%STATE_MID, %%CIPHER, %%T1, 0x01 ; %%T6 = a1*b0 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10 ; %%T5 = a0*b1 vpxor %%STATE_MID, %%STATE_MID, %%T2 %else vpclmulqdq %%T2, %%CIPHER, %%T1, 0x11 vpxor %%STATE_11, %%STATE_11, %%T2 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x00 vpxor %%STATE_00, %%STATE_00, %%T2 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x01 vpxor %%STATE_MID, %%STATE_MID, %%T2 vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10 vpxor %%STATE_MID, %%STATE_MID, %%T2 %endif %endmacro ; if a = number of total plaintext bytes ; b = floor(a/16) ; %%num_initial_blocks = b mod 8; ; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext ; %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified. 
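; The GHASH of those initial ciphertext blocks is interleaved with the AES rounds of
; the next 8 counter blocks; GHASH_SINGLE_MUL above accumulates the four carry-less
; partial products per block and defers the reduction until all blocks are multiplied.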
; Updated AAD_HASH is returned in %%T3 %macro INITIAL_BLOCKS 23 %define %%GDATA_KEY %1 %define %%CYPH_PLAIN_OUT %2 %define %%PLAIN_CYPH_IN %3 %define %%LENGTH %4 %define %%DATA_OFFSET %5 %define %%num_initial_blocks %6 ; can be 0, 1, 2, 3, 4, 5, 6 or 7 %define %%T1 %7 %define %%T2 %8 %define %%T3 %9 %define %%T4 %10 %define %%T5 %11 %define %%CTR %12 %define %%XMM1 %13 %define %%XMM2 %14 %define %%XMM3 %15 %define %%XMM4 %16 %define %%XMM5 %17 %define %%XMM6 %18 %define %%XMM7 %19 %define %%XMM8 %20 %define %%T6 %21 %define %%T_key %22 %define %%ENC_DEC %23 %assign i (8-%%num_initial_blocks) ;; Move AAD_HASH to temp reg vmovdqu %%T2, %%XMM8 ;; Start AES for %%num_initial_blocks blocks ;; vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0 %assign i (9-%%num_initial_blocks) %rep %%num_initial_blocks vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0 vmovdqa reg(i), %%CTR vpshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap %assign i (i+1) %endrep %if(%%num_initial_blocks>0) vmovdqu %%T_key, [%%GDATA_KEY+16*0] %assign i (9-%%num_initial_blocks) %rep %%num_initial_blocks vpxor reg(i),reg(i),%%T_key %assign i (i+1) %endrep %assign j 1 %rep NROUNDS vmovdqu %%T_key, [%%GDATA_KEY+16*j] %assign i (9-%%num_initial_blocks) %rep %%num_initial_blocks vaesenc reg(i),%%T_key %assign i (i+1) %endrep %assign j (j+1) %endrep vmovdqu %%T_key, [%%GDATA_KEY+16*j] %assign i (9-%%num_initial_blocks) %rep %%num_initial_blocks vaesenclast reg(i),%%T_key %assign i (i+1) %endrep %endif ; %if(%%num_initial_blocks>0) %assign i (9-%%num_initial_blocks) %rep %%num_initial_blocks VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] vpxor reg(i), reg(i), %%T1 ;; Write back ciphertext for %%num_initial_blocks blocks VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) add %%DATA_OFFSET, 16 %ifidn %%ENC_DEC, DEC vmovdqa reg(i), %%T1 %endif ;; Prepare ciphertext for GHASH computations vpshufb reg(i), [SHUF_MASK] %assign i (i+1) %endrep ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %assign i (9-%%num_initial_blocks) %if(%%num_initial_blocks>0) vmovdqa %%T3, reg(i) %assign i (i+1) %endif %rep %%num_initial_blocks-1 vmovdqu [rsp + TMP %+ i], reg(i) %assign i (i+1) %endrep ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Haskey_i_k holds XORed values of the low and high parts of ;; the Haskey_i vpaddd %%XMM1, %%CTR, [ONE] ; INCR Y0 vpaddd %%XMM2, %%CTR, [TWO] ; INCR Y0 vpaddd %%XMM3, %%XMM1, [TWO] ; INCR Y0 vpaddd %%XMM4, %%XMM2, [TWO] ; INCR Y0 vpaddd %%XMM5, %%XMM3, [TWO] ; INCR Y0 vpaddd %%XMM6, %%XMM4, [TWO] ; INCR Y0 vpaddd %%XMM7, %%XMM5, [TWO] ; INCR Y0 vpaddd %%XMM8, %%XMM6, [TWO] ; INCR Y0 vmovdqa %%CTR, %%XMM8 vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap vmovdqu %%T_key, [%%GDATA_KEY+16*0] vpxor %%XMM1, %%XMM1, %%T_key vpxor %%XMM2, %%XMM2, %%T_key vpxor %%XMM3, %%XMM3, %%T_key vpxor %%XMM4, %%XMM4, %%T_key vpxor %%XMM5, %%XMM5, %%T_key vpxor %%XMM6, %%XMM6, %%T_key vpxor %%XMM7, %%XMM7, %%T_key vpxor %%XMM8, %%XMM8, %%T_key %assign i (8-%%num_initial_blocks) %assign j (9-%%num_initial_blocks) %assign k (%%num_initial_blocks) %define %%T4_2 %%T4 %if(%%num_initial_blocks>0) ;; Hash in AES state ;; T2 - incoming AAD hash vpxor %%T2, %%T3 ;; 
GDATA, HASHKEY, CIPHER, ;; STATE_11, STATE_00, STATE_MID, T1, T2 GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ %%T1, %%T4, %%T6, %%T5, %%T3, first %endif vmovdqu %%T_key, [%%GDATA_KEY+16*1] vaesenc %%XMM1, %%T_key vaesenc %%XMM2, %%T_key vaesenc %%XMM3, %%T_key vaesenc %%XMM4, %%T_key vaesenc %%XMM5, %%T_key vaesenc %%XMM6, %%T_key vaesenc %%XMM7, %%T_key vaesenc %%XMM8, %%T_key vmovdqu %%T_key, [%%GDATA_KEY+16*2] vaesenc %%XMM1, %%T_key vaesenc %%XMM2, %%T_key vaesenc %%XMM3, %%T_key vaesenc %%XMM4, %%T_key vaesenc %%XMM5, %%T_key vaesenc %%XMM6, %%T_key vaesenc %%XMM7, %%T_key vaesenc %%XMM8, %%T_key %assign i (i+1) %assign j (j+1) %assign k (k-1) %if(%%num_initial_blocks>1) ;; GDATA, HASHKEY, CIPHER, ;; STATE_11, STATE_00, STATE_MID, T1, T2 vmovdqu %%T2, [rsp + TMP %+ j] GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ %%T1, %%T4, %%T6, %%T5, %%T3, not_first %endif vmovdqu %%T_key, [%%GDATA_KEY+16*3] vaesenc %%XMM1, %%T_key vaesenc %%XMM2, %%T_key vaesenc %%XMM3, %%T_key vaesenc %%XMM4, %%T_key vaesenc %%XMM5, %%T_key vaesenc %%XMM6, %%T_key vaesenc %%XMM7, %%T_key vaesenc %%XMM8, %%T_key vmovdqu %%T_key, [%%GDATA_KEY+16*4] vaesenc %%XMM1, %%T_key vaesenc %%XMM2, %%T_key vaesenc %%XMM3, %%T_key vaesenc %%XMM4, %%T_key vaesenc %%XMM5, %%T_key vaesenc %%XMM6, %%T_key vaesenc %%XMM7, %%T_key vaesenc %%XMM8, %%T_key %assign i (i+1) %assign j (j+1) %assign k (k-1) %if(%%num_initial_blocks>2) ;; GDATA, HASHKEY, CIPHER, ;; STATE_11, STATE_00, STATE_MID, T1, T2 vmovdqu %%T2, [rsp + TMP %+ j] GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ %%T1, %%T4, %%T6, %%T5, %%T3, not_first %endif %assign i (i+1) %assign j (j+1) %assign k (k-1) %if(%%num_initial_blocks>3) ;; GDATA, HASHKEY, CIPHER, ;; STATE_11, STATE_00, STATE_MID, T1, T2 vmovdqu %%T2, [rsp + TMP %+ j] GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ %%T1, %%T4, %%T6, %%T5, %%T3, not_first %endif vmovdqu %%T_key, [%%GDATA_KEY+16*5] vaesenc %%XMM1, %%T_key vaesenc %%XMM2, %%T_key vaesenc %%XMM3, %%T_key vaesenc %%XMM4, %%T_key vaesenc %%XMM5, %%T_key vaesenc %%XMM6, %%T_key vaesenc %%XMM7, %%T_key vaesenc %%XMM8, %%T_key vmovdqu %%T_key, [%%GDATA_KEY+16*6] vaesenc %%XMM1, %%T_key vaesenc %%XMM2, %%T_key vaesenc %%XMM3, %%T_key vaesenc %%XMM4, %%T_key vaesenc %%XMM5, %%T_key vaesenc %%XMM6, %%T_key vaesenc %%XMM7, %%T_key vaesenc %%XMM8, %%T_key %assign i (i+1) %assign j (j+1) %assign k (k-1) %if(%%num_initial_blocks>4) ;; GDATA, HASHKEY, CIPHER, ;; STATE_11, STATE_00, STATE_MID, T1, T2 vmovdqu %%T2, [rsp + TMP %+ j] GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ %%T1, %%T4, %%T6, %%T5, %%T3, not_first %endif vmovdqu %%T_key, [%%GDATA_KEY+16*7] vaesenc %%XMM1, %%T_key vaesenc %%XMM2, %%T_key vaesenc %%XMM3, %%T_key vaesenc %%XMM4, %%T_key vaesenc %%XMM5, %%T_key vaesenc %%XMM6, %%T_key vaesenc %%XMM7, %%T_key vaesenc %%XMM8, %%T_key vmovdqu %%T_key, [%%GDATA_KEY+16*8] vaesenc %%XMM1, %%T_key vaesenc %%XMM2, %%T_key vaesenc %%XMM3, %%T_key vaesenc %%XMM4, %%T_key vaesenc %%XMM5, %%T_key vaesenc %%XMM6, %%T_key vaesenc %%XMM7, %%T_key vaesenc %%XMM8, %%T_key %assign i (i+1) %assign j (j+1) %assign k (k-1) %if(%%num_initial_blocks>5) ;; GDATA, HASHKEY, CIPHER, ;; STATE_11, STATE_00, STATE_MID, T1, T2 vmovdqu %%T2, [rsp + TMP %+ j] GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ %%T1, %%T4, %%T6, %%T5, %%T3, not_first %endif vmovdqu %%T_key, [%%GDATA_KEY+16*9] vaesenc %%XMM1, %%T_key vaesenc %%XMM2, %%T_key vaesenc %%XMM3, %%T_key vaesenc %%XMM4, %%T_key vaesenc %%XMM5, %%T_key vaesenc %%XMM6, %%T_key vaesenc 
%%XMM7, %%T_key vaesenc %%XMM8, %%T_key %ifndef GCM128_MODE vmovdqu %%T_key, [%%GDATA_KEY+16*10] vaesenc %%XMM1, %%T_key vaesenc %%XMM2, %%T_key vaesenc %%XMM3, %%T_key vaesenc %%XMM4, %%T_key vaesenc %%XMM5, %%T_key vaesenc %%XMM6, %%T_key vaesenc %%XMM7, %%T_key vaesenc %%XMM8, %%T_key %endif %assign i (i+1) %assign j (j+1) %assign k (k-1) %if(%%num_initial_blocks>6) ;; GDATA, HASHKEY, CIPHER, ;; STATE_11, STATE_00, STATE_MID, T1, T2 vmovdqu %%T2, [rsp + TMP %+ j] GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ %%T1, %%T4, %%T6, %%T5, %%T3, not_first %endif %ifdef GCM128_MODE vmovdqu %%T_key, [%%GDATA_KEY+16*10] vaesenclast %%XMM1, %%T_key vaesenclast %%XMM2, %%T_key vaesenclast %%XMM3, %%T_key vaesenclast %%XMM4, %%T_key vaesenclast %%XMM5, %%T_key vaesenclast %%XMM6, %%T_key vaesenclast %%XMM7, %%T_key vaesenclast %%XMM8, %%T_key %endif %ifdef GCM192_MODE vmovdqu %%T_key, [%%GDATA_KEY+16*11] vaesenc %%XMM1, %%T_key vaesenc %%XMM2, %%T_key vaesenc %%XMM3, %%T_key vaesenc %%XMM4, %%T_key vaesenc %%XMM5, %%T_key vaesenc %%XMM6, %%T_key vaesenc %%XMM7, %%T_key vaesenc %%XMM8, %%T_key vmovdqu %%T_key, [%%GDATA_KEY+16*12] vaesenclast %%XMM1, %%T_key vaesenclast %%XMM2, %%T_key vaesenclast %%XMM3, %%T_key vaesenclast %%XMM4, %%T_key vaesenclast %%XMM5, %%T_key vaesenclast %%XMM6, %%T_key vaesenclast %%XMM7, %%T_key vaesenclast %%XMM8, %%T_key %endif %ifdef GCM256_MODE vmovdqu %%T_key, [%%GDATA_KEY+16*11] vaesenc %%XMM1, %%T_key vaesenc %%XMM2, %%T_key vaesenc %%XMM3, %%T_key vaesenc %%XMM4, %%T_key vaesenc %%XMM5, %%T_key vaesenc %%XMM6, %%T_key vaesenc %%XMM7, %%T_key vaesenc %%XMM8, %%T_key vmovdqu %%T_key, [%%GDATA_KEY+16*12] vaesenc %%XMM1, %%T_key vaesenc %%XMM2, %%T_key vaesenc %%XMM3, %%T_key vaesenc %%XMM4, %%T_key vaesenc %%XMM5, %%T_key vaesenc %%XMM6, %%T_key vaesenc %%XMM7, %%T_key vaesenc %%XMM8, %%T_key %endif %assign i (i+1) %assign j (j+1) %assign k (k-1) %if(%%num_initial_blocks>7) ;; GDATA, HASHKEY, CIPHER, ;; STATE_11, STATE_00, STATE_MID, T1, T2 vmovdqu %%T2, [rsp + TMP %+ j] GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ %%T1, %%T4, %%T6, %%T5, %%T3, not_first %endif %ifdef GCM256_MODE ; GCM256 vmovdqu %%T_key, [%%GDATA_KEY+16*13] vaesenc %%XMM1, %%T_key vaesenc %%XMM2, %%T_key vaesenc %%XMM3, %%T_key vaesenc %%XMM4, %%T_key vaesenc %%XMM5, %%T_key vaesenc %%XMM6, %%T_key vaesenc %%XMM7, %%T_key vaesenc %%XMM8, %%T_key vmovdqu %%T_key, [%%GDATA_KEY+16*14] vaesenclast %%XMM1, %%T_key vaesenclast %%XMM2, %%T_key vaesenclast %%XMM3, %%T_key vaesenclast %%XMM4, %%T_key vaesenclast %%XMM5, %%T_key vaesenclast %%XMM6, %%T_key vaesenclast %%XMM7, %%T_key vaesenclast %%XMM8, %%T_key %endif ; GCM256 mode %if(%%num_initial_blocks>0) vpsrldq %%T3, %%T6, 8 ; shift-R %%T2 2 DWs vpslldq %%T6, %%T6, 8 ; shift-L %%T3 2 DWs vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4 vpxor %%T4, %%T6, %%T4 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; First phase of the reduction vmovdqu %%T3, [POLY2] vpclmulqdq %%T2, %%T3, %%T4, 0x01 vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs ;; First phase of the reduction complete vpxor %%T4, %%T4, %%T2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Second phase of the reduction vpclmulqdq %%T2, %%T3, %%T4, 0x00 ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) vpsrldq %%T2, %%T2, 4 vpclmulqdq %%T4, %%T3, %%T4, 0x10 ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) vpslldq %%T4, %%T4, 4 ;; Second phase of the reduction complete vpxor %%T4, %%T4, %%T2 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; The result is in %%T3 vpxor %%T3, %%T1, %%T4 %else ;; The hash should end up in T3 vmovdqa %%T3, %%T2 %endif ;; Final hash is now in T3 %if %%num_initial_blocks > 0 ;; NOTE: obsolete in case %%num_initial_blocks = 0 sub %%LENGTH, 16*%%num_initial_blocks %endif VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0] vpxor %%XMM1, %%XMM1, %%T1 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1 %ifidn %%ENC_DEC, DEC vmovdqa %%XMM1, %%T1 %endif VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1] vpxor %%XMM2, %%XMM2, %%T1 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2 %ifidn %%ENC_DEC, DEC vmovdqa %%XMM2, %%T1 %endif VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2] vpxor %%XMM3, %%XMM3, %%T1 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3 %ifidn %%ENC_DEC, DEC vmovdqa %%XMM3, %%T1 %endif VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3] vpxor %%XMM4, %%XMM4, %%T1 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4 %ifidn %%ENC_DEC, DEC vmovdqa %%XMM4, %%T1 %endif VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4] vpxor %%XMM5, %%XMM5, %%T1 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5 %ifidn %%ENC_DEC, DEC vmovdqa %%XMM5, %%T1 %endif VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5] vpxor %%XMM6, %%XMM6, %%T1 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6 %ifidn %%ENC_DEC, DEC vmovdqa %%XMM6, %%T1 %endif VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6] vpxor %%XMM7, %%XMM7, %%T1 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7 %ifidn %%ENC_DEC, DEC vmovdqa %%XMM7, %%T1 %endif %if %%num_initial_blocks > 0 ;; NOTE: 'jl' is never taken for %%num_initial_blocks = 0 ;; This macro is executed for lenght 128 and up, ;; zero length is checked in GCM_ENC_DEC. ;; If the last block is partial then the xor will be done later ;; in ENCRYPT_FINAL_PARTIAL_BLOCK. ;; We know it's partial if LENGTH - 16*num_initial_blocks < 128 cmp %%LENGTH, 128 jl %%_initial_skip_last_word_write %endif VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7] vpxor %%XMM8, %%XMM8, %%T1 VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8 %ifidn %%ENC_DEC, DEC vmovdqa %%XMM8, %%T1 %endif ;; Update %%LENGTH with the number of blocks processed sub %%LENGTH, 16 add %%DATA_OFFSET, 16 %%_initial_skip_last_word_write: sub %%LENGTH, 128-16 add %%DATA_OFFSET, 128-16 vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap ;; Combine GHASHed value with the corresponding ciphertext vpxor %%XMM1, %%XMM1, %%T3 vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %%_initial_blocks_done: %endmacro ;;; INITIAL_BLOCKS macro with support for a partial final block. ;;; num_initial_blocks is expected to include the partial final block ;;; in the count. 
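;;; Used on the GCM_ENC_DEC_SMALL path (total length < 128 bytes); the final block may
;;; be shorter than 16 bytes and is completed by ENCRYPT_FINAL_PARTIAL_BLOCK instead of
;;; a full 16-byte load and store.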
%macro INITIAL_BLOCKS_PARTIAL 25 %define %%GDATA_KEY %1 %define %%GDATA_CTX %2 %define %%CYPH_PLAIN_OUT %3 %define %%PLAIN_CYPH_IN %4 %define %%LENGTH %5 %define %%DATA_OFFSET %6 %define %%num_initial_blocks %7 ; can be 1, 2, 3, 4, 5, 6 or 7 (not 0) %define %%T1 %8 %define %%T2 %9 %define %%T3 %10 %define %%T4 %11 %define %%T5 %12 %define %%CTR %13 %define %%XMM1 %14 %define %%XMM2 %15 %define %%XMM3 %16 %define %%XMM4 %17 %define %%XMM5 %18 %define %%XMM6 %19 %define %%XMM7 %20 %define %%XMM8 %21 %define %%T6 %22 %define %%T_key %23 %define %%ENC_DEC %24 %define %%INSTANCE_TYPE %25 %assign i (8-%%num_initial_blocks) ;; Move AAD_HASH to temp reg vmovdqu %%T2, %%XMM8 ;; vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0 %assign i (9-%%num_initial_blocks) %rep %%num_initial_blocks ;; Compute AES counters vpaddd %%CTR, %%CTR, [rel ONE] ; INCR Y0 vmovdqa reg(i), %%CTR vpshufb reg(i), [rel SHUF_MASK] ; perform a 16Byte swap %assign i (i+1) %endrep vmovdqu %%T_key, [%%GDATA_KEY+16*0] %assign i (9-%%num_initial_blocks) %rep %%num_initial_blocks ; Start AES for %%num_initial_blocks blocks vpxor reg(i),reg(i),%%T_key %assign i (i+1) %endrep %assign j 1 %rep NROUNDS vmovdqu %%T_key, [%%GDATA_KEY+16*j] %assign i (9-%%num_initial_blocks) %rep %%num_initial_blocks vaesenc reg(i),%%T_key %assign i (i+1) %endrep %assign j (j+1) %endrep vmovdqu %%T_key, [%%GDATA_KEY+16*j] %assign i (9-%%num_initial_blocks) %rep %%num_initial_blocks vaesenclast reg(i),%%T_key %assign i (i+1) %endrep ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Hash all but the last block of data ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %assign i (9-%%num_initial_blocks) %rep %%num_initial_blocks-1 ;; Encrypt the message for all but the last block VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] vpxor reg(i), reg(i), %%T1 ;; write back ciphertext for %%num_initial_blocks blocks VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) add %%DATA_OFFSET, 16 %ifidn %%ENC_DEC, DEC vmovdqa reg(i), %%T1 %endif ;; Prepare ciphertext for GHASH computations vpshufb reg(i), [rel SHUF_MASK] %assign i (i+1) %endrep ;; The final block of data may be <16B sub %%LENGTH, 16*(%%num_initial_blocks-1) %if %%num_initial_blocks < 8 ;; NOTE: the 'jl' is always taken for num_initial_blocks = 8. ;; This is run in the context of GCM_ENC_DEC_SMALL for length < 128. 
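;; If fewer than 16 bytes remain, the final block is partial and is handled by the
;; dedicated partial-block code below rather than the full-block path.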
cmp %%LENGTH, 16 jl %%_small_initial_partial_block ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Handle a full length final block - encrypt and hash all blocks ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; sub %%LENGTH, 16 mov [%%GDATA_CTX + PBlockLen], %%LENGTH ;; Encrypt the message VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] vpxor reg(i), reg(i), %%T1 ;; write back ciphertext for %%num_initial_blocks blocks VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) add %%DATA_OFFSET, 16 %ifidn %%ENC_DEC, DEC vmovdqa reg(i), %%T1 %endif ;; Prepare ciphertext for GHASH computations vpshufb reg(i), [rel SHUF_MASK] ;; Hash all of the data %assign i (8-%%num_initial_blocks) %assign j (9-%%num_initial_blocks) %assign k (%%num_initial_blocks) %assign last_block_to_hash 0 %if(%%num_initial_blocks>last_block_to_hash) ;; Hash in AES state vpxor %%T2, reg(j) ;; T2 - incoming AAD hash ;; reg(i) holds ciphertext ;; T5 - hash key ;; T6 - updated xor ;; reg(1)/xmm1 should now be available for tmp use vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k] vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T4 = a1*b1 vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0 vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0 vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1 vpxor %%T6, %%T6, %%T5 %endif %assign i (i+1) %assign j (j+1) %assign k (k-1) %assign rep_count (%%num_initial_blocks-1) %rep rep_count vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k] vpclmulqdq %%T3, reg(j), %%T5, 0x11 vpxor %%T1, %%T1, %%T3 vpclmulqdq %%T3, reg(j), %%T5, 0x00 vpxor %%T4, %%T4, %%T3 vpclmulqdq %%T3, reg(j), %%T5, 0x01 vpxor %%T6, %%T6, %%T3 vpclmulqdq %%T3, reg(j), %%T5, 0x10 vpxor %%T6, %%T6, %%T3 %assign i (i+1) %assign j (j+1) %assign k (k-1) %endrep ;; Record that a reduction is needed mov r12, 1 jmp %%_small_initial_compute_hash %endif ; %if %%num_initial_blocks < 8 %%_small_initial_partial_block: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Handle ghash for a <16B final block ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; In this case if it's a single call to encrypt we can ;; hash all of the data but if it's an init / update / finalize ;; series of call we need to leave the last block if it's ;; less than a full block of data. mov [%%GDATA_CTX + PBlockLen], %%LENGTH vmovdqu [%%GDATA_CTX + PBlockEncKey], reg(i) ;; Handle a partial final block ;; GDATA, KEY, T1, T2 ;; r13 - length ;; LT16 - indicates type of read and that the buffer is less than 16 bytes long ;; NOTE: could be replaced with %%LENGTH but at this point ;; %%LENGTH is always less than 16. ;; No PLAIN_CYPH_LEN argument available in this macro. 
ENCRYPT_FINAL_PARTIAL_BLOCK reg(i), %%T1, %%T3, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, LT16, %%ENC_DEC, %%DATA_OFFSET vpshufb reg(i), [SHUF_MASK] %ifidn %%INSTANCE_TYPE, multi_call %assign i (8-%%num_initial_blocks) %assign j (9-%%num_initial_blocks) %assign k (%%num_initial_blocks-1) %assign last_block_to_hash 1 %else %assign i (8-%%num_initial_blocks) %assign j (9-%%num_initial_blocks) %assign k (%%num_initial_blocks) %assign last_block_to_hash 0 %endif %if(%%num_initial_blocks>last_block_to_hash) ;; Record that a reduction is needed mov r12, 1 ;; Hash in AES state vpxor %%T2, reg(j) ;; T2 - incoming AAD hash ;; reg(i) holds ciphertext ;; T5 - hash key ;; T6 - updated xor ;; reg(1)/xmm1 should now be available for tmp use vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k] vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T4 = a1*b1 vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0 vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0 vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1 vpxor %%T6, %%T6, %%T5 %else ;; Record that a reduction is not needed - ;; In this case no hashes are computed because there ;; is only one initial block and it is < 16B in length. mov r12, 0 %endif %assign i (i+1) %assign j (j+1) %assign k (k-1) %ifidn %%INSTANCE_TYPE, multi_call %assign rep_count (%%num_initial_blocks-2) %%_multi_call_hash: %else %assign rep_count (%%num_initial_blocks-1) %endif %rep rep_count vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k] vpclmulqdq %%T3, reg(j), %%T5, 0x11 vpxor %%T1, %%T1, %%T3 vpclmulqdq %%T3, reg(j), %%T5, 0x00 vpxor %%T4, %%T4, %%T3 vpclmulqdq %%T3, reg(j), %%T5, 0x01 vpxor %%T6, %%T6, %%T3 vpclmulqdq %%T3, reg(j), %%T5, 0x10 vpxor %%T6, %%T6, %%T3 %assign i (i+1) %assign j (j+1) %assign k (k-1) %endrep %%_small_initial_compute_hash: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Ghash reduction ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %if(%%num_initial_blocks=1) %ifidn %%INSTANCE_TYPE, multi_call ;; We only need to check if a reduction is needed if ;; initial_blocks == 1 and init/update/final is being used. ;; In this case we may just have a partial block, and that ;; gets hashed in finalize. cmp r12, 0 je %%_no_reduction_needed %endif %endif vpsrldq %%T3, %%T6, 8 ; shift-R %%T2 2 DWs vpslldq %%T6, %%T6, 8 ; shift-L %%T3 2 DWs vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4 vpxor %%T4, %%T6, %%T4 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; First phase of the reduction vmovdqu %%T3, [POLY2] vpclmulqdq %%T2, %%T3, %%T4, 0x01 ;; shift-L xmm2 2 DWs vpslldq %%T2, %%T2, 8 vpxor %%T4, %%T4, %%T2 ;; First phase of the reduction complete ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Second phase of the reduction vpclmulqdq %%T2, %%T3, %%T4, 0x00 ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) vpsrldq %%T2, %%T2, 4 vpclmulqdq %%T4, %%T3, %%T4, 0x10 ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) vpslldq %%T4, %%T4, 4 vpxor %%T4, %%T4, %%T2 ;; Second phase of the reduction complete ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; vpxor %%T3, %%T1, %%T4 %ifidn %%INSTANCE_TYPE, multi_call ;; If using init/update/finalize, we need to xor any partial block data ;; into the hash. 
%if %%num_initial_blocks > 1 ;; NOTE: for %%num_initial_blocks = 0 the xor never takes place %if %%num_initial_blocks != 8 ;; NOTE: for %%num_initial_blocks = 8, %%LENGTH, stored in [PBlockLen] is never zero cmp qword [%%GDATA_CTX + PBlockLen], 0 je %%_no_partial_block_xor %endif ; %%num_initial_blocks != 8 vpxor %%T3, %%T3, reg(8) %%_no_partial_block_xor: %endif ; %%num_initial_blocks > 1 %endif ; %%INSTANCE_TYPE, multi_call %if(%%num_initial_blocks=1) %ifidn %%INSTANCE_TYPE, multi_call ;; NOTE: %%_no_reduction_needed case only valid for ;; multi_call with initial_blocks = 1. ;; Look for comment above around '_no_reduction_needed' ;; The jmp below is obsolete as the code will fall through. ;; The result is in %%T3 jmp %%_after_reduction %%_no_reduction_needed: ;; The hash should end up in T3. The only way we should get here is if ;; there is a partial block of data, so xor that into the hash. vpxor %%T3, %%T2, reg(8) %endif ; %%INSTANCE_TYPE = multi_call %endif ; %%num_initial_blocks=1 %%_after_reduction: ;; Final hash is now in T3 %endmacro ; INITIAL_BLOCKS_PARTIAL ; encrypt 8 blocks at a time ; ghash the 8 previously encrypted ciphertext blocks ; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified ; %%DATA_OFFSET is the data offset value %macro GHASH_8_ENCRYPT_8_PARALLEL 23 %define %%GDATA %1 %define %%CYPH_PLAIN_OUT %2 %define %%PLAIN_CYPH_IN %3 %define %%DATA_OFFSET %4 %define %%T1 %5 %define %%T2 %6 %define %%T3 %7 %define %%T4 %8 %define %%T5 %9 %define %%T6 %10 %define %%CTR %11 %define %%XMM1 %12 %define %%XMM2 %13 %define %%XMM3 %14 %define %%XMM4 %15 %define %%XMM5 %16 %define %%XMM6 %17 %define %%XMM7 %18 %define %%XMM8 %19 %define %%T7 %20 %define %%loop_idx %21 %define %%ENC_DEC %22 %define %%FULL_PARTIAL %23 vmovdqa %%T2, %%XMM1 vmovdqu [rsp + TMP2], %%XMM2 vmovdqu [rsp + TMP3], %%XMM3 vmovdqu [rsp + TMP4], %%XMM4 vmovdqu [rsp + TMP5], %%XMM5 vmovdqu [rsp + TMP6], %%XMM6 vmovdqu [rsp + TMP7], %%XMM7 vmovdqu [rsp + TMP8], %%XMM8 %ifidn %%loop_idx, in_order vpaddd %%XMM1, %%CTR, [ONE] ; INCR CNT vmovdqu %%T5, [TWO] vpaddd %%XMM2, %%CTR, %%T5 vpaddd %%XMM3, %%XMM1, %%T5 vpaddd %%XMM4, %%XMM2, %%T5 vpaddd %%XMM5, %%XMM3, %%T5 vpaddd %%XMM6, %%XMM4, %%T5 vpaddd %%XMM7, %%XMM5, %%T5 vpaddd %%XMM8, %%XMM6, %%T5 vmovdqa %%CTR, %%XMM8 vmovdqu %%T5, [SHUF_MASK] vpshufb %%XMM1, %%T5 ; perform a 16Byte swap vpshufb %%XMM2, %%T5 ; perform a 16Byte swap vpshufb %%XMM3, %%T5 ; perform a 16Byte swap vpshufb %%XMM4, %%T5 ; perform a 16Byte swap vpshufb %%XMM5, %%T5 ; perform a 16Byte swap vpshufb %%XMM6, %%T5 ; perform a 16Byte swap vpshufb %%XMM7, %%T5 ; perform a 16Byte swap vpshufb %%XMM8, %%T5 ; perform a 16Byte swap %else vpaddd %%XMM1, %%CTR, [ONEf] ; INCR CNT vmovdqu %%T5, [TWOf] vpaddd %%XMM2, %%CTR, %%T5 vpaddd %%XMM3, %%XMM1, %%T5 vpaddd %%XMM4, %%XMM2, %%T5 vpaddd %%XMM5, %%XMM3, %%T5 vpaddd %%XMM6, %%XMM4, %%T5 vpaddd %%XMM7, %%XMM5, %%T5 vpaddd %%XMM8, %%XMM6, %%T5 vmovdqa %%CTR, %%XMM8 %endif ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; vmovdqu %%T1, [%%GDATA + 16*0] vpxor %%XMM1, %%XMM1, %%T1 vpxor %%XMM2, %%XMM2, %%T1 vpxor %%XMM3, %%XMM3, %%T1 vpxor %%XMM4, %%XMM4, %%T1 vpxor %%XMM5, %%XMM5, %%T1 vpxor %%XMM6, %%XMM6, %%T1 vpxor %%XMM7, %%XMM7, %%T1 vpxor %%XMM8, %%XMM8, %%T1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; vmovdqu %%T1, [%%GDATA + 16*1] vaesenc %%XMM1, %%T1 vaesenc %%XMM2, %%T1 vaesenc %%XMM3, %%T1 
vaesenc %%XMM4, %%T1 vaesenc %%XMM5, %%T1 vaesenc %%XMM6, %%T1 vaesenc %%XMM7, %%T1 vaesenc %%XMM8, %%T1 vmovdqu %%T1, [%%GDATA + 16*2] vaesenc %%XMM1, %%T1 vaesenc %%XMM2, %%T1 vaesenc %%XMM3, %%T1 vaesenc %%XMM4, %%T1 vaesenc %%XMM5, %%T1 vaesenc %%XMM6, %%T1 vaesenc %%XMM7, %%T1 vaesenc %%XMM8, %%T1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; vmovdqu %%T5, [%%GDATA + HashKey_8] vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1 vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0 vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0 vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1 vpxor %%T6, %%T6, %%T5 vmovdqu %%T1, [%%GDATA + 16*3] vaesenc %%XMM1, %%T1 vaesenc %%XMM2, %%T1 vaesenc %%XMM3, %%T1 vaesenc %%XMM4, %%T1 vaesenc %%XMM5, %%T1 vaesenc %%XMM6, %%T1 vaesenc %%XMM7, %%T1 vaesenc %%XMM8, %%T1 vmovdqu %%T1, [rsp + TMP2] vmovdqu %%T5, [%%GDATA + HashKey_7] vpclmulqdq %%T3, %%T1, %%T5, 0x11 vpxor %%T4, %%T4, %%T3 vpclmulqdq %%T3, %%T1, %%T5, 0x00 vpxor %%T7, %%T7, %%T3 vpclmulqdq %%T3, %%T1, %%T5, 0x01 vpxor %%T6, %%T6, %%T3 vpclmulqdq %%T3, %%T1, %%T5, 0x10 vpxor %%T6, %%T6, %%T3 vmovdqu %%T1, [%%GDATA + 16*4] vaesenc %%XMM1, %%T1 vaesenc %%XMM2, %%T1 vaesenc %%XMM3, %%T1 vaesenc %%XMM4, %%T1 vaesenc %%XMM5, %%T1 vaesenc %%XMM6, %%T1 vaesenc %%XMM7, %%T1 vaesenc %%XMM8, %%T1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; vmovdqu %%T1, [rsp + TMP3] vmovdqu %%T5, [%%GDATA + HashKey_6] vpclmulqdq %%T3, %%T1, %%T5, 0x11 vpxor %%T4, %%T4, %%T3 vpclmulqdq %%T3, %%T1, %%T5, 0x00 vpxor %%T7, %%T7, %%T3 vpclmulqdq %%T3, %%T1, %%T5, 0x01 vpxor %%T6, %%T6, %%T3 vpclmulqdq %%T3, %%T1, %%T5, 0x10 vpxor %%T6, %%T6, %%T3 vmovdqu %%T1, [%%GDATA + 16*5] vaesenc %%XMM1, %%T1 vaesenc %%XMM2, %%T1 vaesenc %%XMM3, %%T1 vaesenc %%XMM4, %%T1 vaesenc %%XMM5, %%T1 vaesenc %%XMM6, %%T1 vaesenc %%XMM7, %%T1 vaesenc %%XMM8, %%T1 vmovdqu %%T1, [rsp + TMP4] vmovdqu %%T5, [%%GDATA + HashKey_5] vpclmulqdq %%T3, %%T1, %%T5, 0x11 vpxor %%T4, %%T4, %%T3 vpclmulqdq %%T3, %%T1, %%T5, 0x00 vpxor %%T7, %%T7, %%T3 vpclmulqdq %%T3, %%T1, %%T5, 0x01 vpxor %%T6, %%T6, %%T3 vpclmulqdq %%T3, %%T1, %%T5, 0x10 vpxor %%T6, %%T6, %%T3 vmovdqu %%T1, [%%GDATA + 16*6] vaesenc %%XMM1, %%T1 vaesenc %%XMM2, %%T1 vaesenc %%XMM3, %%T1 vaesenc %%XMM4, %%T1 vaesenc %%XMM5, %%T1 vaesenc %%XMM6, %%T1 vaesenc %%XMM7, %%T1 vaesenc %%XMM8, %%T1 vmovdqu %%T1, [rsp + TMP5] vmovdqu %%T5, [%%GDATA + HashKey_4] vpclmulqdq %%T3, %%T1, %%T5, 0x11 vpxor %%T4, %%T4, %%T3 vpclmulqdq %%T3, %%T1, %%T5, 0x00 vpxor %%T7, %%T7, %%T3 vpclmulqdq %%T3, %%T1, %%T5, 0x01 vpxor %%T6, %%T6, %%T3 vpclmulqdq %%T3, %%T1, %%T5, 0x10 vpxor %%T6, %%T6, %%T3 vmovdqu %%T1, [%%GDATA + 16*7] vaesenc %%XMM1, %%T1 vaesenc %%XMM2, %%T1 vaesenc %%XMM3, %%T1 vaesenc %%XMM4, %%T1 vaesenc %%XMM5, %%T1 vaesenc %%XMM6, %%T1 vaesenc %%XMM7, %%T1 vaesenc %%XMM8, %%T1 vmovdqu %%T1, [rsp + TMP6] vmovdqu %%T5, [%%GDATA + HashKey_3] vpclmulqdq %%T3, %%T1, %%T5, 0x11 vpxor %%T4, %%T4, %%T3 vpclmulqdq %%T3, %%T1, %%T5, 0x00 vpxor %%T7, %%T7, %%T3 vpclmulqdq %%T3, %%T1, %%T5, 0x01 vpxor %%T6, %%T6, %%T3 vpclmulqdq %%T3, %%T1, %%T5, 0x10 vpxor %%T6, %%T6, %%T3 vmovdqu %%T1, [%%GDATA + 16*8] vaesenc %%XMM1, %%T1 vaesenc %%XMM2, %%T1 vaesenc %%XMM3, %%T1 vaesenc %%XMM4, %%T1 vaesenc %%XMM5, %%T1 vaesenc %%XMM6, %%T1 vaesenc %%XMM7, %%T1 vaesenc %%XMM8, %%T1 vmovdqu %%T1, [rsp + TMP7] vmovdqu %%T5, [%%GDATA + HashKey_2] vpclmulqdq %%T3, %%T1, %%T5, 0x11 vpxor %%T4, %%T4, %%T3 vpclmulqdq %%T3, %%T1, %%T5, 0x00 vpxor %%T7, %%T7, 
%%T3 vpclmulqdq %%T3, %%T1, %%T5, 0x01 vpxor %%T6, %%T6, %%T3 vpclmulqdq %%T3, %%T1, %%T5, 0x10 vpxor %%T6, %%T6, %%T3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; vmovdqu %%T5, [%%GDATA + 16*9] vaesenc %%XMM1, %%T5 vaesenc %%XMM2, %%T5 vaesenc %%XMM3, %%T5 vaesenc %%XMM4, %%T5 vaesenc %%XMM5, %%T5 vaesenc %%XMM6, %%T5 vaesenc %%XMM7, %%T5 vaesenc %%XMM8, %%T5 vmovdqu %%T1, [rsp + TMP8] vmovdqu %%T5, [%%GDATA + HashKey] vpclmulqdq %%T3, %%T1, %%T5, 0x00 vpxor %%T7, %%T7, %%T3 vpclmulqdq %%T3, %%T1, %%T5, 0x01 vpxor %%T6, %%T6, %%T3 vpclmulqdq %%T3, %%T1, %%T5, 0x10 vpxor %%T6, %%T6, %%T3 vpclmulqdq %%T3, %%T1, %%T5, 0x11 vpxor %%T1, %%T4, %%T3 vmovdqu %%T5, [%%GDATA + 16*10] %ifndef GCM128_MODE ; GCM192 or GCM256 vaesenc %%XMM1, %%T5 vaesenc %%XMM2, %%T5 vaesenc %%XMM3, %%T5 vaesenc %%XMM4, %%T5 vaesenc %%XMM5, %%T5 vaesenc %%XMM6, %%T5 vaesenc %%XMM7, %%T5 vaesenc %%XMM8, %%T5 vmovdqu %%T5, [%%GDATA + 16*11] vaesenc %%XMM1, %%T5 vaesenc %%XMM2, %%T5 vaesenc %%XMM3, %%T5 vaesenc %%XMM4, %%T5 vaesenc %%XMM5, %%T5 vaesenc %%XMM6, %%T5 vaesenc %%XMM7, %%T5 vaesenc %%XMM8, %%T5 vmovdqu %%T5, [%%GDATA + 16*12] %endif %ifdef GCM256_MODE vaesenc %%XMM1, %%T5 vaesenc %%XMM2, %%T5 vaesenc %%XMM3, %%T5 vaesenc %%XMM4, %%T5 vaesenc %%XMM5, %%T5 vaesenc %%XMM6, %%T5 vaesenc %%XMM7, %%T5 vaesenc %%XMM8, %%T5 vmovdqu %%T5, [%%GDATA + 16*13] vaesenc %%XMM1, %%T5 vaesenc %%XMM2, %%T5 vaesenc %%XMM3, %%T5 vaesenc %%XMM4, %%T5 vaesenc %%XMM5, %%T5 vaesenc %%XMM6, %%T5 vaesenc %%XMM7, %%T5 vaesenc %%XMM8, %%T5 vmovdqu %%T5, [%%GDATA + 16*14] %endif ; GCM256 %assign i 0 %assign j 1 %rep 8 ;; SNP TBD: This is pretty ugly - consider whether just XORing the ;; data in after vaesenclast is simpler and performant. Would ;; also have to ripple it through partial block and ghash_mul_8. 
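;; In the 'partial' case the 8th block is neither loaded from nor stored to memory in
;; this loop; reg(8) keeps the raw key stream and the sub-16-byte tail is completed by
;; the partial-block handling after this macro, so no out-of-bounds access occurs.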
%ifidn %%FULL_PARTIAL, full %ifdef NT_LD VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] vpxor %%T2, %%T2, %%T5 %else vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] %endif %ifidn %%ENC_DEC, ENC vaesenclast reg(j), reg(j), %%T2 %else vaesenclast %%T3, reg(j), %%T2 vpxor reg(j), %%T2, %%T5 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3 %endif %else ; Don't read the final data during partial block processing %ifdef NT_LD %if (i<7) VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] vpxor %%T2, %%T2, %%T5 %else ;; Stage the key directly in T2 rather than hash it with plaintext vmovdqu %%T2, %%T5 %endif %else %if (i<7) vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] %else ;; Stage the key directly in T2 rather than hash it with plaintext vmovdqu %%T2, %%T5 %endif %endif %ifidn %%ENC_DEC, ENC vaesenclast reg(j), reg(j), %%T2 %else %if (i<7) vaesenclast %%T3, reg(j), %%T2 vpxor reg(j), %%T2, %%T5 ;; Do not read the data since it could fault VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3 %else vaesenclast reg(j), reg(j), %%T2 %endif %endif %endif %assign i (i+1) %assign j (j+1) %endrep ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs vpsrldq %%T6, %%T6, 8 ; shift-R %%T2 2 DWs vpxor %%T7, %%T7, %%T3 vpxor %%T1, %%T1, %%T6 ; accumulate the results in %%T1:%%T7 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;first phase of the reduction vmovdqu %%T3, [POLY2] vpclmulqdq %%T2, %%T3, %%T7, 0x01 vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %ifidn %%ENC_DEC, ENC ; Write to the Ciphertext buffer VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6 VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7 %ifidn %%FULL_PARTIAL, full ;; Avoid writing past the buffer if handling a partial block VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8 %endif %endif ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;second phase of the reduction vpclmulqdq %%T2, %%T3, %%T7, 0x00 vpsrldq %%T2, %%T2, 4 ; shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) vpclmulqdq %%T4, %%T3, %%T7, 0x10 vpslldq %%T4, %%T4, 4 ; shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; vpxor %%T1, %%T1, %%T4 ; the result is in %%T1 vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap vpxor %%XMM1, %%T1 %endmacro ; GHASH_8_ENCRYPT_8_PARALLEL ; GHASH the last 4 ciphertext blocks. 
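Editor's note: as a reference for what the PCLMULQDQ/VPXOR sequences in this file compute, the sketch below is the plain shift-and-XOR GF(2^128) multiply from the GCM specification (SP 800-38D, Algorithm 1). It is illustrative only: the assembly works on a bit-reflected representation so the 256-bit product can be folded with a two-phase reduction against the POLY2 constant instead of a 128-iteration loop, but the resulting field element is the same.

#include <stdint.h>

typedef struct { uint64_t hi, lo; } be128;  /* hi = leftmost (most significant) 8 bytes */

/* Reference GF(2^128) multiply; R = 0xE1 || 0^120, bit 0 of x is its MSB. */
static be128 gf128_mul(be128 x, be128 y)
{
        be128 z = { 0, 0 };
        be128 v = y;
        int i;

        for (i = 0; i < 128; i++) {
                uint64_t xbit = (i < 64) ? (x.hi >> (63 - i)) & 1
                                         : (x.lo >> (127 - i)) & 1;
                if (xbit) {
                        z.hi ^= v.hi;
                        z.lo ^= v.lo;
                }
                if (v.lo & 1) {
                        v.lo = (v.lo >> 1) | (v.hi << 63);
                        v.hi = (v.hi >> 1) ^ 0xE100000000000000ULL;
                } else {
                        v.lo = (v.lo >> 1) | (v.hi << 63);
                        v.hi >>= 1;
                }
        }
        return z;
}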
%macro GHASH_LAST_8 16 %define %%GDATA %1 %define %%T1 %2 %define %%T2 %3 %define %%T3 %4 %define %%T4 %5 %define %%T5 %6 %define %%T6 %7 %define %%T7 %8 %define %%XMM1 %9 %define %%XMM2 %10 %define %%XMM3 %11 %define %%XMM4 %12 %define %%XMM5 %13 %define %%XMM6 %14 %define %%XMM7 %15 %define %%XMM8 %16 ;; Karatsuba Method vmovdqu %%T5, [%%GDATA + HashKey_8] vpshufd %%T2, %%XMM1, 01001110b vpshufd %%T3, %%T5, 01001110b vpxor %%T2, %%T2, %%XMM1 vpxor %%T3, %%T3, %%T5 vpclmulqdq %%T6, %%XMM1, %%T5, 0x11 vpclmulqdq %%T7, %%XMM1, %%T5, 0x00 vpclmulqdq %%XMM1, %%T2, %%T3, 0x00 ;;;;;;;;;;;;;;;;;;;;;; vmovdqu %%T5, [%%GDATA + HashKey_7] vpshufd %%T2, %%XMM2, 01001110b vpshufd %%T3, %%T5, 01001110b vpxor %%T2, %%T2, %%XMM2 vpxor %%T3, %%T3, %%T5 vpclmulqdq %%T4, %%XMM2, %%T5, 0x11 vpxor %%T6, %%T6, %%T4 vpclmulqdq %%T4, %%XMM2, %%T5, 0x00 vpxor %%T7, %%T7, %%T4 vpclmulqdq %%T2, %%T2, %%T3, 0x00 vpxor %%XMM1, %%XMM1, %%T2 ;;;;;;;;;;;;;;;;;;;;;; vmovdqu %%T5, [%%GDATA + HashKey_6] vpshufd %%T2, %%XMM3, 01001110b vpshufd %%T3, %%T5, 01001110b vpxor %%T2, %%T2, %%XMM3 vpxor %%T3, %%T3, %%T5 vpclmulqdq %%T4, %%XMM3, %%T5, 0x11 vpxor %%T6, %%T6, %%T4 vpclmulqdq %%T4, %%XMM3, %%T5, 0x00 vpxor %%T7, %%T7, %%T4 vpclmulqdq %%T2, %%T2, %%T3, 0x00 vpxor %%XMM1, %%XMM1, %%T2 ;;;;;;;;;;;;;;;;;;;;;; vmovdqu %%T5, [%%GDATA + HashKey_5] vpshufd %%T2, %%XMM4, 01001110b vpshufd %%T3, %%T5, 01001110b vpxor %%T2, %%T2, %%XMM4 vpxor %%T3, %%T3, %%T5 vpclmulqdq %%T4, %%XMM4, %%T5, 0x11 vpxor %%T6, %%T6, %%T4 vpclmulqdq %%T4, %%XMM4, %%T5, 0x00 vpxor %%T7, %%T7, %%T4 vpclmulqdq %%T2, %%T2, %%T3, 0x00 vpxor %%XMM1, %%XMM1, %%T2 ;;;;;;;;;;;;;;;;;;;;;; vmovdqu %%T5, [%%GDATA + HashKey_4] vpshufd %%T2, %%XMM5, 01001110b vpshufd %%T3, %%T5, 01001110b vpxor %%T2, %%T2, %%XMM5 vpxor %%T3, %%T3, %%T5 vpclmulqdq %%T4, %%XMM5, %%T5, 0x11 vpxor %%T6, %%T6, %%T4 vpclmulqdq %%T4, %%XMM5, %%T5, 0x00 vpxor %%T7, %%T7, %%T4 vpclmulqdq %%T2, %%T2, %%T3, 0x00 vpxor %%XMM1, %%XMM1, %%T2 ;;;;;;;;;;;;;;;;;;;;;; vmovdqu %%T5, [%%GDATA + HashKey_3] vpshufd %%T2, %%XMM6, 01001110b vpshufd %%T3, %%T5, 01001110b vpxor %%T2, %%T2, %%XMM6 vpxor %%T3, %%T3, %%T5 vpclmulqdq %%T4, %%XMM6, %%T5, 0x11 vpxor %%T6, %%T6, %%T4 vpclmulqdq %%T4, %%XMM6, %%T5, 0x00 vpxor %%T7, %%T7, %%T4 vpclmulqdq %%T2, %%T2, %%T3, 0x00 vpxor %%XMM1, %%XMM1, %%T2 ;;;;;;;;;;;;;;;;;;;;;; vmovdqu %%T5, [%%GDATA + HashKey_2] vpshufd %%T2, %%XMM7, 01001110b vpshufd %%T3, %%T5, 01001110b vpxor %%T2, %%T2, %%XMM7 vpxor %%T3, %%T3, %%T5 vpclmulqdq %%T4, %%XMM7, %%T5, 0x11 vpxor %%T6, %%T6, %%T4 vpclmulqdq %%T4, %%XMM7, %%T5, 0x00 vpxor %%T7, %%T7, %%T4 vpclmulqdq %%T2, %%T2, %%T3, 0x00 vpxor %%XMM1, %%XMM1, %%T2 ;;;;;;;;;;;;;;;;;;;;;; vmovdqu %%T5, [%%GDATA + HashKey] vpshufd %%T2, %%XMM8, 01001110b vpshufd %%T3, %%T5, 01001110b vpxor %%T2, %%T2, %%XMM8 vpxor %%T3, %%T3, %%T5 vpclmulqdq %%T4, %%XMM8, %%T5, 0x11 vpxor %%T6, %%T6, %%T4 vpclmulqdq %%T4, %%XMM8, %%T5, 0x00 vpxor %%T7, %%T7, %%T4 vpclmulqdq %%T2, %%T2, %%T3, 0x00 vpxor %%XMM1, %%XMM1, %%T2 vpxor %%XMM1, %%XMM1, %%T6 vpxor %%T2, %%XMM1, %%T7 vpslldq %%T4, %%T2, 8 vpsrldq %%T2, %%T2, 8 vpxor %%T7, %%T7, %%T4 vpxor %%T6, %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;first phase of the reduction vmovdqu %%T3, [POLY2] vpclmulqdq %%T2, %%T3, %%T7, 0x01 vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete 
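Editor's note: the GHASH_LAST_8 and GHASH_LAST_7 macros use the Karatsuba trick flagged by their ";; Karatsuba Method" comments: each 128x128 carry-less product costs three PCLMULQDQ (a1*b1, a0*b0 and (a1^a0)*(b1^b0)) instead of the four used in the interleaved encrypt/hash loop, with the middle term recovered by XOR. A hedged intrinsics sketch of one such product (clmul128_karatsuba is a made-up helper name; the result is the unreduced 256-bit product):

#include <emmintrin.h>
#include <wmmintrin.h>  /* _mm_clmulepi64_si128 */

static void clmul128_karatsuba(__m128i a, __m128i b, __m128i *hi, __m128i *lo)
{
        __m128i hh = _mm_clmulepi64_si128(a, b, 0x11);      /* a1*b1 */
        __m128i ll = _mm_clmulepi64_si128(a, b, 0x00);      /* a0*b0 */
        /* swap 64-bit halves and XOR to form (a1^a0) and (b1^b0) */
        __m128i am = _mm_xor_si128(a, _mm_shuffle_epi32(a, 0x4E));
        __m128i bm = _mm_xor_si128(b, _mm_shuffle_epi32(b, 0x4E));
        __m128i mm = _mm_clmulepi64_si128(am, bm, 0x00);
        /* middle term = (a1^a0)(b1^b0) ^ a1b1 ^ a0b0 */
        mm = _mm_xor_si128(mm, _mm_xor_si128(hh, ll));
        /* fold the 128-bit middle term into the two product halves */
        *hi = _mm_xor_si128(hh, _mm_srli_si128(mm, 8));
        *lo = _mm_xor_si128(ll, _mm_slli_si128(mm, 8));
}

The vpshufd with 01001110b in the macros is the same half-swap as _mm_shuffle_epi32(x, 0x4E), and the vpslldq/vpsrldq pair at the end of each macro is the same middle-term fold.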
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;second phase of the reduction vpclmulqdq %%T2, %%T3, %%T7, 0x00 vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) vpclmulqdq %%T4, %%T3, %%T7, 0x10 vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts) vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; vpxor %%T6, %%T6, %%T4 ; the result is in %%T6 %endmacro ; GHASH the last 4 ciphertext blocks. %macro GHASH_LAST_7 15 %define %%GDATA %1 %define %%T1 %2 %define %%T2 %3 %define %%T3 %4 %define %%T4 %5 %define %%T5 %6 %define %%T6 %7 %define %%T7 %8 %define %%XMM1 %9 %define %%XMM2 %10 %define %%XMM3 %11 %define %%XMM4 %12 %define %%XMM5 %13 %define %%XMM6 %14 %define %%XMM7 %15 ;; Karatsuba Method vmovdqu %%T5, [%%GDATA + HashKey_7] vpshufd %%T2, %%XMM1, 01001110b vpshufd %%T3, %%T5, 01001110b vpxor %%T2, %%T2, %%XMM1 vpxor %%T3, %%T3, %%T5 vpclmulqdq %%T6, %%XMM1, %%T5, 0x11 vpclmulqdq %%T7, %%XMM1, %%T5, 0x00 vpclmulqdq %%XMM1, %%T2, %%T3, 0x00 ;;;;;;;;;;;;;;;;;;;;;; vmovdqu %%T5, [%%GDATA + HashKey_6] vpshufd %%T2, %%XMM2, 01001110b vpshufd %%T3, %%T5, 01001110b vpxor %%T2, %%T2, %%XMM2 vpxor %%T3, %%T3, %%T5 vpclmulqdq %%T4, %%XMM2, %%T5, 0x11 vpxor %%T6, %%T6, %%T4 vpclmulqdq %%T4, %%XMM2, %%T5, 0x00 vpxor %%T7, %%T7, %%T4 vpclmulqdq %%T2, %%T2, %%T3, 0x00 vpxor %%XMM1, %%XMM1, %%T2 ;;;;;;;;;;;;;;;;;;;;;; vmovdqu %%T5, [%%GDATA + HashKey_5] vpshufd %%T2, %%XMM3, 01001110b vpshufd %%T3, %%T5, 01001110b vpxor %%T2, %%T2, %%XMM3 vpxor %%T3, %%T3, %%T5 vpclmulqdq %%T4, %%XMM3, %%T5, 0x11 vpxor %%T6, %%T6, %%T4 vpclmulqdq %%T4, %%XMM3, %%T5, 0x00 vpxor %%T7, %%T7, %%T4 vpclmulqdq %%T2, %%T2, %%T3, 0x00 vpxor %%XMM1, %%XMM1, %%T2 ;;;;;;;;;;;;;;;;;;;;;; vmovdqu %%T5, [%%GDATA + HashKey_4] vpshufd %%T2, %%XMM4, 01001110b vpshufd %%T3, %%T5, 01001110b vpxor %%T2, %%T2, %%XMM4 vpxor %%T3, %%T3, %%T5 vpclmulqdq %%T4, %%XMM4, %%T5, 0x11 vpxor %%T6, %%T6, %%T4 vpclmulqdq %%T4, %%XMM4, %%T5, 0x00 vpxor %%T7, %%T7, %%T4 vpclmulqdq %%T2, %%T2, %%T3, 0x00 vpxor %%XMM1, %%XMM1, %%T2 ;;;;;;;;;;;;;;;;;;;;;; vmovdqu %%T5, [%%GDATA + HashKey_3] vpshufd %%T2, %%XMM5, 01001110b vpshufd %%T3, %%T5, 01001110b vpxor %%T2, %%T2, %%XMM5 vpxor %%T3, %%T3, %%T5 vpclmulqdq %%T4, %%XMM5, %%T5, 0x11 vpxor %%T6, %%T6, %%T4 vpclmulqdq %%T4, %%XMM5, %%T5, 0x00 vpxor %%T7, %%T7, %%T4 vpclmulqdq %%T2, %%T2, %%T3, 0x00 vpxor %%XMM1, %%XMM1, %%T2 ;;;;;;;;;;;;;;;;;;;;;; vmovdqu %%T5, [%%GDATA + HashKey_2] vpshufd %%T2, %%XMM6, 01001110b vpshufd %%T3, %%T5, 01001110b vpxor %%T2, %%T2, %%XMM6 vpxor %%T3, %%T3, %%T5 vpclmulqdq %%T4, %%XMM6, %%T5, 0x11 vpxor %%T6, %%T6, %%T4 vpclmulqdq %%T4, %%XMM6, %%T5, 0x00 vpxor %%T7, %%T7, %%T4 vpclmulqdq %%T2, %%T2, %%T3, 0x00 vpxor %%XMM1, %%XMM1, %%T2 ;;;;;;;;;;;;;;;;;;;;;; vmovdqu %%T5, [%%GDATA + HashKey_1] vpshufd %%T2, %%XMM7, 01001110b vpshufd %%T3, %%T5, 01001110b vpxor %%T2, %%T2, %%XMM7 vpxor %%T3, %%T3, %%T5 vpclmulqdq %%T4, %%XMM7, %%T5, 0x11 vpxor %%T6, %%T6, %%T4 vpclmulqdq %%T4, %%XMM7, %%T5, 0x00 vpxor %%T7, %%T7, %%T4 vpclmulqdq %%T2, %%T2, %%T3, 0x00 vpxor %%XMM1, %%XMM1, %%T2 ;;;;;;;;;;;;;;;;;;;;;; vpxor %%XMM1, %%XMM1, %%T6 vpxor %%T2, %%XMM1, %%T7 vpslldq %%T4, %%T2, 8 vpsrldq %%T2, %%T2, 8 vpxor %%T7, %%T7, %%T4 vpxor %%T6, %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications 
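Editor's note: the accumulation pattern in the GHASH_LAST_* macros and in the interleaved loop is aggregated GHASH: each of the eight (or seven) blocks is multiplied by a decreasing power of H (HashKey_8 down to HashKey) and the partial products are XORed together, so the expensive reduction runs once per group rather than once per block. Because the reduction is linear, summing reduced products (as in the sketch below) or unreduced ones (as in the assembly) gives the same result. A conceptual sketch reusing be128 and gf128_mul from the reference above (ghash_8_blocks and h_pow are hypothetical names):

/* h_pow[i] holds H^(i+1); x[0..7] are the eight shuffled ciphertext blocks,
 * with the running hash already XORed into x[0]. */
static be128 ghash_8_blocks(const be128 h_pow[8], const be128 x[8])
{
        be128 acc = { 0, 0 };
        int i;

        for (i = 0; i < 8; i++) {
                /* x[0] pairs with H^8, x[7] with H^1 */
                be128 t = gf128_mul(x[i], h_pow[7 - i]);
                acc.hi ^= t.hi;
                acc.lo ^= t.lo;
        }
        return acc;
}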
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;first phase of the reduction vmovdqu %%T3, [POLY2] vpclmulqdq %%T2, %%T3, %%T7, 0x01 vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;second phase of the reduction vpclmulqdq %%T2, %%T3, %%T7, 0x00 vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) vpclmulqdq %%T4, %%T3, %%T7, 0x10 vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts) vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; vpxor %%T6, %%T6, %%T4 ; the result is in %%T6 %endmacro ;;; Handle encryption of the final partial block ;;; IN: ;;; r13 - Number of bytes to read ;;; MODIFIES: ;;; KEY - Key for encrypting the partial block ;;; HASH - Current hash value ;;; SMASHES: ;;; r10, r12, r15, rax ;;; T1, T2 ;;; Note: ;;; PLAIN_CYPH_LEN, %7, is passed only to determine ;;; if buffer is big enough to do a 16 byte read & shift. ;;; 'LT16' is passed here only if buffer is known to be smaller ;;; than 16 bytes. ;;; Any other value passed here will result in 16 byte read ;;; code path. ;;; TBD: Remove HASH from the instantiation %macro ENCRYPT_FINAL_PARTIAL_BLOCK 8 %define %%KEY %1 %define %%T1 %2 %define %%T2 %3 %define %%CYPH_PLAIN_OUT %4 %define %%PLAIN_CYPH_IN %5 %define %%PLAIN_CYPH_LEN %6 %define %%ENC_DEC %7 %define %%DATA_OFFSET %8 ;; NOTE: type of read tuned based %%PLAIN_CYPH_LEN setting %ifidn %%PLAIN_CYPH_LEN, LT16 ;; Handle the case where the message is < 16 bytes lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] ;; T1 - packed output ;; r10 - input data address ;; r13 - input data length ;; r12, r15, rax - temp registers READ_SMALL_DATA_INPUT %%T1, r10, r13, r12, r15, rax lea r12, [SHIFT_MASK + 16] sub r12, r13 %else ;; Handle the case where the message is >= 16 bytes sub %%DATA_OFFSET, 16 add %%DATA_OFFSET, r13 ;; Receive the last <16 Byte block vmovdqu %%T1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] sub %%DATA_OFFSET, r13 add %%DATA_OFFSET, 16 lea r12, [SHIFT_MASK + 16] ;; Adjust the shuffle mask pointer to be able to shift 16-r13 bytes ;; (r13 is the number of bytes in plaintext mod 16) sub r12, r13 ;; Get the appropriate shuffle mask vmovdqu %%T2, [r12] ;; shift right 16-r13 bytes vpshufb %%T1, %%T2 %endif ; %%PLAIN_CYPH_LEN, LT16 ;; At this point T1 contains the partial block data %ifidn %%ENC_DEC, DEC ;; Plaintext XOR E(K, Yn) ;; Set aside the ciphertext vmovdqa %%T2, %%T1 vpxor %%KEY, %%KEY, %%T1 ;; Get the appropriate mask to mask out top 16-r13 bytes of ciphertext vmovdqu %%T1, [r12 + ALL_F - SHIFT_MASK] ;; Mask out top 16-r13 bytes of ciphertext vpand %%KEY, %%KEY, %%T1 ;; Prepare the ciphertext for the hash ;; mask out top 16-r13 bytes of the plaintext vpand %%T2, %%T2, %%T1 %else ;; Plaintext XOR E(K, Yn) vpxor %%KEY, %%KEY, %%T1 ;; Get the appropriate mask to mask out top 16-r13 bytes of %%KEY vmovdqu %%T1, [r12 + ALL_F - SHIFT_MASK] ;; Mask out top 16-r13 bytes of %%KEY vpand %%KEY, %%KEY, %%T1 %endif ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Output r13 Bytes vmovq rax, %%KEY cmp r13, 8 jle %%_less_than_8_bytes_left mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax add %%DATA_OFFSET, 8 vpsrldq %%T1, %%KEY, 8 vmovq rax, %%T1 sub r13, 8 %%_less_than_8_bytes_left: mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al add %%DATA_OFFSET, 1 shr rax, 8 sub r13, 
1 jne %%_less_than_8_bytes_left ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %ifidn %%ENC_DEC, DEC ;; If decrypt, restore the ciphertext into %%KEY vmovdqu %%KEY, %%T2 %endif %endmacro ; ENCRYPT_FINAL_PARTIAL_BLOCK ; Encryption of a single block %macro ENCRYPT_SINGLE_BLOCK 2 %define %%GDATA %1 %define %%XMM0 %2 vpxor %%XMM0, %%XMM0, [%%GDATA+16*0] %assign i 1 %rep NROUNDS vaesenc %%XMM0, [%%GDATA+16*i] %assign i (i+1) %endrep vaesenclast %%XMM0, [%%GDATA+16*i] %endmacro ;; Start of Stack Setup %macro FUNC_SAVE 0 ;; Required for Update/GMC_ENC ;the number of pushes must equal STACK_OFFSET push r12 push r13 push r14 push r15 mov r14, rsp sub rsp, VARIABLE_OFFSET and rsp, ~63 %ifidn __OUTPUT_FORMAT__, win64 ; xmm6:xmm15 need to be maintained for Windows vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6 vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7 vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8 vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9 vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10 vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11 vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12 vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13 vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14 vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15 %endif %endmacro %macro FUNC_RESTORE 0 %ifidn __OUTPUT_FORMAT__, win64 vmovdqu xmm15, [rsp + LOCAL_STORAGE + 9*16] vmovdqu xmm14, [rsp + LOCAL_STORAGE + 8*16] vmovdqu xmm13, [rsp + LOCAL_STORAGE + 7*16] vmovdqu xmm12, [rsp + LOCAL_STORAGE + 6*16] vmovdqu xmm11, [rsp + LOCAL_STORAGE + 5*16] vmovdqu xmm10, [rsp + LOCAL_STORAGE + 4*16] vmovdqu xmm9, [rsp + LOCAL_STORAGE + 3*16] vmovdqu xmm8, [rsp + LOCAL_STORAGE + 2*16] vmovdqu xmm7, [rsp + LOCAL_STORAGE + 1*16] vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16] %endif ;; Required for Update/GMC_ENC mov rsp, r14 pop r15 pop r14 pop r13 pop r12 %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding. ; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV, ; Additional Authentication data (A_IN), Additional Data length (A_LEN). ; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA_CTX. 
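Editor's note: in C terms, GCM_INIT (defined just below) stores the AAD hash, zeroes the running lengths, and builds the pre-counter block J0 from the 96-bit IV as IV || 0^31 || 1, with the ONEf constant supplying the trailing 0x00000001. A hedged sketch of that bookkeeping; the struct and function names are invented here, and the AAD hash is taken as already computed, whereas the macro derives it with CALC_AAD_HASH:

#include <stdint.h>
#include <string.h>

/* Hypothetical mirror of the gcm_context_data fields touched by GCM_INIT. */
struct gcm_ctx_sketch {
        uint8_t  aad_hash[16];       /* AadHash   */
        uint64_t aad_len;            /* AadLen    */
        uint64_t in_len;             /* InLen     */
        uint64_t pblock_len;         /* PBlockLen */
        uint8_t  orig_iv[16];        /* OrigIV = J0 */
        uint8_t  cur_count[16];      /* CurCount (byte-swapped J0 in the asm) */
};

static void gcm_init_sketch(struct gcm_ctx_sketch *ctx, const uint8_t iv[12],
                            const uint8_t aad_hash[16], uint64_t aad_len)
{
        memcpy(ctx->aad_hash, aad_hash, 16);
        ctx->aad_len = aad_len;
        ctx->in_len = 0;
        ctx->pblock_len = 0;
        /* J0 = IV || 0^31 || 1 for a 96-bit IV */
        memcpy(ctx->orig_iv, iv, 12);
        ctx->orig_iv[12] = 0x00;
        ctx->orig_iv[13] = 0x00;
        ctx->orig_iv[14] = 0x00;
        ctx->orig_iv[15] = 0x01;
        /* the asm keeps the running counter in byte-reversed (SHUF_MASK) form */
        memcpy(ctx->cur_count, ctx->orig_iv, 16);
}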
; Clobbers rax, r10-r13, and xmm0-xmm6 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro GCM_INIT 5 %define %%GDATA_KEY %1 %define %%GDATA_CTX %2 %define %%IV %3 %define %%A_IN %4 %define %%A_LEN %5 %define %%AAD_HASH xmm14 %define %%SUBHASH xmm1 vmovdqu %%SUBHASH, [%%GDATA_KEY + HashKey] mov r10, %%A_LEN cmp r10, 0 je %%_aad_is_zero CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax jmp %%_after_aad %%_aad_is_zero: vpxor %%AAD_HASH, %%AAD_HASH %%_after_aad: mov r10, %%A_LEN vpxor xmm2, xmm3 vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash mov [%%GDATA_CTX + AadLen], r10 ; ctx_data.aad_length = aad_length xor r10, r10 mov [%%GDATA_CTX + InLen], r10 ; ctx_data.in_length = 0 mov [%%GDATA_CTX + PBlockLen], r10 ; ctx_data.partial_block_length = 0 vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm2 ; ctx_data.partial_block_enc_key = 0 mov r10, %%IV vmovdqa xmm2, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001 vpinsrq xmm2, [r10], 0 vpinsrd xmm2, [r10+8], 2 vmovdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv vpshufb xmm2, [SHUF_MASK] vmovdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv %endmacro %macro GCM_ENC_DEC_SMALL 12 %define %%GDATA_KEY %1 %define %%GDATA_CTX %2 %define %%CYPH_PLAIN_OUT %3 %define %%PLAIN_CYPH_IN %4 %define %%PLAIN_CYPH_LEN %5 %define %%ENC_DEC %6 %define %%DATA_OFFSET %7 %define %%LENGTH %8 %define %%NUM_BLOCKS %9 %define %%CTR %10 %define %%HASH %11 %define %%INSTANCE_TYPE %12 ;; NOTE: the check below is obsolete in current implementation. The check is already done in GCM_ENC_DEC. ;; cmp %%NUM_BLOCKS, 0 ;; je %%_small_initial_blocks_encrypted cmp %%NUM_BLOCKS, 8 je %%_small_initial_num_blocks_is_8 cmp %%NUM_BLOCKS, 7 je %%_small_initial_num_blocks_is_7 cmp %%NUM_BLOCKS, 6 je %%_small_initial_num_blocks_is_6 cmp %%NUM_BLOCKS, 5 je %%_small_initial_num_blocks_is_5 cmp %%NUM_BLOCKS, 4 je %%_small_initial_num_blocks_is_4 cmp %%NUM_BLOCKS, 3 je %%_small_initial_num_blocks_is_3 cmp %%NUM_BLOCKS, 2 je %%_small_initial_num_blocks_is_2 jmp %%_small_initial_num_blocks_is_1 %%_small_initial_num_blocks_is_8: INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 8, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE jmp %%_small_initial_blocks_encrypted %%_small_initial_num_blocks_is_7: ;; r13 - %%LENGTH ;; xmm12 - T1 ;; xmm13 - T2 ;; xmm14 - T3 - AAD HASH OUT when not producing 8 AES keys ;; xmm15 - T4 ;; xmm11 - T5 ;; xmm9 - CTR ;; xmm1 - XMM1 - Cipher + Hash when producing 8 AES keys ;; xmm2 - XMM2 ;; xmm3 - XMM3 ;; xmm4 - XMM4 ;; xmm5 - XMM5 ;; xmm6 - XMM6 ;; xmm7 - XMM7 ;; xmm8 - XMM8 - AAD HASH IN ;; xmm10 - T6 ;; xmm0 - T_key INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE jmp %%_small_initial_blocks_encrypted %%_small_initial_num_blocks_is_6: INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE jmp %%_small_initial_blocks_encrypted %%_small_initial_num_blocks_is_5: INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, 
%%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE jmp %%_small_initial_blocks_encrypted %%_small_initial_num_blocks_is_4: INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE jmp %%_small_initial_blocks_encrypted %%_small_initial_num_blocks_is_3: INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE jmp %%_small_initial_blocks_encrypted %%_small_initial_num_blocks_is_2: INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE jmp %%_small_initial_blocks_encrypted %%_small_initial_num_blocks_is_1: INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE ;; Note: zero initial blocks not allowed. %%_small_initial_blocks_encrypted: %endmacro ; GCM_ENC_DEC_SMALL ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct ; has been initialized by GCM_INIT ; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA. ; Input: gcm_key_data struct* (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN), ; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC). ; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX ; Clobbers rax, r10-r15, and xmm0-xmm15 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro GCM_ENC_DEC 7 %define %%GDATA_KEY %1 %define %%GDATA_CTX %2 %define %%CYPH_PLAIN_OUT %3 %define %%PLAIN_CYPH_IN %4 %define %%PLAIN_CYPH_LEN %5 %define %%ENC_DEC %6 %define %%INSTANCE_TYPE %7 %define %%DATA_OFFSET r11 ; Macro flow: ; calculate the number of 16byte blocks in the message ; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted' ; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left' ; if there is a block of less tahn 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes' cmp %%PLAIN_CYPH_LEN, 0 je %%_enc_dec_done xor %%DATA_OFFSET, %%DATA_OFFSET ;; Update length of data processed %ifidn __OUTPUT_FORMAT__, win64 mov rax, %%PLAIN_CYPH_LEN add [%%GDATA_CTX + InLen], rax %else add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN %endif vmovdqu xmm13, [%%GDATA_KEY + HashKey] vmovdqu xmm8, [%%GDATA_CTX + AadHash] %ifidn %%INSTANCE_TYPE, multi_call ;; NOTE: partial block processing makes only sense for multi_call here. ;; Used for the update flow - if there was a previous partial ;; block fill the remaining bytes here. 
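Editor's note: the PARTIAL_BLOCK call that follows first tops up any block left over from a previous update call. After that, GCM_ENC_DEC counts how many whole 16-byte blocks the INITIAL stage should absorb: the low three bits of the block count, plus one more if a trailing partial block exists; the BLSMSK/CMC/ADC sequence further down is a branch-free way to add that extra one. Messages shorter than 128 bytes are diverted to GCM_ENC_DEC_SMALL, which can handle up to eight blocks including a trailing partial one. A scalar sketch of the same arithmetic (initial_block_count is a made-up helper):

#include <stdint.h>

static unsigned initial_block_count(uint64_t remaining_len, int *use_small_path)
{
        unsigned blocks = (unsigned)((remaining_len >> 4) & 7); /* whole blocks mod 8 */

        blocks += (remaining_len & 0xf) != 0;   /* one extra for a trailing partial block */
        *use_small_path = remaining_len < 128;  /* small path takes anything below 128B */
        /* on the large-message path a count of 8 is wrapped back to 0
         * ("and r12, 0x7") and left to the by-8 loop instead */
        return blocks;
}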
PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC %endif ;; lift CTR set from initial_blocks to here %ifidn %%INSTANCE_TYPE, single_call vmovdqu xmm9, xmm2 %else vmovdqu xmm9, [%%GDATA_CTX + CurCount] %endif ;; Save the amount of data left to process in r10 mov r13, %%PLAIN_CYPH_LEN %ifidn %%INSTANCE_TYPE, multi_call ;; NOTE: %%DATA_OFFSET is zero in single_call case. ;; Consequently PLAIN_CYPH_LEN will never be zero after ;; %%DATA_OFFSET subtraction below. sub r13, %%DATA_OFFSET ;; There may be no more data if it was consumed in the partial block. cmp r13, 0 je %%_enc_dec_done %endif ; %%INSTANCE_TYPE, multi_call mov r10, r13 ;; Determine how many blocks to process in INITIAL mov r12, r13 shr r12, 4 and r12, 7 ;; Process one additional block in INITIAL if there is a partial block and r10, 0xf blsmsk r10, r10 ; Set CF if zero cmc ; Flip CF adc r12, 0x0 ; Process an additional INITIAL block if CF set ;; Less than 127B will be handled by the small message code, which ;; can process up to 7 16B blocks. cmp r13, 128 jge %%_large_message_path GCM_ENC_DEC_SMALL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET, r13, r12, xmm9, xmm14, %%INSTANCE_TYPE jmp %%_ghash_done %%_large_message_path: and r12, 0x7 ; Still, don't allow 8 INITIAL blocks since this will ; can be handled by the x8 partial loop. cmp r12, 0 je %%_initial_num_blocks_is_0 cmp r12, 7 je %%_initial_num_blocks_is_7 cmp r12, 6 je %%_initial_num_blocks_is_6 cmp r12, 5 je %%_initial_num_blocks_is_5 cmp r12, 4 je %%_initial_num_blocks_is_4 cmp r12, 3 je %%_initial_num_blocks_is_3 cmp r12, 2 je %%_initial_num_blocks_is_2 jmp %%_initial_num_blocks_is_1 %%_initial_num_blocks_is_7: ;; r13 - %%LENGTH ;; xmm12 - T1 ;; xmm13 - T2 ;; xmm14 - T3 - AAD HASH OUT when not producing 8 AES keys ;; xmm15 - T4 ;; xmm11 - T5 ;; xmm9 - CTR ;; xmm1 - XMM1 - Cipher + Hash when producing 8 AES keys ;; xmm2 - XMM2 ;; xmm3 - XMM3 ;; xmm4 - XMM4 ;; xmm5 - XMM5 ;; xmm6 - XMM6 ;; xmm7 - XMM7 ;; xmm8 - XMM8 - AAD HASH IN ;; xmm10 - T6 ;; xmm0 - T_key INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC jmp %%_initial_blocks_encrypted %%_initial_num_blocks_is_6: INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC jmp %%_initial_blocks_encrypted %%_initial_num_blocks_is_5: INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC jmp %%_initial_blocks_encrypted %%_initial_num_blocks_is_4: INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC jmp %%_initial_blocks_encrypted %%_initial_num_blocks_is_3: INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC jmp %%_initial_blocks_encrypted %%_initial_num_blocks_is_2: INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, 
xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC jmp %%_initial_blocks_encrypted %%_initial_num_blocks_is_1: INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC jmp %%_initial_blocks_encrypted %%_initial_num_blocks_is_0: INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC %%_initial_blocks_encrypted: ;; The entire message was encrypted processed in initial and now need to be hashed cmp r13, 0 je %%_encrypt_done ;; Encrypt the final <16 byte (partial) block, then hash cmp r13, 16 jl %%_encrypt_final_partial ;; Process 7 full blocks plus a partial block cmp r13, 128 jl %%_encrypt_by_8_partial %%_encrypt_by_8_parallel: ;; in_order vs. out_order is an optimization to increment the counter without shuffling ;; it back into little endian. r15d keeps track of when we need to increent in order so ;; that the carry is handled correctly. vmovd r15d, xmm9 and r15d, 255 vpshufb xmm9, [rel SHUF_MASK] %%_encrypt_by_8_new: cmp r15d, 255-8 jg %%_encrypt_by_8 ;; xmm0 - T1 ;; xmm10 - T2 ;; xmm11 - T3 ;; xmm12 - T4 ;; xmm13 - T5 ;; xmm14 - T6 ;; xmm9 - CTR ;; xmm1 - XMM1 ;; xmm2 - XMM2 ;; xmm3 - XMM3 ;; xmm4 - XMM4 ;; xmm5 - XMM5 ;; xmm6 - XMM6 ;; xmm7 - XMM7 ;; xmm8 - XMM8 ;; xmm15 - T7 add r15b, 8 GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC, full add %%DATA_OFFSET, 128 sub r13, 128 cmp r13, 128 jge %%_encrypt_by_8_new vpshufb xmm9, [SHUF_MASK] jmp %%_encrypt_by_8_parallel_done %%_encrypt_by_8: vpshufb xmm9, [SHUF_MASK] add r15b, 8 GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, full vpshufb xmm9, [SHUF_MASK] add %%DATA_OFFSET, 128 sub r13, 128 cmp r13, 128 jge %%_encrypt_by_8_new vpshufb xmm9, [SHUF_MASK] %%_encrypt_by_8_parallel_done: ;; Test to see if we need a by 8 with partial block. At this point ;; bytes remaining should be either zero or between 113-127. cmp r13, 0 je %%_encrypt_done %%_encrypt_by_8_partial: ;; Shuffle needed to align key for partial block xor. out_order ;; is a little faster because it avoids extra shuffles. ;; TBD: Might need to account for when we don't have room to increment the counter. ;; Process parallel buffers with a final partial block. 
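Editor's note: the counter handling the TBD note above refers to works like this: most of the time xmm9 holds the counter block already byte-swapped, so stepping it by 8 only touches its low byte; the "cmp r15d, 255-8" test detects when that add would carry into the next byte, in which case the block is shuffled back and incremented in natural (in_order) form. A one-line sketch of the decision (hypothetical helper name):

#include <stdint.h>

/* r15b in the asm tracks the low byte of the 32-bit big-endian block counter. */
static int need_in_order_increment(uint32_t ctr_low_byte)
{
        /* carrying out of the low byte needs a full-width, byte-swapped add */
        return ctr_low_byte > 255 - 8;
}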
GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, partial add %%DATA_OFFSET, 128-16 sub r13, 128-16 %%_encrypt_final_partial: vpshufb xmm8, [SHUF_MASK] mov [%%GDATA_CTX + PBlockLen], r13 vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm8 ;; xmm8 - Final encrypted counter - need to hash with partial or full block ciphertext ;; GDATA, KEY, T1, T2 ENCRYPT_FINAL_PARTIAL_BLOCK xmm8, xmm0, xmm10, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET vpshufb xmm8, [SHUF_MASK] %%_encrypt_done: ;; Mapping to macro parameters ;; IN: ;; xmm9 contains the counter ;; xmm1-xmm8 contain the xor'd ciphertext ;; OUT: ;; xmm14 contains the final hash ;; GDATA, T1, T2, T3, T4, T5, T6, T7, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8 %ifidn %%INSTANCE_TYPE, multi_call mov r13, [%%GDATA_CTX + PBlockLen] cmp r13, 0 jz %%_hash_last_8 GHASH_LAST_7 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 ;; XOR the partial word into the hash vpxor xmm14, xmm14, xmm8 jmp %%_ghash_done %endif %%_hash_last_8: GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8 %%_ghash_done: vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9 vmovdqu [%%GDATA_CTX + AadHash], xmm14 ; my_ctx_data.aad hash = xmm14 %%_enc_dec_done: %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; GCM_COMPLETE Finishes Encyrption/Decryption of last partial block after GCM_UPDATE finishes. ; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX) and whether encoding or decoding (ENC_DEC). ; Output: Authorization Tag (AUTH_TAG) and Authorization Tag length (AUTH_TAG_LEN) ; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro GCM_COMPLETE 6 %define %%GDATA_KEY %1 %define %%GDATA_CTX %2 %define %%AUTH_TAG %3 %define %%AUTH_TAG_LEN %4 %define %%ENC_DEC %5 %define %%INSTANCE_TYPE %6 %define %%PLAIN_CYPH_LEN rax vmovdqu xmm13, [%%GDATA_KEY + HashKey] ;; Start AES as early as possible vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0 ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0) %ifidn %%INSTANCE_TYPE, multi_call ;; If the GCM function is called as a single function call rather ;; than invoking the individual parts (init, update, finalize) we ;; can remove a write to read dependency on AadHash. vmovdqu xmm14, [%%GDATA_CTX + AadHash] ;; Encrypt the final partial block. If we did this as a single call then ;; the partial block was handled in the main GCM_ENC_DEC macro. 
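Editor's note: GCM_COMPLETE, continuing below, finishes the tag exactly as the specification describes: XOR the bit lengths of the AAD and the ciphertext into the running hash, perform one more GHASH multiply, byte-swap, and XOR with E(K, J0); the result is then truncated to 8, 12 or 16 bytes. A sketch of the length-block step, reusing be128 and gf128_mul from the reference sketch earlier (the assembly performs the equivalent operations in its byte-reflected register layout):

/* S' = (S ^ (len(A) || len(C))) * H, with both lengths expressed in bits.
 * The tag is then truncate(E(K, J0) ^ byteswap(S'), tag_len). */
static be128 ghash_fold_lengths(be128 hash, be128 h,
                                uint64_t aad_bytes, uint64_t msg_bytes)
{
        hash.hi ^= aad_bytes * 8;   /* len(A) occupies the upper 64 bits */
        hash.lo ^= msg_bytes * 8;   /* len(C) occupies the lower 64 bits */
        return gf128_mul(hash, h);
}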
mov r12, [%%GDATA_CTX + PBlockLen] cmp r12, 0 je %%_partial_done GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block vmovdqu [%%GDATA_CTX + AadHash], xmm14 %%_partial_done: %endif mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes) mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen] shl r12, 3 ; convert into number of bits vmovd xmm15, r12d ; len(A) in xmm15 shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*128) vmovq xmm1, %%PLAIN_CYPH_LEN vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000 vpxor xmm15, xmm15, xmm1 ; xmm15 = len(A)||len(C) vpxor xmm14, xmm15 GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 vpshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap vpxor xmm9, xmm9, xmm14 %%_return_T: mov r10, %%AUTH_TAG ; r10 = authTag mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len cmp r11, 16 je %%_T_16 cmp r11, 12 je %%_T_12 %%_T_8: vmovq rax, xmm9 mov [r10], rax jmp %%_return_T_done %%_T_12: vmovq rax, xmm9 mov [r10], rax vpsrldq xmm9, xmm9, 8 vmovd eax, xmm9 mov [r10 + 8], eax jmp %%_return_T_done %%_T_16: vmovdqu [r10], xmm9 %%_return_T_done: %endmacro ; GCM_COMPLETE ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void aes_gcm_precomp_128_avx_gen4 / ; aes_gcm_precomp_192_avx_gen4 / ; aes_gcm_precomp_256_avx_gen4 ; (struct gcm_key_data *key_data) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MKGLOBAL(FN_NAME(precomp,_),function,) FN_NAME(precomp,_): push r12 push r13 push r14 push r15 mov r14, rsp sub rsp, VARIABLE_OFFSET and rsp, ~63 ; align rsp to 64 bytes %ifidn __OUTPUT_FORMAT__, win64 ; only xmm6 needs to be maintained vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6 %endif vpxor xmm6, xmm6 ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey vpshufb xmm6, [rel SHUF_MASK] ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;; vmovdqa xmm2, xmm6 vpsllq xmm6, xmm6, 1 vpsrlq xmm2, xmm2, 63 vmovdqa xmm1, xmm2 vpslldq xmm2, xmm2, 8 vpsrldq xmm1, xmm1, 8 vpor xmm6, xmm6, xmm2 ;reduction vpshufd xmm2, xmm1, 00100100b vpcmpeqd xmm2, [TWOONE] vpand xmm2, xmm2, [POLY] vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 %ifidn __OUTPUT_FORMAT__, win64 vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16] %endif mov rsp, r14 pop r15 pop r14 pop r13 pop r12 ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void aes_gcm_init_128_avx_gen4 / aes_gcm_init_192_avx_gen4 / aes_gcm_init_256_avx_gen4 ; (const struct gcm_key_data *key_data, ; struct gcm_context_data *context_data, ; u8 *iv, ; const u8 *aad, ; u64 aad_len); ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MKGLOBAL(FN_NAME(init,_),function,) FN_NAME(init,_): push r12 push r13 %ifidn __OUTPUT_FORMAT__, win64 push r14 push r15 mov r14, rsp ; xmm6:xmm15 need to be maintained for Windows sub rsp, 1*16 movdqu [rsp + 0*16], xmm6 %endif GCM_INIT arg1, arg2, arg3, arg4, arg5 %ifidn __OUTPUT_FORMAT__, win64 movdqu xmm6 , [rsp + 0*16] mov rsp, r14 pop r15 pop r14 %endif pop r13 pop r12 ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void aes_gcm_enc_128_update_avx_gen4 / aes_gcm_enc_192_update_avx_gen4 / ; aes_gcm_enc_128_update_avx_gen4 ; (const struct gcm_key_data *key_data, ; struct 
gcm_context_data *context_data, ; u8 *out, ; const u8 *in, ; u64 plaintext_len); ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MKGLOBAL(FN_NAME(enc,_update_),function,) FN_NAME(enc,_update_): FUNC_SAVE GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, multi_call FUNC_RESTORE ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void aes_gcm_dec_128_update_avx_gen4 / aes_gcm_dec_192_update_avx_gen4 / ; aes_gcm_dec_256_update_avx_gen4 ; (const struct gcm_key_data *key_data, ; struct gcm_context_data *context_data, ; u8 *out, ; const u8 *in, ; u64 plaintext_len); ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MKGLOBAL(FN_NAME(dec,_update_),function,) FN_NAME(dec,_update_): FUNC_SAVE GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, multi_call FUNC_RESTORE ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void aes_gcm_enc_128_finalize_avx_gen4 / aes_gcm_enc_192_finalize_avx_gen4 / ; aes_gcm_enc_256_finalize_avx_gen4 ; (const struct gcm_key_data *key_data, ; struct gcm_context_data *context_data, ; u8 *auth_tag, ; u64 auth_tag_len); ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MKGLOBAL(FN_NAME(enc,_finalize_),function,) FN_NAME(enc,_finalize_): push r12 %ifidn __OUTPUT_FORMAT__, win64 ; xmm6:xmm15 need to be maintained for Windows sub rsp, 5*16 vmovdqu [rsp + 0*16], xmm6 vmovdqu [rsp + 1*16], xmm9 vmovdqu [rsp + 2*16], xmm11 vmovdqu [rsp + 3*16], xmm14 vmovdqu [rsp + 4*16], xmm15 %endif GCM_COMPLETE arg1, arg2, arg3, arg4, ENC, multi_call %ifidn __OUTPUT_FORMAT__, win64 vmovdqu xmm15, [rsp + 4*16] vmovdqu xmm14, [rsp + 3*16] vmovdqu xmm11, [rsp + 2*16] vmovdqu xmm9, [rsp + 1*16] vmovdqu xmm6, [rsp + 0*16] add rsp, 5*16 %endif pop r12 ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void aes_gcm_dec_128_finalize_avx_gen4 / aes_gcm_dec_192_finalize_avx_gen4 ; aes_gcm_dec_256_finalize_avx_gen4 ; (const struct gcm_key_data *key_data, ; struct gcm_context_data *context_data, ; u8 *auth_tag, ; u64 auth_tag_len); ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MKGLOBAL(FN_NAME(dec,_finalize_),function,) FN_NAME(dec,_finalize_): push r12 %ifidn __OUTPUT_FORMAT__, win64 ; xmm6:xmm15 need to be maintained for Windows sub rsp, 5*16 vmovdqu [rsp + 0*16], xmm6 vmovdqu [rsp + 1*16], xmm9 vmovdqu [rsp + 2*16], xmm11 vmovdqu [rsp + 3*16], xmm14 vmovdqu [rsp + 4*16], xmm15 %endif GCM_COMPLETE arg1, arg2, arg3, arg4, DEC, multi_call %ifidn __OUTPUT_FORMAT__, win64 vmovdqu xmm15, [rsp + 4*16] vmovdqu xmm14, [rsp + 3*16] vmovdqu xmm11, [rsp + 2*16] vmovdqu xmm9, [rsp + 1*16] vmovdqu xmm6, [rsp + 0*16] add rsp, 5*16 %endif pop r12 ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void aes_gcm_enc_128_avx_gen4 / aes_gcm_enc_192_avx_gen4 / aes_gcm_enc_256_avx_gen4 ; (const struct gcm_key_data *key_data, ; struct gcm_context_data *context_data, ; u8 *out, ; const u8 *in, ; u64 plaintext_len, ; u8 *iv, ; const u8 *aad, ; u64 aad_len, ; u8 *auth_tag, ; u64 auth_tag_len); ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MKGLOBAL(FN_NAME(enc,_),function,) FN_NAME(enc,_): FUNC_SAVE GCM_INIT arg1, arg2, arg6, arg7, arg8 GCM_ENC_DEC arg1, arg2, arg3, arg4, 
arg5, ENC, single_call GCM_COMPLETE arg1, arg2, arg9, arg10, ENC, single_call FUNC_RESTORE ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void aes_gcm_dec_128_avx_gen4 / aes_gcm_dec_192_avx_gen4 / aes_gcm_dec_256_avx_gen4 ; (const struct gcm_key_data *key_data, ; struct gcm_context_data *context_data, ; u8 *out, ; const u8 *in, ; u64 plaintext_len, ; u8 *iv, ; const u8 *aad, ; u64 aad_len, ; u8 *auth_tag, ; u64 auth_tag_len); ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MKGLOBAL(FN_NAME(dec,_),function,) FN_NAME(dec,_): FUNC_SAVE GCM_INIT arg1, arg2, arg6, arg7, arg8 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call GCM_COMPLETE arg1, arg2, arg9, arg10, DEC, single_call FUNC_RESTORE ret intel-ipsec-mb-0.48/avx2/mb_mgr_avx2.c000066400000000000000000000505561321406316400175170ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2012-2017, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #include #include #include #include "os.h" #define AVX2 #include "mb_mgr.h" #include "save_xmms.h" #include "asm.h" #ifndef NO_GCM #include "gcm_defines.h" #endif #include "des.h" JOB_AES_HMAC *submit_job_aes128_enc_avx(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_aes128_enc_avx(MB_MGR_AES_OOO *state); JOB_AES_HMAC *submit_job_aes192_enc_avx(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_aes192_enc_avx(MB_MGR_AES_OOO *state); JOB_AES_HMAC *submit_job_aes256_enc_avx(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_aes256_enc_avx(MB_MGR_AES_OOO *state); JOB_AES_HMAC *submit_job_aes_xcbc_avx(MB_MGR_AES_XCBC_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_aes_xcbc_avx(MB_MGR_AES_XCBC_OOO *state); #define SAVE_XMMS save_xmms_avx #define RESTORE_XMMS restore_xmms_avx #define SUBMIT_JOB_AES128_ENC submit_job_aes128_enc_avx #define SUBMIT_JOB_AES128_DEC submit_job_aes128_dec_avx #define FLUSH_JOB_AES128_ENC flush_job_aes128_enc_avx #define SUBMIT_JOB_AES192_ENC submit_job_aes192_enc_avx #define SUBMIT_JOB_AES192_DEC submit_job_aes192_dec_avx #define FLUSH_JOB_AES192_ENC flush_job_aes192_enc_avx #define SUBMIT_JOB_AES256_ENC submit_job_aes256_enc_avx #define SUBMIT_JOB_AES256_DEC submit_job_aes256_dec_avx #define FLUSH_JOB_AES256_ENC flush_job_aes256_enc_avx #define SUBMIT_JOB_AES128_CNTR submit_job_aes128_cntr_avx #define SUBMIT_JOB_AES192_CNTR submit_job_aes192_cntr_avx #define SUBMIT_JOB_AES256_CNTR submit_job_aes256_cntr_avx #define AES_CBC_DEC_128 aes_cbc_dec_128_avx #define AES_CBC_DEC_192 aes_cbc_dec_192_avx #define AES_CBC_DEC_256 aes_cbc_dec_256_avx #define AES_CNTR_128 aes_cntr_128_avx #define AES_CNTR_192 aes_cntr_192_avx #define AES_CNTR_256 aes_cntr_256_avx #ifndef NO_GCM #define AES_GCM_DEC_128 aes_gcm_dec_128_avx_gen4 #define AES_GCM_ENC_128 aes_gcm_enc_128_avx_gen4 #define AES_GCM_DEC_192 aes_gcm_dec_192_avx_gen4 #define AES_GCM_ENC_192 aes_gcm_enc_192_avx_gen4 #define AES_GCM_DEC_256 aes_gcm_dec_256_avx_gen4 #define AES_GCM_ENC_256 aes_gcm_enc_256_avx_gen4 #endif /* NO_GCM */ #define SUBMIT_JOB_AES_XCBC submit_job_aes_xcbc_avx #define FLUSH_JOB_AES_XCBC flush_job_aes_xcbc_avx #define SUBMIT_JOB_AES128_DEC submit_job_aes128_dec_avx #define SUBMIT_JOB_AES192_DEC submit_job_aes192_dec_avx #define SUBMIT_JOB_AES256_DEC submit_job_aes256_dec_avx #define QUEUE_SIZE queue_size_avx2 #define SUBMIT_JOB_AES_ENC SUBMIT_JOB_AES_ENC_AVX2 #define FLUSH_JOB_AES_ENC FLUSH_JOB_AES_ENC_AVX2 #define SUBMIT_JOB_AES_DEC SUBMIT_JOB_AES_DEC_AVX2 JOB_AES_HMAC *submit_job_hmac_avx2(MB_MGR_HMAC_SHA_1_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_hmac_avx2(MB_MGR_HMAC_SHA_1_OOO *state); JOB_AES_HMAC *submit_job_hmac_sha_224_avx2(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_hmac_sha_224_avx2(MB_MGR_HMAC_SHA_256_OOO *state); JOB_AES_HMAC *submit_job_hmac_sha_256_avx2(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_hmac_sha_256_avx2(MB_MGR_HMAC_SHA_256_OOO *state); JOB_AES_HMAC *submit_job_hmac_sha_384_avx2(MB_MGR_HMAC_SHA_512_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_hmac_sha_384_avx2(MB_MGR_HMAC_SHA_512_OOO *state); JOB_AES_HMAC *submit_job_hmac_sha_512_avx2(MB_MGR_HMAC_SHA_512_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_hmac_sha_512_avx2(MB_MGR_HMAC_SHA_512_OOO *state); JOB_AES_HMAC *submit_job_hmac_md5_avx2(MB_MGR_HMAC_MD5_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC 
*flush_job_hmac_md5_avx2(MB_MGR_HMAC_MD5_OOO *state); #define SUBMIT_JOB_HMAC submit_job_hmac_avx2 #define FLUSH_JOB_HMAC flush_job_hmac_avx2 #define SUBMIT_JOB_HMAC_SHA_224 submit_job_hmac_sha_224_avx2 #define FLUSH_JOB_HMAC_SHA_224 flush_job_hmac_sha_224_avx2 #define SUBMIT_JOB_HMAC_SHA_256 submit_job_hmac_sha_256_avx2 #define FLUSH_JOB_HMAC_SHA_256 flush_job_hmac_sha_256_avx2 #define SUBMIT_JOB_HMAC_SHA_384 submit_job_hmac_sha_384_avx2 #define FLUSH_JOB_HMAC_SHA_384 flush_job_hmac_sha_384_avx2 #define SUBMIT_JOB_HMAC_SHA_512 submit_job_hmac_sha_512_avx2 #define FLUSH_JOB_HMAC_SHA_512 flush_job_hmac_sha_512_avx2 #define SUBMIT_JOB_HMAC_MD5 submit_job_hmac_md5_avx2 #define FLUSH_JOB_HMAC_MD5 flush_job_hmac_md5_avx2 /* ====================================================================== */ #define SUBMIT_JOB submit_job_avx2 #define FLUSH_JOB flush_job_avx2 #define SUBMIT_JOB_NOCHECK submit_job_nocheck_avx2 #define QUEUE_SIZE queue_size_avx2 /* ====================================================================== */ #define SUBMIT_JOB_HASH SUBMIT_JOB_HASH_AVX2 #define FLUSH_JOB_HASH FLUSH_JOB_HASH_AVX2 /* ====================================================================== */ #define AES_CFB_128_ONE aes_cfb_128_one_avx2 void aes128_cbc_mac_x8(AES_ARGS_x8 *args, uint64_t len); #define AES128_CBC_MAC aes128_cbc_mac_x8 #define FLUSH_JOB_AES_CCM_AUTH flush_job_aes_ccm_auth_arch #define SUBMIT_JOB_AES_CCM_AUTH submit_job_aes_ccm_auth_arch #define AES_CCM_MAX_JOBS 8 /* ====================================================================== */ void init_mb_mgr_avx2(MB_MGR *state) { unsigned int j; UINT8 *p; /* Init AES out-of-order fields */ state->aes128_ooo.lens[0] = 0; state->aes128_ooo.lens[1] = 0; state->aes128_ooo.lens[2] = 0; state->aes128_ooo.lens[3] = 0; state->aes128_ooo.lens[4] = 0; state->aes128_ooo.lens[5] = 0; state->aes128_ooo.lens[6] = 0; state->aes128_ooo.lens[7] = 0; state->aes128_ooo.unused_lanes = 0xF76543210; state->aes128_ooo.job_in_lane[0] = NULL; state->aes128_ooo.job_in_lane[1] = NULL; state->aes128_ooo.job_in_lane[2] = NULL; state->aes128_ooo.job_in_lane[3] = NULL; state->aes128_ooo.job_in_lane[4] = NULL; state->aes128_ooo.job_in_lane[5] = NULL; state->aes128_ooo.job_in_lane[6] = NULL; state->aes128_ooo.job_in_lane[7] = NULL; state->aes192_ooo.lens[0] = 0; state->aes192_ooo.lens[1] = 0; state->aes192_ooo.lens[2] = 0; state->aes192_ooo.lens[3] = 0; state->aes192_ooo.lens[4] = 0; state->aes192_ooo.lens[5] = 0; state->aes192_ooo.lens[6] = 0; state->aes192_ooo.lens[7] = 0; state->aes192_ooo.unused_lanes = 0xF76543210; state->aes192_ooo.job_in_lane[0] = NULL; state->aes192_ooo.job_in_lane[1] = NULL; state->aes192_ooo.job_in_lane[2] = NULL; state->aes192_ooo.job_in_lane[3] = NULL; state->aes192_ooo.job_in_lane[4] = NULL; state->aes192_ooo.job_in_lane[5] = NULL; state->aes192_ooo.job_in_lane[6] = NULL; state->aes192_ooo.job_in_lane[7] = NULL; state->aes256_ooo.lens[0] = 0; state->aes256_ooo.lens[1] = 0; state->aes256_ooo.lens[2] = 0; state->aes256_ooo.lens[3] = 0; state->aes256_ooo.lens[4] = 0; state->aes256_ooo.lens[5] = 0; state->aes256_ooo.lens[6] = 0; state->aes256_ooo.lens[7] = 0; state->aes256_ooo.unused_lanes = 0xF76543210; state->aes256_ooo.job_in_lane[0] = NULL; state->aes256_ooo.job_in_lane[1] = NULL; state->aes256_ooo.job_in_lane[2] = NULL; state->aes256_ooo.job_in_lane[3] = NULL; state->aes256_ooo.job_in_lane[4] = NULL; state->aes256_ooo.job_in_lane[5] = NULL; state->aes256_ooo.job_in_lane[6] = NULL; state->aes256_ooo.job_in_lane[7] = NULL; /* DOCSIS SEC BPI uses 
same settings as AES128 CBC */ state->docsis_sec_ooo.lens[0] = 0; state->docsis_sec_ooo.lens[1] = 0; state->docsis_sec_ooo.lens[2] = 0; state->docsis_sec_ooo.lens[3] = 0; state->docsis_sec_ooo.lens[4] = 0; state->docsis_sec_ooo.lens[5] = 0; state->docsis_sec_ooo.lens[6] = 0; state->docsis_sec_ooo.lens[7] = 0; state->docsis_sec_ooo.unused_lanes = 0xF76543210; state->docsis_sec_ooo.job_in_lane[0] = NULL; state->docsis_sec_ooo.job_in_lane[1] = NULL; state->docsis_sec_ooo.job_in_lane[2] = NULL; state->docsis_sec_ooo.job_in_lane[3] = NULL; state->docsis_sec_ooo.job_in_lane[4] = NULL; state->docsis_sec_ooo.job_in_lane[5] = NULL; state->docsis_sec_ooo.job_in_lane[6] = NULL; state->docsis_sec_ooo.job_in_lane[7] = NULL; /* Init HMAC/SHA1 out-of-order fields */ state->hmac_sha_1_ooo.lens[0] = 0; state->hmac_sha_1_ooo.lens[1] = 0; state->hmac_sha_1_ooo.lens[2] = 0; state->hmac_sha_1_ooo.lens[3] = 0; state->hmac_sha_1_ooo.lens[4] = 0; state->hmac_sha_1_ooo.lens[5] = 0; state->hmac_sha_1_ooo.lens[6] = 0; state->hmac_sha_1_ooo.lens[7] = 0; state->hmac_sha_1_ooo.unused_lanes = 0xF76543210; for (j = 0; j < AVX2_NUM_SHA1_LANES; j++) { state->hmac_sha_1_ooo.ldata[j].job_in_lane = NULL; state->hmac_sha_1_ooo.ldata[j].extra_block[64] = 0x80; memset(state->hmac_sha_1_ooo.ldata[j].extra_block + 65, 0x00, 64 + 7); p = state->hmac_sha_1_ooo.ldata[j].outer_block; memset(p + 5*4 + 1, 0x00, 64 - 5*4 - 1 - 2); p[5 * 4] = 0x80; p[64 - 2] = 0x02; p[64 - 1] = 0xA0; } /* Init HMAC/SHA224 out-of-order fields */ state->hmac_sha_224_ooo.lens[0] = 0; state->hmac_sha_224_ooo.lens[1] = 0; state->hmac_sha_224_ooo.lens[2] = 0; state->hmac_sha_224_ooo.lens[3] = 0; state->hmac_sha_224_ooo.lens[4] = 0; state->hmac_sha_224_ooo.lens[5] = 0; state->hmac_sha_224_ooo.lens[6] = 0; state->hmac_sha_224_ooo.lens[7] = 0; state->hmac_sha_224_ooo.unused_lanes = 0xF76543210; /* sha256 and sha224 are very similar except for * digest constants and output size */ for (j = 0; j < AVX2_NUM_SHA256_LANES; j++) { state->hmac_sha_224_ooo.ldata[j].job_in_lane = NULL; state->hmac_sha_224_ooo.ldata[j].extra_block[64] = 0x80; memset(state->hmac_sha_224_ooo.ldata[j].extra_block + 65, 0x00, 64 + 7); p = state->hmac_sha_224_ooo.ldata[j].outer_block; memset(p + 8*4 + 1, 0x00, 64 - 8*4 - 1 - 2); p[7 * 4] = 0x80; /* digest 7 words long */ p[64 - 2] = 0x02; /* length in little endian = 0x02E0 */ p[64 - 1] = 0xE0; } /* Init HMAC/SHA256 out-of-order fields */ state->hmac_sha_256_ooo.lens[0] = 0; state->hmac_sha_256_ooo.lens[1] = 0; state->hmac_sha_256_ooo.lens[2] = 0; state->hmac_sha_256_ooo.lens[3] = 0; state->hmac_sha_256_ooo.lens[4] = 0; state->hmac_sha_256_ooo.lens[5] = 0; state->hmac_sha_256_ooo.lens[6] = 0; state->hmac_sha_256_ooo.lens[7] = 0; state->hmac_sha_256_ooo.unused_lanes = 0xF76543210; for (j = 0; j < AVX2_NUM_SHA256_LANES; j++) { state->hmac_sha_256_ooo.ldata[j].job_in_lane = NULL; state->hmac_sha_256_ooo.ldata[j].extra_block[64] = 0x80; memset(state->hmac_sha_256_ooo.ldata[j].extra_block + 65, 0x00, 64 + 7); /* hmac related */ p = state->hmac_sha_256_ooo.ldata[j].outer_block; memset(p + 8*4 + 1, 0x00, 64 - 8*4 - 1 - 2); p[8 * 4] = 0x80; /* 8 digest words */ p[64 - 2] = 0x03; /* length */ p[64 - 1] = 0x00; } /* Init HMAC/SHA384 out-of-order fields */ state->hmac_sha_384_ooo.lens[0] = 0; state->hmac_sha_384_ooo.lens[1] = 0; state->hmac_sha_384_ooo.lens[2] = 0; state->hmac_sha_384_ooo.lens[3] = 0; state->hmac_sha_384_ooo.lens[4] = 0xFFFF; state->hmac_sha_384_ooo.lens[5] = 0xFFFF; state->hmac_sha_384_ooo.lens[6] = 0xFFFF; state->hmac_sha_384_ooo.lens[7] 
= 0xFFFF; state->hmac_sha_384_ooo.unused_lanes = 0xFF03020100; for (j = 0; j < AVX2_NUM_SHA512_LANES; j++) { MB_MGR_HMAC_SHA_512_OOO *ctx = &state->hmac_sha_384_ooo; ctx->ldata[j].job_in_lane = NULL; ctx->ldata[j].extra_block[SHA_384_BLOCK_SIZE] = 0x80; memset(ctx->ldata[j].extra_block + (SHA_384_BLOCK_SIZE + 1), 0x00, SHA_384_BLOCK_SIZE + 7); p = ctx->ldata[j].outer_block; /* special end point because this length is constant */ memset(p + SHA384_DIGEST_SIZE_IN_BYTES + 1, 0x00, SHA_384_BLOCK_SIZE - SHA384_DIGEST_SIZE_IN_BYTES - 1 - 2); /* mark the end */ p[SHA384_DIGEST_SIZE_IN_BYTES] = 0x80; /* hmac outer block length always of fixed size, * it is OKey length, a whole message block length, 1024 bits, * with padding plus the length of the inner digest, * which is 384 bits, 1408 bits == 0x0580. * The input message block needs to be converted to big endian * within the sha implementation before use. */ p[SHA_384_BLOCK_SIZE - 2] = 0x05; p[SHA_384_BLOCK_SIZE - 1] = 0x80; } /* Init HMAC/SHA512 out-of-order fields */ state->hmac_sha_512_ooo.lens[0] = 0; state->hmac_sha_512_ooo.lens[1] = 0; state->hmac_sha_512_ooo.lens[2] = 0; state->hmac_sha_512_ooo.lens[3] = 0; state->hmac_sha_512_ooo.lens[4] = 0xFFFF; state->hmac_sha_512_ooo.lens[5] = 0xFFFF; state->hmac_sha_512_ooo.lens[6] = 0xFFFF; state->hmac_sha_512_ooo.lens[7] = 0xFFFF; state->hmac_sha_512_ooo.unused_lanes = 0xFF03020100; for (j = 0; j < AVX2_NUM_SHA512_LANES; j++) { MB_MGR_HMAC_SHA_512_OOO *ctx = &state->hmac_sha_512_ooo; ctx->ldata[j].job_in_lane = NULL; ctx->ldata[j].extra_block[SHA_512_BLOCK_SIZE] = 0x80; memset(ctx->ldata[j].extra_block + (SHA_512_BLOCK_SIZE + 1), 0x00, SHA_512_BLOCK_SIZE + 7); p = ctx->ldata[j].outer_block; /* special end point because this length is constant */ memset(p + SHA512_DIGEST_SIZE_IN_BYTES + 1, 0x00, SHA_512_BLOCK_SIZE - SHA512_DIGEST_SIZE_IN_BYTES - 1 - 2); /* mark the end */ p[SHA512_DIGEST_SIZE_IN_BYTES] = 0x80; /* hmac outer block length always of fixed size, * it is OKey length, a whole message block length, 1024 bits, * with padding plus the length of the inner digest, * which is 512 bits, 1536 bits == 0x600. * The input message block needs to be converted to big endian * within the sha implementation before use. 
*/ p[SHA_512_BLOCK_SIZE - 2] = 0x06; p[SHA_512_BLOCK_SIZE - 1] = 0x00; } /* Init HMAC/MD5 out-of-order fields */ state->hmac_md5_ooo.lens[0] = 0; state->hmac_md5_ooo.lens[1] = 0; state->hmac_md5_ooo.lens[2] = 0; state->hmac_md5_ooo.lens[3] = 0; state->hmac_md5_ooo.lens[4] = 0; state->hmac_md5_ooo.lens[5] = 0; state->hmac_md5_ooo.lens[6] = 0; state->hmac_md5_ooo.lens[7] = 0; state->hmac_md5_ooo.lens[8] = 0; state->hmac_md5_ooo.lens[9] = 0; state->hmac_md5_ooo.lens[10] = 0; state->hmac_md5_ooo.lens[11] = 0; state->hmac_md5_ooo.lens[12] = 0; state->hmac_md5_ooo.lens[13] = 0; state->hmac_md5_ooo.lens[14] = 0; state->hmac_md5_ooo.lens[15] = 0; state->hmac_md5_ooo.unused_lanes = 0xFEDCBA9876543210; state->hmac_md5_ooo.num_lanes_inuse = 0; for (j = 0; j < AVX2_NUM_MD5_LANES; j++) { state->hmac_md5_ooo.ldata[j].job_in_lane = NULL; state->hmac_md5_ooo.ldata[j].extra_block[64] = 0x80; memset(state->hmac_md5_ooo.ldata[j].extra_block + 65, 0x00, 64 + 7); p = state->hmac_md5_ooo.ldata[j].outer_block; memset(p + 5*4 + 1, 0x00, 64 - 5*4 - 1 - 2); p[4 * 4] = 0x80; p[64 - 7] = 0x02; p[64 - 8] = 0x80; } /* Init AES/XCBC OOO fields */ state->aes_xcbc_ooo.lens[0] = 0; state->aes_xcbc_ooo.lens[1] = 0; state->aes_xcbc_ooo.lens[2] = 0; state->aes_xcbc_ooo.lens[3] = 0; state->aes_xcbc_ooo.lens[4] = 0; state->aes_xcbc_ooo.lens[5] = 0; state->aes_xcbc_ooo.lens[6] = 0; state->aes_xcbc_ooo.lens[7] = 0; state->aes_xcbc_ooo.unused_lanes = 0xF76543210; for (j = 0; j < 8 ; j++) { state->aes_xcbc_ooo.ldata[j].job_in_lane = NULL; state->aes_xcbc_ooo.ldata[j].final_block[16] = 0x80; memset(state->aes_xcbc_ooo.ldata[j].final_block + 17, 0x00, 15); } /* Init AES-CCM auth out-of-order fields */ for (j = 0; j < 8; j++) { state->aes_ccm_ooo.init_done[j] = 0; state->aes_ccm_ooo.lens[j] = 0; state->aes_ccm_ooo.job_in_lane[j] = NULL; } state->aes_ccm_ooo.unused_lanes = 0xF76543210; /* Init "in order" components */ state->next_job = 0; state->earliest_job = -1; /* set handlers */ state->get_next_job = get_next_job_avx2; state->submit_job = submit_job_avx2; state->submit_job_nocheck = submit_job_nocheck_avx2; state->get_completed_job = get_completed_job_avx2; state->flush_job = flush_job_avx2; state->queue_size = queue_size_avx2; state->keyexp_128 = aes_keyexp_128_avx2; state->keyexp_192 = aes_keyexp_192_avx2; state->keyexp_256 = aes_keyexp_256_avx2; } #include "mb_mgr_code.h" intel-ipsec-mb-0.48/avx2/mb_mgr_hmac_flush_avx2.asm000066400000000000000000000201401321406316400222300ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" ;%define DO_DBGPRINT %include "dbgprint.asm" extern sha1_x8_avx2 section .data default rel align 16 byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 dq 0x0405060700010203, 0x0c0d0e0f08090a0b x80: ;ddq 0x00000000000000000000000000000080 dq 0x0000000000000080, 0x0000000000000000 x00: ;ddq 0x00000000000000000000000000000000 dq 0x0000000000000000, 0x0000000000000000 len_masks: ;ddq 0x0000000000000000000000000000FFFF dq 0x000000000000FFFF, 0x0000000000000000 ;ddq 0x000000000000000000000000FFFF0000 dq 0x00000000FFFF0000, 0x0000000000000000 ;ddq 0x00000000000000000000FFFF00000000 dq 0x0000FFFF00000000, 0x0000000000000000 ;ddq 0x0000000000000000FFFF000000000000 dq 0xFFFF000000000000, 0x0000000000000000 ;ddq 0x000000000000FFFF0000000000000000 dq 0x0000000000000000, 0x000000000000FFFF ;ddq 0x00000000FFFF00000000000000000000 dq 0x0000000000000000, 0x00000000FFFF0000 ;ddq 0x0000FFFF000000000000000000000000 dq 0x0000000000000000, 0x0000FFFF00000000 ;ddq 0xFFFF0000000000000000000000000000 dq 0x0000000000000000, 0xFFFF000000000000 lane_1: dq 1 lane_2: dq 2 lane_3: dq 3 lane_4: dq 4 lane_5: dq 5 lane_6: dq 6 lane_7: dq 7 section .text %if 1 %ifdef LINUX %define arg1 rdi %define arg2 rsi %else %define arg1 rcx %define arg2 rdx %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbx, rdi, rbp %define idx rbp %define unused_lanes r9 %define lane_data r9 %define tmp2 r9 %define job_rax rax %define tmp1 rax %define size_offset rax %define tmp rax %define start_offset rax %define tmp3 arg1 %define extra_blocks arg2 %define p arg2 %define tmp4 r8 %endif ; we clobber rbp, called routine clobbers r12-r15 struc STACK _gpr_save: resq 5 _rsp_save: resq 1 endstruc %define APPEND(a,b) a %+ b ; JOB* flush_job_hmac_avx(MB_MGR_HMAC_SHA_1_OOO *state) ; arg 1 : rcx : state MKGLOBAL(flush_job_hmac_avx2,function,internal) flush_job_hmac_avx2: mov rax, rsp sub rsp, STACK_size and rsp, -32 ; align stack to 32 byte boundary mov [rsp + _gpr_save + 8*0], rbp mov [rsp + _gpr_save + 8*1], r12 mov [rsp + _gpr_save + 8*2], r13 mov [rsp + _gpr_save + 8*3], r14 mov [rsp + _gpr_save + 8*4], r15 mov [rsp + _rsp_save], rax mov unused_lanes, [state + _unused_lanes] bt unused_lanes, 32+3 jc return_null ; find a lane with a non-null job xor idx, idx %assign I 1 %rep 7 cmp qword [state + _ldata + (I * _HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0 cmovne idx, [rel APPEND(lane_,I)] %assign I (I+1) %endrep copy_lane_data: ; copy valid lane (idx) to empty lanes vmovdqa xmm0, [state + _lens] mov tmp, [state + _args_data_ptr + PTR_SZ*idx] %assign I 0 %rep 8 cmp qword [state + _ldata + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 jne APPEND(skip_,I) mov [state + _args_data_ptr + PTR_SZ*I], tmp vpor xmm0, xmm0, [rel len_masks + 16*I] APPEND(skip_,I): %assign I (I+1) %endrep vmovdqa [state + _lens], xmm0 vphminposuw xmm1, xmm0 vpextrw DWORD(len2), xmm1, 0 ; min value vpextrw DWORD(idx), 
xmm1, 1 ; min index (0...7) DBGPRINTL64 "FLUSH min_length", len2 DBGPRINTL64 "FLUSH min_length index ", idx cmp len2, 0 je len_is_0 vpbroadcastw xmm1, xmm1 DBGPRINTL_XMM "FLUSH lens after shuffle", xmm1 vpsubw xmm0, xmm0, xmm1 vmovdqa [state + _lens], xmm0 DBGPRINTL_XMM "FLUSH lens immediately after min subtraction", xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call sha1_x8_avx2 ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done], 1 mov DWORD(size_offset), [lane_data + _size_offset] mov qword [lane_data + _extra_block + size_offset], 0 mov word [state + _lens + 2*idx], 1 lea tmp, [lane_data + _outer_block] mov job, [lane_data + _job_in_lane] mov [state + _args_data_ptr + PTR_SZ*idx], tmp vmovd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1 vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2 vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3 vpshufb xmm0, xmm0, [rel byteswap] mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE] bswap DWORD(tmp) vmovdqa [lane_data + _outer_block], xmm0 mov [lane_data + _outer_block + 4*4], DWORD(tmp) mov tmp, [job + _auth_key_xor_opad] vmovdqu xmm0, [tmp] mov DWORD(tmp), [tmp + 4*4] vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3 mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp) jmp copy_lane_data align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset] mov [state + _lens + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr + PTR_SZ*idx], tmp mov dword [lane_data + _extra_blocks], 0 jmp copy_lane_data return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane] mov qword [lane_data + _job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC mov unused_lanes, [state + _unused_lanes] shl unused_lanes, 4 ;; a nibble or unused_lanes, idx mov [state + _unused_lanes], unused_lanes mov p, [job_rax + _auth_tag_output] ; copy 12 bytes mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] mov DWORD(tmp4), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE] mov DWORD(tmp3), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE] bswap DWORD(tmp2) bswap DWORD(tmp4) bswap DWORD(tmp3) mov [p + 0*4], DWORD(tmp2) mov [p + 1*4], DWORD(tmp4) mov [p + 2*4], DWORD(tmp3) return: mov rbp, [rsp + _gpr_save + 8*0] mov r12, [rsp + _gpr_save + 8*1] mov r13, [rsp + _gpr_save + 8*2] mov r14, [rsp + _gpr_save + 8*3] mov r15, [rsp + _gpr_save + 8*4] mov rsp, [rsp + _rsp_save] ret 
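Note on the flush scheduling above: flush_job_hmac_avx2 first copies one valid lane into every empty lane (forcing the empty lanes' lengths to 0xFFFF via len_masks so they never win the minimum search), then uses VPHMINPOSUW to pick the shortest outstanding lane, subtracts that length from all lanes and runs sha1_x8_avx2 for exactly that many blocks. The following is a minimal C sketch of the same scheduling idea, illustrative only and not part of the library; the names lens, NUM_LANES and run_sha1_x8 are assumptions introduced for this sketch.

#include <stdint.h>

#define NUM_LANES 8              /* SHA-1 lanes in the AVX2 manager */

static void run_sha1_x8(uint16_t num_blocks)
{
        /* stand-in for the sha1_x8_avx2 transform: processes num_blocks
         * 64-byte blocks on all eight lanes in parallel */
        (void)num_blocks;
}

void flush_step(uint16_t lens[NUM_LANES])
{
        unsigned int i, min_idx = 0;
        uint16_t min_len = lens[0];

        /* software equivalent of VPHMINPOSUW: minimum value and its index */
        for (i = 1; i < NUM_LANES; i++) {
                if (lens[i] < min_len) {
                        min_len = lens[i];
                        min_idx = i;
                }
        }

        if (min_len != 0) {
                /* advance every lane by the common amount of work */
                for (i = 0; i < NUM_LANES; i++)
                        lens[i] -= min_len;
                run_sha1_x8(min_len);
        }

        /* lane min_idx is now at length 0; the assembly continues with
         * proc_extra_blocks / proc_outer / end_loop for that lane */
        (void)min_idx;
}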
intel-ipsec-mb-0.48/avx2/mb_mgr_hmac_md5_flush_avx2.asm000066400000000000000000000245541321406316400230120ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" ;%define DO_DBGPRINT %include "dbgprint.asm" extern md5_x8x2_avx2 section .data default rel align 16 dupw: ;ddq 0x01000100010001000100010001000100 dq 0x0100010001000100, 0x0100010001000100 x80: ;ddq 0x00000000000000000000000000000080 dq 0x0000000000000080, 0x0000000000000000 x00: ;ddq 0x00000000000000000000000000000000 dq 0x0000000000000000, 0x0000000000000000 len_masks: ;ddq 0x0000000000000000000000000000FFFF dq 0x000000000000FFFF, 0x0000000000000000 ;ddq 0x000000000000000000000000FFFF0000 dq 0x00000000FFFF0000, 0x0000000000000000 ;ddq 0x00000000000000000000FFFF00000000 dq 0x0000FFFF00000000, 0x0000000000000000 ;ddq 0x0000000000000000FFFF000000000000 dq 0xFFFF000000000000, 0x0000000000000000 ;ddq 0x000000000000FFFF0000000000000000 dq 0x0000000000000000, 0x000000000000FFFF ;ddq 0x00000000FFFF00000000000000000000 dq 0x0000000000000000, 0x00000000FFFF0000 ;ddq 0x0000FFFF000000000000000000000000 dq 0x0000000000000000, 0x0000FFFF00000000 ;ddq 0xFFFF0000000000000000000000000000 dq 0x0000000000000000, 0xFFFF000000000000 lane_1: dq 1 lane_2: dq 2 lane_3: dq 3 lane_4: dq 4 lane_5: dq 5 lane_6: dq 6 lane_7: dq 7 lane_8: dq 8 lane_9: dq 9 lane_10: dq 10 lane_11: dq 11 lane_12: dq 12 lane_13: dq 13 lane_14: dq 14 lane_15: dq 15 section .text %if 1 %ifdef LINUX %define arg1 rdi %define arg2 rsi %else %define arg1 rcx %define arg2 rdx %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbp %define idx rbp %define unused_lanes rbx %define lane_data rbx %define tmp2 rbx %define job_rax rax %define tmp1 rax %define size_offset rax %define tmp rax %define start_offset rax %define tmp3 arg1 %define extra_blocks arg2 %define p arg2 %define tmp4 r8 %define num_lanes_inuse r12 %define len_upper r13 %define idx_upper r14 %endif ; This 
routine and/or the called routine clobbers all GPRs struc STACK _gpr_save: resq 8 _rsp_save: resq 1 endstruc %define APPEND(a,b) a %+ b ; JOB* flush_job_hmac_md5_avx(MB_MGR_HMAC_MD5_OOO *state) ; arg 1 : rcx : state MKGLOBAL(flush_job_hmac_md5_avx2,function,internal) flush_job_hmac_md5_avx2: mov rax, rsp sub rsp, STACK_size and rsp, -32 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _gpr_save + 8*2], r12 mov [rsp + _gpr_save + 8*3], r13 mov [rsp + _gpr_save + 8*4], r14 mov [rsp + _gpr_save + 8*5], r15 %ifndef LINUX mov [rsp + _gpr_save + 8*6], rsi mov [rsp + _gpr_save + 8*7], rdi %endif mov [rsp + _rsp_save], rax ; original SP DBGPRINTL "---------- enter md5 flush -----------" mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse_md5] ;; empty? cmp num_lanes_inuse, 0 jz return_null ; find a lane with a non-null job -- flush does not have to be efficient! mov idx, 0 %assign I 1 %rep 15 cmp qword [state + _ldata_md5 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 cmovne idx, [rel APPEND(lane_,I)] %assign I (I+1) %endrep copy_lane_data: ; copy good lane (idx) to empty lanes mov tmp, [state + _args_data_ptr_md5 + PTR_SZ*idx] ;; tackle lower 8 lanes vmovdqa xmm0, [state + _lens_md5 + 0*16] ;; lower 8 lengths %assign I 0 %rep 8 cmp qword [state + _ldata_md5 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 jne APPEND(lower_skip_,I) mov [state + _args_data_ptr_md5 + PTR_SZ*I], tmp vpor xmm0, xmm0, [rel len_masks + 16*I] APPEND(lower_skip_,I): %assign I (I+1) %endrep ;; tackle upper lanes vmovdqa xmm1, [state + _lens_md5 + 1*16] ;; upper 8 lengths %assign I 0 %rep 8 cmp qword [state + _ldata_md5 + (8 + I) * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 jne APPEND(upper_skip_,I) mov [state + _args_data_ptr_md5 + PTR_SZ*(8+I)], tmp vpor xmm1, xmm1, [rel len_masks + 16*I] APPEND(upper_skip_,I): %assign I (I+1) %endrep jmp start_loop0 align 32 start_loop0: ; Find min length vphminposuw xmm2, xmm0 vpextrw DWORD(len2), xmm2, 0 ; min value vpextrw DWORD(idx), xmm2, 1 ; min index (0...7) vphminposuw xmm3, xmm1 vpextrw DWORD(len_upper), xmm3, 0 ; min value vpextrw DWORD(idx_upper), xmm3, 1 ; min index (8...F) cmp len2, len_upper jle use_min min_in_high: vmovdqa xmm2, xmm3 mov len2, len_upper mov idx, idx_upper or idx, 0x8 ; to reflect that index in 8-F use_min: and len2, len2 ; to set flags jz len_is_0 DBGPRINTL64 "min_length min_index ", len2, idx DBGPRINTL_XMM "FLUSH md5 lens before sub lower", xmm0 vpbroadcastw xmm2, xmm2 ; duplicate words across all lanes vpsubw xmm0, xmm0, xmm2 DBGPRINTL_XMM "FLUSH md5 lens after sub lower", xmm0 vmovdqa [state + _lens_md5 + 0*16], xmm0 vpsubw xmm1, xmm1, xmm2 DBGPRINTL_XMM "FLUSH md5 lens after sub upper", xmm1 vmovdqa [state + _lens_md5 + 1*16], xmm1 ; "state" and "args" are the same address, arg1 ; len is arg2 call md5_x8x2_avx2 ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata_md5 + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done], 1 mov DWORD(size_offset), [lane_data + _size_offset] mov qword [lane_data + _extra_block + size_offset], 0 mov word [state + _lens_md5 + 2*idx], 1 lea tmp, [lane_data + _outer_block] mov job, [lane_data + _job_in_lane] mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp vmovd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] 
vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1 vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2 vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3 vmovdqa [lane_data + _outer_block], xmm0 mov tmp, [job + _auth_key_xor_opad] vmovdqu xmm0, [tmp] vmovd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0 vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1 vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2 vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3 jmp copy_lane_data align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset] mov [state + _lens_md5 + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp mov dword [lane_data + _extra_blocks], 0 jmp copy_lane_data return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane] mov qword [lane_data + _job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC mov unused_lanes, [state + _unused_lanes_md5] shl unused_lanes, 4 or unused_lanes, idx mov [state + _unused_lanes_md5], unused_lanes mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse_md5] ;; update lanes inuse sub num_lanes_inuse, 1 mov [state + _num_lanes_inuse_md5], DWORD(num_lanes_inuse) mov p, [job_rax + _auth_tag_output] ; copy 12 bytes mov DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] mov DWORD(tmp4), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE] mov DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE] ; bswap DWORD(tmp2) ; bswap DWORD(tmp4) ; bswap DWORD(tmp3) mov [p + 0*4], DWORD(tmp2) mov [p + 1*4], DWORD(tmp4) mov [p + 2*4], DWORD(tmp3) return: DBGPRINTL "---------- exit md5 flush -----------" mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov r12, [rsp + _gpr_save + 8*2] mov r13, [rsp + _gpr_save + 8*3] mov r14, [rsp + _gpr_save + 8*4] mov r15, [rsp + _gpr_save + 8*5] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*6] mov rdi, [rsp + _gpr_save + 8*7] %endif mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/avx2/mb_mgr_hmac_md5_submit_avx2.asm000066400000000000000000000264121321406316400231670ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. 
;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "memcpy.asm" %include "reg_sizes.asm" ;%define DO_DBGPRINT %include "dbgprint.asm" extern md5_x8x2_avx2 %if 1 %ifdef LINUX %define arg1 rdi %define arg2 rsi %define reg3 rcx %define reg4 rdx %else %define arg1 rcx %define arg2 rdx %define reg3 rdi %define reg4 rsi %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbp %define last_len rbp %define idx rbp %define p r11 %define start_offset r11 %define unused_lanes rbx %define tmp4 rbx %define job_rax rax %define len rax %define size_offset reg3 %define tmp2 reg3 %define lane reg4 %define tmp3 reg4 %define extra_blocks r8 %define tmp r9 %define p2 r9 %define lane_data r10 %define num_lanes_inuse r12 %define len_upper r13 %define idx_upper r14 %endif ; This routine and/or the called routine clobbers all GPRs struc STACK _gpr_save: resq 8 _rsp_save: resq 1 endstruc ; JOB* submit_job_hmac_md5_avx(MB_MGR_HMAC_MD5_OOO *state, JOB_AES_HMAC *job) ; arg 1 : rcx : state ; arg 2 : rdx : job MKGLOBAL(submit_job_hmac_md5_avx2,function,internal) submit_job_hmac_md5_avx2: mov rax, rsp sub rsp, STACK_size and rsp, -32 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _gpr_save + 8*2], r12 mov [rsp + _gpr_save + 8*3], r13 mov [rsp + _gpr_save + 8*4], r14 mov [rsp + _gpr_save + 8*5], r15 %ifndef LINUX mov [rsp + _gpr_save + 8*6], rsi mov [rsp + _gpr_save + 8*7], rdi %endif mov [rsp + _rsp_save], rax ; original SP DBGPRINTL "---------- enter md5 submit -----------" mov unused_lanes, [state + _unused_lanes_md5] mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse_md5] mov lane, unused_lanes and lane, 0xF shr unused_lanes, 4 mov [state + _unused_lanes_md5], unused_lanes add num_lanes_inuse, 1 mov [state + _num_lanes_inuse_md5], DWORD(num_lanes_inuse) DBGPRINTL64 "SUBMIT ********** num_lanes_in_use", num_lanes_inuse imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata_md5 + lane_data] mov len, [job + _msg_len_to_hash_in_bytes] mov tmp, len shr tmp, 6 ; divide by 64, len in terms of blocks DBGPRINTL64 "SUBMIT job len, num_blks ", len, tmp mov [lane_data + _job_in_lane], job mov dword [lane_data + _outer_done], 0 mov [state + _lens_md5 + 2*lane], WORD(tmp) mov last_len, len and last_len, 63 lea extra_blocks, [last_len + 9 + 63] shr extra_blocks, 6 mov [lane_data + _extra_blocks], DWORD(extra_blocks) mov p, [job + _src] add p, [job + _hash_start_src_offset_in_bytes] mov [state + _args_data_ptr_md5 + PTR_SZ*lane], p cmp len, 64 jb copy_lt64 fast_copy: add p, len vmovdqu ymm0, [p - 64 + 0 * 32] vmovdqu ymm1, [p - 64 + 1 * 32] vmovdqu [lane_data + _extra_block + 0*32], ymm0 vmovdqu [lane_data + _extra_block + 1*32], ymm1 
end_fast_copy: mov size_offset, extra_blocks shl size_offset, 6 sub size_offset, last_len add size_offset, 64-8 mov [lane_data + _size_offset], DWORD(size_offset) mov start_offset, 64 sub start_offset, last_len mov [lane_data + _start_offset], DWORD(start_offset) lea tmp, [8*64 + 8*len] ; bswap tmp mov [lane_data + _extra_block + size_offset], tmp mov tmp, [job + _auth_key_xor_ipad] vmovdqu xmm0, [tmp] vmovd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 0*MD5_DIGEST_ROW_SIZE], xmm0 vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1 vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2 vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3 test len, ~63 jnz ge64_bytes lt64_bytes: mov [state + _lens_md5 + 2*lane], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr_md5 + PTR_SZ*lane], tmp mov dword [lane_data + _extra_blocks], 0 ge64_bytes: DBGPRINTL64 "SUBMIT md5 all lanes loaded? ********** num_lanes_in_use", num_lanes_inuse cmp num_lanes_inuse, 0x10 ; all 16 lanes loaded? jne return_null jmp start_loop align 16 start_loop: ; Find min length vmovdqa xmm0, [state + _lens_md5] vphminposuw xmm1, xmm0 vpextrw DWORD(len2), xmm1, 0 ; min value vpextrw DWORD(idx), xmm1, 1 ; min index (0...7) vmovdqa xmm2, [state + _lens_md5 + 1*16] ;; second 8 lengths vphminposuw xmm3, xmm2 vpextrw DWORD(len_upper), xmm3, 0 ; min value vpextrw DWORD(idx_upper), xmm3, 1 ; min index (8...F) cmp len2, len_upper jle use_min min_in_high: vmovdqa xmm1, xmm3 mov len2, len_upper mov idx, idx_upper ;; idx retrieved would be [0-7] or idx, 0x8 ;; to reflect that index in 8-F use_min: cmp len2, 0 je len_is_0 DBGPRINTL64 "min_length min_index ", len2, idx vpbroadcastw xmm1, xmm1 ; duplicate words across all lanes vpsubw xmm0, xmm0, xmm1 vmovdqa [state + _lens_md5 + 0*16], xmm0 DBGPRINTL_XMM "SUBMIT lens after sub lower", xmm0 vpsubw xmm2, xmm2, xmm1 vmovdqa [state + _lens_md5 + 1*16], xmm2 DBGPRINTL_XMM "SUBMIT lens after sub upper", xmm2 ; "state" and "args" are the same address, arg1 ; len is arg2 call md5_x8x2_avx2 ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata_md5 + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done], 1 mov DWORD(size_offset), [lane_data + _size_offset] mov qword [lane_data + _extra_block + size_offset], 0 mov word [state + _lens_md5 + 2*idx], 1 lea tmp, [lane_data + _outer_block] mov job, [lane_data + _job_in_lane] mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp vmovd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1 vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2 vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3 vmovdqa [lane_data + _outer_block], xmm0 mov tmp, [job + _auth_key_xor_opad] vmovdqu xmm0, [tmp] vmovd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0 vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1 vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 
2*MD5_DIGEST_ROW_SIZE], xmm0, 2 vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3 jmp start_loop align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset] mov [state + _lens_md5 + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp mov dword [lane_data + _extra_blocks], 0 jmp start_loop align 16 copy_lt64: ;; less than one message block of data ;; beginning of source block ;; destination extrablock but backwards by len from where 0x80 pre-populated ;; p2 clobbers unused_lanes, undo before exiting lea p2, [lane_data + _extra_block + 64] sub p2, len memcpy_avx2_64_1 p2, p, len, tmp4, tmp2, ymm0, ymm1 mov unused_lanes, [state + _unused_lanes_md5] jmp end_fast_copy return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane] mov unused_lanes, [state + _unused_lanes_md5] mov qword [lane_data + _job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC shl unused_lanes, 4 or unused_lanes, idx mov [state + _unused_lanes_md5], unused_lanes mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse_md5] sub num_lanes_inuse, 1 mov [state + _num_lanes_inuse_md5], DWORD(num_lanes_inuse) mov p, [job_rax + _auth_tag_output] ; copy 12 bytes mov DWORD(tmp), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] mov DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE] mov DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE] mov [p + 0*4], DWORD(tmp) mov [p + 1*4], DWORD(tmp2) mov [p + 2*4], DWORD(tmp3) return: DBGPRINTL "---------- exit md5 submit -----------" mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov r12, [rsp + _gpr_save + 8*2] mov r13, [rsp + _gpr_save + 8*3] mov r14, [rsp + _gpr_save + 8*4] mov r15, [rsp + _gpr_save + 8*5] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*6] mov rdi, [rsp + _gpr_save + 8*7] %endif mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/avx2/mb_mgr_hmac_sha_224_flush_avx2.asm000066400000000000000000000032241321406316400234560ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %define FUNC flush_job_hmac_sha_224_avx2 %define SHA224 %include "mb_mgr_hmac_sha_256_flush_avx2.asm" intel-ipsec-mb-0.48/avx2/mb_mgr_hmac_sha_224_submit_avx2.asm000066400000000000000000000032261321406316400236420ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %define FUNC submit_job_hmac_sha_224_avx2 %define SHA224 %include "mb_mgr_hmac_sha_256_submit_avx2.asm" intel-ipsec-mb-0.48/avx2/mb_mgr_hmac_sha_256_flush_avx2.asm000066400000000000000000000213621321406316400234660ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. 
;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" extern sha256_oct_avx2 section .data default rel align 16 byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 dq 0x0405060700010203, 0x0c0d0e0f08090a0b len_masks: ;ddq 0x0000000000000000000000000000FFFF dq 0x000000000000FFFF, 0x0000000000000000 ;ddq 0x000000000000000000000000FFFF0000 dq 0x00000000FFFF0000, 0x0000000000000000 ;ddq 0x00000000000000000000FFFF00000000 dq 0x0000FFFF00000000, 0x0000000000000000 ;ddq 0x0000000000000000FFFF000000000000 dq 0xFFFF000000000000, 0x0000000000000000 ;ddq 0x000000000000FFFF0000000000000000 dq 0x0000000000000000, 0x000000000000FFFF ;ddq 0x00000000FFFF00000000000000000000 dq 0x0000000000000000, 0x00000000FFFF0000 ;ddq 0x0000FFFF000000000000000000000000 dq 0x0000000000000000, 0x0000FFFF00000000 ;ddq 0xFFFF0000000000000000000000000000 dq 0x0000000000000000, 0xFFFF000000000000 lane_1: dq 1 lane_2: dq 2 lane_3: dq 3 lane_4: dq 4 lane_5: dq 5 lane_6: dq 6 lane_7: dq 7 section .text %ifndef FUNC %define FUNC flush_job_hmac_sha_256_avx2 %endif %if 1 %ifdef LINUX %define arg1 rdi %define arg2 rsi %define reg3 rdx %else %define arg1 rcx %define arg2 rdx %define reg3 rsi %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbp, r15 %define idx rbp %define unused_lanes r10 %define tmp5 r10 %define lane_data rbx %define tmp2 rbx %define job_rax rax %define tmp1 rax %define size_offset rax %define start_offset rax %define tmp3 arg1 %define extra_blocks arg2 %define p arg2 %define tmp4 reg3 %define tmp r9 %endif ; we clobber rsi, rbp; called routine also clobbers rbx, rdi, r12, r13, r14 struc STACK _gpr_save: resq 7 _rsp_save: resq 1 endstruc %define APPEND(a,b) a %+ b ; JOB* FUNC(MB_MGR_HMAC_SHA_256_OOO *state) ; arg 1 : state MKGLOBAL(FUNC,function,internal) FUNC: mov rax, rsp sub rsp, STACK_size and rsp, -32 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _gpr_save + 8*2], r12 mov [rsp + _gpr_save + 8*3], r13 mov [rsp + _gpr_save + 8*4], r14 %ifndef LINUX mov [rsp + _gpr_save + 8*5], rsi mov [rsp + _gpr_save + 8*6], rdi %endif mov [rsp + _rsp_save], rax ; original SP ; if bit (32+3) is set, then all lanes are empty mov unused_lanes, [state + _unused_lanes_sha256] bt unused_lanes, 32+3 jc return_null ; find a lane with a non-null job xor idx, idx %assign I 1 %rep 7 cmp qword [state + _ldata_sha256 + (I * _HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0 cmovne idx, [rel APPEND(lane_,I)] %assign I (I+1) %endrep copy_lane_data: ; copy idx to empty lanes vmovdqa xmm0, [state + _lens_sha256] mov tmp, [state + _args_data_ptr_sha256 + 8*idx] %assign I 0 %rep 8 cmp qword [state + _ldata_sha256 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 
jne APPEND(skip_,I) mov [state + _args_data_ptr_sha256 + 8*I], tmp vpor xmm0, xmm0, [rel len_masks + 16*I] APPEND(skip_,I): %assign I (I+1) %endrep vmovdqa [state + _lens_sha256 ], xmm0 vphminposuw xmm1, xmm0 vpextrw DWORD(len2), xmm1, 0 ; min value vpextrw DWORD(idx), xmm1, 1 ; min index (0...7) cmp len2, 0 je len_is_0 vpbroadcastw xmm1, xmm1 ; duplicate words across all lanes vpsubw xmm0, xmm0, xmm1 vmovdqa [state + _lens_sha256], xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call sha256_oct_avx2 ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata_sha256 + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done], 1 mov DWORD(size_offset), [lane_data + _size_offset] mov qword [lane_data + _extra_block + size_offset], 0 mov word [state + _lens_sha256 + 2*idx], 1 lea tmp, [lane_data + _outer_block] mov job, [lane_data + _job_in_lane] mov [state + _args_data_ptr_sha256 + 8*idx], tmp vmovd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1 vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2 vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3 vpshufb xmm0, xmm0, [rel byteswap] vmovd xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE] vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1 vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2 %ifndef SHA224 vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3 %endif vpshufb xmm1, xmm1, [rel byteswap] vmovdqa [lane_data + _outer_block], xmm0 vmovdqa [lane_data + _outer_block + 4*4], xmm1 %ifdef SHA224 mov dword [lane_data + _outer_block + 7*4], 0x80 %endif mov tmp, [job + _auth_key_xor_opad] vmovdqu xmm0, [tmp] vmovdqu xmm1, [tmp + 4*4] vmovd [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0 vpextrd [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1 vpextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2 vpextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3 vmovd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1 vpextrd [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1 vpextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2 vpextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3 jmp copy_lane_data align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset] mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr_sha256 + 8*idx], tmp mov dword [lane_data + _extra_blocks], 0 jmp copy_lane_data return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane] mov qword [lane_data + _job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC mov unused_lanes, [state + _unused_lanes_sha256] shl unused_lanes, 4 or unused_lanes, idx mov [state + _unused_lanes_sha256], unused_lanes mov p, [job_rax + _auth_tag_output] ; copy SHA224=14bytes and SHA256=16bytes mov 
DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE] mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE] mov DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE] bswap DWORD(tmp) bswap DWORD(tmp2) bswap DWORD(tmp4) bswap DWORD(tmp5) mov [p + 0*4], DWORD(tmp) mov [p + 1*4], DWORD(tmp2) mov [p + 2*4], DWORD(tmp4) %ifdef SHA224 mov [p + 3*4], WORD(tmp5) %else mov [p + 3*4], DWORD(tmp5) %endif return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov r12, [rsp + _gpr_save + 8*2] mov r13, [rsp + _gpr_save + 8*3] mov r14, [rsp + _gpr_save + 8*4] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*5] mov rdi, [rsp + _gpr_save + 8*6] %endif mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/avx2/mb_mgr_hmac_sha_256_submit_avx2.asm000066400000000000000000000243241321406316400236510ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
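Note on the submit path that follows: it stages the message tail in the lane's extra_block buffer using last_len = len & 63, extra_blocks = (last_len + 9 + 63) >> 6, start_offset = 64 - last_len, and a size_offset that places the 64-bit big-endian bit count in the last 8 bytes of the staged blocks. Below is a C sketch of that layout arithmetic, assuming SHA-256 (64-byte blocks, 8-byte length field, and the extra 64-byte ipad block counted in the length) and a GCC-style __builtin_bswap64; the struct and function names are illustrative only, not library code.

#include <stdint.h>

struct tail_layout {
        uint64_t extra_blocks;  /* whole 64-byte blocks staged in extra_block[] */
        uint64_t start_offset;  /* offset where lane processing resumes */
        uint64_t size_offset;   /* offset of the 64-bit length field */
        uint64_t bit_len_be;    /* value written at size_offset (big endian) */
};

struct tail_layout sha256_tail_layout(uint64_t len)
{
        struct tail_layout t;
        uint64_t last_len = len & 63;

        /* tail bytes + 0x80 marker + 8-byte length, rounded up to blocks */
        t.extra_blocks = (last_len + 9 + 63) >> 6;
        /* the tail is copied so that it ends exactly at extra_block[64] */
        t.start_offset = 64 - last_len;
        /* i.e. the last 8 bytes of the extra_blocks staged blocks */
        t.size_offset  = t.extra_blocks * 64 - last_len + 64 - 8;
        /* bit count covers the ipad block plus the message, stored big endian
         * (the MD5 submit path stores the same count little endian instead) */
        t.bit_len_be   = __builtin_bswap64((64 + len) * 8);

        return t;
}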
;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" %include "memcpy.asm" extern sha256_oct_avx2 section .data default rel align 16 byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 dq 0x0405060700010203, 0x0c0d0e0f08090a0b section .text %ifndef FUNC %define FUNC submit_job_hmac_sha_256_avx2 %endif %if 1 %ifdef LINUX %define arg1 rdi %define arg2 rsi %define reg3 rcx %define reg4 rdx %else %define arg1 rcx %define arg2 rdx %define reg3 rdi %define reg4 rsi %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbp, r15 %define last_len rbp %define idx rbp %define p r11 %define start_offset r11 %define unused_lanes rbx %define p2 rbx %define tmp4 rbx %define job_rax rax %define len rax %define size_offset reg3 %define tmp2 reg3 %define lane reg4 %define tmp3 reg4 %define extra_blocks r8 %define tmp r9 %define lane_data r10 %endif ; we clobber rbx, rsi, rdi, rbp; called routine also clobbers r12, r13, r14 struc STACK _gpr_save: resq 7 _rsp_save: resq 1 endstruc ; JOB* FUNC(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job) ; arg 1 : rcx : state ; arg 2 : rdx : job MKGLOBAL(FUNC,function,internal) FUNC: mov rax, rsp sub rsp, STACK_size and rsp, -32 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _gpr_save + 8*2], r12 mov [rsp + _gpr_save + 8*3], r13 mov [rsp + _gpr_save + 8*4], r14 %ifndef LINUX mov [rsp + _gpr_save + 8*5], rsi mov [rsp + _gpr_save + 8*6], rdi %endif mov [rsp + _rsp_save], rax ; original SP mov unused_lanes, [state + _unused_lanes_sha256] mov lane, unused_lanes and lane, 0xF ;; just a nibble shr unused_lanes, 4 imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata_sha256 + lane_data] mov [state + _unused_lanes_sha256], unused_lanes mov len, [job + _msg_len_to_hash_in_bytes] mov tmp, len shr tmp, 6 ; divide by 64, len in terms of blocks mov [lane_data + _job_in_lane], job mov dword [lane_data + _outer_done], 0 mov [state + _lens_sha256 + 2*lane], WORD(tmp) mov last_len, len and last_len, 63 lea extra_blocks, [last_len + 9 + 63] shr extra_blocks, 6 mov [lane_data + _extra_blocks], DWORD(extra_blocks) mov p, [job + _src] add p, [job + _hash_start_src_offset_in_bytes] mov [state + _args_data_ptr_sha256 + 8*lane], p cmp len, 64 jb copy_lt64 fast_copy: add p, len vmovdqu ymm0, [p - 64 + 0 * 32] vmovdqu ymm1, [p - 64 + 1 * 32] vmovdqu [lane_data + _extra_block + 0*32], ymm0 vmovdqu [lane_data + _extra_block + 1*32], ymm1 end_fast_copy: mov size_offset, extra_blocks shl size_offset, 6 sub size_offset, last_len add size_offset, 64-8 mov [lane_data + _size_offset], DWORD(size_offset) mov start_offset, 64 sub start_offset, last_len mov [lane_data + _start_offset], DWORD(start_offset) lea tmp, [8*64 + 8*len] bswap tmp mov [lane_data + _extra_block + size_offset], tmp mov tmp, [job + _auth_key_xor_ipad] vmovdqu xmm0, [tmp] vmovdqu xmm1, [tmp + 4*4] vmovd [state + _args_digest_sha256 + 4*lane + 0*SHA256_DIGEST_ROW_SIZE], xmm0 vpextrd [state + _args_digest_sha256 + 4*lane + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1 vpextrd [state + _args_digest_sha256 + 4*lane + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2 vpextrd [state + _args_digest_sha256 + 4*lane + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3 vmovd [state + _args_digest_sha256 + 4*lane + 4*SHA256_DIGEST_ROW_SIZE], xmm1 vpextrd [state + _args_digest_sha256 + 4*lane + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1 vpextrd [state + _args_digest_sha256 + 4*lane + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2 vpextrd [state + _args_digest_sha256 + 4*lane + 
7*SHA256_DIGEST_ROW_SIZE], xmm1, 3 test len, ~63 jnz ge64_bytes lt64_bytes: mov [state + _lens_sha256 + 2*lane], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr_sha256 + 8*lane], tmp mov dword [lane_data + _extra_blocks], 0 ge64_bytes: cmp unused_lanes, 0xf jne return_null jmp start_loop align 16 start_loop: ; Find min length vmovdqa xmm0, [state + _lens_sha256] vphminposuw xmm1, xmm0 vpextrw DWORD(len2), xmm1, 0 ; min value vpextrw DWORD(idx), xmm1, 1 ; min index (0...7) cmp len2, 0 je len_is_0 vpbroadcastw xmm1, xmm1 ; duplicate words across all lanes vpsubw xmm0, xmm0, xmm1 vmovdqa [state + _lens_sha256], xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call sha256_oct_avx2 ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata_sha256 + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done], 1 mov DWORD(size_offset), [lane_data + _size_offset] mov qword [lane_data + _extra_block + size_offset], 0 mov word [state + _lens_sha256 + 2*idx], 1 lea tmp, [lane_data + _outer_block] mov job, [lane_data + _job_in_lane] mov [state + _args_data_ptr_sha256 + 8*idx], tmp vmovd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1 vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2 vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3 vpshufb xmm0, xmm0, [rel byteswap] vmovd xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE] vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1 vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2 %ifndef SHA224 vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3 %endif vpshufb xmm1, xmm1, [rel byteswap] vmovdqa [lane_data + _outer_block], xmm0 vmovdqa [lane_data + _outer_block + 4*4], xmm1 %ifdef SHA224 mov dword [lane_data + _outer_block + 7*4], 0x80 %endif mov tmp, [job + _auth_key_xor_opad] vmovdqu xmm0, [tmp] vmovdqu xmm1, [tmp + 4*4] vmovd [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0 vpextrd [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1 vpextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2 vpextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3 vmovd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1 vpextrd [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1 vpextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2 vpextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3 jmp start_loop align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset] mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr_sha256 + 8*idx], tmp mov dword [lane_data + _extra_blocks], 0 jmp start_loop align 16 copy_lt64: ;; less than one message block of data ;; beginning of source block ;; destination extrablock but backwards by len from where 0x80 pre-populated lea p2, [lane_data + _extra_block + 
64] sub p2, len memcpy_avx2_64_1 p2, p, len, tmp, tmp2, ymm0, ymm1 mov unused_lanes, [state + _unused_lanes_sha256] jmp end_fast_copy return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane] mov unused_lanes, [state + _unused_lanes_sha256] mov qword [lane_data + _job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC shl unused_lanes, 4 or unused_lanes, idx mov [state + _unused_lanes_sha256], unused_lanes mov p, [job_rax + _auth_tag_output] ; copy 14 bytes for SHA224 and 16 bytes for SHA256 mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE] mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE] mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE] bswap DWORD(tmp) bswap DWORD(tmp2) bswap DWORD(tmp3) bswap DWORD(tmp4) mov [p + 0*4], DWORD(tmp) mov [p + 1*4], DWORD(tmp2) mov [p + 2*4], DWORD(tmp3) %ifdef SHA224 mov [p + 3*4], WORD(tmp4) %else mov [p + 3*4], DWORD(tmp4) %endif return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov r12, [rsp + _gpr_save + 8*2] mov r13, [rsp + _gpr_save + 8*3] mov r14, [rsp + _gpr_save + 8*4] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*5] mov rdi, [rsp + _gpr_save + 8*6] %endif mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/avx2/mb_mgr_hmac_sha_384_flush_avx2.asm000066400000000000000000000032431321406316400234660ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
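Note on the two short wrapper files that follow: they reuse the SHA-512 flush and submit bodies for SHA-384 simply by pre-defining FUNC and SHA_X_DIGEST_SIZE before the %include, and inside the shared body %if (SHA_X_DIGEST_SIZE != 384) guards skip the fourth 64-bit word so the copied tag is 24 bytes instead of 32. A rough C analogue of that compile-time specialization, purely illustrative and not library code (the real assembly also byte-swaps each word to big endian, which is omitted here for brevity):

#include <string.h>
#include <stdint.h>

#ifndef SHA_X_DIGEST_SIZE
#define SHA_X_DIGEST_SIZE 512   /* a SHA-384 wrapper would define 384 first */
#endif

/* copy the truncated HMAC tag: 32 bytes for SHA-512, 24 bytes for SHA-384
 * (the fourth digest word is dropped by the preprocessor guard) */
void copy_hmac_tag(uint8_t *auth_tag_output, const uint64_t digest[8])
{
        memcpy(auth_tag_output, digest, 3 * sizeof(uint64_t));
#if (SHA_X_DIGEST_SIZE != 384)
        memcpy(auth_tag_output + 3 * sizeof(uint64_t), &digest[3],
               sizeof(uint64_t));
#endif
}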
;; %define FUNC flush_job_hmac_sha_384_avx2 %define SHA_X_DIGEST_SIZE 384 %include "mb_mgr_hmac_sha_512_flush_avx2.asm" intel-ipsec-mb-0.48/avx2/mb_mgr_hmac_sha_384_submit_avx2.asm000066400000000000000000000032451321406316400236520ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %define FUNC submit_job_hmac_sha_384_avx2 %define SHA_X_DIGEST_SIZE 384 %include "mb_mgr_hmac_sha_512_submit_avx2.asm" intel-ipsec-mb-0.48/avx2/mb_mgr_hmac_sha_512_flush_avx2.asm000066400000000000000000000166551321406316400234720ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" extern sha512_x4_avx2 section .data default rel align 16 byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607 dq 0x0001020304050607, 0x08090a0b0c0d0e0f len_masks: ;ddq 0x0000000000000000000000000000FFFF dq 0x000000000000FFFF, 0x0000000000000000 ;ddq 0x000000000000000000000000FFFF0000 dq 0x00000000FFFF0000, 0x0000000000000000 ;ddq 0x00000000000000000000FFFF00000000 dq 0x0000FFFF00000000, 0x0000000000000000 ;ddq 0x0000000000000000FFFF000000000000 dq 0xFFFF000000000000, 0x0000000000000000 lane_1: dq 1 lane_2: dq 2 lane_3: dq 3 section .text %ifndef FUNC %define FUNC flush_job_hmac_sha_512_avx2 %define SHA_X_DIGEST_SIZE 512 %endif %if 1 %ifdef LINUX %define arg1 rdi %define arg2 rsi %else %define arg1 rcx %define arg2 rdx %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbp, r15 %define idx rbp %define unused_lanes rbx %define lane_data rbx %define tmp2 rbx %define job_rax rax %define tmp1 rax %define size_offset rax %define tmp rax %define start_offset rax %define tmp3 arg1 %define extra_blocks arg2 %define p arg2 %define tmp4 r8 %define tmp5 r9 %define tmp6 r10 %endif ; we clobber rbx, rbp; called routine also clobbers r12 struc STACK _gpr_save: resq 3 _rsp_save: resq 1 endstruc %define APPEND(a,b) a %+ b ; JOB* FUNC(MB_MGR_HMAC_SHA_512_OOO *state) ; arg 1 : rcx : state MKGLOBAL(FUNC,function,internal) FUNC: mov rax, rsp sub rsp, STACK_size and rsp, -32 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _gpr_save + 8*2], r12 mov [rsp + _rsp_save], rax ; original SP mov unused_lanes, [state + _unused_lanes_sha512] bt unused_lanes, 32+7 jc return_null ; find a lane with a non-null job xor idx, idx %assign I 1 %rep 3 cmp qword [state + _ldata_sha512 + 1 * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0 cmovne idx, [rel APPEND(lane_, I)] %assign I (I+1) %endrep copy_lane_data: ; copy good lane (idx) to empty lanes vmovdqa xmm0, [state + _lens_sha512] mov tmp, [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*idx] %assign I 0 %rep 4 cmp qword [state + _ldata_sha512 + I * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0 jne APPEND(skip_,I) mov [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*I], tmp vpor xmm0, xmm0, [rel len_masks + 16*I] APPEND(skip_,I): %assign I (I+1) %endrep vmovdqa [state + _lens_sha512], xmm0 vphminposuw xmm1, xmm0 vpextrw DWORD(len2), xmm1, 0 ; min value vpextrw DWORD(idx), xmm1, 1 ; min index (0...3) cmp len2, 0 je len_is_0 vpshuflw xmm1, xmm1, 0x00 vpsubw xmm0, xmm0, xmm1 vmovdqa [state + _lens_sha512], xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call sha512_x4_avx2 ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _SHA512_LANE_DATA_size lea lane_data, [state + _ldata_sha512 + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + 
_outer_done_sha512], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done_sha512], 1 mov DWORD(size_offset), [lane_data + _size_offset_sha512] mov qword [lane_data + _extra_block_sha512 + size_offset], 0 mov word [state + _lens_sha512 + 2*idx], 1 lea tmp, [lane_data + _outer_block_sha512] mov job, [lane_data + _job_in_lane_sha512] mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp ; move digest into data location %assign I 0 %rep (SHA_X_DIGEST_SIZE / (8*16)) vmovq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE] vpinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], 1 vpshufb xmm0, [rel byteswap] vmovdqa [lane_data + _outer_block_sha512 + I*2*SHA512_DIGEST_WORD_SIZE], xmm0 %assign I (I+1) %endrep ; move the opad key into digest mov tmp, [job + _auth_key_xor_opad] %assign I 0 %rep 4 vmovdqu xmm0, [tmp + I * 16] vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 0)*SHA512_DIGEST_ROW_SIZE], xmm0 vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1 %assign I (I+1) %endrep jmp copy_lane_data align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset_sha512] mov [state + _lens_sha512 + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block_sha512 + start_offset] mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp mov dword [lane_data + _extra_blocks_sha512], 0 jmp copy_lane_data return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane_sha512] mov qword [lane_data + _job_in_lane_sha512], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC mov unused_lanes, [state + _unused_lanes_sha512] shl unused_lanes, 8 or unused_lanes, idx mov [state + _unused_lanes_sha512], unused_lanes mov p, [job_rax + _auth_tag_output] ; below is the code for both SHA512 & SHA384. SHA512=32 bytes and SHA384=24 bytes mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE] mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE] mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE] %if (SHA_X_DIGEST_SIZE != 384) mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE] %endif bswap QWORD(tmp2) bswap QWORD(tmp4) bswap QWORD(tmp6) %if (SHA_X_DIGEST_SIZE != 384) bswap QWORD(tmp5) %endif mov [p + 0*8], QWORD(tmp2) mov [p + 1*8], QWORD(tmp4) mov [p + 2*8], QWORD(tmp6) %if (SHA_X_DIGEST_SIZE != 384) mov [p + 3*8], QWORD(tmp5) %endif return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov r12, [rsp + _gpr_save + 8*2] mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/avx2/mb_mgr_hmac_sha_512_submit_avx2.asm000066400000000000000000000224161321406316400236440ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. 
;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" %include "memcpy.asm" extern sha512_x4_avx2 section .data default rel align 16 byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607 dq 0x0001020304050607, 0x08090a0b0c0d0e0f section .text %ifndef FUNC %define FUNC submit_job_hmac_sha_512_avx2 %define SHA_X_DIGEST_SIZE 512 %endif %if 1 %ifdef LINUX %define arg1 rdi %define arg2 rsi %define reg3 rcx %define reg4 rdx %else %define arg1 rcx %define arg2 rdx %define reg3 rdi %define reg4 rsi %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbp, r13, r14, r16 %define last_len rbp %define idx rbp %define p r11 %define start_offset r11 %define unused_lanes rbx %define tmp4 rbx %define job_rax rax %define len rax %define size_offset reg3 %define tmp2 reg3 %define lane reg4 %define tmp3 reg4 %define extra_blocks r8 %define tmp r9 %define p2 r9 %define lane_data r10 %endif ; Define stack usage ; we clobber rbx, rsi, rdi, rbp; called routine also clobbers r12 struc STACK _gpr_save: resq 5 _rsp_save: resq 1 endstruc ; JOB* FUNC(MB_MGR_HMAC_sha_512_OOO *state, JOB_AES_HMAC *job) ; arg 1 : rcx : state ; arg 2 : rdx : job MKGLOBAL(FUNC,function,internal) FUNC: mov rax, rsp sub rsp, STACK_size and rsp, -32 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _gpr_save + 8*2], r12 %ifndef LINUX mov [rsp + _gpr_save + 8*3], rsi mov [rsp + _gpr_save + 8*4], rdi %endif mov [rsp + _rsp_save], rax ; original SP mov unused_lanes, [state + _unused_lanes_sha512] movzx lane, BYTE(unused_lanes) shr unused_lanes, 8 imul lane_data, lane, _SHA512_LANE_DATA_size lea lane_data, [state + _ldata_sha512 + lane_data] mov [state + _unused_lanes_sha512], unused_lanes mov len, [job + _msg_len_to_hash_in_bytes] mov tmp, len shr tmp, 7 ; divide by 128, len in terms of blocks mov [lane_data + _job_in_lane_sha512], job mov dword [lane_data + _outer_done_sha512], 0 mov [state + _lens_sha512 + 2*lane], WORD(tmp) ; 2 is word size in bytes mov last_len, len and last_len, 127 lea extra_blocks, [last_len + 17 + 127] shr extra_blocks, 7 mov [lane_data + _extra_blocks_sha512], DWORD(extra_blocks) mov p, [job + _src] add p, [job + _hash_start_src_offset_in_bytes] mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], p cmp len, 128 jb copy_lt128 fast_copy: add p, len vmovdqu ymm0, [p - 128 + 0*32] vmovdqu ymm1, [p - 128 + 1*32] vmovdqu ymm2, [p - 128 + 2*32] vmovdqu ymm3, [p - 128 + 3*32] vmovdqu [lane_data + _extra_block_sha512 + 0*32], ymm0 vmovdqu [lane_data + 
_extra_block_sha512 + 1*32], ymm1 vmovdqu [lane_data + _extra_block_sha512 + 2*32], ymm2 vmovdqu [lane_data + _extra_block_sha512 + 3*32], ymm3 end_fast_copy: mov size_offset, extra_blocks shl size_offset, 7 sub size_offset, last_len add size_offset, 128-8 mov [lane_data + _size_offset_sha512], DWORD(size_offset) mov start_offset, 128 sub start_offset, last_len mov [lane_data + _start_offset_sha512], DWORD(start_offset) lea tmp, [8*128 + 8*len] bswap tmp mov [lane_data + _extra_block_sha512 + size_offset], tmp mov tmp, [job + _auth_key_xor_ipad] %assign I 0 %rep 4 vmovdqu xmm0, [tmp + I * 2 * SHA512_DIGEST_WORD_SIZE] vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I + 0)*SHA512_DIGEST_ROW_SIZE], xmm0 vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1 %assign I (I+1) %endrep test len, ~127 jnz ge128_bytes lt128_bytes: mov [state + _lens_sha512 + 2*lane], WORD(extra_blocks) lea tmp, [lane_data + _extra_block_sha512 + start_offset] mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], tmp ;; 8 to hold a UINT8 mov dword [lane_data + _extra_blocks_sha512], 0 ge128_bytes: cmp unused_lanes, 0xff jne return_null jmp start_loop align 16 start_loop: ; Find min length vmovdqa xmm0, [state + _lens_sha512] vphminposuw xmm1, xmm0 vpextrw DWORD(len2), xmm1, 0 ; min value vpextrw DWORD(idx), xmm1, 1 ; min index (0...1) cmp len2, 0 je len_is_0 vpshuflw xmm1, xmm1, 0x00 vpsubw xmm0, xmm0, xmm1 vmovdqa [state + _lens_sha512], xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call sha512_x4_avx2 ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _SHA512_LANE_DATA_size lea lane_data, [state + _ldata_sha512 + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done_sha512], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done_sha512], 1 mov DWORD(size_offset), [lane_data + _size_offset_sha512] mov qword [lane_data + _extra_block_sha512 + size_offset], 0 mov word [state + _lens_sha512 + 2*idx], 1 lea tmp, [lane_data + _outer_block_sha512] mov job, [lane_data + _job_in_lane_sha512] mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp %assign I 0 %rep (SHA_X_DIGEST_SIZE / (8 * 16)) vmovq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 0)*SHA512_DIGEST_ROW_SIZE] vpinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], 1 vpshufb xmm0, [rel byteswap] vmovdqa [lane_data + _outer_block_sha512 + I * 2 * SHA512_DIGEST_WORD_SIZE], xmm0 %assign I (I+1) %endrep mov tmp, [job + _auth_key_xor_opad] %assign I 0 %rep 4 vmovdqu xmm0, [tmp + I * 16] vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I+0)*SHA512_DIGEST_ROW_SIZE], xmm0 vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1 %assign I (I+1) %endrep jmp start_loop align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset_sha512] mov [state + _lens_sha512 + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block_sha512 + start_offset] mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp ;; idx is index of shortest length message mov dword [lane_data + _extra_blocks_sha512], 0 jmp start_loop align 16 copy_lt128: ;; less than one message block of data ;; destination extra block but backwards by len from where 0x80 pre-populated lea p2, [lane_data + 
_extra_block + 128] sub p2, len memcpy_avx2_128_1 p2, p, len, tmp4, tmp2, ymm0, ymm1, ymm2, ymm3 mov unused_lanes, [state + _unused_lanes_sha512] jmp end_fast_copy return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane_sha512] mov unused_lanes, [state + _unused_lanes_sha512] mov qword [lane_data + _job_in_lane_sha512], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC shl unused_lanes, 8 or unused_lanes, idx mov [state + _unused_lanes_sha512], unused_lanes mov p, [job_rax + _auth_tag_output] ; below is the code for both SHA512 & SHA384. SHA512=32 bytes and SHA384=24 bytes mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE] mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE] mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE] %if (SHA_X_DIGEST_SIZE != 384) mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE] %endif bswap QWORD(tmp) bswap QWORD(tmp2) bswap QWORD(tmp3) %if (SHA_X_DIGEST_SIZE != 384) bswap QWORD(tmp4) %endif mov [p + 0*8], QWORD(tmp) mov [p + 1*8], QWORD(tmp2) mov [p + 2*8], QWORD(tmp3) %if (SHA_X_DIGEST_SIZE != 384) mov [p + 3*8], QWORD(tmp4) %endif return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov r12, [rsp + _gpr_save + 8*2] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*3] mov rdi, [rsp + _gpr_save + 8*4] %endif mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/avx2/mb_mgr_hmac_submit_avx2.asm000066400000000000000000000254211321406316400224210ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
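;;
;; In outline, the submit path below: pops a free lane from the packed
;; _unused_lanes nibble list, records the job length as a count of 64-byte
;; SHA-1 blocks, and stages the message tail plus padding in the lane's
;; pre-padded extra block (the "+ 9" accounts for the 0x80 pad byte and the
;; 8-byte length field). Only when all 8 lanes are occupied does it pick the
;; shortest lane with VPHMINPOSUW, run that many blocks through sha1_x8_avx2
;; for all lanes at once, and byte-swap the finished digest into the job's
;; 12-byte auth tag.
;;
;; Illustrative pseudocode of the HMAC being computed (names are descriptive
;; only, not symbols from this library):
;;     inner = SHA1(key_xor_ipad_block || message || padding)
;;     tag   = SHA1(key_xor_opad_block || inner   || padding)[0:12]
;;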
;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" %include "memcpy.asm" ;%define DO_DBGPRINT %include "dbgprint.asm" extern sha1_x8_avx2 section .data default rel align 16 byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 dq 0x0405060700010203, 0x0c0d0e0f08090a0b section .text %if 1 %ifdef LINUX %define arg1 rdi %define arg2 rsi %define reg3 rcx %define reg4 rdx %else %define arg1 rcx %define arg2 rdx %define reg3 rdi %define reg4 rsi %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbx, rdi, rbp %define last_len rbp %define idx rbp %define p r11 %define start_offset r11 %define unused_lanes r12 %define tmp4 r12 %define job_rax rax %define len rax %define size_offset reg3 %define tmp2 reg3 %define lane reg4 %define tmp3 reg4 %define extra_blocks r8 %define tmp r9 %define p2 r9 %define lane_data r10 %endif ; we clobber rsi, rdi, rbp, r12; called routine clobbers also r13-r15 struc STACK _gpr_save: resq 7 _rsp_save: resq 1 endstruc ; JOB* submit_job_hmac_avx(MB_MGR_HMAC_SHA_1_OOO *state, JOB_AES_HMAC *job) ; arg 1 : rcx : state ; arg 2 : rdx : job MKGLOBAL(submit_job_hmac_avx2,function,internal) submit_job_hmac_avx2: mov rax, rsp sub rsp, STACK_size and rsp, -32 ; align to 32 byte boundary mov [rsp + _gpr_save + 8*0], rbp mov [rsp + _gpr_save + 8*1], r12 mov [rsp + _gpr_save + 8*2], r13 mov [rsp + _gpr_save + 8*3], r14 mov [rsp + _gpr_save + 8*4], r15 %ifndef LINUX mov [rsp + _gpr_save + 8*5], rsi mov [rsp + _gpr_save + 8*6], rdi %endif mov [rsp + _rsp_save], rax DBGPRINTL "---------- enter sha1 submit -----------" mov unused_lanes, [state + _unused_lanes] mov lane, unused_lanes and lane, 0xF ;; just a nibble shr unused_lanes, 4 imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata + lane_data] mov [state + _unused_lanes], unused_lanes mov len, [job + _msg_len_to_hash_in_bytes] mov tmp, len shr tmp, 6 ; divide by 64, len in terms of blocks mov [lane_data + _job_in_lane], job mov dword [lane_data + _outer_done], 0 mov [state + _lens + 2*lane], WORD(tmp) mov last_len, len and last_len, 63 lea extra_blocks, [last_len + 9 + 63] shr extra_blocks, 6 mov [lane_data + _extra_blocks], DWORD(extra_blocks) mov p, [job + _src] add p, [job + _hash_start_src_offset_in_bytes] mov [state + _args_data_ptr + PTR_SZ*lane], p cmp len, 64 jb copy_lt64 fast_copy: add p, len vmovdqu ymm0, [p - 64 + 0 * 32] vmovdqu ymm1, [p - 64 + 1 * 32] vmovdqu [lane_data + _extra_block + 0*32], ymm0 vmovdqu [lane_data + _extra_block + 1*32], ymm1 end_fast_copy: mov size_offset, extra_blocks shl size_offset, 6 sub size_offset, last_len add size_offset, 64-8 mov [lane_data + _size_offset], DWORD(size_offset) mov start_offset, 64 sub start_offset, last_len mov [lane_data + _start_offset], DWORD(start_offset) lea tmp, [8*64 + 8*len] bswap tmp mov [lane_data + _extra_block + size_offset], tmp mov tmp, [job + _auth_key_xor_ipad] vmovdqu xmm0, [tmp] mov DWORD(tmp), [tmp + 4*4] vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 0*SHA1_DIGEST_ROW_SIZE], xmm0 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3 mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp) test len, ~63 jnz ge64_bytes lt64_bytes: mov [state + _lens + 2*lane], WORD(extra_blocks) 
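;; short message (< 64 bytes): the data was already copied into this lane's
;; pre-padded extra block by copy_lt64, so hash from that buffer instead of
;; the original source pointer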
lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr + PTR_SZ*lane], tmp mov dword [lane_data + _extra_blocks], 0 ge64_bytes: cmp unused_lanes, 0xf jne return_null jmp start_loop align 16 start_loop: ; Find min length vmovdqa xmm0, [state + _lens] vphminposuw xmm1, xmm0 vpextrw DWORD(len2), xmm1, 0 ; min value vpextrw DWORD(idx), xmm1, 1 ; min index (0...3) DBGPRINTL64 "min_length", len2 DBGPRINTL64 "min_length index ", idx cmp len2, 0 je len_is_0 vpbroadcastw xmm1, xmm1 DBGPRINTL_XMM "SUBMIT lens after shuffle", xmm1 vpsubw xmm0, xmm0, xmm1 vmovdqa [state + _lens], xmm0 DBGPRINTL_XMM "lengths after subtraction", xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call sha1_x8_avx2 ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done], 1 mov DWORD(size_offset), [lane_data + _size_offset] mov qword [lane_data + _extra_block + size_offset], 0 mov word [state + _lens + 2*idx], 1 lea tmp, [lane_data + _outer_block] mov job, [lane_data + _job_in_lane] mov [state + _args_data_ptr + PTR_SZ*idx], tmp vmovd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1 vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2 vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3 vpshufb xmm0, xmm0, [rel byteswap] mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE] bswap DWORD(tmp) vmovdqa [lane_data + _outer_block], xmm0 mov [lane_data + _outer_block + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) mov tmp, [job + _auth_key_xor_opad] vmovdqu xmm0, [tmp] mov DWORD(tmp), [tmp + 4*4] vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3 mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp) jmp start_loop align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset] mov [state + _lens + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr + PTR_SZ*idx], tmp mov dword [lane_data + _extra_blocks], 0 jmp start_loop align 16 copy_lt64: ;; less than one message block of data ;; beginning of source block ;; destination extrablock but backwards by len from where 0x80 pre-populated lea p2, [lane_data + _extra_block + 64] sub p2, len memcpy_avx2_64_1 p2, p, len, tmp4, tmp2, ymm0, ymm1 mov unused_lanes, [state + _unused_lanes] jmp end_fast_copy return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane] mov unused_lanes, [state + _unused_lanes] mov qword [lane_data + _job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC shl unused_lanes, 4 or unused_lanes, idx mov [state + _unused_lanes], unused_lanes mov p, [job_rax + _auth_tag_output] ; copy 12 bytes mov DWORD(tmp), [state + 
_args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE] mov DWORD(tmp3), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE] bswap DWORD(tmp) bswap DWORD(tmp2) bswap DWORD(tmp3) mov [p + 0*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) mov [p + 1*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2) mov [p + 2*SHA1_DIGEST_WORD_SIZE], DWORD(tmp3) DBGPRINTL "---------- exit sha1 submit -----------" return: mov rbp, [rsp + _gpr_save + 8*0] mov r12, [rsp + _gpr_save + 8*1] mov r13, [rsp + _gpr_save + 8*2] mov r14, [rsp + _gpr_save + 8*3] mov r15, [rsp + _gpr_save + 8*4] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*5] mov rdi, [rsp + _gpr_save + 8*6] %endif mov rsp, [rsp + _rsp_save] ret intel-ipsec-mb-0.48/avx2/md5_x8x2_avx2.asm000066400000000000000000001166401321406316400201650ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
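;;
;; For reference, each vectorized MD5_STEP defined below applies the scalar
;; round operation of RFC 1321 to 8 lanes per YMM register (16 lanes total):
;;     a = b + ROL32(a + MAGIC(b,c,d) + M[k] + T[i], s)
;; where MAGIC is one of F/G/H/I, e.g. F(x,y,z) = (x & y) | (~x & z), and the
;; T[i] constants are replicated 8 times per row in MD5_TABLE so a single
;; 32-byte load feeds all 8 lanes of a register.
;;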
;; ;; code to compute double octal MD5 using AVX2 ;; Stack must be aligned to 32 bytes before call ;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15 ;; Windows preserves: rcx rbp ;; ;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15 ;; Linux preserves: rdi rbp ;; ;; clobbers ymm0-15 %include "os.asm" %include "mb_mgr_datastruct.asm" section .data default rel align 64 MD5_TABLE: dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478 dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478 dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756 dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756 dd 0x242070db, 0x242070db, 0x242070db, 0x242070db dd 0x242070db, 0x242070db, 0x242070db, 0x242070db dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613 dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613 dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501 dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501 dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8 dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8 dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1 dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1 dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122 dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122 dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193 dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193 dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821 dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821 dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562 dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562 dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340 dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340 dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51 dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51 dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d dd 0x02441453, 0x02441453, 0x02441453, 0x02441453 dd 0x02441453, 0x02441453, 0x02441453, 0x02441453 dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681 dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681 dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8 dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8 dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6 dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6 dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6 dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6 dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87 dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87 dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905 dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905 dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8 dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8 dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9 dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9 dd 0x8d2a4c8a, 0x8d2a4c8a, 
0x8d2a4c8a, 0x8d2a4c8a dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942 dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942 dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681 dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681 dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122 dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122 dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44 dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44 dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9 dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9 dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60 dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60 dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70 dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70 dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6 dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6 dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085 dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085 dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05 dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05 dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039 dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039 dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5 dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5 dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8 dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8 dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665 dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665 dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244 dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244 dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97 dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97 dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7 dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7 dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039 dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039 dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3 dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3 dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92 dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92 dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1 dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1 dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0 dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0 dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314 dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314 dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1 dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1 dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82 dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82 dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235 dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235 dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391 dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391 ONES: dd 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff dd 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff section .text %ifndef LINUX %define arg1 rcx %define arg2 rdx %define reg3 rdi %define reg4 rsi %else %define arg1 rdi %define arg2 rsi %define reg3 rcx %define reg4 rdx 
%endif ;; rbp is not clobbered %define state arg1 %define num_blks arg2 %define inp0 r8 %define inp1 r9 %define inp2 r10 %define inp3 r11 %define inp4 r12 %define inp5 r13 %define inp6 r14 %define inp7 r15 ;; These are pointers to data block1 and block2 in the stack ; which will ping pong back and forth %define DPTR1 rbx %define DPTR2 reg3 %define TBL rax %define IDX reg4 ;; Transposed Digest Storage %define Y_A ymm0 %define Y_B ymm1 %define Y_C ymm2 %define Y_D ymm3 %define Y_A2 ymm4 %define Y_B2 ymm5 %define Y_C2 ymm6 %define Y_D2 ymm7 ;; Temp YMM registers corresponding to the Temp XMM registers ;; used during the transposition of the digests %define Y_KTMP1 ymm12 %define Y_KTMP2 ymm13 ;; Temporary registers used during MD5 round operations %define Y_FUN ymm8 %define Y_TMP ymm9 %define Y_FUN2 ymm10 %define Y_TMP2 ymm11 ;; YMM registers used during data fetching. ;; Data are stored into the stack after transposition %define Y_DAT0 ymm8 %define Y_DAT1 ymm9 %define Y_DAT2 ymm10 %define Y_DAT3 ymm11 %define Y_DAT4 ymm12 %define Y_DAT5 ymm13 %define Y_DAT6 ymm14 %define Y_DAT7 ymm15 ;; Temporary registers used during data transposition %define Y_DTMP1 ymm0 %define Y_DTMP2 ymm1 %define RESY resb 32* ;; Assume stack aligned to 32 bytes before call ;; Therefore FRAMESIZE mod 32 must be 32-8 = 24 struc STACK _DATA: RESY 2*2*16 ; 2 blocks * 2 sets of lanes * 16 regs _DIGEST: RESY 8 ; stores Y_AA-Y_DD, Y_AA2-Y_DD2 _TMPDIGEST: RESY 2 ; stores Y_AA, Y_BB temporarily resb 24 ; align endstruc %define Y_AA rsp + _DIGEST + 32*0 %define Y_BB rsp + _DIGEST + 32*1 %define Y_CC rsp + _DIGEST + 32*2 %define Y_DD rsp + _DIGEST + 32*3 %define Y_AA2 rsp + _DIGEST + 32*4 %define Y_BB2 rsp + _DIGEST + 32*5 %define Y_CC2 rsp + _DIGEST + 32*6 %define Y_DD2 rsp + _DIGEST + 32*7 ;; ;; MD5 left rotations (number of bits) ;; rot11 equ 7 rot12 equ 12 rot13 equ 17 rot14 equ 22 rot21 equ 5 rot22 equ 9 rot23 equ 14 rot24 equ 20 rot31 equ 4 rot32 equ 11 rot33 equ 16 rot34 equ 23 rot41 equ 6 rot42 equ 10 rot43 equ 15 rot44 equ 21 ; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1 ; "transpose" data in {r0...r7} using temps {t0...t1} ; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7} ; r0 = {a7 a6 a5 a4 a3 a2 a1 a0} ; r1 = {b7 b6 b5 b4 b3 b2 b1 b0} ; r2 = {c7 c6 c5 c4 c3 c2 c1 c0} ; r3 = {d7 d6 d5 d4 d3 d2 d1 d0} ; r4 = {e7 e6 e5 e4 e3 e2 e1 e0} ; r5 = {f7 f6 f5 f4 f3 f2 f1 f0} ; r6 = {g7 g6 g5 g4 g3 g2 g1 g0} ; r7 = {h7 h6 h5 h4 h3 h2 h1 h0} ; ; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7} ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0} ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1} ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2} ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3} ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4} ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5} ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6} ; r7 = {h7 g7 f7 e7 d7 c7 b7 a7} ; %macro TRANSPOSE8 10 %define %%r0 %1 %define %%r1 %2 %define %%r2 %3 %define %%r3 %4 %define %%r4 %5 %define %%r5 %6 %define %%r6 %7 %define %%r7 %8 %define %%t0 %9 %define %%t1 %10 ; process top half (r0..r3) {a...d} vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0} vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2} vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0} vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2} vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1} vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2} vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3} vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0} ; use r2 in place of t0 ; process bottom half (r4..r7) {e...h} 
vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0} vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2} vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0} vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2} vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1} vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2} vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3} vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0} vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6 vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2 vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5 vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1 vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7 vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3 vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4 vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0 %endmacro ;; ;; Magic functions defined in RFC 1321 ;; ; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z)))) %macro MAGIC_F 4 %define %%F %1 %define %%X %2 %define %%Y %3 %define %%Z %4 vpxor %%F,%%Z, %%Y vpand %%F,%%F,%%X vpxor %%F,%%F,%%Z %endmacro ; macro MAGIC_G F,X,Y,Z ;; F = F((Z),(X),(Y)) %macro MAGIC_G 4 %define %%F %1 %define %%X %2 %define %%Y %3 %define %%Z %4 MAGIC_F %%F,%%Z,%%X,%%Y %endmacro ; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z)) %macro MAGIC_H 4 %define %%F %1 %define %%X %2 %define %%Y %3 %define %%Z %4 vpxor %%F,%%Z, %%Y vpxor %%F,%%F, %%X %endmacro ; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z))) %macro MAGIC_I 4 %define %%F %1 %define %%X %2 %define %%Y %3 %define %%Z %4 vpxor %%F,%%Z,[rel ONES] ; pnot %%F vpor %%F,%%F,%%X vpxor %%F,%%F,%%Y %endmacro ; PROLD reg, imm, tmp %macro PROLD 3 %define %%reg %1 %define %%imm %2 %define %%tmp %3 vpsrld %%tmp, %%reg, (32-%%imm) vpslld %%reg, %%reg, %%imm vpor %%reg, %%reg, %%tmp %endmacro ;; ;; single MD5 step ;; ;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot) ;; ; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C3,D2, FUN, TMP, FUN2, TMP2, data, ; MD5const, nrot %macro MD5_STEP 16 %define %%MAGIC_FUN %1 %define %%rA %2 %define %%rB %3 %define %%rC %4 %define %%rD %5 %define %%rA2 %6 %define %%rB2 %7 %define %%rC2 %8 %define %%rD2 %9 %define %%FUN %10 %define %%TMP %11 %define %%FUN2 %12 %define %%TMP2 %13 %define %%data %14 %define %%MD5const %15 %define %%nrot %16 vpaddd %%rA, %%rA, %%MD5const vpaddd %%rA2, %%rA2, %%MD5const vpaddd %%rA, %%rA, [%%data] vpaddd %%rA2, %%rA2, [%%data + 16*32] %%MAGIC_FUN %%FUN, %%rB,%%rC,%%rD %%MAGIC_FUN %%FUN2, %%rB2,%%rC2,%%rD2 vpaddd %%rA, %%rA, %%FUN vpaddd %%rA2, %%rA2, %%FUN2 PROLD %%rA,%%nrot, %%TMP PROLD %%rA2,%%nrot, %%TMP2 vpaddd %%rA, %%rA, %%rB vpaddd %%rA2, %%rA2, %%rB2 %endmacro align 32 ; void md5_x8x2_avx(MD5_ARGS *args, UINT64 num_blks) ; arg 1 : pointer to MD5_ARGS structure ; arg 2 : number of blocks (>=1) MKGLOBAL(md5_x8x2_avx2,function,internal) md5_x8x2_avx2: sub rsp, STACK_size mov DPTR1, rsp lea DPTR2, [rsp + 32*32] ;; Load MD5 constant pointer to register lea TBL, [rel MD5_TABLE] ; Initialize index for data retrieval xor IDX, IDX ;; Fetch Pointers to Data Stream 1 to 8 mov inp0,[state + _data_ptr_md5+0*PTR_SZ] mov inp1,[state + _data_ptr_md5+1*PTR_SZ] mov inp2,[state + _data_ptr_md5+2*PTR_SZ] mov inp3,[state + _data_ptr_md5+3*PTR_SZ] mov inp4,[state + _data_ptr_md5+4*PTR_SZ] mov inp5,[state + _data_ptr_md5+5*PTR_SZ] mov inp6,[state + _data_ptr_md5+6*PTR_SZ] mov inp7,[state + _data_ptr_md5+7*PTR_SZ] %assign I 0 %rep 2 vmovdqu Y_DAT0,[inp0+IDX+I*32] vmovdqu Y_DAT1,[inp1+IDX+I*32] vmovdqu Y_DAT2,[inp2+IDX+I*32] 
vmovdqu Y_DAT3,[inp3+IDX+I*32] vmovdqu Y_DAT4,[inp4+IDX+I*32] vmovdqu Y_DAT5,[inp5+IDX+I*32] vmovdqu Y_DAT6,[inp6+IDX+I*32] vmovdqu Y_DAT7,[inp7+IDX+I*32] TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2 vmovdqa [DPTR1+_DATA+(I*8+0)*32],Y_DAT0 vmovdqa [DPTR1+_DATA+(I*8+1)*32],Y_DAT1 vmovdqa [DPTR1+_DATA+(I*8+2)*32],Y_DAT2 vmovdqa [DPTR1+_DATA+(I*8+3)*32],Y_DAT3 vmovdqa [DPTR1+_DATA+(I*8+4)*32],Y_DAT4 vmovdqa [DPTR1+_DATA+(I*8+5)*32],Y_DAT5 vmovdqa [DPTR1+_DATA+(I*8+6)*32],Y_DAT6 vmovdqa [DPTR1+_DATA+(I*8+7)*32],Y_DAT7 %assign I (I+1) %endrep ;; Fetch Pointers to Data Stream 9 to 16 mov inp0,[state + _data_ptr_md5 + 8*8] mov inp1,[state + _data_ptr_md5 + 9*8] mov inp2,[state + _data_ptr_md5 + 10*8] mov inp3,[state + _data_ptr_md5 + 11*8] mov inp4,[state + _data_ptr_md5 + 12*8] mov inp5,[state + _data_ptr_md5 + 13*8] mov inp6,[state + _data_ptr_md5 + 14*8] mov inp7,[state + _data_ptr_md5 + 15*8] %assign I 0 %rep 2 vmovdqu Y_DAT0,[inp0+IDX+I*32] vmovdqu Y_DAT1,[inp1+IDX+I*32] vmovdqu Y_DAT2,[inp2+IDX+I*32] vmovdqu Y_DAT3,[inp3+IDX+I*32] vmovdqu Y_DAT4,[inp4+IDX+I*32] vmovdqu Y_DAT5,[inp5+IDX+I*32] vmovdqu Y_DAT6,[inp6+IDX+I*32] vmovdqu Y_DAT7,[inp7+IDX+I*32] TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2 vmovdqa [DPTR1+_DATA+((I+2)*8+0)*32],Y_DAT0 vmovdqa [DPTR1+_DATA+((I+2)*8+1)*32],Y_DAT1 vmovdqa [DPTR1+_DATA+((I+2)*8+2)*32],Y_DAT2 vmovdqa [DPTR1+_DATA+((I+2)*8+3)*32],Y_DAT3 vmovdqa [DPTR1+_DATA+((I+2)*8+4)*32],Y_DAT4 vmovdqa [DPTR1+_DATA+((I+2)*8+5)*32],Y_DAT5 vmovdqa [DPTR1+_DATA+((I+2)*8+6)*32],Y_DAT6 vmovdqa [DPTR1+_DATA+((I+2)*8+7)*32],Y_DAT7 %assign I (I+1) %endrep ;; digests are already transposed vmovdqu Y_A,[state + 0 * MD5_DIGEST_ROW_SIZE ] vmovdqu Y_B,[state + 1 * MD5_DIGEST_ROW_SIZE ] vmovdqu Y_C,[state + 2 * MD5_DIGEST_ROW_SIZE ] vmovdqu Y_D,[state + 3 * MD5_DIGEST_ROW_SIZE ] ; Load the digest for each stream (9-16) vmovdqu Y_A2,[state + 0 * MD5_DIGEST_ROW_SIZE + 32] vmovdqu Y_B2,[state + 1 * MD5_DIGEST_ROW_SIZE + 32] vmovdqu Y_C2,[state + 2 * MD5_DIGEST_ROW_SIZE + 32] vmovdqu Y_D2,[state + 3 * MD5_DIGEST_ROW_SIZE + 32] lloop: ; save old digests to stack vmovdqa [Y_AA], Y_A vmovdqa [Y_BB], Y_B vmovdqa [Y_CC], Y_C vmovdqa [Y_DD], Y_D vmovdqa [Y_AA2], Y_A2 vmovdqa [Y_BB2], Y_B2 vmovdqa [Y_CC2], Y_C2 vmovdqa [Y_DD2], Y_D2 ;; Increment IDX to point to next data block (64 bytes per block) add IDX, 64 ;; Update size of remaining blocks to process sub num_blks, 1 je lastblock ; Perform the 64 rounds of processing ... MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+ 0*32], rot11 MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+ 1*32], rot12 MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+ 2*32], rot13 MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+ 3*32], rot14 MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+ 4*32], rot11 MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+ 5*32], rot12 MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+ 6*32], rot13 MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+ 7*32], rot14 ;; Fetch Pointers to Data Stream 1 to 8 ?? 
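;; inp0-inp7 still hold the lane 8-15 pointers loaded in the preamble, so
;; they are re-loaded with lanes 0-7 here before the next block's data is
;; fetched and transposed into the second (ping-pong) data buffer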
mov inp0,[state + _data_ptr_md5 + 0*8] mov inp1,[state + _data_ptr_md5 + 1*8] mov inp2,[state + _data_ptr_md5 + 2*8] mov inp3,[state + _data_ptr_md5 + 3*8] mov inp4,[state + _data_ptr_md5 + 4*8] mov inp5,[state + _data_ptr_md5 + 5*8] mov inp6,[state + _data_ptr_md5 + 6*8] mov inp7,[state + _data_ptr_md5 + 7*8] MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+ 8*32], rot11 MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+ 9*32], rot12 MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+10*32], rot13 MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+11*32], rot14 MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+12*32], rot11 MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+13*32], rot12 MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+14*32], rot13 MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+15*32], rot14 %assign I 0 ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2 ; Therefore we need to save these to stack and restore after transpose vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B vmovdqu Y_DAT0,[inp0+IDX+I*32] vmovdqu Y_DAT1,[inp1+IDX+I*32] vmovdqu Y_DAT2,[inp2+IDX+I*32] vmovdqu Y_DAT3,[inp3+IDX+I*32] vmovdqu Y_DAT4,[inp4+IDX+I*32] vmovdqu Y_DAT5,[inp5+IDX+I*32] vmovdqu Y_DAT6,[inp6+IDX+I*32] vmovdqu Y_DAT7,[inp7+IDX+I*32] TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2 vmovdqa [DPTR2+_DATA+(I*8+0)*32],Y_DAT0 vmovdqa [DPTR2+_DATA+(I*8+1)*32],Y_DAT1 vmovdqa [DPTR2+_DATA+(I*8+2)*32],Y_DAT2 vmovdqa [DPTR2+_DATA+(I*8+3)*32],Y_DAT3 vmovdqa [DPTR2+_DATA+(I*8+4)*32],Y_DAT4 vmovdqa [DPTR2+_DATA+(I*8+5)*32],Y_DAT5 vmovdqa [DPTR2+_DATA+(I*8+6)*32],Y_DAT6 vmovdqa [DPTR2+_DATA+(I*8+7)*32],Y_DAT7 ; Restore Y_A and Y_B vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32] vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32] MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+16*32], rot21 MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+17*32], rot22 MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+18*32], rot23 MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+19*32], rot24 MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+20*32], rot21 MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+21*32], rot22 MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+22*32], rot23 MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+23*32], rot24 MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+24*32], rot21 MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+25*32], rot22 MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+26*32], rot23 MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, 
Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+27*32], rot24 MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+28*32], rot21 MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+29*32], rot22 MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+30*32], rot23 MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+31*32], rot24 %assign I (I+1) ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2 ; Therefore we need to save these to stack and restore after transpose vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B vmovdqu Y_DAT0,[inp0+IDX+I*32] vmovdqu Y_DAT1,[inp1+IDX+I*32] vmovdqu Y_DAT2,[inp2+IDX+I*32] vmovdqu Y_DAT3,[inp3+IDX+I*32] vmovdqu Y_DAT4,[inp4+IDX+I*32] vmovdqu Y_DAT5,[inp5+IDX+I*32] vmovdqu Y_DAT6,[inp6+IDX+I*32] vmovdqu Y_DAT7,[inp7+IDX+I*32] TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2 vmovdqa [DPTR2+_DATA+(I*8+0)*32],Y_DAT0 vmovdqa [DPTR2+_DATA+(I*8+1)*32],Y_DAT1 vmovdqa [DPTR2+_DATA+(I*8+2)*32],Y_DAT2 vmovdqa [DPTR2+_DATA+(I*8+3)*32],Y_DAT3 vmovdqa [DPTR2+_DATA+(I*8+4)*32],Y_DAT4 vmovdqa [DPTR2+_DATA+(I*8+5)*32],Y_DAT5 vmovdqa [DPTR2+_DATA+(I*8+6)*32],Y_DAT6 vmovdqa [DPTR2+_DATA+(I*8+7)*32],Y_DAT7 ; Restore Y_A and Y_B vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32] vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32] MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+32*32], rot31 MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+33*32], rot32 MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+34*32], rot33 MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+35*32], rot34 MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+36*32], rot31 MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+37*32], rot32 MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+38*32], rot33 MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+39*32], rot34 ;; Fetch Pointers to Data Stream 9 to 16 mov inp0,[state + _data_ptr_md5 + 8*8] mov inp1,[state + _data_ptr_md5 + 9*8] mov inp2,[state + _data_ptr_md5 + 10*8] mov inp3,[state + _data_ptr_md5 + 11*8] mov inp4,[state + _data_ptr_md5 + 12*8] mov inp5,[state + _data_ptr_md5 + 13*8] mov inp6,[state + _data_ptr_md5 + 14*8] mov inp7,[state + _data_ptr_md5 + 15*8] MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+40*32], rot31 MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+41*32], rot32 MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+42*32], rot33 MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+43*32], rot34 MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+44*32], rot31 MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+45*32], rot32 MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, 
Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+46*32], rot33 MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+47*32], rot34 %assign I 0 ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2 ; Therefore we need to save these to stack and restore after transpose vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B vmovdqu Y_DAT0,[inp0+IDX+I*32] vmovdqu Y_DAT1,[inp1+IDX+I*32] vmovdqu Y_DAT2,[inp2+IDX+I*32] vmovdqu Y_DAT3,[inp3+IDX+I*32] vmovdqu Y_DAT4,[inp4+IDX+I*32] vmovdqu Y_DAT5,[inp5+IDX+I*32] vmovdqu Y_DAT6,[inp6+IDX+I*32] vmovdqu Y_DAT7,[inp7+IDX+I*32] TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2 vmovdqa [DPTR2+_DATA+((I+2)*8+0)*32],Y_DAT0 vmovdqa [DPTR2+_DATA+((I+2)*8+1)*32],Y_DAT1 vmovdqa [DPTR2+_DATA+((I+2)*8+2)*32],Y_DAT2 vmovdqa [DPTR2+_DATA+((I+2)*8+3)*32],Y_DAT3 vmovdqa [DPTR2+_DATA+((I+2)*8+4)*32],Y_DAT4 vmovdqa [DPTR2+_DATA+((I+2)*8+5)*32],Y_DAT5 vmovdqa [DPTR2+_DATA+((I+2)*8+6)*32],Y_DAT6 vmovdqa [DPTR2+_DATA+((I+2)*8+7)*32],Y_DAT7 ; Restore Y_A and Y_B vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32] vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32] MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+48*32], rot41 MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+49*32], rot42 MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+50*32], rot43 MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+51*32], rot44 MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+52*32], rot41 MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+53*32], rot42 MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+54*32], rot43 MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+55*32], rot44 MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+56*32], rot41 MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+57*32], rot42 MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+58*32], rot43 MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+59*32], rot44 MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+60*32], rot41 MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+61*32], rot42 MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+62*32], rot43 MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+63*32], rot44 %assign I (I+1) ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2 ; Therefore we need to save these to stack and restore after transpose vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B vmovdqu Y_DAT0,[inp0+IDX+I*32] vmovdqu Y_DAT1,[inp1+IDX+I*32] vmovdqu Y_DAT2,[inp2+IDX+I*32] vmovdqu Y_DAT3,[inp3+IDX+I*32] vmovdqu Y_DAT4,[inp4+IDX+I*32] vmovdqu Y_DAT5,[inp5+IDX+I*32] vmovdqu Y_DAT6,[inp6+IDX+I*32] vmovdqu Y_DAT7,[inp7+IDX+I*32] 
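;; transpose the 8x8 dword tile just loaded so that each 32-byte slot in the
;; stack data area holds the same message word for all 8 of these lanes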
TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2 vmovdqa [DPTR2+_DATA+((I+2)*8+0)*32],Y_DAT0 vmovdqa [DPTR2+_DATA+((I+2)*8+1)*32],Y_DAT1 vmovdqa [DPTR2+_DATA+((I+2)*8+2)*32],Y_DAT2 vmovdqa [DPTR2+_DATA+((I+2)*8+3)*32],Y_DAT3 vmovdqa [DPTR2+_DATA+((I+2)*8+4)*32],Y_DAT4 vmovdqa [DPTR2+_DATA+((I+2)*8+5)*32],Y_DAT5 vmovdqa [DPTR2+_DATA+((I+2)*8+6)*32],Y_DAT6 vmovdqa [DPTR2+_DATA+((I+2)*8+7)*32],Y_DAT7 ; Restore Y_A and Y_B vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32] vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32] ; Add results to old digest values vpaddd Y_A,Y_A,[Y_AA] vpaddd Y_B,Y_B,[Y_BB] vpaddd Y_C,Y_C,[Y_CC] vpaddd Y_D,Y_D,[Y_DD] vpaddd Y_A2,Y_A2,[Y_AA2] vpaddd Y_B2,Y_B2,[Y_BB2] vpaddd Y_C2,Y_C2,[Y_CC2] vpaddd Y_D2,Y_D2,[Y_DD2] ; Swap DPTR1 and DPTR2 xchg DPTR1, DPTR2 ;; Proceed to processing of next block jmp lloop lastblock: ; Perform the 64 rounds of processing ... MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+ 0*32], rot11 MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+ 1*32], rot12 MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+ 2*32], rot13 MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+ 3*32], rot14 MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+ 4*32], rot11 MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+ 5*32], rot12 MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+ 6*32], rot13 MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+ 7*32], rot14 MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+ 8*32], rot11 MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+ 9*32], rot12 MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+10*32], rot13 MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+11*32], rot14 MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+12*32], rot11 MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+13*32], rot12 MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+14*32], rot13 MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+15*32], rot14 MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+16*32], rot21 MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+17*32], rot22 MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+18*32], rot23 MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+19*32], rot24 MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+20*32], rot21 MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+21*32], rot22 MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, 
DPTR1+15*32, [TBL+22*32], rot23 MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+23*32], rot24 MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+24*32], rot21 MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+25*32], rot22 MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+26*32], rot23 MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+27*32], rot24 MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+28*32], rot21 MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+29*32], rot22 MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+30*32], rot23 MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+31*32], rot24 MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+32*32], rot31 MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+33*32], rot32 MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+34*32], rot33 MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+35*32], rot34 MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+36*32], rot31 MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+37*32], rot32 MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+38*32], rot33 MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+39*32], rot34 MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+40*32], rot31 MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+41*32], rot32 MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+42*32], rot33 MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+43*32], rot34 MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+44*32], rot31 MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+45*32], rot32 MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+46*32], rot33 MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+47*32], rot34 MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+48*32], rot41 MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+49*32], rot42 MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+50*32], rot43 MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+51*32], rot44 MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+52*32], rot41 MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, 
Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+53*32], rot42 MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+54*32], rot43 MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+55*32], rot44 MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+56*32], rot41 MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+57*32], rot42 MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+58*32], rot43 MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+59*32], rot44 MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+60*32], rot41 MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+61*32], rot42 MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+62*32], rot43 MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+63*32], rot44 ;; update into data pointers %assign I 0 %rep 8 mov inp0, [state + _data_ptr_md5 + (2*I)*8] mov inp1, [state + _data_ptr_md5 + (2*I +1)*8] add inp0, IDX add inp1, IDX mov [state + _data_ptr_md5 + (2*I)*8], inp0 mov [state + _data_ptr_md5 + (2*I+1)*8], inp1 %assign I (I+1) %endrep vpaddd Y_A,Y_A,[Y_AA] vpaddd Y_B,Y_B,[Y_BB] vpaddd Y_C,Y_C,[Y_CC] vpaddd Y_D,Y_D,[Y_DD] vpaddd Y_A2,Y_A2,[Y_AA2] vpaddd Y_B2,Y_B2,[Y_BB2] vpaddd Y_C2,Y_C2,[Y_CC2] vpaddd Y_D2,Y_D2,[Y_DD2] vmovdqu [state + 0*MD5_DIGEST_ROW_SIZE ],Y_A vmovdqu [state + 1*MD5_DIGEST_ROW_SIZE ],Y_B vmovdqu [state + 2*MD5_DIGEST_ROW_SIZE ],Y_C vmovdqu [state + 3*MD5_DIGEST_ROW_SIZE ],Y_D vmovdqu [state + 0*MD5_DIGEST_ROW_SIZE + 32 ],Y_A2 ;; 32 is YMM width vmovdqu [state + 1*MD5_DIGEST_ROW_SIZE + 32 ],Y_B2 vmovdqu [state + 2*MD5_DIGEST_ROW_SIZE + 32 ],Y_C2 vmovdqu [state + 3*MD5_DIGEST_ROW_SIZE + 32 ],Y_D2 ;;;;;;;;;;;;;;;; ;; Postamble add rsp, STACK_size ret intel-ipsec-mb-0.48/avx2/sha1_x8_avx2.asm000066400000000000000000000327041321406316400200600ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ;; Stack must be aligned to 32 bytes before call ;; Windows clobbers: rax rdx r8 r9 r10 r11 r12 r13 r14 r15 ;; Windows preserves: rbx rcx rsi rdi rbp ;; ;; Linux clobbers: rax rdx rsi r9 r10 r11 r12 r13 r14 r15 ;; Linux preserves: rbx rcx rdi rbp r8 ;; ;; clobbers ymm0-15 %include "os.asm" ;%define DO_DBGPRINT %include "dbgprint.asm" %include "mb_mgr_datastruct.asm" section .data default rel align 32 PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 ;ddq 0x0c0d0e0f08090a0b0405060700010203 dq 0x0405060700010203, 0x0c0d0e0f08090a0b dq 0x0405060700010203, 0x0c0d0e0f08090a0b K00_19: ;ddq 0x5A8279995A8279995A8279995A827999 ;ddq 0x5A8279995A8279995A8279995A827999 dq 0x5A8279995A827999, 0x5A8279995A827999 dq 0x5A8279995A827999, 0x5A8279995A827999 K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 section .text %ifdef LINUX %define arg1 rdi %define arg2 rsi %define reg3 rdx %else %define arg1 rcx %define arg2 rdx %define reg3 r8 %endif %define state arg1 %define num_blks arg2 %define inp0 r9 %define inp1 r10 %define inp2 r11 %define inp3 r12 %define inp4 r13 %define inp5 r14 %define inp6 r15 %define inp7 reg3 %define IDX rax ; ymm0 A ; ymm1 B ; ymm2 C ; ymm3 D ; ymm4 E ; ymm5 F AA ; ymm6 T0 BB ; ymm7 T1 CC ; ymm8 T2 DD ; ymm9 T3 EE ; ymm10 T4 TMP ; ymm11 T5 FUN ; ymm12 T6 K ; ymm13 T7 W14 ; ymm14 T8 W15 ; ymm15 T9 W16 %define A ymm0 %define B ymm1 %define C ymm2 %define D ymm3 %define E ymm4 %define F ymm5 %define T0 ymm6 %define T1 ymm7 %define T2 ymm8 %define T3 ymm9 %define T4 ymm10 %define T5 ymm11 %define T6 ymm12 %define T7 ymm13 %define T8 ymm14 %define T9 ymm15 %define AA ymm5 %define BB ymm6 %define CC ymm7 %define DD ymm8 %define EE ymm9 %define TMP ymm10 %define FUN ymm11 %define K ymm12 %define W14 ymm13 %define W15 ymm14 %define W16 ymm15 ;; Assume stack aligned to 32 bytes before call ;; Therefore FRAMESIZE mod 32 must be 32-8 = 24 %define FRAMESZ 32*16 + 24 %define VMOVPS vmovups ; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1 ; "transpose" data in {r0...r7} using temps {t0...t1} ; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7} ; r0 = {a7 a6 a5 a4 a3 a2 a1 a0} ; r1 = {b7 b6 b5 b4 b3 b2 b1 b0} ; r2 = {c7 c6 c5 c4 c3 c2 c1 c0} ; r3 = {d7 d6 d5 d4 d3 d2 d1 d0} ; r4 = {e7 e6 e5 e4 e3 e2 e1 e0} ; r5 = {f7 f6 f5 f4 f3 f2 f1 f0} ; r6 = {g7 g6 g5 g4 g3 g2 g1 g0} ; r7 = {h7 h6 h5 h4 h3 h2 h1 h0} ; ; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7} ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0} ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1} ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2} ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3} ; r4 = {h4 g4 f4 e4 d4 
c4 b4 a4} ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5} ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6} ; r7 = {h7 g7 f7 e7 d7 c7 b7 a7} ; %macro TRANSPOSE8 10 %define %%r0 %1 %define %%r1 %2 %define %%r2 %3 %define %%r3 %4 %define %%r4 %5 %define %%r5 %6 %define %%r6 %7 %define %%r7 %8 %define %%t0 %9 %define %%t1 %10 ; process top half (r0..r3) {a...d} vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0} vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2} vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0} vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2} vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1} vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2} vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3} vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0} ; use r2 in place of t0 ; process bottom half (r4..r7) {e...h} vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0} vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2} vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0} vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2} vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1} vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2} vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3} vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0} vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6 vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2 vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5 vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1 vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7 vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3 vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4 vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0 %endmacro ;; ;; Magic functions defined in FIPS 180-1 ;; ;MAGIC_F0 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((D ^ (B & (C ^ D))) %macro MAGIC_F0 5 %define %%regF %1 %define %%regB %2 %define %%regC %3 %define %%regD %4 %define %%regT %5 ;vmovdqa %%regF,%%regC vpxor %%regF, %%regC,%%regD vpand %%regF, %%regF,%%regB vpxor %%regF, %%regF,%%regD %endmacro ;MAGIC_F1 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (B ^ C ^ D) %macro MAGIC_F1 5 %define %%regF %1 %define %%regB %2 %define %%regC %3 %define %%regD %4 %define %%regT %5 ;vmovdqa %%regF,%%regD vpxor %%regF,%%regD,%%regC vpxor %%regF,%%regF,%%regB %endmacro ;MAGIC_F2 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((B & C) | (B & D) | (C & D)) %macro MAGIC_F2 5 %define %%regF %1 %define %%regB %2 %define %%regC %3 %define %%regD %4 %define %%regT %5 ;vmovdqa %%regF,%%regB ;vmovdqa %%regT,%%regB vpor %%regF,%%regB,%%regC vpand %%regT,%%regB,%%regC vpand %%regF,%%regF,%%regD vpor %%regF,%%regF,%%regT %endmacro ;MAGIC_F3 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ %macro MAGIC_F3 5 %define %%regF %1 %define %%regB %2 %define %%regC %3 %define %%regD %4 %define %%regT %5 MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT %endmacro ; PROLD reg, imm, tmp %macro PROLD 3 %define %%reg %1 %define %%imm %2 %define %%tmp %3 ;vmovdqa %%tmp, %%reg vpsrld %%tmp, %%reg, (32-%%imm) vpslld %%reg, %%reg, %%imm vpor %%reg, %%reg, %%tmp %endmacro ; PROLD reg, imm, tmp %macro PROLD_nd 4 %define %%reg %1 %define %%imm %2 %define %%tmp %3 %define %%src %4 ;vmovdqa %%tmp, %%reg vpsrld %%tmp, %%src, (32-%%imm) vpslld %%reg, %%src, %%imm vpor %%reg, %%reg, %%tmp %endmacro %macro SHA1_STEP_00_15 10 %define %%regA %1 %define %%regB %2 %define %%regC %3 %define %%regD %4 %define %%regE %5 %define %%regT %6 
%define %%regF %7 %define %%memW %8 %define %%immCNT %9 %define %%MAGIC %10 vpaddd %%regE, %%regE,%%immCNT vpaddd %%regE, %%regE,[rsp + (%%memW * 32)] ;vmovdqa %%regT,%%regA PROLD_nd %%regT,5, %%regF,%%regA vpaddd %%regE, %%regE,%%regT %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) PROLD %%regB,30, %%regT vpaddd %%regE, %%regE,%%regF %endmacro %macro SHA1_STEP_16_79 10 %define %%regA %1 %define %%regB %2 %define %%regC %3 %define %%regD %4 %define %%regE %5 %define %%regT %6 %define %%regF %7 %define %%memW %8 %define %%immCNT %9 %define %%MAGIC %10 vpaddd %%regE, %%regE,%%immCNT vmovdqa W14, [rsp + ((%%memW - 14) & 15) * 32] vpxor W16, W16, W14 vpxor W16, W16, [rsp + ((%%memW - 8) & 15) * 32] vpxor W16, W16, [rsp + ((%%memW - 3) & 15) * 32] ;vmovdqa %%regF, W16 vpsrld %%regF, W16, (32-1) vpslld W16, W16, 1 vpor %%regF, %%regF, W16 ROTATE_W vmovdqa [rsp + ((%%memW - 0) & 15) * 32],%%regF vpaddd %%regE, %%regE,%%regF ;vmovdqa %%regT,%%regA PROLD_nd %%regT,5, %%regF, %%regA vpaddd %%regE, %%regE,%%regT %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) PROLD %%regB,30, %%regT vpaddd %%regE,%%regE,%%regF %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro ROTATE_ARGS 0 %xdefine TMP_ E %xdefine E D %xdefine D C %xdefine C B %xdefine B A %xdefine A TMP_ %endm %macro ROTATE_W 0 %xdefine TMP_ W16 %xdefine W16 W15 %xdefine W15 W14 %xdefine W14 TMP_ %endm align 32 ; void sha1_x8_avx2(void *state, int num_blks) ; arg 1 : rcx : pointer to array[4] of pointer to input data ; arg 2 : rdx : size (in blocks) ;; assumed to be >= 1 MKGLOBAL(sha1_x8_avx2,function,internal) sha1_x8_avx2: sub rsp, FRAMESZ ;; Initialize digests vmovdqu A, [state + 0*SHA1_DIGEST_ROW_SIZE] vmovdqu B, [state + 1*SHA1_DIGEST_ROW_SIZE] vmovdqu C, [state + 2*SHA1_DIGEST_ROW_SIZE] vmovdqu D, [state + 3*SHA1_DIGEST_ROW_SIZE] vmovdqu E, [state + 4*SHA1_DIGEST_ROW_SIZE] DBGPRINTL_YMM "Sha1-AVX2 incoming transposed digest", A, B, C, D, E ;; transpose input onto stack mov inp0,[state+_data_ptr_sha1+0*PTR_SZ] mov inp1,[state+_data_ptr_sha1+1*PTR_SZ] mov inp2,[state+_data_ptr_sha1+2*PTR_SZ] mov inp3,[state+_data_ptr_sha1+3*PTR_SZ] mov inp4,[state+_data_ptr_sha1+4*PTR_SZ] mov inp5,[state+_data_ptr_sha1+5*PTR_SZ] mov inp6,[state+_data_ptr_sha1+6*PTR_SZ] mov inp7,[state+_data_ptr_sha1+7*PTR_SZ] xor IDX, IDX lloop: vmovdqa F, [rel PSHUFFLE_BYTE_FLIP_MASK] %assign I 0 %rep 2 VMOVPS T0,[inp0+IDX] VMOVPS T1,[inp1+IDX] VMOVPS T2,[inp2+IDX] VMOVPS T3,[inp3+IDX] VMOVPS T4,[inp4+IDX] VMOVPS T5,[inp5+IDX] VMOVPS T6,[inp6+IDX] VMOVPS T7,[inp7+IDX] TRANSPOSE8 T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 DBGPRINTL_YMM "Sha1-AVX2 incoming transposed input", T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 vpshufb T0, T0, F vmovdqa [rsp+(I*8+0)*32],T0 vpshufb T1, T1, F vmovdqa [rsp+(I*8+1)*32],T1 vpshufb T2, T2, F vmovdqa [rsp+(I*8+2)*32],T2 vpshufb T3, T3, F vmovdqa [rsp+(I*8+3)*32],T3 vpshufb T4, T4, F vmovdqa [rsp+(I*8+4)*32],T4 vpshufb T5, T5, F vmovdqa [rsp+(I*8+5)*32],T5 vpshufb T6, T6, F vmovdqa [rsp+(I*8+6)*32],T6 vpshufb T7, T7, F vmovdqa [rsp+(I*8+7)*32],T7 add IDX, 32 %assign I (I+1) %endrep ; save old digests vmovdqa AA, A vmovdqa BB, B vmovdqa CC, C vmovdqa DD, D vmovdqa EE, E ;; ;; perform 0-79 steps ;; vmovdqa K, [rel K00_19] ;; do rounds 0...15 %assign I 0 %rep 16 SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 ROTATE_ARGS %assign I 
(I+1) %endrep ;; do rounds 16...19 vmovdqa W16, [rsp + ((16 - 16) & 15) * 32] vmovdqa W15, [rsp + ((16 - 15) & 15) * 32] %rep 4 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 ROTATE_ARGS %assign I (I+1) %endrep ;; do rounds 20...39 vmovdqa K, [rel K20_39] %rep 20 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1 ROTATE_ARGS %assign I (I+1) %endrep ;; do rounds 40...59 vmovdqa K, [rel K40_59] %rep 20 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2 ROTATE_ARGS %assign I (I+1) %endrep ;; do rounds 60...79 vmovdqa K, [rel K60_79] %rep 20 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3 ROTATE_ARGS %assign I (I+1) %endrep vpaddd A,A,AA vpaddd B,B,BB vpaddd C,C,CC vpaddd D,D,DD vpaddd E,E,EE sub num_blks, 1 jne lloop ; write out digests vmovdqu [state + 0*SHA1_DIGEST_ROW_SIZE], A vmovdqu [state + 1*SHA1_DIGEST_ROW_SIZE], B vmovdqu [state + 2*SHA1_DIGEST_ROW_SIZE], C vmovdqu [state + 3*SHA1_DIGEST_ROW_SIZE], D vmovdqu [state + 4*SHA1_DIGEST_ROW_SIZE], E DBGPRINTL_YMM "Sha1-AVX2 outgoing transposed digest", A, B, C, D, E ;; update input pointers add inp0, IDX add inp1, IDX add inp2, IDX add inp3, IDX add inp4, IDX add inp5, IDX add inp6, IDX add inp7, IDX mov [state+_data_ptr_sha1+0*PTR_SZ], inp0 mov [state+_data_ptr_sha1+1*PTR_SZ], inp1 mov [state+_data_ptr_sha1+2*PTR_SZ], inp2 mov [state+_data_ptr_sha1+3*PTR_SZ], inp3 mov [state+_data_ptr_sha1+4*PTR_SZ], inp4 mov [state+_data_ptr_sha1+5*PTR_SZ], inp5 mov [state+_data_ptr_sha1+6*PTR_SZ], inp6 mov [state+_data_ptr_sha1+7*PTR_SZ], inp7 ;;;;;;;;;;;;;;;; ;; Postamble add rsp, FRAMESZ ret intel-ipsec-mb-0.48/avx2/sha256_oct_avx2.asm000066400000000000000000000467041321406316400204670ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;; ;; code to compute oct SHA256 using SSE-256 ;; outer calling routine takes care of save and restore of XMM registers ;; Logic designed/laid out by JDG ;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15 ;; Stack must be aligned to 32 bytes before call ;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 ;; Windows preserves: rcx rbp r15 ;; ;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 ;; Linux preserves: rdi rbp r15 ;; ;; clobbers ymm0-15 %include "os.asm" ;%define DO_DBGPRINT %include "dbgprint.asm" %include "mb_mgr_datastruct.asm" section .data default rel align 64 ;global K256_8 K256_8: dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 dq 0x7137449171374491, 0x7137449171374491 dq 0x7137449171374491, 0x7137449171374491 dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 dq 0x3956c25b3956c25b, 0x3956c25b3956c25b dq 0x3956c25b3956c25b, 0x3956c25b3956c25b dq 0x59f111f159f111f1, 0x59f111f159f111f1 dq 0x59f111f159f111f1, 0x59f111f159f111f1 dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 dq 0x12835b0112835b01, 0x12835b0112835b01 dq 0x12835b0112835b01, 0x12835b0112835b01 dq 0x243185be243185be, 0x243185be243185be dq 0x243185be243185be, 0x243185be243185be dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc dq 0x76f988da76f988da, 0x76f988da76f988da dq 0x76f988da76f988da, 0x76f988da76f988da dq 0x983e5152983e5152, 0x983e5152983e5152 dq 0x983e5152983e5152, 0x983e5152983e5152 dq 0xa831c66da831c66d, 0xa831c66da831c66d dq 0xa831c66da831c66d, 0xa831c66da831c66d dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 dq 0x06ca635106ca6351, 0x06ca635106ca6351 dq 0x06ca635106ca6351, 0x06ca635106ca6351 dq 0x1429296714292967, 0x1429296714292967 dq 0x1429296714292967, 0x1429296714292967 dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc dq 
0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc dq 0x53380d1353380d13, 0x53380d1353380d13 dq 0x53380d1353380d13, 0x53380d1353380d13 dq 0x650a7354650a7354, 0x650a7354650a7354 dq 0x650a7354650a7354, 0x650a7354650a7354 dq 0x766a0abb766a0abb, 0x766a0abb766a0abb dq 0x766a0abb766a0abb, 0x766a0abb766a0abb dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e dq 0x92722c8592722c85, 0x92722c8592722c85 dq 0x92722c8592722c85, 0x92722c8592722c85 dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 dq 0xa81a664ba81a664b, 0xa81a664ba81a664b dq 0xa81a664ba81a664b, 0xa81a664ba81a664b dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 dq 0xd192e819d192e819, 0xd192e819d192e819 dq 0xd192e819d192e819, 0xd192e819d192e819 dq 0xd6990624d6990624, 0xd6990624d6990624 dq 0xd6990624d6990624, 0xd6990624d6990624 dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 dq 0x106aa070106aa070, 0x106aa070106aa070 dq 0x106aa070106aa070, 0x106aa070106aa070 dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 dq 0x1e376c081e376c08, 0x1e376c081e376c08 dq 0x1e376c081e376c08, 0x1e376c081e376c08 dq 0x2748774c2748774c, 0x2748774c2748774c dq 0x2748774c2748774c, 0x2748774c2748774c dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 dq 0x748f82ee748f82ee, 0x748f82ee748f82ee dq 0x748f82ee748f82ee, 0x748f82ee748f82ee dq 0x78a5636f78a5636f, 0x78a5636f78a5636f dq 0x78a5636f78a5636f, 0x78a5636f78a5636f dq 0x84c8781484c87814, 0x84c8781484c87814 dq 0x84c8781484c87814, 0x84c8781484c87814 dq 0x8cc702088cc70208, 0x8cc702088cc70208 dq 0x8cc702088cc70208, 0x8cc702088cc70208 dq 0x90befffa90befffa, 0x90befffa90befffa dq 0x90befffa90befffa, 0x90befffa90befffa dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 ;ddq 0x0c0d0e0f08090a0b0405060700010203 dq 0x0405060700010203, 0x0c0d0e0f08090a0b dq 0x0405060700010203, 0x0c0d0e0f08090a0b align 64 MKGLOBAL(K256,data,internal) K256: dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 section .text %ifdef LINUX %define arg1 rdi %define arg2 rsi %define reg3 rcx %define reg4 rdx 
%else ; Windows definitions %define arg1 rcx %define arg2 rdx %define reg3 rsi %define reg4 rdi %endif ; Common definitions %define STATE arg1 %define INP_SIZE arg2 %define IDX rax %define ROUND rbx %define TBL reg3 %define inp0 r9 %define inp1 r10 %define inp2 r11 %define inp3 r12 %define inp4 r13 %define inp5 r14 %define inp6 r8 %define inp7 reg4 ; ymm0 a ; ymm1 b ; ymm2 c ; ymm3 d ; ymm4 e ; ymm5 f ; ymm6 g TMP0 ; ymm7 h TMP1 ; ymm8 T1 TT0 ; ymm9 TT1 ; ymm10 TT2 ; ymm11 TT3 ; ymm12 a0 TT4 ; ymm13 a1 TT5 ; ymm14 a2 TT6 ; ymm15 TMP TT7 %define a ymm0 %define b ymm1 %define c ymm2 %define d ymm3 %define e ymm4 %define f ymm5 %define g ymm6 %define h ymm7 %define T1 ymm8 %define a0 ymm12 %define a1 ymm13 %define a2 ymm14 %define TMP ymm15 %define TMP0 ymm6 %define TMP1 ymm7 %define TT0 ymm8 %define TT1 ymm9 %define TT2 ymm10 %define TT3 ymm11 %define TT4 ymm12 %define TT5 ymm13 %define TT6 ymm14 %define TT7 ymm15 %define SZ8 8*SHA256_DIGEST_WORD_SIZE ; Size of one vector register %define ROUNDS 64*SZ8 ; Define stack usage ;; Assume stack aligned to 32 bytes before call ;; Therefore FRAMESZ mod 32 must be 32-8 = 24 struc stack_frame .data resb 16*SZ8 .digest resb 8*SZ8 .ytmp resb 4*SZ8 .align resb 24 endstruc %define FRAMESZ stack_frame_size %define _DIGEST stack_frame.digest %define _YTMP stack_frame.ytmp %define YTMP0 rsp + _YTMP + 0*SZ8 %define YTMP1 rsp + _YTMP + 1*SZ8 %define YTMP2 rsp + _YTMP + 2*SZ8 %define YTMP3 rsp + _YTMP + 3*SZ8 %define VMOVPS vmovups ; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1 ; "transpose" data in {r0...r7} using temps {t0...t1} ; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7} ; r0 = {a7 a6 a5 a4 a3 a2 a1 a0} ; r1 = {b7 b6 b5 b4 b3 b2 b1 b0} ; r2 = {c7 c6 c5 c4 c3 c2 c1 c0} ; r3 = {d7 d6 d5 d4 d3 d2 d1 d0} ; r4 = {e7 e6 e5 e4 e3 e2 e1 e0} ; r5 = {f7 f6 f5 f4 f3 f2 f1 f0} ; r6 = {g7 g6 g5 g4 g3 g2 g1 g0} ; r7 = {h7 h6 h5 h4 h3 h2 h1 h0} ; ; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7} ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0} ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1} ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2} ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3} ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4} ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5} ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6} ; r7 = {h7 g7 f7 e7 d7 c7 b7 a7} ; %macro TRANSPOSE8 10 %define %%r0 %1 %define %%r1 %2 %define %%r2 %3 %define %%r3 %4 %define %%r4 %5 %define %%r5 %6 %define %%r6 %7 %define %%r7 %8 %define %%t0 %9 %define %%t1 %10 ; process top half (r0..r3) {a...d} vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0} vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2} vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0} vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2} vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1} vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2} vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3} vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0} ; use r2 in place of t0 ; process bottom half (r4..r7) {e...h} vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0} vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2} vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0} vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2} vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1} vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2} vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3} vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0} vperm2f128 %%r6, %%r5, %%r1, 0x13 
; h6...a6 vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2 vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5 vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1 vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7 vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3 vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4 vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0 %endmacro %macro ROTATE_ARGS 0 %xdefine TMP_ h %xdefine h g %xdefine g f %xdefine f e %xdefine e d %xdefine d c %xdefine c b %xdefine b a %xdefine a TMP_ %endm ; PRORD reg, imm, tmp %macro PRORD 3 %define %%reg %1 %define %%imm %2 %define %%tmp %3 vpslld %%tmp, %%reg, (32-(%%imm)) vpsrld %%reg, %%reg, %%imm vpor %%reg, %%reg, %%tmp %endmacro ; non-destructive ; PRORD_nd reg, imm, tmp, src %macro PRORD_nd 4 %define %%reg %1 %define %%imm %2 %define %%tmp %3 %define %%src %4 ;vmovdqa %%tmp, %%reg vpslld %%tmp, %%src, (32-(%%imm)) vpsrld %%reg, %%src, %%imm vpor %%reg, %%reg, %%tmp %endmacro ; PRORD dst/src, amt %macro PRORD 2 PRORD %1, %2, TMP %endmacro ; PRORD_nd dst, src, amt %macro PRORD_nd 3 PRORD_nd %1, %3, TMP, %2 %endmacro ;; arguments passed implicitly in preprocessor symbols i, a...h %macro ROUND_00_15 2 %define %%T1 %1 %define %%i %2 PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5) vpxor a2, f, g ; ch: a2 = f^g vpand a2, a2, e ; ch: a2 = (f^g)&e vpxor a2, a2, g ; a2 = ch PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25) vmovdqa [SZ8*(%%i&0xf) + rsp], %%T1 vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5) PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11) vpaddd h, h, a2 ; h = h + ch PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11) vpaddd h, h, %%T1 ; h = h + ch + W + K vpxor a0, a0, a1 ; a0 = sigma1 PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22) vpxor %%T1, a, c ; maj: T1 = a^c add ROUND, SZ8 ; ROUND++ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b vpaddd h, h, a0 vpaddd d, d, h vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11) PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13) vpxor a2, a2, a1 ; a2 = sig0 vpand a1, a, c ; maj: a1 = a&c vpor a1, a1, %%T1 ; a1 = maj vpaddd h, h, a1 ; h = h + ch + W + K + maj vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0 ROTATE_ARGS %endm ;; arguments passed implicitly in preprocessor symbols i, a...h %macro ROUND_16_XX 2 %define %%T1 %1 %define %%i %2 vmovdqa %%T1, [SZ8*((%%i-15)&0xf) + rsp] vmovdqa a1, [SZ8*((%%i-2)&0xf) + rsp] vmovdqa a0, %%T1 PRORD %%T1, 18-7 vmovdqa a2, a1 PRORD a1, 19-17 vpxor %%T1, %%T1, a0 PRORD %%T1, 7 vpxor a1, a1, a2 PRORD a1, 17 vpsrld a0, a0, 3 vpxor %%T1, %%T1, a0 vpsrld a2, a2, 10 vpxor a1, a1, a2 vpaddd %%T1, %%T1, [SZ8*((%%i-16)&0xf) + rsp] vpaddd a1, a1, [SZ8*((%%i-7)&0xf) + rsp] vpaddd %%T1, %%T1, a1 ROUND_00_15 %%T1, %%i %endm ;; SHA256_ARGS: ;; UINT128 digest[8]; // transposed digests ;; UINT8 *data_ptr[4]; ;; ;; void sha256_oct_avx2(SHA256_ARGS *args, UINT64 bytes); ;; arg 1 : STATE : pointer to array of pointers to input data ;; arg 2 : INP_SIZE : size of input in blocks MKGLOBAL(sha256_oct_avx2,function,internal) align 16 sha256_oct_avx2: ; general registers preserved in outer calling routine ; outer calling routine saves all the XMM registers sub rsp, FRAMESZ ;; Load the pre-transposed incoming digest. 
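	;; (added note) The digest is kept transposed in the args structure:
	;; each SHA256_DIGEST_ROW_SIZE row holds one digest word (a..h) for
	;; every lane, so a single vmovdqu below fetches word N of all eight
	;; lanes into one YMM register.  A rough C sketch of the assumed
	;; layout (the authoritative definition is in mb_mgr_datastruct.asm):
	;;
	;;     /* uint32_t digest[8 /* words */][num_lanes];      */
	;;     /* word W of lane L lives at digest[W][L], i.e.    */
	;;     /* STATE + W*SHA256_DIGEST_ROW_SIZE + L*4          */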
vmovdqu a,[STATE + 0*SHA256_DIGEST_ROW_SIZE] vmovdqu b,[STATE + 1*SHA256_DIGEST_ROW_SIZE] vmovdqu c,[STATE + 2*SHA256_DIGEST_ROW_SIZE] vmovdqu d,[STATE + 3*SHA256_DIGEST_ROW_SIZE] vmovdqu e,[STATE + 4*SHA256_DIGEST_ROW_SIZE] vmovdqu f,[STATE + 5*SHA256_DIGEST_ROW_SIZE] vmovdqu g,[STATE + 6*SHA256_DIGEST_ROW_SIZE] vmovdqu h,[STATE + 7*SHA256_DIGEST_ROW_SIZE] lea TBL,[rel K256_8] ;; load the address of each of the 4 message lanes ;; getting ready to transpose input onto stack mov inp0,[STATE + _data_ptr_sha256 + 0*PTR_SZ] mov inp1,[STATE + _data_ptr_sha256 + 1*PTR_SZ] mov inp2,[STATE + _data_ptr_sha256 + 2*PTR_SZ] mov inp3,[STATE + _data_ptr_sha256 + 3*PTR_SZ] mov inp4,[STATE + _data_ptr_sha256 + 4*PTR_SZ] mov inp5,[STATE + _data_ptr_sha256 + 5*PTR_SZ] mov inp6,[STATE + _data_ptr_sha256 + 6*PTR_SZ] mov inp7,[STATE + _data_ptr_sha256 + 7*PTR_SZ] xor IDX, IDX lloop: xor ROUND, ROUND ;; save old digest vmovdqa [rsp + _DIGEST + 0*SZ8], a vmovdqa [rsp + _DIGEST + 1*SZ8], b vmovdqa [rsp + _DIGEST + 2*SZ8], c vmovdqa [rsp + _DIGEST + 3*SZ8], d vmovdqa [rsp + _DIGEST + 4*SZ8], e vmovdqa [rsp + _DIGEST + 5*SZ8], f vmovdqa [rsp + _DIGEST + 6*SZ8], g vmovdqa [rsp + _DIGEST + 7*SZ8], h DBGPRINTL_YMM "transposed digest ", a,b,c,d,e,f,g,h %assign i 0 %rep 2 VMOVPS TT0,[inp0+IDX+i*32] VMOVPS TT1,[inp1+IDX+i*32] VMOVPS TT2,[inp2+IDX+i*32] VMOVPS TT3,[inp3+IDX+i*32] VMOVPS TT4,[inp4+IDX+i*32] VMOVPS TT5,[inp5+IDX+i*32] VMOVPS TT6,[inp6+IDX+i*32] VMOVPS TT7,[inp7+IDX+i*32] vmovdqa [YTMP0], g vmovdqa [YTMP1], h TRANSPOSE8 TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7, TMP0, TMP1 DBGPRINTL_YMM "transposed input ", TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7 vmovdqa TMP1, [rel PSHUFFLE_BYTE_FLIP_MASK] vmovdqa g, [YTMP0] vpshufb TT0, TT0, TMP1 vpshufb TT1, TT1, TMP1 vpshufb TT2, TT2, TMP1 vpshufb TT3, TT3, TMP1 vpshufb TT4, TT4, TMP1 vpshufb TT5, TT5, TMP1 vpshufb TT6, TT6, TMP1 vpshufb TT7, TT7, TMP1 vmovdqa h, [YTMP1] vmovdqa [YTMP0], TT4 vmovdqa [YTMP1], TT5 vmovdqa [YTMP2], TT6 vmovdqa [YTMP3], TT7 ROUND_00_15 TT0,(i*8+0) vmovdqa TT0, [YTMP0] ROUND_00_15 TT1,(i*8+1) vmovdqa TT1, [YTMP1] ROUND_00_15 TT2,(i*8+2) vmovdqa TT2, [YTMP2] ROUND_00_15 TT3,(i*8+3) vmovdqa TT3, [YTMP3] ROUND_00_15 TT0,(i*8+4) ROUND_00_15 TT1,(i*8+5) ROUND_00_15 TT2,(i*8+6) ROUND_00_15 TT3,(i*8+7) %assign i (i+1) %endrep add IDX, 4*4*4 %assign i (i*8) jmp Lrounds_16_xx align 16 Lrounds_16_xx: %rep 16 ROUND_16_XX T1, i %assign i (i+1) %endrep cmp ROUND,ROUNDS jb Lrounds_16_xx ;; add old digest vpaddd a, a, [rsp + _DIGEST + 0*SZ8] vpaddd b, b, [rsp + _DIGEST + 1*SZ8] vpaddd c, c, [rsp + _DIGEST + 2*SZ8] vpaddd d, d, [rsp + _DIGEST + 3*SZ8] vpaddd e, e, [rsp + _DIGEST + 4*SZ8] vpaddd f, f, [rsp + _DIGEST + 5*SZ8] vpaddd g, g, [rsp + _DIGEST + 6*SZ8] vpaddd h, h, [rsp + _DIGEST + 7*SZ8] sub INP_SIZE, 1 ;; unit is blocks jne lloop ; write back to memory (state object) the transposed digest vmovdqu [STATE + 0*SHA256_DIGEST_ROW_SIZE],a vmovdqu [STATE + 1*SHA256_DIGEST_ROW_SIZE],b vmovdqu [STATE + 2*SHA256_DIGEST_ROW_SIZE],c vmovdqu [STATE + 3*SHA256_DIGEST_ROW_SIZE],d vmovdqu [STATE + 4*SHA256_DIGEST_ROW_SIZE],e vmovdqu [STATE + 5*SHA256_DIGEST_ROW_SIZE],f vmovdqu [STATE + 6*SHA256_DIGEST_ROW_SIZE],g vmovdqu [STATE + 7*SHA256_DIGEST_ROW_SIZE],h DBGPRINTL_YMM "sha256 digest on exit ", a,b,c,d,e,f,g,h ; update input pointers add inp0, IDX mov [STATE + _data_ptr_sha256 + 0*8], inp0 add inp1, IDX mov [STATE + _data_ptr_sha256 + 1*8], inp1 add inp2, IDX mov [STATE + _data_ptr_sha256 + 2*8], inp2 add inp3, IDX mov [STATE + _data_ptr_sha256 + 3*8], inp3 add inp4, 
IDX mov [STATE + _data_ptr_sha256 + 4*8], inp4 add inp5, IDX mov [STATE + _data_ptr_sha256 + 5*8], inp5 add inp6, IDX mov [STATE + _data_ptr_sha256 + 6*8], inp6 add inp7, IDX mov [STATE + _data_ptr_sha256 + 7*8], inp7 ;;;;;;;;;;;;;;;; ;; Postamble add rsp, FRAMESZ ret intel-ipsec-mb-0.48/avx2/sha512_x4_avx2.asm000066400000000000000000000422441321406316400202230ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;; ;; code to compute quad SHA512 using AVX ;; use YMMs to tackle the larger digest size ;; outer calling routine takes care of save and restore of XMM registers ;; Logic designed/laid out by JDG ;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15 ;; Stack must be aligned to 32 bytes before call ;; Windows clobbers: rax rbx rdx r8 r9 r10 r11 r12 ;; Windows preserves: rcx rsi rdi rbp r13 r14 r15 ;; ;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 ;; Linux preserves: rcx rdx rdi rbp r13 r14 r15 ;; ;; clobbers ymm0-15 %include "os.asm" ;%define DO_DBGPRINT %include "dbgprint.asm" %include "mb_mgr_datastruct.asm" section .data default rel align 64 K512_4: dq 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22 dq 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc dq 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538 dq 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019 dq 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118 dq 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242 dq 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe dq 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2 dq 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1 dq 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235 dq 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694 dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2 dq 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3 dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5 dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65 dq 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275 dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483 dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4 dq 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5 dq 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab dq 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210 dq 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4 dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2 dq 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725 dq 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f dq 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70 dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc dq 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926 dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed 
dq 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df dq 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8 dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6 dq 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364 dq 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001 dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791 dq 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30 dq 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218 dq 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910 dq 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8 dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8 dq 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53 dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99 dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8 dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63 dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373 dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3 dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc dq 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60 dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72 dq 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec dq 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28 dq 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9 dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915 dq 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b dq 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c dq 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207 dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178 dq 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6 dq 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae dq 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b dq 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84 dq 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493 dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc dq 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6 dq 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a dq 0x5fcb6fab3ad6faec, 
0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec dq 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817 align 32 PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x08090a0b0c0d0e0f0001020304050607 dq 0x0001020304050607, 0x08090a0b0c0d0e0f ;ddq 0x18191a1b1c1d1e1f1011121314151617 dq 0x1011121314151617, 0x18191a1b1c1d1e1f section .text %ifdef LINUX %define arg1 rdi %define arg2 rsi %else %define arg1 rcx %define arg2 rdx %endif ; Common definitions %define STATE arg1 %define INP_SIZE arg2 %define IDX rax %define ROUND rbx %define TBL r8 %define inp0 r9 %define inp1 r10 %define inp2 r11 %define inp3 r12 %define a ymm0 %define b ymm1 %define c ymm2 %define d ymm3 %define e ymm4 %define f ymm5 %define g ymm6 %define h ymm7 %define a0 ymm8 %define a1 ymm9 %define a2 ymm10 %define TT0 ymm14 %define TT1 ymm13 %define TT2 ymm12 %define TT3 ymm11 %define TT4 ymm10 %define TT5 ymm9 %define T1 ymm14 %define TMP ymm15 %define SZ4 4*SHA512_DIGEST_WORD_SIZE ; Size of one vector register %define ROUNDS 80*SZ4 ; Define stack usage ;; Assume stack aligned to 32 bytes before call ;; Therefore FRAMESZ mod 32 must be 32-8 = 24 struc stack_frame .data resb 16*SZ4 .digest resb NUM_SHA512_DIGEST_WORDS*SZ4 .align resb 24 endstruc %define _DIGEST stack_frame.digest %define VMOVPD vmovupd ; operates on YMMs ; transpose r0, r1, r2, r3, t0, t1 ; "transpose" data in {r0..r3} using temps {t0..t3} ; Input looks like: {r0 r1 r2 r3} ; r0 = {a7 a6 a5 a4 a3 a2 a1 a0} ; r1 = {b7 b6 b5 b4 b3 b2 b1 b0} ; r2 = {c7 c6 c5 c4 c3 c2 c1 c0} ; r3 = {d7 d6 d5 d4 d3 d2 d1 d0} ; ; output looks like: {t0 r1 r0 r3} ; t0 = {d1 d0 c1 c0 b1 b0 a1 a0} ; r1 = {d3 d2 c3 c2 b3 b2 a3 a2} ; r0 = {d5 d4 c5 c4 b5 b4 a5 a4} ; r3 = {d7 d6 c7 c6 b7 b6 a7 a6} ; %macro TRANSPOSE 6 %define %%r0 %1 %define %%r1 %2 %define %%r2 %3 %define %%r3 %4 %define %%t0 %5 %define %%t1 %6 ; vshufps does not cross the mid-way boundary and hence is cheaper vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0} vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2} vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0} vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2} vperm2f128 %%r1, %%r0, %%r2, 0x20; r1 = {d3 d2 c3 c2 b3 b2 a3 a2} vperm2f128 %%r3, %%r0, %%r2, 0x31; r3 = {d7 d6 c7 c6 b7 b6 a7 a6} vperm2f128 %%r0, %%t0, %%t1, 0x31; r0 = {d5 d4 c5 c4 b5 b4 a5 a4} ; now ok to clobber t0 vperm2f128 %%t0, %%t0, %%t1, 0x20; t0 = {d1 d0 c1 c0 b1 b0 a1 a0} %endmacro %macro ROTATE_ARGS 0 %xdefine TMP_ h %xdefine h g %xdefine g f %xdefine f e %xdefine e d %xdefine d c %xdefine c b %xdefine b a %xdefine a TMP_ %endm ; PRORQ reg, imm, tmp ; packed-rotate-right-double ; does a rotate by doing two shifts and an or %macro PRORQ 3 %define %%reg %1 %define %%imm %2 %define %%tmp %3 vpsllq %%tmp, %%reg, (64-(%%imm)) vpsrlq %%reg, %%reg, %%imm vpor %%reg, %%reg, %%tmp %endmacro ; non-destructive ; PRORQ_nd reg, imm, tmp, src %macro PRORQ_nd 4 %define %%reg %1 %define %%imm %2 %define %%tmp %3 %define %%src %4 vpsllq %%tmp, %%src, (64-(%%imm)) vpsrlq %%reg, %%src, %%imm vpor %%reg, %%reg, %%tmp %endmacro ; PRORQ dst/src, amt %macro PRORQ 2 PRORQ %1, %2, TMP %endmacro ; PRORQ_nd dst, src, amt %macro PRORQ_nd 3 PRORQ_nd %1, %3, TMP, %2 %endmacro ;; arguments passed implicitly in preprocessor symbols i, a...h %macro ROUND_00_15 2 %define %%T1 %1 %define %%i %2 PRORQ_nd a0, e, (18-14) ; sig1: a0 = (e >> 4) vpxor a2, f, g ; ch: a2 = f^g vpand a2, a2, e ; ch: a2 = (f^g)&e vpxor a2, a2, g ; a2 = ch PRORQ_nd a1, e, 41 ; sig1: a1 = (e >> 41) 
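	;; (added note) Together with the steps below this computes one
	;; standard FIPS 180-4 SHA-512 round; as a plain C reference
	;; (a sketch only, ror64() meaning rotate right by the given count):
	;;
	;;     /* sigma1 = ror64(e,14) ^ ror64(e,18) ^ ror64(e,41);     */
	;;     /* ch     = g ^ (e & (f ^ g));  /* == (e&f) ^ (~e&g) */  */
	;;     /* T1     = h + sigma1 + ch + K[i] + W[i];               */
	;;     /* sigma0 = ror64(a,28) ^ ror64(a,34) ^ ror64(a,39);     */
	;;     /* maj    = (a & b) ^ (a & c) ^ (b & c);                 */
	;;     /* T2     = sigma0 + maj;  e' = d + T1;  a' = T1 + T2;   */
	;;
	;; ROTATE_ARGS then renames the registers instead of moving data.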
vmovdqa [SZ4*(%%i&0xf) + rsp],%%T1 vpaddq %%T1,%%T1,[TBL + ROUND] ; T1 = W + K vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5) PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18) vpaddq h, h, a2 ; h = h + ch PRORQ_nd a2, a, (34-28) ; sig0: a2 = (a >> 6) vpaddq h, h, %%T1 ; h = h + ch + W + K vpxor a0, a0, a1 ; a0 = sigma1 vmovdqa %%T1, a ; maj: T1 = a PRORQ_nd a1, a, 39 ; sig0: a1 = (a >> 39) vpxor %%T1, %%T1, c ; maj: T1 = a^c add ROUND, SZ4 ; ROUND++ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b vpaddq h, h, a0 vpaddq d, d, h vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11) PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34) vpxor a2, a2, a1 ; a2 = sig0 vpand a1, a, c ; maj: a1 = a&c vpor a1, a1, %%T1 ; a1 = maj vpaddq h, h, a1 ; h = h + ch + W + K + maj vpaddq h, h, a2 ; h = h + ch + W + K + maj + sigma0 ROTATE_ARGS %endm ;; arguments passed implicitly in preprocessor symbols i, a...h %macro ROUND_16_XX 2 %define %%T1 %1 %define %%i %2 vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp] vmovdqa a1, [SZ4*((%%i-2)&0xf) + rsp] vmovdqa a0, %%T1 PRORQ %%T1, 8-1 vmovdqa a2, a1 PRORQ a1, 61-19 vpxor %%T1, %%T1, a0 PRORQ %%T1, 1 vpxor a1, a1, a2 PRORQ a1, 19 vpsrlq a0, a0, 7 vpxor %%T1, %%T1, a0 vpsrlq a2, a2, 6 vpxor a1, a1, a2 vpaddq %%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp] vpaddq a1, a1, [SZ4*((%%i-7)&0xf) + rsp] vpaddq %%T1, %%T1, a1 ROUND_00_15 %%T1, %%i %endm ;; void sha512_x4_avx2(void *STATE, const int INP_SIZE) ;; arg 1 : STATE : pointer to input data ;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1) MKGLOBAL(sha512_x4_avx2,function,internal) align 32 sha512_x4_avx2: ; general registers preserved in outer calling routine ; outer calling routine saves all the XMM registers sub rsp, stack_frame_size ;; Load the pre-transposed incoming digest. vmovdqu a, [STATE+ 0*SHA512_DIGEST_ROW_SIZE] vmovdqu b, [STATE+ 1*SHA512_DIGEST_ROW_SIZE] vmovdqu c, [STATE+ 2*SHA512_DIGEST_ROW_SIZE] vmovdqu d, [STATE+ 3*SHA512_DIGEST_ROW_SIZE] vmovdqu e, [STATE+ 4*SHA512_DIGEST_ROW_SIZE] vmovdqu f, [STATE+ 5*SHA512_DIGEST_ROW_SIZE] vmovdqu g, [STATE+ 6*SHA512_DIGEST_ROW_SIZE] vmovdqu h, [STATE+ 7*SHA512_DIGEST_ROW_SIZE] DBGPRINTL_YMM "sha512-avx2 Incoming digest", a, b, c, d, e, f, g, h lea TBL,[K512_4] ;; load the address of each of the MAX_LANES (4) message lanes ;; getting ready to transpose input onto stack mov inp0,[STATE + _data_ptr_sha512 + 0*PTR_SZ] mov inp1,[STATE + _data_ptr_sha512 + 1*PTR_SZ] mov inp2,[STATE + _data_ptr_sha512 + 2*PTR_SZ] mov inp3,[STATE + _data_ptr_sha512 + 3*PTR_SZ] xor IDX, IDX lloop: xor ROUND, ROUND ;; save old digest vmovdqa [rsp + _DIGEST + 0*SZ4], a vmovdqa [rsp + _DIGEST + 1*SZ4], b vmovdqa [rsp + _DIGEST + 2*SZ4], c vmovdqa [rsp + _DIGEST + 3*SZ4], d vmovdqa [rsp + _DIGEST + 4*SZ4], e vmovdqa [rsp + _DIGEST + 5*SZ4], f vmovdqa [rsp + _DIGEST + 6*SZ4], g vmovdqa [rsp + _DIGEST + 7*SZ4], h %assign i 0 %rep 4 ;; load up the shuffler for little-endian to big-endian format vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK] VMOVPD TT2,[inp0+IDX+i*32] VMOVPD TT1,[inp1+IDX+i*32] VMOVPD TT4,[inp2+IDX+i*32] VMOVPD TT3,[inp3+IDX+i*32] TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5 DBGPRINTL_YMM "sha512-avx2 Incoming data", TT1, TT2, TT3, TT4 vpshufb TT0, TT0, TMP vpshufb TT1, TT1, TMP vpshufb TT2, TT2, TMP vpshufb TT3, TT3, TMP ROUND_00_15 TT0,(i*4+0) ROUND_00_15 TT1,(i*4+1) ROUND_00_15 TT2,(i*4+2) ROUND_00_15 TT3,(i*4+3) %assign i (i+1) %endrep ;; Increment IDX by message block size == 8 (loop) * 16 (XMM width in bytes) add IDX, 4 * 32 %assign i (i*4) jmp Lrounds_16_xx align 16 Lrounds_16_xx: %rep 16 ROUND_16_XX T1, i %assign i 
(i+1) %endrep cmp ROUND,ROUNDS jb Lrounds_16_xx ;; add old digest vpaddq a, a, [rsp + _DIGEST + 0*SZ4] vpaddq b, b, [rsp + _DIGEST + 1*SZ4] vpaddq c, c, [rsp + _DIGEST + 2*SZ4] vpaddq d, d, [rsp + _DIGEST + 3*SZ4] vpaddq e, e, [rsp + _DIGEST + 4*SZ4] vpaddq f, f, [rsp + _DIGEST + 5*SZ4] vpaddq g, g, [rsp + _DIGEST + 6*SZ4] vpaddq h, h, [rsp + _DIGEST + 7*SZ4] sub INP_SIZE, 1 ;; consumed one message block jne lloop ; write back to memory (state object) the transposed digest vmovdqu [STATE+ 0*SHA512_DIGEST_ROW_SIZE ],a vmovdqu [STATE+ 1*SHA512_DIGEST_ROW_SIZE ],b vmovdqu [STATE+ 2*SHA512_DIGEST_ROW_SIZE ],c vmovdqu [STATE+ 3*SHA512_DIGEST_ROW_SIZE ],d vmovdqu [STATE+ 4*SHA512_DIGEST_ROW_SIZE ],e vmovdqu [STATE+ 5*SHA512_DIGEST_ROW_SIZE ],f vmovdqu [STATE+ 6*SHA512_DIGEST_ROW_SIZE ],g vmovdqu [STATE+ 7*SHA512_DIGEST_ROW_SIZE ],h DBGPRINTL_YMM "sha512-avx2 Outgoing digest", a, b, c, d, e, f, g, h ;; update input data pointers add inp0, IDX mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0 add inp1, IDX mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1 add inp2, IDX mov [STATE + _data_ptr_sha512 + 2*PTR_SZ], inp2 add inp3, IDX mov [STATE + _data_ptr_sha512 + 3*PTR_SZ], inp3 ;;;;;;;;;;;;;;;; ;; Postamble add rsp, stack_frame_size ; outer calling routine restores XMM and other GP registers ret intel-ipsec-mb-0.48/avx512/000077500000000000000000000000001321406316400153035ustar00rootroot00000000000000intel-ipsec-mb-0.48/avx512/des_x16_avx512.asm000066400000000000000000002067561321406316400204040ustar00rootroot00000000000000;; ;; Copyright (c) 2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;; ;; Authors: ;; Shay Gueron (1, 2), Regev Shemy (2), Tomasz kantecki (2) ;; (1) University of Haifa, Israel ;; (2) Intel Corporation ;; In System V AMD64 ABI ;; calle saves: RBX, RBP, R12-R15 ;; Windows x64 ABI ;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15 ;; ;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Windows clobbers: RAX R8 R9 R10 R11 ;; Windows preserves: RBX RCX RDX RBP RSI RDI R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Linux clobbers: RAX RCX RDX R10 R11 ;; Linux preserves: RBX RBP RSI RDI R8 R9 R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Clobbers ZMM0-31 and K1 to K7 %include "os.asm" %include "reg_sizes.asm" %include "mb_mgr_datastruct.asm" %include "constants.asm" ;%define DO_DBGPRINT %include "dbgprint.asm" %ifdef LINUX %define arg1 rdi %define arg2 rsi %define arg3 rdx %define arg4 rcx %else %define arg1 rcx %define arg2 rdx %define arg3 r8 %define arg4 r9 %endif %define STATE arg1 %define SIZE arg2 %define OFFSET rax %define IA0 arg3 %define IA1 arg4 %define IA2 r10 %define INP0 r11 %define INP1 r12 %define INP2 r13 %define INP3 r14 %define INP4 r15 %define KSOFFSET r11 %define ZW0 zmm0 %define ZW1 zmm1 %define ZW2 zmm2 %define ZW3 zmm3 %define ZW4 zmm4 %define ZW5 zmm5 %define ZW6 zmm6 %define ZW7 zmm7 %define ZW8 zmm8 %define ZW9 zmm9 %define ZW10 zmm10 %define ZW11 zmm11 %define ZW12 zmm12 %define ZW13 zmm13 %define ZW14 zmm14 %define ZW15 zmm15 %define ZIV0 zmm16 %define ZIV1 zmm17 %define ZTMP0 zmm18 %define ZTMP1 zmm19 %define ZTMP2 zmm20 %define ZTMP3 zmm21 %define ZTMP4 zmm22 %define ZTMP5 zmm23 %define ZTMP6 zmm24 %define ZTMP7 zmm25 %define ZTMP8 zmm26 %define ZTMP9 zmm27 %define ZTMP10 zmm28 %define ZTMP11 zmm29 %define ZTMP12 zmm30 %define ZTMP13 zmm31 struc STACKFRAME _key_sched: resq 16*16 ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048 _tmp_iv: resq 16 ; 2 x 64 bytes _tmp_in: resq 16 ; 2 x 64 bytes _tmp_out: resq 16 ; 2 x 64 bytes _tmp_mask: resd 16 ; 1 x 64 bytes _gpr_save: resq 4 ; r12 to r15 _rsp_save: resq 1 endstruc ;;; =========================================================================== ;;; =========================================================================== ;;; MACROS ;;; =========================================================================== ;;; =========================================================================== ;;; =========================================================================== ;;; PERMUTE ;;; =========================================================================== ;;; A [in/out] - zmm register ;;; B [in/out] - zmm register ;;; NSHIFT [in] - constant to shift words by ;;; MASK [in] - zmm or m512 with mask ;;; T0 [clobbered] - temporary zmm register %macro PERMUTE 5 %define %%A %1 %define %%B %2 %define %%NSHIFT %3 %define %%MASK %4 %define %%T0 %5 vpsrld %%T0, %%A, %%NSHIFT vpxord %%T0, %%T0, %%B vpandd %%T0, %%T0, %%MASK vpxord %%B, %%B, %%T0 vpslld %%T0, %%T0, %%NSHIFT vpxord %%A, %%A, %%T0 %endmacro ;;; =========================================================================== ;;; INITIAL PERMUTATION ;;; =========================================================================== ;;; L [in/out] - zmm register ;;; R [in/out] - zmm register ;;; T0 [clobbered] - temporary zmm register %macro IP_Z 3 %define %%L %1 %define %%R %2 %define %%T0 %3 PERMUTE %%R, %%L, 4, [rel init_perm_consts + 0*64], %%T0 PERMUTE %%L, %%R, 16, [rel init_perm_consts 
+ 1*64], %%T0 PERMUTE %%R, %%L, 2, [rel init_perm_consts + 2*64], %%T0 PERMUTE %%L, %%R, 8, [rel init_perm_consts + 3*64], %%T0 PERMUTE %%R, %%L, 1, [rel init_perm_consts + 4*64], %%T0 %endmacro ;;; =========================================================================== ;;; FINAL PERMUTATION ;;; =========================================================================== ;;; L [in/out] - zmm register ;;; R [in/out] - zmm register ;;; T0 [clobbered] - temporary zmm register %macro FP_Z 3 %define %%L %1 %define %%R %2 %define %%T0 %3 PERMUTE %%L, %%R, 1, [rel init_perm_consts + 4*64], %%T0 PERMUTE %%R, %%L, 8, [rel init_perm_consts + 3*64], %%T0 PERMUTE %%L, %%R, 2, [rel init_perm_consts + 2*64], %%T0 PERMUTE %%R, %%L, 16, [rel init_perm_consts + 1*64], %%T0 PERMUTE %%L, %%R, 4, [rel init_perm_consts + 0*64], %%T0 %endmacro ;;; =========================================================================== ;;; P PHASE ;;; =========================================================================== ;;; W0 [in/out] - zmm register ;;; in: vector of 16 x 32bits from S phase ;;; out: permuted in vector ;;; T0-T3 [clobbered] - temporary zmm register %macro P_PHASE 5 %define %%W0 %1 %define %%T0 %2 %define %%T1 %3 %define %%T2 %4 %define %%T3 %5 vprord %%T0, %%W0, 3 vpandd %%T0, %%T0, [rel mask_values + 0*64] vprord %%T1, %%W0, 5 vpandd %%T1, %%T1, [rel mask_values + 1*64] vpord %%T0, %%T0, %%T1 vprord %%T1, %%W0, 24 vpandd %%T1, %%T1, [rel mask_values + 2*64] vprord %%T2, %%W0, 26 vpandd %%T2, %%T2, [rel mask_values + 3*64] vpord %%T1, %%T1, %%T2 vpord %%T0, %%T0, %%T1 vprord %%T1, %%W0, 15 vpandd %%T1, %%T1, [rel mask_values + 4*64] vprord %%T2, %%W0, 17 vpandd %%T2, %%T2, [rel mask_values + 5*64] vpord %%T1, %%T1, %%T2 vprord %%T2, %%W0, 6 vpandd %%T2, %%T2, [rel mask_values + 6*64] vprord %%T3, %%W0, 21 vpandd %%T3, %%T3, [rel mask_values + 7*64] vpord %%T2, %%T2, %%T3 vpord %%T1, %%T1, %%T2 vpord %%T0, %%T0, %%T1 vprord %%T1, %%W0, 12 vpandd %%T1, %%T1, [rel mask_values + 8*64] vprord %%T2, %%W0, 14 vpandd %%T2, %%T2, [rel mask_values + 9*64] vpord %%T1, %%T1, %%T2 vprord %%T2, %%W0, 4 vpandd %%T2, %%T2, [rel mask_values + 10*64] vprord %%T3, %%W0, 11 vpandd %%T3, %%T3, [rel mask_values + 11*64] vpord %%T2, %%T2, %%T3 vpord %%T1, %%T1, %%T2 vpord %%T0, %%T0, %%T1 vprord %%T1, %%W0, 16 vpandd %%T1, %%T1, [rel mask_values + 12*64] vprord %%T2, %%W0, 22 vpandd %%T2, %%T2, [rel mask_values + 13*64] vpord %%T1, %%T1, %%T2 vprord %%T2, %%W0, 19 vpandd %%T2, %%T2, [rel mask_values + 14*64] vprord %%T3, %%W0, 10 vpandd %%T3, %%T3, [rel mask_values + 15*64] vpord %%T2, %%T2, %%T3 vpord %%T1, %%T1, %%T2 vpord %%T0, %%T0, %%T1 vprord %%T1, %%W0, 9 vpandd %%T1, %%T1, [rel mask_values + 16*64] vprord %%T2, %%W0, 13 vpandd %%T2, %%T2, [rel mask_values + 17*64] vpord %%T1, %%T1, %%T2 vprord %%T2, %%W0, 25 vpandd %%T2, %%T2, [rel mask_values + 18*64] vpord %%T1, %%T1, %%T2 vpord %%W0, %%T0, %%T1 %endmacro ;;; =========================================================================== ;;; E PHASE ;;; =========================================================================== ;;; ;;; Expands 16x32-bit words into 16x48-bit words ;;; plus XOR's result with the key schedule. ;;; The output is adjusted to be friendly as S phase input. 
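;;; Reading of the macro body below: this is the standard DES E expansion
;;; (32 -> 48 bits) followed by the round-key XOR. The expanded value is kept
;;; as eight 6-bit S-box indices, one per 16-bit word lane (and_ed keeps
;;; bits 0..5 of each word, and_eu extracts bits 8..13), which is the layout
;;; the vpermw-based S phase below expects.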
;;; ;;; in [in] - zmm register ;;; out0a [out] - zmm register ;;; out0b [out] - zmm register ;;; out1a [out] - zmm register ;;; out1b [out] - zmm register ;;; k0 [in] - key schedule; zmm or m512 ;;; k1 [in] - key schedule; zmm or m512 ;;; t0-t1 [clobbered] - temporary zmm register %macro E_PHASE 9 %define %%IN %1 %define %%OUT0A %2 %define %%OUT0B %3 %define %%OUT1A %4 %define %%OUT1B %5 %define %%K0 %6 %define %%K1 %7 %define %%T0 %8 %define %%T1 %9 vprord %%T0, %%IN, 31 vprord %%T1, %%IN, 3 vpshufb %%T0, %%T0, [rel idx_e] vpshufb %%T1, %%T1, [rel idx_e] vpunpcklbw %%OUT0A, %%T0, %%T1 vpunpckhbw %%OUT1A, %%T0, %%T1 vpxord %%OUT0A, %%OUT0A, %%K0 vpxord %%OUT1A, %%OUT1A, %%K1 vpandd %%OUT0B, %%OUT0A, [rel and_eu] vpsrlw %%OUT0B, %%OUT0B, 8 vpandd %%OUT0A, %%OUT0A, [rel and_ed] vpandd %%OUT1B, %%OUT1A, [rel and_eu] vpsrlw %%OUT1B, %%OUT1B, 8 vpandd %%OUT1A, %%OUT1A, [rel and_ed] %endmacro ;;; =========================================================================== ;;; S-BOX ;;; =========================================================================== ;;; ;;; NOTE: clobbers k1-k6 OpMask registers ;;; ;;; IN0A [in] - zmm register; output from E-phase ;;; IN0B [in] - zmm register; output from E-phase ;;; IN1A [in] - zmm register; output from E-phase ;;; IN1B [in] - zmm register; output from E-phase ;;; OUT [out] - zmm register; output from E-phase ;;; T0-T5 [clobbered] - temporary zmm register %macro S_PHASE 11 %define %%IN0A %1 %define %%IN0B %2 %define %%IN1A %3 %define %%IN1B %4 %define %%OUT %5 %define %%T0 %6 %define %%T1 %7 %define %%T2 %8 %define %%T3 %9 %define %%T4 %10 %define %%T5 %11 vmovdqa64 %%T0, [rel reg_values16bit_7] vpcmpuw k3, %%IN0A, %%T0, 2 ; 2 -> LE vpcmpuw k4, %%IN0B, %%T0, 2 ; 2 -> LE vpcmpuw k5, %%IN1A, %%T0, 2 ; 2 -> LE vpcmpuw k6, %%IN1B, %%T0, 2 ; 2 -> LE mov DWORD(IA0), 0x55555555 kmovd k1, DWORD(IA0) mov DWORD(IA0), 0xaaaaaaaa kmovd k2, DWORD(IA0) vpermw %%T0{k1}{z}, %%IN0A, [rel S_box_flipped + 0*64] vpermw %%T1{k1}{z}, %%IN0A, [rel S_box_flipped + 1*64] vpermw %%T2{k2}{z}, %%IN0A, [rel S_box_flipped + 4*64] vpermw %%T3{k2}{z}, %%IN0A, [rel S_box_flipped + 5*64] vpxord %%T0, %%T0, %%T2 vpxord %%OUT, %%T1, %%T3 vmovdqu16 %%OUT{k3}, %%T0 vpermw %%T0{k1}{z}, %%IN0B, [rel S_box_flipped + 2*64] vpermw %%T1{k1}{z}, %%IN0B, [rel S_box_flipped + 3*64] vpermw %%T2{k2}{z}, %%IN0B, [rel S_box_flipped + 6*64] vpermw %%T3{k2}{z}, %%IN0B, [rel S_box_flipped + 7*64] vpxord %%T0, %%T0, %%T2 vpxord %%T3, %%T1, %%T3 vmovdqu16 %%T3{k4}, %%T0 vpsllw %%T3, %%T3, 4 vpxord %%OUT, %%OUT, %%T3 vpermw %%T0{k1}{z}, %%IN1A, [rel S_box_flipped + 8*64] vpermw %%T1{k1}{z}, %%IN1A, [rel S_box_flipped + 9*64] vpermw %%T2{k2}{z}, %%IN1A, [rel S_box_flipped + 12*64] vpermw %%T3{k2}{z}, %%IN1A, [rel S_box_flipped + 13*64] vpxord %%T0, %%T0, %%T2 vpxord %%T4, %%T1, %%T3 vmovdqu16 %%T4{k5}, %%T0 vpermw %%T0{k1}{z}, %%IN1B, [rel S_box_flipped + 10*64] vpermw %%T1{k1}{z}, %%IN1B, [rel S_box_flipped + 11*64] vpermw %%T2{k2}{z}, %%IN1B, [rel S_box_flipped + 14*64] vpermw %%T3{k2}{z}, %%IN1B, [rel S_box_flipped + 15*64] vpxord %%T0, %%T0, %%T2 vpxord %%T5, %%T1, %%T3 vmovdqu16 %%T5{k6}, %%T0 vpsllw %%T5, %%T5, 4 vpxord %%T4, %%T4, %%T5 vpsllw %%T4, %%T4, 8 vpxord %%OUT, %%OUT, %%T4 vpshufb %%OUT, %%OUT, [rel shuffle_reg] %endmacro ;;; =========================================================================== ;;; DES encryption/decryption round ;;; =========================================================================== ;;; ;;; Clobbers k1-k6 OpMask registers ;;; ;;; ENC_DEC [in] - ENC for 
encryption, DEC for decryption ;;; R [in/out] - zmm register; plain text in & cipher text out ;;; L [in/out] - zmm register; plain text in & cipher text out ;;; KS [in] - pointer to the key schedule ;;; T0-T11 [clobbered] - temporary zmm register %macro DES_ENC_DEC 16 %define %%ENC_DEC %1 %define %%R %2 %define %%L %3 %define %%KS %4 %define %%T0 %5 %define %%T1 %6 %define %%T2 %7 %define %%T3 %8 %define %%T4 %9 %define %%T5 %10 %define %%T6 %11 %define %%T7 %12 %define %%T8 %13 %define %%T9 %14 %define %%T10 %15 %define %%T11 %16 IP_Z %%R, %%L, %%T0 %ifidn %%ENC_DEC, ENC ;; ENCRYPTION xor KSOFFSET, KSOFFSET %%_des_enc_loop: E_PHASE %%R, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET + (0*64)], [%%KS + KSOFFSET + (1*64)], %%T6, %%T7 S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11 P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4 vpxord %%L, %%L, %%T0 E_PHASE %%L, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET + (2*64)], [%%KS + KSOFFSET + (3*64)], %%T6, %%T7 S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11 P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4 vpxord %%R, %%R, %%T0 add KSOFFSET, (4*64) cmp KSOFFSET, (8*(4*64)) jb %%_des_enc_loop %else ;; DECRYPTION mov KSOFFSET, (8*(4*64)) %%_des_dec_loop: E_PHASE %%R, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET - (2*64)], [%%KS + KSOFFSET - (1*64)], %%T6, %%T7 S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11 P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4 vpxord %%L, %%L, %%T0 E_PHASE %%L, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET - (4*64)], [%%KS + KSOFFSET - (3*64)], %%T6, %%T7 S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11 P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4 vpxord %%R, %%R, %%T0 sub KSOFFSET, (4*64) jnz %%_des_dec_loop %endif ; DECRYPTION FP_Z %%R, %%L, %%T0 %endmacro ;;; =========================================================================== ;;; DATA TRANSPOSITION AT DATA INPUT ;;; =========================================================================== ;;; ;;; IN00 - IN15 [in/out]: ;;; in: IN00 - lane 0 data, IN01 - lane 1 data, ... IN15 - lane 15 data ;;; out: R0 - 16 x word0, L0 - 16 x word1, ... 
L7 - 16 x word15 ;;; T0-T3 [clobbered] - temporary zmm registers ;;; K0-K5 [clobbered] - temporary zmm registers ;;; H0-H3 [clobbered] - temporary zmm registers %macro TRANSPOSE_IN 30 %define %%IN00 %1 ; R0 %define %%IN01 %2 ; L0 %define %%IN02 %3 ; R1 %define %%IN03 %4 ; L1 %define %%IN04 %5 ; R2 %define %%IN05 %6 ; L2 %define %%IN06 %7 ; R3 %define %%IN07 %8 ; L3 %define %%IN08 %9 ; R4 %define %%IN09 %10 ; L4 %define %%IN10 %11 ; R5 %define %%IN11 %12 ; L5 %define %%IN12 %13 ; R6 %define %%IN13 %14 ; L6 %define %%IN14 %15 ; R7 %define %%IN15 %16 ; L7 %define %%T0 %17 %define %%T1 %18 %define %%T2 %19 %define %%T3 %20 %define %%K0 %21 %define %%K1 %22 %define %%K2 %23 %define %%K3 %24 %define %%K4 %25 %define %%K5 %26 %define %%H0 %27 %define %%H1 %28 %define %%H2 %29 %define %%H3 %30 vpunpckldq %%K0, %%IN00, %%IN01 vpunpckhdq %%K1, %%IN00, %%IN01 vpunpckldq %%T0, %%IN02, %%IN03 vpunpckhdq %%T1, %%IN02, %%IN03 vpunpckldq %%IN00, %%IN04, %%IN05 vpunpckhdq %%IN01, %%IN04, %%IN05 vpunpckldq %%IN02, %%IN06, %%IN07 vpunpckhdq %%IN03, %%IN06, %%IN07 vpunpcklqdq %%K2, %%K0, %%T0 vpunpckhqdq %%T2, %%K0, %%T0 vpunpcklqdq %%K3, %%K1, %%T1 vpunpckhqdq %%T3, %%K1, %%T1 vpunpcklqdq %%K0, %%IN00, %%IN02 vpunpckhqdq %%K1, %%IN00, %%IN02 vpunpcklqdq %%T0, %%IN01, %%IN03 vpunpckhqdq %%T1, %%IN01, %%IN03 vpunpckldq %%K4, %%IN08, %%IN09 vpunpckhdq %%K5, %%IN08, %%IN09 vpunpckldq %%IN04, %%IN10, %%IN11 vpunpckhdq %%IN05, %%IN10, %%IN11 vpunpckldq %%IN06, %%IN12, %%IN13 vpunpckhdq %%IN07, %%IN12, %%IN13 vpunpckldq %%IN10, %%IN14, %%IN15 vpunpckhdq %%IN11, %%IN14, %%IN15 vpunpcklqdq %%IN12, %%K4, %%IN04 vpunpckhqdq %%IN13, %%K4, %%IN04 vpunpcklqdq %%IN14, %%K5, %%IN05 vpunpckhqdq %%IN15, %%K5, %%IN05 vpunpcklqdq %%IN00, %%IN06, %%IN10 vpunpckhqdq %%IN01, %%IN06, %%IN10 vpunpcklqdq %%IN02, %%IN07, %%IN11 vpunpckhqdq %%IN03, %%IN07, %%IN11 vshufi64x2 %%H0, %%K2, %%K0, 0x44 vshufi64x2 %%H1, %%K2, %%K0, 0xee vshufi64x2 %%H2, %%IN12, %%IN00, 0x44 vshufi64x2 %%H3, %%IN12, %%IN00, 0xee vshufi64x2 %%IN00, %%H0, %%H2, 0x88 ; R0 vshufi64x2 %%IN04, %%H0, %%H2, 0xdd ; R2 vshufi64x2 %%IN08, %%H1, %%H3, 0x88 ; R4 vshufi64x2 %%IN12, %%H1, %%H3, 0xdd ; R6 vshufi64x2 %%H0, %%T2, %%K1, 0x44 vshufi64x2 %%H1, %%T2, %%K1, 0xee vshufi64x2 %%H2, %%IN13, %%IN01, 0x44 vshufi64x2 %%H3, %%IN13, %%IN01, 0xee vshufi64x2 %%IN01, %%H0, %%H2, 0x88 ; L0 vshufi64x2 %%IN05, %%H0, %%H2, 0xdd ; L2 vshufi64x2 %%IN09, %%H1, %%H3, 0x88 ; L4 vshufi64x2 %%IN13, %%H1, %%H3, 0xdd ; L6 vshufi64x2 %%H0, %%K3, %%T0, 0x44 vshufi64x2 %%H1, %%K3, %%T0, 0xee vshufi64x2 %%H2, %%IN14, %%IN02, 0x44 vshufi64x2 %%H3, %%IN14, %%IN02, 0xee vshufi64x2 %%IN02, %%H0, %%H2, 0x88 ; R1 vshufi64x2 %%IN06, %%H0, %%H2, 0xdd ; R3 vshufi64x2 %%IN10, %%H1, %%H3, 0x88 ; R5 vshufi64x2 %%IN14, %%H1, %%H3, 0xdd ; R7 vshufi64x2 %%H0, %%T3, %%T1, 0x44 vshufi64x2 %%H1, %%T3, %%T1, 0xee vshufi64x2 %%H2, %%IN15, %%IN03, 0x44 vshufi64x2 %%H3, %%IN15, %%IN03, 0xee vshufi64x2 %%IN03, %%H0, %%H2, 0x88 ; L1 vshufi64x2 %%IN07, %%H0, %%H2, 0xdd ; L3 vshufi64x2 %%IN11, %%H1, %%H3, 0x88 ; L5 vshufi64x2 %%IN15, %%H1, %%H3, 0xdd ; L7 %endmacro ;;; =========================================================================== ;;; DATA TRANSPOSITION AT DATA OUTPUT ;;; =========================================================================== ;;; ;;; IN00-IN15 aka R0/L0 - R7/L7 [in/out]: ;;; in: R0 - 16 x word0, L0 - 16 x word1, ... L7 - 16 x word15 ;;; out: R0 - lane 0 data, L0 - lane 1 data, ... 
L7 - lane 15 data ;;; T0-T3 [clobbered] - temporary zmm registers ;;; K0-K5 [clobbered] - temporary zmm registers ;;; H0-H3 [clobbered] - temporary zmm registers %macro TRANSPOSE_OUT 30 %define %%IN00 %1 ; R0 %define %%IN01 %2 ; L0 %define %%IN02 %3 ; R1 %define %%IN03 %4 ; L1 %define %%IN04 %5 ; R2 %define %%IN05 %6 ; L2 %define %%IN06 %7 ; R3 %define %%IN07 %8 ; L3 %define %%IN08 %9 ; R4 %define %%IN09 %10 ; L4 %define %%IN10 %11 ; R5 %define %%IN11 %12 ; L5 %define %%IN12 %13 ; R6 %define %%IN13 %14 ; L6 %define %%IN14 %15 ; R7 %define %%IN15 %16 ; L7 %define %%T0 %17 %define %%T1 %18 %define %%T2 %19 %define %%T3 %20 %define %%K0 %21 %define %%K1 %22 %define %%K2 %23 %define %%K3 %24 %define %%K4 %25 %define %%K5 %26 %define %%H0 %27 %define %%H1 %28 %define %%H2 %29 %define %%H3 %30 vpunpckldq %%K0, %%IN01, %%IN00 vpunpckhdq %%K1, %%IN01, %%IN00 vpunpckldq %%T0, %%IN03, %%IN02 vpunpckhdq %%T1, %%IN03, %%IN02 vpunpckldq %%IN00, %%IN05, %%IN04 vpunpckhdq %%IN01, %%IN05, %%IN04 vpunpckldq %%IN02, %%IN07, %%IN06 vpunpckhdq %%IN03, %%IN07, %%IN06 vpunpcklqdq %%K2, %%K0, %%T0 vpunpckhqdq %%T2, %%K0, %%T0 vpunpcklqdq %%K3, %%K1, %%T1 vpunpckhqdq %%T3, %%K1, %%T1 vpunpcklqdq %%K0, %%IN00, %%IN02 vpunpckhqdq %%K1, %%IN00, %%IN02 vpunpcklqdq %%T0, %%IN01, %%IN03 vpunpckhqdq %%T1, %%IN01, %%IN03 vpunpckldq %%K4, %%IN09, %%IN08 vpunpckhdq %%K5, %%IN09, %%IN08 vpunpckldq %%IN04, %%IN11, %%IN10 vpunpckhdq %%IN05, %%IN11, %%IN10 vpunpckldq %%IN06, %%IN13, %%IN12 vpunpckhdq %%IN07, %%IN13, %%IN12 vpunpckldq %%IN10, %%IN15, %%IN14 vpunpckhdq %%IN11, %%IN15, %%IN14 vpunpcklqdq %%IN12, %%K4, %%IN04 vpunpckhqdq %%IN13, %%K4, %%IN04 vpunpcklqdq %%IN14, %%K5, %%IN05 vpunpckhqdq %%IN15, %%K5, %%IN05 vpunpcklqdq %%IN00, %%IN06, %%IN10 vpunpckhqdq %%IN01, %%IN06, %%IN10 vpunpcklqdq %%IN02, %%IN07, %%IN11 vpunpckhqdq %%IN03, %%IN07, %%IN11 vshufi64x2 %%H0, %%K2, %%K0, 0x44 vshufi64x2 %%H1, %%K2, %%K0, 0xee vshufi64x2 %%H2, %%IN12, %%IN00, 0x44 vshufi64x2 %%H3, %%IN12, %%IN00, 0xee vshufi64x2 %%IN00, %%H0, %%H2, 0x88 ; R0 vshufi64x2 %%IN04, %%H0, %%H2, 0xdd ; R2 vshufi64x2 %%IN08, %%H1, %%H3, 0x88 ; R4 vshufi64x2 %%IN12, %%H1, %%H3, 0xdd ; R6 vshufi64x2 %%H0, %%T2, %%K1, 0x44 vshufi64x2 %%H1, %%T2, %%K1, 0xee vshufi64x2 %%H2, %%IN13, %%IN01, 0x44 vshufi64x2 %%H3, %%IN13, %%IN01, 0xee vshufi64x2 %%IN01, %%H0, %%H2, 0x88 ; L0 vshufi64x2 %%IN05, %%H0, %%H2, 0xdd ; L2 vshufi64x2 %%IN09, %%H1, %%H3, 0x88 ; L4 vshufi64x2 %%IN13, %%H1, %%H3, 0xdd ; L6 vshufi64x2 %%H0, %%K3, %%T0, 0x44 vshufi64x2 %%H1, %%K3, %%T0, 0xee vshufi64x2 %%H2, %%IN14, %%IN02, 0x44 vshufi64x2 %%H3, %%IN14, %%IN02, 0xee vshufi64x2 %%IN02, %%H0, %%H2, 0x88 ; R1 vshufi64x2 %%IN06, %%H0, %%H2, 0xdd ; R3 vshufi64x2 %%IN10, %%H1, %%H3, 0x88 ; R5 vshufi64x2 %%IN14, %%H1, %%H3, 0xdd ; R7 vshufi64x2 %%H0, %%T3, %%T1, 0x44 vshufi64x2 %%H1, %%T3, %%T1, 0xee vshufi64x2 %%H2, %%IN15, %%IN03, 0x44 vshufi64x2 %%H3, %%IN15, %%IN03, 0xee vshufi64x2 %%IN03, %%H0, %%H2, 0x88 ; L1 vshufi64x2 %%IN07, %%H0, %%H2, 0xdd ; L3 vshufi64x2 %%IN11, %%H1, %%H3, 0x88 ; L5 vshufi64x2 %%IN15, %%H1, %%H3, 0xdd ; L7 %endmacro ;;; =========================================================================== ;;; DATA TRANSPOSITION OF ONE DES BLOCK AT DATA INPUT ;;; =========================================================================== ;;; ;;; IN00-IN15 / R0/L0-R7/L7 [in/out]: ;;; in: IN00 - lane 0 data, IN01 - lane 1 data, ... 
IN15 - lane 15 data ;;; out: R0 - 16 x word0, L0 - 16 x word1 ;;; T0,T2 [clobbered] - temporary zmm registers ;;; K0-K4 [clobbered] - temporary zmm registers ;;; H0,H2 [clobbered] - temporary zmm registers %macro TRANSPOSE_IN_ONE 24 %define %%IN00 %1 ; R0 %define %%IN01 %2 ; L0 %define %%IN02 %3 ; R1 %define %%IN03 %4 ; L1 %define %%IN04 %5 ; R2 %define %%IN05 %6 ; L2 %define %%IN06 %7 ; R3 %define %%IN07 %8 ; L3 %define %%IN08 %9 ; R4 %define %%IN09 %10 ; L4 %define %%IN10 %11 ; R5 %define %%IN11 %12 ; L5 %define %%IN12 %13 ; R6 %define %%IN13 %14 ; L6 %define %%IN14 %15 ; R7 %define %%IN15 %16 ; L7 %define %%T0 %17 %define %%T2 %18 %define %%K0 %19 %define %%K1 %20 %define %%K2 %21 %define %%K4 %22 %define %%H0 %23 %define %%H2 %24 vpunpckldq %%K0, %%IN00, %%IN01 vpunpckhdq %%K1, %%IN00, %%IN01 vpunpckldq %%T0, %%IN02, %%IN03 vpunpckldq %%IN00, %%IN04, %%IN05 vpunpckhdq %%IN01, %%IN04, %%IN05 vpunpckldq %%IN02, %%IN06, %%IN07 vpunpcklqdq %%K2, %%K0, %%T0 vpunpckhqdq %%T2, %%K0, %%T0 vpunpcklqdq %%K0, %%IN00, %%IN02 vpunpckhqdq %%K1, %%IN00, %%IN02 vpunpckldq %%K4, %%IN08, %%IN09 vpunpckldq %%IN04, %%IN10, %%IN11 vpunpckldq %%IN06, %%IN12, %%IN13 vpunpckldq %%IN10, %%IN14, %%IN15 vpunpcklqdq %%IN12, %%K4, %%IN04 vpunpckhqdq %%IN13, %%K4, %%IN04 vpunpcklqdq %%IN00, %%IN06, %%IN10 vpunpckhqdq %%IN01, %%IN06, %%IN10 vshufi64x2 %%H0, %%K2, %%K0, 0x44 vshufi64x2 %%H2, %%IN12, %%IN00, 0x44 vshufi64x2 %%IN00, %%H0, %%H2, 0x88 ; R0 vshufi64x2 %%H0, %%T2, %%K1, 0x44 vshufi64x2 %%H2, %%IN13, %%IN01, 0x44 vshufi64x2 %%IN01, %%H0, %%H2, 0x88 ; L0 %endmacro ;;; =========================================================================== ;;; DATA TRANSPOSITION OF ONE DES BLOCK AT DATA OUTPUT ;;; =========================================================================== ;;; ;;; IN00-IN15 aka R0/L0 - R7/L7 [in/out]: ;;; in: R0 - 16 x word0, L0 - 16 x word1 ;;; out: R0 - lane 0 data, L0 - lane 1 data, ... 
L7 - lane 15 data ;;; T0-T3 [clobbered] - temporary zmm registers ;;; K0-K3 [clobbered] - temporary zmm registers ;;; H0,H1 [clobbered] - temporary zmm registers %macro TRANSPOSE_OUT_ONE 25 %define %%IN00 %1 ; R0 %define %%IN01 %2 ; L0 %define %%IN02 %3 ; R1 %define %%IN03 %4 ; L1 %define %%IN04 %5 ; R2 %define %%IN05 %6 ; L2 %define %%IN06 %7 ; R3 %define %%IN07 %8 ; L3 %define %%IN08 %9 ; R4 %define %%IN09 %10 ; L4 %define %%IN10 %11 ; R5 %define %%IN11 %12 ; L5 %define %%IN12 %13 ; R6 %define %%IN13 %14 ; L6 %define %%IN14 %15 ; R7 %define %%IN15 %16 ; L7 %define %%T0 %17 %define %%T2 %18 %define %%T3 %19 %define %%K0 %20 %define %%K1 %21 %define %%K2 %22 %define %%K3 %23 %define %%H0 %24 %define %%H1 %25 vpxord %%T0, %%T0, %%T0 vpunpckldq %%K0, %%IN01, %%IN00 vpunpckhdq %%K1, %%IN01, %%IN00 vpunpcklqdq %%K2, %%K0, %%T0 vpunpckhqdq %%T2, %%K0, %%T0 vpunpcklqdq %%K3, %%K1, %%T0 vpunpckhqdq %%T3, %%K1, %%T0 vshufi64x2 %%H0, %%K2, %%T0, 0x44 vshufi64x2 %%H1, %%K2, %%T0, 0xee vshufi64x2 %%IN00, %%H0, %%T0, 0x88 ; R0 vshufi64x2 %%IN04, %%H0, %%T0, 0xdd ; R2 vshufi64x2 %%IN08, %%H1, %%T0, 0x88 ; R4 vshufi64x2 %%IN12, %%H1, %%T0, 0xdd ; R6 vshufi64x2 %%H0, %%T2, %%T0, 0x44 vshufi64x2 %%H1, %%T2, %%T0, 0xee vshufi64x2 %%IN01, %%H0, %%T0, 0x88 ; L0 vshufi64x2 %%IN05, %%H0, %%T0, 0xdd ; L2 vshufi64x2 %%IN09, %%H1, %%T0, 0x88 ; L4 vshufi64x2 %%IN13, %%H1, %%T0, 0xdd ; L6 vshufi64x2 %%H0, %%K3, %%T0, 0x44 vshufi64x2 %%H1, %%K3, %%T0, 0xee vshufi64x2 %%IN02, %%H0, %%T0, 0x88 ; R1 vshufi64x2 %%IN06, %%H0, %%T0, 0xdd ; R3 vshufi64x2 %%IN10, %%H1, %%T0, 0x88 ; R5 vshufi64x2 %%IN14, %%H1, %%T0, 0xdd ; R7 vshufi64x2 %%H0, %%T3, %%T0, 0x44 vshufi64x2 %%H1, %%T3, %%T0, 0xee vshufi64x2 %%IN03, %%H0, %%T0, 0x88 ; L1 vshufi64x2 %%IN07, %%H0, %%T0, 0xdd ; L3 vshufi64x2 %%IN11, %%H1, %%T0, 0x88 ; L5 vshufi64x2 %%IN15, %%H1, %%T0, 0xdd ; L7 %endmacro ;;; =========================================================================== ;;; DES INITIALIZATION ;;; key schedule transposition and IV set up ;;; =========================================================================== ;;; ;;; STATE_KEYS [in] - KEYS in DES OOO STATE ;;; STATE_IV [ in] - IV in DES OOO STATE ;;; KS [out] - place to store transposed key schedule or NULL ;;; IV0 [out] - r512; initialization vector ;;; IV1 [out] - r512; initialization vector ;;; T0-T27 [clobbered] - temporary r512 %macro DES_INIT 33 %define %%STATE_KEYS %1 %define %%STATE_IV %2 %define %%KS %3 %define %%IV0 %4 %define %%IV1 %5 %define %%T0 %6 %define %%T1 %7 %define %%T2 %8 %define %%T3 %9 %define %%T4 %10 %define %%T5 %11 %define %%T6 %12 %define %%T7 %13 %define %%T8 %14 %define %%T9 %15 %define %%T10 %16 %define %%T11 %17 %define %%T12 %18 %define %%T13 %19 %define %%T14 %20 %define %%T15 %21 %define %%T16 %22 %define %%T17 %23 %define %%T18 %24 %define %%T19 %25 %define %%T20 %26 %define %%T21 %27 %define %%T22 %28 %define %%T23 %29 %define %%T24 %30 %define %%T25 %31 %define %%T26 %32 %define %%T27 %33 %ifnidn %%KS, NULL ;; set up the key schedule ;; - load first half of the keys & transpose ;; - transpose and store ;; note: we can use IV registers as temprary ones here %assign IDX 0 %rep 16 mov IA0, [%%STATE_KEYS + (IDX*PTR_SZ)] vmovdqu64 %%T %+ IDX, [IA0] %assign IDX (IDX + 1) %endrep TRANSPOSE_IN %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1 %assign IDX 0 %rep 16 vmovdqu64 [%%KS + (IDX * 64)], %%T %+ IDX %assign IDX (IDX 
+ 1) %endrep ;; - load second half of the keys & transpose ;; - transpose and store ;; note: we can use IV registers as temprary ones here %assign IDX 0 %rep 16 mov IA0, [%%STATE_KEYS + (IDX*PTR_SZ)] vmovdqu64 %%T %+ IDX, [IA0 + 64] %assign IDX (IDX + 1) %endrep TRANSPOSE_IN %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1 %assign IDX 0 %rep 16 vmovdqu64 [%%KS + (16 * 64) + (IDX * 64)], %%T %+ IDX %assign IDX (IDX + 1) %endrep %endif ; KS != NULL ;; set up IV ;; - they are already kept transposed so this is enough to load them vmovdqu64 %%IV0, [%%STATE_IV + (0 * 64)] vmovdqu64 %%IV1, [%%STATE_IV + (1 * 64)] %endmacro ;;; =========================================================================== ;;; DES FINISH ;;; Update in/out pointers and store IV ;;; =========================================================================== ;;; ;;; Needs: STATE & SIZE ;;; IV0 [in] - r512; initialization vector ;;; IV1 [in] - r512; initialization vector ;;; T0-T4 [clobbered] - temporary r512 registers %macro DES_FINISH 7 %define %%IV0 %1 %define %%IV1 %2 %define %%T0 %3 %define %%T1 %4 %define %%T2 %5 %define %%T3 %6 %define %%T4 %7 vpbroadcastq %%T4, SIZE vmovdqu64 %%T0, [STATE + _des_args_in + (0 * PTR_SZ)] vmovdqu64 %%T1, [STATE + _des_args_in + (8 * PTR_SZ)] vmovdqu64 %%T2, [STATE + _des_args_out + (0 * PTR_SZ)] vmovdqu64 %%T3, [STATE + _des_args_out + (8 * PTR_SZ)] vpaddq %%T0, %%T0, %%T4 vpaddq %%T1, %%T1, %%T4 vpaddq %%T2, %%T2, %%T4 vpaddq %%T3, %%T3, %%T4 vmovdqu64 [STATE + _des_args_in + (0 * PTR_SZ)], %%T0 vmovdqu64 [STATE + _des_args_in + (8 * PTR_SZ)], %%T1 vmovdqu64 [STATE + _des_args_out + (0 * PTR_SZ)], %%T2 vmovdqu64 [STATE + _des_args_out + (8 * PTR_SZ)], %%T3 vmovdqu64 [STATE + _des_args_IV + (0 * 64)], %%IV0 vmovdqu64 [STATE + _des_args_IV + (1 * 64)], %%IV1 %endmacro ;;; =========================================================================== ;;; DES CFB ENCRYPT/DECRYPT - ONE BLOCK ONLY ;;; =========================================================================== ;;; ;;; Needs: STATE, IA0-IA2 ;;; ENC_DEC [in] - encyrpt (ENC) or decrypt (DEC) selection ;;; KS [in] - key schedule ;;; T0-T24 [clobbered] - temporary r512 ;;; T_IN [in] - 16 * 8 byte storage ;;; T_OUT [in] - 16 * 8 byte storage ;;; T_MASK [in] - 16 * 4 byte storage ;;; T_IV [in] - 16 * 8 byte storage ;;; ;;; NOTE: clobbers OpMask registers %macro DES_CFB_ONE 31 %define %%ENC_DEC %1 %define %%KS %2 %define %%T0 %3 %define %%T1 %4 %define %%T2 %5 %define %%T3 %6 %define %%T4 %7 %define %%T5 %8 %define %%T6 %9 %define %%T7 %10 %define %%T8 %11 %define %%T9 %12 %define %%T10 %13 %define %%T11 %14 %define %%T12 %15 %define %%T13 %16 %define %%T14 %17 %define %%T15 %18 %define %%T16 %19 %define %%T17 %20 %define %%T18 %21 %define %%T19 %22 %define %%T20 %23 %define %%T21 %24 %define %%T22 %25 %define %%T23 %26 %define %%T24 %27 %define %%T_IN %28 %define %%T_OUT %29 %define %%T_IV %30 %define %%T_MASK %31 ;; - find mask for non-zero partial lengths vpxord %%T10, %%T10, %%T10 vmovdqu64 %%T0, [STATE + _des_args_PLen] vpcmpd k3, %%T0, %%T10, 4 ; NEQ kmovw DWORD(IA0), k3 movzx DWORD(IA0), WORD(IA0) or DWORD(IA0), DWORD(IA0) jz %%_des_cfb_one_end ; no non-zero partial lengths %ifidn %%ENC_DEC, ENC ;; For encyrption case we need to make sure that ;; all full blocks are complete before proceeding ;; with CFB partial block. 
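;; (in the last-block case the CFB IV is read back from memory, 8 bytes
;;  before the lane's last-out pointer, so the preceding cipher text block
;;  has to be stored already)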
;; To do that current out position is compared against ;; calculated last full block position. vmovdqu64 %%T1, [STATE + _des_args_out + (0*8)] vmovdqu64 %%T2, [STATE + _des_args_LOut + (0*8)] vmovdqu64 %%T3, [STATE + _des_args_out + (8*8)] vmovdqu64 %%T4, [STATE + _des_args_LOut + (8*8)] vpcmpq k4, %%T1, %%T2, 0 ; EQ vpcmpq k5, %%T3, %%T4, 0 ; EQ kmovw DWORD(IA1), k4 movzx DWORD(IA1), BYTE(IA1) kmovw DWORD(IA2), k5 movzx DWORD(IA2), BYTE(IA2) shl DWORD(IA2), 8 or DWORD(IA2), DWORD(IA1) and DWORD(IA0), DWORD(IA2) jz %%_des_cfb_one_end ; no non-zero lengths left kmovw k3, DWORD(IA0) %endif ;; Calculate ((1 << partial_bytes) - 1) ;; in order to get the mask for loads and stores ;; k3 & IA0 - hold valid mask vmovdqa64 %%T1, [rel vec_ones_32b] vpsllvd %%T2{k3}{z}, %%T1, %%T0 vpsubd %%T2{k3}{z}, %%T2, %%T1 vmovdqu64 [%%T_MASK], %%T2 ;; clear selected partial lens not to do them twice vmovdqu32 [STATE + _des_args_PLen]{k3}, %%T10 ;; copy IV, in and out pointers vmovdqu64 %%T1, [STATE + _des_args_in + (0*PTR_SZ)] vmovdqu64 %%T2, [STATE + _des_args_in + (8*PTR_SZ)] vmovdqu64 %%T3, [STATE + _des_args_out + (0*PTR_SZ)] vmovdqu64 %%T4, [STATE + _des_args_out + (8*PTR_SZ)] vmovdqu64 %%T5, [STATE + _des_args_IV + (0*64)] vmovdqu64 %%T6, [STATE + _des_args_IV + (1*64)] vmovdqu64 [%%T_IN + (0*PTR_SZ)], %%T1 vmovdqu64 [%%T_IN + (8*PTR_SZ)], %%T2 vmovdqu64 [%%T_OUT + (0*PTR_SZ)], %%T3 vmovdqu64 [%%T_OUT + (8*PTR_SZ)], %%T4 vmovdqu64 [%%T_IV + (0*64)], %%T5 vmovdqu64 [%%T_IV + (1*64)], %%T6 ;; calculate last block case mask ;; - first block case requires no modifications to in/out/IV vmovdqu64 %%T1, [STATE + _des_args_BLen] vpcmpd k2, %%T1, %%T10, 4 ; NEQ kmovw DWORD(IA1), k2 and DWORD(IA1), DWORD(IA0) jz %%_des_cfb_one_no_last_blocks ;; set up IV, in and out for the last block case ;; - Last block needs in and out to be set differently (decryption only) ;; - IA1 holds the last block mask %ifidn %%ENC_DEC, DEC mov DWORD(IA0), DWORD(IA1) mov DWORD(IA2), DWORD(IA1) shr DWORD(IA1), 8 and DWORD(IA2), 0xff kmovw k4, DWORD(IA2) kmovw k5, DWORD(IA1) vmovdqu64 %%T1, [STATE + _des_args_LOut + (0*PTR_SZ)] vmovdqu64 %%T2, [STATE + _des_args_LOut + (8*PTR_SZ)] vmovdqu64 %%T3, [STATE + _des_args_LIn + (0*PTR_SZ)] vmovdqu64 %%T4, [STATE + _des_args_LIn + (8*PTR_SZ)] vmovdqu64 [%%T_OUT + (0*PTR_SZ)]{k4}, %%T1 vmovdqu64 [%%T_OUT + (8*PTR_SZ)]{k5}, %%T2 vmovdqu64 [%%T_IN + (0*PTR_SZ)]{k4}, %%T3 vmovdqu64 [%%T_IN + (8*PTR_SZ)]{k5}, %%T4 %endif ; decryption ;; - IV has to be set differently for CFB as well ;; - IA0 holds the last block mask %assign IDX 0 %rep 16 test DWORD(IA0), (1 << IDX) jz %%_des_cfb_one_copy_iv_next %+ IDX %ifidn %%ENC_DEC, ENC mov IA2, [STATE + _des_args_LOut + (IDX*PTR_SZ)] %else mov IA2, [STATE + _des_args_LIn + (IDX*PTR_SZ)] %endif mov IA2, [IA2 - 8] mov [%%T_IV + (0*4) + (IDX*4)], DWORD(IA2) shr IA2, 32 mov [%%T_IV + (16*4) + (IDX*4)], DWORD(IA2) %%_des_cfb_one_copy_iv_next %+ IDX: %assign IDX (IDX + 1) %endrep %%_des_cfb_one_no_last_blocks: ;; Uffff ... 
finally let's do some DES CFB ;; - let's use T_IN, T_OUT, T_IV and T_MASK ;; - load data with the corresponding masks & transpose ;; - T0 to T15 will hold the data xor IA0, IA0 %assign IDX 0 %assign K_IDX 1 %rep 16 mov IA1, [%%T_IN + (IDX*PTR_SZ)] mov DWORD(IA0), [%%T_MASK + (IDX*4)] kmovq k %+ K_IDX, IA0 vmovdqu8 %%T %+ IDX{k %+ K_IDX}{z}, [IA1] %assign IDX (IDX + 1) %assign K_IDX (K_IDX + 1) %if K_IDX > 7 %assign K_IDX 1 ; iterate through K1 to K7 %endif %endrep ;; - transpose the data in T0 to T15, T16 to T23 are clobbered TRANSPOSE_IN_ONE %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23 ;; - set up IV and %%T16 & %%T17 used as IV0 and IV1 vmovdqu64 %%T16, [%%T_IV + (0 * 64)] ;IV0 vmovdqu64 %%T17, [%%T_IV + (1 * 64)] ;IV1 ;; DES encrypt ;; - R0 - %%T0 ;; - L0 - %%T1 DES_ENC_DEC ENC, %%T16, %%T17, %%KS, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13 ;; CFB style xor with R0/L0 with IV ;; - IV0 - %%T16 ;; - IV1 - %%T17 vpxord %%T2, %%T17, %%T0 ; R0 ^ IV1 vpxord %%T0, %%T16, %%T1 ; L0 ^ IV0 vmovdqa64 %%T1, %%T2 ;; - new R0 = L0 ^ IV0 (%%T0) ;; - new L0 = R0 ^ IV1 (%%T1) ;; Transpose the data out ;; - %%T2 to %%T24 clobbered TRANSPOSE_OUT_ONE %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24 ;; Store the transposed data ;; - T0 to T15 will hold the data xor IA0, IA0 %assign IDX 0 %assign K_IDX 1 %rep 16 mov IA1, [%%T_OUT + (IDX*PTR_SZ)] mov DWORD(IA0), [%%T_MASK + (IDX*4)] kmovq k %+ K_IDX, IA0 vmovdqu8 [IA1]{k %+ K_IDX}, %%T %+ IDX %assign IDX (IDX + 1) %assign K_IDX (K_IDX + 1) %if K_IDX > 7 %assign K_IDX 1 ; iterate through K1 to K7 %endif %endrep %%_des_cfb_one_end: %endmacro ;;; =========================================================================== ;;; Converts length into mask of DES blocks ;;; =========================================================================== ;;; ;;; MASK [out] - mask8 for value; for masked 64b loads and stores (r64) ;;; USES: IA0, IA1 IA2 %macro GET_MASK8 1 %define %%MASK %1 mov IA2, SIZE sub IA2, OFFSET %ifidn IA1, rcx %define myrcx IA1 %else %define myrcx rcx mov IA1, rcx %endif ;; min of 64 and length, x = length, y = 64 ;; x = y + ((x - y) & ((x - y) >> 31)) sub DWORD(IA2), 64 mov DWORD(myrcx), DWORD(IA2) sar DWORD(myrcx), 31 and DWORD(myrcx), DWORD(IA2) add DWORD(myrcx), 64 ;; - IA0 - min of 64 and remaining length ;; - divide by 8 (DES block size) ;; - create bit mask of the result mov DWORD(%%MASK), 1 shr DWORD(myrcx), 3 shl DWORD(%%MASK), BYTE(myrcx) sub DWORD(%%MASK), 1 %ifnidn IA1, rcx mov rcx, IA1 %endif %endmacro ;;; =========================================================================== ;;; DES CBC / DOCSIS DES ENCRYPT ;;; =========================================================================== ;;; ;;; DES_DOCSIS [in] - select between DES (DES CBC) and DOCSIS (DOCSIS DES) ;;; ;;; NOTE: clobbers OpMask registers %macro GENERIC_DES_ENC 1 %define %%DES_DOCSIS %1 ;; push the registers and allocate the stack frame mov rax, rsp sub rsp, STACKFRAME_size and rsp, -64 mov [rsp + _rsp_save], rax ; original SP mov [rsp + _gpr_save + 0*8], r12 mov [rsp + _gpr_save + 1*8], r13 mov [rsp + _gpr_save + 2*8], r14 mov [rsp + _gpr_save + 3*8], r15 DES_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, 
ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 xor OFFSET, OFFSET %%_gen_des_enc_loop: cmp OFFSET, SIZE jge %%_gen_des_enc_loop_end ;; calculate min of bytes_left and 64, convert to qword mask GET_MASK8 IA0 ; IA0 = mask kmovw k7, DWORD(IA0) ;; run masked loads mov IA0, [STATE + _des_args_in + (0*PTR_SZ)] mov IA1, [STATE + _des_args_in + (1*PTR_SZ)] mov IA2, [STATE + _des_args_in + (2*PTR_SZ)] mov INP0, [STATE + _des_args_in + (3*PTR_SZ)] mov INP1, [STATE + _des_args_in + (4*PTR_SZ)] mov INP2, [STATE + _des_args_in + (5*PTR_SZ)] mov INP3, [STATE + _des_args_in + (6*PTR_SZ)] mov INP4, [STATE + _des_args_in + (7*PTR_SZ)] vmovdqu64 ZW0{k7}{z}, [IA0 + OFFSET] vmovdqu64 ZW1{k7}{z}, [IA1 + OFFSET] vmovdqu64 ZW2{k7}{z}, [IA2 + OFFSET] vmovdqu64 ZW3{k7}{z}, [INP0 + OFFSET] vmovdqu64 ZW4{k7}{z}, [INP1 + OFFSET] vmovdqu64 ZW5{k7}{z}, [INP2 + OFFSET] vmovdqu64 ZW6{k7}{z}, [INP3 + OFFSET] vmovdqu64 ZW7{k7}{z}, [INP4 + OFFSET] mov IA0, [STATE + _des_args_in + (8*PTR_SZ)] mov IA1, [STATE + _des_args_in + (9*PTR_SZ)] mov IA2, [STATE + _des_args_in + (10*PTR_SZ)] mov INP0, [STATE + _des_args_in + (11*PTR_SZ)] mov INP1, [STATE + _des_args_in + (12*PTR_SZ)] mov INP2, [STATE + _des_args_in + (13*PTR_SZ)] mov INP3, [STATE + _des_args_in + (14*PTR_SZ)] mov INP4, [STATE + _des_args_in + (15*PTR_SZ)] vmovdqu64 ZW8{k7}{z}, [IA0 + OFFSET] vmovdqu64 ZW9{k7}{z}, [IA1 + OFFSET] vmovdqu64 ZW10{k7}{z}, [IA2 + OFFSET] vmovdqu64 ZW11{k7}{z}, [INP0 + OFFSET] vmovdqu64 ZW12{k7}{z}, [INP1 + OFFSET] vmovdqu64 ZW13{k7}{z}, [INP2 + OFFSET] vmovdqu64 ZW14{k7}{z}, [INP3 + OFFSET] vmovdqu64 ZW15{k7}{z}, [INP4 + OFFSET] ;; Transpose input TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 ;; DES CBC comes here vpxord ZW0, ZW0, ZIV0 ; R0 = R0 ^ IV0 vpxord ZW1, ZW1, ZIV1 ; L0 = L0 ^ IV1 %assign RN 0 %assign LN 1 %assign RNN 2 %assign LNN 3 %rep 7 DES_ENC_DEC ENC, ZW %+ RN, ZW %+ LN, rsp + _key_sched, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 vpxord ZW %+ RNN, ZW %+ RNN, ZW %+ LN ; R1 = R1 ^ L0 vpxord ZW %+ LNN, ZW %+ LNN, ZW %+ RN ; L1 = L1 ^ R0 %assign RN (RN + 2) %assign LN (LN + 2) %assign RNN (RNN + 2) %assign LNN (LNN + 2) %endrep DES_ENC_DEC ENC, ZW14, ZW15, rsp + _key_sched, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 vmovdqu64 ZIV0, ZW15 ; IV0 = L7 vmovdqu64 ZIV1, ZW14 ; IV1 = R7 ;; transpose data on output TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 ;; run masked stores mov IA0, [STATE + _des_args_out + (0*PTR_SZ)] mov IA1, [STATE + _des_args_out + (1*PTR_SZ)] mov IA2, [STATE + _des_args_out + (2*PTR_SZ)] mov INP0, [STATE + _des_args_out + (3*PTR_SZ)] mov INP1, [STATE + _des_args_out + (4*PTR_SZ)] mov INP2, [STATE + _des_args_out + (5*PTR_SZ)] mov INP3, [STATE + _des_args_out + (6*PTR_SZ)] mov INP4, [STATE + _des_args_out + (7*PTR_SZ)] vmovdqu64 [IA0 + OFFSET]{k7}, ZW0 vmovdqu64 [IA1 + OFFSET]{k7}, ZW1 vmovdqu64 [IA2 + OFFSET]{k7}, ZW2 vmovdqu64 [INP0 + OFFSET]{k7}, ZW3 vmovdqu64 [INP1 + OFFSET]{k7}, ZW4 vmovdqu64 [INP2 + OFFSET]{k7}, ZW5 vmovdqu64 [INP3 + OFFSET]{k7}, ZW6 vmovdqu64 [INP4 + OFFSET]{k7}, ZW7 mov IA0, [STATE + _des_args_out + (8*PTR_SZ)] mov IA1, [STATE + _des_args_out + 
(9*PTR_SZ)] mov IA2, [STATE + _des_args_out + (10*PTR_SZ)] mov INP0, [STATE + _des_args_out + (11*PTR_SZ)] mov INP1, [STATE + _des_args_out + (12*PTR_SZ)] mov INP2, [STATE + _des_args_out + (13*PTR_SZ)] mov INP3, [STATE + _des_args_out + (14*PTR_SZ)] mov INP4, [STATE + _des_args_out + (15*PTR_SZ)] vmovdqu64 [IA0 + OFFSET]{k7}, ZW8 vmovdqu64 [IA1 + OFFSET]{k7}, ZW9 vmovdqu64 [IA2 + OFFSET]{k7}, ZW10 vmovdqu64 [INP0 + OFFSET]{k7}, ZW11 vmovdqu64 [INP1 + OFFSET]{k7}, ZW12 vmovdqu64 [INP2 + OFFSET]{k7}, ZW13 vmovdqu64 [INP3 + OFFSET]{k7}, ZW14 vmovdqu64 [INP4 + OFFSET]{k7}, ZW15 add OFFSET, 64 jmp %%_gen_des_enc_loop %%_gen_des_enc_loop_end: ;; store IV and update pointers DES_FINISH ZIV0, ZIV1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4 ;; CFB part for DOCSIS %ifidn %%DES_DOCSIS, DOCSIS DES_CFB_ONE ENC, rsp + _key_sched, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, rsp + _tmp_in, rsp + _tmp_out, rsp + _tmp_iv, rsp + _tmp_mask %endif ;; restore stack pointer and registers mov r12, [rsp + _gpr_save + 0*8] mov r13, [rsp + _gpr_save + 1*8] mov r14, [rsp + _gpr_save + 2*8] mov r15, [rsp + _gpr_save + 3*8] mov rsp, [rsp + _rsp_save] ; original SP %endmacro ;;; =========================================================================== ;;; DES CBC / DOCSIS DES DECRYPT ;;; =========================================================================== ;;; ;;; DES_DOCSIS [in] - select between DES (DES CBC) and DOCSIS (DOCSIS DES) ;;; ;;; NOTE: clobbers OpMask registers %macro GENERIC_DES_DEC 1 %define %%DES_DOCSIS %1 ;; push the registers and allocate the stack frame mov rax, rsp sub rsp, STACKFRAME_size and rsp, -64 mov [rsp + _rsp_save], rax ; original SP mov [rsp + _gpr_save + 0*8], r12 mov [rsp + _gpr_save + 1*8], r13 mov [rsp + _gpr_save + 2*8], r14 mov [rsp + _gpr_save + 3*8], r15 DES_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 ;; CFB part for DOCSIS %ifidn %%DES_DOCSIS, DOCSIS DES_CFB_ONE DEC, rsp + _key_sched, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, rsp + _tmp_in, rsp + _tmp_out, rsp + _tmp_iv, rsp + _tmp_mask %endif xor OFFSET, OFFSET %%_gen_des_dec_loop: cmp OFFSET, SIZE jge %%_gen_des_dec_loop_end ;; calculate min of bytes_left and 64, convert to qword mask GET_MASK8 IA0 ; IA0 = mask kmovw k7, DWORD(IA0) ;; run masked loads mov IA0, [STATE + _des_args_in + (0*PTR_SZ)] mov IA1, [STATE + _des_args_in + (1*PTR_SZ)] mov IA2, [STATE + _des_args_in + (2*PTR_SZ)] mov INP0, [STATE + _des_args_in + (3*PTR_SZ)] mov INP1, [STATE + _des_args_in + (4*PTR_SZ)] mov INP2, [STATE + _des_args_in + (5*PTR_SZ)] mov INP3, [STATE + _des_args_in + (6*PTR_SZ)] mov INP4, [STATE + _des_args_in + (7*PTR_SZ)] vmovdqu64 ZW0{k7}{z}, [IA0 + OFFSET] vmovdqu64 ZW1{k7}{z}, [IA1 + OFFSET] vmovdqu64 ZW2{k7}{z}, [IA2 + OFFSET] vmovdqu64 ZW3{k7}{z}, [INP0 + OFFSET] vmovdqu64 ZW4{k7}{z}, [INP1 + OFFSET] vmovdqu64 ZW5{k7}{z}, [INP2 + OFFSET] vmovdqu64 ZW6{k7}{z}, [INP3 + OFFSET] vmovdqu64 ZW7{k7}{z}, [INP4 + OFFSET] mov IA0, [STATE + _des_args_in + (8*PTR_SZ)] mov IA1, [STATE + _des_args_in + (9*PTR_SZ)] mov IA2, [STATE + _des_args_in + (10*PTR_SZ)] mov INP0, [STATE + _des_args_in + (11*PTR_SZ)] mov INP1, [STATE + _des_args_in + 
(12*PTR_SZ)] mov INP2, [STATE + _des_args_in + (13*PTR_SZ)] mov INP3, [STATE + _des_args_in + (14*PTR_SZ)] mov INP4, [STATE + _des_args_in + (15*PTR_SZ)] vmovdqu64 ZW8{k7}{z}, [IA0 + OFFSET] vmovdqu64 ZW9{k7}{z}, [IA1 + OFFSET] vmovdqu64 ZW10{k7}{z}, [IA2 + OFFSET] vmovdqu64 ZW11{k7}{z}, [INP0 + OFFSET] vmovdqu64 ZW12{k7}{z}, [INP1 + OFFSET] vmovdqu64 ZW13{k7}{z}, [INP2 + OFFSET] vmovdqu64 ZW14{k7}{z}, [INP3 + OFFSET] vmovdqu64 ZW15{k7}{z}, [INP4 + OFFSET] ;; Transpose input TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 ;; DES CBC comes here %assign RN 0 %assign LN 1 %rep 8 vmovdqa64 ZTMP12, ZW %+ RN ; keep R0 as IV for the next round vmovdqa64 ZTMP13, ZW %+ LN ; keep L0 as IV for the next round DES_ENC_DEC DEC, ZW %+ RN, ZW %+ LN, rsp + _key_sched, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 vpxord ZW %+ RN, ZW %+ RN, ZIV1 ; R0 = R0 ^ IV1 vpxord ZW %+ LN, ZW %+ LN, ZIV0 ; L0 = L0 ^ IV0 vmovdqa64 ZIV0, ZTMP12 vmovdqa64 ZIV1, ZTMP13 %assign RN (RN + 2) %assign LN (LN + 2) %endrep ;; transpose data on output TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 ;; run masked stores mov IA0, [STATE + _des_args_out + (0*PTR_SZ)] mov IA1, [STATE + _des_args_out + (1*PTR_SZ)] mov IA2, [STATE + _des_args_out + (2*PTR_SZ)] mov INP0, [STATE + _des_args_out + (3*PTR_SZ)] mov INP1, [STATE + _des_args_out + (4*PTR_SZ)] mov INP2, [STATE + _des_args_out + (5*PTR_SZ)] mov INP3, [STATE + _des_args_out + (6*PTR_SZ)] mov INP4, [STATE + _des_args_out + (7*PTR_SZ)] vmovdqu64 [IA0 + OFFSET]{k7}, ZW0 vmovdqu64 [IA1 + OFFSET]{k7}, ZW1 vmovdqu64 [IA2 + OFFSET]{k7}, ZW2 vmovdqu64 [INP0 + OFFSET]{k7}, ZW3 vmovdqu64 [INP1 + OFFSET]{k7}, ZW4 vmovdqu64 [INP2 + OFFSET]{k7}, ZW5 vmovdqu64 [INP3 + OFFSET]{k7}, ZW6 vmovdqu64 [INP4 + OFFSET]{k7}, ZW7 mov IA0, [STATE + _des_args_out + (8*PTR_SZ)] mov IA1, [STATE + _des_args_out + (9*PTR_SZ)] mov IA2, [STATE + _des_args_out + (10*PTR_SZ)] mov INP0, [STATE + _des_args_out + (11*PTR_SZ)] mov INP1, [STATE + _des_args_out + (12*PTR_SZ)] mov INP2, [STATE + _des_args_out + (13*PTR_SZ)] mov INP3, [STATE + _des_args_out + (14*PTR_SZ)] mov INP4, [STATE + _des_args_out + (15*PTR_SZ)] vmovdqu64 [IA0 + OFFSET]{k7}, ZW8 vmovdqu64 [IA1 + OFFSET]{k7}, ZW9 vmovdqu64 [IA2 + OFFSET]{k7}, ZW10 vmovdqu64 [INP0 + OFFSET]{k7}, ZW11 vmovdqu64 [INP1 + OFFSET]{k7}, ZW12 vmovdqu64 [INP2 + OFFSET]{k7}, ZW13 vmovdqu64 [INP3 + OFFSET]{k7}, ZW14 vmovdqu64 [INP4 + OFFSET]{k7}, ZW15 add OFFSET, 64 jmp %%_gen_des_dec_loop %%_gen_des_dec_loop_end: ;; store IV and update pointers DES_FINISH ZIV0, ZIV1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4 ;; restore stack pointer and registers mov r12, [rsp + _gpr_save + 0*8] mov r13, [rsp + _gpr_save + 1*8] mov r14, [rsp + _gpr_save + 2*8] mov r15, [rsp + _gpr_save + 3*8] mov rsp, [rsp + _rsp_save] ; original SP %endmacro ;;; ======================================================== ;;; DATA section .data default rel align 64 mask_values: dd 0x04000000, 0x04000000, 0x04000000, 0x04000000 dd 0x04000000, 0x04000000, 0x04000000, 0x04000000 dd 0x04000000, 0x04000000, 0x04000000, 0x04000000 dd 0x04000000, 0x04000000, 0x04000000, 0x04000000 dd 0x40240202, 0x40240202, 0x40240202, 0x40240202 dd 0x40240202, 0x40240202, 0x40240202, 0x40240202 dd 
0x40240202, 0x40240202, 0x40240202, 0x40240202 dd 0x40240202, 0x40240202, 0x40240202, 0x40240202 dd 0x00001110, 0x00001110, 0x00001110, 0x00001110 dd 0x00001110, 0x00001110, 0x00001110, 0x00001110 dd 0x00001110, 0x00001110, 0x00001110, 0x00001110 dd 0x00001110, 0x00001110, 0x00001110, 0x00001110 dd 0x01088000, 0x01088000, 0x01088000, 0x01088000 dd 0x01088000, 0x01088000, 0x01088000, 0x01088000 dd 0x01088000, 0x01088000, 0x01088000, 0x01088000 dd 0x01088000, 0x01088000, 0x01088000, 0x01088000 dd 0x00000001, 0x00000001, 0x00000001, 0x00000001 dd 0x00000001, 0x00000001, 0x00000001, 0x00000001 dd 0x00000001, 0x00000001, 0x00000001, 0x00000001 dd 0x00000001, 0x00000001, 0x00000001, 0x00000001 dd 0x0081000C, 0x0081000C, 0x0081000C, 0x0081000C dd 0x0081000C, 0x0081000C, 0x0081000C, 0x0081000C dd 0x0081000C, 0x0081000C, 0x0081000C, 0x0081000C dd 0x0081000C, 0x0081000C, 0x0081000C, 0x0081000C dd 0x00000020, 0x00000020, 0x00000020, 0x00000020 dd 0x00000020, 0x00000020, 0x00000020, 0x00000020 dd 0x00000020, 0x00000020, 0x00000020, 0x00000020 dd 0x00000020, 0x00000020, 0x00000020, 0x00000020 dd 0x00000040, 0x00000040, 0x00000040, 0x00000040 dd 0x00000040, 0x00000040, 0x00000040, 0x00000040 dd 0x00000040, 0x00000040, 0x00000040, 0x00000040 dd 0x00000040, 0x00000040, 0x00000040, 0x00000040 dd 0x00400400, 0x00400400, 0x00400400, 0x00400400 dd 0x00400400, 0x00400400, 0x00400400, 0x00400400 dd 0x00400400, 0x00400400, 0x00400400, 0x00400400 dd 0x00400400, 0x00400400, 0x00400400, 0x00400400 dd 0x00000800, 0x00000800, 0x00000800, 0x00000800 dd 0x00000800, 0x00000800, 0x00000800, 0x00000800 dd 0x00000800, 0x00000800, 0x00000800, 0x00000800 dd 0x00000800, 0x00000800, 0x00000800, 0x00000800 dd 0x00002000, 0x00002000, 0x00002000, 0x00002000 dd 0x00002000, 0x00002000, 0x00002000, 0x00002000 dd 0x00002000, 0x00002000, 0x00002000, 0x00002000 dd 0x00002000, 0x00002000, 0x00002000, 0x00002000 dd 0x00100000, 0x00100000, 0x00100000, 0x00100000 dd 0x00100000, 0x00100000, 0x00100000, 0x00100000 dd 0x00100000, 0x00100000, 0x00100000, 0x00100000 dd 0x00100000, 0x00100000, 0x00100000, 0x00100000 dd 0x00004000, 0x00004000, 0x00004000, 0x00004000 dd 0x00004000, 0x00004000, 0x00004000, 0x00004000 dd 0x00004000, 0x00004000, 0x00004000, 0x00004000 dd 0x00004000, 0x00004000, 0x00004000, 0x00004000 dd 0x00020000, 0x00020000, 0x00020000, 0x00020000 dd 0x00020000, 0x00020000, 0x00020000, 0x00020000 dd 0x00020000, 0x00020000, 0x00020000, 0x00020000 dd 0x00020000, 0x00020000, 0x00020000, 0x00020000 dd 0x02000000, 0x02000000, 0x02000000, 0x02000000 dd 0x02000000, 0x02000000, 0x02000000, 0x02000000 dd 0x02000000, 0x02000000, 0x02000000, 0x02000000 dd 0x02000000, 0x02000000, 0x02000000, 0x02000000 dd 0x08000000, 0x08000000, 0x08000000, 0x08000000 dd 0x08000000, 0x08000000, 0x08000000, 0x08000000 dd 0x08000000, 0x08000000, 0x08000000, 0x08000000 dd 0x08000000, 0x08000000, 0x08000000, 0x08000000 dd 0x00000080, 0x00000080, 0x00000080, 0x00000080 dd 0x00000080, 0x00000080, 0x00000080, 0x00000080 dd 0x00000080, 0x00000080, 0x00000080, 0x00000080 dd 0x00000080, 0x00000080, 0x00000080, 0x00000080 dd 0x20000000, 0x20000000, 0x20000000, 0x20000000 dd 0x20000000, 0x20000000, 0x20000000, 0x20000000 dd 0x20000000, 0x20000000, 0x20000000, 0x20000000 dd 0x20000000, 0x20000000, 0x20000000, 0x20000000 dd 0x90000000, 0x90000000, 0x90000000, 0x90000000 dd 0x90000000, 0x90000000, 0x90000000, 0x90000000 dd 0x90000000, 0x90000000, 0x90000000, 0x90000000 dd 0x90000000, 0x90000000, 0x90000000, 0x90000000 align 64 init_perm_consts: dd 0x0f0f0f0f, 0x0f0f0f0f, 
0x0f0f0f0f, 0x0f0f0f0f dd 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f dd 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f dd 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f dd 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff dd 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff dd 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff dd 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff dd 0x33333333, 0x33333333, 0x33333333, 0x33333333 dd 0x33333333, 0x33333333, 0x33333333, 0x33333333 dd 0x33333333, 0x33333333, 0x33333333, 0x33333333 dd 0x33333333, 0x33333333, 0x33333333, 0x33333333 dd 0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff dd 0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff dd 0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff dd 0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff dd 0x55555555, 0x55555555, 0x55555555, 0x55555555 dd 0x55555555, 0x55555555, 0x55555555, 0x55555555 dd 0x55555555, 0x55555555, 0x55555555, 0x55555555 dd 0x55555555, 0x55555555, 0x55555555, 0x55555555 ;;; S-Box table align 64 S_box_flipped: ;; SBOX0 dw 0x07, 0x02, 0x0c, 0x0f, 0x04, 0x0b, 0x0a, 0x0c dw 0x0b, 0x07, 0x06, 0x09, 0x0d, 0x04, 0x00, 0x0a dw 0x02, 0x08, 0x05, 0x03, 0x0f, 0x06, 0x09, 0x05 dw 0x08, 0x01, 0x03, 0x0e, 0x01, 0x0d, 0x0e, 0x00 dw 0x00, 0x0f, 0x05, 0x0a, 0x07, 0x02, 0x09, 0x05 dw 0x0e, 0x01, 0x03, 0x0c, 0x0b, 0x08, 0x0c, 0x06 dw 0x0f, 0x03, 0x06, 0x0d, 0x04, 0x09, 0x0a, 0x00 dw 0x02, 0x04, 0x0d, 0x07, 0x08, 0x0e, 0x01, 0x0b ;; SBOX1 dw 0x0f, 0x00, 0x09, 0x0a, 0x06, 0x05, 0x03, 0x09 dw 0x01, 0x0e, 0x04, 0x03, 0x0c, 0x0b, 0x0a, 0x04 dw 0x08, 0x07, 0x0e, 0x01, 0x0d, 0x02, 0x00, 0x0c dw 0x07, 0x0d, 0x0b, 0x06, 0x02, 0x08, 0x05, 0x0f dw 0x0c, 0x0b, 0x03, 0x0d, 0x0f, 0x0c, 0x06, 0x00 dw 0x02, 0x05, 0x08, 0x0e, 0x01, 0x02, 0x0d, 0x07 dw 0x0b, 0x01, 0x00, 0x06, 0x04, 0x0f, 0x09, 0x0a dw 0x0e, 0x08, 0x05, 0x03, 0x07, 0x04, 0x0a, 0x09 ;; SBOX2 dw 0x05, 0x0b, 0x08, 0x0d, 0x06, 0x01, 0x0d, 0x0a dw 0x09, 0x02, 0x03, 0x04, 0x0f, 0x0c, 0x04, 0x07 dw 0x00, 0x06, 0x0b, 0x08, 0x0c, 0x0f, 0x02, 0x05 dw 0x07, 0x09, 0x0e, 0x03, 0x0a, 0x00, 0x01, 0x0e dw 0x0b, 0x08, 0x04, 0x02, 0x0c, 0x06, 0x03, 0x0d dw 0x00, 0x0b, 0x0a, 0x07, 0x06, 0x01, 0x0f, 0x04 dw 0x0e, 0x05, 0x01, 0x0f, 0x02, 0x09, 0x0d, 0x0a dw 0x09, 0x00, 0x07, 0x0c, 0x05, 0x0e, 0x08, 0x03 ;; SBOX3 dw 0x0e, 0x05, 0x08, 0x0f, 0x00, 0x03, 0x0d, 0x0a dw 0x07, 0x09, 0x01, 0x0c, 0x09, 0x0e, 0x02, 0x01 dw 0x0b, 0x06, 0x04, 0x08, 0x06, 0x0d, 0x03, 0x04 dw 0x0c, 0x00, 0x0a, 0x07, 0x05, 0x0b, 0x0f, 0x02 dw 0x0b, 0x0c, 0x02, 0x09, 0x06, 0x05, 0x08, 0x03 dw 0x0d, 0x00, 0x04, 0x0a, 0x00, 0x0b, 0x07, 0x04 dw 0x01, 0x0f, 0x0e, 0x02, 0x0f, 0x08, 0x05, 0x0e dw 0x0a, 0x06, 0x03, 0x0d, 0x0c, 0x01, 0x09, 0x07 ;; SBOX4 dw 0x04, 0x02, 0x01, 0x0f, 0x0e, 0x05, 0x0b, 0x06 dw 0x02, 0x08, 0x0c, 0x03, 0x0d, 0x0e, 0x07, 0x00 dw 0x03, 0x04, 0x0a, 0x09, 0x05, 0x0b, 0x00, 0x0c dw 0x08, 0x0d, 0x0f, 0x0a, 0x06, 0x01, 0x09, 0x07 dw 0x07, 0x0d, 0x0a, 0x06, 0x02, 0x08, 0x0c, 0x05 dw 0x04, 0x03, 0x0f, 0x00, 0x0b, 0x04, 0x01, 0x0a dw 0x0d, 0x01, 0x00, 0x0f, 0x0e, 0x07, 0x09, 0x02 dw 0x03, 0x0e, 0x05, 0x09, 0x08, 0x0b, 0x06, 0x0c ;; SBOX5 dw 0x03, 0x09, 0x00, 0x0e, 0x09, 0x04, 0x07, 0x08 dw 0x05, 0x0f, 0x0c, 0x02, 0x06, 0x03, 0x0a, 0x0d dw 0x08, 0x07, 0x0b, 0x00, 0x04, 0x01, 0x0e, 0x0b dw 0x0f, 0x0a, 0x02, 0x05, 0x01, 0x0c, 0x0d, 0x06 dw 0x05, 0x02, 0x06, 0x0d, 0x0e, 0x09, 0x00, 0x06 dw 0x02, 0x04, 0x0b, 0x08, 0x09, 0x0f, 0x0c, 0x01 dw 0x0f, 0x0c, 0x08, 0x07, 0x03, 0x0a, 0x0d, 0x00 dw 0x04, 0x03, 0x07, 0x0e, 0x0a, 0x05, 0x01, 0x0b ;; SBOX6 dw 0x02, 0x08, 0x0c, 0x05, 0x0f, 0x03, 0x0a, 0x00 dw 0x04, 0x0d, 0x09, 0x06, 
0x01, 0x0e, 0x06, 0x09 dw 0x0d, 0x02, 0x03, 0x0f, 0x00, 0x0c, 0x05, 0x0a dw 0x07, 0x0b, 0x0e, 0x01, 0x0b, 0x07, 0x08, 0x04 dw 0x0b, 0x06, 0x07, 0x09, 0x02, 0x08, 0x04, 0x07 dw 0x0d, 0x0b, 0x0a, 0x00, 0x08, 0x05, 0x01, 0x0c dw 0x00, 0x0d, 0x0c, 0x0a, 0x09, 0x02, 0x0f, 0x04 dw 0x0e, 0x01, 0x03, 0x0f, 0x05, 0x0e, 0x06, 0x03 ;; SBOX7 dw 0x0b, 0x0e, 0x05, 0x00, 0x06, 0x09, 0x0a, 0x0f dw 0x01, 0x02, 0x0c, 0x05, 0x0d, 0x07, 0x03, 0x0a dw 0x04, 0x0d, 0x09, 0x06, 0x0f, 0x03, 0x00, 0x0c dw 0x02, 0x08, 0x07, 0x0b, 0x08, 0x04, 0x0e, 0x01 dw 0x08, 0x04, 0x03, 0x0f, 0x05, 0x02, 0x00, 0x0c dw 0x0b, 0x07, 0x06, 0x09, 0x0e, 0x01, 0x09, 0x06 dw 0x0f, 0x08, 0x0a, 0x03, 0x0c, 0x05, 0x07, 0x0a dw 0x01, 0x0e, 0x0d, 0x00, 0x02, 0x0b, 0x04, 0x0d ;;; Used in DOCSIS DES partial block scheduling 16 x 32bit of value 1 align 64 vec_ones_32b: dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 align 64 and_eu: dd 0x3f003f00, 0x3f003f00, 0x3f003f00, 0x3f003f00 dd 0x3f003f00, 0x3f003f00, 0x3f003f00, 0x3f003f00 dd 0x3f003f00, 0x3f003f00, 0x3f003f00, 0x3f003f00 dd 0x3f003f00, 0x3f003f00, 0x3f003f00, 0x3f003f00 align 64 and_ed: dd 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f dd 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f dd 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f dd 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f align 64 idx_e: dq 0x0d0c090805040100, 0x0f0e0b0a07060302 dq 0x1d1c191815141110, 0x1f1e1b1a17161312 dq 0x2d2c292825242120, 0x2f2e2b2a27262322 dq 0x3d3c393835343130, 0x3f3e3b3a37363332 align 64 reg_values16bit_7: dq 0x001f001f001f001f, 0x001f001f001f001f dq 0x001f001f001f001f, 0x001f001f001f001f dq 0x001f001f001f001f, 0x001f001f001f001f dq 0x001f001f001f001f, 0x001f001f001f001f align 64 shuffle_reg: dq 0x0705060403010200, 0x0f0d0e0c0b090a08 dq 0x1715161413111210, 0x1f1d1e1c1b191a18 dq 0x2725262423212220, 0x2f2d2e2c2b292a28 dq 0x3735363433313230, 0x3f3d3e3c3b393a38 ;;; ======================================================== ;;; CODE section .text ;;; arg 1 : pointer to DES OOO structure ;;; arg 2 : size in bytes align 64 MKGLOBAL(des_x16_cbc_enc_avx512,function,internal) des_x16_cbc_enc_avx512: GENERIC_DES_ENC DES ret ;;; arg 1 : pointer to DES OOO structure ;;; arg 2 : size in bytes align 64 MKGLOBAL(des_x16_cbc_dec_avx512,function,internal) des_x16_cbc_dec_avx512: GENERIC_DES_DEC DES ret ;;; arg 1 : pointer to DES OOO structure ;;; arg 2 : size in bytes align 64 MKGLOBAL(docsis_des_x16_enc_avx512,function,internal) docsis_des_x16_enc_avx512: GENERIC_DES_ENC DOCSIS ret ;;; arg 1 : pointer to DES OOO structure ;;; arg 2 : size in bytes align 64 MKGLOBAL(docsis_des_x16_dec_avx512,function,internal) docsis_des_x16_dec_avx512: GENERIC_DES_DEC DOCSIS ret intel-ipsec-mb-0.48/avx512/mb_mgr_avx512.c000066400000000000000000000613521321406316400200270ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2012-2017, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
* Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include #include #include #include "os.h" #define AVX512 #include "mb_mgr.h" #include "save_xmms.h" #include "asm.h" #ifndef NO_GCM #include "gcm_defines.h" #endif #include "des.h" JOB_AES_HMAC *submit_job_aes128_enc_avx(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_aes128_enc_avx(MB_MGR_AES_OOO *state); JOB_AES_HMAC *submit_job_aes192_enc_avx(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_aes192_enc_avx(MB_MGR_AES_OOO *state); JOB_AES_HMAC *submit_job_aes256_enc_avx(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_aes256_enc_avx(MB_MGR_AES_OOO *state); JOB_AES_HMAC *submit_job_aes_xcbc_avx(MB_MGR_AES_XCBC_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_aes_xcbc_avx(MB_MGR_AES_XCBC_OOO *state); JOB_AES_HMAC *submit_job_des_cbc_enc_avx512(MB_MGR_DES_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_des_cbc_enc_avx512(MB_MGR_DES_OOO *state); JOB_AES_HMAC *submit_job_des_cbc_dec_avx512(MB_MGR_DES_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_des_cbc_dec_avx512(MB_MGR_DES_OOO *state); JOB_AES_HMAC *submit_job_docsis_des_enc_avx512(MB_MGR_DES_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_docsis_des_enc_avx512(MB_MGR_DES_OOO *state); JOB_AES_HMAC *submit_job_docsis_des_dec_avx512(MB_MGR_DES_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_docsis_des_dec_avx512(MB_MGR_DES_OOO *state); #define SAVE_XMMS save_xmms_avx #define RESTORE_XMMS restore_xmms_avx #define SUBMIT_JOB_AES128_ENC submit_job_aes128_enc_avx #define SUBMIT_JOB_AES128_DEC submit_job_aes128_dec_avx #define FLUSH_JOB_AES128_ENC flush_job_aes128_enc_avx #define SUBMIT_JOB_AES192_ENC submit_job_aes192_enc_avx #define SUBMIT_JOB_AES192_DEC submit_job_aes192_dec_avx #define FLUSH_JOB_AES192_ENC flush_job_aes192_enc_avx #define SUBMIT_JOB_AES256_ENC submit_job_aes256_enc_avx #define SUBMIT_JOB_AES256_DEC submit_job_aes256_dec_avx #define FLUSH_JOB_AES256_ENC flush_job_aes256_enc_avx #define SUBMIT_JOB_AES128_CNTR submit_job_aes128_cntr_avx #define SUBMIT_JOB_AES192_CNTR submit_job_aes192_cntr_avx #define SUBMIT_JOB_AES256_CNTR submit_job_aes256_cntr_avx #define AES_CBC_DEC_128 aes_cbc_dec_128_avx #define AES_CBC_DEC_192 aes_cbc_dec_192_avx #define AES_CBC_DEC_256 aes_cbc_dec_256_avx #define AES_CNTR_128 aes_cntr_128_avx #define AES_CNTR_192 aes_cntr_192_avx #define AES_CNTR_256 aes_cntr_256_avx #define SUBMIT_JOB_AES_XCBC submit_job_aes_xcbc_avx #define FLUSH_JOB_AES_XCBC flush_job_aes_xcbc_avx #define SUBMIT_JOB_AES128_DEC 
submit_job_aes128_dec_avx #define SUBMIT_JOB_AES192_DEC submit_job_aes192_dec_avx #define SUBMIT_JOB_AES256_DEC submit_job_aes256_dec_avx #define SUBMIT_JOB_DES_CBC_ENC submit_job_des_cbc_enc_avx512 #define FLUSH_JOB_DES_CBC_ENC flush_job_des_cbc_enc_avx512 #define SUBMIT_JOB_DES_CBC_DEC submit_job_des_cbc_dec_avx512 #define FLUSH_JOB_DES_CBC_DEC flush_job_des_cbc_dec_avx512 #define SUBMIT_JOB_DOCSIS_DES_ENC submit_job_docsis_des_enc_avx512 #define FLUSH_JOB_DOCSIS_DES_ENC flush_job_docsis_des_enc_avx512 #define SUBMIT_JOB_DOCSIS_DES_DEC submit_job_docsis_des_dec_avx512 #define FLUSH_JOB_DOCSIS_DES_DEC flush_job_docsis_des_dec_avx512 #define SUBMIT_JOB_AES_ENC SUBMIT_JOB_AES_ENC_AVX512 #define FLUSH_JOB_AES_ENC FLUSH_JOB_AES_ENC_AVX512 #define SUBMIT_JOB_AES_DEC SUBMIT_JOB_AES_DEC_AVX512 JOB_AES_HMAC *submit_job_hmac_avx512(MB_MGR_HMAC_SHA_1_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_hmac_avx512(MB_MGR_HMAC_SHA_1_OOO *state); JOB_AES_HMAC *submit_job_hmac_sha_224_avx512(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_hmac_sha_224_avx512(MB_MGR_HMAC_SHA_256_OOO *state); JOB_AES_HMAC *submit_job_hmac_sha_256_avx512(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_hmac_sha_256_avx512(MB_MGR_HMAC_SHA_256_OOO *state); JOB_AES_HMAC *submit_job_hmac_sha_384_avx512(MB_MGR_HMAC_SHA_512_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_hmac_sha_384_avx512(MB_MGR_HMAC_SHA_512_OOO *state); JOB_AES_HMAC *submit_job_hmac_sha_512_avx512(MB_MGR_HMAC_SHA_512_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_hmac_sha_512_avx512(MB_MGR_HMAC_SHA_512_OOO *state); JOB_AES_HMAC *submit_job_hmac_md5_avx2(MB_MGR_HMAC_MD5_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_hmac_md5_avx2(MB_MGR_HMAC_MD5_OOO *state); #define SUBMIT_JOB_HMAC submit_job_hmac_avx512 #define FLUSH_JOB_HMAC flush_job_hmac_avx512 #define SUBMIT_JOB_HMAC_SHA_224 submit_job_hmac_sha_224_avx512 #define FLUSH_JOB_HMAC_SHA_224 flush_job_hmac_sha_224_avx512 #define SUBMIT_JOB_HMAC_SHA_256 submit_job_hmac_sha_256_avx512 #define FLUSH_JOB_HMAC_SHA_256 flush_job_hmac_sha_256_avx512 #define SUBMIT_JOB_HMAC_SHA_384 submit_job_hmac_sha_384_avx512 #define FLUSH_JOB_HMAC_SHA_384 flush_job_hmac_sha_384_avx512 #define SUBMIT_JOB_HMAC_SHA_512 submit_job_hmac_sha_512_avx512 #define FLUSH_JOB_HMAC_SHA_512 flush_job_hmac_sha_512_avx512 #define SUBMIT_JOB_HMAC_MD5 submit_job_hmac_md5_avx2 #define FLUSH_JOB_HMAC_MD5 flush_job_hmac_md5_avx2 #ifndef NO_GCM #define AES_GCM_DEC_128 aes_gcm_dec_128_avx_gen4 #define AES_GCM_ENC_128 aes_gcm_enc_128_avx_gen4 #define AES_GCM_DEC_192 aes_gcm_dec_192_avx_gen4 #define AES_GCM_ENC_192 aes_gcm_enc_192_avx_gen4 #define AES_GCM_DEC_256 aes_gcm_dec_256_avx_gen4 #define AES_GCM_ENC_256 aes_gcm_enc_256_avx_gen4 #endif /* NO_GCM */ /* ====================================================================== */ #define SUBMIT_JOB submit_job_avx512 #define FLUSH_JOB flush_job_avx512 #define QUEUE_SIZE queue_size_avx512 #define SUBMIT_JOB_NOCHECK submit_job_nocheck_avx512 /* ====================================================================== */ #define SUBMIT_JOB_HASH SUBMIT_JOB_HASH_AVX512 #define FLUSH_JOB_HASH FLUSH_JOB_HASH_AVX512 /* ====================================================================== */ #define AES_CFB_128_ONE aes_cfb_128_one_avx512 void aes128_cbc_mac_x8(AES_ARGS_x8 *args, uint64_t len); #define AES128_CBC_MAC aes128_cbc_mac_x8 #define FLUSH_JOB_AES_CCM_AUTH flush_job_aes_ccm_auth_arch #define SUBMIT_JOB_AES_CCM_AUTH 
submit_job_aes_ccm_auth_arch #define AES_CCM_MAX_JOBS 8 /* ====================================================================== */ void init_mb_mgr_avx512(MB_MGR *state) { unsigned int j; UINT8 *p; /* Init AES out-of-order fields */ state->aes128_ooo.lens[0] = 0; state->aes128_ooo.lens[1] = 0; state->aes128_ooo.lens[2] = 0; state->aes128_ooo.lens[3] = 0; state->aes128_ooo.lens[4] = 0; state->aes128_ooo.lens[5] = 0; state->aes128_ooo.lens[6] = 0; state->aes128_ooo.lens[7] = 0; state->aes128_ooo.unused_lanes = 0xF76543210; state->aes128_ooo.job_in_lane[0] = NULL; state->aes128_ooo.job_in_lane[1] = NULL; state->aes128_ooo.job_in_lane[2] = NULL; state->aes128_ooo.job_in_lane[3] = NULL; state->aes128_ooo.job_in_lane[4] = NULL; state->aes128_ooo.job_in_lane[5] = NULL; state->aes128_ooo.job_in_lane[6] = NULL; state->aes128_ooo.job_in_lane[7] = NULL; state->aes192_ooo.lens[0] = 0; state->aes192_ooo.lens[1] = 0; state->aes192_ooo.lens[2] = 0; state->aes192_ooo.lens[3] = 0; state->aes192_ooo.lens[4] = 0; state->aes192_ooo.lens[5] = 0; state->aes192_ooo.lens[6] = 0; state->aes192_ooo.lens[7] = 0; state->aes192_ooo.unused_lanes = 0xF76543210; state->aes192_ooo.job_in_lane[0] = NULL; state->aes192_ooo.job_in_lane[1] = NULL; state->aes192_ooo.job_in_lane[2] = NULL; state->aes192_ooo.job_in_lane[3] = NULL; state->aes192_ooo.job_in_lane[4] = NULL; state->aes192_ooo.job_in_lane[5] = NULL; state->aes192_ooo.job_in_lane[6] = NULL; state->aes192_ooo.job_in_lane[7] = NULL; state->aes256_ooo.lens[0] = 0; state->aes256_ooo.lens[1] = 0; state->aes256_ooo.lens[2] = 0; state->aes256_ooo.lens[3] = 0; state->aes256_ooo.lens[4] = 0; state->aes256_ooo.lens[5] = 0; state->aes256_ooo.lens[6] = 0; state->aes256_ooo.lens[7] = 0; state->aes256_ooo.unused_lanes = 0xF76543210; state->aes256_ooo.job_in_lane[0] = NULL; state->aes256_ooo.job_in_lane[1] = NULL; state->aes256_ooo.job_in_lane[2] = NULL; state->aes256_ooo.job_in_lane[3] = NULL; state->aes256_ooo.job_in_lane[4] = NULL; state->aes256_ooo.job_in_lane[5] = NULL; state->aes256_ooo.job_in_lane[6] = NULL; state->aes256_ooo.job_in_lane[7] = NULL; /* DOCSIS SEC BPI (AES CBC + AES CFB for partial block) * uses same settings as AES128 CBC. 
*/ state->docsis_sec_ooo.lens[0] = 0; state->docsis_sec_ooo.lens[1] = 0; state->docsis_sec_ooo.lens[2] = 0; state->docsis_sec_ooo.lens[3] = 0; state->docsis_sec_ooo.lens[4] = 0; state->docsis_sec_ooo.lens[5] = 0; state->docsis_sec_ooo.lens[6] = 0; state->docsis_sec_ooo.lens[7] = 0; state->docsis_sec_ooo.unused_lanes = 0xF76543210; state->docsis_sec_ooo.job_in_lane[0] = NULL; state->docsis_sec_ooo.job_in_lane[1] = NULL; state->docsis_sec_ooo.job_in_lane[2] = NULL; state->docsis_sec_ooo.job_in_lane[3] = NULL; state->docsis_sec_ooo.job_in_lane[4] = NULL; state->docsis_sec_ooo.job_in_lane[5] = NULL; state->docsis_sec_ooo.job_in_lane[6] = NULL; state->docsis_sec_ooo.job_in_lane[7] = NULL; /* DOCSIS DES (DES CBC + DES CFB for partial block) */ /* - separate DES OOO for encryption */ for (j = 0; j < AVX512_NUM_DES_LANES; j++) { state->des_enc_ooo.lens[j] = 0; state->des_enc_ooo.job_in_lane[j] = NULL; } state->des_enc_ooo.unused_lanes = 0xFEDCBA9876543210; state->des_enc_ooo.num_lanes_inuse = 0; memset(&state->des_enc_ooo.args, 0, sizeof(state->des_enc_ooo.args)); /* - separate DES OOO for decryption */ for (j = 0; j < AVX512_NUM_DES_LANES; j++) { state->des_dec_ooo.lens[j] = 0; state->des_dec_ooo.job_in_lane[j] = NULL; } state->des_dec_ooo.unused_lanes = 0xFEDCBA9876543210; state->des_dec_ooo.num_lanes_inuse = 0; memset(&state->des_dec_ooo.args, 0, sizeof(state->des_dec_ooo.args)); /* - separate DOCSIS DES OOO for encryption */ for (j = 0; j < AVX512_NUM_DES_LANES; j++) { state->docsis_des_enc_ooo.lens[j] = 0; state->docsis_des_enc_ooo.job_in_lane[j] = NULL; } state->docsis_des_enc_ooo.unused_lanes = 0xFEDCBA9876543210; state->docsis_des_enc_ooo.num_lanes_inuse = 0; memset(&state->docsis_des_enc_ooo.args, 0, sizeof(state->docsis_des_enc_ooo.args)); /* - separate DES OOO for decryption */ for (j = 0; j < AVX512_NUM_DES_LANES; j++) { state->docsis_des_dec_ooo.lens[j] = 0; state->docsis_des_dec_ooo.job_in_lane[j] = NULL; } state->docsis_des_dec_ooo.unused_lanes = 0xFEDCBA9876543210; state->docsis_des_dec_ooo.num_lanes_inuse = 0; memset(&state->docsis_des_dec_ooo.args, 0, sizeof(state->docsis_des_dec_ooo.args)); /* Init HMAC/SHA1 out-of-order fields */ state->hmac_sha_1_ooo.lens[0] = 0; state->hmac_sha_1_ooo.lens[1] = 0; state->hmac_sha_1_ooo.lens[2] = 0; state->hmac_sha_1_ooo.lens[3] = 0; state->hmac_sha_1_ooo.lens[4] = 0; state->hmac_sha_1_ooo.lens[5] = 0; state->hmac_sha_1_ooo.lens[6] = 0; state->hmac_sha_1_ooo.lens[7] = 0; state->hmac_sha_1_ooo.lens[8] = 0; state->hmac_sha_1_ooo.lens[9] = 0; state->hmac_sha_1_ooo.lens[10] = 0; state->hmac_sha_1_ooo.lens[11] = 0; state->hmac_sha_1_ooo.lens[12] = 0; state->hmac_sha_1_ooo.lens[13] = 0; state->hmac_sha_1_ooo.lens[14] = 0; state->hmac_sha_1_ooo.lens[15] = 0; state->hmac_sha_1_ooo.unused_lanes = 0xFEDCBA9876543210; state->hmac_sha_1_ooo.num_lanes_inuse = 0; for (j = 0; j < AVX512_NUM_SHA1_LANES; j++) { state->hmac_sha_1_ooo.ldata[j].job_in_lane = NULL; state->hmac_sha_1_ooo.ldata[j].extra_block[64] = 0x80; memset(state->hmac_sha_1_ooo.ldata[j].extra_block + 65, 0x00, 64 + 7); p = state->hmac_sha_1_ooo.ldata[j].outer_block; memset(p + 5*4 + 1, 0x00, 64 - 5*4 - 1 - 2); p[5 * 4] = 0x80; p[64 - 2] = 0x02; p[64 - 1] = 0xA0; } /* Init HMAC/SHA224 out-of-order fields */ state->hmac_sha_224_ooo.lens[0] = 0; state->hmac_sha_224_ooo.lens[1] = 0; state->hmac_sha_224_ooo.lens[2] = 0; state->hmac_sha_224_ooo.lens[3] = 0; state->hmac_sha_224_ooo.lens[4] = 0; state->hmac_sha_224_ooo.lens[5] = 0; state->hmac_sha_224_ooo.lens[6] = 0; state->hmac_sha_224_ooo.lens[7] = 
0; state->hmac_sha_224_ooo.lens[8] = 0; state->hmac_sha_224_ooo.lens[9] = 0; state->hmac_sha_224_ooo.lens[10] = 0; state->hmac_sha_224_ooo.lens[11] = 0; state->hmac_sha_224_ooo.lens[12] = 0; state->hmac_sha_224_ooo.lens[13] = 0; state->hmac_sha_224_ooo.lens[14] = 0; state->hmac_sha_224_ooo.lens[15] = 0; state->hmac_sha_224_ooo.unused_lanes = 0xFEDCBA9876543210; state->hmac_sha_224_ooo.num_lanes_inuse = 0; /* sha256 and sha224 are very similar except for * digest constants and output size */ for (j = 0; j < AVX512_NUM_SHA256_LANES; j++) { state->hmac_sha_224_ooo.ldata[j].job_in_lane = NULL; state->hmac_sha_224_ooo.ldata[j].extra_block[64] = 0x80; memset(state->hmac_sha_224_ooo.ldata[j].extra_block + 65, 0x00, 64 + 7); p = state->hmac_sha_224_ooo.ldata[j].outer_block; memset(p + 8*4 + 1, 0x00, 64 - 8*4 - 1 - 2); p[7 * 4] = 0x80; /* digest 7 words long */ p[64 - 2] = 0x02; /* length in little endian = 0x02E0 */ p[64 - 1] = 0xE0; } /* Init HMAC/SHA256 out-of-order fields */ state->hmac_sha_256_ooo.lens[0] = 0; state->hmac_sha_256_ooo.lens[1] = 0; state->hmac_sha_256_ooo.lens[2] = 0; state->hmac_sha_256_ooo.lens[3] = 0; state->hmac_sha_256_ooo.lens[4] = 0; state->hmac_sha_256_ooo.lens[5] = 0; state->hmac_sha_256_ooo.lens[6] = 0; state->hmac_sha_256_ooo.lens[7] = 0; state->hmac_sha_256_ooo.lens[8] = 0; state->hmac_sha_256_ooo.lens[9] = 0; state->hmac_sha_256_ooo.lens[10] = 0; state->hmac_sha_256_ooo.lens[11] = 0; state->hmac_sha_256_ooo.lens[12] = 0; state->hmac_sha_256_ooo.lens[13] = 0; state->hmac_sha_256_ooo.lens[14] = 0; state->hmac_sha_256_ooo.lens[15] = 0; state->hmac_sha_256_ooo.unused_lanes = 0xFEDCBA9876543210; state->hmac_sha_256_ooo.num_lanes_inuse = 0; for (j = 0; j < AVX512_NUM_SHA256_LANES; j++) { state->hmac_sha_256_ooo.ldata[j].job_in_lane = NULL; state->hmac_sha_256_ooo.ldata[j].extra_block[64] = 0x80; memset(state->hmac_sha_256_ooo.ldata[j].extra_block + 65, 0x00, 64 + 7); /* hmac related */ p = state->hmac_sha_256_ooo.ldata[j].outer_block; memset(p + 8*4 + 1, 0x00, 64 - 8*4 - 1 - 2); p[8 * 4] = 0x80; /* 8 digest words */ p[64 - 2] = 0x03; /* length */ p[64 - 1] = 0x00; } /* Init HMAC/SHA384 out-of-order fields */ state->hmac_sha_384_ooo.lens[0] = 0; state->hmac_sha_384_ooo.lens[1] = 0; state->hmac_sha_384_ooo.lens[2] = 0; state->hmac_sha_384_ooo.lens[3] = 0; state->hmac_sha_384_ooo.lens[4] = 0; state->hmac_sha_384_ooo.lens[5] = 0; state->hmac_sha_384_ooo.lens[6] = 0; state->hmac_sha_384_ooo.lens[7] = 0; state->hmac_sha_384_ooo.unused_lanes = 0xF76543210; for (j = 0; j < AVX512_NUM_SHA512_LANES; j++) { MB_MGR_HMAC_SHA_512_OOO *ctx = &state->hmac_sha_384_ooo; ctx->ldata[j].job_in_lane = NULL; ctx->ldata[j].extra_block[SHA_384_BLOCK_SIZE] = 0x80; memset(ctx->ldata[j].extra_block + (SHA_384_BLOCK_SIZE + 1), 0x00, SHA_384_BLOCK_SIZE + 7); p = ctx->ldata[j].outer_block; /* special end point because this length is constant */ memset(p + SHA384_DIGEST_SIZE_IN_BYTES + 1, 0x00, SHA_384_BLOCK_SIZE - SHA384_DIGEST_SIZE_IN_BYTES - 1 - 2); /* mark the end */ p[SHA384_DIGEST_SIZE_IN_BYTES] = 0x80; /* hmac outer block length always of fixed size, * it is OKey length, a whole message block length, 1024 bits, * with padding plus the length of the inner digest, * which is 384 bits, 1408 bits == 0x0580. * The input message block needs to be converted to big endian * within the sha implementation before use. 
*/ p[SHA_384_BLOCK_SIZE - 2] = 0x05; p[SHA_384_BLOCK_SIZE - 1] = 0x80; } /* Init HMAC/SHA512 out-of-order fields */ state->hmac_sha_512_ooo.lens[0] = 0; state->hmac_sha_512_ooo.lens[1] = 0; state->hmac_sha_512_ooo.lens[2] = 0; state->hmac_sha_512_ooo.lens[3] = 0; state->hmac_sha_512_ooo.lens[4] = 0; state->hmac_sha_512_ooo.lens[5] = 0; state->hmac_sha_512_ooo.lens[6] = 0; state->hmac_sha_512_ooo.lens[7] = 0; state->hmac_sha_512_ooo.unused_lanes = 0xF76543210; for (j = 0; j < AVX512_NUM_SHA512_LANES; j++) { MB_MGR_HMAC_SHA_512_OOO *ctx = &state->hmac_sha_512_ooo; ctx->ldata[j].job_in_lane = NULL; ctx->ldata[j].extra_block[SHA_512_BLOCK_SIZE] = 0x80; memset(ctx->ldata[j].extra_block + (SHA_512_BLOCK_SIZE + 1), 0x00, SHA_512_BLOCK_SIZE + 7); p = ctx->ldata[j].outer_block; /* special end point because this length is constant */ memset(p + SHA512_DIGEST_SIZE_IN_BYTES + 1, 0x00, SHA_512_BLOCK_SIZE - SHA512_DIGEST_SIZE_IN_BYTES - 1 - 2); /* mark the end */ p[SHA512_DIGEST_SIZE_IN_BYTES] = 0x80; /* hmac outer block length always of fixed size, * it is OKey length, a whole message block length, 1024 bits, * with padding plus the length of the inner digest, * which is 512 bits, 1536 bits == 0x600. * The input message block needs to be converted to big endian * within the sha implementation before use. */ p[SHA_512_BLOCK_SIZE - 2] = 0x06; p[SHA_512_BLOCK_SIZE - 1] = 0x00; } /* Init HMAC/MD5 out-of-order fields */ state->hmac_md5_ooo.lens[0] = 0; state->hmac_md5_ooo.lens[1] = 0; state->hmac_md5_ooo.lens[2] = 0; state->hmac_md5_ooo.lens[3] = 0; state->hmac_md5_ooo.lens[4] = 0; state->hmac_md5_ooo.lens[5] = 0; state->hmac_md5_ooo.lens[6] = 0; state->hmac_md5_ooo.lens[7] = 0; state->hmac_md5_ooo.lens[8] = 0; state->hmac_md5_ooo.lens[9] = 0; state->hmac_md5_ooo.lens[10] = 0; state->hmac_md5_ooo.lens[11] = 0; state->hmac_md5_ooo.lens[12] = 0; state->hmac_md5_ooo.lens[13] = 0; state->hmac_md5_ooo.lens[14] = 0; state->hmac_md5_ooo.lens[15] = 0; state->hmac_md5_ooo.unused_lanes = 0xFEDCBA9876543210; state->hmac_md5_ooo.num_lanes_inuse = 0; for (j = 0; j < AVX512_NUM_MD5_LANES; j++) { state->hmac_md5_ooo.ldata[j].job_in_lane = NULL; state->hmac_md5_ooo.ldata[j].extra_block[64] = 0x80; memset(state->hmac_md5_ooo.ldata[j].extra_block + 65, 0x00, 64 + 7); p = state->hmac_md5_ooo.ldata[j].outer_block; memset(p + 5*4 + 1, 0x00, 64 - 5*4 - 1 - 2); p[4 * 4] = 0x80; p[64 - 7] = 0x02; p[64 - 8] = 0x80; } /* Init AES/XCBC OOO fields */ state->aes_xcbc_ooo.lens[0] = 0; state->aes_xcbc_ooo.lens[1] = 0; state->aes_xcbc_ooo.lens[2] = 0; state->aes_xcbc_ooo.lens[3] = 0; state->aes_xcbc_ooo.lens[4] = 0; state->aes_xcbc_ooo.lens[5] = 0; state->aes_xcbc_ooo.lens[6] = 0; state->aes_xcbc_ooo.lens[7] = 0; state->aes_xcbc_ooo.unused_lanes = 0xF76543210; for (j = 0; j < 8 ; j++) { state->aes_xcbc_ooo.ldata[j].job_in_lane = NULL; state->aes_xcbc_ooo.ldata[j].final_block[16] = 0x80; memset(state->aes_xcbc_ooo.ldata[j].final_block + 17, 0x00, 15); } /* Init AES-CCM auth out-of-order fields */ for (j = 0; j < 8; j++) { state->aes_ccm_ooo.init_done[j] = 0; state->aes_ccm_ooo.lens[j] = 0; state->aes_ccm_ooo.job_in_lane[j] = NULL; } state->aes_ccm_ooo.unused_lanes = 0xF76543210; /* Init "in order" components */ state->next_job = 0; state->earliest_job = -1; /* set handlers */ state->get_next_job = get_next_job_avx512; state->submit_job = submit_job_avx512; state->submit_job_nocheck = submit_job_nocheck_avx512; state->get_completed_job = get_completed_job_avx512; state->flush_job = flush_job_avx512; state->queue_size = queue_size_avx512; 
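/*
 * Illustrative recap of the outer-block length bytes written above (a
 * comment only, not functional code): the outer HMAC hash always digests
 * exactly one block consisting of the inner digest plus padding, and the
 * opad key block has already been folded into the initial digest, so the
 * length field appended to the outer block is block size + inner digest
 * size, in bits:
 *   HMAC-SHA1:    512 + 160 =  672 = 0x02A0  (big endian)
 *   HMAC-SHA224:  512 + 224 =  736 = 0x02E0  (big endian)
 *   HMAC-SHA256:  512 + 256 =  768 = 0x0300  (big endian)
 *   HMAC-SHA384: 1024 + 384 = 1408 = 0x0580  (big endian)
 *   HMAC-SHA512: 1024 + 512 = 1536 = 0x0600  (big endian)
 *   HMAC-MD5:     512 + 128 =  640 = 0x0280  (little endian)
 * which is why the byte pairs 0x02/0xA0, 0x02/0xE0, 0x03/0x00, 0x05/0x80,
 * 0x06/0x00 and 0x80/0x02 are stored in the trailing length bytes of the
 * respective outer blocks above.
 */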
state->keyexp_128 = aes_keyexp_128_avx512; state->keyexp_192 = aes_keyexp_192_avx512; state->keyexp_256 = aes_keyexp_256_avx512; } #include "mb_mgr_code.h" intel-ipsec-mb-0.48/avx512/mb_mgr_des_avx512.asm000066400000000000000000000377631321406316400212310ustar00rootroot00000000000000;; ;; Copyright (c) 2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ;; In System V AMD64 ABI ;; calle saves: RBX, RBP, R12-R15 ;; Windows x64 ABI ;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15 ;; ;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Windows clobbers: RAX R8 R9 R10 R11 ;; Windows preserves: RBX RCX RDX RBP RSI RDI R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Linux clobbers: RAX RCX RDX R10 R11 ;; Linux preserves: RBX RBP RSI RDI R8 R9 R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Clobbers ZMM0-31, K1-7 (K1-2 and K4-6 here but DES underneath clobbers K1-7). 
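The submit and flush macros below pop and push lane indexes from the _des_unused_lanes word, which both the C initialisation above (0xFEDCBA9876543210) and the shr/shl-by-4 sequences in the assembly treat as a nibble-packed LIFO stack of free lanes. A minimal C sketch of that bookkeeping follows; the helper names are hypothetical and not part of the library.

#include <stdint.h>

/* Illustrative sketch only: mirrors the "and 0xF / shr 4" pop and the
 * "shl 4 / or idx" push used by the submit and flush code below. */
static unsigned lane_pop(uint64_t *unused_lanes)
{
        unsigned lane = (unsigned)(*unused_lanes & 0xF); /* lowest nibble */

        *unused_lanes >>= 4;            /* drop it from the stack */
        return lane;
}

static void lane_push(uint64_t *unused_lanes, unsigned lane)
{
        *unused_lanes = (*unused_lanes << 4) | (lane & 0xF);
}

Starting from 0xFEDCBA9876543210, sixteen successive pops return lanes 0 through 15; pushes return them to the pool in LIFO order, exactly as the macros do when a job completes.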
%include "os.asm" %include "reg_sizes.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "constants.asm" ;%define DO_DBGPRINT %include "dbgprint.asm" extern docsis_des_x16_enc_avx512 extern docsis_des_x16_dec_avx512 extern des_x16_cbc_enc_avx512 extern des_x16_cbc_dec_avx512 %ifdef LINUX %define arg1 rdi %define arg2 rsi %define arg3 rdx %define arg4 rcx %else %define arg1 rcx %define arg2 rdx %define arg3 r8 %define arg4 r9 %endif %define STATE arg1 %define JOB arg2 %define IA0 arg3 %define IA1 arg4 %define IA2 r10 %define MIN_IDX r11 %define MIN_LEN rax %define LANE r11 %define AVX512_NUM_DES_LANES 16 %define ZTMP0 zmm0 %define ZTMP1 zmm1 %define ZTMP2 zmm2 %define ZTMP3 zmm3 %define ZTMP4 zmm4 %define ZTMP5 zmm5 %define ZTMP6 zmm6 %define ZTMP7 zmm7 %define ZTMP8 zmm8 %define ZTMP9 zmm9 ;;; =========================================================================== ;;; =========================================================================== ;;; MACROS ;;; =========================================================================== ;;; =========================================================================== ;;; =========================================================================== ;;; DES/DOCSIS DES job submit ;;; =========================================================================== ;;; DES_DOCSIS [in] - DES or DOCSIS cipher selection ;;; ENC_DEC [in] - ENCrypt or DECrypt seection %macro GENERIC_DES_SUBMIT 2 %define %%DES_DOCSIS %1 %define %%ENC_DEC %2 ;; get unsued lane and increment number of lanes in use mov IA0, [STATE + _des_unused_lanes] mov LANE, IA0 and LANE, 0xF ;; just a nibble shr IA0, 4 mov [STATE + _des_unused_lanes], IA0 add qword [STATE + _des_lanes_in_use], 1 ;; store job info in OOO structure ;; - job pointer mov [STATE + _des_job_in_lane + LANE*8], JOB ;; - key schedule %ifidn %%ENC_DEC, ENC mov IA2, [JOB + _aes_enc_key_expanded] %else mov IA2, [JOB + _aes_dec_key_expanded] %endif mov [STATE + _des_args_keys + LANE*8], IA2 ;; - IV mov IA2, [JOB + _iv] mov DWORD(IA0), [IA2] mov DWORD(IA1), [IA2 + 4] mov [STATE + _des_args_IV + LANE*4], DWORD(IA0) mov [STATE + _des_args_IV + LANE*4 + (AVX512_NUM_DES_LANES*4)], DWORD(IA1) ;; - src pointer mov IA0, [JOB + _src] add IA0, [JOB + _cipher_start_src_offset_in_bytes] mov [STATE + _des_args_in + LANE*8], IA0 ;; - destination pointer mov IA1, [JOB + _dst] mov [STATE + _des_args_out + LANE*8], IA1 ;; - length in bytes (block aligned) mov IA2, [JOB + _msg_len_to_cipher_in_bytes] and IA2, -8 mov [STATE + _des_lens + LANE*2], WORD(IA2) %ifidn %%DES_DOCSIS, DOCSIS ;; - block length mov [STATE + _des_args_BLen + LANE*4], DWORD(IA2) ;; - last in add IA0, IA2 mov [STATE + _des_args_LIn + LANE*8], IA0 ;; - last out add IA1, IA2 mov [STATE + _des_args_LOut + LANE*8], IA1 ;; - partial length mov IA2, [JOB + _msg_len_to_cipher_in_bytes] and IA2, 7 mov [STATE + _des_args_PLen + LANE*4], DWORD(IA2) %endif ; DOCSIS ;; is there enough jobs to process them in parallel? 
cmp qword [STATE + _des_lanes_in_use], AVX512_NUM_DES_LANES jb %%_des_submit_null_end ;; schedule the processing ;; - find min job size vmovdqa XWORD(ZTMP0), [STATE + _des_lens + 2*0] vphminposuw XWORD(ZTMP2), XWORD(ZTMP0) vpextrw DWORD(MIN_LEN), XWORD(ZTMP2), 0 ; min value vpextrw DWORD(MIN_IDX), XWORD(ZTMP2), 1 ; min index vmovdqa XWORD(ZTMP1), [STATE + _des_lens + 2*8] vphminposuw XWORD(ZTMP2), XWORD(ZTMP1) vpextrw DWORD(IA2), XWORD(ZTMP2), 0 ; min value cmp DWORD(MIN_LEN), DWORD(IA2) jle %%_use_min vpextrw DWORD(MIN_IDX), XWORD(ZTMP2), 1 ; min index add DWORD(MIN_IDX), 8 ; but index +8 mov MIN_LEN, IA2 ; min len %%_use_min: cmp MIN_LEN, 0 je %%_len_is_0 vpbroadcastw XWORD(ZTMP3), WORD(MIN_LEN) vpsubw XWORD(ZTMP0), XWORD(ZTMP0), XWORD(ZTMP3) vmovdqa [STATE + _des_lens + 2*0], XWORD(ZTMP0) vpsubw XWORD(ZTMP1), XWORD(ZTMP1), XWORD(ZTMP3) vmovdqa [STATE + _des_lens + 2*8], XWORD(ZTMP1) push MIN_IDX mov arg2, MIN_LEN %ifidn %%DES_DOCSIS, DOCSIS %ifidn %%ENC_DEC, ENC call docsis_des_x16_enc_avx512 %else ; ENC call docsis_des_x16_dec_avx512 %endif ; DEC %else ; DES %ifidn %%ENC_DEC, ENC call des_x16_cbc_enc_avx512 %else ; ENC call des_x16_cbc_dec_avx512 %endif ; DEC %endif pop MIN_IDX jmp %%_des_submit_end %%_des_submit_null_end: xor rax, rax jmp %%_des_submit_return %%_len_is_0: %ifidn %%DES_DOCSIS, DOCSIS cmp dword [STATE + _des_args_PLen + MIN_IDX*4], 0 jz %%_des_submit_end push MIN_IDX xor arg2, arg2 ; len is 0 %ifidn %%ENC_DEC, ENC call docsis_des_x16_enc_avx512 %else ; ENC call docsis_des_x16_dec_avx512 %endif ; DEC pop MIN_IDX %endif ; DOCSIS ;; fall trough %%_des_submit_end: ;; return a job ;; - decrement number of jobs in use sub qword [STATE + _des_lanes_in_use], 1 ;; - put the lane back to free lanes pool mov IA0, [STATE + _des_unused_lanes] shl IA0, 4 or IA0, MIN_IDX mov [STATE + _des_unused_lanes], IA0 ;; - mark job as complete ;; - clear job pointer mov rax, [STATE + _des_job_in_lane + MIN_IDX*8] mov qword [STATE + _des_job_in_lane + MIN_IDX*8], 0 or dword [rax + _status], STS_COMPLETED_AES %%_des_submit_return: %endmacro ;;; =========================================================================== ;;; DES/DOCSIS DES flush ;;; =========================================================================== ;;; DES_DOCSIS [in] - DES or DOCSIS cipher selection ;;; ENC_DEC [in] - ENCrypt or DECrypt seection ;;; ;;; Clobbers k1, k2, k4, k5 and k6 %macro GENERIC_DES_FLUSH 2 %define %%DES_DOCSIS %1 %define %%ENC_DEC %2 cmp qword [STATE + _des_lanes_in_use], 0 je %%_des_flush_null_end ;; find non-null job vpxord ZTMP0, ZTMP0, ZTMP0 vmovdqu64 ZTMP1, [STATE + _des_job_in_lane + (0*PTR_SZ)] vmovdqu64 ZTMP2, [STATE + _des_job_in_lane + (8*PTR_SZ)] vpcmpq k1, ZTMP1, ZTMP0, 4 ; NEQ vpcmpq k2, ZTMP2, ZTMP0, 4 ; NEQ xor IA0, IA0 xor IA1, IA1 kmovw DWORD(IA0), k1 kmovw DWORD(IA1), k2 mov DWORD(IA2), DWORD(IA1) shl DWORD(IA2), 8 or DWORD(IA2), DWORD(IA0) ; mask of non-null jobs in IA2 not BYTE(IA0) kmovw k4, DWORD(IA0) not BYTE(IA1) kmovw k5, DWORD(IA1) mov DWORD(IA0), DWORD(IA2) not WORD(IA0) kmovw k6, DWORD(IA0) ; mask of NULL jobs in k4, k5 and k6 mov DWORD(IA0), DWORD(IA2) xor IA2, IA2 bsf WORD(IA2), WORD(IA0) ; index of the 1st set bit in IA2 ;; copy good lane data into NULL lanes ;; - k1(L8)/k2(H8) - masks of non-null jobs ;; - k4(L8)/k5(H8)/k6 - masks of NULL jobs ;; - IA2 index of 1st non-null job ;; - in pointer mov IA0, [STATE + _des_args_in + IA2*8] vpbroadcastq ZTMP1, IA0 vmovdqu64 [STATE + _des_args_in + (0*PTR_SZ)]{k4}, ZTMP1 vmovdqu64 [STATE + _des_args_in + (8*PTR_SZ)]{k5}, ZTMP1 ;; 
- out pointer mov IA0, [STATE + _des_args_out + IA2*8] vpbroadcastq ZTMP1, IA0 vmovdqu64 [STATE + _des_args_out + (0*PTR_SZ)]{k4}, ZTMP1 vmovdqu64 [STATE + _des_args_out + (8*PTR_SZ)]{k5}, ZTMP1 ;; - key schedule mov IA0, [STATE + _des_args_keys + IA2*8] vpbroadcastq ZTMP1, IA0 vmovdqu64 [STATE + _des_args_keys + (0*PTR_SZ)]{k4}, ZTMP1 vmovdqu64 [STATE + _des_args_keys + (8*PTR_SZ)]{k5}, ZTMP1 ;; - zero partial len vmovdqu32 [STATE + _des_args_PLen]{k6}, ZTMP0 ;; - set len to UINT16_MAX mov WORD(IA0), 0xffff vpbroadcastw ZTMP1, WORD(IA0) vmovdqu16 [STATE + _des_lens]{k6}, ZTMP1 ;; - IV mov DWORD(IA0), [STATE + _des_args_IV + IA2*4] mov DWORD(IA1), [STATE + _des_args_IV + IA2*4 + (16*4)] vpbroadcastd ZTMP1, DWORD(IA0) vpbroadcastd ZTMP2, DWORD(IA1) vmovdqu32 [STATE + _des_args_IV]{k6}, ZTMP1 vmovdqu32 [STATE + _des_args_IV + (16*4)]{k6}, ZTMP2 ;; schedule the processing ;; - find min job size vmovdqa XWORD(ZTMP0), [STATE + _des_lens + 2*0] vphminposuw XWORD(ZTMP2), XWORD(ZTMP0) vpextrw DWORD(MIN_LEN), XWORD(ZTMP2), 0 ; min value vpextrw DWORD(MIN_IDX), XWORD(ZTMP2), 1 ; min index vmovdqa XWORD(ZTMP1), [STATE + _des_lens + 2*8] vphminposuw XWORD(ZTMP2), XWORD(ZTMP1) vpextrw DWORD(IA2), XWORD(ZTMP2), 0 ; min value cmp DWORD(MIN_LEN), DWORD(IA2) jle %%_use_min vpextrw DWORD(MIN_IDX), XWORD(ZTMP2), 1 ; min index add DWORD(MIN_IDX), 8 ; but index +8 mov MIN_LEN, IA2 ; min len %%_use_min: vpbroadcastw XWORD(ZTMP3), WORD(MIN_LEN) vpsubw XWORD(ZTMP0), XWORD(ZTMP0), XWORD(ZTMP3) vmovdqa [STATE + _des_lens + 2*0], XWORD(ZTMP0) vpsubw XWORD(ZTMP1), XWORD(ZTMP1), XWORD(ZTMP3) vmovdqa [STATE + _des_lens + 2*8], XWORD(ZTMP1) push MIN_IDX mov arg2, MIN_LEN %ifidn %%DES_DOCSIS, DOCSIS %ifidn %%ENC_DEC, ENC call docsis_des_x16_enc_avx512 %else ; ENC call docsis_des_x16_dec_avx512 %endif ; DEC %else ; DES %ifidn %%ENC_DEC, ENC call des_x16_cbc_enc_avx512 %else ; ENC call des_x16_cbc_dec_avx512 %endif ; DEC %endif pop MIN_IDX jmp %%_des_flush_end %%_des_flush_null_end: xor rax, rax jmp %%_des_flush_return %%_des_flush_end: ;; return a job ;; - decrement number of jobs in use sub qword [STATE + _des_lanes_in_use], 1 ;; - put the lane back to free lanes pool mov IA0, [STATE + _des_unused_lanes] shl IA0, 4 or IA0, MIN_IDX mov [STATE + _des_unused_lanes], IA0 ;; - mark job as complete mov rax, [STATE + _des_job_in_lane + MIN_IDX*8] or dword [rax + _status], STS_COMPLETED_AES ;; - clear job pointer mov qword [STATE + _des_job_in_lane + MIN_IDX*8], 0 %%_des_flush_return: %endmacro ;;; ======================================================== ;;; DATA section .data default rel ;;; ======================================================== ;;; CODE section .text ;;; arg 1 : pointer to DES OOO structure ;;; arg 2 : job align 64 MKGLOBAL(submit_job_des_cbc_enc_avx512,function,internal) submit_job_des_cbc_enc_avx512: GENERIC_DES_SUBMIT DES, ENC ret ;;; arg 1 : pointer to DES OOO structure ;;; arg 2 : job align 64 MKGLOBAL(submit_job_des_cbc_dec_avx512,function,internal) submit_job_des_cbc_dec_avx512: GENERIC_DES_SUBMIT DES, DEC ret ;;; arg 1 : pointer to DES OOO structure ;;; arg 2 : job align 64 MKGLOBAL(submit_job_docsis_des_enc_avx512,function,internal) submit_job_docsis_des_enc_avx512: GENERIC_DES_SUBMIT DOCSIS, ENC ret ;;; arg 1 : pointer to DES OOO structure ;;; arg 2 : job align 64 MKGLOBAL(submit_job_docsis_des_dec_avx512,function,internal) submit_job_docsis_des_dec_avx512: GENERIC_DES_SUBMIT DOCSIS, DEC ret ;;; arg 1 : pointer to DES OOO structure align 64 MKGLOBAL(flush_job_des_cbc_enc_avx512,function,internal) 
flush_job_des_cbc_enc_avx512: GENERIC_DES_FLUSH DES, ENC ret ;;; arg 1 : pointer to DES OOO structure align 64 MKGLOBAL(flush_job_des_cbc_dec_avx512,function,internal) flush_job_des_cbc_dec_avx512: GENERIC_DES_FLUSH DES, DEC ret ;;; arg 1 : pointer to DES OOO structure align 64 MKGLOBAL(flush_job_docsis_des_enc_avx512,function,internal) flush_job_docsis_des_enc_avx512: GENERIC_DES_FLUSH DOCSIS, ENC ret ;;; arg 1 : pointer to DES OOO structure align 64 MKGLOBAL(flush_job_docsis_des_dec_avx512,function,internal) flush_job_docsis_des_dec_avx512: GENERIC_DES_FLUSH DOCSIS, DEC ret intel-ipsec-mb-0.48/avx512/mb_mgr_hmac_flush_avx512.asm000066400000000000000000000243741321406316400225610ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
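Conceptually, the GENERIC_DES_FLUSH macro earlier in mb_mgr_des_avx512.asm takes the first lane that still holds a job and clones its pointers into every idle lane, forcing the idle lanes' lengths to UINT16_MAX (and their DOCSIS partial length to zero) so they never win the minimum-length search and the x16 kernel always sees valid pointers. A hedged C sketch of that idea is below; the structure and field names are illustrative only, not the library's actual MB_MGR_DES_OOO layout.

#include <stddef.h>
#include <stdint.h>

#define NUM_LANES 16

/* Illustrative stand-in for the real OOO manager; the actual layout is
 * defined in mb_mgr_datastruct.asm / mb_mgr.h. */
struct des_ooo_sketch {
        void *job_in_lane[NUM_LANES];
        const void *in[NUM_LANES];
        void *out[NUM_LANES];
        const void *keys[NUM_LANES];
        uint16_t lens[NUM_LANES];
        uint32_t partial_len[NUM_LANES];
};

static void fill_idle_lanes(struct des_ooo_sketch *s, unsigned good_lane)
{
        unsigned i;

        for (i = 0; i < NUM_LANES; i++) {
                if (s->job_in_lane[i] != NULL)
                        continue;                  /* lane busy, leave it */
                s->in[i] = s->in[good_lane];       /* borrow valid pointers */
                s->out[i] = s->out[good_lane];
                s->keys[i] = s->keys[good_lane];
                s->lens[i] = UINT16_MAX;           /* never the minimum */
                s->partial_len[i] = 0;             /* no DOCSIS partial block */
        }
}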
;; ;; In System V AMD64 ABI ;; calle saves: RBX, RBP, R12-R15 ;; Windows x64 ABI ;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15 ;; ;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Windows clobbers: RAX RCX RDX R8 R9 R10 R11 ;; Windows preserves: RBX RBP RSI RDI R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Linux clobbers: RAX RSI RDI R8 R9 R10 R11 ;; Linux preserves: RBX RCX RDX RBP R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Clobbers ZMM0-31 %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" ;; %define DO_DBGPRINT %include "dbgprint.asm" extern sha1_x16_avx512 section .data default rel align 16 byteswap: dq 0x0405060700010203 dq 0x0c0d0e0f08090a0b align 32 len_masks: dq 0x000000000000FFFF, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 dq 0x00000000FFFF0000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 dq 0x0000FFFF00000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 dq 0xFFFF000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 dq 0x0000000000000000, 0x000000000000FFFF, 0x0000000000000000, 0x0000000000000000 dq 0x0000000000000000, 0x00000000FFFF0000, 0x0000000000000000, 0x0000000000000000 dq 0x0000000000000000, 0x0000FFFF00000000, 0x0000000000000000, 0x0000000000000000 dq 0x0000000000000000, 0xFFFF000000000000, 0x0000000000000000, 0x0000000000000000 dq 0x0000000000000000, 0x0000000000000000, 0x000000000000FFFF, 0x0000000000000000 dq 0x0000000000000000, 0x0000000000000000, 0x00000000FFFF0000, 0x0000000000000000 dq 0x0000000000000000, 0x0000000000000000, 0x0000FFFF00000000, 0x0000000000000000 dq 0x0000000000000000, 0x0000000000000000, 0xFFFF000000000000, 0x0000000000000000 dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x000000000000FFFF dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x00000000FFFF0000 dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000FFFF00000000 dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFF000000000000 lane_1: dq 1 lane_2: dq 2 lane_3: dq 3 lane_4: dq 4 lane_5: dq 5 lane_6: dq 6 lane_7: dq 7 lane_8: dq 8 lane_9: dq 9 lane_10: dq 10 lane_11: dq 11 lane_12: dq 12 lane_13: dq 13 lane_14: dq 14 lane_15: dq 15 section .text %if 1 %ifdef LINUX %define arg1 rdi %define arg2 rsi %else %define arg1 rcx %define arg2 rdx %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbx, rdi, rbp %define idx rbp %define unused_lanes r9 %define lane_data r9 %define tmp2 r9 %define num_lanes_inuse r12 %define len_upper r13 %define idx_upper r14 %define job_rax rax %define tmp1 rax %define size_offset rax %define tmp rax %define start_offset rax %define tmp3 arg1 %define extra_blocks arg2 %define p arg2 %define tmp4 r8 %endif ; we clobber rbp, called routine clobbers r12-r15 struc STACK _gpr_save: resq 5 _rsp_save: resq 1 endstruc %define APPEND(a,b) a %+ b ; JOB* flush_job_hmac_avx(MB_MGR_HMAC_SHA_1_OOO *state) ; arg 1 : rcx : state MKGLOBAL(flush_job_hmac_avx512,function,internal) flush_job_hmac_avx512: mov rax, rsp sub rsp, STACK_size and rsp, -32 ; align stack to 32 byte boundary mov [rsp + _gpr_save + 8*0], rbp mov [rsp + _gpr_save + 8*1], r12 mov [rsp + _gpr_save + 8*2], r13 mov [rsp + _gpr_save + 8*3], r14 mov [rsp + _gpr_save + 8*4], r15 mov [rsp + _rsp_save], rax DBGPRINTL "---------- start hmac flush avx512 
-----------" mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse_sha1] ;empty? cmp num_lanes_inuse, 0 jz return_null ; find a lane with a non-null job xor idx, idx %assign I 1 %rep 15 cmp qword [state + _ldata + (I * _HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0 cmovne idx, [rel APPEND(lane_,I)] %assign I (I+1) %endrep copy_lane_data: ; copy valid lane (idx) to empty lanes vmovdqa ymm0, [state + _lens] mov tmp, [state + _args_data_ptr + PTR_SZ*idx] %assign I 0 %rep 16 cmp qword [state + _ldata + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 jne APPEND(skip_,I) mov [state + _args_data_ptr + PTR_SZ*I], tmp vpor ymm0, ymm0, [rel len_masks + 32*I] ; 32 for ymm, 16 for xmm APPEND(skip_,I): %assign I (I+1) %endrep vmovdqa [state + _lens], ymm0 vphminposuw xmm1, xmm0 vpextrw DWORD(len2), xmm1, 0 ; min value vpextrw DWORD(idx), xmm1, 1 ; min index (0...7) vmovdqa xmm2, [state + _lens + 8*2] vphminposuw xmm3, xmm2 vpextrw DWORD(len_upper), xmm3, 0 ; min value vpextrw DWORD(idx_upper), xmm3, 1 ; min index (8...F) cmp len2, len_upper jle use_min vmovdqa xmm1, xmm3 mov len2, len_upper mov idx, idx_upper ; idx would be in range 0..7 add idx, 8 ; to reflect that index is in 8..F range use_min: DBGPRINTL64 "FLUSH min_length", len2 DBGPRINTL64 "FLUSH min_length index ", idx cmp len2, 0 je len_is_0 vpbroadcastw xmm1, xmm1 DBGPRINTL_XMM "FLUSH lens after shuffle", xmm1 vpsubw xmm0, xmm0, xmm1 vmovdqa [state + _lens], xmm0 vpsubw xmm2, xmm2, xmm1 vmovdqa [state + _lens + 8*2], xmm2 DBGPRINTL_XMM "FLUSH lens immediately after min subtraction (0..7)", xmm0 DBGPRINTL_XMM "FLUSH lens immediately after min subtraction (8..F)", xmm2 ; "state" and "args" are the same address, arg1 ; len is arg2 call sha1_x16_avx512 ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done], 1 mov DWORD(size_offset), [lane_data + _size_offset] mov qword [lane_data + _extra_block + size_offset], 0 mov word [state + _lens + 2*idx], 1 lea tmp, [lane_data + _outer_block] mov job, [lane_data + _job_in_lane] mov [state + _args_data_ptr + PTR_SZ*idx], tmp vmovd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1 vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2 vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3 vpshufb xmm0, xmm0, [rel byteswap] mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE] bswap DWORD(tmp) vmovdqa [lane_data + _outer_block], xmm0 mov [lane_data + _outer_block + 4*4], DWORD(tmp) mov tmp, [job + _auth_key_xor_opad] vmovdqu xmm0, [tmp] mov DWORD(tmp), [tmp + 4*4] vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3 mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp) jmp copy_lane_data align 16 proc_extra_blocks: mov 
DWORD(start_offset), [lane_data + _start_offset] mov [state + _lens + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr + PTR_SZ*idx], tmp mov dword [lane_data + _extra_blocks], 0 jmp copy_lane_data return_null: DBGPRINTL "FLUSH *** ---------- return null" xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane] mov qword [lane_data + _job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC mov unused_lanes, [state + _unused_lanes] shl unused_lanes, 4 ;; a nibble or unused_lanes, idx mov [state + _unused_lanes], unused_lanes sub dword [state + _num_lanes_inuse_sha1], 1 mov p, [job_rax + _auth_tag_output] ; copy 12 bytes mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] mov DWORD(tmp4), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE] mov DWORD(tmp3), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE] bswap DWORD(tmp2) bswap DWORD(tmp4) bswap DWORD(tmp3) mov [p + 0*4], DWORD(tmp2) mov [p + 1*4], DWORD(tmp4) mov [p + 2*4], DWORD(tmp3) return: DBGPRINTL "---------- exit hmac flush avx512 -----------" mov rbp, [rsp + _gpr_save + 8*0] mov r12, [rsp + _gpr_save + 8*1] mov r13, [rsp + _gpr_save + 8*2] mov r14, [rsp + _gpr_save + 8*3] mov r15, [rsp + _gpr_save + 8*4] mov rsp, [rsp + _rsp_save] ret intel-ipsec-mb-0.48/avx512/mb_mgr_hmac_sha_224_flush_avx512.asm000066400000000000000000000031461321406316400237750ustar00rootroot00000000000000;; ;; Copyright (c) 2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
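The SHA-1 flush and submit paths above locate the lane with the fewest remaining blocks by running vphminposuw over each 8-lane half of the 16-entry length array, comparing the two winners (adding 8 to the index if the upper half wins), subtracting the minimum from every lane and then driving the x16 SHA kernel for exactly that many blocks. A scalar C sketch of the same scheduling step follows; the function and variable names are illustrative only.

#include <stdint.h>

#define NUM_LANES 16

/* Illustrative scalar version of the vphminposuw-based scheduling: find
 * the lane with the fewest blocks left, subtract that count from every
 * lane and report how many blocks the x16 kernel should run now. */
static unsigned schedule_min_blocks(uint16_t lens[NUM_LANES],
                                    unsigned *min_idx)
{
        unsigned i, idx = 0;
        uint16_t min_len = lens[0];

        for (i = 1; i < NUM_LANES; i++) {
                if (lens[i] < min_len) {
                        min_len = lens[i];
                        idx = i;
                }
        }
        for (i = 0; i < NUM_LANES; i++)
                lens[i] -= min_len;  /* idle lanes keep a large value */

        *min_idx = idx;
        return min_len;              /* blocks to process in this pass */
}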
;; %define SHA224 %include "mb_mgr_hmac_sha_256_flush_avx512.asm" intel-ipsec-mb-0.48/avx512/mb_mgr_hmac_sha_224_submit_avx512.asm000066400000000000000000000031471321406316400241600ustar00rootroot00000000000000;; ;; Copyright (c) 2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %define SHA224 %include "mb_mgr_hmac_sha_256_submit_avx512.asm" intel-ipsec-mb-0.48/avx512/mb_mgr_hmac_sha_256_flush_avx512.asm000066400000000000000000000257601321406316400240100ustar00rootroot00000000000000;; ;; Copyright (c) 2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
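The two small files above show how the SHA-224 manager is produced: each defines SHA224 and includes the corresponding SHA-256 implementation, which then varies only the outer-block digest width (7 instead of 8 words, with a fresh 0x80 marker after word 7) and the number of authentication tag bytes copied out (14 versus 16 in this code). A rough C analogue of that compile-time specialisation follows; the macro and function names are illustrative, not the library's.

#include <stdint.h>
#include <string.h>

/* Illustrative only: mimics the %ifdef SHA224 specialisation used by the
 * shared SHA-256 submit/flush code. */
#ifdef SHA224
#define INNER_DIGEST_WORDS 7   /* SHA-224 digest is 7 x 32-bit words */
#define COPY_TAG_BYTES     14  /* tag bytes written out by this manager */
#else
#define INNER_DIGEST_WORDS 8   /* SHA-256 digest is 8 x 32-bit words */
#define COPY_TAG_BYTES     16
#endif

static void copy_tag(uint8_t *tag_out, const uint32_t digest_be[8])
{
        /* digest words are assumed to be byte-swapped to big endian
         * already, as the assembly does before storing the tag */
        memcpy(tag_out, digest_be, COPY_TAG_BYTES);
}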
;; ;; In System V AMD64 ABI ;; calle saves: RBX, RBP, R12-R15 ;; Windows x64 ABI ;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15 ;; ;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Windows clobbers: RAX RCX RDX R8 R9 R10 R11 ;; Windows preserves: RBX RBP RSI RDI R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Linux clobbers: RAX RCX RDX RSI RDI R8 R9 R10 R11 ;; Linux preserves: RBX RBP R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Clobbers ZMM0-31 %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" ;; %define DO_DBGPRINT %include "dbgprint.asm" extern sha256_x16_avx512 section .data default rel align 16 byteswap: dq 0x0405060700010203, 0x0c0d0e0f08090a0b align 32 len_masks: dq 0x000000000000FFFF, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 dq 0x00000000FFFF0000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 dq 0x0000FFFF00000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 dq 0xFFFF000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 dq 0x0000000000000000, 0x000000000000FFFF, 0x0000000000000000, 0x0000000000000000 dq 0x0000000000000000, 0x00000000FFFF0000, 0x0000000000000000, 0x0000000000000000 dq 0x0000000000000000, 0x0000FFFF00000000, 0x0000000000000000, 0x0000000000000000 dq 0x0000000000000000, 0xFFFF000000000000, 0x0000000000000000, 0x0000000000000000 dq 0x0000000000000000, 0x0000000000000000, 0x000000000000FFFF, 0x0000000000000000 dq 0x0000000000000000, 0x0000000000000000, 0x00000000FFFF0000, 0x0000000000000000 dq 0x0000000000000000, 0x0000000000000000, 0x0000FFFF00000000, 0x0000000000000000 dq 0x0000000000000000, 0x0000000000000000, 0xFFFF000000000000, 0x0000000000000000 dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x000000000000FFFF dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x00000000FFFF0000 dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000FFFF00000000 dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFF000000000000 lane_1: dq 1 lane_2: dq 2 lane_3: dq 3 lane_4: dq 4 lane_5: dq 5 lane_6: dq 6 lane_7: dq 7 lane_8: dq 8 lane_9: dq 9 lane_10: dq 10 lane_11: dq 11 lane_12: dq 12 lane_13: dq 13 lane_14: dq 14 lane_15: dq 15 section .text %ifdef LINUX %define arg1 rdi %define arg2 rsi %define arg3 rdx %else %define arg1 rcx %define arg2 rdx %define arg3 rsi %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbp, r15 %define idx rbp %define unused_lanes r10 %define tmp5 r10 %define lane_data rbx %define tmp2 rbx %define job_rax rax %define tmp1 rax %define size_offset rax %define start_offset rax %define tmp3 arg1 %define extra_blocks arg2 %define p arg2 %define tmp4 arg3 %define tmp r9 %define len_upper r13 %define idx_upper r14 ; we clobber rsi, rbp; called routine also clobbers rax, r9 to r15 struc STACK _gpr_save: resq 8 _rsp_save: resq 1 endstruc %define APPEND(a,b) a %+ b ; JOB* flush_job_hmac_sha_224_avx512(MB_MGR_HMAC_SHA_256_OOO *state) ; JOB* flush_job_hmac_sha_256_avx512(MB_MGR_HMAC_SHA_256_OOO *state) ; arg 1 : state align 32 %ifdef SHA224 MKGLOBAL(flush_job_hmac_sha_224_avx512,function,internal) flush_job_hmac_sha_224_avx512: %else MKGLOBAL(flush_job_hmac_sha_256_avx512,function,internal) flush_job_hmac_sha_256_avx512: %endif mov rax, rsp sub rsp, STACK_size and rsp, -32 mov [rsp + _gpr_save + 8*0], 
rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _gpr_save + 8*2], r12 mov [rsp + _gpr_save + 8*3], r13 mov [rsp + _gpr_save + 8*4], r14 mov [rsp + _gpr_save + 8*5], r15 %ifndef LINUX mov [rsp + _gpr_save + 8*6], rsi mov [rsp + _gpr_save + 8*7], rdi %endif mov [rsp + _rsp_save], rax ; original SP ; if bit (32+3) is set, then all lanes are empty cmp dword [state + _num_lanes_inuse_sha256], 0 jz return_null ; find a lane with a non-null job xor idx, idx %assign I 1 %rep 15 cmp qword [state + _ldata_sha256 + (I * _HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0 cmovne idx, [rel APPEND(lane_,I)] %assign I (I+1) %endrep copy_lane_data: ; copy idx to empty lanes vmovdqa ymm0, [state + _lens_sha256] mov tmp, [state + _args_data_ptr_sha256 + PTR_SZ*idx] %assign I 0 %rep 16 cmp qword [state + _ldata_sha256 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 jne APPEND(skip_,I) mov [state + _args_data_ptr_sha256 + PTR_SZ*I], tmp vpor ymm0, ymm0, [rel len_masks + 32*I] APPEND(skip_,I): %assign I (I+1) %endrep vmovdqa [state + _lens_sha256 ], ymm0 vphminposuw xmm1, xmm0 vpextrw DWORD(len2), xmm1, 0 ; min value vpextrw DWORD(idx), xmm1, 1 ; min index (0...7) vmovdqa xmm2, [state + _lens_sha256 + 8*2] vphminposuw xmm3, xmm2 vpextrw DWORD(len_upper), xmm3, 0 ; min value vpextrw DWORD(idx_upper), xmm3, 1 ; min index (8...F) cmp len2, len_upper jle use_min vmovdqa xmm1, xmm3 mov len2, len_upper mov idx, idx_upper ; idx would be in range 0..7 add idx, 8 ; to reflect that index is in 8..F range use_min: cmp len2, 0 je len_is_0 vpbroadcastw xmm1, xmm1 ; duplicate words across all lanes vpsubw xmm0, xmm0, xmm1 vmovdqa [state + _lens_sha256], xmm0 vpsubw xmm2, xmm2, xmm1 vmovdqa [state + _lens_sha256 + 8*2], xmm2 ; "state" and "args" are the same address, arg1 ; len is arg2 call sha256_x16_avx512 ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata_sha256 + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done], 1 mov DWORD(size_offset), [lane_data + _size_offset] mov qword [lane_data + _extra_block + size_offset], 0 mov word [state + _lens_sha256 + 2*idx], 1 lea tmp, [lane_data + _outer_block] mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp vmovd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1 vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2 vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3 vpshufb xmm0, xmm0, [rel byteswap] vmovd xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE] vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1 vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2 %ifndef SHA224 vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3 %endif vpshufb xmm1, xmm1, [rel byteswap] vmovdqa [lane_data + _outer_block], xmm0 vmovdqa [lane_data + _outer_block + 4*4], xmm1 %ifdef SHA224 mov dword [lane_data + _outer_block + 7*4], 0x80 %endif mov job, [lane_data + _job_in_lane] mov tmp, [job + _auth_key_xor_opad] vmovdqu xmm0, [tmp] vmovdqu xmm1, [tmp + 4*4] vmovd [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0 vpextrd [state + 
_args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1 vpextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2 vpextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3 vmovd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1 vpextrd [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1 vpextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2 vpextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3 jmp copy_lane_data align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset] mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp mov dword [lane_data + _extra_blocks], 0 jmp copy_lane_data return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane] mov qword [lane_data + _job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC mov unused_lanes, [state + _unused_lanes_sha256] shl unused_lanes, 4 or unused_lanes, idx mov [state + _unused_lanes_sha256], unused_lanes sub dword [state + _num_lanes_inuse_sha256], 1 mov p, [job_rax + _auth_tag_output] ; copy SHA224=14bytes and SHA256=16bytes mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE] mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE] mov DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE] bswap DWORD(tmp) bswap DWORD(tmp2) bswap DWORD(tmp4) bswap DWORD(tmp5) mov [p + 0*4], DWORD(tmp) mov [p + 1*4], DWORD(tmp2) mov [p + 2*4], DWORD(tmp4) %ifdef SHA224 mov [p + 3*4], WORD(tmp5) %else mov [p + 3*4], DWORD(tmp5) %endif return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov r12, [rsp + _gpr_save + 8*2] mov r13, [rsp + _gpr_save + 8*3] mov r14, [rsp + _gpr_save + 8*4] mov r15, [rsp + _gpr_save + 8*5] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*6] mov rdi, [rsp + _gpr_save + 8*7] %endif mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/avx512/mb_mgr_hmac_sha_256_submit_avx512.asm000066400000000000000000000270001321406316400241570ustar00rootroot00000000000000;; ;; Copyright (c) 2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ;; In System V AMD64 ABI ;; calle saves: RBX, RBP, R12-R15 ;; Windows x64 ABI ;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15 ;; ;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Windows clobbers: RAX RCX RDX R8 R9 R10 R11 ;; Windows preserves: RBX RBP RSI RDI R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Linux clobbers: RAX RCX RDX RSI RDI R8 R9 R10 R11 ;; Linux preserves: RBX RBP R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Clobbers ZMM0-31 %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" %include "memcpy.asm" ;; %define DO_DBGPRINT %include "dbgprint.asm" extern sha256_x16_avx512 section .data default rel align 16 byteswap: dq 0x0405060700010203, 0x0c0d0e0f08090a0b section .text %ifdef LINUX %define arg1 rdi %define arg2 rsi %define arg3 rcx %define arg4 rdx %else %define arg1 rcx %define arg2 rdx %define arg3 rdi %define arg4 rsi %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbp, r15 %define last_len rbp %define idx rbp %define p r11 %define start_offset r11 %define unused_lanes rbx %define p2 rbx %define tmp4 rbx %define job_rax rax %define len rax %define size_offset arg3 %define tmp2 arg3 %define lane arg4 %define tmp3 arg4 %define extra_blocks r8 %define tmp r9 %define lane_data r10 %define len_upper r13 %define idx_upper r14 ; we clobber rbx, rsi, rdi, rbp; called routine also clobbers r9 to r15 struc STACK _gpr_save: resq 8 _rsp_save: resq 1 endstruc ; JOB* FUNC(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job) ; arg 1 : rcx : state ; arg 2 : rdx : job align 32 %ifdef SHA224 MKGLOBAL(submit_job_hmac_sha_224_avx512,function,internal) submit_job_hmac_sha_224_avx512: %else MKGLOBAL(submit_job_hmac_sha_256_avx512,function,internal) submit_job_hmac_sha_256_avx512: %endif mov rax, rsp sub rsp, STACK_size and rsp, -32 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _gpr_save + 8*2], r12 mov [rsp + _gpr_save + 8*3], r13 mov [rsp + _gpr_save + 8*4], r14 mov [rsp + _gpr_save + 8*5], r15 %ifndef LINUX mov [rsp + _gpr_save + 8*6], rsi mov [rsp + _gpr_save + 8*7], rdi %endif mov [rsp + _rsp_save], rax ; original SP mov unused_lanes, [state + _unused_lanes_sha256] mov lane, unused_lanes and lane, 0xF ;; just a nibble shr unused_lanes, 4 imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata_sha256 + lane_data] mov [state + _unused_lanes_sha256], unused_lanes add dword [state + _num_lanes_inuse_sha256], 1 mov len, [job + _msg_len_to_hash_in_bytes] mov tmp, len shr tmp, 6 ; divide by 64, len in terms of blocks mov [lane_data + _job_in_lane], job mov dword [lane_data + _outer_done], 0 mov [state + _lens_sha256 + 2*lane], WORD(tmp) mov last_len, len and last_len, 63 lea extra_blocks, [last_len + 9 + 63] shr extra_blocks, 6 mov [lane_data + _extra_blocks], 
DWORD(extra_blocks) mov p, [job + _src] add p, [job + _hash_start_src_offset_in_bytes] mov [state + _args_data_ptr_sha256 + PTR_SZ*lane], p cmp len, 64 jb copy_lt64 fast_copy: vmovdqu32 zmm0, [p - 64 + len] vmovdqu32 [lane_data + _extra_block], zmm0 end_fast_copy: mov size_offset, extra_blocks shl size_offset, 6 sub size_offset, last_len add size_offset, 64-8 mov [lane_data + _size_offset], DWORD(size_offset) mov start_offset, 64 sub start_offset, last_len mov [lane_data + _start_offset], DWORD(start_offset) lea tmp, [8*64 + 8*len] bswap tmp mov [lane_data + _extra_block + size_offset], tmp mov tmp, [job + _auth_key_xor_ipad] vmovdqu xmm0, [tmp] vmovdqu xmm1, [tmp + 4*4] vmovd [state + _args_digest_sha256 + 4*lane + 0*SHA256_DIGEST_ROW_SIZE], xmm0 vpextrd [state + _args_digest_sha256 + 4*lane + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1 vpextrd [state + _args_digest_sha256 + 4*lane + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2 vpextrd [state + _args_digest_sha256 + 4*lane + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3 vmovd [state + _args_digest_sha256 + 4*lane + 4*SHA256_DIGEST_ROW_SIZE], xmm1 vpextrd [state + _args_digest_sha256 + 4*lane + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1 vpextrd [state + _args_digest_sha256 + 4*lane + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2 vpextrd [state + _args_digest_sha256 + 4*lane + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3 test len, ~63 jnz ge64_bytes lt64_bytes: mov [state + _lens_sha256 + 2*lane], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr_sha256 + PTR_SZ*lane], tmp mov dword [lane_data + _extra_blocks], 0 ge64_bytes: cmp dword [state + _num_lanes_inuse_sha256], 0x10 ; all 16 lanes used? jne return_null jmp start_loop align 16 start_loop: ; Find min length vmovdqa xmm0, [state + _lens_sha256] vphminposuw xmm1, xmm0 vpextrw DWORD(len2), xmm1, 0 ; min value vpextrw DWORD(idx), xmm1, 1 ; min index (0...7) vmovdqa xmm2, [state + _lens_sha256 + 8*2] vphminposuw xmm3, xmm2 vpextrw DWORD(len_upper), xmm3, 0 ; min value vpextrw DWORD(idx_upper), xmm3, 1 ; min index (8...F) cmp len2, len_upper jle use_min vmovdqa xmm1, xmm3 mov len2, len_upper mov idx, idx_upper ; idx is in range 0..7 add idx, 8 ; to reflect that real index is in 8..F range use_min: cmp len2, 0 je len_is_0 vpbroadcastw xmm1, xmm1 ; duplicate words across all lanes vpsubw xmm0, xmm0, xmm1 vmovdqa [state + _lens_sha256 + 0*2], xmm0 vpsubw xmm2, xmm2, xmm1 vmovdqa [state + _lens_sha256 + 8*2], xmm2 ; "state" and "args" are the same address, arg1 ; len is arg2 call sha256_x16_avx512 ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata_sha256 + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done], 1 mov DWORD(size_offset), [lane_data + _size_offset] mov qword [lane_data + _extra_block + size_offset], 0 mov word [state + _lens_sha256 + 2*idx], 1 lea tmp, [lane_data + _outer_block] mov job, [lane_data + _job_in_lane] mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp vmovd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1 vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2 vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3 vpshufb xmm0, xmm0, [rel byteswap] vmovd xmm1, 
[state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE] vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1 vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2 %ifndef SHA224 vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3 %endif vpshufb xmm1, xmm1, [rel byteswap] vmovdqa [lane_data + _outer_block], xmm0 vmovdqa [lane_data + _outer_block + 4*4], xmm1 %ifdef SHA224 mov dword [lane_data + _outer_block + 7*4], 0x80 %endif mov tmp, [job + _auth_key_xor_opad] vmovdqu xmm0, [tmp] vmovdqu xmm1, [tmp + 4*4] vmovd [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0 vpextrd [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1 vpextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2 vpextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3 vmovd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1 vpextrd [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1 vpextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2 vpextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3 jmp start_loop align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset] mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr_sha256 + 8*idx], tmp mov dword [lane_data + _extra_blocks], 0 jmp start_loop align 16 copy_lt64: ;; less than one message block of data ;; beginning of source block ;; destination extrablock but backwards by len from where 0x80 pre-populated lea p2, [lane_data + _extra_block + 64] sub p2, len memcpy_avx2_64_1 p2, p, len, tmp, tmp2, ymm0, ymm1 mov unused_lanes, [state + _unused_lanes_sha256] jmp end_fast_copy return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane] mov unused_lanes, [state + _unused_lanes_sha256] mov qword [lane_data + _job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC shl unused_lanes, 4 or unused_lanes, idx mov [state + _unused_lanes_sha256], unused_lanes sub dword [state + _num_lanes_inuse_sha256], 1 mov p, [job_rax + _auth_tag_output] ; copy 14 bytes for SHA224 and 16 bytes for SHA256 mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE] mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE] mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE] bswap DWORD(tmp) bswap DWORD(tmp2) bswap DWORD(tmp3) bswap DWORD(tmp4) mov [p + 0*4], DWORD(tmp) mov [p + 1*4], DWORD(tmp2) mov [p + 2*4], DWORD(tmp3) %ifdef SHA224 mov [p + 3*4], WORD(tmp4) %else mov [p + 3*4], DWORD(tmp4) %endif return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov r12, [rsp + _gpr_save + 8*2] mov r13, [rsp + _gpr_save + 8*3] mov r14, [rsp + _gpr_save + 8*4] mov r15, [rsp + _gpr_save + 8*5] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*6] mov rdi, [rsp + _gpr_save + 8*7] %endif mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/avx512/mb_mgr_hmac_sha_384_flush_avx512.asm000066400000000000000000000031471321406316400240050ustar00rootroot00000000000000;; ;; Copyright (c) 2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, 
are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %define SHA384 %include "mb_mgr_hmac_sha_512_flush_avx512.asm" intel-ipsec-mb-0.48/avx512/mb_mgr_hmac_sha_384_submit_avx512.asm000066400000000000000000000031501321406316400241610ustar00rootroot00000000000000;; ;; Copyright (c) 2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
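;;
;; Note (informal): the SHA-384 submit and flush sources are thin wrappers
;; around the SHA-512 implementations. Defining SHA384 before the %include
;; below makes the shared source pick SHA_X_DIGEST_SIZE and the exported
;; symbol name (submit_job_hmac_sha_384_avx512 / flush_job_hmac_sha_384_avx512)
;; for the 384-bit variant; the lane-management code is otherwise identical.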
;; %define SHA384 %include "mb_mgr_hmac_sha_512_submit_avx512.asm" intel-ipsec-mb-0.48/avx512/mb_mgr_hmac_sha_512_flush_avx512.asm000066400000000000000000000211021321406316400237650ustar00rootroot00000000000000;; ;; Copyright (c) 2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ;; In System V AMD64 ABI ;; calle saves: RBX, RBP, R12-R15 ;; Windows x64 ABI ;; calle saves: RBX, RBP, RDI, RSI, R12-R15 ;; ;; Clobbers ZMM0-31 %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" extern sha512_x8_avx512 section .data default rel align 16 dupw: ;ddq 0x01000100010001000100010001000100 dq 0x0100010001000100, 0x0100010001000100 align 16 byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607 dq 0x0001020304050607, 0x08090a0b0c0d0e0f align 16 len_masks: ;ddq 0x0000000000000000000000000000FFFF dq 0x000000000000FFFF, 0x0000000000000000 ;ddq 0x000000000000000000000000FFFF0000 dq 0x00000000FFFF0000, 0x0000000000000000 ;ddq 0x00000000000000000000FFFF00000000 dq 0x0000FFFF00000000, 0x0000000000000000 ;ddq 0x0000000000000000FFFF000000000000 dq 0xFFFF000000000000, 0x0000000000000000 ;ddq 0x000000000000FFFF0000000000000000 dq 0x0000000000000000, 0x000000000000FFFF ;ddq 0x00000000FFFF00000000000000000000 dq 0x0000000000000000, 0x00000000FFFF0000 ;ddq 0x0000FFFF000000000000000000000000 dq 0x0000000000000000, 0x0000FFFF00000000 ;ddq 0xFFFF0000000000000000000000000000 dq 0x0000000000000000, 0xFFFF000000000000 lane_1: dq 1 lane_2: dq 2 lane_3: dq 3 lane_4: dq 4 lane_5: dq 5 lane_6: dq 6 lane_7: dq 7 section .text %ifdef LINUX %define arg1 rdi %define arg2 rsi %else %define arg1 rcx %define arg2 rdx %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbp %define idx rbp %define unused_lanes rbx %define lane_data rbx %define tmp2 rbx %define job_rax rax %define size_offset rax %define tmp rax %define start_offset rax %define extra_blocks arg2 %define p arg2 %define tmp4 r8 %define tmp5 r9 %define tmp6 r10 struc STACK _gpr_save: resq 7 ; rbx, rbp, r12-r15, rdi (windows) _rsp_save: resq 1 endstruc %define 
APPEND(a,b) a %+ b %ifndef SHA384 ; JOB* flush_job_hmac_sha_384_avx512(MB_MGR_HMAC_SHA_512_OOO *state) ; arg 1 : state %define SHA_X_DIGEST_SIZE 512 MKGLOBAL(flush_job_hmac_sha_512_avx512,function,internal) align 64 flush_job_hmac_sha_512_avx512: %else ; JOB* flush_job_hmac_sha_512_avx512(MB_MGR_HMAC_SHA_512_OOO *state) ; arg 1 : state %define SHA_X_DIGEST_SIZE 384 MKGLOBAL(flush_job_hmac_sha_384_avx512,function,internal) align 64 flush_job_hmac_sha_384_avx512: %endif mov rax, rsp sub rsp, STACK_size and rsp, -32 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _gpr_save + 8*2], r12 mov [rsp + _gpr_save + 8*3], r13 mov [rsp + _gpr_save + 8*4], r14 mov [rsp + _gpr_save + 8*5], r15 %ifndef LINUX mov [rsp + _gpr_save + 8*6], rdi %endif mov [rsp + _rsp_save], rax ; original SP mov unused_lanes, [state + _unused_lanes_sha512] bt unused_lanes, 32+3 jc return_null ; find a lane with a non-null job xor idx, idx %assign I 1 %rep 7 cmp qword [state + _ldata_sha512 + I * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0 cmovne idx, [rel APPEND(lane_, I)] %assign I (I+1) %endrep copy_lane_data: ; copy good lane (idx) to empty lanes vmovdqa xmm0, [state + _lens_sha512] mov tmp, [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*idx] %assign I 0 %rep 8 cmp qword [state + _ldata_sha512 + I * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0 jne APPEND(skip_,I) mov [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*I], tmp vpor xmm0, xmm0, [rel len_masks + 16*I] APPEND(skip_,I): %assign I (I+1) %endrep vmovdqa [state + _lens_sha512], xmm0 vphminposuw xmm1, xmm0 vpextrw DWORD(len2), xmm1, 0 ; min value vpextrw DWORD(idx), xmm1, 1 ; min index (0...7) cmp len2, 0 je len_is_0 vpshufb xmm1, [rel dupw] ; duplicate words across all 8 lanes vpsubw xmm0, xmm0, xmm1 vmovdqa [state + _lens_sha512], xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call sha512_x8_avx512 ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _SHA512_LANE_DATA_size lea lane_data, [state + _ldata_sha512 + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done_sha512], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done_sha512], 1 mov DWORD(size_offset), [lane_data + _size_offset_sha512] mov qword [lane_data + _extra_block_sha512 + size_offset], 0 mov word [state + _lens_sha512 + 2*idx], 1 lea tmp, [lane_data + _outer_block_sha512] mov job, [lane_data + _job_in_lane_sha512] mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp ; move digest into data location %assign I 0 %rep (SHA_X_DIGEST_SIZE / (8*16)) vmovq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE] vpinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], 1 vpshufb xmm0, [rel byteswap] vmovdqa [lane_data + _outer_block_sha512 + I*2*SHA512_DIGEST_WORD_SIZE], xmm0 %assign I (I+1) %endrep ; move the opad key into digest mov tmp, [job + _auth_key_xor_opad] %assign I 0 %rep 4 vmovdqu xmm0, [tmp + I * 16] vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 0)*SHA512_DIGEST_ROW_SIZE], xmm0 vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1 %assign I (I+1) %endrep jmp copy_lane_data align 32 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset_sha512] mov [state + _lens_sha512 + 2*idx], WORD(extra_blocks) lea tmp, 
[lane_data + _extra_block_sha512 + start_offset] mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp mov dword [lane_data + _extra_blocks_sha512], 0 jmp copy_lane_data return_null: xor job_rax, job_rax jmp return align 32 end_loop: mov job_rax, [lane_data + _job_in_lane_sha512] mov qword [lane_data + _job_in_lane_sha512], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC mov unused_lanes, [state + _unused_lanes_sha512] shl unused_lanes, 4 or unused_lanes, idx mov [state + _unused_lanes_sha512], unused_lanes mov p, [job_rax + _auth_tag_output] ; below is the code for both SHA512 & SHA384. SHA512=32 bytes and SHA384=24 bytes mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE] mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE] mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE] %if (SHA_X_DIGEST_SIZE != 384) mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE] %endif bswap QWORD(tmp2) bswap QWORD(tmp4) bswap QWORD(tmp6) %if (SHA_X_DIGEST_SIZE != 384) bswap QWORD(tmp5) %endif mov [p + 0*8], QWORD(tmp2) mov [p + 1*8], QWORD(tmp4) mov [p + 2*8], QWORD(tmp6) %if (SHA_X_DIGEST_SIZE != 384) mov [p + 3*8], QWORD(tmp5) %endif return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov r12, [rsp + _gpr_save + 8*2] mov r13, [rsp + _gpr_save + 8*3] mov r14, [rsp + _gpr_save + 8*4] mov r15, [rsp + _gpr_save + 8*5] %ifndef LINUX mov rdi, [rsp + _gpr_save + 8*6] %endif mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/avx512/mb_mgr_hmac_sha_512_submit_avx512.asm000066400000000000000000000233311321406316400241550ustar00rootroot00000000000000;; ;; Copyright (c) 2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
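;;
;; Informal overview of the submit path below (a sketch, not a spec):
;; HMAC(K, m) = H((K ^ opad) || H((K ^ ipad) || m)). The job carries the
;; pre-computed ipad/opad midstates (_auth_key_xor_ipad / _auth_key_xor_opad),
;; so submit only has to:
;;   - pop a free lane from the nibble-packed _unused_lanes_sha512 list,
;;   - record the length in 128-byte blocks in the lane's 16-bit _lens_sha512 slot,
;;   - copy the message tail into _extra_block_sha512 (the 0x80 pad byte is
;;     pre-populated) and append the big-endian bit count 8*128 + 8*len,
;;   - seed the lane digest from _auth_key_xor_ipad.
;; Worked example (len = 200 bytes): blocks = 200 >> 7 = 1, last_len = 72,
;; extra_blocks = (72 + 17 + 127) >> 7 = 1, start_offset = 128 - 72 = 56,
;; size_offset = 128 - 72 + 120 = 176, so the padded tail occupies
;; _extra_block_sha512[56..183] and ends with the 2624-bit length field.
;; Once all 8 lanes are busy, vphminposuw selects the shortest lane, every
;; lane is advanced by that many blocks via sha512_x8_avx512, and the
;; finished lane is re-submitted for the outer hash seeded from
;; _auth_key_xor_opad before its tag is written out.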
;; ;; In System V AMD64 ABI ;; calle saves: RBX, RBP, R12-R15 ;; Windows x64 ABI ;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15 ;; ;; Clobbers ZMM0-31 %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" %include "memcpy.asm" extern sha512_x8_avx512 section .data default rel align 16 dupw: ;ddq 0x01000100010001000100010001000100 dq 0x0100010001000100, 0x0100010001000100 align 16 byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607 dq 0x0001020304050607, 0x08090a0b0c0d0e0f section .text %ifdef LINUX %define arg1 rdi %define arg2 rsi %define arg3 rcx %define arg4 rdx %else %define arg1 rcx %define arg2 rdx %define arg3 rdi %define arg4 rsi %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbp, r13, r14, r16 %define last_len rbp %define idx rbp %define p r11 %define start_offset r11 %define unused_lanes rbx %define tmp4 rbx %define job_rax rax %define len rax %define size_offset arg3 %define tmp2 arg3 %define lane arg4 %define tmp3 arg4 %define extra_blocks r8 %define tmp r9 %define p2 r9 %define lane_data r10 ; Define stack usage ; we clobber rbx, rsi, rdi, rbp; called routine also clobbers r12 struc STACK _gpr_save: resq 8 _rsp_save: resq 1 endstruc ; JOB* FUNC(MB_MGR_HMAC_sha_512_OOO *state, JOB_AES_HMAC *job) ; arg 1 : rcx : state ; arg 2 : rdx : job align 64 %ifndef SHA384 MKGLOBAL(submit_job_hmac_sha_512_avx512,function,internal) %define SHA_X_DIGEST_SIZE 512 submit_job_hmac_sha_512_avx512: %else MKGLOBAL(submit_job_hmac_sha_384_avx512,function,internal) %define SHA_X_DIGEST_SIZE 384 submit_job_hmac_sha_384_avx512: %endif mov rax, rsp sub rsp, STACK_size and rsp, -32 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _gpr_save + 8*2], r12 mov [rsp + _gpr_save + 8*3], r13 mov [rsp + _gpr_save + 8*4], r14 mov [rsp + _gpr_save + 8*5], r15 %ifndef LINUX mov [rsp + _gpr_save + 8*6], rsi mov [rsp + _gpr_save + 8*7], rdi %endif mov [rsp + _rsp_save], rax ; original SP mov unused_lanes, [state + _unused_lanes_sha512] mov lane, unused_lanes and lane, 15 shr unused_lanes, 4 imul lane_data, lane, _SHA512_LANE_DATA_size lea lane_data, [state + _ldata_sha512 + lane_data] mov [state + _unused_lanes_sha512], unused_lanes mov len, [job + _msg_len_to_hash_in_bytes] mov tmp, len shr tmp, 7 ; divide by 128, len in terms of blocks mov [lane_data + _job_in_lane_sha512], job mov dword [lane_data + _outer_done_sha512], 0 mov [state + _lens_sha512 + 2*lane], WORD(tmp) ; 2 is word size in bytes mov last_len, len and last_len, 127 lea extra_blocks, [last_len + 17 + 127] shr extra_blocks, 7 mov [lane_data + _extra_blocks_sha512], DWORD(extra_blocks) mov p, [job + _src] add p, [job + _hash_start_src_offset_in_bytes] mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], p cmp len, 128 jb copy_lt128 fast_copy: add p, len vmovdqu32 zmm0, [p - 128 + 0*64] vmovdqu32 zmm1, [p - 128 + 1*64] vmovdqu32 [lane_data + _extra_block_sha512 + 0*64], zmm0 vmovdqu32 [lane_data + _extra_block_sha512 + 1*64], zmm1 end_fast_copy: mov size_offset, extra_blocks shl size_offset, 7 sub size_offset, last_len add size_offset, 128-8 mov [lane_data + _size_offset_sha512], DWORD(size_offset) mov start_offset, 128 sub start_offset, last_len mov [lane_data + _start_offset_sha512], DWORD(start_offset) lea tmp, [8*128 + 8*len] bswap tmp mov [lane_data + _extra_block_sha512 + size_offset], tmp mov tmp, [job + _auth_key_xor_ipad] %assign I 0 %rep 4 vmovdqu xmm0, [tmp + I * 2 * SHA512_DIGEST_WORD_SIZE] vmovq [state + _args_digest_sha512 + 
SHA512_DIGEST_WORD_SIZE*lane + (2*I + 0)*SHA512_DIGEST_ROW_SIZE], xmm0 vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1 %assign I (I+1) %endrep test len, ~127 jnz ge128_bytes lt128_bytes: mov [state + _lens_sha512 + 2*lane], WORD(extra_blocks) lea tmp, [lane_data + _extra_block_sha512 + start_offset] mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], tmp ;; 8 to hold a UINT8 mov dword [lane_data + _extra_blocks_sha512], 0 ge128_bytes: cmp unused_lanes, 0xf jne return_null jmp start_loop align 32 start_loop: ; Find min length vmovdqa xmm0, [state + _lens_sha512] vphminposuw xmm1, xmm0 vpextrw DWORD(len2), xmm1, 0 ; min value vpextrw DWORD(idx), xmm1, 1 ; min index (0...7) cmp len2, 0 je len_is_0 vpshufb xmm1, [rel dupw] ; duplicate words across all 8 lanes vpsubw xmm0, xmm0, xmm1 vmovdqa [state + _lens_sha512], xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call sha512_x8_avx512 ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _SHA512_LANE_DATA_size lea lane_data, [state + _ldata_sha512 + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done_sha512], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done_sha512], 1 mov DWORD(size_offset), [lane_data + _size_offset_sha512] mov qword [lane_data + _extra_block_sha512 + size_offset], 0 mov word [state + _lens_sha512 + 2*idx], 1 lea tmp, [lane_data + _outer_block_sha512] mov job, [lane_data + _job_in_lane_sha512] mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp %assign I 0 %rep (SHA_X_DIGEST_SIZE / (8 * 16)) vmovq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 0)*SHA512_DIGEST_ROW_SIZE] vpinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], 1 vpshufb xmm0, [rel byteswap] vmovdqa [lane_data + _outer_block_sha512 + I * 2 * SHA512_DIGEST_WORD_SIZE], xmm0 %assign I (I+1) %endrep mov tmp, [job + _auth_key_xor_opad] %assign I 0 %rep 4 vmovdqu xmm0, [tmp + I * 16] vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I+0)*SHA512_DIGEST_ROW_SIZE], xmm0 vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1 %assign I (I+1) %endrep jmp start_loop align 32 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset_sha512] mov [state + _lens_sha512 + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block_sha512 + start_offset] mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp ;; idx is index of shortest length message mov dword [lane_data + _extra_blocks_sha512], 0 jmp start_loop align 32 copy_lt128: ;; less than one message block of data ;; destination extra block but backwards by len from where 0x80 pre-populated lea p2, [lane_data + _extra_block + 128] sub p2, len memcpy_avx2_128_1 p2, p, len, tmp4, tmp2, ymm0, ymm1, ymm2, ymm3 mov unused_lanes, [state + _unused_lanes_sha512] jmp end_fast_copy return_null: xor job_rax, job_rax jmp return align 32 end_loop: mov job_rax, [lane_data + _job_in_lane_sha512] mov unused_lanes, [state + _unused_lanes_sha512] mov qword [lane_data + _job_in_lane_sha512], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC shl unused_lanes, 4 or unused_lanes, idx mov [state + _unused_lanes_sha512], unused_lanes mov p, [job_rax + _auth_tag_output] ; below is the code for both SHA512 & SHA384. 
SHA512=32 bytes and SHA384=24 bytes mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE] mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE] mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE] %if (SHA_X_DIGEST_SIZE != 384) mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE] %endif bswap QWORD(tmp) bswap QWORD(tmp2) bswap QWORD(tmp3) %if (SHA_X_DIGEST_SIZE != 384) bswap QWORD(tmp4) %endif mov [p + 0*8], QWORD(tmp) mov [p + 1*8], QWORD(tmp2) mov [p + 2*8], QWORD(tmp3) %if (SHA_X_DIGEST_SIZE != 384) mov [p + 3*8], QWORD(tmp4) %endif return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov r12, [rsp + _gpr_save + 8*2] mov r13, [rsp + _gpr_save + 8*3] mov r14, [rsp + _gpr_save + 8*4] mov r15, [rsp + _gpr_save + 8*5] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*6] mov rdi, [rsp + _gpr_save + 8*7] %endif mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/avx512/mb_mgr_hmac_submit_avx512.asm000066400000000000000000000304531321406316400227360ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
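;;
;; Rough sketch of the 16-lane scheduling used below (assumes the OOO
;; manager layout from mb_mgr_datastruct.asm):
;;   - _lens holds sixteen 16-bit lengths, in SHA-1 blocks, one per lane;
;;   - vphminposuw finds the minimum of each 8-lane half, e.g.
;;         vphminposuw xmm1, xmm0     ; min value/index of lanes 0..7
;;         vphminposuw xmm3, xmm2     ; min value/index of lanes 8..15
;;     the two halves are compared, and the overall minimum is broadcast
;;     and subtracted from every lane;
;;   - sha1_x16_avx512 then runs that many blocks across all 16 lanes, so
;;     the winning lane reaches length 0 and can be completed (extra
;;     blocks, then the outer HMAC pass, then the 12-byte tag copy).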
;; ;; In System V AMD64 ABI ;; calle saves: RBX, RBP, R12-R15 ;; Windows x64 ABI ;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15 ;; ;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Windows clobbers: RAX RCX RDX R8 R9 R10 R11 ;; Windows preserves: RBX RBP RSI RDI R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Linux clobbers: RAX RCX RDX RSI RDI R8 R9 R10 R11 ;; Linux preserves: RBX RBP R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Clobbers ZMM0-31 %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" %include "memcpy.asm" ;; %define DO_DBGPRINT %include "dbgprint.asm" extern sha1_x16_avx512 section .data default rel align 16 byteswap: dq 0x0405060700010203 dq 0x0c0d0e0f08090a0b section .text %if 1 %ifdef LINUX %define arg1 rdi %define arg2 rsi %define reg3 rcx %define reg4 rdx %else %define arg1 rcx %define arg2 rdx %define reg3 rdi %define reg4 rsi %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbx, rdi, rbp %define last_len rbp %define idx rbp %define p r11 %define start_offset r11 %define unused_lanes r12 %define tmp4 r12 %define job_rax rax %define len rax %define size_offset reg3 %define tmp2 reg3 %define lane reg4 %define tmp3 reg4 %define extra_blocks r8 %define tmp r9 %define p2 r9 %define lane_data r10 %define num_lanes_inuse r12 %define len_upper r13 %define idx_upper r14 %endif ; we clobber rsi, rdi, rbp, r12; called routine clobbers also r9-r15 struc STACK _gpr_save: resq 7 _rsp_save: resq 1 endstruc ; JOB* submit_job_hmac_avx(MB_MGR_HMAC_SHA_1_OOO *state, JOB_AES_HMAC *job) ; arg 1 : rcx : state ; arg 2 : rdx : job MKGLOBAL(submit_job_hmac_avx512,function,internal) submit_job_hmac_avx512: mov rax, rsp sub rsp, STACK_size and rsp, -32 ; align to 32 byte boundary mov [rsp + _gpr_save + 8*0], rbp mov [rsp + _gpr_save + 8*1], r12 mov [rsp + _gpr_save + 8*2], r13 mov [rsp + _gpr_save + 8*3], r14 mov [rsp + _gpr_save + 8*4], r15 %ifndef LINUX mov [rsp + _gpr_save + 8*5], rsi mov [rsp + _gpr_save + 8*6], rdi %endif mov [rsp + _rsp_save], rax DBGPRINTL "---------- enter sha1 submit -----------" mov unused_lanes, [state + _unused_lanes] mov lane, unused_lanes and lane, 0xF ;; just a nibble shr unused_lanes, 4 imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata + lane_data] mov [state + _unused_lanes], unused_lanes DBGPRINTL64 "lane", lane DBGPRINTL64 "unused_lanes", unused_lanes add dword [state + _num_lanes_inuse_sha1], 1 mov len, [job + _msg_len_to_hash_in_bytes] mov tmp, len shr tmp, 6 ; divide by 64, len in terms of blocks mov [lane_data + _job_in_lane], job mov dword [lane_data + _outer_done], 0 mov [state + _lens + 2*lane], WORD(tmp) mov last_len, len DBGPRINTL64 "last_len", last_len and last_len, 63 lea extra_blocks, [last_len + 9 + 63] shr extra_blocks, 6 DBGPRINTL64 "extra_blocks", extra_blocks mov [lane_data + _extra_blocks], DWORD(extra_blocks) mov p, [job + _src] add p, [job + _hash_start_src_offset_in_bytes] mov [state + _args_data_ptr + PTR_SZ*lane], p cmp len, 64 jb copy_lt64 fast_copy: vmovdqu32 zmm0, [p - 64 + len] vmovdqu32 [lane_data + _extra_block], zmm0 end_fast_copy: mov size_offset, extra_blocks shl size_offset, 6 sub size_offset, last_len add size_offset, 64-8 mov [lane_data + _size_offset], DWORD(size_offset) mov start_offset, 64 sub start_offset, last_len mov [lane_data + _start_offset], 
DWORD(start_offset) lea tmp, [8*64 + 8*len] bswap tmp mov [lane_data + _extra_block + size_offset], tmp mov tmp, [job + _auth_key_xor_ipad] vmovdqu xmm0, [tmp] mov DWORD(tmp), [tmp + 4*4] vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 0*SHA1_DIGEST_ROW_SIZE], xmm0 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3 mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp) test len, ~63 jnz ge64_bytes lt64_bytes: DBGPRINTL64 "lt64_bytes extra_blocks", extra_blocks DBGPRINTL64 "lt64_bytes start_offset", start_offset mov [state + _lens + 2*lane], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr + PTR_SZ*lane], tmp mov dword [lane_data + _extra_blocks], 0 ge64_bytes: mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse_sha1] cmp num_lanes_inuse, 0x10 ; all 16 lanes used? jne return_null jmp start_loop align 16 start_loop: ; Find min length vmovdqa xmm0, [state + _lens] vphminposuw xmm1, xmm0 vpextrw DWORD(len2), xmm1, 0 ; min value vpextrw DWORD(idx), xmm1, 1 ; min index (0...7) vmovdqa xmm2, [state + _lens + 8*2] vphminposuw xmm3, xmm2 vpextrw DWORD(len_upper), xmm3, 0 ; min value vpextrw DWORD(idx_upper), xmm3, 1 ; min index (8...F) cmp len2, len_upper jle use_min vmovdqa xmm1, xmm3 mov len2, len_upper mov idx, idx_upper ; idx would be in range 0..7 add idx, 8 ; to reflect that index is in 8..F range use_min: cmp len2, 0 je len_is_0 DBGPRINTL64 "min_length", len2 DBGPRINTL64 "min_length index ", idx vpbroadcastw xmm1, xmm1 DBGPRINTL_XMM "SUBMIT lens after shuffle", xmm1 vpsubw xmm0, xmm0, xmm1 vmovdqa [state + _lens + 0*2], xmm0 vpsubw xmm2, xmm2, xmm1 vmovdqa [state + _lens + 8*2], xmm2 DBGPRINTL_XMM "lengths after subtraction (0..7)", xmm0 DBGPRINTL_XMM "lengths after subtraction (8..F)", xmm2 ; "state" and "args" are the same address, arg1 ; len is arg2 call sha1_x16_avx512 ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done], 1 mov DWORD(size_offset), [lane_data + _size_offset] mov qword [lane_data + _extra_block + size_offset], 0 mov word [state + _lens + 2*idx], 1 lea tmp, [lane_data + _outer_block] mov job, [lane_data + _job_in_lane] mov [state + _args_data_ptr + PTR_SZ*idx], tmp vmovd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1 vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2 vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3 vpshufb xmm0, xmm0, [rel byteswap] mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE] bswap DWORD(tmp) vmovdqa [lane_data + _outer_block], xmm0 mov [lane_data + _outer_block + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) mov tmp, [job + _auth_key_xor_opad] vmovdqu xmm0, [tmp] mov DWORD(tmp), [tmp + 4*4] vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], 
xmm0 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3 mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp) jmp start_loop align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset] mov [state + _lens + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr + PTR_SZ*idx], tmp mov dword [lane_data + _extra_blocks], 0 jmp start_loop align 16 copy_lt64: ;; less than one message block of data ;; beginning of source block ;; destination extrablock but backwards by len from where 0x80 pre-populated lea p2, [lane_data + _extra_block + 64] sub p2, len memcpy_avx2_64_1 p2, p, len, tmp4, tmp2, ymm0, ymm1 mov unused_lanes, [state + _unused_lanes] jmp end_fast_copy return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane] or dword [job_rax + _status], STS_COMPLETED_HMAC mov qword [lane_data + _job_in_lane], 0 mov unused_lanes, [state + _unused_lanes] shl unused_lanes, 4 or unused_lanes, idx mov [state + _unused_lanes], unused_lanes sub dword [state + _num_lanes_inuse_sha1], 1 mov p, [job_rax + _auth_tag_output] ; copy 12 bytes mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE] mov DWORD(tmp3), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE] bswap DWORD(tmp) bswap DWORD(tmp2) bswap DWORD(tmp3) mov [p + 0*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) mov [p + 1*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2) mov [p + 2*SHA1_DIGEST_WORD_SIZE], DWORD(tmp3) DBGPRINTL "---------- exit sha1 submit -----------" return: mov rbp, [rsp + _gpr_save + 8*0] mov r12, [rsp + _gpr_save + 8*1] mov r13, [rsp + _gpr_save + 8*2] mov r14, [rsp + _gpr_save + 8*3] mov r15, [rsp + _gpr_save + 8*4] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*5] mov rdi, [rsp + _gpr_save + 8*6] %endif mov rsp, [rsp + _rsp_save] ret intel-ipsec-mb-0.48/avx512/sha1_x16_avx512.asm000066400000000000000000000552271321406316400204600ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ;; Stack must be aligned to 32 bytes before call ;; ;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Windows clobbers: RAX RDX R8 R9 R10 R11 R12 R13 R14 R15 ;; Windows preserves: RBX RCX RBP RSI RDI ;; ----------------------------------------------------------- ;; Linux clobbers: RAX RDX RSI R9 R10 R11 R12 R13 R14 R15 ;; Linux preserves: RBX RCX RBP RDI R8 ;; ----------------------------------------------------------- ;; Clobbers ZMM0-31 %include "os.asm" ;%define DO_DBGPRINT %include "dbgprint.asm" %include "mb_mgr_datastruct.asm" section .data default rel align 64 K00_19: ;ddq 0x5A8279995A8279995A8279995A827999 ;ddq 0x5A8279995A8279995A8279995A827999 ;ddq 0x5A8279995A8279995A8279995A827999 ;ddq 0x5A8279995A8279995A8279995A827999 dq 0x5A8279995A827999, 0x5A8279995A827999 dq 0x5A8279995A827999, 0x5A8279995A827999 dq 0x5A8279995A827999, 0x5A8279995A827999 dq 0x5A8279995A827999, 0x5A8279995A827999 K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 ;ddq 0x0c0d0e0f08090a0b0405060700010203 ;ddq 0x0c0d0e0f08090a0b0405060700010203 ;ddq 0x0c0d0e0f08090a0b0405060700010203 dq 0x0405060700010203, 0x0c0d0e0f08090a0b dq 0x0405060700010203, 0x0c0d0e0f08090a0b dq 0x0405060700010203, 0x0c0d0e0f08090a0b dq 0x0405060700010203, 0x0c0d0e0f08090a0b PSHUFFLE_TRANSPOSE16_MASK1: dq 0x0000000000000000 dq 0x0000000000000001 dq 0x0000000000000008 dq 0x0000000000000009 dq 0x0000000000000004 dq 0x0000000000000005 dq 0x000000000000000C dq 0x000000000000000D PSHUFFLE_TRANSPOSE16_MASK2: dq 0x0000000000000002 dq 0x0000000000000003 dq 0x000000000000000A dq 0x000000000000000B dq 0x0000000000000006 dq 0x0000000000000007 dq 0x000000000000000E dq 0x000000000000000F section .text %define PTR_SIZE 8 %macro TRANSPOSE16 18 %define %%r0 %1 %define %%r1 %2 %define %%r2 %3 %define %%r3 %4 %define %%r4 %5 %define %%r5 %6 %define %%r6 %7 %define %%r7 %8 %define %%r8 %9 %define %%r9 %10 %define %%r10 %11 %define %%r11 %12 
%define %%r12 %13 %define %%r13 %14 %define %%r14 %15 %define %%r15 %16 %define %%t0 %17 %define %%t1 %18 ; r0 = {a15 a14 a13 a12 a11 a10 a9 a8 a7 a6 a5 a4 a3 a2 a1 a0} ; r1 = {b15 b14 b13 b12 b11 b10 b9 b8 b7 b6 b5 b4 b3 b2 b1 b0} ; r2 = {c15 c14 c13 c12 c11 c10 c9 c8 c7 c6 c5 c4 c3 c2 c1 c0} ; r3 = {d15 d14 d13 d12 d11 d10 d9 d8 d7 d6 d5 d4 d3 d2 d1 d0} ; r4 = {e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 e2 e1 e0} ; r5 = {f15 f14 f13 f12 f11 f10 f9 f8 f7 f6 f5 f4 f3 f2 f1 f0} ; r6 = {g15 g14 g13 g12 g11 g10 g9 g8 g7 g6 g5 g4 g3 g2 g1 g0} ; r7 = {h15 h14 h13 h12 h11 h10 h9 h8 h7 h6 h5 h4 h3 h2 h1 h0} ; r8 = {i15 i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0} ; r9 = {j15 j14 j13 j12 j11 j10 j9 j8 j7 j6 j5 j4 j3 j2 j1 j0} ; r10 = {k15 k14 k13 k12 k11 k10 k9 k8 k7 k6 k5 k4 k3 k2 k1 k0} ; r11 = {l15 l14 l13 l12 l11 l10 l9 l8 l7 l6 l5 l4 l3 l2 l1 l0} ; r12 = {m15 m14 m13 m12 m11 m10 m9 m8 m7 m6 m5 m4 m3 m2 m1 m0} ; r13 = {n15 n14 n13 n12 n11 n10 n9 n8 n7 n6 n5 n4 n3 n2 n1 n0} ; r14 = {o15 o14 o13 o12 o11 o10 o9 o8 o7 o6 o5 o4 o3 o2 o1 o0} ; r15 = {p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0} ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0} ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1} ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3} ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4} ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5} ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7} ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8} ; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9} ; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10} ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11} ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12} ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13} ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14} ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15} ; process top half (r0..r3) {a...d} vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0} vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2} vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0} vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2} vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1} vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2} vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3} vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0} ; use r2 in place of t0 vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0} vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2} vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0} vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2} vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1} vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2} vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h15 
g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3} vshufps %%r2, %%r2, %%t1, 0x88 ; r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0} ; use r6 in place of t0 vshufps %%r6, %%r8, %%r9, 0x44 ; r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0} vshufps %%r8, %%r8, %%r9, 0xEE ; r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2} vshufps %%t1, %%r10, %%r11, 0x44 ; t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0} vshufps %%r10, %%r10, %%r11, 0xEE ; r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2} vshufps %%r11, %%r6, %%t1, 0xDD ; r11 = {l13 k13 j13 113 l9 k9 j9 i9 l5 k5 j5 i5 l1 k1 j1 i1} vshufps %%r9, %%r8, %%r10, 0x88 ; r9 = {l14 k14 j14 114 l10 k10 j10 i10 l6 k6 j6 i6 l2 k2 j2 i2} vshufps %%r8, %%r8, %%r10, 0xDD ; r8 = {l15 k15 j15 115 l11 k11 j11 i11 l7 k7 j7 i7 l3 k3 j3 i3} vshufps %%r6, %%r6, %%t1, 0x88 ; r6 = {l12 k12 j12 112 l8 k8 j8 i8 l4 k4 j4 i4 l0 k0 j0 i0} ; use r10 in place of t0 vshufps %%r10, %%r12, %%r13, 0x44 ; r10 = {n13 n12 m13 m12 n9 n8 m9 m8 n5 n4 m5 m4 n1 n0 a1 m0} vshufps %%r12, %%r12, %%r13, 0xEE ; r12 = {n15 n14 m15 m14 n11 n10 m11 m10 n7 n6 m7 m6 n3 n2 a3 m2} vshufps %%t1, %%r14, %%r15, 0x44 ; t1 = {p13 p12 013 012 p9 p8 09 08 p5 p4 05 04 p1 p0 01 00} vshufps %%r14, %%r14, %%r15, 0xEE ; r14 = {p15 p14 015 014 p11 p10 011 010 p7 p6 07 06 p3 p2 03 02} vshufps %%r15, %%r10, %%t1, 0xDD ; r15 = {p13 013 n13 m13 p9 09 n9 m9 p5 05 n5 m5 p1 01 n1 m1} vshufps %%r13, %%r12, %%r14, 0x88 ; r13 = {p14 014 n14 m14 p10 010 n10 m10 p6 06 n6 m6 p2 02 n2 m2} vshufps %%r12, %%r12, %%r14, 0xDD ; r12 = {p15 015 n15 m15 p11 011 n11 m11 p7 07 n7 m7 p3 03 n3 m3} vshufps %%r10, %%r10, %%t1, 0x88 ; r10 = {p12 012 n12 m12 p8 08 n8 m8 p4 04 n4 m4 p0 00 n0 m0} ;; At this point, the registers that contain interesting data are: ;; t0, r3, r1, r0, r2, r7, r5, r4, r6, r11, r9, r8, r10, r15, r13, r12 ;; Can use t1 and r14 as scratch registers vmovdqa32 %%r14, [PSHUFFLE_TRANSPOSE16_MASK1] vpermi2q %%r14, %%t0, %%r2 ; r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0} vmovdqa32 %%t1, [PSHUFFLE_TRANSPOSE16_MASK2] vpermi2q %%t1, %%t0, %%r2 ; t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4} vmovdqa32 %%r2, [PSHUFFLE_TRANSPOSE16_MASK1] vpermi2q %%r2, %%r3, %%r7 ; r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1} vmovdqa32 %%t0, [PSHUFFLE_TRANSPOSE16_MASK2] vpermi2q %%t0, %%r3, %%r7 ; t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5} vmovdqa32 %%r3, [PSHUFFLE_TRANSPOSE16_MASK1] vpermi2q %%r3, %%r1, %%r5 ; r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2} vmovdqa32 %%r7, [PSHUFFLE_TRANSPOSE16_MASK2] vpermi2q %%r7, %%r1, %%r5 ; r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6} vmovdqa32 %%r1, [PSHUFFLE_TRANSPOSE16_MASK1] vpermi2q %%r1, %%r0, %%r4 ; r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3} vmovdqa32 %%r5, [PSHUFFLE_TRANSPOSE16_MASK2] vpermi2q %%r5, %%r0, %%r4 ; r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7} vmovdqa32 %%r0, [PSHUFFLE_TRANSPOSE16_MASK1] vpermi2q %%r0, %%r6, %%r10 ; r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0} vmovdqa32 %%r4, [PSHUFFLE_TRANSPOSE16_MASK2] vpermi2q %%r4, %%r6, %%r10 ; r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4} vmovdqa32 %%r6, [PSHUFFLE_TRANSPOSE16_MASK1] vpermi2q %%r6, %%r11, %%r15 ; r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1} vmovdqa32 %%r10, [PSHUFFLE_TRANSPOSE16_MASK2] vpermi2q %%r10, %%r11, %%r15 ; r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5} vmovdqa32 %%r11, 
[PSHUFFLE_TRANSPOSE16_MASK1] vpermi2q %%r11, %%r9, %%r13 ; r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2} vmovdqa32 %%r15, [PSHUFFLE_TRANSPOSE16_MASK2] vpermi2q %%r15, %%r9, %%r13 ; r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6} vmovdqa32 %%r9, [PSHUFFLE_TRANSPOSE16_MASK1] vpermi2q %%r9, %%r8, %%r12 ; r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 n3 m3 l3 k3 j3 i3} vmovdqa32 %%r13, [PSHUFFLE_TRANSPOSE16_MASK2] vpermi2q %%r13, %%r8, %%r12 ; r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7} ;; At this point r8 and r12 can be used as scratch registers vshuff64x2 %%r8, %%r14, %%r0, 0xEE ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8} vshuff64x2 %%r0, %%r14, %%r0, 0x44 ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0} vshuff64x2 %%r12, %%t1, %%r4, 0xEE ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12} vshuff64x2 %%r4, %%t1, %%r4, 0x44 ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4} vshuff64x2 %%r14, %%r7, %%r15, 0xEE ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14} vshuff64x2 %%t1, %%r7, %%r15, 0x44 ; t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} vshuff64x2 %%r15, %%r5, %%r13, 0xEE ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15} vshuff64x2 %%r7, %%r5, %%r13, 0x44 ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7} vshuff64x2 %%r13, %%t0, %%r10, 0xEE ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13} vshuff64x2 %%r5, %%t0, %%r10, 0x44 ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5} vshuff64x2 %%r10, %%r3, %%r11, 0xEE ; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10} vshuff64x2 %%t0, %%r3, %%r11, 0x44 ; t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} vshuff64x2 %%r11, %%r1, %%r9, 0xEE ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11} vshuff64x2 %%r3, %%r1, %%r9, 0x44 ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3} vshuff64x2 %%r9, %%r2, %%r6, 0xEE ; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9} vshuff64x2 %%r1, %%r2, %%r6, 0x44 ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1} vmovdqa32 %%r2, %%t0 ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} vmovdqa32 %%r6, %%t1 ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} %endmacro %define APPEND(a,b) a %+ b %ifdef LINUX %define arg1 rdi %define arg2 rsi %define arg3 rdx %define arg4 rcx %else %define arg1 rcx %define arg2 rdx %define arg3 r8 %define arg4 r9 %endif %define state arg1 %define SIZE arg2 %define IDX arg3 %define A zmm0 %define B zmm1 %define C zmm2 %define D zmm3 %define E zmm4 %define KT zmm5 %define AA zmm6 %define BB zmm7 %define CC zmm8 %define DD zmm9 %define EE zmm10 %define TMP0 zmm11 %define TMP1 zmm12 %define TMP2 zmm13 %define W0 zmm16 %define W1 zmm17 %define W2 zmm18 %define W3 zmm19 %define W4 zmm20 %define W5 zmm21 %define W6 zmm22 %define W7 zmm23 %define W8 zmm24 %define W9 zmm25 %define W10 zmm26 %define W11 zmm27 %define W12 zmm28 %define W13 zmm29 %define W14 zmm30 %define W15 zmm31 %define inp0 r9 %define inp1 r10 %define inp2 r11 %define inp3 r12 %define inp4 r13 %define inp5 r14 %define inp6 r15 %define inp7 rax %macro ROTATE_ARGS 0 %xdefine TMP_ E %xdefine E D %xdefine D C %xdefine C B %xdefine B A %xdefine A TMP_ %endm %macro PROCESS_LOOP 2 %define %%WT %1 %define %%F_IMMED %2 ; T = ROTL_5(A) + Ft(B,C,D) + E + Kt + Wt ; E=D, D=C, C=ROTL_30(B), B=A, A=T ; Ft ; 0-19 Ch(B,C,D) = (B&C) ^ (~B&D) ; 20-39, 60-79 Parity(B,C,D) = B ^ C ^ D ; 
40-59 Maj(B,C,D) = (B&C) ^ (B&D) ^ (C&D) vmovdqa32 TMP1, B ; Copy B vpaddd E, E, %%WT ; E = E + Wt vpternlogd TMP1, C, D, %%F_IMMED ; TMP1 = Ft(B,C,D) vpaddd E, E, KT ; E = E + Wt + Kt vprold TMP0, A, 5 ; TMP0 = ROTL_5(A) vpaddd E, E, TMP1 ; E = Ft(B,C,D) + E + Kt + Wt vprold B, B, 30 ; B = ROTL_30(B) vpaddd E, E, TMP0 ; E = T ROTATE_ARGS %endmacro %macro MSG_SCHED_ROUND_16_79 4 %define %%WT %1 %define %%WTp2 %2 %define %%WTp8 %3 %define %%WTp13 %4 ; Wt = ROTL_1(Wt-3 ^ Wt-8 ^ Wt-14 ^ Wt-16) ; Wt+16 = ROTL_1(Wt+13 ^ Wt+8 ^ Wt+2 ^ Wt) vpternlogd %%WT, %%WTp2, %%WTp8, 0x96 vpxord %%WT, %%WT, %%WTp13 vprold %%WT, %%WT, 1 %endmacro ; Note this is reading in a block of data for one lane ; When all 16 are read, the data must be transposed to build msg schedule %macro MSG_SCHED_ROUND_00_15 2 %define %%WT %1 %define %%OFFSET %2 mov inp0, [state + _data_ptr_sha1 + (%%OFFSET*PTR_SIZE)] vmovups %%WT, [inp0+IDX] %endmacro align 64 ; void sha1_mult_x16_avx3(void **input_data, UINT128 *digest, UINT32 size) ; arg 1 : pointer to SHA1 args structure ; arg 2 : size (in blocks) ;; assumed to be >= 1 MKGLOBAL(sha1_x16_avx512,function,internal) sha1_x16_avx512: ;; Initialize digests vmovdqu32 A, [state + 0*SHA1_DIGEST_ROW_SIZE] vmovdqu32 B, [state + 1*SHA1_DIGEST_ROW_SIZE] vmovdqu32 C, [state + 2*SHA1_DIGEST_ROW_SIZE] vmovdqu32 D, [state + 3*SHA1_DIGEST_ROW_SIZE] vmovdqu32 E, [state + 4*SHA1_DIGEST_ROW_SIZE] DBGPRINTL_ZMM "Sha1-AVX512 incoming transposed digest", A, B, C, D, E DBGPRINTL64 "SIZE", SIZE xor IDX, IDX ;; transpose input onto stack mov inp0, [state + _data_ptr_sha1 + 0*PTR_SIZE] mov inp1, [state + _data_ptr_sha1 + 1*PTR_SIZE] mov inp2, [state + _data_ptr_sha1 + 2*PTR_SIZE] mov inp3, [state + _data_ptr_sha1 + 3*PTR_SIZE] mov inp4, [state + _data_ptr_sha1 + 4*PTR_SIZE] mov inp5, [state + _data_ptr_sha1 + 5*PTR_SIZE] mov inp6, [state + _data_ptr_sha1 + 6*PTR_SIZE] mov inp7, [state + _data_ptr_sha1 + 7*PTR_SIZE] vmovups W0,[inp0+IDX] vmovups W1,[inp1+IDX] vmovups W2,[inp2+IDX] vmovups W3,[inp3+IDX] vmovups W4,[inp4+IDX] vmovups W5,[inp5+IDX] vmovups W6,[inp6+IDX] vmovups W7,[inp7+IDX] mov inp0, [state + _data_ptr_sha1 + 8*PTR_SIZE] mov inp1, [state + _data_ptr_sha1 + 9*PTR_SIZE] mov inp2, [state + _data_ptr_sha1 +10*PTR_SIZE] mov inp3, [state + _data_ptr_sha1 +11*PTR_SIZE] mov inp4, [state + _data_ptr_sha1 +12*PTR_SIZE] mov inp5, [state + _data_ptr_sha1 +13*PTR_SIZE] mov inp6, [state + _data_ptr_sha1 +14*PTR_SIZE] mov inp7, [state + _data_ptr_sha1 +15*PTR_SIZE] vmovups W8, [inp0+IDX] vmovups W9, [inp1+IDX] vmovups W10,[inp2+IDX] vmovups W11,[inp3+IDX] vmovups W12,[inp4+IDX] vmovups W13,[inp5+IDX] vmovups W14,[inp6+IDX] vmovups W15,[inp7+IDX] lloop: vmovdqa32 TMP2, [rel PSHUFFLE_BYTE_FLIP_MASK] add IDX, 64 TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1 DBGPRINTL_ZMM "Sha1-AVX512 incoming transposed input", W0, W1, W2, W3, W4, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15 %assign I 0 %rep 16 vpshufb APPEND(W,I), APPEND(W,I), TMP2 %assign I (I+1) %endrep ; Save digests for later addition vmovdqa32 AA, A vmovdqa32 BB, B vmovdqa32 CC, C vmovdqa32 DD, D vmovdqa32 EE, E vmovdqa32 KT, [rel K00_19] %assign I 0xCA %assign J 0 %assign K 2 %assign L 8 %assign M 13 %assign N 0 %rep 64 PROCESS_LOOP APPEND(W,J), I MSG_SCHED_ROUND_16_79 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M) %if N = 19 vmovdqa32 KT, [rel K20_39] %assign I 0x96 %elif N = 39 vmovdqa32 KT, [rel K40_59] %assign I 0xE8 %elif N = 59 vmovdqa32 KT, [rel K60_79] %assign I 0x96 %endif %assign J ((J+1)% 16) 
%assign K ((K+1)% 16) %assign L ((L+1)% 16) %assign M ((M+1)% 16) %assign N (N+1) %endrep ; Check if this is the last block sub SIZE, 1 je lastLoop %assign I 0x96 %assign J 0 %rep 16 PROCESS_LOOP APPEND(W,J), I MSG_SCHED_ROUND_00_15 APPEND(W,J), J %assign J (J+1) %endrep ; Add old digest vpaddd A,A,AA vpaddd B,B,BB vpaddd C,C,CC vpaddd D,D,DD vpaddd E,E,EE jmp lloop lastLoop: ; Need to reset argument rotation values to Round 64 values %xdefine TMP_ A %xdefine A B %xdefine B C %xdefine C D %xdefine D E %xdefine E TMP_ ; Process last 16 rounds %assign I 0x96 %assign J 0 %rep 16 PROCESS_LOOP APPEND(W,J), I %assign J (J+1) %endrep ; Add old digest vpaddd A,A,AA vpaddd B,B,BB vpaddd C,C,CC vpaddd D,D,DD vpaddd E,E,EE ; Write out digest ; Do we need to untranspose digests??? vmovdqu32 [state + 0*SHA1_DIGEST_ROW_SIZE], A vmovdqu32 [state + 1*SHA1_DIGEST_ROW_SIZE], B vmovdqu32 [state + 2*SHA1_DIGEST_ROW_SIZE], C vmovdqu32 [state + 3*SHA1_DIGEST_ROW_SIZE], D vmovdqu32 [state + 4*SHA1_DIGEST_ROW_SIZE], E DBGPRINTL_ZMM "Sha1-AVX512 outgoing transposed digest", A, B, C, D, E ;; update input pointers mov inp0, [state + _data_ptr_sha1 + 0*PTR_SIZE] mov inp1, [state + _data_ptr_sha1 + 1*PTR_SIZE] mov inp2, [state + _data_ptr_sha1 + 2*PTR_SIZE] mov inp3, [state + _data_ptr_sha1 + 3*PTR_SIZE] mov inp4, [state + _data_ptr_sha1 + 4*PTR_SIZE] mov inp5, [state + _data_ptr_sha1 + 5*PTR_SIZE] mov inp6, [state + _data_ptr_sha1 + 6*PTR_SIZE] mov inp7, [state + _data_ptr_sha1 + 7*PTR_SIZE] add inp0, IDX add inp1, IDX add inp2, IDX add inp3, IDX add inp4, IDX add inp5, IDX add inp6, IDX add inp7, IDX mov [state + _data_ptr_sha1 + 0*PTR_SIZE], inp0 mov [state + _data_ptr_sha1 + 1*PTR_SIZE], inp1 mov [state + _data_ptr_sha1 + 2*PTR_SIZE], inp2 mov [state + _data_ptr_sha1 + 3*PTR_SIZE], inp3 mov [state + _data_ptr_sha1 + 4*PTR_SIZE], inp4 mov [state + _data_ptr_sha1 + 5*PTR_SIZE], inp5 mov [state + _data_ptr_sha1 + 6*PTR_SIZE], inp6 mov [state + _data_ptr_sha1 + 7*PTR_SIZE], inp7 mov inp0, [state + _data_ptr_sha1 + 8*PTR_SIZE] mov inp1, [state + _data_ptr_sha1 + 9*PTR_SIZE] mov inp2, [state + _data_ptr_sha1 + 10*PTR_SIZE] mov inp3, [state + _data_ptr_sha1 + 11*PTR_SIZE] mov inp4, [state + _data_ptr_sha1 + 12*PTR_SIZE] mov inp5, [state + _data_ptr_sha1 + 13*PTR_SIZE] mov inp6, [state + _data_ptr_sha1 + 14*PTR_SIZE] mov inp7, [state + _data_ptr_sha1 + 15*PTR_SIZE] add inp0, IDX add inp1, IDX add inp2, IDX add inp3, IDX add inp4, IDX add inp5, IDX add inp6, IDX add inp7, IDX mov [state + _data_ptr_sha1 + 8*PTR_SIZE], inp0 mov [state + _data_ptr_sha1 + 9*PTR_SIZE], inp1 mov [state + _data_ptr_sha1 + 10*PTR_SIZE], inp2 mov [state + _data_ptr_sha1 + 11*PTR_SIZE], inp3 mov [state + _data_ptr_sha1 + 12*PTR_SIZE], inp4 mov [state + _data_ptr_sha1 + 13*PTR_SIZE], inp5 mov [state + _data_ptr_sha1 + 14*PTR_SIZE], inp6 mov [state + _data_ptr_sha1 + 15*PTR_SIZE], inp7 ret intel-ipsec-mb-0.48/avx512/sha256_x16_avx512.asm000066400000000000000000001066521321406316400206330ustar00rootroot00000000000000;; ;; Copyright (c) 2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. 
;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ;; Stack must be aligned to 32 bytes before call ;; ;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Windows clobbers: RAX RDX RSI RDI R9 R10 R11 R12 R13 R14 R15 ;; Windows preserves: RCX ;; ----------------------------------------------------------- ;; Linux clobbers: RAX RCX RDX RSI R9 R10 R11 R12 R13 R14 R15 ;; Linux preserves: RDI ;; ----------------------------------------------------------- ;; Clobbers ZMM0-31 %include "os.asm" ;%define DO_DBGPRINT %include "dbgprint.asm" %include "mb_mgr_datastruct.asm" ; re-use K256 from sha256_oct_avx2.asm extern K256 ;; code to compute x16 SHA256 using AVX512 %macro TRANSPOSE16 18 %define %%r0 %1 %define %%r1 %2 %define %%r2 %3 %define %%r3 %4 %define %%r4 %5 %define %%r5 %6 %define %%r6 %7 %define %%r7 %8 %define %%r8 %9 %define %%r9 %10 %define %%r10 %11 %define %%r11 %12 %define %%r12 %13 %define %%r13 %14 %define %%r14 %15 %define %%r15 %16 %define %%t0 %17 %define %%t1 %18 ; r0 = {a15 a14 a13 a12 a11 a10 a9 a8 a7 a6 a5 a4 a3 a2 a1 a0} ; r1 = {b15 b14 b13 b12 b11 b10 b9 b8 b7 b6 b5 b4 b3 b2 b1 b0} ; r2 = {c15 c14 c13 c12 c11 c10 c9 c8 c7 c6 c5 c4 c3 c2 c1 c0} ; r3 = {d15 d14 d13 d12 d11 d10 d9 d8 d7 d6 d5 d4 d3 d2 d1 d0} ; r4 = {e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 e2 e1 e0} ; r5 = {f15 f14 f13 f12 f11 f10 f9 f8 f7 f6 f5 f4 f3 f2 f1 f0} ; r6 = {g15 g14 g13 g12 g11 g10 g9 g8 g7 g6 g5 g4 g3 g2 g1 g0} ; r7 = {h15 h14 h13 h12 h11 h10 h9 h8 h7 h6 h5 h4 h3 h2 h1 h0} ; r8 = {i15 i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0} ; r9 = {j15 j14 j13 j12 j11 j10 j9 j8 j7 j6 j5 j4 j3 j2 j1 j0} ; r10 = {k15 k14 k13 k12 k11 k10 k9 k8 k7 k6 k5 k4 k3 k2 k1 k0} ; r11 = {l15 l14 l13 l12 l11 l10 l9 l8 l7 l6 l5 l4 l3 l2 l1 l0} ; r12 = {m15 m14 m13 m12 m11 m10 m9 m8 m7 m6 m5 m4 m3 m2 m1 m0} ; r13 = {n15 n14 n13 n12 n11 n10 n9 n8 n7 n6 n5 n4 n3 n2 n1 n0} ; r14 = {o15 o14 o13 o12 o11 o10 o9 o8 o7 o6 o5 o4 o3 o2 o1 o0} ; r15 = {p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0} ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0} ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1} ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3} ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4} ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5} ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7} ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8} ; r9 = {p9 o9 n9 
m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9} ; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10} ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11} ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12} ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13} ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14} ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15} ; process top half (r0..r3) {a...d} vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0} vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2} vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0} vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2} vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1} vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2} vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3} vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0} ; use r2 in place of t0 vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0} vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2} vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0} vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2} vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1} vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2} vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3} vshufps %%r2, %%r2, %%t1, 0x88 ; r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0} ; use r6 in place of t0 vshufps %%r6, %%r8, %%r9, 0x44 ; r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0} vshufps %%r8, %%r8, %%r9, 0xEE ; r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2} vshufps %%t1, %%r10, %%r11, 0x44 ; t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0} vshufps %%r10, %%r10, %%r11, 0xEE ; r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2} vshufps %%r11, %%r6, %%t1, 0xDD ; r11 = {l13 k13 j13 113 l9 k9 j9 i9 l5 k5 j5 i5 l1 k1 j1 i1} vshufps %%r9, %%r8, %%r10, 0x88 ; r9 = {l14 k14 j14 114 l10 k10 j10 i10 l6 k6 j6 i6 l2 k2 j2 i2} vshufps %%r8, %%r8, %%r10, 0xDD ; r8 = {l15 k15 j15 115 l11 k11 j11 i11 l7 k7 j7 i7 l3 k3 j3 i3} vshufps %%r6, %%r6, %%t1, 0x88 ; r6 = {l12 k12 j12 112 l8 k8 j8 i8 l4 k4 j4 i4 l0 k0 j0 i0} ; use r10 in place of t0 vshufps %%r10, %%r12, %%r13, 0x44 ; r10 = {n13 n12 m13 m12 n9 n8 m9 m8 n5 n4 m5 m4 n1 n0 a1 m0} vshufps %%r12, %%r12, %%r13, 0xEE ; r12 = {n15 n14 m15 m14 n11 n10 m11 m10 n7 n6 m7 m6 n3 n2 a3 m2} vshufps %%t1, %%r14, %%r15, 0x44 ; t1 = {p13 p12 013 012 p9 p8 09 08 p5 p4 05 04 p1 p0 01 00} vshufps %%r14, %%r14, %%r15, 0xEE ; r14 = {p15 p14 015 014 p11 p10 011 010 p7 p6 07 06 p3 p2 03 02} vshufps %%r15, %%r10, %%t1, 0xDD ; r15 = {p13 013 n13 m13 p9 09 n9 m9 p5 05 n5 m5 p1 01 n1 m1} vshufps %%r13, %%r12, %%r14, 0x88 ; r13 = {p14 014 n14 m14 p10 010 n10 m10 p6 06 n6 m6 p2 02 n2 m2} vshufps %%r12, %%r12, %%r14, 0xDD ; r12 = {p15 015 n15 m15 p11 011 n11 m11 p7 07 n7 m7 p3 03 n3 m3} vshufps 
%%r10, %%r10, %%t1, 0x88 ; r10 = {p12 012 n12 m12 p8 08 n8 m8 p4 04 n4 m4 p0 00 n0 m0} ;; At this point, the registers that contain interesting data are: ;; t0, r3, r1, r0, r2, r7, r5, r4, r6, r11, r9, r8, r10, r15, r13, r12 ;; Can use t1 and r14 as scratch registers vmovdqa32 %%r14, [PSHUFFLE_TRANSPOSE16_MASK1] vpermi2q %%r14, %%t0, %%r2 ; r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0} vmovdqa32 %%t1, [PSHUFFLE_TRANSPOSE16_MASK2] vpermi2q %%t1, %%t0, %%r2 ; t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4} vmovdqa32 %%r2, [PSHUFFLE_TRANSPOSE16_MASK1] vpermi2q %%r2, %%r3, %%r7 ; r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1} vmovdqa32 %%t0, [PSHUFFLE_TRANSPOSE16_MASK2] vpermi2q %%t0, %%r3, %%r7 ; t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5} vmovdqa32 %%r3, [PSHUFFLE_TRANSPOSE16_MASK1] vpermi2q %%r3, %%r1, %%r5 ; r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2} vmovdqa32 %%r7, [PSHUFFLE_TRANSPOSE16_MASK2] vpermi2q %%r7, %%r1, %%r5 ; r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6} vmovdqa32 %%r1, [PSHUFFLE_TRANSPOSE16_MASK1] vpermi2q %%r1, %%r0, %%r4 ; r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3} vmovdqa32 %%r5, [PSHUFFLE_TRANSPOSE16_MASK2] vpermi2q %%r5, %%r0, %%r4 ; r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7} vmovdqa32 %%r0, [PSHUFFLE_TRANSPOSE16_MASK1] vpermi2q %%r0, %%r6, %%r10 ; r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0} vmovdqa32 %%r4, [PSHUFFLE_TRANSPOSE16_MASK2] vpermi2q %%r4, %%r6, %%r10 ; r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4} vmovdqa32 %%r6, [PSHUFFLE_TRANSPOSE16_MASK1] vpermi2q %%r6, %%r11, %%r15 ; r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1} vmovdqa32 %%r10, [PSHUFFLE_TRANSPOSE16_MASK2] vpermi2q %%r10, %%r11, %%r15 ; r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5} vmovdqa32 %%r11, [PSHUFFLE_TRANSPOSE16_MASK1] vpermi2q %%r11, %%r9, %%r13 ; r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2} vmovdqa32 %%r15, [PSHUFFLE_TRANSPOSE16_MASK2] vpermi2q %%r15, %%r9, %%r13 ; r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6} vmovdqa32 %%r9, [PSHUFFLE_TRANSPOSE16_MASK1] vpermi2q %%r9, %%r8, %%r12 ; r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 n3 m3 l3 k3 j3 i3} vmovdqa32 %%r13, [PSHUFFLE_TRANSPOSE16_MASK2] vpermi2q %%r13, %%r8, %%r12 ; r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7} ;; At this point r8 and r12 can be used as scratch registers vshuff64x2 %%r8, %%r14, %%r0, 0xEE ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8} vshuff64x2 %%r0, %%r14, %%r0, 0x44 ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0} vshuff64x2 %%r12, %%t1, %%r4, 0xEE ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12} vshuff64x2 %%r4, %%t1, %%r4, 0x44 ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4} vshuff64x2 %%r14, %%r7, %%r15, 0xEE ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14} vshuff64x2 %%t1, %%r7, %%r15, 0x44 ; t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} vshuff64x2 %%r15, %%r5, %%r13, 0xEE ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15} vshuff64x2 %%r7, %%r5, %%r13, 0x44 ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7} vshuff64x2 %%r13, %%t0, %%r10, 0xEE ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13} vshuff64x2 %%r5, %%t0, %%r10, 0x44 ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5} vshuff64x2 %%r10, %%r3, %%r11, 0xEE ; r10 = {p10 o10 n10 m10 l10 
k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10} vshuff64x2 %%t0, %%r3, %%r11, 0x44 ; t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} vshuff64x2 %%r11, %%r1, %%r9, 0xEE ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11} vshuff64x2 %%r3, %%r1, %%r9, 0x44 ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3} vshuff64x2 %%r9, %%r2, %%r6, 0xEE ; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9} vshuff64x2 %%r1, %%r2, %%r6, 0x44 ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1} vmovdqa32 %%r2, %%t0 ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2} vmovdqa32 %%r6, %%t1 ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6} %endmacro %define APPEND(a,b) a %+ b ; Define Stack Layout START_FIELDS ;;; name size align FIELD _DIGEST_SAVE, 8*64, 64 FIELD _rsp, 8, 8 %assign STACK_SPACE _FIELD_OFFSET %ifdef LINUX ; Linux register definitions %define arg1 rdi %define arg2 rsi %define arg3 rcx %define arg4 rdx %else ; Windows definitions %define arg1 rcx %define arg2 rdx %define arg3 rsi %define arg4 rdi %endif %define STATE arg1 %define INP_SIZE arg2 %define IDX arg3 %define TBL arg4 %define A zmm0 %define B zmm1 %define C zmm2 %define D zmm3 %define E zmm4 %define F zmm5 %define G zmm6 %define H zmm7 %define T1 zmm8 %define TMP0 zmm9 %define TMP1 zmm10 %define TMP2 zmm11 %define TMP3 zmm12 %define TMP4 zmm13 %define TMP5 zmm14 %define TMP6 zmm15 %define W0 zmm16 %define W1 zmm17 %define W2 zmm18 %define W3 zmm19 %define W4 zmm20 %define W5 zmm21 %define W6 zmm22 %define W7 zmm23 %define W8 zmm24 %define W9 zmm25 %define W10 zmm26 %define W11 zmm27 %define W12 zmm28 %define W13 zmm29 %define W14 zmm30 %define W15 zmm31 %define inp0 r9 %define inp1 r10 %define inp2 r11 %define inp3 r12 %define inp4 r13 %define inp5 r14 %define inp6 r15 %define inp7 rax %macro ROTATE_ARGS 0 %xdefine TMP_ H %xdefine H G %xdefine G F %xdefine F E %xdefine E D %xdefine D C %xdefine C B %xdefine B A %xdefine A TMP_ %endm ;; CH(A, B, C) = (A&B) ^ (~A&C) ;; MAJ(E, F, G) = (E&F) ^ (E&G) ^ (F&G) ;; SIGMA0 = ROR_2 ^ ROR_13 ^ ROR_22 ;; SIGMA1 = ROR_6 ^ ROR_11 ^ ROR_25 ;; sigma0 = ROR_7 ^ ROR_18 ^ SHR_3 ;; sigma1 = ROR_17 ^ ROR_19 ^ SHR_10 ; Main processing loop per round %macro PROCESS_LOOP 2 %define %%WT %1 %define %%ROUND %2 ;; T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt ;; T2 = SIGMA0(A) + MAJ(A, B, C) ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2 ;; H becomes T2, then add T1 for A ;; D becomes D + T1 for E vpaddd T1, H, TMP3 ; T1 = H + Kt vmovdqa32 TMP0, E vprord TMP1, E, 6 ; ROR_6(E) vprord TMP2, E, 11 ; ROR_11(E) vprord TMP3, E, 25 ; ROR_25(E) vpternlogd TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G) vpaddd T1, T1, %%WT ; T1 = T1 + Wt vpternlogd TMP1, TMP2, TMP3, 0x96 ; TMP1 = SIGMA1(E) vpaddd T1, T1, TMP0 ; T1 = T1 + CH(E,F,G) vpaddd T1, T1, TMP1 ; T1 = T1 + SIGMA1(E) vpaddd D, D, T1 ; D = D + T1 vprord H, A, 2 ; ROR_2(A) vprord TMP2, A, 13 ; ROR_13(A) vprord TMP3, A, 22 ; ROR_22(A) vmovdqa32 TMP0, A vpternlogd TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C) vpternlogd H, TMP2, TMP3, 0x96 ; H(T2) = SIGMA0(A) vpaddd H, H, TMP0 ; H(T2) = SIGMA0(A) + MAJ(A,B,C) vpaddd H, H, T1 ; H(A) = H(T2) + T1 vmovdqa32 TMP3, [TBL + ((%%ROUND+1)*64)] ; Next Kt ;; Rotate the args A-H (rotation of names associated with regs) ROTATE_ARGS %endmacro ; This is supposed to be SKL optimized assuming: ; vpternlog, vpaddd ports 5,8 ; vprord ports 1,8 ; However, vprord is only working on port 8 ; ; Main processing loop per round ; Get the msg schedule word 16 from the current, now unneccessary word %macro PROCESS_LOOP_00_47 
5 %define %%WT %1 %define %%ROUND %2 %define %%WTp1 %3 %define %%WTp9 %4 %define %%WTp14 %5 ;; T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt ;; T2 = SIGMA0(A) + MAJ(A, B, C) ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2 ;; H becomes T2, then add T1 for A ;; D becomes D + T1 for E ;; For next value in msg schedule ;; Wt+16 = sigma1(Wt+14) + Wt+9 + sigma0(Wt+1) + Wt vmovdqa32 TMP0, E vprord TMP1, E, 6 ; ROR_6(E) vprord TMP2, E, 11 ; ROR_11(E) vprord TMP3, E, 25 ; ROR_25(E) vpternlogd TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G) vpaddd T1, H, %%WT ; T1 = H + Wt vpternlogd TMP1, TMP2, TMP3, 0x96 ; TMP1 = SIGMA1(E) vpaddd T1, T1, TMP6 ; T1 = T1 + Kt vprord H, A, 2 ; ROR_2(A) vpaddd T1, T1, TMP0 ; T1 = T1 + CH(E,F,G) vprord TMP2, A, 13 ; ROR_13(A) vmovdqa32 TMP0, A vprord TMP3, A, 22 ; ROR_22(A) vpaddd T1, T1, TMP1 ; T1 = T1 + SIGMA1(E) vpternlogd TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C) vpaddd D, D, T1 ; D = D + T1 vpternlogd H, TMP2, TMP3, 0x96 ; H(T2) = SIGMA0(A) vprord TMP4, %%WTp14, 17 ; ROR_17(Wt-2) vpaddd H, H, TMP0 ; H(T2) = SIGMA0(A) + MAJ(A,B,C) vprord TMP5, %%WTp14, 19 ; ROR_19(Wt-2) vpsrld TMP6, %%WTp14, 10 ; SHR_10(Wt-2) vpaddd H, H, T1 ; H(A) = H(T2) + T1 vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma1(Wt-2) vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) vprord TMP4, %%WTp1, 7 ; ROR_7(Wt-15) vprord TMP5, %%WTp1, 18 ; ROR_18(Wt-15) vpaddd %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma1(Wt-2) + Wt-7 vpsrld TMP6, %%WTp1, 3 ; SHR_3(Wt-15) vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma0(Wt-15) vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) + ; Wt-7 + sigma0(Wt-15) + vmovdqa32 TMP6, [TBL + ((%%ROUND+1)*64)] ; Next Kt ;; Rotate the args A-H (rotation of names associated with regs) ROTATE_ARGS %endmacro %macro MSG_SCHED_ROUND_16_63 4 %define %%WT %1 %define %%WTp1 %2 %define %%WTp9 %3 %define %%WTp14 %4 vprord TMP4, %%WTp14, 17 ; ROR_17(Wt-2) vprord TMP5, %%WTp14, 19 ; ROR_19(Wt-2) vpsrld TMP6, %%WTp14, 10 ; SHR_10(Wt-2) vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma1(Wt-2) vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) vpaddd %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma1(Wt-2) + Wt-7 vprord TMP4, %%WTp1, 7 ; ROR_7(Wt-15) vprord TMP5, %%WTp1, 18 ; ROR_18(Wt-15) vpsrld TMP6, %%WTp1, 3 ; SHR_3(Wt-15) vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma0(Wt-15) vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) + ; Wt-7 + sigma0(Wt-15) + %endmacro ; Note this is reading in a block of data for one lane ; When all 16 are read, the data must be transposed to build msg schedule %macro MSG_SCHED_ROUND_00_15 2 %define %%WT %1 %define %%OFFSET %2 mov inp0, [STATE + _data_ptr_sha256 + (%%OFFSET*PTR_SZ)] vmovups %%WT, [inp0+IDX] %endmacro section .data default rel align 64 TABLE: dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 dq 0x7137449171374491, 0x7137449171374491 dq 0x7137449171374491, 0x7137449171374491 dq 0x7137449171374491, 0x7137449171374491 dq 0x7137449171374491, 0x7137449171374491 dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 dq 0x3956c25b3956c25b, 0x3956c25b3956c25b dq 0x3956c25b3956c25b, 0x3956c25b3956c25b dq 0x3956c25b3956c25b, 0x3956c25b3956c25b dq 0x3956c25b3956c25b, 0x3956c25b3956c25b dq 
0x59f111f159f111f1, 0x59f111f159f111f1 dq 0x59f111f159f111f1, 0x59f111f159f111f1 dq 0x59f111f159f111f1, 0x59f111f159f111f1 dq 0x59f111f159f111f1, 0x59f111f159f111f1 dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 dq 0x12835b0112835b01, 0x12835b0112835b01 dq 0x12835b0112835b01, 0x12835b0112835b01 dq 0x12835b0112835b01, 0x12835b0112835b01 dq 0x12835b0112835b01, 0x12835b0112835b01 dq 0x243185be243185be, 0x243185be243185be dq 0x243185be243185be, 0x243185be243185be dq 0x243185be243185be, 0x243185be243185be dq 0x243185be243185be, 0x243185be243185be dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc dq 0x76f988da76f988da, 0x76f988da76f988da dq 0x76f988da76f988da, 0x76f988da76f988da dq 0x76f988da76f988da, 0x76f988da76f988da dq 0x76f988da76f988da, 0x76f988da76f988da dq 0x983e5152983e5152, 0x983e5152983e5152 dq 0x983e5152983e5152, 0x983e5152983e5152 dq 0x983e5152983e5152, 0x983e5152983e5152 dq 0x983e5152983e5152, 0x983e5152983e5152 dq 0xa831c66da831c66d, 0xa831c66da831c66d dq 0xa831c66da831c66d, 0xa831c66da831c66d dq 0xa831c66da831c66d, 0xa831c66da831c66d dq 0xa831c66da831c66d, 0xa831c66da831c66d dq 0xb00327c8b00327c8, 
0xb00327c8b00327c8 dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 dq 0x06ca635106ca6351, 0x06ca635106ca6351 dq 0x06ca635106ca6351, 0x06ca635106ca6351 dq 0x06ca635106ca6351, 0x06ca635106ca6351 dq 0x06ca635106ca6351, 0x06ca635106ca6351 dq 0x1429296714292967, 0x1429296714292967 dq 0x1429296714292967, 0x1429296714292967 dq 0x1429296714292967, 0x1429296714292967 dq 0x1429296714292967, 0x1429296714292967 dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc dq 0x53380d1353380d13, 0x53380d1353380d13 dq 0x53380d1353380d13, 0x53380d1353380d13 dq 0x53380d1353380d13, 0x53380d1353380d13 dq 0x53380d1353380d13, 0x53380d1353380d13 dq 0x650a7354650a7354, 0x650a7354650a7354 dq 0x650a7354650a7354, 0x650a7354650a7354 dq 0x650a7354650a7354, 0x650a7354650a7354 dq 0x650a7354650a7354, 0x650a7354650a7354 dq 0x766a0abb766a0abb, 0x766a0abb766a0abb dq 0x766a0abb766a0abb, 0x766a0abb766a0abb dq 0x766a0abb766a0abb, 0x766a0abb766a0abb dq 0x766a0abb766a0abb, 0x766a0abb766a0abb dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e dq 0x92722c8592722c85, 0x92722c8592722c85 dq 0x92722c8592722c85, 0x92722c8592722c85 dq 0x92722c8592722c85, 0x92722c8592722c85 dq 0x92722c8592722c85, 0x92722c8592722c85 dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 dq 0xa81a664ba81a664b, 0xa81a664ba81a664b dq 0xa81a664ba81a664b, 0xa81a664ba81a664b dq 0xa81a664ba81a664b, 0xa81a664ba81a664b dq 0xa81a664ba81a664b, 0xa81a664ba81a664b dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 dq 0xd192e819d192e819, 0xd192e819d192e819 dq 0xd192e819d192e819, 0xd192e819d192e819 dq 0xd192e819d192e819, 0xd192e819d192e819 dq 0xd192e819d192e819, 0xd192e819d192e819 dq 0xd6990624d6990624, 0xd6990624d6990624 dq 0xd6990624d6990624, 0xd6990624d6990624 dq 0xd6990624d6990624, 0xd6990624d6990624 dq 0xd6990624d6990624, 0xd6990624d6990624 dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 dq 0x106aa070106aa070, 0x106aa070106aa070 dq 
0x106aa070106aa070, 0x106aa070106aa070 dq 0x106aa070106aa070, 0x106aa070106aa070 dq 0x106aa070106aa070, 0x106aa070106aa070 dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 dq 0x1e376c081e376c08, 0x1e376c081e376c08 dq 0x1e376c081e376c08, 0x1e376c081e376c08 dq 0x1e376c081e376c08, 0x1e376c081e376c08 dq 0x1e376c081e376c08, 0x1e376c081e376c08 dq 0x2748774c2748774c, 0x2748774c2748774c dq 0x2748774c2748774c, 0x2748774c2748774c dq 0x2748774c2748774c, 0x2748774c2748774c dq 0x2748774c2748774c, 0x2748774c2748774c dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 dq 0x748f82ee748f82ee, 0x748f82ee748f82ee dq 0x748f82ee748f82ee, 0x748f82ee748f82ee dq 0x748f82ee748f82ee, 0x748f82ee748f82ee dq 0x748f82ee748f82ee, 0x748f82ee748f82ee dq 0x78a5636f78a5636f, 0x78a5636f78a5636f dq 0x78a5636f78a5636f, 0x78a5636f78a5636f dq 0x78a5636f78a5636f, 0x78a5636f78a5636f dq 0x78a5636f78a5636f, 0x78a5636f78a5636f dq 0x84c8781484c87814, 0x84c8781484c87814 dq 0x84c8781484c87814, 0x84c8781484c87814 dq 0x84c8781484c87814, 0x84c8781484c87814 dq 0x84c8781484c87814, 0x84c8781484c87814 dq 0x8cc702088cc70208, 0x8cc702088cc70208 dq 0x8cc702088cc70208, 0x8cc702088cc70208 dq 0x8cc702088cc70208, 0x8cc702088cc70208 dq 0x8cc702088cc70208, 0x8cc702088cc70208 dq 0x90befffa90befffa, 0x90befffa90befffa dq 0x90befffa90befffa, 0x90befffa90befffa dq 0x90befffa90befffa, 0x90befffa90befffa dq 0x90befffa90befffa, 0x90befffa90befffa dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 dq 0x0405060700010203, 0x0c0d0e0f08090a0b ;ddq 0x0c0d0e0f08090a0b0405060700010203 dq 0x0405060700010203, 0x0c0d0e0f08090a0b ;ddq 0x0c0d0e0f08090a0b0405060700010203 dq 0x0405060700010203, 0x0c0d0e0f08090a0b ;ddq 0x0c0d0e0f08090a0b0405060700010203 dq 0x0405060700010203, 0x0c0d0e0f08090a0b PSHUFFLE_TRANSPOSE16_MASK1: dq 0x0000000000000000 dq 0x0000000000000001 dq 0x0000000000000008 dq 0x0000000000000009 dq 0x0000000000000004 dq 0x0000000000000005 dq 0x000000000000000C dq 0x000000000000000D PSHUFFLE_TRANSPOSE16_MASK2: dq 0x0000000000000002 dq 0x0000000000000003 dq 0x000000000000000A dq 0x000000000000000B dq 0x0000000000000006 dq 0x0000000000000007 dq 0x000000000000000E dq 
0x000000000000000F section .text ;; void sha256_x16_avx512(void **input_data, UINT128 *digest[16], UINT64 size) ;; arg 1 : pointer to SHA256 args structure ;; arg 2 : size (in blocks) ;; assumed to be >= 1 ;; arg 1 : rcx : pointer to array of pointers to input data ;; arg 2 : rdx : pointer to array of pointers to digest ;; arg 3 : r8 : size of input in bytes MKGLOBAL(sha256_x16_avx512,function,internal) align 64 sha256_x16_avx512: mov rax, rsp sub rsp, STACK_SPACE and rsp, ~63 ; align stack to multiple of 64 mov [rsp + _rsp], rax ;; Initialize digests vmovdqu32 A, [STATE + 0*SHA256_DIGEST_ROW_SIZE] vmovdqu32 B, [STATE + 1*SHA256_DIGEST_ROW_SIZE] vmovdqu32 C, [STATE + 2*SHA256_DIGEST_ROW_SIZE] vmovdqu32 D, [STATE + 3*SHA256_DIGEST_ROW_SIZE] vmovdqu32 E, [STATE + 4*SHA256_DIGEST_ROW_SIZE] vmovdqu32 F, [STATE + 5*SHA256_DIGEST_ROW_SIZE] vmovdqu32 G, [STATE + 6*SHA256_DIGEST_ROW_SIZE] vmovdqu32 H, [STATE + 7*SHA256_DIGEST_ROW_SIZE] lea TBL, [rel TABLE] ; Do we need to transpose digests??? ; SHA1 does not, but SHA256 has been xor IDX, IDX ;; Read in first block of input data ;; Transpose input data mov inp0, [STATE + _data_ptr_sha256 + 0*PTR_SZ] mov inp1, [STATE + _data_ptr_sha256 + 1*PTR_SZ] mov inp2, [STATE + _data_ptr_sha256 + 2*PTR_SZ] mov inp3, [STATE + _data_ptr_sha256 + 3*PTR_SZ] mov inp4, [STATE + _data_ptr_sha256 + 4*PTR_SZ] mov inp5, [STATE + _data_ptr_sha256 + 5*PTR_SZ] mov inp6, [STATE + _data_ptr_sha256 + 6*PTR_SZ] mov inp7, [STATE + _data_ptr_sha256 + 7*PTR_SZ] vmovups W0,[inp0+IDX] vmovups W1,[inp1+IDX] vmovups W2,[inp2+IDX] vmovups W3,[inp3+IDX] vmovups W4,[inp4+IDX] vmovups W5,[inp5+IDX] vmovups W6,[inp6+IDX] vmovups W7,[inp7+IDX] mov inp0, [STATE + _data_ptr_sha256 + 8*PTR_SZ] mov inp1, [STATE + _data_ptr_sha256 + 9*PTR_SZ] mov inp2, [STATE + _data_ptr_sha256 +10*PTR_SZ] mov inp3, [STATE + _data_ptr_sha256 +11*PTR_SZ] mov inp4, [STATE + _data_ptr_sha256 +12*PTR_SZ] mov inp5, [STATE + _data_ptr_sha256 +13*PTR_SZ] mov inp6, [STATE + _data_ptr_sha256 +14*PTR_SZ] mov inp7, [STATE + _data_ptr_sha256 +15*PTR_SZ] vmovups W8, [inp0+IDX] vmovups W9, [inp1+IDX] vmovups W10,[inp2+IDX] vmovups W11,[inp3+IDX] vmovups W12,[inp4+IDX] vmovups W13,[inp5+IDX] vmovups W14,[inp6+IDX] vmovups W15,[inp7+IDX] jmp lloop align 32 lloop: vmovdqa32 TMP2, [rel PSHUFFLE_BYTE_FLIP_MASK] vmovdqa32 TMP3, [TBL] ; First K ; Save digests for later addition vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G vmovdqa32 [rsp + _DIGEST_SAVE + 64*7], H add IDX, 64 TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1 %assign I 0 %rep 16 vpshufb APPEND(W,I), APPEND(W,I), TMP2 %assign I (I+1) %endrep ; MSG Schedule for W0-W15 is now complete in registers ; Process first 48 rounds ; Calculate next Wt+16 after processing is complete and Wt is unneeded ; PROCESS_LOOP_00_47 APPEND(W,J), I, APPEND(W,K), APPEND(W,L), APPEND(W,M) %assign I 0 %assign J 0 %assign K 1 %assign L 9 %assign M 14 %rep 48 PROCESS_LOOP APPEND(W,J), I MSG_SCHED_ROUND_16_63 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M) %assign I (I+1) %assign J ((J+1)% 16) %assign K ((K+1)% 16) %assign L ((L+1)% 16) %assign M ((M+1)% 16) %endrep ; Check is this is the last block sub INP_SIZE, 1 je lastLoop ; Process last 16 rounds ; Read in next block msg data for use in first 16 words of msg 
sched %assign I 48 %assign J 0 %rep 16 PROCESS_LOOP APPEND(W,J), I MSG_SCHED_ROUND_00_15 APPEND(W,J), J %assign I (I+1) %assign J (J+1) %endrep ; Add old digest vpaddd A, A, [rsp + _DIGEST_SAVE + 64*0] vpaddd B, B, [rsp + _DIGEST_SAVE + 64*1] vpaddd C, C, [rsp + _DIGEST_SAVE + 64*2] vpaddd D, D, [rsp + _DIGEST_SAVE + 64*3] vpaddd E, E, [rsp + _DIGEST_SAVE + 64*4] vpaddd F, F, [rsp + _DIGEST_SAVE + 64*5] vpaddd G, G, [rsp + _DIGEST_SAVE + 64*6] vpaddd H, H, [rsp + _DIGEST_SAVE + 64*7] jmp lloop lastLoop: ; Process last 16 rounds %assign I 48 %assign J 0 %rep 16 PROCESS_LOOP APPEND(W,J), I %assign I (I+1) %assign J (J+1) %endrep ; Add old digest vpaddd A, A, [rsp + _DIGEST_SAVE + 64*0] vpaddd B, B, [rsp + _DIGEST_SAVE + 64*1] vpaddd C, C, [rsp + _DIGEST_SAVE + 64*2] vpaddd D, D, [rsp + _DIGEST_SAVE + 64*3] vpaddd E, E, [rsp + _DIGEST_SAVE + 64*4] vpaddd F, F, [rsp + _DIGEST_SAVE + 64*5] vpaddd G, G, [rsp + _DIGEST_SAVE + 64*6] vpaddd H, H, [rsp + _DIGEST_SAVE + 64*7] ; Write out digest ; Do we need to untranspose digests??? vmovdqu32 [STATE + 0*SHA256_DIGEST_ROW_SIZE], A vmovdqu32 [STATE + 1*SHA256_DIGEST_ROW_SIZE], B vmovdqu32 [STATE + 2*SHA256_DIGEST_ROW_SIZE], C vmovdqu32 [STATE + 3*SHA256_DIGEST_ROW_SIZE], D vmovdqu32 [STATE + 4*SHA256_DIGEST_ROW_SIZE], E vmovdqu32 [STATE + 5*SHA256_DIGEST_ROW_SIZE], F vmovdqu32 [STATE + 6*SHA256_DIGEST_ROW_SIZE], G vmovdqu32 [STATE + 7*SHA256_DIGEST_ROW_SIZE], H ; update input pointers %assign I 0 %rep 16 add [STATE + _data_ptr_sha256 + I*PTR_SZ], IDX %assign I (I+1) %endrep mov rsp, [rsp + _rsp] ret intel-ipsec-mb-0.48/avx512/sha512_x8_avx512.asm000066400000000000000000000747051321406316400205520ustar00rootroot00000000000000;; ;; Copyright (c) 2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
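;;
;; Note on the ternary-logic immediates used in the round macros below
;; (a brief sketch of the bit-level identities; the instruction semantics
;; are standard vpternlogq behaviour, not specific to this file):
;;   vpternlogq dst, b, c, imm8 sets each result bit to
;;   imm8[(dst_bit << 2) | (b_bit << 1) | c_bit], i.e. imm8 is the truth
;;   table of a 3-input boolean function applied bitwise across the lanes.
;;   The immediates used in PROCESS_LOOP therefore compute:
;;     0xCA -> dst ? b : c         = CH(dst, b, c)
;;     0xE8 -> majority(dst, b, c) = MAJ(dst, b, c)
;;     0x96 -> dst ^ b ^ c           (folds the three vprorq rotate results
;;                                    into the SIGMA/sigma functions)
;;   For example MAJ: input patterns 111, 110, 101, 011 yield 1 and the
;;   rest yield 0, which reads out as binary 1110_1000 = 0xE8.
;;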
;; ;; Stack must be aligned to 32 bytes before call ;; ;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Windows clobbers: RAX RDX RDI R8 R9 R10 R11 R12 R13 R14 R15 ;; Windows preserves: RBX RCX RBP RSI ;; ----------------------------------------------------------- ;; Linux clobbers: RAX RDX RSI R8 R9 R10 R11 R12 R13 R14 R15 ;; Linux preserves: RBX RCX RBP RDI ;; ----------------------------------------------------------- ;; Clobbers ZMM0-31 ;; code to compute quad SHA512 using AVX512 %include "os.asm" ;%define DO_DBGPRINT %include "dbgprint.asm" %include "mb_mgr_datastruct.asm" %macro TRANSPOSE8 12 %define %%r0 %1 %define %%r1 %2 %define %%r2 %3 %define %%r3 %4 %define %%r4 %5 %define %%r5 %6 %define %%r6 %7 %define %%r7 %8 %define %%t0 %9 %define %%t1 %10 %define %%PERM_INDEX1 %11 %define %%PERM_INDEX2 %12 ; each x(i) is 32 bits, 16 * 32 = 512 ==> a full digest length, 32 single precision quantities ; r0 = {a7 a6 a5 a4 a3 a2 a1 a0} ; r1 = {b7 b6 b5 b4 b3 b2 b1 b0} ; r2 = {c7 c6 c5 c4 c3 c2 c1 c0} ; r3 = {d7 d6 d5 d4 d3 d2 d1 d0} ; r4 = {e7 e6 e5 e4 e3 e2 e1 e0} ; r5 = {f7 f6 f5 f4 f3 f2 f1 f0} ; r6 = {g7 g6 g5 g4 g3 g2 g1 g0} ; r7 = {h7 h6 h5 h4 h3 h2 h1 h0} ;; ;;; will not get clobbered vmovdqa32 %%PERM_INDEX1, [TRANSPOSE8_PERM_INDEX_1] ; temp vmovdqa32 %%PERM_INDEX2, [TRANSPOSE8_PERM_INDEX_2] ; temp ; process top half (r0..r3) {a...d} vshufpd %%t0, %%r0, %%r1, 0x00 ; t0 = {b6 a6 b4 a4 b2 a2 b0 a0} vshufpd %%r0, %%r0, %%r1, 0xFF ; r0 = {b7 a7 b5 a5 b3 a3 b1 a1} vshufpd %%t1, %%r2, %%r3, 0x00 ; t1 = {d6 c6 d4 c4 d2 c2 d0 c0} vshufpd %%r2, %%r2, %%r3, 0xFF ; r2 = {d7 c7 d5 c5 d3 c3 d1 c1} vmovdqa32 %%r1, %%t0 ; r1 and r3 free vpermt2q %%r1, %%PERM_INDEX1,%%t1 ; r1 = {d4 c4 b4 a4 d0 c0 b0 a0} vpermt2q %%t0, %%PERM_INDEX2,%%t1 ; t0 = {d6 c6 b6 a6 d2 c2 b2 a2} vmovdqa32 %%t1, %%r0 ; t1 and r3 free vpermt2q %%t1, %%PERM_INDEX1,%%r2 ; t1 = {d5 c5 b5 a5 d1 c1 b1 a1} vpermt2q %%r0, %%PERM_INDEX2,%%r2 ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3} ;; Likewise for top half ; r2 and r3 free vshufpd %%r2, %%r4, %%r5, 0x00 ; r2 = {f6 e6 f4 e4 f2 e2 f0 e0} vshufpd %%r4, %%r4, %%r5, 0xFF ; r4 = {f7 e7 f5 e5 f3 e3 f1 e1} vshufpd %%r3, %%r6, %%r7, 0x00 ; r3 = {h6 g6 h4 g4 h2 g2 h0 g0} vshufpd %%r6, %%r6, %%r7, 0xFF ; r6 = {h7 g7 h5 g5 h3 g3 h1 g1} vmovdqa32 %%r5, %%r2 ; r5 and r7 free vpermt2q %%r5, %%PERM_INDEX1,%%r3 ; r5 = {h4 g4 f4 e4 h0 g0 f0 e0} vpermt2q %%r2, %%PERM_INDEX2,%%r3 ; r2 = {h6 g6 f6 e6 h2 g2 f2 e2} vmovdqa32 %%r7, %%r4 vpermt2q %%r7, %%PERM_INDEX1,%%r6 ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1} vpermt2q %%r4, %%PERM_INDEX2,%%r6 ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3} ;;; free r3, r6 vshuff64x2 %%r6, %%t0, %%r2, 0xEE ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6} vshuff64x2 %%r2, %%t0, %%r2, 0x44 ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2} ;;; t0 and r3 free vshuff64x2 %%r3, %%r0, %%r4, 0x44 ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3} vshuff64x2 %%t0, %%r0, %%r4, 0xEE ; t0 = {h7 g7 f7 e7 d7 c7 b7 a7} vshuff64x2 %%r4, %%r1, %%r5, 0xEE ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4} vshuff64x2 %%r0, %%r1, %%r5, 0x44 ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0} vshuff64x2 %%r5, %%t1, %%r7, 0xEE ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5} vshuff64x2 %%r1, %%t1, %%r7, 0x44 ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1} ;; will re-order input to avoid move ;vmovdqa32 %%r7, %%t0 ; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7} ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0} ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1} ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2} ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3} ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4} ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5} ; r6 
= {h6 g6 f6 e6 d6 c6 b6 a6} ; temp ; r7 = {h7 g7 f7 e7 d7 c7 b7 a7} %endmacro %define APPEND(a,b) a %+ b %ifdef LINUX ; Linux register definitions %define arg1 rdi %define arg2 rsi %define arg3 rcx %define arg4 rdx %else ; Windows definitions %define arg1 rcx %define arg2 rdx %define arg3 rsi %define arg4 rdi %endif %define STATE arg1 %define INP_SIZE arg2 %define IDX arg4 %define TBL r8 ;; retaining XMM_SAVE, because the top half of YMM registers no saving required, only bottom half, the XMM part %define NUM_LANES 8 %define XMM_SAVE (15-5)*16 %define SZ 8 %define SZ8 8 * SZ %define DIGEST_SZ 8 * SZ8 %define DIGEST_SAVE NUM_LANES * DIGEST_SZ %define RSP_SAVE 1*8 ; Define Stack Layout START_FIELDS ;;; name size align FIELD _DIGEST_SAVE, NUM_LANES*8*64, 64 FIELD _XMM_SAVE, XMM_SAVE, 16 FIELD _RSP, 8, 8 %assign STACK_SPACE _FIELD_OFFSET %define inp0 r9 %define inp1 r10 %define inp2 r11 %define inp3 r12 %define inp4 r13 %define inp5 r14 %define inp6 r15 %define inp7 rax %define A zmm0 %define B zmm1 %define C zmm2 %define D zmm3 %define E zmm4 %define F zmm5 %define G zmm6 %define H zmm7 %define T1 zmm8 %define TMP0 zmm9 %define TMP1 zmm10 %define TMP2 zmm11 %define TMP3 zmm12 %define TMP4 zmm13 %define TMP5 zmm14 %define TMP6 zmm15 %define W0 zmm16 %define W1 zmm17 %define W2 zmm18 %define W3 zmm19 %define W4 zmm20 %define W5 zmm21 %define W6 zmm22 %define W7 zmm23 %define W8 zmm24 %define W9 zmm25 %define W10 zmm26 %define W11 zmm27 %define W12 zmm28 %define W13 zmm29 %define W14 zmm30 %define W15 zmm31 ; from sha256_fips180-2.pdf ; define rotates for Sigma function for main loop steps %define BIG_SIGMA_0_0 28 ; Sigma0 %define BIG_SIGMA_0_1 34 %define BIG_SIGMA_0_2 39 %define BIG_SIGMA_1_0 14 ; Sigma1 %define BIG_SIGMA_1_1 18 %define BIG_SIGMA_1_2 41 ; define rotates for Sigma function for scheduling steps %define SMALL_SIGMA_0_0 1 ; sigma0 %define SMALL_SIGMA_0_1 8 %define SMALL_SIGMA_0_2 7 %define SMALL_SIGMA_1_0 19 ; sigma1 %define SMALL_SIGMA_1_1 61 %define SMALL_SIGMA_1_2 6 %define SHA_MAX_ROUNDS 80 %define SHA_ROUNDS_LESS_16 (SHA_MAX_ROUNDS - 16) %macro ROTATE_ARGS 0 %xdefine TMP_ H %xdefine H G %xdefine G F %xdefine F E %xdefine E D %xdefine D C %xdefine C B %xdefine B A %xdefine A TMP_ %endm ;; CH(A, B, C) = (A&B) ^ (~A&C) ;; MAJ(E, F, G) = (E&F) ^ (E&G) ^ (F&G) ;; SIGMA0 = ROR_28 ^ ROR_34 ^ ROR_39 ;; SIGMA1 = ROR_14 ^ ROR_18 ^ ROR_41 ;; sigma0 = ROR_1 ^ ROR_8 ^ SHR_7 ;; sigma1 = ROR_19 ^ ROR_61 ^ SHR_6 ;; Main processing loop per round ;; equivalent to %macro ROUND_00_15 2 %macro PROCESS_LOOP 2 %define %%WT %1 %define %%ROUND %2 ;; T1 = H + BIG_SIGMA_1(E) + CH(E, F, G) + Kt + Wt ;; T2 = BIG_SIGMA_0(A) + MAJ(A, B, C) ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2 ;; H becomes T2, then add T1 for A ;; D becomes D + T1 for E vpaddq T1, H, TMP3 ; T1 = H + Kt vmovdqa32 TMP0, E ;; compute BIG_SIGMA_1(E) vprorq TMP1, E, BIG_SIGMA_1_0 ; ROR_14(E) vprorq TMP2, E, BIG_SIGMA_1_1 ; ROR_18(E) vprorq TMP3, E, BIG_SIGMA_1_2 ; ROR_41(E) vpternlogq TMP1, TMP2, TMP3, 0x96 ; TMP1 = BIG_SIGMA_1(E) vpternlogq TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G) vpaddq T1, T1, %%WT ; T1 = T1 + Wt vpaddq T1, T1, TMP0 ; T1 = T1 + CH(E,F,G) vpaddq T1, T1, TMP1 ; T1 = T1 + BIG_SIGMA_1(E) vpaddq D, D, T1 ; D = D + T1 vprorq H, A, BIG_SIGMA_0_0 ;ROR_28(A) vprorq TMP2, A, BIG_SIGMA_0_1 ;ROR_34(A) vprorq TMP3, A, BIG_SIGMA_0_2 ;ROR_39(A) vmovdqa32 TMP0, A vpternlogq TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C) vpternlogq H, TMP2, TMP3, 0x96 ; H(T2) = BIG_SIGMA_0(A) vpaddq H, H, TMP0 ; H(T2) = BIG_SIGMA_0(A) + MAJ(A,B,C) vpaddq H, H, 
T1 ; H(A) = H(T2) + T1 vmovdqa32 TMP3, [TBL + ((%%ROUND+1)*64)] ; Next Kt ;; Rotate the args A-H (rotation of names associated with regs) ROTATE_ARGS %endmacro %macro MSG_SCHED_ROUND_16_79 4 %define %%WT %1 %define %%WTp1 %2 %define %%WTp9 %3 %define %%WTp14 %4 vprorq TMP4, %%WTp14, SMALL_SIGMA_1_0 ; ROR_19(Wt-2) vprorq TMP5, %%WTp14, SMALL_SIGMA_1_1 ; ROR_61(Wt-2) vpsrlq TMP6, %%WTp14, SMALL_SIGMA_1_2 ; SHR_6(Wt-2) vpternlogq TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma_1(Wt-2) vpaddq %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma_1(Wt-2) vpaddq %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma_1(Wt-2) + Wt-7 vprorq TMP4, %%WTp1, SMALL_SIGMA_0_0 ; ROR_1(Wt-15) vprorq TMP5, %%WTp1, SMALL_SIGMA_0_1 ; ROR_8(Wt-15) vpsrlq TMP6, %%WTp1, SMALL_SIGMA_0_2 ; SHR_7(Wt-15) vpternlogq TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma_0(Wt-15) vpaddq %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma_1(Wt-2) + ; Wt-7 + sigma_0(Wt-15) + %endmacro section .data default rel align 64 ; 80 constants for SHA512 ; replicating for each lane, thus 8*80 ; to aid in SIMD .. space tradeoff for time! ; local to asm file, used nowhere else TABLE: dq 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22 dq 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22 dq 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd dq 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc dq 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538 dq 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538 dq 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019 dq 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019 dq 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b dq 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118 dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118 dq 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242 dq 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242 dq 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe dq 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe dq 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c dq 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2 dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2 dq 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f dq 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1 dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1 dq 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235 dq 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235 dq 
0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694 dq 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694 dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2 dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2 dq 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3 dq 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3 dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5 dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5 dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65 dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65 dq 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275 dq 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275 dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483 dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483 dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4 dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4 dq 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5 dq 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5 dq 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab dq 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab dq 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210 dq 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210 dq 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f dq 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4 dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4 dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2 dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2 dq 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725 dq 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725 dq 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f dq 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f dq 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70 dq 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70 dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc dq 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926 dq 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926 dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed dq 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df dq 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df dq 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de dq 0x650a73548baf63de, 
0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8 dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8 dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6 dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6 dq 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b dq 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364 dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364 dq 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001 dq 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001 dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791 dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791 dq 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30 dq 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30 dq 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218 dq 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218 dq 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910 dq 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910 dq 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a dq 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8 dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8 dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8 dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8 dq 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53 dq 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53 dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99 dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99 dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8 dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8 dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63 dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63 dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373 dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373 dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3 dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3 dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc dq 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60 dq 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60 dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 
0x84c87814a1f0ab72, 0x84c87814a1f0ab72 dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72 dq 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec dq 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec dq 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28 dq 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28 dq 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9 dq 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9 dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915 dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915 dq 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b dq 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b dq 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c dq 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c dq 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207 dq 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207 dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178 dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178 dq 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba dq 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6 dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6 dq 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae dq 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae dq 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b dq 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b dq 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84 dq 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84 dq 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493 dq 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493 dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc dq 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c dq 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6 dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6 dq 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a dq 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec dq 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817 dq 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 
0x6c44198c4a475817 align 64 ; this does the big endian to little endian conversion over a quad word .. ZMM ;; shuffle on ZMM is shuffle on 4 XMM size chunks, 128 bits PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x08090a0b0c0d0e0f0001020304050607 dq 0x0001020304050607, 0x08090a0b0c0d0e0f ;ddq 0x18191a1b1c1d1e1f1011121314151617 dq 0x1011121314151617, 0x18191a1b1c1d1e1f ;ddq 0x28292a2b2c2d2e2f2021222324252627 dq 0x2021222324252627, 0x28292a2b2c2d2e2f ;ddq 0x38393a3b3c3d3e3f3031323334353637 dq 0x3031323334353637, 0x38393a3b3c3d3e3f align 64 TRANSPOSE8_PERM_INDEX_1: dq 0x0000000000000000 dq 0x0000000000000001 dq 0x0000000000000008 dq 0x0000000000000009 dq 0x0000000000000004 dq 0x0000000000000005 dq 0x000000000000000C dq 0x000000000000000D TRANSPOSE8_PERM_INDEX_2: dq 0x0000000000000002 dq 0x0000000000000003 dq 0x000000000000000A dq 0x000000000000000B dq 0x0000000000000006 dq 0x0000000000000007 dq 0x000000000000000E dq 0x000000000000000F section .text ;; void sha512_x8_avx512(void *input_data, UINT64 *digest[NUM_LANES], const int size) ;; arg 1 : rcx : pointer to input data ;; arg 2 : rdx : pointer to UINT64 digest[8][num_lanes] ;; arg 3 : size in message block lengths (= 128 bytes) MKGLOBAL(sha512_x8_avx512,function,internal) align 64 sha512_x8_avx512: mov rax, rsp sub rsp, STACK_SPACE and rsp, ~63 ; align stack to multiple of 64 mov [rsp + _RSP], rax ;; Initialize digests ; organized uint64 digest[8][num_lanes]; no transpose required ;; Digest is an array of pointers to digests vmovdqu32 A, [STATE + 0*SHA512_DIGEST_ROW_SIZE] vmovdqu32 B, [STATE + 1*SHA512_DIGEST_ROW_SIZE] vmovdqu32 C, [STATE + 2*SHA512_DIGEST_ROW_SIZE] vmovdqu32 D, [STATE + 3*SHA512_DIGEST_ROW_SIZE] vmovdqu32 E, [STATE + 4*SHA512_DIGEST_ROW_SIZE] vmovdqu32 F, [STATE + 5*SHA512_DIGEST_ROW_SIZE] vmovdqu32 G, [STATE + 6*SHA512_DIGEST_ROW_SIZE] vmovdqu32 H, [STATE + 7*SHA512_DIGEST_ROW_SIZE] lea TBL,[rel TABLE] xor IDX, IDX ;; Read in input data address, saving them in registers because ;; they will serve as variables, which we shall keep incrementing mov inp0, [STATE + _data_ptr_sha512 + 0*PTR_SZ] mov inp1, [STATE + _data_ptr_sha512 + 1*PTR_SZ] mov inp2, [STATE + _data_ptr_sha512 + 2*PTR_SZ] mov inp3, [STATE + _data_ptr_sha512 + 3*PTR_SZ] mov inp4, [STATE + _data_ptr_sha512 + 4*PTR_SZ] mov inp5, [STATE + _data_ptr_sha512 + 5*PTR_SZ] mov inp6, [STATE + _data_ptr_sha512 + 6*PTR_SZ] mov inp7, [STATE + _data_ptr_sha512 + 7*PTR_SZ] jmp lloop align 32 lloop: ;; first half of 1024 (need to transpose before use) vmovups W0, [inp0 + IDX] vmovups W1, [inp1 + IDX] vmovups W2, [inp2 + IDX] vmovups W3, [inp3 + IDX] vmovups W4, [inp4 + IDX] vmovups W5, [inp5 + IDX] vmovups W6, [inp6 + IDX] vmovups TMP0,[inp7 + IDX] TRANSPOSE8 W0, W1, W2, W3, W4, W5, W6, TMP0, W7, TMP1, TMP2, TMP3 ;; second half of 1024 (need to transpose before use) vmovups W8, [inp0 + SZ8 + IDX] vmovups W9, [inp1 + SZ8 + IDX] vmovups W10, [inp2 + SZ8 + IDX] vmovups W11, [inp3 + SZ8 + IDX] vmovups W12, [inp4 + SZ8 + IDX] vmovups W13, [inp5 + SZ8 + IDX] vmovups W14, [inp6 + SZ8 + IDX] vmovups TMP0,[inp7 + SZ8 + IDX] TRANSPOSE8 W8, W9, W10, W11, W12, W13, W14, TMP0, W15, TMP1, TMP2, TMP3 vmovdqa32 TMP2, [rel PSHUFFLE_BYTE_FLIP_MASK] vmovdqa32 TMP3, [TBL] ; First K ; Save digests for later addition vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G vmovdqa32 [rsp 
+ _DIGEST_SAVE + 64*7], H add IDX, 128 ; increment by message block length in bytes %assign I 0 %rep 16 ;;; little endian to big endian vpshufb APPEND(W,I), APPEND(W,I), TMP2 %assign I (I+1) %endrep ; MSG Schedule for W0-W15 is now complete in registers ; Process first (max-rounds -16) ; Calculate next Wt+16 after processing is complete and Wt is unneeded ; PROCESS_LOOP_00_79 APPEND(W,J), I, APPEND(W,K), APPEND(W,L), APPEND(W,M) %assign I 0 %assign J 0 %assign K 1 %assign L 9 %assign M 14 %rep SHA_ROUNDS_LESS_16 PROCESS_LOOP APPEND(W,J), I MSG_SCHED_ROUND_16_79 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M) %assign I (I+1) %assign J ((J+1)% 16) %assign K ((K+1)% 16) %assign L ((L+1)% 16) %assign M ((M+1)% 16) %endrep ; Check is this is the last block sub INP_SIZE, 1 je lastLoop ; Process last 16 rounds ; Read in next block msg data for use in first 16 words of msg sched %assign I SHA_ROUNDS_LESS_16 %assign J 0 %rep 16 PROCESS_LOOP APPEND(W,J), I %assign I (I+1) %assign J (J+1) %endrep ; Add old digest vpaddq A, A, [rsp + _DIGEST_SAVE + 64*0] vpaddq B, B, [rsp + _DIGEST_SAVE + 64*1] vpaddq C, C, [rsp + _DIGEST_SAVE + 64*2] vpaddq D, D, [rsp + _DIGEST_SAVE + 64*3] vpaddq E, E, [rsp + _DIGEST_SAVE + 64*4] vpaddq F, F, [rsp + _DIGEST_SAVE + 64*5] vpaddq G, G, [rsp + _DIGEST_SAVE + 64*6] vpaddq H, H, [rsp + _DIGEST_SAVE + 64*7] jmp lloop align 32 lastLoop: ; Process last 16 rounds %assign I SHA_ROUNDS_LESS_16 %assign J 0 %rep 16 PROCESS_LOOP APPEND(W,J), I %assign I (I+1) %assign J (J+1) %endrep ; Add old digest vpaddq A, A, [rsp + _DIGEST_SAVE + 64*0] vpaddq B, B, [rsp + _DIGEST_SAVE + 64*1] vpaddq C, C, [rsp + _DIGEST_SAVE + 64*2] vpaddq D, D, [rsp + _DIGEST_SAVE + 64*3] vpaddq E, E, [rsp + _DIGEST_SAVE + 64*4] vpaddq F, F, [rsp + _DIGEST_SAVE + 64*5] vpaddq G, G, [rsp + _DIGEST_SAVE + 64*6] vpaddq H, H, [rsp + _DIGEST_SAVE + 64*7] ; Write out digest ;; results in A, B, C, D, E, F, G, H vmovdqu32 [STATE + 0*SHA512_DIGEST_ROW_SIZE], A vmovdqu32 [STATE + 1*SHA512_DIGEST_ROW_SIZE], B vmovdqu32 [STATE + 2*SHA512_DIGEST_ROW_SIZE], C vmovdqu32 [STATE + 3*SHA512_DIGEST_ROW_SIZE], D vmovdqu32 [STATE + 4*SHA512_DIGEST_ROW_SIZE], E vmovdqu32 [STATE + 5*SHA512_DIGEST_ROW_SIZE], F vmovdqu32 [STATE + 6*SHA512_DIGEST_ROW_SIZE], G vmovdqu32 [STATE + 7*SHA512_DIGEST_ROW_SIZE], H ; update input pointers %assign I 0 %rep 8 add [STATE + _data_ptr_sha512 + I*PTR_SZ], IDX %assign I (I+1) %endrep mov rsp, [rsp + _RSP] ;hash_done: ret intel-ipsec-mb-0.48/constants.asm000066400000000000000000000060271321406316400170000ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. 
;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ;;; Generic constants %define PTR_SZ 8 ;;; hash constants %define MD5_DIGEST_WORD_SIZE 4 %define SHA1_DIGEST_WORD_SIZE 4 %define SHA256_DIGEST_WORD_SIZE 4 %define SHA512_DIGEST_WORD_SIZE 8 ;; AVX512 constants %define MAX_MD5_LANES 32 %define MAX_SHA1_LANES 16 %define MAX_SHA256_LANES 16 %define MAX_SHA512_LANES 8 %define NUM_MD5_DIGEST_WORDS 4 %define NUM_SHA1_DIGEST_WORDS 5 %define NUM_SHA256_DIGEST_WORDS 8 %define NUM_SHA512_DIGEST_WORDS 8 %define MD5_DIGEST_ROW_SIZE (MAX_MD5_LANES * MD5_DIGEST_WORD_SIZE) %define SHA1_DIGEST_ROW_SIZE (MAX_SHA1_LANES * SHA1_DIGEST_WORD_SIZE) %define SHA256_DIGEST_ROW_SIZE (MAX_SHA256_LANES * SHA256_DIGEST_WORD_SIZE) %define SHA512_DIGEST_ROW_SIZE (MAX_SHA512_LANES * SHA512_DIGEST_WORD_SIZE) %define MD5_DIGEST_SIZE (MD5_DIGEST_ROW_SIZE * NUM_MD5_DIGEST_WORDS) %define SHA1_DIGEST_SIZE (SHA1_DIGEST_ROW_SIZE * NUM_SHA1_DIGEST_WORDS) %define SHA256_DIGEST_SIZE (SHA256_DIGEST_ROW_SIZE * NUM_SHA256_DIGEST_WORDS) %define SHA512_DIGEST_SIZE (SHA512_DIGEST_ROW_SIZE * NUM_SHA512_DIGEST_WORDS) ;; Used only by SHA-NI implementations ;; Sanity checks to fail build if not satisfied %define SHA1NI_DIGEST_ROW_SIZE (NUM_SHA1_DIGEST_WORDS * SHA1_DIGEST_WORD_SIZE) %define SHA256NI_DIGEST_ROW_SIZE (NUM_SHA256_DIGEST_WORDS * SHA256_DIGEST_WORD_SIZE) %define MD5_BLK_SZ 128 ; in bytes %define SHA1_BLK_SZ 64 ; in bytes %define SHA256_BLK_SZ 64 ; in bytes %define SHA512_BLK_SZ 128 ; in bytes intel-ipsec-mb-0.48/constants.h000066400000000000000000000072751321406316400164550ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2012-2017, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #ifndef _CONSTANTS_H_ #define _CONSTANTS_H_ /* define SHA1 constants */ #define H0 0x67452301 #define H1 0xefcdab89 #define H2 0x98badcfe #define H3 0x10325476 #define H4 0xc3d2e1f0 /* define SHA256 constants */ #define SHA256_H0 0x6a09e667 #define SHA256_H1 0xbb67ae85 #define SHA256_H2 0x3c6ef372 #define SHA256_H3 0xa54ff53a #define SHA256_H4 0x510e527f #define SHA256_H5 0x9b05688c #define SHA256_H6 0x1f83d9ab #define SHA256_H7 0x5be0cd19 /* define SHA224 constants */ #define SHA224_H0 0xc1059ed8 #define SHA224_H1 0x367cd507 #define SHA224_H2 0x3070dd17 #define SHA224_H3 0xf70e5939 #define SHA224_H4 0xffc00b31 #define SHA224_H5 0x68581511 #define SHA224_H6 0x64f98fa7 #define SHA224_H7 0xbefa4fa4 /* define SHA512 constants */ #define SHA512_H0 0x6a09e667f3bcc908 #define SHA512_H1 0xbb67ae8584caa73b #define SHA512_H2 0x3c6ef372fe94f82b #define SHA512_H3 0xa54ff53a5f1d36f1 #define SHA512_H4 0x510e527fade682d1 #define SHA512_H5 0x9b05688c2b3e6c1f #define SHA512_H6 0x1f83d9abfb41bd6b #define SHA512_H7 0x5be0cd19137e2179 /* define SHA384 constants */ #define SHA384_H0 0xcbbb9d5dc1059ed8 #define SHA384_H1 0x629a292a367cd507 #define SHA384_H2 0x9159015a3070dd17 #define SHA384_H3 0x152fecd8f70e5939 #define SHA384_H4 0x67332667ffc00b31 #define SHA384_H5 0x8eb44a8768581511 #define SHA384_H6 0xdb0c2e0d64f98fa7 #define SHA384_H7 0x47b5481dbefa4fa4 #define NUM_MD5_DIGEST_WORDS 4 #define NUM_SHA_DIGEST_WORDS 5 #define NUM_SHA_256_DIGEST_WORDS 8 #define NUM_SHA_224_DIGEST_WORDS 7 #define NUM_SHA_512_DIGEST_WORDS 8 #define NUM_SHA_384_DIGEST_WORDS 6 #define SHA384_DIGEST_WORD_SIZE 8 #define SHA512_DIGEST_WORD_SIZE 8 #define SHA384_DIGEST_SIZE_IN_BYTES \ (NUM_SHA_384_DIGEST_WORDS * SHA384_DIGEST_WORD_SIZE) #define SHA512_DIGEST_SIZE_IN_BYTES \ (NUM_SHA_512_DIGEST_WORDS * SHA512_DIGEST_WORD_SIZE) #define SHA1_BLOCK_SIZE 64 /* 512 bits is 64 byte blocks */ #define SHA_256_BLOCK_SIZE 64 /* 512 bits is 64 byte blocks */ #define SHA_384_BLOCK_SIZE 128 #define SHA_512_BLOCK_SIZE 128 #endif /* _CONSTANTS_H_ */ intel-ipsec-mb-0.48/des.h000066400000000000000000000075451321406316400152140ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2017, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #ifndef IMB_DES_H #define IMB_DES_H #include #define DES_KEY_SCHED_SIZE (16 * 8) /* 16 rounds x 8 bytes */ #define DES_BLOCK_SIZE 8 /** * @brief DES CBC encryption * * @param input source buffer with plain text * @param output destination buffer for cipher text * @param size number of bytes to encrypt (multiple of 8) * @param ks pointer to key schedule structure * @param ivec pointer to initialization vector */ void des_enc_cbc_basic(const void *input, void *output, const int size, const uint64_t *ks, const uint64_t *ivec); /** * @brief DES CBC decryption * * @param input source buffer with cipher text * @param output destination buffer for plain text * @param size number of bytes to decrypt (multiple of 8) * @param ks pointer to key schedule structure * @param ivec pointer to initialization vector */ void des_dec_cbc_basic(const void *input, void *output, const int size, const uint64_t *ks, const uint64_t *ivec); /** * @brief DOCSIS DES encryption * * @param input source buffer with plain text * @param output destination buffer for cipher text * @param size number of bytes to encrypt * @param ks pointer to key schedule structure * @param ivec pointer to initialization vector */ void docsis_des_enc_basic(const void *input, void *output, const int size, const uint64_t *ks, const uint64_t *ivec); /** * @brief DOCSIS DES decryption * * @param input source buffer with cipher text * @param output destination buffer for plain text * @param size number of bytes to decrypt * @param ks pointer to key schedule structure * @param ivec pointer to initialization vector */ void docsis_des_dec_basic(const void *input, void *output, const int size, const uint64_t *ks, const uint64_t *ivec); /** * @brief DES key schedule set up * * \a ks buffer needs to accomodate \a DES_KEY_SCHED_SIZE (128) bytes of data. * * @param ks destination buffer to accomodate DES key schedule * @param key a pointer to an 8 byte DES key * * @return Operation status * @retval 0 success * @retval !0 error */ int des_key_schedule(uint64_t *ks, const void *key); #endif /* IMB_DES_H */ intel-ipsec-mb-0.48/des_basic.c000066400000000000000000000576671321406316400163620ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2017, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ /* basic DES implementation */ #include #include #include "des.h" #include "des_utils.h" #include "os.h" __forceinline void permute_operation(uint32_t *pa, uint32_t *pb, const uint32_t n, const uint32_t m) { register uint32_t t = (*pb ^ (*pa >> n)) & m; *pb ^= t; *pa ^= (t << n); } /* inital permutation */ __forceinline void ip_z(uint32_t *pl, uint32_t *pr) { permute_operation(pr, pl, 4, 0x0f0f0f0f); permute_operation(pl, pr, 16, 0x0000ffff); permute_operation(pr, pl, 2, 0x33333333); permute_operation(pl, pr, 8, 0x00ff00ff); permute_operation(pr, pl, 1, 0x55555555); } /* final permuation */ __forceinline void fp_z(uint32_t *pl, uint32_t *pr) { permute_operation(pl, pr, 1, 0x55555555); permute_operation(pr, pl, 8, 0x00ff00ff); permute_operation(pl, pr, 2, 0x33333333); permute_operation(pr, pl, 16, 0x0000ffff); permute_operation(pl, pr, 4, 0x0f0f0f0f); } /* 1st part of DES round * - permutes and exands R(32 bits) into 48 bits */ __forceinline uint64_t e_phase(const uint64_t R) { /* E phase as in FIPS46-3 and also 8x6 to 8x8 expansion. * * Bit selection table for this operation looks as follows: * 32, 1, 2, 3, 4, 5, X, X, * 4, 5, 6, 7, 8, 9, X, X, * 8, 9, 10, 11, 12, 13, X, X, * 12, 13, 14, 15, 16, 17, X, X, * 16, 17, 18, 19, 20, 21, X, X, * 20, 21, 22, 23, 24, 25, X, X, * 24, 25, 26, 27, 28, 29, X, X, * 28, 29, 30, 31, 32, 1, X, X * where 'X' is bit value 0. 
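 * The expression below implements this table with shifts and masks:
 * each selected 6-bit group lands in the low six bits of its output
 * byte and the top two bits of every byte stay zero, so fRK() can
 * index the 64-entry s-box tables with a plain byte shift and a
 * 0x3f mask.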
*/ return ((R << 1) & UINT64_C(0x3e)) | ((R >> 31) & UINT64_C(1)) | ((R << 5) & UINT64_C(0x3f00)) | ((R << 9) & UINT64_C(0x3f0000)) | ((R << 13) & UINT64_C(0x3f000000)) | ((R << 17) & UINT64_C(0x3f00000000)) | ((R << 21) & UINT64_C(0x3f0000000000)) | ((R << 25) & UINT64_C(0x3f000000000000)) | ((R << 29) & UINT64_C(0x1f00000000000000)) | ((R & UINT64_C(1)) << 61); } static const uint32_t sbox0p[64] = { UINT32_C(0x00410100), UINT32_C(0x00010000), UINT32_C(0x40400000), UINT32_C(0x40410100), UINT32_C(0x00400000), UINT32_C(0x40010100), UINT32_C(0x40010000), UINT32_C(0x40400000), UINT32_C(0x40010100), UINT32_C(0x00410100), UINT32_C(0x00410000), UINT32_C(0x40000100), UINT32_C(0x40400100), UINT32_C(0x00400000), UINT32_C(0x00000000), UINT32_C(0x40010000), UINT32_C(0x00010000), UINT32_C(0x40000000), UINT32_C(0x00400100), UINT32_C(0x00010100), UINT32_C(0x40410100), UINT32_C(0x00410000), UINT32_C(0x40000100), UINT32_C(0x00400100), UINT32_C(0x40000000), UINT32_C(0x00000100), UINT32_C(0x00010100), UINT32_C(0x40410000), UINT32_C(0x00000100), UINT32_C(0x40400100), UINT32_C(0x40410000), UINT32_C(0x00000000), UINT32_C(0x00000000), UINT32_C(0x40410100), UINT32_C(0x00400100), UINT32_C(0x40010000), UINT32_C(0x00410100), UINT32_C(0x00010000), UINT32_C(0x40000100), UINT32_C(0x00400100), UINT32_C(0x40410000), UINT32_C(0x00000100), UINT32_C(0x00010100), UINT32_C(0x40400000), UINT32_C(0x40010100), UINT32_C(0x40000000), UINT32_C(0x40400000), UINT32_C(0x00410000), UINT32_C(0x40410100), UINT32_C(0x00010100), UINT32_C(0x00410000), UINT32_C(0x40400100), UINT32_C(0x00400000), UINT32_C(0x40000100), UINT32_C(0x40010000), UINT32_C(0x00000000), UINT32_C(0x00010000), UINT32_C(0x00400000), UINT32_C(0x40400100), UINT32_C(0x00410100), UINT32_C(0x40000000), UINT32_C(0x40410000), UINT32_C(0x00000100), UINT32_C(0x40010100) }; static const uint32_t sbox1p[64] = { UINT32_C(0x08021002), UINT32_C(0x00000000), UINT32_C(0x00021000), UINT32_C(0x08020000), UINT32_C(0x08000002), UINT32_C(0x00001002), UINT32_C(0x08001000), UINT32_C(0x00021000), UINT32_C(0x00001000), UINT32_C(0x08020002), UINT32_C(0x00000002), UINT32_C(0x08001000), UINT32_C(0x00020002), UINT32_C(0x08021000), UINT32_C(0x08020000), UINT32_C(0x00000002), UINT32_C(0x00020000), UINT32_C(0x08001002), UINT32_C(0x08020002), UINT32_C(0x00001000), UINT32_C(0x00021002), UINT32_C(0x08000000), UINT32_C(0x00000000), UINT32_C(0x00020002), UINT32_C(0x08001002), UINT32_C(0x00021002), UINT32_C(0x08021000), UINT32_C(0x08000002), UINT32_C(0x08000000), UINT32_C(0x00020000), UINT32_C(0x00001002), UINT32_C(0x08021002), UINT32_C(0x00020002), UINT32_C(0x08021000), UINT32_C(0x08001000), UINT32_C(0x00021002), UINT32_C(0x08021002), UINT32_C(0x00020002), UINT32_C(0x08000002), UINT32_C(0x00000000), UINT32_C(0x08000000), UINT32_C(0x00001002), UINT32_C(0x00020000), UINT32_C(0x08020002), UINT32_C(0x00001000), UINT32_C(0x08000000), UINT32_C(0x00021002), UINT32_C(0x08001002), UINT32_C(0x08021000), UINT32_C(0x00001000), UINT32_C(0x00000000), UINT32_C(0x08000002), UINT32_C(0x00000002), UINT32_C(0x08021002), UINT32_C(0x00021000), UINT32_C(0x08020000), UINT32_C(0x08020002), UINT32_C(0x00020000), UINT32_C(0x00001002), UINT32_C(0x08001000), UINT32_C(0x08001002), UINT32_C(0x00000002), UINT32_C(0x08020000), UINT32_C(0x00021000) }; static const uint32_t sbox2p[64] = { UINT32_C(0x20800000), UINT32_C(0x00808020), UINT32_C(0x00000020), UINT32_C(0x20800020), UINT32_C(0x20008000), UINT32_C(0x00800000), UINT32_C(0x20800020), UINT32_C(0x00008020), UINT32_C(0x00800020), UINT32_C(0x00008000), UINT32_C(0x00808000), 
UINT32_C(0x20000000), UINT32_C(0x20808020), UINT32_C(0x20000020), UINT32_C(0x20000000), UINT32_C(0x20808000), UINT32_C(0x00000000), UINT32_C(0x20008000), UINT32_C(0x00808020), UINT32_C(0x00000020), UINT32_C(0x20000020), UINT32_C(0x20808020), UINT32_C(0x00008000), UINT32_C(0x20800000), UINT32_C(0x20808000), UINT32_C(0x00800020), UINT32_C(0x20008020), UINT32_C(0x00808000), UINT32_C(0x00008020), UINT32_C(0x00000000), UINT32_C(0x00800000), UINT32_C(0x20008020), UINT32_C(0x00808020), UINT32_C(0x00000020), UINT32_C(0x20000000), UINT32_C(0x00008000), UINT32_C(0x20000020), UINT32_C(0x20008000), UINT32_C(0x00808000), UINT32_C(0x20800020), UINT32_C(0x00000000), UINT32_C(0x00808020), UINT32_C(0x00008020), UINT32_C(0x20808000), UINT32_C(0x20008000), UINT32_C(0x00800000), UINT32_C(0x20808020), UINT32_C(0x20000000), UINT32_C(0x20008020), UINT32_C(0x20800000), UINT32_C(0x00800000), UINT32_C(0x20808020), UINT32_C(0x00008000), UINT32_C(0x00800020), UINT32_C(0x20800020), UINT32_C(0x00008020), UINT32_C(0x00800020), UINT32_C(0x00000000), UINT32_C(0x20808000), UINT32_C(0x20000020), UINT32_C(0x20800000), UINT32_C(0x20008020), UINT32_C(0x00000020), UINT32_C(0x00808000) }; static const uint32_t sbox3p[64] = { UINT32_C(0x00080201), UINT32_C(0x02000200), UINT32_C(0x00000001), UINT32_C(0x02080201), UINT32_C(0x00000000), UINT32_C(0x02080000), UINT32_C(0x02000201), UINT32_C(0x00080001), UINT32_C(0x02080200), UINT32_C(0x02000001), UINT32_C(0x02000000), UINT32_C(0x00000201), UINT32_C(0x02000001), UINT32_C(0x00080201), UINT32_C(0x00080000), UINT32_C(0x02000000), UINT32_C(0x02080001), UINT32_C(0x00080200), UINT32_C(0x00000200), UINT32_C(0x00000001), UINT32_C(0x00080200), UINT32_C(0x02000201), UINT32_C(0x02080000), UINT32_C(0x00000200), UINT32_C(0x00000201), UINT32_C(0x00000000), UINT32_C(0x00080001), UINT32_C(0x02080200), UINT32_C(0x02000200), UINT32_C(0x02080001), UINT32_C(0x02080201), UINT32_C(0x00080000), UINT32_C(0x02080001), UINT32_C(0x00000201), UINT32_C(0x00080000), UINT32_C(0x02000001), UINT32_C(0x00080200), UINT32_C(0x02000200), UINT32_C(0x00000001), UINT32_C(0x02080000), UINT32_C(0x02000201), UINT32_C(0x00000000), UINT32_C(0x00000200), UINT32_C(0x00080001), UINT32_C(0x00000000), UINT32_C(0x02080001), UINT32_C(0x02080200), UINT32_C(0x00000200), UINT32_C(0x02000000), UINT32_C(0x02080201), UINT32_C(0x00080201), UINT32_C(0x00080000), UINT32_C(0x02080201), UINT32_C(0x00000001), UINT32_C(0x02000200), UINT32_C(0x00080201), UINT32_C(0x00080001), UINT32_C(0x00080200), UINT32_C(0x02080000), UINT32_C(0x02000201), UINT32_C(0x00000201), UINT32_C(0x02000000), UINT32_C(0x02000001), UINT32_C(0x02080200) }; static const uint32_t sbox4p[64] = { UINT32_C(0x01000000), UINT32_C(0x00002000), UINT32_C(0x00000080), UINT32_C(0x01002084), UINT32_C(0x01002004), UINT32_C(0x01000080), UINT32_C(0x00002084), UINT32_C(0x01002000), UINT32_C(0x00002000), UINT32_C(0x00000004), UINT32_C(0x01000004), UINT32_C(0x00002080), UINT32_C(0x01000084), UINT32_C(0x01002004), UINT32_C(0x01002080), UINT32_C(0x00000000), UINT32_C(0x00002080), UINT32_C(0x01000000), UINT32_C(0x00002004), UINT32_C(0x00000084), UINT32_C(0x01000080), UINT32_C(0x00002084), UINT32_C(0x00000000), UINT32_C(0x01000004), UINT32_C(0x00000004), UINT32_C(0x01000084), UINT32_C(0x01002084), UINT32_C(0x00002004), UINT32_C(0x01002000), UINT32_C(0x00000080), UINT32_C(0x00000084), UINT32_C(0x01002080), UINT32_C(0x01002080), UINT32_C(0x01000084), UINT32_C(0x00002004), UINT32_C(0x01002000), UINT32_C(0x00002000), UINT32_C(0x00000004), UINT32_C(0x01000004), UINT32_C(0x01000080), UINT32_C(0x01000000), 
UINT32_C(0x00002080), UINT32_C(0x01002084), UINT32_C(0x00000000), UINT32_C(0x00002084), UINT32_C(0x01000000), UINT32_C(0x00000080), UINT32_C(0x00002004), UINT32_C(0x01000084), UINT32_C(0x00000080), UINT32_C(0x00000000), UINT32_C(0x01002084), UINT32_C(0x01002004), UINT32_C(0x01002080), UINT32_C(0x00000084), UINT32_C(0x00002000), UINT32_C(0x00002080), UINT32_C(0x01002004), UINT32_C(0x01000080), UINT32_C(0x00000084), UINT32_C(0x00000004), UINT32_C(0x00002084), UINT32_C(0x01002000), UINT32_C(0x01000004) }; const uint32_t sbox5p[64] = { UINT32_C(0x10000008), UINT32_C(0x00040008), UINT32_C(0x00000000), UINT32_C(0x10040400), UINT32_C(0x00040008), UINT32_C(0x00000400), UINT32_C(0x10000408), UINT32_C(0x00040000), UINT32_C(0x00000408), UINT32_C(0x10040408), UINT32_C(0x00040400), UINT32_C(0x10000000), UINT32_C(0x10000400), UINT32_C(0x10000008), UINT32_C(0x10040000), UINT32_C(0x00040408), UINT32_C(0x00040000), UINT32_C(0x10000408), UINT32_C(0x10040008), UINT32_C(0x00000000), UINT32_C(0x00000400), UINT32_C(0x00000008), UINT32_C(0x10040400), UINT32_C(0x10040008), UINT32_C(0x10040408), UINT32_C(0x10040000), UINT32_C(0x10000000), UINT32_C(0x00000408), UINT32_C(0x00000008), UINT32_C(0x00040400), UINT32_C(0x00040408), UINT32_C(0x10000400), UINT32_C(0x00000408), UINT32_C(0x10000000), UINT32_C(0x10000400), UINT32_C(0x00040408), UINT32_C(0x10040400), UINT32_C(0x00040008), UINT32_C(0x00000000), UINT32_C(0x10000400), UINT32_C(0x10000000), UINT32_C(0x00000400), UINT32_C(0x10040008), UINT32_C(0x00040000), UINT32_C(0x00040008), UINT32_C(0x10040408), UINT32_C(0x00040400), UINT32_C(0x00000008), UINT32_C(0x10040408), UINT32_C(0x00040400), UINT32_C(0x00040000), UINT32_C(0x10000408), UINT32_C(0x10000008), UINT32_C(0x10040000), UINT32_C(0x00040408), UINT32_C(0x00000000), UINT32_C(0x00000400), UINT32_C(0x10000008), UINT32_C(0x10000408), UINT32_C(0x10040400), UINT32_C(0x10040000), UINT32_C(0x00000408), UINT32_C(0x00000008), UINT32_C(0x10040008) }; static const uint32_t sbox6p[64] = { UINT32_C(0x00000800), UINT32_C(0x00000040), UINT32_C(0x00200040), UINT32_C(0x80200000), UINT32_C(0x80200840), UINT32_C(0x80000800), UINT32_C(0x00000840), UINT32_C(0x00000000), UINT32_C(0x00200000), UINT32_C(0x80200040), UINT32_C(0x80000040), UINT32_C(0x00200800), UINT32_C(0x80000000), UINT32_C(0x00200840), UINT32_C(0x00200800), UINT32_C(0x80000040), UINT32_C(0x80200040), UINT32_C(0x00000800), UINT32_C(0x80000800), UINT32_C(0x80200840), UINT32_C(0x00000000), UINT32_C(0x00200040), UINT32_C(0x80200000), UINT32_C(0x00000840), UINT32_C(0x80200800), UINT32_C(0x80000840), UINT32_C(0x00200840), UINT32_C(0x80000000), UINT32_C(0x80000840), UINT32_C(0x80200800), UINT32_C(0x00000040), UINT32_C(0x00200000), UINT32_C(0x80000840), UINT32_C(0x00200800), UINT32_C(0x80200800), UINT32_C(0x80000040), UINT32_C(0x00000800), UINT32_C(0x00000040), UINT32_C(0x00200000), UINT32_C(0x80200800), UINT32_C(0x80200040), UINT32_C(0x80000840), UINT32_C(0x00000840), UINT32_C(0x00000000), UINT32_C(0x00000040), UINT32_C(0x80200000), UINT32_C(0x80000000), UINT32_C(0x00200040), UINT32_C(0x00000000), UINT32_C(0x80200040), UINT32_C(0x00200040), UINT32_C(0x00000840), UINT32_C(0x80000040), UINT32_C(0x00000800), UINT32_C(0x80200840), UINT32_C(0x00200000), UINT32_C(0x00200840), UINT32_C(0x80000000), UINT32_C(0x80000800), UINT32_C(0x80200840), UINT32_C(0x80200000), UINT32_C(0x00200840), UINT32_C(0x00200800), UINT32_C(0x80000800) }; static const uint32_t sbox7p[64] = { UINT32_C(0x04100010), UINT32_C(0x04104000), UINT32_C(0x00004010), UINT32_C(0x00000000), UINT32_C(0x04004000), 
UINT32_C(0x00100010), UINT32_C(0x04100000), UINT32_C(0x04104010), UINT32_C(0x00000010), UINT32_C(0x04000000), UINT32_C(0x00104000), UINT32_C(0x00004010), UINT32_C(0x00104010), UINT32_C(0x04004010), UINT32_C(0x04000010), UINT32_C(0x04100000), UINT32_C(0x00004000), UINT32_C(0x00104010), UINT32_C(0x00100010), UINT32_C(0x04004000), UINT32_C(0x04104010), UINT32_C(0x04000010), UINT32_C(0x00000000), UINT32_C(0x00104000), UINT32_C(0x04000000), UINT32_C(0x00100000), UINT32_C(0x04004010), UINT32_C(0x04100010), UINT32_C(0x00100000), UINT32_C(0x00004000), UINT32_C(0x04104000), UINT32_C(0x00000010), UINT32_C(0x00100000), UINT32_C(0x00004000), UINT32_C(0x04000010), UINT32_C(0x04104010), UINT32_C(0x00004010), UINT32_C(0x04000000), UINT32_C(0x00000000), UINT32_C(0x00104000), UINT32_C(0x04100010), UINT32_C(0x04004010), UINT32_C(0x04004000), UINT32_C(0x00100010), UINT32_C(0x04104000), UINT32_C(0x00000010), UINT32_C(0x00100010), UINT32_C(0x04004000), UINT32_C(0x04104010), UINT32_C(0x00100000), UINT32_C(0x04100000), UINT32_C(0x04000010), UINT32_C(0x00104000), UINT32_C(0x00004010), UINT32_C(0x04004010), UINT32_C(0x04100000), UINT32_C(0x00000010), UINT32_C(0x04104000), UINT32_C(0x00104010), UINT32_C(0x00000000), UINT32_C(0x04000000), UINT32_C(0x04100010), UINT32_C(0x00004000), UINT32_C(0x00104010) }; __forceinline uint32_t fRK(const uint32_t R, const uint64_t K) { uint64_t x; /* Combined e-phase and 8x6bits to 8x8bits expansion. * 32 bits -> 48 bits permutation */ x = e_phase((uint64_t) R) ^ K; /* Combined s-box and p-phase. * s-box: 48 bits -> 32 bits * p-phase: 32 bits -> 32 bites permutation */ return sbox0p[x & 0x3f] | sbox1p[(x >> (8 * 1)) & 0x3f] | sbox2p[(x >> (8 * 2)) & 0x3f] | sbox3p[(x >> (8 * 3)) & 0x3f] | sbox4p[(x >> (8 * 4)) & 0x3f] | sbox5p[(x >> (8 * 5)) & 0x3f] | sbox6p[(x >> (8 * 6)) & 0x3f] | sbox7p[(x >> (8 * 7)) & 0x3f]; } __forceinline uint64_t enc_dec_1(const uint64_t data, const uint64_t *ks, const int enc) { uint32_t l, r; r = (uint32_t) (data); l = (uint32_t) (data >> 32); ip_z(&r, &l); if (enc) { l ^= fRK(r, ks[0]); r ^= fRK(l, ks[1]); l ^= fRK(r, ks[2]); r ^= fRK(l, ks[3]); l ^= fRK(r, ks[4]); r ^= fRK(l, ks[5]); l ^= fRK(r, ks[6]); r ^= fRK(l, ks[7]); l ^= fRK(r, ks[8]); r ^= fRK(l, ks[9]); l ^= fRK(r, ks[10]); r ^= fRK(l, ks[11]); l ^= fRK(r, ks[12]); r ^= fRK(l, ks[13]); l ^= fRK(r, ks[14]); r ^= fRK(l, ks[15]); } else { l ^= fRK(r, ks[15]); /* l: l0 -> r1/l2 */ r ^= fRK(l, ks[14]); /* r: r0 -> r2 */ l ^= fRK(r, ks[13]); r ^= fRK(l, ks[12]); l ^= fRK(r, ks[11]); r ^= fRK(l, ks[10]); l ^= fRK(r, ks[9]); r ^= fRK(l, ks[8]); l ^= fRK(r, ks[7]); r ^= fRK(l, ks[6]); l ^= fRK(r, ks[5]); r ^= fRK(l, ks[4]); l ^= fRK(r, ks[3]); r ^= fRK(l, ks[2]); l ^= fRK(r, ks[1]); r ^= fRK(l, ks[0]); } fp_z(&r, &l); return ((uint64_t) l) | (((uint64_t) r) << 32); } IMB_DLL_LOCAL void des_enc_cbc_basic(const void *input, void *output, const int size, const uint64_t *ks, const uint64_t *ivec) { const uint64_t *in = input; uint64_t *out = output; const int nblocks = size / 8; int n; uint64_t iv = *ivec; IMB_ASSERT(size >= 0); IMB_ASSERT(input != NULL); IMB_ASSERT(output != NULL); IMB_ASSERT(ks != NULL); IMB_ASSERT(ivec != NULL); for (n = 0; n < nblocks; n++) out[n] = iv = enc_dec_1(in[n] ^ iv, ks, 1 /* encrypt */); /* *ivec = iv; */ iv = 0; } IMB_DLL_LOCAL void des_dec_cbc_basic(const void *input, void *output, const int size, const uint64_t *ks, const uint64_t *ivec) { const uint64_t *in = input; uint64_t *out = output; const int nblocks = size / 8; int n; uint64_t iv = *ivec; IMB_ASSERT(size >= 0); 
IMB_ASSERT(input != NULL); IMB_ASSERT(output != NULL); IMB_ASSERT(ks != NULL); IMB_ASSERT(ivec != NULL); for (n = 0; n < nblocks; n++) { uint64_t in_block = in[n]; out[n] = enc_dec_1(in_block, ks, 0 /* decrypt */) ^ iv; iv = in_block; } /* *ivec = iv; */ iv = 0; } __forceinline void cfb_one_basic(const void *input, void *output, const int size, const uint64_t *ks, const uint64_t *ivec) { uint8_t *out = (uint8_t *) output; const uint8_t *in = (const uint8_t *) input; uint64_t t; IMB_ASSERT(size <= 8 && size >= 0); IMB_ASSERT(input != NULL); IMB_ASSERT(output != NULL); IMB_ASSERT(ks != NULL); IMB_ASSERT(ivec != NULL); t = enc_dec_1(*ivec, ks, 1 /* encrypt */); /* XOR and copy in one go */ if (size & 1) { *out++ = *in++ ^ ((uint8_t) t); t >>= 8; } if (size & 2) { uint16_t *out2 = (uint16_t *) out; const uint16_t *in2 = (const uint16_t *) in; *out2 = *in2 ^ ((uint16_t) t); t >>= 16; out += 2; in += 2; } if (size & 4) { uint32_t *out4 = (uint32_t *) out; const uint32_t *in4 = (const uint32_t *) in; *out4 = *in4 ^ ((uint32_t) t); } } IMB_DLL_LOCAL void docsis_des_enc_basic(const void *input, void *output, const int size, const uint64_t *ks, const uint64_t *ivec) { const uint64_t *in = input; uint64_t *out = output; const int nblocks = size / DES_BLOCK_SIZE; const int partial = size & 7; int n; uint64_t iv = *ivec; IMB_ASSERT(size >= 0); IMB_ASSERT(input != NULL); IMB_ASSERT(output != NULL); IMB_ASSERT(ks != NULL); IMB_ASSERT(ivec != NULL); for (n = 0; n < nblocks; n++) out[n] = iv = enc_dec_1(in[n] ^ iv, ks, 1 /* encrypt */); if (partial) { if (nblocks) cfb_one_basic(&in[nblocks], &out[nblocks], partial, ks, &out[nblocks - 1]); else cfb_one_basic(input, output, partial, ks, ivec); } /* *ivec = iv; */ iv = 0; } IMB_DLL_LOCAL void docsis_des_dec_basic(const void *input, void *output, const int size, const uint64_t *ks, const uint64_t *ivec) { const uint64_t *in = input; uint64_t *out = output; const int nblocks = size / DES_BLOCK_SIZE; const int partial = size & 7; int n; uint64_t iv = *ivec; IMB_ASSERT(size >= 0); IMB_ASSERT(input != NULL); IMB_ASSERT(output != NULL); IMB_ASSERT(ks != NULL); IMB_ASSERT(ivec != NULL); if (partial) { if (!nblocks) { /* first block is the partial one */ cfb_one_basic(input, output, partial, ks, ivec); iv = 0; return; } /* last block is partial */ cfb_one_basic(&in[nblocks], &out[nblocks], partial, ks, &in[nblocks - 1]); } for (n = 0; n < nblocks; n++) { uint64_t in_block = in[n]; out[n] = enc_dec_1(in_block, ks, 0 /* decrypt */) ^ iv; iv = in_block; } /* *ivec = iv; */ iv = 0; } intel-ipsec-mb-0.48/des_key.c000066400000000000000000000120571321406316400160510ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2017, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include #include #include "des.h" #include "des_utils.h" #include "os.h" /** * @brief Rotates 28-bit word * * Roll right of 28-bit word - used in 28-bit subkey operations * * @param val 28-bit word to be rotated * @param nshift number of bits to rotate by * * @return val rotated by nshift bits */ __forceinline uint32_t rotate28(const uint32_t val, const unsigned nshift) { const uint32_t mask = (UINT32_C(1) << 28) - UINT32_C(1); IMB_ASSERT(nshift <= 28); return ((val >> nshift) & mask) | ((val << (28 - nshift)) & mask); } /** * @brief Expands 8 groups of 6bits into 8 groups of 8bits * * @param in a 48-bit word including 8 groups of 6bits * * @return 64-bit word with 8 groups of 8bits */ __forceinline uint64_t expand_8x6_to_8x8(const uint64_t in) { return (((in >> (6 * 0)) & UINT64_C(63)) << (8 * 0)) | (((in >> (6 * 1)) & UINT64_C(63)) << (8 * 1)) | (((in >> (6 * 2)) & UINT64_C(63)) << (8 * 2)) | (((in >> (6 * 3)) & UINT64_C(63)) << (8 * 3)) | (((in >> (6 * 4)) & UINT64_C(63)) << (8 * 4)) | (((in >> (6 * 5)) & UINT64_C(63)) << (8 * 5)) | (((in >> (6 * 6)) & UINT64_C(63)) << (8 * 6)) | (((in >> (6 * 7)) & UINT64_C(63)) << (8 * 7)); } static const uint8_t pc1c_table_fips46_3[28] = { 57, 49, 41, 33, 25, 17, 9, 1, 58, 50, 42, 34, 26, 18, 10, 2, 59, 51, 43, 35, 27, 19, 11, 3, 60, 52, 44, 36 }; static const uint8_t pc1d_table_fips46_3[28] = { 63, 55, 47, 39, 31, 23, 15, 7, 62, 54, 46, 38, 30, 22, 14, 6, 61, 53, 45, 37, 29, 21, 13, 5, 28, 20, 12, 4 }; static const uint8_t pc2_table_fips46_3[48] = { 14, 17, 11, 24, 1, 5, 3, 28, 15, 6, 21, 10, 23, 19, 12, 4, 26, 8, 16, 7, 27, 20, 13, 2, 41, 52, 31, 37, 47, 55, 30, 40, 51, 45, 33, 48, 44, 49, 39, 56, 34, 53, 46, 42, 50, 36, 29, 32 }; static const uint8_t shift_tab_fips46_3[16] = { 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1 }; int des_key_schedule(uint64_t *ks, const void *key) { uint64_t c, d; uint64_t t = 0; int n; if (key == NULL || ks == NULL) return -1; /* KEY: 56 bits but spread across 64 bits * - MSB per byte used for parity * - load_and_convert loads the key and swaps bits in bytes * so that bit numbers are more suitable for LE machine and * FIPS46-3 DES tables */ t = load64_reflect(key); /* PC1 * - built from the KEY, PC1 permute tables skip KEY parity bits * - c & d are both 28 bits */ c = permute_64b(t, pc1c_table_fips46_3, IMB_DIM(pc1c_table_fips46_3)); d = permute_64b(t, pc1d_table_fips46_3, IMB_DIM(pc1d_table_fips46_3)); /* KS rounds */ for (n = 0; n < 16; n++) { c = rotate28((uint32_t)c, (unsigned) shift_tab_fips46_3[n]); d = rotate28((uint32_t)d, (unsigned) shift_tab_fips46_3[n]); /* PC2 */ t = permute_64b(c | (d << 28), pc2_table_fips46_3, IMB_DIM(pc2_table_fips46_3)); /* store KS as 6 bits per byte and keep LE 
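 * (the 48-bit PC2 output is stored as eight bytes holding one 6-bit
 * group each, the same layout e_phase() produces, so fRK() can XOR
 * the round key straight into the expanded R word before the s-box
 * lookups)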
*/ ks[n] = expand_8x6_to_8x8(t); } return 0; } intel-ipsec-mb-0.48/gcm_defines.asm000066400000000000000000000203101321406316400172160ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ; ; Authors: ; Erdinc Ozturk ; Vinodh Gopal ; James Guilford section .data align 16 POLY dq 0x0000000000000001, 0xC200000000000000 POLY2 dq 0x00000001C2000000, 0xC200000000000000 TWOONE dq 0x0000000000000001, 0x0000000100000000 ; order of these constants should not change. 
; more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F SHUF_MASK dq 0x08090A0B0C0D0E0F, 0x0001020304050607 SHIFT_MASK dq 0x0706050403020100, 0x0f0e0d0c0b0a0908 ALL_F dq 0xffffffffffffffff, 0xffffffffffffffff ZERO dq 0x0000000000000000, 0x0000000000000000 ONE dq 0x0000000000000001, 0x0000000000000000 TWO dq 0x0000000000000002, 0x0000000000000000 ONEf dq 0x0000000000000000, 0x0100000000000000 TWOf dq 0x0000000000000000, 0x0200000000000000 section .text ;;define the fields of gcm_key_data struct ;; struct gcm_key_data { ;; uint8_t expanded_keys[GCM_ENC_KEY_LEN * GCM_KEY_SETS]; ;; uint8_t shifted_hkey_1[GCM_ENC_KEY_LEN]; // store HashKey <<1 mod poly here ;; uint8_t shifted_hkey_2[GCM_ENC_KEY_LEN]; // store HashKey^2 <<1 mod poly here ;; uint8_t shifted_hkey_3[GCM_ENC_KEY_LEN]; // store HashKey^3 <<1 mod poly here ;; uint8_t shifted_hkey_4[GCM_ENC_KEY_LEN]; // store HashKey^4 <<1 mod poly here ;; uint8_t shifted_hkey_5[GCM_ENC_KEY_LEN]; // store HashKey^5 <<1 mod poly here ;; uint8_t shifted_hkey_6[GCM_ENC_KEY_LEN]; // store HashKey^6 <<1 mod poly here ;; uint8_t shifted_hkey_7[GCM_ENC_KEY_LEN]; // store HashKey^7 <<1 mod poly here ;; uint8_t shifted_hkey_8[GCM_ENC_KEY_LEN]; // store HashKey^8 <<1 mod poly here ;; uint8_t shifted_hkey_1_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly here (for Karatsuba purposes) ;; uint8_t shifted_hkey_2_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly here (for Karatsuba purposes) ;; uint8_t shifted_hkey_3_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly here (for Karatsuba purposes) ;; uint8_t shifted_hkey_4_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly here (for Karatsuba purposes) ;; uint8_t shifted_hkey_5_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly here (for Karatsuba purposes) ;; uint8_t shifted_hkey_6_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod poly here (for Karatsuba purposes) ;; uint8_t shifted_hkey_7_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly here (for Karatsuba purposes) ;; uint8_t shifted_hkey_8_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly here (for Karatsuba purposes) ;; } %define HashKey (16*15) ; store HashKey <<1 mod poly here %define HashKey_1 (16*15) ; store HashKey <<1 mod poly here %define HashKey_2 (16*16) ; store HashKey^2 <<1 mod poly here %define HashKey_3 (16*17) ; store HashKey^3 <<1 mod poly here %define HashKey_4 (16*18) ; store HashKey^4 <<1 mod poly here %define HashKey_5 (16*19) ; store HashKey^5 <<1 mod poly here %define HashKey_6 (16*20) ; store HashKey^6 <<1 mod poly here %define HashKey_7 (16*21) ; store HashKey^7 <<1 mod poly here %define HashKey_8 (16*22) ; store HashKey^8 <<1 mod poly here %define HashKey_k (16*23) ; store XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly here (for Karatsuba purposes) %define HashKey_2_k (16*24) ; store XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly here (for Karatsuba purposes) %define HashKey_3_k (16*25) ; store XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly here (for Karatsuba purposes) %define HashKey_4_k (16*26) ; store XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly here (for Karatsuba purposes) %define HashKey_5_k (16*27) ; store XOR of High 64 bits and 
Low 64 bits of HashKey^5 <<1 mod poly here (for Karatsuba purposes) %define HashKey_6_k (16*28) ; store XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod poly here (for Karatsuba purposes) %define HashKey_7_k (16*29) ; store XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly here (for Karatsuba purposes) %define HashKey_8_k (16*30) ; store XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly here (for Karatsuba purposes) ;;define the fields of gcm_context_data struct ;; struct gcm_context_data { ;; // init, update and finalize context data ;; uint8_t aad_hash[GCM_BLOCK_LEN]; ;; uint64_t aad_length; ;; uint64_t in_length; ;; uint8_t partial_block_enc_key[GCM_BLOCK_LEN]; ;; uint8_t orig_IV[GCM_BLOCK_LEN]; ;; uint8_t current_counter[GCM_BLOCK_LEN]; ;; uint64_t partial_block_length; ;; }; %define AadHash (16*0) ; store current Hash of data which has been input %define AadLen (16*1) ; store length of input data which will not be encrypted or decrypted %define InLen ((16*1)+8); store length of input data which will be encrypted or decrypted %define PBlockEncKey (16*2) ; encryption key for the partial block at the end of the previous update %define OrigIV (16*3) ; input IV %define CurCount (16*4) ; Current counter for generation of encryption key %define PBlockLen (16*5) ; length of partial block at the end of the previous update %define reg(q) xmm %+ q %ifdef WIN_ABI %xdefine arg1 rcx %xdefine arg2 rdx %xdefine arg3 r8 %xdefine arg4 r9 %xdefine arg5 qword [r14 + STACK_OFFSET + 8*5] %xdefine arg6 qword [r14 + STACK_OFFSET + 8*6] %xdefine arg7 qword [r14 + STACK_OFFSET + 8*7] %xdefine arg8 qword [r14 + STACK_OFFSET + 8*8] %xdefine arg9 qword [r14 + STACK_OFFSET + 8*9] %xdefine arg10 qword [r14 + STACK_OFFSET + 8*10] %else %xdefine arg1 rdi %xdefine arg2 rsi %xdefine arg3 rdx %xdefine arg4 rcx %xdefine arg5 r8 %xdefine arg6 r9 %xdefine arg7 [r14 + STACK_OFFSET + 8*1] %xdefine arg8 [r14 + STACK_OFFSET + 8*2] %xdefine arg9 [r14 + STACK_OFFSET + 8*3] %xdefine arg10 [r14 + STACK_OFFSET + 8*4] %endif %ifdef NT_LDST %define NT_LD %define NT_ST %endif ;;; Use Non-temporal load/stor %ifdef NT_LD %define XLDR movntdqa %define VXLDR vmovntdqa %else %define XLDR movdqu %define VXLDR vmovdqu %endif ;;; Use Non-temporal load/stor %ifdef NT_ST %define XSTR movntdq %define VXSTR vmovntdq %else %define XSTR movdqu %define VXSTR vmovdqu %endif intel-ipsec-mb-0.48/gcm_defines.h000066400000000000000000000657671321406316400167160ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2012-2017, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #ifndef GCM_DEFINES_H #define GCM_DEFINES_H #include #include #include #ifdef __cplusplus extern "C" { #endif /* Authenticated Tag Length in bytes. * Valid values are 16 (most likely), 12 or 8. */ #define MAX_TAG_LEN (16) /* * IV data is limited to 16 bytes as follows: * 12 bytes is provided by an application - * pre-counter block j0: 4 byte salt (from Security Association) * concatenated with 8 byte Initialization Vector (from IPSec ESP * Payload). * 4 byte value 0x00000001 is padded automatically by the library - * there is no need to add these 4 bytes on application side anymore. */ #define GCM_IV_DATA_LEN (12) #define LONGEST_TESTED_AAD_LENGTH (2 * 1024) /* Key lengths of 128 and 256 supported */ #define GCM_128_KEY_LEN (16) #define GCM_192_KEY_LEN (24) #define GCM_256_KEY_LEN (32) #define GCM_BLOCK_LEN 16 #define GCM_ENC_KEY_LEN 16 #define GCM_KEY_SETS (15) /*exp key + 14 exp round keys*/ /** * @brief holds intermediate key data needed to improve performance * * gcm_key_data hold internal key information used by gcm128, gcm192 and gcm256. */ #ifdef __WIN32 __declspec(align(16)) #endif /* WIN32 */ struct gcm_key_data { uint8_t expanded_keys[GCM_ENC_KEY_LEN * GCM_KEY_SETS]; /* storage for HashKey mod poly */ uint8_t shifted_hkey_1[GCM_ENC_KEY_LEN]; /* HashKey<<1 mod poly */ uint8_t shifted_hkey_2[GCM_ENC_KEY_LEN]; /* HashKey^2<<1 mod poly */ uint8_t shifted_hkey_3[GCM_ENC_KEY_LEN]; /* HashKey^3<<1 mod poly */ uint8_t shifted_hkey_4[GCM_ENC_KEY_LEN]; /* HashKey^4<<1 mod poly */ uint8_t shifted_hkey_5[GCM_ENC_KEY_LEN]; /* HashKey^5<<1 mod poly */ uint8_t shifted_hkey_6[GCM_ENC_KEY_LEN]; /* HashKey^6<<1 mod poly */ uint8_t shifted_hkey_7[GCM_ENC_KEY_LEN]; /* HashKey^7<<1 mod poly */ uint8_t shifted_hkey_8[GCM_ENC_KEY_LEN]; /* HashKey^8<<1 mod poly */ /* * Storage for XOR of High 64 bits and low 64 bits of HashKey mod poly. * This is needed for Karatsuba purposes. 
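 * (Karatsuba: with hi64 ^ lo64 of each HashKey power precomputed,
 * every 128x128 carry-less multiply in GHASH needs only three 64x64
 * products instead of four)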
*/ uint8_t shifted_hkey_1_k[GCM_ENC_KEY_LEN]; /* HashKey<<1 mod poly */ uint8_t shifted_hkey_2_k[GCM_ENC_KEY_LEN]; /* HashKey^2<<1 mod poly */ uint8_t shifted_hkey_3_k[GCM_ENC_KEY_LEN]; /* HashKey^3<<1 mod poly */ uint8_t shifted_hkey_4_k[GCM_ENC_KEY_LEN]; /* HashKey^4<<1 mod poly */ uint8_t shifted_hkey_5_k[GCM_ENC_KEY_LEN]; /* HashKey^5<<1 mod poly */ uint8_t shifted_hkey_6_k[GCM_ENC_KEY_LEN]; /* HashKey^6<<1 mod poly */ uint8_t shifted_hkey_7_k[GCM_ENC_KEY_LEN]; /* HashKey^7<<1 mod poly */ uint8_t shifted_hkey_8_k[GCM_ENC_KEY_LEN]; /* HashKey^8<<1 mod poly */ } #ifdef LINUX __attribute__((aligned(16))); #else ; #endif /** * @brief holds GCM operation context */ struct gcm_context_data { /* init, update and finalize context data */ uint8_t aad_hash[GCM_BLOCK_LEN]; uint64_t aad_length; uint64_t in_length; uint8_t partial_block_enc_key[GCM_BLOCK_LEN]; uint8_t orig_IV[GCM_BLOCK_LEN]; uint8_t current_counter[GCM_BLOCK_LEN]; uint64_t partial_block_length; }; /** * @brief GCM-AES Encryption * * @param key_data GCM expanded key data * @param context_data GCM operation context data * @param out Ciphertext output. Encrypt in-place is allowed. * @param in Plaintext input. * @param len Length of data in Bytes for encryption. * @param iv pointer to 12 byte IV structure. Internally, library * concates 0x00000001 value to it. * @param aad Additional Authentication Data (AAD). * @param aad_len Length of AAD. * @param auth_tag Authenticated Tag output. * @param auth_tag_len Authenticated Tag Length in bytes (must be * a multiple of 4 bytes). Valid values are * 16 (most likely), 12 or 8. */ void aes_gcm_enc_128_sse(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, uint8_t const *in, uint64_t len, const uint8_t *iv, uint8_t const *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_enc_128_avx_gen2(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, uint8_t const *in, uint64_t len, const uint8_t *iv, uint8_t const *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_enc_128_avx_gen4(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, uint8_t const *in, uint64_t len, const uint8_t *iv, uint8_t const *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_enc_192_sse(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, uint8_t const *in, uint64_t len, const uint8_t *iv, uint8_t const *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_enc_192_avx_gen2(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, uint8_t const *in, uint64_t len, const uint8_t *iv, uint8_t const *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_enc_192_avx_gen4(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, uint8_t const *in, uint64_t len, const uint8_t *iv, uint8_t const *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_enc_256_sse(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, uint8_t const *in, uint64_t len, const uint8_t *iv, uint8_t const *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_enc_256_avx_gen2(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, uint8_t const *in, uint64_t len, const uint8_t *iv, uint8_t const *aad, 
uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_enc_256_avx_gen4(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, uint8_t const *in, uint64_t len, const uint8_t *iv, uint8_t const *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len); /** * @brief GCM-AES Decryption * * @param key_data GCM expanded keys data * @param context_data GCM operation context data * @param out Plaintext output. Decrypt in-place is allowed. * @param in Ciphertext input. * @param len Length of data in Bytes for decryption. * @param iv pointer to 12 byte IV structure. Internally, library * concates 0x00000001 value to it. * @param aad Additional Authentication Data (AAD). * @param aad_len Length of AAD. * @param auth_tag Authenticated Tag output. * @param auth_tag_len Authenticated Tag Length in bytes (must be * a multiple of 4 bytes). Valid values are * 16 (most likely), 12 or 8. */ void aes_gcm_dec_128_sse(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, uint8_t const *in, uint64_t len, const uint8_t *iv, uint8_t const *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_dec_128_avx_gen2(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, uint8_t const *in, uint64_t len, const uint8_t *iv, uint8_t const *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_dec_128_avx_gen4(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, uint8_t const *in, uint64_t len, const uint8_t *iv, uint8_t const *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_dec_192_sse(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, uint8_t const *in, uint64_t len, const uint8_t *iv, uint8_t const *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_dec_192_avx_gen2(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, uint8_t const *in, uint64_t len, const uint8_t *iv, uint8_t const *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_dec_192_avx_gen4(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, uint8_t const *in, uint64_t len, const uint8_t *iv, uint8_t const *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_dec_256_sse(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, uint8_t const *in, uint64_t len, const uint8_t *iv, uint8_t const *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_dec_256_avx_gen2(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, uint8_t const *in, uint64_t len, const uint8_t *iv, uint8_t const *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_dec_256_avx_gen4(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, uint8_t const *in, uint64_t len, const uint8_t *iv, uint8_t const *aad, uint64_t aad_len, uint8_t *auth_tag, uint64_t auth_tag_len); /** * @brief Start a AES-GCM Encryption message * * @param key_data GCM expanded key data * @param context_data GCM operation context data * @param iv pointer to 12 byte IV structure. Internally, library * concates 0x00000001 value to it. * @param aad Additional Authentication Data (AAD). * @param aad_len Length of AAD. 
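 *
 * A minimal streaming-usage sketch (illustrative variable names; the
 * matching update and finalize functions are declared further below,
 * iv points to a 12 byte IV):
 *   struct gcm_key_data kd;
 *   struct gcm_context_data ctx;
 *   aes_gcm_pre_128_sse(key, &kd);
 *   aes_gcm_init_128_sse(&kd, &ctx, iv, aad, aad_len);
 *   aes_gcm_enc_128_update_sse(&kd, &ctx, out, in, len);
 *   aes_gcm_enc_128_finalize_sse(&kd, &ctx, tag, 16);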
* */ void aes_gcm_init_128_sse(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, const uint8_t *iv, uint8_t const *aad, uint64_t aad_len); void aes_gcm_init_128_avx_gen2(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, const uint8_t *iv, uint8_t const *aad, uint64_t aad_len); void aes_gcm_init_128_avx_gen4(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, const uint8_t *iv, uint8_t const *aad, uint64_t aad_len); void aes_gcm_init_192_sse(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, const uint8_t *iv, uint8_t const *aad, uint64_t aad_len); void aes_gcm_init_192_avx_gen2(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, const uint8_t *iv, uint8_t const *aad, uint64_t aad_len); void aes_gcm_init_192_avx_gen4(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, const uint8_t *iv, uint8_t const *aad, uint64_t aad_len); void aes_gcm_init_256_sse(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, const uint8_t *iv, uint8_t const *aad, uint64_t aad_len); void aes_gcm_init_256_avx_gen2(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, const uint8_t *iv, uint8_t const *aad, uint64_t aad_len); void aes_gcm_init_256_avx_gen4(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, const uint8_t *iv, uint8_t const *aad, uint64_t aad_len); /** * @brief encrypt a block of a AES-GCM Encryption message * * @param key_data GCM expanded key data * @param context_data GCM operation context data * @param out Ciphertext output. Encrypt in-place is allowed. * @param in Plaintext input. * @param len Length of data in Bytes for decryption. */ void aes_gcm_enc_128_update_sse(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, const uint8_t *in, uint64_t len); void aes_gcm_enc_128_update_avx_gen2(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, const uint8_t *in, uint64_t len); void aes_gcm_enc_128_update_avx_gen4(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, const uint8_t *in, uint64_t len); void aes_gcm_enc_192_update_sse(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, const uint8_t *in, uint64_t len); void aes_gcm_enc_192_update_avx_gen2(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, const uint8_t *in, uint64_t len); void aes_gcm_enc_192_update_avx_gen4(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, const uint8_t *in, uint64_t len); void aes_gcm_enc_256_update_sse(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, const uint8_t *in, uint64_t len); void aes_gcm_enc_256_update_avx_gen2(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, const uint8_t *in, uint64_t len); void aes_gcm_enc_256_update_avx_gen4(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, const uint8_t *in, uint64_t len); /** * @brief decrypt a block of a AES-GCM Encryption message * * @param key_data GCM expanded key data * @param context_data GCM operation context data * @param out Plaintext output. Decrypt in-place is allowed. * @param in Ciphertext input. * @param len Length of data in Bytes for decryption. 
*/ void aes_gcm_dec_128_update_sse(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, const uint8_t *in, uint64_t len); void aes_gcm_dec_128_update_avx_gen2(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, const uint8_t *in, uint64_t len); void aes_gcm_dec_128_update_avx_gen4(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, const uint8_t *in, uint64_t len); void aes_gcm_dec_192_update_sse(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, const uint8_t *in, uint64_t len); void aes_gcm_dec_192_update_avx_gen2(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, const uint8_t *in, uint64_t len); void aes_gcm_dec_192_update_avx_gen4(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, const uint8_t *in, uint64_t len); void aes_gcm_dec_256_update_sse(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, const uint8_t *in, uint64_t len); void aes_gcm_dec_256_update_avx_gen2(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, const uint8_t *in, uint64_t len); void aes_gcm_dec_256_update_avx_gen4(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *out, const uint8_t *in, uint64_t len); /** * @brief End encryption of an AES-GCM Encryption message * * @param key_data GCM expanded key data * @param context_data GCM operation context data * @param auth_tag Authenticated Tag output. * @param auth_tag_len Authenticated Tag Length in bytes (must be * a multiple of 4 bytes). Valid values are * 16 (most likely), 12 or 8. */ void aes_gcm_enc_128_finalize_sse(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_enc_128_finalize_avx_gen2(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_enc_128_finalize_avx_gen4(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_enc_192_finalize_sse(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_enc_192_finalize_avx_gen2(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_enc_192_finalize_avx_gen4(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_enc_256_finalize_sse(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_enc_256_finalize_avx_gen2(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_enc_256_finalize_avx_gen4(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *auth_tag, uint64_t auth_tag_len); /** * @brief End decryption of an AES-GCM Encryption message * * @param key_data GCM expanded key data * @param context_data GCM operation context data * @param auth_tag Authenticated Tag output. * @param auth_tag_len Authenticated Tag Length in bytes (must be * a multiple of 4 bytes). Valid values are * 16 (most likely), 12 or 8.
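 *
 * These functions output the computed tag only; comparing it against the tag
 * received with the ciphertext (typically with a constant-time comparison)
 * is left to the caller.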
*/ void aes_gcm_dec_128_finalize_sse(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_dec_128_finalize_avx_gen2(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_dec_128_finalize_avx_gen4(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_dec_192_finalize_sse(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_dec_192_finalize_avx_gen2(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_dec_192_finalize_avx_gen4(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_dec_256_finalize_sse(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_dec_256_finalize_avx_gen2(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *auth_tag, uint64_t auth_tag_len); void aes_gcm_dec_256_finalize_avx_gen4(const struct gcm_key_data *key_data, struct gcm_context_data *context_data, uint8_t *auth_tag, uint64_t auth_tag_len); /** * @brief Precomputation of HashKey constants * * Precomputation of HashKey<<1 mod poly constants (shifted_hkey_X and * shifted_hkey_X_k). * * @param gdata GCM context data */ void aes_gcm_precomp_128_sse(struct gcm_key_data *key_data); void aes_gcm_precomp_128_avx_gen2(struct gcm_key_data *key_data); void aes_gcm_precomp_128_avx_gen4(struct gcm_key_data *key_data); void aes_gcm_precomp_192_sse(struct gcm_key_data *key_data); void aes_gcm_precomp_192_avx_gen2(struct gcm_key_data *key_data); void aes_gcm_precomp_192_avx_gen4(struct gcm_key_data *key_data); void aes_gcm_precomp_256_sse(struct gcm_key_data *key_data); void aes_gcm_precomp_256_avx_gen2(struct gcm_key_data *key_data); void aes_gcm_precomp_256_avx_gen4(struct gcm_key_data *key_data); /** * @brief Pre-processes GCM key data * * Prefills the gcm key data with key values for each round and * the initial sub hash key for tag encoding * * @param key pointer to key data * @param key_data GCM expanded key data * */ __forceinline void aes_gcm_pre_128_sse(const void *key, struct gcm_key_data *key_data) { aes_keyexp_128_enc_sse(key, key_data->expanded_keys); aes_gcm_precomp_128_sse(key_data); } __forceinline void aes_gcm_pre_128_avx_gen2(const void *key, struct gcm_key_data *key_data) { aes_keyexp_128_enc_avx(key, key_data->expanded_keys); aes_gcm_precomp_128_avx_gen2(key_data); } __forceinline void aes_gcm_pre_128_avx_gen4(const void *key, struct gcm_key_data *key_data) { aes_keyexp_128_enc_avx2(key, key_data->expanded_keys); aes_gcm_precomp_128_avx_gen4(key_data); } __forceinline void aes_gcm_pre_192_sse(const void *key, struct gcm_key_data *key_data) { aes_keyexp_192_enc_sse(key, key_data->expanded_keys); aes_gcm_precomp_192_sse(key_data); } __forceinline void aes_gcm_pre_192_avx_gen2(const void *key, struct gcm_key_data *key_data) { aes_keyexp_192_enc_avx(key, key_data->expanded_keys); aes_gcm_precomp_192_avx_gen2(key_data); } __forceinline void aes_gcm_pre_192_avx_gen4(const void *key, struct gcm_key_data *key_data) { aes_keyexp_192_enc_avx2(key, key_data->expanded_keys); aes_gcm_precomp_192_avx_gen4(key_data); } __forceinline void 
aes_gcm_pre_256_sse(const void *key, struct gcm_key_data *key_data) { struct gcm_key_data tmp; aes_keyexp_256_sse(key, key_data->expanded_keys, tmp.expanded_keys); aes_gcm_precomp_256_sse(key_data); } __forceinline void aes_gcm_pre_256_avx_gen2(const void *key, struct gcm_key_data *key_data) { aes_keyexp_256_enc_avx(key, key_data->expanded_keys); aes_gcm_precomp_256_avx_gen2(key_data); } __forceinline void aes_gcm_pre_256_avx_gen4(const void *key, struct gcm_key_data *key_data) { aes_keyexp_256_enc_avx2(key, key_data->expanded_keys); aes_gcm_precomp_256_avx_gen4(key_data); } #ifdef __cplusplus } #endif /* __cplusplus */ #endif /* ifndef GCM_DEFINES_H */ intel-ipsec-mb-0.48/include/000077500000000000000000000000001321406316400157005ustar00rootroot00000000000000intel-ipsec-mb-0.48/include/datastruct.asm000066400000000000000000000140221321406316400205570ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ; Macros for defining data structures ; Usage example ;START_FIELDS ; JOB_AES ;;; name size align ;FIELD _plaintext, 8, 8 ; pointer to plaintext ;FIELD _ciphertext, 8, 8 ; pointer to ciphertext ;FIELD _IV, 16, 8 ; IV ;FIELD _keys, 8, 8 ; pointer to keys ;FIELD _len, 4, 4 ; length in bytes ;FIELD _status, 4, 4 ; status enumeration ;FIELD _user_data, 8, 8 ; pointer to user data ;UNION _union, size1, align1, \ ; size2, align2, \ ; size3, align3, \ ; ... ;END_FIELDS ;%assign _JOB_AES_size _FIELD_OFFSET ;%assign _JOB_AES_align _STRUCT_ALIGN ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Alternate "struc-like" syntax: ; STRUCT job_aes2 ; RES_Q .plaintext, 1 ; RES_Q .ciphertext, 1 ; RES_DQ .IV, 1 ; RES_B .nested, _JOB_AES_SIZE, _JOB_AES_ALIGN ; RES_U .union, size1, align1, \ ; size2, align2, \ ; ... ; ENDSTRUCT ; ; Following only needed if nesting ; %assign job_aes2_size _FIELD_OFFSET ; %assign job_aes2_align _STRUCT_ALIGN ; ; RES_* macros take a name, a count and an optional alignment. ; The count in in terms of the base size of the macro, and the ; default alignment is the base size. 
; The macros are: ; Macro Base size ; RES_B 1 ; RES_W 2 ; RES_D 4 ; RES_Q 8 ; RES_DQ 16 ; RES_Y 32 ; RES_Z 64 ; ; RES_U defines a union. It's arguments are a name and two or more ; pairs of "size, alignment" ; ; The two assigns are only needed if this structure is being nested ; within another. Even if the assigns are not done, one can still use ; STRUCT_NAME_size as the size of the structure. ; ; Note that for nesting, you still need to assign to STRUCT_NAME_size. ; ; The differences between this and using "struc" directly are that each ; type is implicitly aligned to its natural length (although this can be ; over-ridden with an explicit third parameter), and that the structure ; is padded at the end to its overall alignment. ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %ifndef _DATASTRUCT_ASM_ %define _DATASTRUCT_ASM_ ;; START_FIELDS %macro START_FIELDS 0 %assign _FIELD_OFFSET 0 %assign _STRUCT_ALIGN 0 %endm ;; FIELD name size align %macro FIELD 3 %define %%name %1 %define %%size %2 %define %%align %3 %assign _FIELD_OFFSET (_FIELD_OFFSET + (%%align) - 1) & (~ ((%%align)-1)) %%name equ _FIELD_OFFSET %assign _FIELD_OFFSET _FIELD_OFFSET + (%%size) %if (%%align > _STRUCT_ALIGN) %assign _STRUCT_ALIGN %%align %endif %endm ;; END_FIELDS %macro END_FIELDS 0 %assign _FIELD_OFFSET (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1)) %endm %macro UNION 5-* %if (0 == (%0 & 1)) %error EVEN number of parameters to UNION Macro %err %endif %rotate 1 %assign _UNION_SIZE %1 %assign _UNION_ALIGN %2 %rep (%0 - 3)/2 %rotate 2 %if (%1 > _UNION_SIZE) %assign _UNION_SIZE %1 %endif %if (%2 > _UNION_ALIGN) %assign _UNION_ALIGN %2 %endif %endrep %rotate 2 FIELD %1, _UNION_SIZE, _UNION_ALIGN %endm ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro STRUCT 1 START_FIELDS struc %1 %endm %macro ENDSTRUCT 0 %assign %%tmp _FIELD_OFFSET END_FIELDS %assign %%tmp (_FIELD_OFFSET - %%tmp) %if (%%tmp > 0) resb %%tmp %endif endstruc %endm ;; RES_int name size align %macro RES_int 3 %define %%name %1 %define %%size %2 %define %%align %3 %assign _FIELD_OFFSET (_FIELD_OFFSET + (%%align) - 1) & (~ ((%%align)-1)) align %%align %%name resb %%size %assign _FIELD_OFFSET _FIELD_OFFSET + (%%size) %if (%%align > _STRUCT_ALIGN) %assign _STRUCT_ALIGN %%align %endif %endm ; macro RES_B name, size [, align] %macro RES_B 2-3 1 RES_int %1, %2, %3 %endm ; macro RES_W name, size [, align] %macro RES_W 2-3 2 RES_int %1, 2*(%2), %3 %endm ; macro RES_D name, size [, align] %macro RES_D 2-3 4 RES_int %1, 4*(%2), %3 %endm ; macro RES_Q name, size [, align] %macro RES_Q 2-3 8 RES_int %1, 8*(%2), %3 %endm ; macro RES_DQ name, size [, align] %macro RES_DQ 2-3 16 RES_int %1, 16*(%2), %3 %endm ; macro RES_Y name, size [, align] %macro RES_Y 2-3 32 RES_int %1, 32*(%2), %3 %endm ; macro RES_Z name, size [, align] %macro RES_Z 2-3 64 RES_int %1, 64*(%2), %3 %endm %macro RES_U 5-* %if (0 == (%0 & 1)) %error EVEN number of parameters to RES_U Macro %err %endif %rotate 1 %assign _UNION_SIZE %1 %assign _UNION_ALIGN %2 %rep (%0 - 3)/2 %rotate 2 %if (%1 > _UNION_SIZE) %assign _UNION_SIZE %1 %endif %if (%2 > _UNION_ALIGN) %assign _UNION_ALIGN %2 %endif %endrep %rotate 2 RES_int %1, _UNION_SIZE, _UNION_ALIGN %endm %endif ; end ifdef _DATASTRUCT_ASM_ intel-ipsec-mb-0.48/include/dbgprint.asm000066400000000000000000000241301321406316400202130ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; 
modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ; Macros for "printing" for debug purposes from within asm code ; ; The basic macros are: ; DBGPRINT16, DBGPRINT32, DBGPRINT64, DBGPRINT_XMM, DBGPRINT_YMM, DBGPRINT_ZMM ; These are called with 1 or more arguments, all of which are of the ; size/type as specified in the name. E.g. ; DBGPRINT64 reg1, reg2, reg3, ... ; ; There is also a macro DEBUGPRINTL that takes one argument, a string. E.g. ; DBGPRINTL "hit this point in the code" ; ; There are also variations on these with the "DBGPRINT" suffixed with "L", e.g. ; DBGPRINTL64. These take two or more arguments, where the first is a string, ; and the rest are of the specified type, e.g. ; DBGPRINTL64 "Rindex", Rindex ; Essentially, this is the same as a DBGPRINTL followed by DBGPRINT64. ; ; If DO_DBGPRINT is defined, then the macros write the debug information into ; a buffer. If DO_DBGPRINT is *not* defined, then the macros expand to nothing. ; ; CAVEAT: The macros need a GPR. Currently, it uses R15. If the first register ; argument is R15, then it will use R14. This means that if you try ; DBGPRINTL64 "text", rax, r15 ; you will not get the proper value of r15. ; One way to avoid this issue is to not use multiple registers on the same line ; if the register types are GPR (i.e. this is not an issue for printing XMM ; registers). E.g the above could be done with: ; DBGPRINTL64 "test", rax ; DBGPRINT64 r15 ; ; Note also that the macros only check for r15. Thus is you tried something ; like (after token expansion): ; DBGPRINT32 r15d ; you won't get the right results. If you want to display r15d, you should ; print it as the 64-bit r15. ; ; To actually print the data, from your C code include the file ; "dbgprint.h". The default buffer size is 16kB. If you want to change ; that, #define DBG_BUFFER_SIZE before including "dbgprint.h". ; ; Then, (after your asm routine(s) have returned, call ; print_debug() or print_debug(file pointer) ; If you do not specify a file pointer, it defaults to stdout. ; ; Printing the debug data also resets the write pointer to the beginning, ; effectively "deleting" the previous messages. 
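;
; Putting the pieces together (illustrative sketch only; the label strings and
; registers below are arbitrary examples, not part of the library itself):
;
;     ; inside the asm routine, assembled with DO_DBGPRINT defined:
;     DBGPRINTL   "enter routine"
;     DBGPRINTL64 "src ptr / len", rsi, rdx
;     DBGPRINT_XMM xmm0
;
;     /* in the C test code, once the routine has returned: */
;     print_debug(stdout);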
; ;%define DO_DBGPRINT %ifdef DO_DBGPRINT extern pDebugBuffer ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; DBGPRINT_INT size, param, ... %macro DBGPRINT_INT 2-* %ifidni %2,r15 %xdefine %%reg r14 %else %xdefine %%reg r15 %endif %xdefine %%size %1 %rotate 1 push %%reg mov %%reg, [pDebugBuffer] %rep %0 - 1 mov byte [%%reg], %%size %if (%%size == 2) mov word [%%reg+1], %1 %elif (%%size == 4) mov dword [%%reg+1], %1 %elif (%%size == 8) mov qword [%%reg+1], %1 %elif (%%size == 16) movdqu oword [%%reg+1], %1 %elif (%%size == 32) vmovdqu [%%reg+1], %1 %elif (%%size == 64) vmovdqu32 [%%reg+1], %1 %else %error invalid size %%size %endif add %%reg, %%size+1 %rotate 1 %endrep mov [pDebugBuffer], %%reg pop %%reg %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; DBGPRINTL_INT size, label, param, ... %macro DBGPRINTL_INT 3-* %ifidni %3,r15 %xdefine %%reg r14 %else %xdefine %%reg r15 %endif %xdefine %%size %1 %rotate 1 push %%reg mov %%reg, [pDebugBuffer] mov byte [%%reg], 0x57 section .data %%lab: db %1, 0 section .text mov qword [%%reg+1], %%lab add %%reg, 8+1 %rotate 1 %rep %0 - 2 mov byte [%%reg], %%size %if (%%size == 2) mov word [%%reg+1], %1 %elif (%%size == 4) mov dword [%%reg+1], %1 %elif (%%size == 8) mov qword [%%reg+1], %1 %elif (%%size == 16) movdqu oword [%%reg+1], %1 %elif (%%size == 32) vmovdqu [%%reg+1], %1 %elif (%%size == 64) vmovdqu32 [%%reg+1], %1 %else %error invalid size %%size %endif add %%reg, %%size+1 %rotate 1 %endrep mov [pDebugBuffer], %%reg pop %%reg %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; DBGPRINTL* data, ... %macro DBGPRINT16 1+ DBGPRINT_INT 2, %1 %endmacro %macro DBGPRINT32 1+ DBGPRINT_INT 4, %1 %endmacro %macro DBGPRINT64 1+ DBGPRINT_INT 8, %1 %endmacro %macro DBGPRINT_XMM 1+ DBGPRINT_INT 16, %1 %endmacro %macro DBGPRINT_YMM 1+ DBGPRINT_INT 32, %1 %endmacro %macro DBGPRINT_ZMM 1+ DBGPRINT_INT 64, %1 %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; DBGPRINTL* label, data, ... 
%macro DBGPRINTL16 2+ DBGPRINTL_INT 2, %1, %2 %endmacro %macro DBGPRINTL32 2+ DBGPRINTL_INT 4, %1, %2 %endmacro %macro DBGPRINTL64 2+ DBGPRINTL_INT 8, %1, %2 %endmacro %macro DBGPRINTL_XMM 2+ DBGPRINTL_INT 16, %1, %2 %endmacro %macro DBGPRINTL_YMM 2+ DBGPRINTL_INT 32, %1, %2 %endmacro %macro DBGPRINTL_ZMM 2+ DBGPRINTL_INT 64, %1, %2 %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro DBGPRINTL 1 push r15 mov r15, [pDebugBuffer] mov byte [r15], 0x57 section .data %%lab: db %1, 0 section .text mov qword [r15+1], %%lab add r15, 8+1 mov [pDebugBuffer], r15 pop r15 %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %else %macro DBGPRINT16 1+ %endmacro %macro DBGPRINT32 1+ %endmacro %macro DBGPRINT64 1+ %endmacro %macro DBGPRINT_XMM 1+ %endmacro %macro DBGPRINT_YMM 1+ %endmacro %macro DBGPRINT_ZMM 1+ %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro DBGPRINTL16 2+ %endmacro %macro DBGPRINTL32 2+ %endmacro %macro DBGPRINTL64 2+ %endmacro %macro DBGPRINTL_XMM 2+ %endmacro %macro DBGPRINTL_YMM 2+ %endmacro %macro DBGPRINTL_ZMM 2+ %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro DBGPRINTL 1 %endmacro %endif %if 0 ; OLD %macro DBGPRINTL_ZMM 2-* push rax mov rax, [pDebugBuffer] mov byte [rax], 0x57 section .data %%lab: db %1, 0 section .text mov qword [rax+1], %%lab add rax, 8+1 %rotate 1 %rep %0 - 1 mov byte [rax], 64 vmovdqu32 [rax+1], %1 %rotate 1 add rax, 64+1 %endrep mov [pDebugBuffer], rax pop rax %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro DBGPRINT_ZMM 1-* push rax mov rax, [pDebugBuffer] %rep %0 mov byte [rax], 64 vmovdqu32 [rax+1], %1 %rotate 1 add rax, 64+1 %endrep mov [pDebugBuffer], rax pop rax %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro DBGPRINT_YMM 1-* push rax mov rax, [pDebugBuffer] %rep %0 mov byte [rax], 32 vmovdqu [rax+1], %1 %rotate 1 add rax, 32+1 %endrep mov [pDebugBuffer], rax pop rax %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro DBGPRINT_XMM 1-* push rax mov rax, [pDebugBuffer] %rep %0 mov byte [rax], 16 vmovdqu oword [rax+1], %1 %rotate 1 add rax, 16+1 %endrep mov [pDebugBuffer], rax pop rax %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro DBGPRINTL64 2-* push rax mov rax, [pDebugBuffer] mov byte [rax], 0x57 section .data %%lab: db %1, 0 section .text mov qword [rax+1], %%lab add rax, 8+1 %rotate 1 %rep %0 - 1 mov byte [rax], 8 mov qword [rax+1], %1 %rotate 1 add rax, 8+1 %endrep mov [pDebugBuffer], rax pop rax %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro DBGPRINT64 1-* push rax mov rax, [pDebugBuffer] %rep %0 mov byte [rax], 8 mov qword [rax+1], %1 %rotate 1 add rax, 8+1 %endrep mov [pDebugBuffer], rax pop rax %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro DBGPRINT32 1-* push rax mov rax, [pDebugBuffer] %rep %0 mov byte [rax], 4 mov dword [rax+1], %1 %rotate 1 add rax, 4+1 %endrep mov [pDebugBuffer], rax pop rax %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro DBGPRINT16 1-* push rax mov rax, [pDebugBuffer] %rep %0 mov byte [rax], 2 mov word [rax+1], %1 %rotate 1 add rax, 2+1 %endrep mov [pDebugBuffer], rax pop rax %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro DBGPRINT_LAB 1 push rax mov rax, [pDebugBuffer] mov 
byte [rax], 0x57 section .data %%lab: db %1, 0 section .text mov qword [rax+1], %%lab add rax, 8+1 mov [pDebugBuffer], rax pop rax %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro DBGHIST 2 inc dword [%1 + 4 * %2] %endmacro %macro DBGPRINT_ZMM 1-* %endmacro %macro DBGPRINT_YMM 1-* %endmacro %macro DBGPRINT_XMM 1-* %endmacro %macro DBGPRINT64 1-* %endmacro %macro DBGPRINT32 1-* %endmacro %macro DBGPRINT16 1-* %endmacro %macro DBGHIST 2 %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %endif ; ifdef 0 ; OLD intel-ipsec-mb-0.48/include/des_utils.h000066400000000000000000000112371321406316400200500ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2017, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ /* DES utility functions and macros */ #ifndef DES_UTILS_H #define DES_UTILS_H #include <stdint.h> #include "os.h" /** * @brief Gets selected bit value out of a 64-bit word * * @param val 64-bit word * @param n bit number (0 to 63) to get value of * * @return n-th bit value (0 or 1 value only) */ __forceinline uint64_t bit_get64b(const uint64_t val, const unsigned n) { IMB_ASSERT(n < 64); return (val >> n) & UINT64_C(1); } /** * @brief Sets selected bit in a 64-bit word * * @param val 64-bit word * @param n bit number (0 to 63) to set value of * @param b bit value (0 or 1) * * @return val with n-th bit set to value b */ __forceinline uint64_t bit_set64b(const uint64_t val, const unsigned n, const uint64_t b) { const uint64_t m = UINT64_C(1) << n; IMB_ASSERT(n < 64); return (val & (~m)) | (b << n); } /** * @brief Permutes bits in a 64-bit word as described by pattern * * The function goes through pattern array from index 0 to 'size' (max 63). * It sets output bit number 'index' to value of * bit number 'pattern[index] - 1' from 'in'.
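 * For example, if pattern[0] is 58 (the first entry of the FIPS 46-3 IP
 * table), then bit 0 of the returned word is set to the value of bit 57
 * of 'in'.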
* * @param in 64-bit word to be permuted * @param pattern pointer to array defining the permutation * @param size is size of the permutation pattern * * @return permuted in word as described by the pattern */ __forceinline uint64_t permute_64b(const uint64_t in, const uint8_t *pattern, const int size) { uint64_t out = 0; int n = 0; IMB_ASSERT(size <= 64); for (n = 0; n < size; n++) { /* '-1' is required as bit numbers in FIPS start with 1 not 0 */ const int m = ((int) pattern[n]) - 1; const uint64_t bit_val = bit_get64b(in, m); out = bit_set64b(out, n, bit_val); } return out; } static const uint8_t reflect_tab[16] = { /* [ 0] 0000 => 0000 */ 0, /* [ 1] 0001 => 1000 */ 8, /* [ 2] 0010 => 0100 */ 4, /* [ 3] 0011 => 1100 */ 12, /* [ 4] 0100 => 0010 */ 2, /* [ 5] 0101 => 1010 */ 10, /* [ 6] 0110 => 0110 */ 6, /* [ 7] 0111 => 1110 */ 14, /* [ 8] 1000 => 0001 */ 1, /* [ 9] 1001 => 1001 */ 9, /* [10] 1010 => 0101 */ 5, /* [11] 1011 => 1101 */ 13, /* [12] 1100 => 0011 */ 3, /* [13] 1101 => 1011 */ 11, /* [14] 1110 => 0111 */ 7, /* [15] 1111 => 1111 */ 15 }; __forceinline uint8_t reflect_8b(const uint8_t pb) { return reflect_tab[pb >> 4] | (reflect_tab[pb & 15] << 4); } __forceinline uint64_t load64_reflect(const void *key) { const uint8_t *kb = (const uint8_t *) key; return ((uint64_t) reflect_8b(kb[0])) | ((uint64_t) reflect_8b(kb[1])) << 8 | ((uint64_t) reflect_8b(kb[2])) << 16 | ((uint64_t) reflect_8b(kb[3])) << 24 | ((uint64_t) reflect_8b(kb[4])) << 32 | ((uint64_t) reflect_8b(kb[5])) << 40 | ((uint64_t) reflect_8b(kb[6])) << 48 | ((uint64_t) reflect_8b(kb[7])) << 56; } #endif /* DES_UTILS_H */ intel-ipsec-mb-0.48/include/memcpy.asm000066400000000000000000000232131321406316400176750ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %ifndef __MEMCPY_ASM__ %define __MEMCPY_ASM__ %include "reg_sizes.asm" ; This file defines a series of macros to copy small to medium amounts ; of data from memory to memory, where the size is variable but limited. 
; ; The macros are all called as: ; memcpy DST, SRC, SIZE, TMP0, TMP1, XTMP0, XTMP1, XTMP2, XTMP3 ; with the parameters defined as: ; DST : register: pointer to dst (not modified) ; SRC : register: pointer to src (not modified) ; SIZE : register: length in bytes (not modified) ; TMP0 : 64-bit temp GPR (clobbered) ; TMP1 : 64-bit temp GPR (clobbered) ; XTMP0 : temp XMM (clobbered) ; XTMP1 : temp XMM (clobbered) ; XTMP2 : temp XMM (clobbered) ; XTMP3 : temp XMM (clobbered) ; ; The name indicates the options. The name is of the form: ; memcpy_<VEC>_<SZ><ZERO><RET> ; where: ; <VEC> is either "sse" or "avx" or "avx2" ; <SZ> is either "64" or "128" and defines largest value of SIZE ; <ZERO> is blank or "_1". If "_1" then the min SIZE is 1 (otherwise 0) ; <RET> is blank or "_ret". If blank, the code falls through. If "ret" ; it does a "ret" at the end ; ; For the avx2 versions, the temp XMM registers need to be YMM registers ; If the SZ is 64, then only two YMM temps are needed, i.e. it is called as: ; memcpy_avx2_64 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1 ; memcpy_avx2_128 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1, YTMP2, YTMP3 ; ; For example: ; memcpy_sse_64 : SSE, 0 <= size < 64, falls through ; memcpy_avx_64_1 : AVX1, 1 <= size < 64, falls through ; memcpy_sse_128_ret : SSE, 0 <= size < 128, ends with ret ; memcpy_avx_128_1_ret : AVX1, 1 <= size < 128, ends with ret ; %macro memcpy_sse_64 9 __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 0 %endm %macro memcpy_sse_64_1 9 __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 0 %endm %macro memcpy_sse_128 9 __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 0 %endm %macro memcpy_sse_128_1 9 __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 0 %endm %macro memcpy_sse_64_ret 9 __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 0 %endm %macro memcpy_sse_64_1_ret 9 __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 0 %endm %macro memcpy_sse_128_ret 9 __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 0 %endm %macro memcpy_sse_128_1_ret 9 __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 0 %endm %macro memcpy_sse_16 5 __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 0 %endm %macro memcpy_sse_16_1 5 __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 0 %endm ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro memcpy_avx_64 9 __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 1 %endm %macro memcpy_avx_64_1 9 __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 1 %endm %macro memcpy_avx_128 9 __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 1 %endm %macro memcpy_avx_128_1 9 __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 1 %endm %macro memcpy_avx_64_ret 9 __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 1 %endm %macro memcpy_avx_64_1_ret 9 __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 1 %endm %macro memcpy_avx_128_ret 9 __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 1 %endm %macro memcpy_avx_128_1_ret 9 __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 1 %endm %macro memcpy_avx_16 5 __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 1 %endm %macro memcpy_avx_16_1 5 __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 1 %endm ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro memcpy_avx2_64 7 __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 0, 2 %endm %macro memcpy_avx2_64_1 7 __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 0, 2 %endm %macro memcpy_avx2_128 9 __memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 0, 128, 0, 2 %endm %macro memcpy_avx2_128_1 9 __memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 1, 128, 0, 2 %endm %macro memcpy_avx2_64_ret 7 __memcpy_int
%1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 1, 2 %endm %macro memcpy_avx2_64_1_ret 7 __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 1, 2 %endm %macro memcpy_avx2_128_ret 9 __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 2 %endm %macro memcpy_avx2_128_1_ret 9 __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 2 %endm ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro __memcpy_int 13 %define %%DST %1 ; register: pointer to dst (not modified) %define %%SRC %2 ; register: pointer to src (not modified) %define %%SIZE %3 ; register: length in bytes (not modified) %define %%TMP0 %4 ; 64-bit temp GPR (clobbered) %define %%TMP1 %5 ; 64-bit temp GPR (clobbered) %define %%XTMP0 %6 ; temp XMM (clobbered) %define %%XTMP1 %7 ; temp XMM (clobbered) %define %%XTMP2 %8 ; temp XMM (clobbered) %define %%XTMP3 %9 ; temp XMM (clobbered) %define %%NOT0 %10 ; if not 0, then assume size cannot be zero %define %%MAXSIZE %11 ; 128, 64, etc %define %%USERET %12 ; if not 0, use "ret" at end %define %%USEAVX %13 ; 0 = SSE, 1 = AVX1, 2 = AVX2 %if (%%USERET != 0) %define %%DONE ret %else %define %%DONE jmp %%end %endif %if (%%USEAVX != 0) %define %%MOVDQU vmovdqu %else %define %%MOVDQU movdqu %endif %if (%%MAXSIZE >= 128) test %%SIZE, 64 jz %%lt64 %if (%%USEAVX >= 2) %%MOVDQU %%XTMP0, [%%SRC + 0*32] %%MOVDQU %%XTMP1, [%%SRC + 1*32] %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*32] %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*32] %%MOVDQU [%%DST + 0*32], %%XTMP0 %%MOVDQU [%%DST + 1*32], %%XTMP1 %%MOVDQU [%%DST + %%SIZE - 2*32], %%XTMP2 %%MOVDQU [%%DST + %%SIZE - 1*32], %%XTMP3 %else %%MOVDQU %%XTMP0, [%%SRC + 0*16] %%MOVDQU %%XTMP1, [%%SRC + 1*16] %%MOVDQU %%XTMP2, [%%SRC + 2*16] %%MOVDQU %%XTMP3, [%%SRC + 3*16] %%MOVDQU [%%DST + 0*16], %%XTMP0 %%MOVDQU [%%DST + 1*16], %%XTMP1 %%MOVDQU [%%DST + 2*16], %%XTMP2 %%MOVDQU [%%DST + 3*16], %%XTMP3 %%MOVDQU %%XTMP0, [%%SRC + %%SIZE - 4*16] %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 3*16] %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*16] %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*16] %%MOVDQU [%%DST + %%SIZE - 4*16], %%XTMP0 %%MOVDQU [%%DST + %%SIZE - 3*16], %%XTMP1 %%MOVDQU [%%DST + %%SIZE - 2*16], %%XTMP2 %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP3 %endif %%DONE %endif %if (%%MAXSIZE >= 64) %%lt64: test %%SIZE, 32 jz %%lt32 %if (%%USEAVX >= 2) %%MOVDQU %%XTMP0, [%%SRC + 0*32] %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 1*32] %%MOVDQU [%%DST + 0*32], %%XTMP0 %%MOVDQU [%%DST + %%SIZE - 1*32], %%XTMP1 %else %%MOVDQU %%XTMP0, [%%SRC + 0*16] %%MOVDQU %%XTMP1, [%%SRC + 1*16] %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*16] %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*16] %%MOVDQU [%%DST + 0*16], %%XTMP0 %%MOVDQU [%%DST + 1*16], %%XTMP1 %%MOVDQU [%%DST + %%SIZE - 2*16], %%XTMP2 %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP3 %endif %%DONE %endif %if (%%MAXSIZE >= 32) %%lt32: test %%SIZE, 16 jz %%lt16 %if (%%USEAVX >= 2) %%MOVDQU XWORD(%%XTMP0), [%%SRC + 0*16] %%MOVDQU XWORD(%%XTMP1), [%%SRC + %%SIZE - 1*16] %%MOVDQU [%%DST + 0*16], XWORD(%%XTMP0) %%MOVDQU [%%DST + %%SIZE - 1*16], XWORD(%%XTMP1) %else %%MOVDQU %%XTMP0, [%%SRC + 0*16] %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 1*16] %%MOVDQU [%%DST + 0*16], %%XTMP0 %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP1 %endif %%DONE %endif %if (%%MAXSIZE >= 16) %%lt16: test %%SIZE, 8 jz %%lt8 mov %%TMP0, [%%SRC] mov %%TMP1, [%%SRC + %%SIZE - 8] mov [%%DST], %%TMP0 mov [%%DST + 
%%SIZE - 8], %%TMP1 %%DONE %endif %if (%%MAXSIZE >= 8) %%lt8: test %%SIZE, 4 jz %%lt4 mov DWORD(%%TMP0), [%%SRC] mov DWORD(%%TMP1), [%%SRC + %%SIZE - 4] mov [%%DST], DWORD(%%TMP0) mov [%%DST + %%SIZE - 4], DWORD(%%TMP1) %%DONE %endif %if (%%MAXSIZE >= 4) %%lt4: test %%SIZE, 2 jz %%lt2 movzx DWORD(%%TMP0), word [%%SRC] movzx DWORD(%%TMP1), byte [%%SRC + %%SIZE - 1] mov [%%DST], WORD(%%TMP0) mov [%%DST + %%SIZE - 1], BYTE(%%TMP1) %%DONE %endif %%lt2: %if (%%NOT0 == 0) test %%SIZE, 1 jz %%end %endif movzx DWORD(%%TMP0), byte [%%SRC] mov [%%DST], BYTE(%%TMP0) %%end: %if (%%USERET != 0) ret %endif %endm %endif ; ifndef __MEMCPY_ASM__ intel-ipsec-mb-0.48/include/os.asm000066400000000000000000000044421321406316400170270ustar00rootroot00000000000000;; ;; Copyright (c) 2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %ifndef OS_ASM_FILE %define OS_ASM_FILE %ifndef WIN_ABI %ifidn __OUTPUT_FORMAT__, win64 %define WIN_ABI %endif %endif %ifndef LINUX %ifidn __OUTPUT_FORMAT__, elf64 %define LINUX %endif %endif %ifdef LINUX ;;; macro to declare global symbols ;;; - name : symbol name ;;; - type : funtion or data ;;; - scope : internal, private, default %define MKGLOBAL(name,type,scope) global name %+ : %+ type scope %endif ; LINUX %ifdef WIN_ABI ;;; macro to declare global symbols ;;; - name : symbol name ;;; - type : funtion or data ;;; - scope : internal, private, default (ignored in win64 coff format) %define MKGLOBAL(name,type,scope) global name %endif ; WIN_ABI %endif ; OS_ASM_FILE intel-ipsec-mb-0.48/include/os.h000066400000000000000000000050351321406316400164750ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2012-2017, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #ifndef __OS_H #define __OS_H #ifdef LINUX #define DECLARE_ALIGNED(decl, alignval) \ decl __attribute__((aligned(alignval))) #define __forceinline \ static inline __attribute__((always_inline)) #if __GNUC__ >= 4 #define IMB_DLL_EXPORT __attribute__((visibility("default"))) #define IMB_DLL_LOCAL __attribute__((visibility("hidden"))) #else /* GNU C 4.0 and later */ #define IMB_DLL_EXPORT #define IMB_DLL_LOCAL #endif /* different C compiler */ #else #define DECLARE_ALIGNED(decl, alignval) \ __declspec(align(alignval)) decl #define __forceinline \ static __forceinline /* Windows DLL export is done via DEF file */ #define IMB_DLL_EXPORT #define IMB_DLL_LOCAL #endif #ifdef DEBUG #include #define IMB_ASSERT(x) assert(x) #else #define IMB_ASSERT(x) #endif #ifndef IMB_DIM #define IMB_DIM(x) (sizeof(x) / sizeof(x[0])) #endif #endif /* __OS_H */ intel-ipsec-mb-0.48/include/reg_sizes.asm000066400000000000000000000057741321406316400204110ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ; define d and w variants for registers %define raxd eax %define raxw ax %define raxb al %define rbxd ebx %define rbxw bx %define rbxb bl %define rcxd ecx %define rcxw cx %define rcxb cl %define rdxd edx %define rdxw dx %define rdxb dl %define rsid esi %define rsiw si %define rsib sil %define rdid edi %define rdiw di %define rdib dil %define rbpd ebp %define rbpw bp %define rbpb bpl %define zmm0x xmm0 %define zmm1x xmm1 %define zmm2x xmm2 %define zmm3x xmm3 %define zmm4x xmm4 %define zmm5x xmm5 %define zmm6x xmm6 %define zmm7x xmm7 %define zmm8x xmm8 %define zmm9x xmm9 %define zmm10x xmm10 %define zmm11x xmm11 %define zmm12x xmm12 %define zmm13x xmm13 %define zmm14x xmm14 %define zmm15x xmm15 %define ymm0x xmm0 %define ymm1x xmm1 %define ymm2x xmm2 %define ymm3x xmm3 %define ymm4x xmm4 %define ymm5x xmm5 %define ymm6x xmm6 %define ymm7x xmm7 %define ymm8x xmm8 %define ymm9x xmm9 %define ymm10x xmm10 %define ymm11x xmm11 %define ymm12x xmm12 %define ymm13x xmm13 %define ymm14x xmm14 %define ymm15x xmm15 %define zmm0y ymm0 %define zmm1y ymm1 %define zmm2y ymm2 %define zmm3y ymm3 %define zmm4y ymm4 %define zmm5y ymm5 %define zmm6y ymm6 %define zmm7y ymm7 %define zmm8y ymm8 %define zmm9y ymm9 %define zmm10y ymm10 %define zmm11y ymm11 %define zmm12y ymm12 %define zmm13y ymm13 %define zmm14y ymm14 %define zmm15y ymm15 %define DWORD(reg) reg %+ d %define WORD(reg) reg %+ w %define BYTE(reg) reg %+ b %define XWORD(reg) reg %+ x %define YWORD(reg) reg %+ y intel-ipsec-mb-0.48/include/save_xmms.asm000066400000000000000000000062761321406316400204170ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %ifdef LINUX %define ARG1 rdi %else %define ARG1 rcx %endif section .text ; void save_xmms(UINT128 array[10]) MKGLOBAL(save_xmms,function,internal) save_xmms: movdqa [ARG1 + 0*16], xmm6 movdqa [ARG1 + 1*16], xmm7 movdqa [ARG1 + 2*16], xmm8 movdqa [ARG1 + 3*16], xmm9 movdqa [ARG1 + 4*16], xmm10 movdqa [ARG1 + 5*16], xmm11 movdqa [ARG1 + 6*16], xmm12 movdqa [ARG1 + 7*16], xmm13 movdqa [ARG1 + 8*16], xmm14 movdqa [ARG1 + 9*16], xmm15 ret ; void restore_xmms(UINT128 array[10]) MKGLOBAL(restore_xmms,function,internal) restore_xmms: movdqa xmm6, [ARG1 + 0*16] movdqa xmm7, [ARG1 + 1*16] movdqa xmm8, [ARG1 + 2*16] movdqa xmm9, [ARG1 + 3*16] movdqa xmm10, [ARG1 + 4*16] movdqa xmm11, [ARG1 + 5*16] movdqa xmm12, [ARG1 + 6*16] movdqa xmm13, [ARG1 + 7*16] movdqa xmm14, [ARG1 + 8*16] movdqa xmm15, [ARG1 + 9*16] ret ; void save_xmms_avx(UINT128 array[10]) MKGLOBAL(save_xmms_avx,function,internal) save_xmms_avx: vmovdqa [ARG1 + 0*16], xmm6 vmovdqa [ARG1 + 1*16], xmm7 vmovdqa [ARG1 + 2*16], xmm8 vmovdqa [ARG1 + 3*16], xmm9 vmovdqa [ARG1 + 4*16], xmm10 vmovdqa [ARG1 + 5*16], xmm11 vmovdqa [ARG1 + 6*16], xmm12 vmovdqa [ARG1 + 7*16], xmm13 vmovdqa [ARG1 + 8*16], xmm14 vmovdqa [ARG1 + 9*16], xmm15 ret ; void restore_xmms_avx(UINT128 array[10]) MKGLOBAL(restore_xmms_avx,function,internal) restore_xmms_avx: vmovdqa xmm6, [ARG1 + 0*16] vmovdqa xmm7, [ARG1 + 1*16] vmovdqa xmm8, [ARG1 + 2*16] vmovdqa xmm9, [ARG1 + 3*16] vmovdqa xmm10, [ARG1 + 4*16] vmovdqa xmm11, [ARG1 + 5*16] vmovdqa xmm12, [ARG1 + 6*16] vmovdqa xmm13, [ARG1 + 7*16] vmovdqa xmm14, [ARG1 + 8*16] vmovdqa xmm15, [ARG1 + 9*16] ret intel-ipsec-mb-0.48/include/save_xmms.h000066400000000000000000000034641321406316400200620ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2012-2017, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ void save_xmms(UINT128 array[10]); void restore_xmms(UINT128 array[10]); void save_xmms_avx(UINT128 array[10]); void restore_xmms_avx(UINT128 array[10]); intel-ipsec-mb-0.48/include/types.h000066400000000000000000000042451321406316400172220ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2012-2017, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ /* NOTICE: * Usage of this header file in new projects is deprecated. * Use stdint.h instead for fixed-width integers. */ #ifndef __TYPES_H #define __TYPES_H #include /* For standard integer types */ typedef struct { uint64_t low; uint64_t high; } uint128_t; typedef int64_t INT64; typedef int32_t INT32; typedef int16_t INT16; typedef int8_t INT8; typedef uint128_t UINT128; typedef uint64_t UINT64; typedef uint32_t UINT32; typedef uint16_t UINT16; typedef uint8_t UINT8; #endif intel-ipsec-mb-0.48/intel-ipsec-mb.spec000066400000000000000000000104011321406316400177350ustar00rootroot00000000000000# Copyright (c) 2017, Intel Corporation # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. 
# * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of Intel Corporation nor the names of its contributors # may be used to endorse or promote products derived from this software # without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %global githubname intel-ipsec-mb %global githubver 0.46 %global githubfull %{githubname}-v%{githubver} # disable producing debuginfo for this package %global debug_package %{nil} Summary: IPSEC cryptography library optimized for Intel Architecture Name: %{githubname} Release: 1%{?dist} Version: %{githubver} License: BSD Group: Development/Tools ExclusiveArch: x86_64 Source0: https://github.com/01org/%{githubname}/archive/%{githubname}-v%{githubver}.tar.gz URL: https://github.com/01org/%{githubname} %description IPSEC cryptography library optimized for Intel Architecture %package -n intel-ipsec-mb-devel Summary: IPSEC cryptography library optimized for Intel Architecture License: BSD Requires: intel-ipsec-mb == %{version} Group: Development/Tools ExclusiveArch: x86_64 %description -n intel-ipsec-mb-devel IPSEC cryptography library optimized for Intel Architecture For additional information please refer to: https://github.com/01org/%{githubname} %prep %setup -n %{githubfull} %build make %{?_smp_mflags} %install install -d %{buildroot}/%{_licensedir}/%{name}-%{version} install -m 0644 %{_builddir}/%{githubfull}/LICENSE %{buildroot}/%{_licensedir}/%{name}-%{version} # Install the library install -d %{buildroot}/%{_libdir} install -m 0644 -s %{_builddir}/%{githubfull}/libIPSec_MB.a %{buildroot}/%{_libdir} # Install the header file install -d %{buildroot}/%{_includedir} install -d %{buildroot}/%{_includedir}/%{name}-%{version} install -m 0644 %{_builddir}/%{githubfull}/include/types.h %{buildroot}/%{_includedir}/%{name}-%{version} install -m 0644 %{_builddir}/%{githubfull}/constants.h %{buildroot}/%{_includedir}/%{name}-%{version} install -m 0644 %{_builddir}/%{githubfull}/job_aes_hmac.h %{buildroot}/%{_includedir}/%{name}-%{version} install -m 0644 %{_builddir}/%{githubfull}/asm_types.h %{buildroot}/%{_includedir}/%{name}-%{version} install -m 0644 %{_builddir}/%{githubfull}/mb_mgr.h %{buildroot}/%{_includedir}/%{name}-%{version} install -m 0644 %{_builddir}/%{githubfull}/gcm_defines.h %{buildroot}/%{_includedir}/%{name}-%{version} %files %{!?_licensedir:%global license %%doc} %license %{_licensedir}/%{name}-%{version}/LICENSE %doc README ReleaseNotes.txt %files -n intel-ipsec-mb-devel %{_includedir}/%{name}-%{version}/types.h %{_includedir}/%{name}-%{version}/constants.h %{_includedir}/%{name}-%{version}/job_aes_hmac.h 
%{_includedir}/%{name}-%{version}/asm_types.h %{_includedir}/%{name}-%{version}/gcm_defines.h %{_includedir}/%{name}-%{version}/mb_mgr.h %{_libdir}/libIPSec_MB.a %changelog * Fri Aug 11 2017 Tomasz Kantecki 0.46-1 - initial version of the package intel-ipsec-mb-0.48/job_aes_hmac.asm000066400000000000000000000104101321406316400173450ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;; %include "datastruct.asm" ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Define constants ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %define STS_BEING_PROCESSED 0 %define STS_COMPLETED_AES 1 %define STS_COMPLETED_HMAC 2 %define STS_COMPLETED 3 %define STS_INVALID_ARGS 4 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Define JOB_AES_HMAC structure ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; START_FIELDS ; HMAC Specific Fields ;;; name size align FIELD __auth_key_xor_ipad, 8, 8 ; pointer to ipad FIELD __auth_key_xor_opad, 8, 8 ; pointer to opad END_FIELDS %assign _HMAC_spec_fields_size _FIELD_OFFSET %assign _HMAC_spec_fields_align _STRUCT_ALIGN START_FIELDS ; AES XCBC Specific Fields ;;; name size align FIELD __k1_expanded, 8, 8 ; ptr to exp k1 keys FIELD __k2, 8, 8 ; ptr to k2 FIELD __k3, 8, 8 ; ptr to k3 END_FIELDS %assign _AES_XCBC_spec_fields_size _FIELD_OFFSET %assign _AES_XCBC_spec_fields_align _STRUCT_ALIGN START_FIELDS ; CBCMAC Specific Fields ;;; name size align FIELD __aad, 8, 8 ; pointer to AAD FIELD __aad_len, 8, 8 ; 64-bit AAD length END_FIELDS %assign _CBCMAC_spec_fields_size _FIELD_OFFSET %assign _CBCMAC_spec_fields_align _STRUCT_ALIGN START_FIELDS ; JOB_AES_HMAC ;;; name size align FIELD _aes_enc_key_expanded, 8, 8 ; pointer to exp enc keys FIELD _aes_dec_key_expanded, 8, 8 ; pointer to exp dec keys FIELD _aes_key_len_in_bytes, 8, 8 FIELD _src, 8, 8 ; pointer to src buffer FIELD _dst, 8, 8 ; pointer to dst buffer FIELD _cipher_start_src_offset_in_bytes, \ 8, 8 FIELD _msg_len_to_cipher_in_bytes, 8, 8 FIELD _hash_start_src_offset_in_bytes,8, 8 FIELD _msg_len_to_hash_in_bytes, 8, 8 FIELD _iv, 8, 8 ; pointer to IV FIELD _iv_len_in_bytes, 8, 8 FIELD _auth_tag_output, 8, 8 ; pointer to hash output FIELD _auth_tag_output_len_in_bytes, 8, 8 UNION _u, _HMAC_spec_fields_size, _HMAC_spec_fields_align, \ _AES_XCBC_spec_fields_size, _AES_XCBC_spec_fields_align, \ _CBCMAC_spec_fields_size, _CBCMAC_spec_fields_align FIELD _status, 4, 4 ; JOB_STS FIELD _cipher_mode, 4, 4 ; JOB_CIPHER_MODE FIELD _cipher_direction, 4, 4 ; JOB_CIPHER_DIRECTION FIELD _hash_alg, 4, 4 ; JOB_HASH_ALG FIELD _chain_order, 4, 4 ; JOB_CHAIN_ORDER FIELD _user_data, 8, 8 FIELD _user_data2, 8, 8 END_FIELDS %assign _JOB_AES_HMAC_size _FIELD_OFFSET %assign _JOB_AES_HMAC_align _STRUCT_ALIGN %assign _auth_key_xor_ipad _u + __auth_key_xor_ipad %assign _auth_key_xor_opad _u + __auth_key_xor_opad %assign _k1_expanded _u + __k1_expanded %assign _k2 _u + __k2 %assign _k3 _u + __k3 %assign _cbcmac_aad _u + __aad %assign _cbcmac_aad_len _u + __aad_len intel-ipsec-mb-0.48/job_aes_hmac.h000066400000000000000000000152201321406316400170200ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2012-2017, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
* Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #ifndef IMB_JOB_AES_HMAC_H #define IMB_JOB_AES_HMAC_H #include "types.h" typedef enum { STS_BEING_PROCESSED = 0, STS_COMPLETED_AES = 1, STS_COMPLETED_HMAC = 2, STS_COMPLETED = 3, /* COMPLETED_AES | COMPLETED_HMAC */ STS_INVALID_ARGS = 4, STS_INTERNAL_ERROR, STS_ERROR } JOB_STS; typedef enum { CBC = 1, CNTR, NULL_CIPHER, DOCSIS_SEC_BPI, #ifndef NO_GCM GCM, #endif /* !NO_GCM */ CUSTOM_CIPHER, DES, DOCSIS_DES, CCM, } JOB_CIPHER_MODE; typedef enum { ENCRYPT = 1, DECRYPT } JOB_CIPHER_DIRECTION; typedef enum { SHA1 = 1, SHA_224, SHA_256, SHA_384, SHA_512, AES_XCBC, MD5, NULL_HASH, #ifndef NO_GCM AES_GMAC, #endif /* !NO_GCM */ CUSTOM_HASH, AES_CCM, } JOB_HASH_ALG; typedef enum { CIPHER_HASH = 1, HASH_CIPHER } JOB_CHAIN_ORDER; typedef enum { AES_128_BYTES = 16, AES_192_BYTES = 24, AES_256_BYTES = 32 } AES_KEY_SIZE_BYTES; typedef struct JOB_AES_HMAC { const void *aes_enc_key_expanded; /* 16-byte aligned pointer. */ const void *aes_dec_key_expanded; UINT64 aes_key_len_in_bytes; /* Only 16, 24, and 32 byte (128, 192 and * 256-bit) keys supported at this time. */ const UINT8 *src; /* Input. May be cipher text or plaintext. * In-place ciphering allowed. */ UINT8 *dst; /*Output. May be cipher text or plaintext. * In-place ciphering allowed, i.e. dst = src. */ UINT64 cipher_start_src_offset_in_bytes; UINT64 msg_len_to_cipher_in_bytes; /* Max len = 65472 bytes. * IPSec case, the maximum cipher * length would be: * 65535 - * 20 (outer IP header) - * 24 (ESP header + IV) - * 12 (supported ICV length) */ UINT64 hash_start_src_offset_in_bytes; UINT64 msg_len_to_hash_in_bytes; /* Max len = 65496 bytes. * (Max cipher len + * 24 bytes ESP header) */ const UINT8 *iv; /* AES IV. */ UINT64 iv_len_in_bytes; /* AES IV length in bytes. */ UINT8 *auth_tag_output; /* HMAC Tag output. This may point to a location * in the src buffer (for in place)*/ UINT64 auth_tag_output_len_in_bytes; /* HMAC Tag output length in bytes. * (May be a truncated value)*/ /* Start algorithm-specific fields */ union { struct _HMAC_specific_fields { /* Hashed result of HMAC key xor'd with ipad (0x36). */ const UINT8 *_hashed_auth_key_xor_ipad; /* Hashed result of HMAC key xor'd with opad (0x5c). 
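                         *
                         * Together the two pre-hashed pads implement
                         * HMAC(K, m) = H((K ^ opad) || H((K ^ ipad) || m)):
                         * the key-dependent first block of the inner and
                         * outer hashes is computed once by the caller so it
                         * does not have to be re-hashed for every job.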
*/ const UINT8 *_hashed_auth_key_xor_opad; } HMAC; struct _AES_XCBC_specific_fields { /* 16-byte aligned pointers */ const UINT32 *_k1_expanded; const UINT8 *_k2; const UINT8 *_k3; } XCBC; struct _AES_CCM_specific_fields { /* Additional Authentication Data (AAD) */ const void *aad; UINT64 aad_len_in_bytes; /* Length of AAD */ } CCM; #ifndef NO_GCM struct _AES_GCM_specific_fields { /* Additional Authentication Data (AAD) */ const void *aad; UINT64 aad_len_in_bytes; /* Length of AAD */ } GCM; #endif /* !NO_GCM */ } u; JOB_STS status; JOB_CIPHER_MODE cipher_mode; /* CBC, CNTR, DES, GCM etc. */ JOB_CIPHER_DIRECTION cipher_direction; /* Encrypt/decrypt */ /* Ignored as the direction is implied by the chain _order field. */ JOB_HASH_ALG hash_alg; /* SHA-1 or others... */ JOB_CHAIN_ORDER chain_order; /* CIPHER_HASH or HASH_CIPHER */ void *user_data; void *user_data2; /* * stateless custom cipher and hash * Return: * success: 0 * fail: other */ int (*cipher_func)(struct JOB_AES_HMAC *); int (*hash_func)(struct JOB_AES_HMAC *); } JOB_AES_HMAC; #define hashed_auth_key_xor_ipad u.HMAC._hashed_auth_key_xor_ipad #define hashed_auth_key_xor_opad u.HMAC._hashed_auth_key_xor_opad #define _k1_expanded u.XCBC._k1_expanded #define _k2 u.XCBC._k2 #define _k3 u.XCBC._k3 #endif /* IMB_JOB_AES_HMAC_H */ intel-ipsec-mb-0.48/libIPSec_MB.def000066400000000000000000000127171321406316400167550ustar00rootroot00000000000000; Copyright (c) 2017, Intel Corporation ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; * Redistributions of source code must retain the above copyright notice, ; this list of conditions and the following disclaimer. ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in the ; documentation and/or other materials provided with the distribution. ; * Neither the name of Intel Corporation nor the names of its contributors ; may be used to endorse or promote products derived from this software ; without specific prior written permission. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
LIBRARY libIPSec_MB.dll EXPORTS aes_gcm_dec_128_avx_gen2 @1 aes_gcm_dec_128_avx_gen4 @2 aes_gcm_dec_128_finalize_avx_gen2 @3 aes_gcm_dec_128_finalize_avx_gen4 @4 aes_gcm_dec_128_finalize_sse @5 aes_gcm_dec_128_sse @6 aes_gcm_dec_128_update_avx_gen2 @7 aes_gcm_dec_128_update_avx_gen4 @8 aes_gcm_dec_128_update_sse @9 aes_gcm_dec_192_avx_gen2 @10 aes_gcm_dec_192_avx_gen4 @11 aes_gcm_dec_192_finalize_avx_gen2 @12 aes_gcm_dec_192_finalize_avx_gen4 @13 aes_gcm_dec_192_finalize_sse @14 aes_gcm_dec_192_sse @15 aes_gcm_dec_192_update_avx_gen2 @16 aes_gcm_dec_192_update_avx_gen4 @17 aes_gcm_dec_192_update_sse @18 aes_gcm_dec_256_avx_gen2 @19 aes_gcm_dec_256_avx_gen4 @20 aes_gcm_dec_256_finalize_avx_gen2 @21 aes_gcm_dec_256_finalize_avx_gen4 @22 aes_gcm_dec_256_finalize_sse @23 aes_gcm_dec_256_sse @24 aes_gcm_dec_256_update_avx_gen2 @25 aes_gcm_dec_256_update_avx_gen4 @26 aes_gcm_dec_256_update_sse @27 aes_gcm_enc_128_avx_gen2 @28 aes_gcm_enc_128_avx_gen4 @29 aes_gcm_enc_128_finalize_avx_gen2 @30 aes_gcm_enc_128_finalize_avx_gen4 @31 aes_gcm_enc_128_finalize_sse @32 aes_gcm_enc_128_sse @33 aes_gcm_enc_128_update_avx_gen2 @34 aes_gcm_enc_128_update_avx_gen4 @35 aes_gcm_enc_128_update_sse @36 aes_gcm_enc_192_avx_gen2 @37 aes_gcm_enc_192_avx_gen4 @38 aes_gcm_enc_192_finalize_avx_gen2 @39 aes_gcm_enc_192_finalize_avx_gen4 @40 aes_gcm_enc_192_finalize_sse @41 aes_gcm_enc_192_sse @42 aes_gcm_enc_192_update_avx_gen2 @43 aes_gcm_enc_192_update_avx_gen4 @44 aes_gcm_enc_192_update_sse @45 aes_gcm_enc_256_avx_gen2 @46 aes_gcm_enc_256_avx_gen4 @47 aes_gcm_enc_256_finalize_avx_gen2 @48 aes_gcm_enc_256_finalize_avx_gen4 @49 aes_gcm_enc_256_finalize_sse @50 aes_gcm_enc_256_sse @51 aes_gcm_enc_256_update_avx_gen2 @52 aes_gcm_enc_256_update_avx_gen4 @53 aes_gcm_enc_256_update_sse @54 aes_gcm_init_128_avx_gen2 @55 aes_gcm_init_128_avx_gen4 @56 aes_gcm_init_128_sse @57 aes_gcm_init_192_avx_gen2 @58 aes_gcm_init_192_avx_gen4 @59 aes_gcm_init_192_sse @60 aes_gcm_init_256_avx_gen2 @61 aes_gcm_init_256_avx_gen4 @62 aes_gcm_init_256_sse @63 aes_gcm_precomp_128_avx_gen2 @64 aes_gcm_precomp_128_avx_gen4 @65 aes_gcm_precomp_128_sse @66 aes_gcm_precomp_192_avx_gen2 @67 aes_gcm_precomp_192_avx_gen4 @68 aes_gcm_precomp_192_sse @69 aes_gcm_precomp_256_avx_gen2 @70 aes_gcm_precomp_256_avx_gen4 @71 aes_gcm_precomp_256_sse @72 aes_keyexp_128_avx @73 aes_keyexp_128_enc_avx @74 aes_keyexp_128_enc_sse @75 aes_keyexp_128_sse @76 aes_keyexp_192_avx @77 aes_keyexp_192_enc_avx @78 aes_keyexp_192_enc_sse @79 aes_keyexp_192_sse @80 aes_keyexp_256_avx @81 aes_keyexp_256_enc_avx @82 aes_keyexp_256_enc_sse @83 aes_keyexp_256_sse @84 aes_xcbc_expand_key_avx @85 aes_xcbc_expand_key_sse @86 des_key_schedule @87 flush_job_avx @88 flush_job_avx2 @89 flush_job_avx512 @90 flush_job_sse @91 init_mb_mgr_avx @92 init_mb_mgr_avx2 @93 init_mb_mgr_avx512 @94 init_mb_mgr_sse @95 md5_one_block_sse @96 queue_size_avx @97 queue_size_avx2 @98 queue_size_avx512 @99 queue_size_sse @100 sha1_one_block_avx @101 sha1_one_block_sse @102 sha224_one_block_avx @103 sha224_one_block_sse @104 sha256_one_block_avx @105 sha256_one_block_sse @106 sha384_one_block_avx @107 sha384_one_block_sse @108 sha512_one_block_avx @109 sha512_one_block_sse @110 sse_sha_ext_usage @111 submit_job_avx @112 submit_job_avx2 @113 submit_job_avx512 @114 submit_job_nocheck_avx @115 submit_job_nocheck_avx2 @116 submit_job_nocheck_avx512 @117 submit_job_nocheck_sse @118 submit_job_sse @119 
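; Each multi-buffer manager entry point above (submit_job, submit_job_nocheck,
; flush_job, queue_size, init_mb_mgr) is exported once per architecture
; (sse, avx, avx2, avx512). Calling the matching init_mb_mgr_*() function
; wires the corresponding handlers into the MB_MGR dispatch table declared
; in mb_mgr.h below.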
intel-ipsec-mb-0.48/mb_mgr.h000066400000000000000000000317151321406316400157000ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2012-2017, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #ifndef IMB_MB_MGR_H #define IMB_MB_MGR_H #include #include "types.h" #include "constants.h" #include "job_aes_hmac.h" #include "asm_types.h" #define MAX_JOBS 128 /* ========================================================================== */ /* AES out-of-order scheduler fields */ typedef struct { AES_ARGS_x8 args; DECLARE_ALIGNED(UINT16 lens[8], 16); /* each nibble is index (0...7) of an unused lane, * the last nibble is set to F as a flag */ UINT64 unused_lanes; JOB_AES_HMAC *job_in_lane[8]; } MB_MGR_AES_OOO; /* ========================================================================== */ /* AES XCBC out-of-order scheduler fields */ typedef struct { DECLARE_ALIGNED(UINT8 final_block[2 * 16], 32); JOB_AES_HMAC *job_in_lane; UINT64 final_done; } XCBC_LANE_DATA; typedef struct { AES_XCBC_ARGS_x8 args; DECLARE_ALIGNED(UINT16 lens[8], 16); /* each byte is index (0...3) of unused lanes * byte 4 is set to FF as a flag */ UINT64 unused_lanes; XCBC_LANE_DATA ldata[8]; } MB_MGR_AES_XCBC_OOO; /* ========================================================================== */ /* CBC-MAC out-of-order scheduler structure */ typedef struct { AES_ARGS_x8 args; /* need to re-use AES arguments */ DECLARE_ALIGNED(UINT16 lens[8], 16); DECLARE_ALIGNED(UINT16 init_done[8], 16); /* each byte is index (0...3) of unused lanes * byte 4 is set to FF as a flag */ UINT64 unused_lanes; JOB_AES_HMAC *job_in_lane[8]; DECLARE_ALIGNED(UINT8 init_blocks[8 * (4 * 16)], 32); } MB_MGR_CBCMAC_OOO; /* ========================================================================== */ /* DES out-of-order scheduler fields */ typedef struct { DES_ARGS_x16 args; DECLARE_ALIGNED(UINT16 lens[16], 16); /* each nibble is index (0...7) of unused lanes * nibble 8 is set to F as a flag */ UINT64 unused_lanes; JOB_AES_HMAC *job_in_lane[16]; UINT32 
num_lanes_inuse; } MB_MGR_DES_OOO; /* ========================================================================== */ /* SHA-HMAC out-of-order scheduler fields */ /* used for SHA1 and SHA256 */ typedef struct { /* YMM aligned access to extra_block */ DECLARE_ALIGNED(UINT8 extra_block[2 * SHA1_BLOCK_SIZE+8], 32); JOB_AES_HMAC *job_in_lane; UINT8 outer_block[64]; UINT32 outer_done; UINT32 extra_blocks; /* num extra blocks (1 or 2) */ UINT32 size_offset; /* offset in extra_block to start of size field */ UINT32 start_offset; /* offset to start of data */ } HMAC_SHA1_LANE_DATA; /* used for SHA512 */ typedef struct { DECLARE_ALIGNED(UINT8 extra_block[2 * SHA_512_BLOCK_SIZE + 16], 32); UINT8 outer_block[SHA_512_BLOCK_SIZE]; JOB_AES_HMAC *job_in_lane; UINT32 outer_done; UINT32 extra_blocks; /* num extra blocks (1 or 2) */ UINT32 size_offset; /* offset in extra_block to start of size field */ UINT32 start_offset; /* offset to start of data */ } HMAC_SHA512_LANE_DATA; /* * unused_lanes contains a list of unused lanes stored as bytes or as * nibbles depending on the arch. The end of list is either FF or F. */ typedef struct { SHA1_ARGS args; DECLARE_ALIGNED(UINT16 lens[16], 32); UINT64 unused_lanes; HMAC_SHA1_LANE_DATA ldata[AVX512_NUM_SHA1_LANES]; UINT32 num_lanes_inuse; } MB_MGR_HMAC_SHA_1_OOO; typedef struct { SHA256_ARGS args; DECLARE_ALIGNED(UINT16 lens[16], 16); UINT64 unused_lanes; HMAC_SHA1_LANE_DATA ldata[AVX512_NUM_SHA256_LANES]; UINT32 num_lanes_inuse; } MB_MGR_HMAC_SHA_256_OOO; typedef struct { SHA512_ARGS args; DECLARE_ALIGNED(UINT16 lens[8], 16); UINT64 unused_lanes; HMAC_SHA512_LANE_DATA ldata[AVX512_NUM_SHA512_LANES]; } MB_MGR_HMAC_SHA_512_OOO; /* ========================================================================== */ /* MD5-HMAC out-of-order scheduler fields */ typedef struct { MD5_ARGS args; DECLARE_ALIGNED(UINT16 lens[AVX512_NUM_MD5_LANES], 16); /* * In the avx2 case, all 16 nibbles of unused lanes are used. 
* In that case num_lanes_inuse is used to detect the end of the list */ UINT64 unused_lanes; HMAC_SHA1_LANE_DATA ldata[AVX512_NUM_MD5_LANES]; UINT32 num_lanes_inuse; } MB_MGR_HMAC_MD5_OOO; /* ========================================================================== */ /* API definitions */ struct MB_MGR; typedef void (*init_mb_mgr_t)(struct MB_MGR *); typedef JOB_AES_HMAC *(*get_next_job_t)(struct MB_MGR *); typedef JOB_AES_HMAC *(*submit_job_t)(struct MB_MGR *); typedef JOB_AES_HMAC *(*get_completed_job_t)(struct MB_MGR *); typedef JOB_AES_HMAC *(*flush_job_t)(struct MB_MGR *); typedef UINT32 (*queue_size_t)(struct MB_MGR *); typedef void (*keyexp_t)(const void *, void *, void *); /* ========================================================================== */ /* TOP LEVEL (MB_MGR) Data structure fields */ typedef struct MB_MGR { MB_MGR_AES_OOO aes128_ooo; MB_MGR_AES_OOO aes192_ooo; MB_MGR_AES_OOO aes256_ooo; MB_MGR_AES_OOO docsis_sec_ooo; MB_MGR_DES_OOO des_enc_ooo; MB_MGR_DES_OOO des_dec_ooo; MB_MGR_DES_OOO docsis_des_enc_ooo; MB_MGR_DES_OOO docsis_des_dec_ooo; MB_MGR_HMAC_SHA_1_OOO hmac_sha_1_ooo; MB_MGR_HMAC_SHA_256_OOO hmac_sha_224_ooo; MB_MGR_HMAC_SHA_256_OOO hmac_sha_256_ooo; MB_MGR_HMAC_SHA_512_OOO hmac_sha_384_ooo; MB_MGR_HMAC_SHA_512_OOO hmac_sha_512_ooo; MB_MGR_HMAC_MD5_OOO hmac_md5_ooo; MB_MGR_AES_XCBC_OOO aes_xcbc_ooo; MB_MGR_CBCMAC_OOO aes_ccm_ooo; /* in-order scheduler fields */ int earliest_job; /* byte offset, -1 if none */ int next_job; /* byte offset */ JOB_AES_HMAC jobs[MAX_JOBS]; /* arch handlers */ get_next_job_t get_next_job; submit_job_t submit_job; submit_job_t submit_job_nocheck; get_completed_job_t get_completed_job; flush_job_t flush_job; queue_size_t queue_size; keyexp_t keyexp_128; keyexp_t keyexp_192; keyexp_t keyexp_256; } MB_MGR; /* * get_next_job returns a job object. This must be filled in and returned * via submit_job before get_next_job is called again. * After submit_job is called, one should call get_completed_job() at least * once (and preferably until it returns NULL). * get_completed_job and flush_job returns a job object. 
This job object ceases * to be usable at the next call to get_next_job */ IMB_DLL_EXPORT void init_mb_mgr_avx(MB_MGR *state); IMB_DLL_EXPORT JOB_AES_HMAC *submit_job_avx(MB_MGR *state); IMB_DLL_EXPORT JOB_AES_HMAC *submit_job_nocheck_avx(MB_MGR *state); IMB_DLL_EXPORT JOB_AES_HMAC *flush_job_avx(MB_MGR *state); IMB_DLL_EXPORT UINT32 queue_size_avx(MB_MGR *state); IMB_DLL_EXPORT void init_mb_mgr_avx2(MB_MGR *state); IMB_DLL_EXPORT JOB_AES_HMAC *submit_job_avx2(MB_MGR *state); IMB_DLL_EXPORT JOB_AES_HMAC *submit_job_nocheck_avx2(MB_MGR *state); IMB_DLL_EXPORT JOB_AES_HMAC *flush_job_avx2(MB_MGR *state); IMB_DLL_EXPORT UINT32 queue_size_avx2(MB_MGR *state); IMB_DLL_EXPORT void init_mb_mgr_avx512(MB_MGR *state); IMB_DLL_EXPORT JOB_AES_HMAC *submit_job_avx512(MB_MGR *state); IMB_DLL_EXPORT JOB_AES_HMAC *submit_job_nocheck_avx512(MB_MGR *state); IMB_DLL_EXPORT JOB_AES_HMAC *flush_job_avx512(MB_MGR *state); IMB_DLL_EXPORT UINT32 queue_size_avx512(MB_MGR *state); IMB_DLL_EXPORT void init_mb_mgr_sse(MB_MGR *state); IMB_DLL_EXPORT JOB_AES_HMAC *submit_job_sse(MB_MGR *state); IMB_DLL_EXPORT JOB_AES_HMAC *submit_job_nocheck_sse(MB_MGR *state); IMB_DLL_EXPORT JOB_AES_HMAC *flush_job_sse(MB_MGR *state); IMB_DLL_EXPORT UINT32 queue_size_sse(MB_MGR *state); enum SHA_EXTENSION_USAGE { SHA_EXT_NOT_PRESENT = 0, /* don't detect and don't use SHA extensions */ SHA_EXT_PRESENT, /* don't detect and use SHA extensions */ SHA_EXT_DETECT, /* default - detect & use SHA extensions if present */ }; extern enum SHA_EXTENSION_USAGE sse_sha_ext_usage; #define get_completed_job_avx get_completed_job_sse #define get_next_job_avx get_next_job_sse #define get_completed_job_avx2 get_completed_job_sse #define get_next_job_avx2 get_next_job_sse #define get_completed_job_avx512 get_completed_job_sse #define get_next_job_avx512 get_next_job_sse /* * JOBS() and ADV_JOBS() also used in mb_mgr_code.h * index in JOBS array using byte offset rather than object index */ __forceinline JOB_AES_HMAC *JOBS(MB_MGR *state, const int offset) { char *cp = (char *)state->jobs; return (JOB_AES_HMAC *)(cp + offset); } __forceinline void ADV_JOBS(int *ptr) { *ptr += sizeof(JOB_AES_HMAC); if (*ptr >= (int) (MAX_JOBS * sizeof(JOB_AES_HMAC))) *ptr = 0; } __forceinline JOB_AES_HMAC * get_completed_job_sse(MB_MGR *state) { JOB_AES_HMAC *job; if (state->earliest_job < 0) return NULL; job = JOBS(state, state->earliest_job); if (job->status < STS_COMPLETED) return NULL; ADV_JOBS(&state->earliest_job); if (state->earliest_job == state->next_job) state->earliest_job = -1; return job; } __forceinline JOB_AES_HMAC * get_next_job_sse(MB_MGR *state) { return JOBS(state, state->next_job); } /* * Wrapper macros to call arch API's set up * at init phase of multi-buffer manager. * * For example, after calling init_mb_mgr_sse(&mgr) * The 'mgr' structure be set up so that: * mgr.get_next_job will point to get_next_job_sse(), * mgr.submit_job will point to submit_job_sse(), * mgr.submit_job_nocheck will point to submit_job_nocheck_sse(), * mgr.get_completed_job will point to get_completed_job_sse(), * mgr.flush_job will point to flush_job_sse(), * mgr.queue_size will point to queue_size_sse() * mgr.keyexp_128 will point to aes_keyexp_128_sse() * mgr.keyexp_192 will point to aes_keyexp_192_sse() * mgr.keyexp_256 will point to aes_keyexp_256_sse() * * Direct use of arch API's may result in better performance. * Using below indirect interface may produce slightly worse performance but * it can simplify application implementation. 
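 *
 * A minimal sketch of the indirect interface for an AES-128-CBC encrypt plus
 * HMAC-SHA1 job (illustrative only; key expansion, ipad/opad pre-hashing and
 * the enc_keys/dec_keys/src/dst/iv/tag buffers are assumed to be prepared by
 * the caller, and len must be a non-zero multiple of 16 bytes):
 *
 *     MB_MGR mgr;
 *     JOB_AES_HMAC *job;
 *
 *     init_mb_mgr_sse(&mgr);               // or the avx/avx2/avx512 variant
 *
 *     job = IMB_GET_NEXT_JOB(&mgr);
 *     job->cipher_mode = CBC;
 *     job->cipher_direction = ENCRYPT;
 *     job->chain_order = CIPHER_HASH;
 *     job->hash_alg = SHA1;
 *     job->aes_enc_key_expanded = enc_keys;
 *     job->aes_dec_key_expanded = dec_keys;
 *     job->aes_key_len_in_bytes = 16;
 *     job->src = src;
 *     job->dst = dst;
 *     job->cipher_start_src_offset_in_bytes = 0;
 *     job->msg_len_to_cipher_in_bytes = len;
 *     job->hash_start_src_offset_in_bytes = 0;
 *     job->msg_len_to_hash_in_bytes = len;
 *     job->iv = iv;
 *     job->iv_len_in_bytes = 16;
 *     job->auth_tag_output = tag;
 *     job->auth_tag_output_len_in_bytes = 12;   // SHA1 tag is truncated to 12
 *     job->hashed_auth_key_xor_ipad = ipad_hash;
 *     job->hashed_auth_key_xor_opad = opad_hash;
 *
 *     job = IMB_SUBMIT_JOB(&mgr);          // may return an earlier job or NULL
 *     while (job != NULL) {
 *             // consume the completed job; check job->status == STS_COMPLETED
 *             job = IMB_GET_COMPLETED_JOB(&mgr);
 *     }
 *     // at shutdown, drain jobs still held by the out-of-order schedulers
 *     while ((job = IMB_FLUSH_JOB(&mgr)) != NULL)
 *             ;  // consume each flushed job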
* LibTestApp provides example of using the indirect interface. */ #define IMB_GET_NEXT_JOB(_mgr) ((_mgr)->get_next_job((_mgr))) #define IMB_SUBMIT_JOB(_mgr) ((_mgr)->submit_job((_mgr))) #define IMB_SUBMIT_JOB_NOCHECK(_mgr) ((_mgr)->submit_job_nocheck((_mgr))) #define IMB_GET_COMPLETED_JOB(_mgr) ((_mgr)->get_completed_job((_mgr))) #define IMB_FLUSH_JOB(_mgr) ((_mgr)->flush_job((_mgr))) #define IMB_QUEUE_SIZE(_mgr) ((_mgr)->queue_size((_mgr))) #define IMB_AES_KEYEXP_128(_mgr, _raw, _enc, _dec) \ ((_mgr)->keyexp_128((_raw), (_enc), (_dec))) #define IMB_AES_KEYEXP_192(_mgr, _raw, _enc, _dec) \ ((_mgr)->keyexp_192((_raw), (_enc), (_dec))) #define IMB_AES_KEYEXP_256(_mgr, _raw, _enc, _dec) \ ((_mgr)->keyexp_256((_raw), (_enc), (_dec))) #endif /* IMB_MB_MGR_H */ intel-ipsec-mb-0.48/mb_mgr_code.h000066400000000000000000001473741321406316400167030ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2012-2017, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ /* * This contains the bulk of the mb_mgr code, with #define's to build * an SSE, AVX, AVX2 or AVX512 version (see mb_mgr_sse.c, mb_mgr_avx.c, etc.) * * get_next_job() returns a job object. This must be filled in and returned * via submit_job() before get_next_job() is called again. * * submit_job() and flush_job() returns a job object. 
This job object ceases * to be usable at the next call to get_next_job() * * Assume JOBS() and ADV_JOBS() from mb_mgr_code.h are available */ #include /* memcpy(), memset() */ /* ========================================================================= */ /* Lower level "out of order" schedulers */ /* ========================================================================= */ __forceinline JOB_AES_HMAC * SUBMIT_JOB_AES128_DEC(JOB_AES_HMAC *job) { AES_CBC_DEC_128(job->src + job->cipher_start_src_offset_in_bytes, job->iv, job->aes_dec_key_expanded, job->dst, job->msg_len_to_cipher_in_bytes & (~15)); job->status |= STS_COMPLETED_AES; return job; } __forceinline JOB_AES_HMAC * SUBMIT_JOB_AES192_DEC(JOB_AES_HMAC *job) { AES_CBC_DEC_192(job->src + job->cipher_start_src_offset_in_bytes, job->iv, job->aes_dec_key_expanded, job->dst, job->msg_len_to_cipher_in_bytes); job->status |= STS_COMPLETED_AES; return job; } __forceinline JOB_AES_HMAC * SUBMIT_JOB_AES256_DEC(JOB_AES_HMAC *job) { AES_CBC_DEC_256(job->src + job->cipher_start_src_offset_in_bytes, job->iv, job->aes_dec_key_expanded, job->dst, job->msg_len_to_cipher_in_bytes); job->status |= STS_COMPLETED_AES; return job; } __forceinline JOB_AES_HMAC * SUBMIT_JOB_AES128_CNTR(JOB_AES_HMAC *job) { AES_CNTR_128(job->src + job->cipher_start_src_offset_in_bytes, job->iv, job->aes_enc_key_expanded, job->dst, job->msg_len_to_cipher_in_bytes, job->iv_len_in_bytes); job->status |= STS_COMPLETED_AES; return job; } __forceinline JOB_AES_HMAC * SUBMIT_JOB_AES192_CNTR(JOB_AES_HMAC *job) { AES_CNTR_192(job->src + job->cipher_start_src_offset_in_bytes, job->iv, job->aes_enc_key_expanded, job->dst, job->msg_len_to_cipher_in_bytes, job->iv_len_in_bytes); job->status |= STS_COMPLETED_AES; return job; } __forceinline JOB_AES_HMAC * SUBMIT_JOB_AES256_CNTR(JOB_AES_HMAC *job) { AES_CNTR_256(job->src + job->cipher_start_src_offset_in_bytes, job->iv, job->aes_enc_key_expanded, job->dst, job->msg_len_to_cipher_in_bytes, job->iv_len_in_bytes); job->status |= STS_COMPLETED_AES; return job; } /* ========================================================================= */ /* AES-CCM */ /* ========================================================================= */ __forceinline JOB_AES_HMAC * submit_flush_job_aes_ccm(MB_MGR_CBCMAC_OOO *state, JOB_AES_HMAC *job, const unsigned max_jobs, const int is_submit) { const unsigned lane_blocks_size = 64; const unsigned aad_len_size = 2; unsigned lane, min_len, min_idx; JOB_AES_HMAC *ret_job = NULL; uint8_t *pb = NULL; unsigned i; if (is_submit) { /* * SUBMIT * - get a free lane id */ const unsigned L = AES_BLOCK_SIZE - 1 - (unsigned) job->iv_len_in_bytes; lane = state->unused_lanes & 15; state->unused_lanes >>= 4; pb = &state->init_blocks[lane * lane_blocks_size]; /* * Build IV for AES-CTR-128. * - byte 0: flags with L' * - bytes 1 to 13: nonce * - zero bytes after nonce (up to byte 15) * * First AES block of init_blocks will always hold this format * throughtout job processing. 
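                 *
                 * For example, with a 13 byte nonce (L = 15 - 13 = 2,
                 * so L' = 1) the block is laid out as (illustrative):
                 *   pb[0]     = 0x01          flags = L' = L - 1
                 *   pb[1..13] = nonce bytes
                 *   pb[14,15] = 0x00, 0x00    counter, driven by AES-CTR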
*/ memset(&pb[8], 0, 8); pb[0] = (uint8_t) L - 1; /* flags = L` = L - 1 */ /* nonce 7 to 13 */ memcpy(&pb[1], job->iv, job->iv_len_in_bytes); if (job->cipher_direction != ENCRYPT) { /* decrypt before authentication */ pb[15] = 1; AES_CNTR_128(job->src + job->cipher_start_src_offset_in_bytes, pb, job->aes_enc_key_expanded, job->dst, job->msg_len_to_cipher_in_bytes, AES_BLOCK_SIZE); } /* copy job data in and set up inital blocks */ state->job_in_lane[lane] = job; state->lens[lane] = AES_BLOCK_SIZE; state->init_done[lane] = 0; state->args.in[lane] = pb; state->args.keys[lane] = job->aes_enc_key_expanded; memset(&state->args.IV[lane], 0, sizeof(state->args.IV[0])); /* * Convert AES-CTR IV into BLOCK 0 for CBC-MAC-128: * - correct flags by adding M' (AAD later) * - put message length */ pb[0] |= ((job->auth_tag_output_len_in_bytes - 2) >> 1) << 3; pb[14] = (uint8_t) (job->msg_len_to_hash_in_bytes >> 8); pb[15] = (uint8_t) job->msg_len_to_hash_in_bytes; /* Make AAD correction and put together AAD blocks, if any */ if (job->u.CCM.aad_len_in_bytes != 0) { /* * - increment length by length of AAD and * AAD length size * - add AAD present flag * - copy AAD to the lane initial blocks * - zero trailing block bytes */ const unsigned aadl = (unsigned) job->u.CCM.aad_len_in_bytes + aad_len_size; state->lens[lane] += (aadl + AES_BLOCK_SIZE - 1) & (~(AES_BLOCK_SIZE - 1)); pb[0] |= 0x40; pb[AES_BLOCK_SIZE + 0] = (uint8_t) (job->u.CCM.aad_len_in_bytes >> 8); pb[AES_BLOCK_SIZE + 1] = (uint8_t) job->u.CCM.aad_len_in_bytes; memcpy(&pb[AES_BLOCK_SIZE + aad_len_size], job->u.CCM.aad, job->u.CCM.aad_len_in_bytes); memset(&pb[AES_BLOCK_SIZE + aadl], 0, state->lens[lane] - aadl); } /* enough jobs to start processing? */ if (state->unused_lanes != 0xf) return NULL; } else { /* * FLUSH * - find 1st non null job */ for (lane = 0; lane < max_jobs; lane++) if (state->job_in_lane[lane] != NULL) break; if (lane >= max_jobs) return NULL; /* no not null job */ } ccm_round: if (is_submit) { /* * SUBMIT * - find min common length to process */ min_idx = 0; min_len = state->lens[0]; for (i = 1; i < max_jobs; i++) { if (min_len > state->lens[i]) { min_idx = i; min_len = state->lens[i]; } } } else { /* * FLUSH * - copy good (not null) lane onto empty lanes * - find min common length to process across not null lanes */ min_idx = lane; min_len = state->lens[lane]; for (i = 0; i < max_jobs; i++) { if (i == lane) continue; if (state->job_in_lane[i] != NULL) { if (min_len > state->lens[i]) { min_idx = i; min_len = state->lens[i]; } } else { state->args.in[i] = state->args.in[lane]; state->args.keys[i] = state->args.keys[lane]; state->args.IV[i] = state->args.IV[lane]; state->lens[i] = UINT16_MAX; state->init_done[i] = state->init_done[lane]; } } } /* subtract min len from all lanes */ for (i = 0; i < max_jobs; i++) state->lens[i] -= min_len; /* run the algorythmic code on selected blocks */ if (min_len != 0) AES128_CBC_MAC(&state->args, min_len); ret_job = state->job_in_lane[min_idx]; pb = &state->init_blocks[min_idx * lane_blocks_size]; if (state->init_done[min_idx] == 0) { /* * First block and AAD blocks are done. * Full message blocks are to do. 
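         *
         * init_done tracks per-lane progress: 0 = B_0 and AAD blocks,
         * 1 = full 16 byte message blocks, 2 = final partial block.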
*/ if (ret_job->cipher_direction == ENCRYPT) state->args.in[min_idx] = ret_job->src + ret_job->hash_start_src_offset_in_bytes; else state->args.in[min_idx] = ret_job->dst; state->init_done[min_idx] = 1; if (ret_job->msg_len_to_hash_in_bytes & (~15)) { /* first block + AAD done - process message blocks */ state->lens[min_idx] = ret_job->msg_len_to_hash_in_bytes & (~15); goto ccm_round; } } if (state->init_done[min_idx] == 1 && (ret_job->msg_len_to_hash_in_bytes & 15)) { /* * First block, AAD, message blocks are done. * Partial message block is still to do. */ state->init_done[min_idx] = 2; state->lens[min_idx] = AES_BLOCK_SIZE; memset(&pb[AES_BLOCK_SIZE], 0, AES_BLOCK_SIZE); memcpy(&pb[AES_BLOCK_SIZE], state->args.in[min_idx], (size_t) ret_job->msg_len_to_hash_in_bytes & 15); state->args.in[min_idx] = &pb[AES_BLOCK_SIZE]; goto ccm_round; } /* * Final XOR with AES-CNTR on B_0 * - remove M' and AAD presence bits from flags * - set counter to 0 */ pb[0] = pb[0] & 7; pb[14] = 0; pb[15] = 0; /* * Clever use of AES-CTR mode saves a few ops here. * What AES-CCM authentication requires us to do is: * AES-CCM: E(KEY,B_0) XOR IV_CBC_MAC * * And what AES_CTR offers is: * AES_CTR: E(KEY, NONCE|COUNTER) XOR PLAIN_TEXT * * So if: * B_0 is passed instead of NONCE|COUNTER and IV instead of PLAIN_TESXT * then AES_CTR function is doing pretty much what we need. * On top of it can truncate the authentication tag and copy to * destination. */ AES_CNTR_128(&state->args.IV[min_idx] /* src = IV */, pb /* nonce/iv = B_0 */, state->args.keys[min_idx], ret_job->auth_tag_output /* dst */, ret_job->auth_tag_output_len_in_bytes /* num_bytes */, AES_BLOCK_SIZE /* nonce/iv len */); if (ret_job->cipher_direction == ENCRYPT) { /* encrypt after authentication */ pb[15] = 1; /* start from counter 1, not 0 */ AES_CNTR_128(ret_job->src + ret_job->cipher_start_src_offset_in_bytes, pb, ret_job->aes_enc_key_expanded, ret_job->dst, ret_job->msg_len_to_cipher_in_bytes, AES_BLOCK_SIZE); } /* put back processed packet into unused lanes, set job as complete */ state->unused_lanes = (state->unused_lanes << 4) | min_idx; ret_job = state->job_in_lane[min_idx]; ret_job->status |= (STS_COMPLETED_HMAC|STS_COMPLETED_AES); state->job_in_lane[min_idx] = NULL; return ret_job; } static JOB_AES_HMAC * submit_job_aes_ccm_auth_arch(MB_MGR_CBCMAC_OOO *state, JOB_AES_HMAC *job) { return submit_flush_job_aes_ccm(state, job, AES_CCM_MAX_JOBS, 1); } static JOB_AES_HMAC * flush_job_aes_ccm_auth_arch(MB_MGR_CBCMAC_OOO *state) { return submit_flush_job_aes_ccm(state, NULL, AES_CCM_MAX_JOBS, 0); } /* ========================================================================= */ /* AES-GCM */ /* ========================================================================= */ #ifndef NO_GCM __forceinline JOB_AES_HMAC * SUBMIT_JOB_AES_GCM_DEC(JOB_AES_HMAC *job) { DECLARE_ALIGNED(struct gcm_context_data ctx, 16); if (16 == job->aes_key_len_in_bytes) AES_GCM_DEC_128(job->aes_dec_key_expanded, &ctx, job->dst, job->src + job->cipher_start_src_offset_in_bytes, job->msg_len_to_cipher_in_bytes, job->iv, job->u.GCM.aad, job->u.GCM.aad_len_in_bytes, job->auth_tag_output, job->auth_tag_output_len_in_bytes); else if (24 == job->aes_key_len_in_bytes) AES_GCM_DEC_192(job->aes_dec_key_expanded, &ctx, job->dst, job->src + job->cipher_start_src_offset_in_bytes, job->msg_len_to_cipher_in_bytes, job->iv, job->u.GCM.aad, job->u.GCM.aad_len_in_bytes, job->auth_tag_output, job->auth_tag_output_len_in_bytes); else AES_GCM_DEC_256(job->aes_dec_key_expanded, &ctx, job->dst, job->src + 
job->cipher_start_src_offset_in_bytes, job->msg_len_to_cipher_in_bytes, job->iv, job->u.GCM.aad, job->u.GCM.aad_len_in_bytes, job->auth_tag_output, job->auth_tag_output_len_in_bytes); job->status = STS_COMPLETED; return job; } __forceinline JOB_AES_HMAC * SUBMIT_JOB_AES_GCM_ENC(JOB_AES_HMAC *job) { DECLARE_ALIGNED(struct gcm_context_data ctx, 16); if (16 == job->aes_key_len_in_bytes) AES_GCM_ENC_128(job->aes_dec_key_expanded, &ctx, job->dst, job->src + job->cipher_start_src_offset_in_bytes, job->msg_len_to_cipher_in_bytes, job->iv, job->u.GCM.aad, job->u.GCM.aad_len_in_bytes, job->auth_tag_output, job->auth_tag_output_len_in_bytes); else if (24 == job->aes_key_len_in_bytes) AES_GCM_ENC_192(job->aes_dec_key_expanded, &ctx, job->dst, job->src + job->cipher_start_src_offset_in_bytes, job->msg_len_to_cipher_in_bytes, job->iv, job->u.GCM.aad, job->u.GCM.aad_len_in_bytes, job->auth_tag_output, job->auth_tag_output_len_in_bytes); else AES_GCM_ENC_256(job->aes_dec_key_expanded, &ctx, job->dst, job->src + job->cipher_start_src_offset_in_bytes, job->msg_len_to_cipher_in_bytes, job->iv, job->u.GCM.aad, job->u.GCM.aad_len_in_bytes, job->auth_tag_output, job->auth_tag_output_len_in_bytes); job->status = STS_COMPLETED; return job; } #endif /* !NO_GCM */ /* ========================================================================= */ /* Custom hash / cipher */ /* ========================================================================= */ __forceinline JOB_AES_HMAC * JOB_CUSTOM_CIPHER(JOB_AES_HMAC *job) { if (!(job->status & STS_COMPLETED_AES)) { if (job->cipher_func(job)) job->status = STS_INTERNAL_ERROR; else job->status |= STS_COMPLETED_AES; } return job; } __forceinline JOB_AES_HMAC * SUBMIT_JOB_CUSTOM_CIPHER(JOB_AES_HMAC *job) { return JOB_CUSTOM_CIPHER(job); } __forceinline JOB_AES_HMAC * FLUSH_JOB_CUSTOM_CIPHER(JOB_AES_HMAC *job) { return JOB_CUSTOM_CIPHER(job); } __forceinline JOB_AES_HMAC * JOB_CUSTOM_HASH(JOB_AES_HMAC *job) { if (!(job->status & STS_COMPLETED_HMAC)) { if (job->hash_func(job)) job->status = STS_INTERNAL_ERROR; else job->status |= STS_COMPLETED_HMAC; } return job; } __forceinline JOB_AES_HMAC * SUBMIT_JOB_CUSTOM_HASH(JOB_AES_HMAC *job) { return JOB_CUSTOM_HASH(job); } __forceinline JOB_AES_HMAC * FLUSH_JOB_CUSTOM_HASH(JOB_AES_HMAC *job) { return JOB_CUSTOM_HASH(job); } /* ========================================================================= */ /* DOCSIS AES (AES128 CBC + AES128 CFB) */ /* ========================================================================= */ #define AES_BLOCK_SIZE 16 /** * @brief Encrypts/decrypts the last partial block for DOCSIS SEC v3.1 BPI * * The last partial block is encrypted/decrypted using AES CFB128. * IV is always the next last ciphered block. * * @note It is assumed that length is bigger than one AES 128 block. 
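 *
 * For example (sizes illustrative): a 30 byte DOCSIS payload is ciphered as
 * one 16 byte AES-CBC block plus a 14 byte tail handled here with
 * AES-CFB-128, keyed with the encryption key and using the last full
 * ciphertext block as the CFB IV.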
* * @param job desriptor of performed crypto operation * @return It always returns value passed in \a job */ __forceinline JOB_AES_HMAC * DOCSIS_LAST_BLOCK(JOB_AES_HMAC *job) { const void *iv = NULL; UINT64 offset = 0; UINT64 partial_bytes = 0; if (job == NULL) return job; IMB_ASSERT((job->cipher_direction == DECRYPT) || (job->status & STS_COMPLETED_AES)); partial_bytes = job->msg_len_to_cipher_in_bytes & (AES_BLOCK_SIZE - 1); offset = job->msg_len_to_cipher_in_bytes & (~(AES_BLOCK_SIZE - 1)); if (!partial_bytes) return job; /* in either case IV has to be next last ciphered block */ if (job->cipher_direction == ENCRYPT) iv = job->dst + offset - AES_BLOCK_SIZE; else iv = job->src + job->cipher_start_src_offset_in_bytes + offset - AES_BLOCK_SIZE; IMB_ASSERT(partial_bytes <= AES_BLOCK_SIZE); AES_CFB_128_ONE(job->dst + offset, job->src + job->cipher_start_src_offset_in_bytes + offset, iv, job->aes_enc_key_expanded, partial_bytes); return job; } /** * @brief Encrypts/decrypts the first and only partial block for * DOCSIS SEC v3.1 BPI * * The first partial block is encrypted/decrypted using AES CFB128. * * @param job desriptor of performed crypto operation * @return It always returns value passed in \a job */ __forceinline JOB_AES_HMAC * DOCSIS_FIRST_BLOCK(JOB_AES_HMAC *job) { IMB_ASSERT(!(job->status & STS_COMPLETED_AES)); IMB_ASSERT(job->msg_len_to_cipher_in_bytes <= AES_BLOCK_SIZE); AES_CFB_128_ONE(job->dst, job->src + job->cipher_start_src_offset_in_bytes, job->iv, job->aes_enc_key_expanded, job->msg_len_to_cipher_in_bytes); job->status |= STS_COMPLETED_AES; return job; } /* ========================================================================= */ /* DES and DOCSIS DES (DES CBC + DES CFB) */ /* ========================================================================= */ /** * @brief DOCSIS DES cipher encryption * * @param job desriptor of performed crypto operation * @return It always returns value passed in \a job */ __forceinline JOB_AES_HMAC * DOCSIS_DES_ENC(JOB_AES_HMAC *job) { IMB_ASSERT(!(job->status & STS_COMPLETED_AES)); docsis_des_enc_basic(job->src + job->cipher_start_src_offset_in_bytes, job->dst, (int) job->msg_len_to_cipher_in_bytes, job->aes_enc_key_expanded, (const uint64_t *)job->iv); job->status |= STS_COMPLETED_AES; return job; } /** * @brief DOCSIS DES cipher decryption * * @param job desriptor of performed crypto operation * @return It always returns value passed in \a job */ __forceinline JOB_AES_HMAC * DOCSIS_DES_DEC(JOB_AES_HMAC *job) { IMB_ASSERT(!(job->status & STS_COMPLETED_AES)); docsis_des_dec_basic(job->src + job->cipher_start_src_offset_in_bytes, job->dst, (int) job->msg_len_to_cipher_in_bytes, job->aes_dec_key_expanded, (const uint64_t *)job->iv); job->status |= STS_COMPLETED_AES; return job; } /** * @brief DES cipher encryption * * @param job desriptor of performed crypto operation * @return It always returns value passed in \a job */ __forceinline JOB_AES_HMAC * DES_CBC_ENC(JOB_AES_HMAC *job) { IMB_ASSERT(!(job->status & STS_COMPLETED_AES)); des_enc_cbc_basic(job->src + job->cipher_start_src_offset_in_bytes, job->dst, job->msg_len_to_cipher_in_bytes & (~(DES_BLOCK_SIZE - 1)), job->aes_enc_key_expanded, (const uint64_t *)job->iv); job->status |= STS_COMPLETED_AES; return job; } /** * @brief DES cipher decryption * * @param job desriptor of performed crypto operation * @return It always returns value passed in \a job */ __forceinline JOB_AES_HMAC * DES_CBC_DEC(JOB_AES_HMAC *job) { IMB_ASSERT(!(job->status & STS_COMPLETED_AES)); des_dec_cbc_basic(job->src + 
job->cipher_start_src_offset_in_bytes, job->dst, job->msg_len_to_cipher_in_bytes & (~(DES_BLOCK_SIZE - 1)), job->aes_dec_key_expanded, (const uint64_t *)job->iv); job->status |= STS_COMPLETED_AES; return job; } /* ========================================================================= */ /* Cipher submit & flush functions */ /* ========================================================================= */ __forceinline JOB_AES_HMAC * SUBMIT_JOB_AES_ENC(MB_MGR *state, JOB_AES_HMAC *job) { if (CBC == job->cipher_mode) { if (16 == job->aes_key_len_in_bytes) { return SUBMIT_JOB_AES128_ENC(&state->aes128_ooo, job); } else if (24 == job->aes_key_len_in_bytes) { return SUBMIT_JOB_AES192_ENC(&state->aes192_ooo, job); } else { /* assume 32 */ return SUBMIT_JOB_AES256_ENC(&state->aes256_ooo, job); } } else if (CNTR == job->cipher_mode) { if (16 == job->aes_key_len_in_bytes) { return SUBMIT_JOB_AES128_CNTR(job); } else if (24 == job->aes_key_len_in_bytes) { return SUBMIT_JOB_AES192_CNTR(job); } else { /* assume 32 */ return SUBMIT_JOB_AES256_CNTR(job); } } else if (DOCSIS_SEC_BPI == job->cipher_mode) { if (job->msg_len_to_cipher_in_bytes >= AES_BLOCK_SIZE) { JOB_AES_HMAC *tmp; tmp = SUBMIT_JOB_AES128_ENC(&state->docsis_sec_ooo, job); return DOCSIS_LAST_BLOCK(tmp); } else return DOCSIS_FIRST_BLOCK(job); #ifndef NO_GCM } else if (GCM == job->cipher_mode) { return SUBMIT_JOB_AES_GCM_ENC(job); #endif /* NO_GCM */ } else if (CUSTOM_CIPHER == job->cipher_mode) { return SUBMIT_JOB_CUSTOM_CIPHER(job); } else if (DES == job->cipher_mode) { #ifdef SUBMIT_JOB_DES_CBC_ENC return SUBMIT_JOB_DES_CBC_ENC(&state->des_enc_ooo, job); #else return DES_CBC_ENC(job); #endif /* SUBMIT_JOB_DES_CBC_ENC */ } else if (DOCSIS_DES == job->cipher_mode) { #ifdef SUBMIT_JOB_DOCSIS_DES_ENC return SUBMIT_JOB_DOCSIS_DES_ENC(&state->docsis_des_enc_ooo, job); #else return DOCSIS_DES_ENC(job); #endif /* SUBMIT_JOB_DOCSIS_DES_ENC */ } else { /* assume NUL_CIPHER or CCM */ job->status |= STS_COMPLETED_AES; return job; } } __forceinline JOB_AES_HMAC * FLUSH_JOB_AES_ENC(MB_MGR *state, JOB_AES_HMAC *job) { if (CBC == job->cipher_mode) { if (16 == job->aes_key_len_in_bytes) { return FLUSH_JOB_AES128_ENC(&state->aes128_ooo); } else if (24 == job->aes_key_len_in_bytes) { return FLUSH_JOB_AES192_ENC(&state->aes192_ooo); } else { /* assume 32 */ return FLUSH_JOB_AES256_ENC(&state->aes256_ooo); } } else if (DOCSIS_SEC_BPI == job->cipher_mode) { JOB_AES_HMAC *tmp; tmp = FLUSH_JOB_AES128_ENC(&state->docsis_sec_ooo); return DOCSIS_LAST_BLOCK(tmp); #ifdef FLUSH_JOB_DES_CBC_ENC } else if (DES == job->cipher_mode) { return FLUSH_JOB_DES_CBC_ENC(&state->des_enc_ooo); #endif /* FLUSH_JOB_DES_CBC_ENC */ #ifdef FLUSH_JOB_DOCSIS_DES_ENC } else if (DOCSIS_DES == job->cipher_mode) { return FLUSH_JOB_DOCSIS_DES_ENC(&state->docsis_des_enc_ooo); #endif /* FLUSH_JOB_DOCSIS_DES_ENC */ } else if (CUSTOM_CIPHER == job->cipher_mode) { return FLUSH_JOB_CUSTOM_CIPHER(job); } else { /* assume CNTR, CCM or NULL_CIPHER */ return NULL; } } __forceinline JOB_AES_HMAC * SUBMIT_JOB_AES_DEC(MB_MGR *state, JOB_AES_HMAC *job) { if (CBC == job->cipher_mode) { if (16 == job->aes_key_len_in_bytes) { return SUBMIT_JOB_AES128_DEC(job); } else if (24 == job->aes_key_len_in_bytes) { return SUBMIT_JOB_AES192_DEC(job); } else { /* assume 32 */ return SUBMIT_JOB_AES256_DEC(job); } } else if (CNTR == job->cipher_mode) { if (16 == job->aes_key_len_in_bytes) { return SUBMIT_JOB_AES128_CNTR(job); } else if (24 == job->aes_key_len_in_bytes) { return SUBMIT_JOB_AES192_CNTR(job); } else { /* 
assume 32 */ return SUBMIT_JOB_AES256_CNTR(job); } } else if (DOCSIS_SEC_BPI == job->cipher_mode) { if (job->msg_len_to_cipher_in_bytes >= AES_BLOCK_SIZE) { DOCSIS_LAST_BLOCK(job); return SUBMIT_JOB_AES128_DEC(job); } else { return DOCSIS_FIRST_BLOCK(job); } #ifndef NO_GCM } else if (GCM == job->cipher_mode) { return SUBMIT_JOB_AES_GCM_DEC(job); #endif /* NO_GCM */ } else if (DES == job->cipher_mode) { #ifdef SUBMIT_JOB_DES_CBC_DEC return SUBMIT_JOB_DES_CBC_DEC(&state->des_dec_ooo, job); #else (void) state; return DES_CBC_DEC(job); #endif /* SUBMIT_JOB_DES_CBC_DEC */ } else if (DOCSIS_DES == job->cipher_mode) { #ifdef SUBMIT_JOB_DOCSIS_DES_DEC return SUBMIT_JOB_DOCSIS_DES_DEC(&state->docsis_des_dec_ooo, job); #else return DOCSIS_DES_DEC(job); #endif /* SUBMIT_JOB_DOCSIS_DES_DEC */ } else if (CUSTOM_CIPHER == job->cipher_mode) { return SUBMIT_JOB_CUSTOM_CIPHER(job); } else { /* assume NULL cipher or CCM */ job->status |= STS_COMPLETED_AES; return job; } } __forceinline JOB_AES_HMAC * FLUSH_JOB_AES_DEC(MB_MGR *state, JOB_AES_HMAC *job) { #ifdef FLUSH_JOB_DES_CBC_DEC if (DES == job->cipher_mode) return FLUSH_JOB_DES_CBC_DEC(&state->des_dec_ooo); #endif /* FLUSH_JOB_DES_CBC_DEC */ #ifdef FLUSH_JOB_DOCSIS_DES_DEC if (DOCSIS_DES == job->cipher_mode) return FLUSH_JOB_DOCSIS_DES_DEC(&state->docsis_des_dec_ooo); #endif /* FLUSH_JOB_DOCSIS_DES_DEC */ (void) state; return SUBMIT_JOB_AES_DEC(state, job); } /* ========================================================================= */ /* Hash submit & flush functions */ /* ========================================================================= */ __forceinline JOB_AES_HMAC * SUBMIT_JOB_HASH(MB_MGR *state, JOB_AES_HMAC *job) { #ifdef VERBOSE printf("--------Enter SUBMIT_JOB_HASH --------------\n"); #endif switch (job->hash_alg) { case SHA1: #ifdef HASH_USE_SHAEXT if (HASH_USE_SHAEXT == SHA_EXT_PRESENT) return SUBMIT_JOB_HMAC_NI(&state->hmac_sha_1_ooo, job); #endif return SUBMIT_JOB_HMAC(&state->hmac_sha_1_ooo, job); case SHA_224: #ifdef HASH_USE_SHAEXT if (HASH_USE_SHAEXT == SHA_EXT_PRESENT) return SUBMIT_JOB_HMAC_SHA_224_NI (&state->hmac_sha_224_ooo, job); #endif return SUBMIT_JOB_HMAC_SHA_224(&state->hmac_sha_224_ooo, job); case SHA_256: #ifdef HASH_USE_SHAEXT if (HASH_USE_SHAEXT == SHA_EXT_PRESENT) return SUBMIT_JOB_HMAC_SHA_256_NI (&state->hmac_sha_256_ooo, job); #endif return SUBMIT_JOB_HMAC_SHA_256(&state->hmac_sha_256_ooo, job); case SHA_384: return SUBMIT_JOB_HMAC_SHA_384(&state->hmac_sha_384_ooo, job); case SHA_512: return SUBMIT_JOB_HMAC_SHA_512(&state->hmac_sha_512_ooo, job); case AES_XCBC: return SUBMIT_JOB_AES_XCBC(&state->aes_xcbc_ooo, job); case MD5: return SUBMIT_JOB_HMAC_MD5(&state->hmac_md5_ooo, job); case CUSTOM_HASH: return SUBMIT_JOB_CUSTOM_HASH(job); case AES_CCM: return SUBMIT_JOB_AES_CCM_AUTH(&state->aes_ccm_ooo, job); default: /* assume NULL_HASH */ job->status |= STS_COMPLETED_HMAC; return job; } } __forceinline JOB_AES_HMAC * FLUSH_JOB_HASH(MB_MGR *state, JOB_AES_HMAC *job) { switch (job->hash_alg) { case SHA1: #ifdef HASH_USE_SHAEXT if (HASH_USE_SHAEXT == SHA_EXT_PRESENT) return FLUSH_JOB_HMAC_NI(&state->hmac_sha_1_ooo); #endif return FLUSH_JOB_HMAC(&state->hmac_sha_1_ooo); case SHA_224: #ifdef HASH_USE_SHAEXT if (HASH_USE_SHAEXT == SHA_EXT_PRESENT) return FLUSH_JOB_HMAC_SHA_224_NI (&state->hmac_sha_224_ooo); #endif return FLUSH_JOB_HMAC_SHA_224(&state->hmac_sha_224_ooo); case SHA_256: #ifdef HASH_USE_SHAEXT if (HASH_USE_SHAEXT == SHA_EXT_PRESENT) return FLUSH_JOB_HMAC_SHA_256_NI (&state->hmac_sha_256_ooo); #endif return 
FLUSH_JOB_HMAC_SHA_256(&state->hmac_sha_256_ooo); case SHA_384: return FLUSH_JOB_HMAC_SHA_384(&state->hmac_sha_384_ooo); case SHA_512: return FLUSH_JOB_HMAC_SHA_512(&state->hmac_sha_512_ooo); case AES_XCBC: return FLUSH_JOB_AES_XCBC(&state->aes_xcbc_ooo); case MD5: return FLUSH_JOB_HMAC_MD5(&state->hmac_md5_ooo); case CUSTOM_HASH: return FLUSH_JOB_CUSTOM_HASH(job); case AES_CCM: return FLUSH_JOB_AES_CCM_AUTH(&state->aes_ccm_ooo); default: /* assume NULL_HASH */ if (!(job->status & STS_COMPLETED_HMAC)) { job->status |= STS_COMPLETED_HMAC; return job; } /* if HMAC is complete then return NULL */ return NULL; } } /* ========================================================================= */ /* Job submit & flush functions */ /* ========================================================================= */ #ifdef DEBUG #ifdef _WIN32 #define INVALID_PRN(_fmt, ...) \ fprintf(stderr, "%s():%d: " _fmt, __FUNCTION__, __LINE__, __VA_ARGS__) #else #define INVALID_PRN(_fmt, ...) \ fprintf(stderr, "%s():%d: " _fmt, __func__, __LINE__, __VA_ARGS__) #endif #else #define INVALID_PRN(_fmt, ...) #endif __forceinline int is_job_invalid(const JOB_AES_HMAC *job) { const UINT64 auth_tag_len_max[] = { 0, /* INVALID selection */ 12, /* SHA1 */ 14, /* SHA_224 */ 16, /* SHA_256 */ 24, /* SHA_384 */ 32, /* SHA_512 */ 12, /* AES_XCBC */ 12, /* MD5 */ 0, /* NULL_HASH */ 16, /* AES_GMAC */ 0, /* CUSTOM HASH */ 0, /* AES_CCM */ }; switch (job->cipher_mode) { case CBC: if (job->aes_key_len_in_bytes != UINT64_C(16) && job->aes_key_len_in_bytes != UINT64_C(24) && job->aes_key_len_in_bytes != UINT64_C(32)) { INVALID_PRN("cipher_mode:%d\n", job->cipher_mode); return 1; } if (job->msg_len_to_cipher_in_bytes == 0) { INVALID_PRN("cipher_mode:%d\n", job->cipher_mode); return 1; } if (job->msg_len_to_cipher_in_bytes & UINT64_C(15)) { INVALID_PRN("cipher_mode:%d\n", job->cipher_mode); return 1; } if (job->iv_len_in_bytes != UINT64_C(16)) { INVALID_PRN("cipher_mode:%d\n", job->cipher_mode); return 1; } break; case CNTR: if (job->aes_key_len_in_bytes != UINT64_C(16) && job->aes_key_len_in_bytes != UINT64_C(24) && job->aes_key_len_in_bytes != UINT64_C(32)) { INVALID_PRN("cipher_mode:%d\n", job->cipher_mode); return 1; } if (job->iv_len_in_bytes != UINT64_C(16) && job->iv_len_in_bytes != UINT64_C(12)) { INVALID_PRN("cipher_mode:%d\n", job->cipher_mode); return 1; } if (job->msg_len_to_cipher_in_bytes == 0) { INVALID_PRN("cipher_mode:%d\n", job->cipher_mode); return 1; } break; case NULL_CIPHER: /* NULL_CIPHER only allowed in HASH_CIPHER */ if (job->chain_order != HASH_CIPHER) return 1; /* XXX: not copy src to dst */ break; case DOCSIS_SEC_BPI: if (job->aes_key_len_in_bytes != UINT64_C(16)) { INVALID_PRN("cipher_mode:%d\n", job->cipher_mode); return 1; } if (job->iv_len_in_bytes != UINT64_C(16)) { INVALID_PRN("cipher_mode:%d\n", job->cipher_mode); return 1; } if (job->msg_len_to_cipher_in_bytes == 0) { INVALID_PRN("cipher_mode:%d\n", job->cipher_mode); return 1; } break; #ifndef NO_GCM case GCM: if (job->aes_key_len_in_bytes != UINT64_C(16) && job->aes_key_len_in_bytes != UINT64_C(24) && job->aes_key_len_in_bytes != UINT64_C(32)) { INVALID_PRN("cipher_mode:%d\n", job->cipher_mode); return 1; } if (job->iv_len_in_bytes != UINT64_C(12)) { INVALID_PRN("cipher_mode:%d\n", job->cipher_mode); return 1; } if (job->hash_alg != AES_GMAC) { INVALID_PRN("cipher_mode:%d\n", job->cipher_mode); return 1; } if (job->msg_len_to_cipher_in_bytes == 0) { INVALID_PRN("cipher_mode:%d\n", job->cipher_mode); return 1; } break; #endif /* !NO_GCM */ case 
CUSTOM_CIPHER: /* no checks here */ if (job->cipher_func == NULL) { INVALID_PRN("cipher_mode:%d\n", job->cipher_mode); return 1; } break; case DES: if (job->aes_key_len_in_bytes != UINT64_C(8)) { INVALID_PRN("cipher_mode:%d\n", job->cipher_mode); return 1; } if (job->msg_len_to_cipher_in_bytes == 0) { INVALID_PRN("cipher_mode:%d\n", job->cipher_mode); return 1; } if (job->msg_len_to_cipher_in_bytes & UINT64_C(7)) { INVALID_PRN("cipher_mode:%d\n", job->cipher_mode); return 1; } if (job->iv_len_in_bytes != UINT64_C(8)) { INVALID_PRN("cipher_mode:%d\n", job->cipher_mode); return 1; } break; case DOCSIS_DES: if (job->aes_key_len_in_bytes != UINT64_C(8)) { INVALID_PRN("cipher_mode:%d\n", job->cipher_mode); return 1; } if (job->msg_len_to_cipher_in_bytes == 0) { INVALID_PRN("cipher_mode:%d\n", job->cipher_mode); return 1; } if (job->iv_len_in_bytes != UINT64_C(8)) { INVALID_PRN("cipher_mode:%d\n", job->cipher_mode); return 1; } break; case CCM: /* currently only AES-CCM-128 is only supported */ if (job->aes_key_len_in_bytes != UINT64_C(16)) { INVALID_PRN("cipher_mode:%d\n", job->cipher_mode); return 1; } /* * From RFC3610: * Nonce length = 15 - L * Valid L values are: 2 to 8 * Then valid nonce lengths 13 to 7 (inclusive). */ if (job->iv_len_in_bytes > UINT64_C(13) || job->iv_len_in_bytes < UINT64_C(7)) { INVALID_PRN("cipher_mode:%d\n", job->cipher_mode); return 1; } if (job->msg_len_to_cipher_in_bytes == 0) { INVALID_PRN("cipher_mode:%d\n", job->cipher_mode); return 1; } if (job->hash_alg != AES_CCM) { INVALID_PRN("cipher_mode:%d\n", job->cipher_mode); return 1; } break; default: INVALID_PRN("cipher_mode:%d\n", job->cipher_mode); return 1; } switch (job->hash_alg) { case SHA1: case AES_XCBC: case MD5: case SHA_224: case SHA_256: case SHA_384: case SHA_512: if (job->auth_tag_output_len_in_bytes != auth_tag_len_max[job->hash_alg]) { INVALID_PRN("hash_alg:%d\n", job->hash_alg); return 1; } if (job->msg_len_to_hash_in_bytes == 0) { INVALID_PRN("hash_alg:%d\n", job->hash_alg); return 1; } break; case NULL_HASH: break; #ifndef NO_GCM case AES_GMAC: if (job->auth_tag_output_len_in_bytes != UINT64_C(8) && job->auth_tag_output_len_in_bytes != UINT64_C(12) && job->auth_tag_output_len_in_bytes != UINT64_C(16)) { INVALID_PRN("hash_alg:%d\n", job->hash_alg); return 1; } if (job->cipher_mode != GCM) { INVALID_PRN("hash_alg:%d\n", job->hash_alg); return 1; } /* * msg_len_to_hash_in_bytes not checked against zero. * It is not used for AES-GCM & GMAC - see * SUBMIT_JOB_AES_GCM_ENC and SUBMIT_JOB_AES_GCM_DEC functions. */ break; #endif /* !NO_GCM */ case CUSTOM_HASH: if (job->hash_func == NULL) { INVALID_PRN("hash_alg:%d\n", job->hash_alg); return 1; } break; case AES_CCM: if (job->u.CCM.aad_len_in_bytes > 46) { /* 3 x AES_BLOCK - 2 bytes for AAD len */ INVALID_PRN("hash_alg:%d\n", job->hash_alg); return 1; } if ((job->u.CCM.aad_len_in_bytes > 0) && (job->u.CCM.aad == NULL)) { INVALID_PRN("hash_alg:%d\n", job->hash_alg); return 1; } /* M can be any even number from 4 to 16 */ if (job->auth_tag_output_len_in_bytes < UINT64_C(4) || job->auth_tag_output_len_in_bytes > UINT64_C(16) || ((job->auth_tag_output_len_in_bytes & 1) != 0)) { INVALID_PRN("hash_alg:%d\n", job->hash_alg); return 1; } if (job->cipher_mode != CCM) { INVALID_PRN("hash_alg:%d\n", job->hash_alg); return 1; } /* * AES-CCM allows for only one message for * cipher and uthentication. * AAD can be used to extend authentication over * clear text fields. 
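         * In practice the cipher and hash regions must be identical for
         * CCM jobs: same length and same start offset, as checked below.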
*/ if (job->msg_len_to_cipher_in_bytes != job->msg_len_to_hash_in_bytes) { INVALID_PRN("hash_alg:%d\n", job->hash_alg); return 1; } if (job->cipher_start_src_offset_in_bytes != job->hash_start_src_offset_in_bytes) { INVALID_PRN("hash_alg:%d\n", job->hash_alg); return 1; } break; default: INVALID_PRN("hash_alg:%d\n", job->hash_alg); return 1; } switch (job->chain_order) { case CIPHER_HASH: if (job->cipher_direction != ENCRYPT) { INVALID_PRN("chain_order:%d\n", job->chain_order); return 1; } break; case HASH_CIPHER: if (job->cipher_mode != NULL_CIPHER) { if (job->cipher_direction != DECRYPT) { INVALID_PRN("chain_order:%d\n", job->chain_order); return 1; } } break; default: INVALID_PRN("chain_order:%d\n", job->chain_order); return 1; } return 0; } __forceinline JOB_AES_HMAC *submit_new_job(MB_MGR *state, JOB_AES_HMAC *job) { if (job->chain_order == CIPHER_HASH) { /* assume job->cipher_direction == ENCRYPT */ job = SUBMIT_JOB_AES_ENC(state, job); if (job) { job = SUBMIT_JOB_HASH(state, job); if (job && (job->chain_order == HASH_CIPHER)) SUBMIT_JOB_AES_DEC(state, job); } /* end if job */ } else { /* job->chain_order == HASH_CIPHER */ /* assume job->cipher_direction == DECRYPT */ job = SUBMIT_JOB_HASH(state, job); if (job && (job->chain_order == HASH_CIPHER)) SUBMIT_JOB_AES_DEC(state, job); } return job; } __forceinline void complete_job(MB_MGR *state, JOB_AES_HMAC *job) { JOB_AES_HMAC *tmp = NULL; while (job->status < STS_COMPLETED) { if (job->chain_order == CIPHER_HASH) { /* assume job->cipher_direction == ENCRYPT */ tmp = FLUSH_JOB_AES_ENC(state, job); if (tmp) tmp = SUBMIT_JOB_HASH(state, tmp); else tmp = FLUSH_JOB_HASH(state, job); if (tmp && (tmp->chain_order == HASH_CIPHER)) SUBMIT_JOB_AES_DEC(state, tmp); } else { /* job->chain_order == HASH_CIPHER */ /* assume job->cipher_direction == DECRYPT */ tmp = FLUSH_JOB_HASH(state, job); if (tmp == NULL) tmp = FLUSH_JOB_AES_DEC(state, job); else if (tmp->chain_order == HASH_CIPHER) SUBMIT_JOB_AES_DEC(state, tmp); } } } __forceinline JOB_AES_HMAC * submit_job_and_check(MB_MGR *state, const int run_check) { JOB_AES_HMAC *job = NULL; #ifndef LINUX DECLARE_ALIGNED(UINT128 xmm_save[10], 16); SAVE_XMMS(xmm_save); #endif job = JOBS(state, state->next_job); if (run_check) { if (is_job_invalid(job)) { job->status = STS_INVALID_ARGS; } else { job->status = STS_BEING_PROCESSED; job = submit_new_job(state, job); } } else { job->status = STS_BEING_PROCESSED; job = submit_new_job(state, job); } if (state->earliest_job < 0) { /* state was previously empty */ state->earliest_job = state->next_job; ADV_JOBS(&state->next_job); #ifndef LINUX RESTORE_XMMS(xmm_save); #endif return NULL; /* if we were empty, nothing to return */ } ADV_JOBS(&state->next_job); if (state->earliest_job == state->next_job) { /* Full */ job = JOBS(state, state->earliest_job); complete_job(state, job); ADV_JOBS(&state->earliest_job); #ifndef LINUX RESTORE_XMMS(xmm_save); #endif return job; } /* not full */ #ifndef LINUX RESTORE_XMMS(xmm_save); #endif job = JOBS(state, state->earliest_job); if (job->status < STS_COMPLETED) return NULL; ADV_JOBS(&state->earliest_job); return job; } IMB_DLL_EXPORT JOB_AES_HMAC * SUBMIT_JOB(MB_MGR *state) { return submit_job_and_check(state, 1); } IMB_DLL_EXPORT JOB_AES_HMAC * SUBMIT_JOB_NOCHECK(MB_MGR *state) { return submit_job_and_check(state, 0); } IMB_DLL_EXPORT JOB_AES_HMAC * FLUSH_JOB(MB_MGR *state) { JOB_AES_HMAC *job; #ifndef LINUX DECLARE_ALIGNED(UINT128 xmm_save[10], 16); #endif if (state->earliest_job < 0) return NULL; /* empty */ #ifndef LINUX 
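/*
 * Editor's note -- descriptive comment, not part of the original source.
 *
 * SAVE_XMMS()/RESTORE_XMMS() are compiled in only when LINUX is not
 * defined: the Windows x64 calling convention treats XMM6-XMM15 as
 * callee-saved, and the assembly kernels invoked by this manager clobber
 * them, so the ten registers are preserved around the call. On System V
 * (Linux) all XMM registers are caller-saved and no save is needed.
 *
 * A minimal driver loop for the submit/flush machinery above, sketched
 * under the assumption that the SSE entry points of this library
 * (init_mb_mgr_sse(), get_next_job_sse(), submit_job_sse(),
 * flush_job_sse()) are used; more_work(), fill_job() and consume() are
 * hypothetical caller-supplied helpers:
 *
 *     MB_MGR mgr;
 *     JOB_AES_HMAC *job;
 *
 *     init_mb_mgr_sse(&mgr);
 *     while (more_work()) {
 *             job = get_next_job_sse(&mgr);
 *             fill_job(job);                       // set keys, buffers, modes
 *             job = submit_job_sse(&mgr);          // may return a finished job
 *             if (job != NULL)
 *                     consume(job);
 *     }
 *     while ((job = flush_job_sse(&mgr)) != NULL)  // drain remaining lanes
 *             consume(job);
 */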
SAVE_XMMS(xmm_save); #endif job = JOBS(state, state->earliest_job); complete_job(state, job); ADV_JOBS(&state->earliest_job); if (state->earliest_job == state->next_job) state->earliest_job = -1; /* becomes empty */ #ifndef LINUX RESTORE_XMMS(xmm_save); #endif return job; } /* ========================================================================= */ /* ========================================================================= */ IMB_DLL_EXPORT UINT32 QUEUE_SIZE(MB_MGR *state) { int a, b; if (state->earliest_job < 0) return 0; a = state->next_job / sizeof(JOB_AES_HMAC); b = state->earliest_job / sizeof(JOB_AES_HMAC); return ((a-b) & (MAX_JOBS-1)); } intel-ipsec-mb-0.48/mb_mgr_datastruct.asm000066400000000000000000000266071321406316400204730ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
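;;
;; Editor's note -- descriptive comment, not part of the original source.
;; This file lays out, on the assembly side, the out-of-order (OOO)
;; scheduler structures that the job manager manipulates from C. The
;; START_FIELDS/FIELD/END_FIELDS macros (pulled in from datastruct.asm
;; below) accumulate byte offsets, so every _xxx symbol defined here is
;; an offset into one of those structures and must stay in sync with the
;; corresponding C-side definitions used by the submit/flush code above
;; (the C structure names are indicative, taken from the library headers).
;;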
;; %include "datastruct.asm" %include "constants.asm" ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Define constants ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %define MAX_AES_JOBS 128 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Define AES_ARGS_X8 and AES Out of Order Data Structures ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; START_FIELDS ; AES_ARGS_X8 ;; name size align FIELD _aesarg_in, 8*8, 8 ; array of 8 pointers to in text FIELD _aesarg_out, 8*8, 8 ; array of 8 pointers to out text FIELD _aesarg_keys, 8*8, 8 ; array of 8 pointers to keys FIELD _aesarg_IV, 16*8, 32 ; array of 8 128-bit IV's END_FIELDS %assign _AES_ARGS_X8_size _FIELD_OFFSET %assign _AES_ARGS_X8_align _STRUCT_ALIGN ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; START_FIELDS ; MB_MGR_AES_OOO ;; name size align FIELD _aes_args, _AES_ARGS_X8_size, _AES_ARGS_X8_align FIELD _aes_lens, 16, 16 FIELD _aes_unused_lanes, 8, 8 FIELD _aes_job_in_lane, 8*8, 8 END_FIELDS %assign _MB_MGR_AES_OOO_size _FIELD_OFFSET %assign _MB_MGR_AES_OOO_align _STRUCT_ALIGN _aes_args_in equ _aes_args + _aesarg_in _aes_args_out equ _aes_args + _aesarg_out _aes_args_keys equ _aes_args + _aesarg_keys _aes_args_IV equ _aes_args + _aesarg_IV ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Define XCBC Out of Order Data Structures ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; START_FIELDS ; AES_XCBC_ARGS_X8 ;; name size align FIELD _aesxcbcarg_in, 8*8, 8 ; array of 8 pointers to in text FIELD _aesxcbcarg_keys, 8*8, 8 ; array of 8 pointers to keys FIELD _aesxcbcarg_ICV, 16*8, 32 ; array of 8 128-bit ICV's END_FIELDS %assign _AES_XCBC_ARGS_X8_size _FIELD_OFFSET %assign _AES_XCBC_ARGS_X8_align _STRUCT_ALIGN START_FIELDS ; XCBC_LANE_DATA ;;; name size align FIELD _xcbc_final_block, 2*16, 32 ; final block with padding FIELD _xcbc_job_in_lane, 8, 8 ; pointer to job object FIELD _xcbc_final_done, 8, 8 ; offset to start of data END_FIELDS %assign _XCBC_LANE_DATA_size _FIELD_OFFSET %assign _XCBC_LANE_DATA_align _STRUCT_ALIGN ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; START_FIELDS ; MB_MGR_AES_XCBC_OOO ;; name size align FIELD _aes_xcbc_args, _AES_XCBC_ARGS_X8_size, _AES_XCBC_ARGS_X8_align FIELD _aes_xcbc_lens, 16, 16 FIELD _aes_xcbc_unused_lanes, 8, 8 FIELD _aes_xcbc_ldata, _XCBC_LANE_DATA_size*8, _XCBC_LANE_DATA_align END_FIELDS %assign _MB_MGR_AES_XCBC_OOO_size _FIELD_OFFSET %assign _MB_MGR_AES_XCBC_OOO_align _STRUCT_ALIGN _aes_xcbc_args_in equ _aes_xcbc_args + _aesxcbcarg_in _aes_xcbc_args_keys equ _aes_xcbc_args + _aesxcbcarg_keys _aes_xcbc_args_ICV equ _aes_xcbc_args + _aesxcbcarg_ICV ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Define DES Out of Order Data Structures ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; START_FIELDS ; DES_ARGS_X16 ;; name size align FIELD _desarg_in, 16*8, 8 ; array of 16 pointers to in text FIELD _desarg_out, 16*8, 8 ; array of 16 pointers to out text FIELD _desarg_keys, 16*8, 8 ; array of 16 pointers to keys FIELD _desarg_IV, 16*8, 32 ; array of 16 64-bit IV's FIELD _desarg_plen, 16*4, 32 ; array of 16 32-bit partial lens FIELD _desarg_blen, 16*4, 32 ; array of 16 32-bit block lens FIELD _desarg_lin, 16*8, 8 ; array of 16 pointers to last (block) in text FIELD _desarg_lout, 16*8, 8 ; array of 16 pointers to last (block) out text END_FIELDS 
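;;
;; Editor's note -- illustrative comment, not part of the original source.
;; Assuming the FIELD macro rounds each offset up to the requested
;; alignment (which is what datastruct.asm provides), the AES_ARGS_X8
;; structure defined earlier resolves to:
;;     _aesarg_in    offset   0   (8 pointers,  64 bytes)
;;     _aesarg_out   offset  64   (8 pointers,  64 bytes)
;;     _aesarg_keys  offset 128   (8 pointers,  64 bytes)
;;     _aesarg_IV    offset 192   (8 x 128-bit IVs, 128 bytes, 32-aligned)
;; giving _AES_ARGS_X8_size = 320 and _AES_ARGS_X8_align = 32. The same
;; accumulation scheme produces the DES_ARGS_X16 offsets assigned below.
;;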
%assign _DES_ARGS_X16_size _FIELD_OFFSET %assign _DES_ARGS_X16_align _STRUCT_ALIGN ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; START_FIELDS ; MB_MGR_DES_OOO ;; name size align FIELD _des_args, _DES_ARGS_X16_size, _DES_ARGS_X16_align FIELD _des_lens, 16*2, 16 FIELD _des_unused_lanes, 8, 8 FIELD _des_job_in_lane, 16*8, 8 FIELD _des_lanes_in_use, 8, 8 END_FIELDS %assign _MB_MGR_DES_OOO_size _FIELD_OFFSET %assign _MB_MGR_DES_OOO_align _STRUCT_ALIGN _des_args_in equ _des_args + _desarg_in _des_args_out equ _des_args + _desarg_out _des_args_keys equ _des_args + _desarg_keys _des_args_IV equ _des_args + _desarg_IV _des_args_PLen equ _des_args + _desarg_plen _des_args_BLen equ _des_args + _desarg_blen _des_args_LIn equ _des_args + _desarg_lin _des_args_LOut equ _des_args + _desarg_lout ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Define HMAC Out Of Order Data Structures ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; START_FIELDS ; HMAC_SHA1_LANE_DATA ;;; name size align FIELD _extra_block, 2*64+8, 32 ; final block with padding FIELD _job_in_lane, 8, 8 ; pointer to job object FIELD _outer_block, 64, 1 ; block containing hash FIELD _outer_done, 4, 4 ; boolean flag FIELD _extra_blocks, 4, 4 ; num extra blocks (1 or 2) FIELD _size_offset, 4, 4 ; offset in extra_block to start of size FIELD _start_offset, 4, 4 ; offset to start of data END_FIELDS %assign _HMAC_SHA1_LANE_DATA_size _FIELD_OFFSET %assign _HMAC_SHA1_LANE_DATA_align _STRUCT_ALIGN ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; START_FIELDS ; SHA512_LANE_DATA ;;; name size align FIELD _extra_block_sha512, 2* SHA512_BLK_SZ + 16, 32 ; final block with padding, alignment 16 to read in XMM chunks FIELD _outer_block_sha512, SHA512_BLK_SZ, 1 ; block containing hash FIELD _job_in_lane_sha512, 8, 8 ; pointer to job object FIELD _outer_done_sha512, 4, 4 ; boolean flag FIELD _extra_blocks_sha512, 4, 4 ; num extra blocks (1 or 2) FIELD _size_offset_sha512, 4, 4 ; offset in extra_block to start of size FIELD _start_offset_sha512, 4, 4 ; offset to start of data END_FIELDS %assign _SHA512_LANE_DATA_size _FIELD_OFFSET %assign _SHA512_LANE_DATA_align _STRUCT_ALIGN ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; START_FIELDS ; SHA1_ARGS ;;; name size align FIELD _digest, SHA1_DIGEST_SIZE, 32 ; transposed digest FIELD _data_ptr_sha1, PTR_SZ*MAX_SHA1_LANES, 8 ; array of pointers to data END_FIELDS %assign _SHA1_ARGS_size _FIELD_OFFSET %assign _SHA1_ARGS_align _STRUCT_ALIGN ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; START_FIELDS ; MB_MGR_HMAC_SHA_1_OOO ;;; name size align FIELD _args, _SHA1_ARGS_size, _SHA1_ARGS_align FIELD _lens, 32, 32 FIELD _unused_lanes, 8, 8 FIELD _ldata, _HMAC_SHA1_LANE_DATA_size*MAX_SHA1_LANES, _HMAC_SHA1_LANE_DATA_align FIELD _num_lanes_inuse_sha1, 4, 4 END_FIELDS %assign _MB_MGR_HMAC_SHA_1_OOO_size _FIELD_OFFSET %assign _MB_MGR_HMAC_SHA_1_OOO_align _STRUCT_ALIGN _args_digest equ _args + _digest _args_data_ptr equ _args + _data_ptr_sha1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; START_FIELDS ; SHA256_ARGS ;;; name size align FIELD _digest_sha256, SHA256_DIGEST_SIZE, 32 ; transposed digest FIELD _data_ptr_sha256, PTR_SZ*MAX_SHA256_LANES, 8 ; array of pointers to data END_FIELDS %assign _SHA256_ARGS_size _FIELD_OFFSET %assign _SHA256_ARGS_align _STRUCT_ALIGN ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; START_FIELDS ; 
MB_MGR_HMAC_SHA_256_OOO ;;; name size align FIELD _args_sha256, _SHA256_ARGS_size, _SHA256_ARGS_align FIELD _lens_sha256, 16*2, 16 FIELD _unused_lanes_sha256, 8, 8 FIELD _ldata_sha256, _HMAC_SHA1_LANE_DATA_size * MAX_SHA256_LANES, _HMAC_SHA1_LANE_DATA_align FIELD _num_lanes_inuse_sha256, 4, 4 END_FIELDS %assign _MB_MGR_HMAC_SHA_256_OOO_size _FIELD_OFFSET %assign _MB_MGR_HMAC_SHA_256_OOO_align _STRUCT_ALIGN _args_digest_sha256 equ _args_sha256 + _digest_sha256 _args_data_ptr_sha256 equ _args_sha256 + _data_ptr_sha256 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Define HMAC SHA512 Out Of Order Data Structures ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; START_FIELDS ; SHA512_ARGS ;;; name size align FIELD _digest_sha512, SHA512_DIGEST_SIZE, 32 ; transposed digest. 2 lanes, 8 digest words, each 8 bytes long FIELD _data_ptr_sha512, MAX_SHA512_LANES * PTR_SZ, 8 ; array of pointers to data END_FIELDS %assign _SHA512_ARGS_size _FIELD_OFFSET %assign _SHA512_ARGS_align _STRUCT_ALIGN ;; --------------------------------------- START_FIELDS ; MB_MGR_HMAC_SHA512_OOO ;;; name size align FIELD _args_sha512, _SHA512_ARGS_size, _SHA512_ARGS_align FIELD _lens_sha512, 16, 16 FIELD _unused_lanes_sha512, 8, 8 FIELD _ldata_sha512, _SHA512_LANE_DATA_size * MAX_SHA512_LANES, _SHA512_LANE_DATA_align END_FIELDS %assign _MB_MGR_HMAC_SHA_512_OOO_size _FIELD_OFFSET %assign _MB_MGR_HMAC_SHA_512_OOO_align _STRUCT_ALIGN _args_digest_sha512 equ _args_sha512 + _digest_sha512 _args_data_ptr_sha512 equ _args_sha512 + _data_ptr_sha512 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Define HMAC MD5 Out Of Order Data Structures ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; START_FIELDS ; MD5_ARGS ;;; name size align FIELD _digest_md5, MD5_DIGEST_SIZE, 32 ; transposed digest FIELD _data_ptr_md5, MAX_MD5_LANES*PTR_SZ, 8 ; array of pointers to data END_FIELDS %assign _MD5_ARGS_size _FIELD_OFFSET %assign _MD5_ARGS_align _STRUCT_ALIGN ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; START_FIELDS ; MB_MGR_HMAC_MD5_OOO ;;; name size align FIELD _args_md5, _MD5_ARGS_size, _MD5_ARGS_align FIELD _lens_md5, MAX_MD5_LANES*2, 16 FIELD _unused_lanes_md5, 8, 8 FIELD _ldata_md5, _HMAC_SHA1_LANE_DATA_size * MAX_MD5_LANES, _HMAC_SHA1_LANE_DATA_align FIELD _num_lanes_inuse_md5, 4, 8 END_FIELDS %assign _MB_MGR_HMAC_MD5_OOO_size _FIELD_OFFSET %assign _MB_MGR_HMAC_MD5_OOO_align _STRUCT_ALIGN _args_digest_md5 equ _args_md5 + _digest_md5 _args_data_ptr_md5 equ _args_md5 + _data_ptr_md5 intel-ipsec-mb-0.48/md5_one_block.c000066400000000000000000000154451321406316400171320ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2012-2017, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include #include "types.h" void md5_one_block_sse(const UINT8 *data, UINT32 digest[4]); #ifdef LINUX #define ROTATE(a, n) (((a) << (n)) ^ ((a) >> (32 - (n)))) #else #include #define ROTATE(a, n) _rotl(a, n) #endif #define H0 0x67452301 #define H1 0xefcdab89 #define H2 0x98badcfe #define H3 0x10325476 #define F1(b, c, d) ((((c) ^ (d)) & (b)) ^ (d)) #define F2(b, c, d) ((((b) ^ (c)) & (d)) ^ (c)) #define F3(b, c, d) ((b) ^ (c) ^ (d)) #define F4(b, c, d) (((~(d)) | (b)) ^ (c)) #define STEP1(a, b, c, d, k, w, r) { \ a += w + k + F1(b, c, d); \ a = ROTATE(a, r); \ a += b; \ } #define STEP2(a, b, c, d, k, w, r) { \ a += w + k + F2(b, c, d); \ a = ROTATE(a, r); \ a += b; \ } #define STEP3(a, b, c, d, k, w, r) { \ a += w + k + F3(b, c, d); \ a = ROTATE(a, r); \ a += b; \ } #define STEP4(a, b, c, d, k, w, r) { \ a += w + k + F4(b, c, d); \ a = ROTATE(a, r); \ a += b; \ } void md5_one_block_sse(const UINT8 *data, UINT32 digest[4]) { UINT32 a, b, c, d; UINT32 w00, w01, w02, w03, w04, w05, w06, w07, w08, w09, w10, w11, w12, w13, w14, w15; const UINT32 *data32 = (const UINT32 *)data; a = H0; b = H1; c = H2; d = H3; w00 = data32[0]; w01 = data32[1]; STEP1(a, b, c, d, 0xd76aa478, w00, 7); w02 = data32[2]; STEP1(d, a, b, c, 0xe8c7b756, w01, 12); w03 = data32[3]; STEP1(c, d, a, b, 0x242070db, w02, 17); w04 = data32[4]; STEP1(b, c, d, a, 0xc1bdceee, w03, 22); w05 = data32[5]; STEP1(a, b, c, d, 0xf57c0faf, w04, 7); w06 = data32[6]; STEP1(d, a, b, c, 0x4787c62a, w05, 12); w07 = data32[7]; STEP1(c, d, a, b, 0xa8304613, w06, 17); w08 = data32[8]; STEP1(b, c, d, a, 0xfd469501, w07, 22); w09 = data32[9]; STEP1(a, b, c, d, 0x698098d8, w08, 7); w10 = data32[10]; STEP1(d, a, b, c, 0x8b44f7af, w09, 12); w11 = data32[11]; STEP1(c, d, a, b, 0xffff5bb1, w10, 17); w12 = data32[12]; STEP1(b, c, d, a, 0x895cd7be, w11, 22); w13 = data32[13]; STEP1(a, b, c, d, 0x6b901122, w12, 7); w14 = data32[14]; STEP1(d, a, b, c, 0xfd987193, w13, 12); w15 = data32[15]; STEP1(c, d, a, b, 0xa679438e, w14, 17); STEP1(b, c, d, a, 0x49b40821, w15, 22); STEP2(a, b, c, d, 0xf61e2562, w01, 5); STEP2(d, a, b, c, 0xc040b340, w06, 9); STEP2(c, d, a, b, 0x265e5a51, w11, 14); STEP2(b, c, d, a, 0xe9b6c7aa, w00, 20); STEP2(a, b, c, d, 0xd62f105d, w05, 5); STEP2(d, a, b, c, 0x02441453, w10, 9); STEP2(c, d, a, b, 0xd8a1e681, w15, 14); STEP2(b, c, d, a, 0xe7d3fbc8, w04, 20); STEP2(a, b, c, d, 0x21e1cde6, w09, 5); STEP2(d, a, b, c, 0xc33707d6, w14, 9); STEP2(c, d, a, b, 0xf4d50d87, w03, 14); STEP2(b, c, d, a, 0x455a14ed, w08, 20); STEP2(a, b, c, d, 0xa9e3e905, w13, 5); STEP2(d, a, b, c, 0xfcefa3f8, w02, 9); STEP2(c, d, a, b, 0x676f02d9, w07, 14); STEP2(b, c, d, a, 0x8d2a4c8a, w12, 20); STEP3(a, b, c, d, 0xfffa3942, w05, 4); STEP3(d, a, 
b, c, 0x8771f681, w08, 11); STEP3(c, d, a, b, 0x6d9d6122, w11, 16); STEP3(b, c, d, a, 0xfde5380c, w14, 23); STEP3(a, b, c, d, 0xa4beea44, w01, 4); STEP3(d, a, b, c, 0x4bdecfa9, w04, 11); STEP3(c, d, a, b, 0xf6bb4b60, w07, 16); STEP3(b, c, d, a, 0xbebfbc70, w10, 23); STEP3(a, b, c, d, 0x289b7ec6, w13, 4); STEP3(d, a, b, c, 0xeaa127fa, w00, 11); STEP3(c, d, a, b, 0xd4ef3085, w03, 16); STEP3(b, c, d, a, 0x04881d05, w06, 23); STEP3(a, b, c, d, 0xd9d4d039, w09, 4); STEP3(d, a, b, c, 0xe6db99e5, w12, 11); STEP3(c, d, a, b, 0x1fa27cf8, w15, 16); STEP3(b, c, d, a, 0xc4ac5665, w02, 23); STEP4(a, b, c, d, 0xf4292244, w00, 6); STEP4(d, a, b, c, 0x432aff97, w07, 10); STEP4(c, d, a, b, 0xab9423a7, w14, 15); STEP4(b, c, d, a, 0xfc93a039, w05, 21); STEP4(a, b, c, d, 0x655b59c3, w12, 6); STEP4(d, a, b, c, 0x8f0ccc92, w03, 10); STEP4(c, d, a, b, 0xffeff47d, w10, 15); STEP4(b, c, d, a, 0x85845dd1, w01, 21); STEP4(a, b, c, d, 0x6fa87e4f, w08, 6); STEP4(d, a, b, c, 0xfe2ce6e0, w15, 10); STEP4(c, d, a, b, 0xa3014314, w06, 15); STEP4(b, c, d, a, 0x4e0811a1, w13, 21); STEP4(a, b, c, d, 0xf7537e82, w04, 6); STEP4(d, a, b, c, 0xbd3af235, w11, 10); STEP4(c, d, a, b, 0x2ad7d2bb, w02, 15); STEP4(b, c, d, a, 0xeb86d391, w09, 21); digest[0] = a + H0; digest[1] = b + H1; digest[2] = c + H2; digest[3] = d + H3; } intel-ipsec-mb-0.48/sse/000077500000000000000000000000001321406316400150475ustar00rootroot00000000000000intel-ipsec-mb-0.48/sse/aes128_cbc_dec_by4_sse.asm000066400000000000000000000243731321406316400216370ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ; routine to do AES cbc decrypt on 16n bytes doing AES by 4 ; XMM registers are clobbered. 
Saving/restoring must be done at a higher level ; void aes_cbc_dec_128_sse(void *in, ; UINT128 *IV, ; UINT128 keys[11], ; void *out, ; UINT64 len_bytes); ; ; arg 1: IN: pointer to input (cipher text) ; arg 2: IV: pointer to IV ; arg 3: KEYS: pointer to keys ; arg 4: OUT: pointer to output (plain text) ; arg 5: LEN: length in bytes (multiple of 16) ; %include "os.asm" %define MOVDQ movdqu %ifdef LINUX %define IN rdi %define IV rsi %define KEYS rdx %define OUT rcx %define LEN r8 %else %define IN rcx %define IV rdx %define KEYS r8 %define OUT r9 %define LEN r10 %endif %define IDX rax %define TMP IDX %define XDATA0 xmm0 %define XDATA1 xmm1 %define XDATA2 xmm2 %define XDATA3 xmm3 %define XKEY0 xmm4 %define XKEY2 xmm5 %define XKEY4 xmm6 %define XKEY6 xmm7 %define XKEY8 xmm8 %define XKEY10 xmm9 %define XIV xmm10 %define XSAVED0 xmm11 %define XSAVED1 xmm12 %define XSAVED2 xmm13 %define XSAVED3 xmm14 %define XKEY xmm15 %define IV_TMP XSAVED3 section .text MKGLOBAL(aes_cbc_dec_128_sse,function,internal) aes_cbc_dec_128_sse: %ifndef LINUX mov LEN, [rsp + 8*5] %endif mov TMP, LEN and TMP, 3*16 jz initial_4 cmp TMP, 2*16 jb initial_1 ja initial_3 initial_2: ; load cipher text movdqu XDATA0, [IN + 0*16] movdqu XDATA1, [IN + 1*16] movdqa XKEY0, [KEYS + 0*16] ; save cipher text movdqa XSAVED0, XDATA0 movdqa XIV, XDATA1 pxor XDATA0, XKEY0 ; 0. ARK pxor XDATA1, XKEY0 movdqa XKEY2, [KEYS + 2*16] aesdec XDATA0, [KEYS + 1*16] ; 1. DEC aesdec XDATA1, [KEYS + 1*16] mov IDX, 2*16 aesdec XDATA0, XKEY2 ; 2. DEC aesdec XDATA1, XKEY2 movdqa XKEY4, [KEYS + 4*16] aesdec XDATA0, [KEYS + 3*16] ; 3. DEC aesdec XDATA1, [KEYS + 3*16] movdqu IV_TMP, [IV] aesdec XDATA0, XKEY4 ; 4. DEC aesdec XDATA1, XKEY4 movdqa XKEY6, [KEYS + 6*16] aesdec XDATA0, [KEYS + 5*16] ; 5. DEC aesdec XDATA1, [KEYS + 5*16] aesdec XDATA0, XKEY6 ; 6. DEC aesdec XDATA1, XKEY6 movdqa XKEY8, [KEYS + 8*16] aesdec XDATA0, [KEYS + 7*16] ; 7. DEC aesdec XDATA1, [KEYS + 7*16] aesdec XDATA0, XKEY8 ; 8. DEC aesdec XDATA1, XKEY8 movdqa XKEY10, [KEYS + 10*16] aesdec XDATA0, [KEYS + 9*16] ; 9. DEC aesdec XDATA1, [KEYS + 9*16] aesdeclast XDATA0, XKEY10 ; 10. DEC aesdeclast XDATA1, XKEY10 pxor XDATA0, IV_TMP pxor XDATA1, XSAVED0 movdqu [OUT + 0*16], XDATA0 movdqu [OUT + 1*16], XDATA1 cmp LEN, 2*16 je done jmp main_loop align 16 initial_1: ; load cipher text movdqu XDATA0, [IN + 0*16] movdqa XKEY0, [KEYS + 0*16] ; save cipher text movdqa XIV, XDATA0 pxor XDATA0, XKEY0 ; 0. ARK movdqa XKEY2, [KEYS + 2*16] aesdec XDATA0, [KEYS + 1*16] ; 1. DEC mov IDX, 1*16 aesdec XDATA0, XKEY2 ; 2. DEC movdqa XKEY4, [KEYS + 4*16] aesdec XDATA0, [KEYS + 3*16] ; 3. DEC movdqu IV_TMP, [IV] aesdec XDATA0, XKEY4 ; 4. DEC movdqa XKEY6, [KEYS + 6*16] aesdec XDATA0, [KEYS + 5*16] ; 5. DEC aesdec XDATA0, XKEY6 ; 6. DEC movdqa XKEY8, [KEYS + 8*16] aesdec XDATA0, [KEYS + 7*16] ; 7. DEC aesdec XDATA0, XKEY8 ; 8. DEC movdqa XKEY10, [KEYS + 10*16] aesdec XDATA0, [KEYS + 9*16] ; 9. DEC aesdeclast XDATA0, XKEY10 ; 10. DEC pxor XDATA0, IV_TMP movdqu [OUT + 0*16], XDATA0 cmp LEN, 1*16 je done jmp main_loop initial_3: ; load cipher text movdqu XDATA0, [IN + 0*16] movdqu XDATA1, [IN + 1*16] movdqu XDATA2, [IN + 2*16] movdqa XKEY0, [KEYS + 0*16] ; save cipher text movdqa XSAVED0, XDATA0 movdqa XSAVED1, XDATA1 movdqa XIV, XDATA2 movdqa XKEY, [KEYS + 1*16] pxor XDATA0, XKEY0 ; 0. ARK pxor XDATA1, XKEY0 pxor XDATA2, XKEY0 movdqa XKEY2, [KEYS + 2*16] aesdec XDATA0, XKEY ; 1. DEC aesdec XDATA1, XKEY aesdec XDATA2, XKEY movdqa XKEY, [KEYS + 3*16] mov IDX, 3*16 aesdec XDATA0, XKEY2 ; 2. 
DEC aesdec XDATA1, XKEY2 aesdec XDATA2, XKEY2 movdqa XKEY4, [KEYS + 4*16] aesdec XDATA0, XKEY ; 3. DEC aesdec XDATA1, XKEY aesdec XDATA2, XKEY movdqa XKEY, [KEYS + 5*16] movdqu IV_TMP, [IV] aesdec XDATA0, XKEY4 ; 4. DEC aesdec XDATA1, XKEY4 aesdec XDATA2, XKEY4 movdqa XKEY6, [KEYS + 6*16] aesdec XDATA0, XKEY ; 5. DEC aesdec XDATA1, XKEY aesdec XDATA2, XKEY movdqa XKEY, [KEYS + 7*16] aesdec XDATA0, XKEY6 ; 6. DEC aesdec XDATA1, XKEY6 aesdec XDATA2, XKEY6 movdqa XKEY8, [KEYS + 8*16] aesdec XDATA0, XKEY ; 7. DEC aesdec XDATA1, XKEY aesdec XDATA2, XKEY movdqa XKEY, [KEYS + 9*16] aesdec XDATA0, XKEY8 ; 8. DEC aesdec XDATA1, XKEY8 aesdec XDATA2, XKEY8 movdqa XKEY10, [KEYS + 10*16] aesdec XDATA0, XKEY ; 9. DEC aesdec XDATA1, XKEY aesdec XDATA2, XKEY aesdeclast XDATA0, XKEY10 ; 10. DEC aesdeclast XDATA1, XKEY10 aesdeclast XDATA2, XKEY10 pxor XDATA0, IV_TMP pxor XDATA1, XSAVED0 pxor XDATA2, XSAVED1 movdqu [OUT + 0*16], XDATA0 movdqu [OUT + 1*16], XDATA1 movdqu [OUT + 2*16], XDATA2 cmp LEN, 3*16 je done jmp main_loop align 16 initial_4: ; load cipher text movdqu XDATA0, [IN + 0*16] movdqu XDATA1, [IN + 1*16] movdqu XDATA2, [IN + 2*16] movdqu XDATA3, [IN + 3*16] movdqa XKEY0, [KEYS + 0*16] ; save cipher text movdqa XSAVED0, XDATA0 movdqa XSAVED1, XDATA1 movdqa XSAVED2, XDATA2 movdqa XIV, XDATA3 movdqa XKEY, [KEYS + 1*16] pxor XDATA0, XKEY0 ; 0. ARK pxor XDATA1, XKEY0 pxor XDATA2, XKEY0 pxor XDATA3, XKEY0 movdqa XKEY2, [KEYS + 2*16] aesdec XDATA0, XKEY ; 1. DEC aesdec XDATA1, XKEY aesdec XDATA2, XKEY aesdec XDATA3, XKEY movdqa XKEY, [KEYS + 3*16] mov IDX, 4*16 aesdec XDATA0, XKEY2 ; 2. DEC aesdec XDATA1, XKEY2 aesdec XDATA2, XKEY2 aesdec XDATA3, XKEY2 movdqa XKEY4, [KEYS + 4*16] aesdec XDATA0, XKEY ; 3. DEC aesdec XDATA1, XKEY aesdec XDATA2, XKEY aesdec XDATA3, XKEY movdqa XKEY, [KEYS + 5*16] movdqu IV_TMP, [IV] aesdec XDATA0, XKEY4 ; 4. DEC aesdec XDATA1, XKEY4 aesdec XDATA2, XKEY4 aesdec XDATA3, XKEY4 movdqa XKEY6, [KEYS + 6*16] aesdec XDATA0, XKEY ; 5. DEC aesdec XDATA1, XKEY aesdec XDATA2, XKEY aesdec XDATA3, XKEY movdqa XKEY, [KEYS + 7*16] aesdec XDATA0, XKEY6 ; 6. DEC aesdec XDATA1, XKEY6 aesdec XDATA2, XKEY6 aesdec XDATA3, XKEY6 movdqa XKEY8, [KEYS + 8*16] aesdec XDATA0, XKEY ; 7. DEC aesdec XDATA1, XKEY aesdec XDATA2, XKEY aesdec XDATA3, XKEY movdqa XKEY, [KEYS + 9*16] aesdec XDATA0, XKEY8 ; 8. DEC aesdec XDATA1, XKEY8 aesdec XDATA2, XKEY8 aesdec XDATA3, XKEY8 movdqa XKEY10, [KEYS + 10*16] aesdec XDATA0, XKEY ; 9. DEC aesdec XDATA1, XKEY aesdec XDATA2, XKEY aesdec XDATA3, XKEY aesdeclast XDATA0, XKEY10 ; 10. DEC aesdeclast XDATA1, XKEY10 aesdeclast XDATA2, XKEY10 aesdeclast XDATA3, XKEY10 pxor XDATA0, IV_TMP pxor XDATA1, XSAVED0 pxor XDATA2, XSAVED1 pxor XDATA3, XSAVED2 movdqu [OUT + 0*16], XDATA0 movdqu [OUT + 1*16], XDATA1 movdqu [OUT + 2*16], XDATA2 movdqu [OUT + 3*16], XDATA3 cmp LEN, 4*16 jz done jmp main_loop align 16 main_loop: ; load cipher text movdqu XDATA0, [IN + IDX + 0*16] movdqu XDATA1, [IN + IDX + 1*16] movdqu XDATA2, [IN + IDX + 2*16] movdqu XDATA3, [IN + IDX + 3*16] ; save cipher text movdqa XSAVED0, XDATA0 movdqa XSAVED1, XDATA1 movdqa XSAVED2, XDATA2 movdqa XSAVED3, XDATA3 movdqa XKEY, [KEYS + 1*16] pxor XDATA0, XKEY0 ; 0. ARK pxor XDATA1, XKEY0 pxor XDATA2, XKEY0 pxor XDATA3, XKEY0 add IDX, 4*16 aesdec XDATA0, XKEY ; 1. DEC aesdec XDATA1, XKEY aesdec XDATA2, XKEY aesdec XDATA3, XKEY movdqa XKEY, [KEYS + 3*16] aesdec XDATA0, XKEY2 ; 2. DEC aesdec XDATA1, XKEY2 aesdec XDATA2, XKEY2 aesdec XDATA3, XKEY2 aesdec XDATA0, XKEY ; 3. 
DEC aesdec XDATA1, XKEY aesdec XDATA2, XKEY aesdec XDATA3, XKEY movdqa XKEY, [KEYS + 5*16] aesdec XDATA0, XKEY4 ; 4. DEC aesdec XDATA1, XKEY4 aesdec XDATA2, XKEY4 aesdec XDATA3, XKEY4 aesdec XDATA0, XKEY ; 5. DEC aesdec XDATA1, XKEY aesdec XDATA2, XKEY aesdec XDATA3, XKEY movdqa XKEY, [KEYS + 7*16] aesdec XDATA0, XKEY6 ; 6. DEC aesdec XDATA1, XKEY6 aesdec XDATA2, XKEY6 aesdec XDATA3, XKEY6 aesdec XDATA0, XKEY ; 7. DEC aesdec XDATA1, XKEY aesdec XDATA2, XKEY aesdec XDATA3, XKEY movdqa XKEY, [KEYS + 9*16] aesdec XDATA0, XKEY8 ; 8. DEC aesdec XDATA1, XKEY8 aesdec XDATA2, XKEY8 aesdec XDATA3, XKEY8 aesdec XDATA0, XKEY ; 9. DEC aesdec XDATA1, XKEY aesdec XDATA2, XKEY aesdec XDATA3, XKEY aesdeclast XDATA0, XKEY10 ; 10. DEC aesdeclast XDATA1, XKEY10 aesdeclast XDATA2, XKEY10 aesdeclast XDATA3, XKEY10 pxor XDATA0, XIV pxor XDATA1, XSAVED0 pxor XDATA2, XSAVED1 pxor XDATA3, XSAVED2 movdqu [OUT + IDX + 0*16 - 4*16], XDATA0 movdqu [OUT + IDX + 1*16 - 4*16], XDATA1 movdqu [OUT + IDX + 2*16 - 4*16], XDATA2 movdqu [OUT + IDX + 3*16 - 4*16], XDATA3 movdqa XIV, XSAVED3 CMP IDX, LEN jne main_loop done: ; Don't write back IV ; movdqu [IV], XIV ret intel-ipsec-mb-0.48/sse/aes128_cbc_mac_x4.asm000066400000000000000000000032441321406316400206210ustar00rootroot00000000000000;; ;; Copyright (c) 2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ;;; Routine to compute CBC-MAC based on 128 bit CBC AES encryptionk code %define CBC_MAC %include "aes_cbc_enc_128_x4.asm" intel-ipsec-mb-0.48/sse/aes128_cntr_by4_sse.asm000066400000000000000000000170261321406316400212400ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. 
;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "memcpy.asm" ; routine to do AES128 CNTR enc/decrypt "by4" ; XMM registers are clobbered. Saving/restoring must be done at a higher level extern byteswap_const, ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4 %define CONCAT(a,b) a %+ b %define MOVDQ movdqu %define xdata0 xmm0 %define xdata1 xmm1 %define xdata2 xmm2 %define xdata3 xmm3 %define xdata4 xmm4 %define xdata5 xmm5 %define xdata6 xmm6 %define xdata7 xmm7 %define xcounter xmm8 %define xbyteswap xmm9 %define xkey0 xmm10 %define xkey3 xmm11 %define xkey6 xmm12 %define xkey9 xmm13 %define xkeyA xmm14 %define xkeyB xmm15 %ifdef LINUX %define p_in rdi %define p_IV rsi %define p_keys rdx %define p_out rcx %define num_bytes r8 %define p_ivlen r9 %else %define p_in rcx %define p_IV rdx %define p_keys r8 %define p_out r9 %define num_bytes r10 %define p_ivlen qword [rsp + 8*6] %endif %define p_tmp rsp + _buffer %define tmp r11 %macro do_aes_load 1 do_aes %1, 1 %endmacro %macro do_aes_noload 1 do_aes %1, 0 %endmacro ; do_aes num_in_par load_keys ; This increments p_in, but not p_out %macro do_aes 2 %define %%by %1 %define %%load_keys %2 %if (%%load_keys) movdqa xkey0, [p_keys + 0*16] %endif movdqa xdata0, xcounter pshufb xdata0, xbyteswap %assign i 1 %rep (%%by - 1) movdqa CONCAT(xdata,i), xcounter paddd CONCAT(xdata,i), [rel CONCAT(ddq_add_,i)] pshufb CONCAT(xdata,i), xbyteswap %assign i (i + 1) %endrep movdqa xkeyA, [p_keys + 1*16] pxor xdata0, xkey0 paddd xcounter, [rel CONCAT(ddq_add_,%%by)] %assign i 1 %rep (%%by - 1) pxor CONCAT(xdata,i), xkey0 %assign i (i + 1) %endrep movdqa xkeyB, [p_keys + 2*16] %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkeyA ; key 1 %assign i (i+1) %endrep %if (%%load_keys) movdqa xkey3, [p_keys + 3*16] %endif %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkeyB ; key 2 %assign i (i+1) %endrep add p_in, 16*%%by movdqa xkeyB, [p_keys + 4*16] %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkey3 ; key 3 %assign i (i+1) %endrep movdqa xkeyA, [p_keys + 5*16] %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkeyB ; key 4 %assign i (i+1) %endrep %if (%%load_keys) movdqa xkey6, [p_keys + 6*16] %endif %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkeyA ; key 5 %assign i (i+1) %endrep movdqa xkeyA, [p_keys + 7*16] %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkey6 ; key 6 %assign i (i+1) %endrep movdqa xkeyB, [p_keys + 8*16] 
%assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkeyA ; key 7 %assign i (i+1) %endrep %if (%%load_keys) movdqa xkey9, [p_keys + 9*16] %endif %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkeyB ; key 8 %assign i (i+1) %endrep movdqa xkeyB, [p_keys + 10*16] %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkey9 ; key 9 %assign i (i+1) %endrep %assign i 0 %rep %%by aesenclast CONCAT(xdata,i), xkeyB ; key 10 %assign i (i+1) %endrep %assign i 0 %rep (%%by / 2) %assign j (i+1) MOVDQ xkeyA, [p_in + i*16 - 16*%%by] MOVDQ xkeyB, [p_in + j*16 - 16*%%by] pxor CONCAT(xdata,i), xkeyA pxor CONCAT(xdata,j), xkeyB %assign i (i+2) %endrep %if (i < %%by) MOVDQ xkeyA, [p_in + i*16 - 16*%%by] pxor CONCAT(xdata,i), xkeyA %endif %assign i 0 %rep %%by MOVDQ [p_out + i*16], CONCAT(xdata,i) %assign i (i+1) %endrep %endmacro struc STACK _buffer: resq 2 _rsp_save: resq 1 endstruc ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; section .text ;; aes_cntr_128_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len) align 32 MKGLOBAL(aes_cntr_128_sse,function,internal) aes_cntr_128_sse: %ifndef LINUX mov num_bytes, [rsp + 8*5] ; arg5 %endif movdqa xbyteswap, [rel byteswap_const] test p_ivlen, 16 jnz iv_is_16_bytes ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001 mov DWORD(tmp), 0x01000000 pinsrq xcounter, [p_IV], 0 pinsrd xcounter, [p_IV + 8], 2 pinsrd xcounter, DWORD(tmp), 3 bswap_iv: pshufb xcounter, xbyteswap mov tmp, num_bytes and tmp, 3*16 jz chk ; x4 > or < 15 (not 3 lines) ; 1 <= tmp <= 3 cmp tmp, 2*16 jg eq3 je eq2 eq1: do_aes_load 1 ; 1 block add p_out, 1*16 jmp chk eq2: do_aes_load 2 ; 2 blocks add p_out, 2*16 jmp chk eq3: do_aes_load 3 ; 3 blocks add p_out, 3*16 ; fall through to chk chk: and num_bytes, ~(3*16) jz do_return2 cmp num_bytes, 16 jb last ; process multiples of 4 blocks movdqa xkey0, [p_keys + 0*16] movdqa xkey3, [p_keys + 3*16] movdqa xkey6, [p_keys + 6*16] movdqa xkey9, [p_keys + 9*16] jmp main_loop2 align 32 main_loop2: ; num_bytes is a multiple of 4 blocks + partial bytes do_aes_noload 4 add p_out, 4*16 sub num_bytes, 4*16 cmp num_bytes, 4*16 jae main_loop2 test num_bytes, 15 ; partial bytes to be processed? 
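	;;
	;; Editor's note -- illustrative comment, not part of the original
	;; source. For the 12-byte IV case handled above, the 16-byte counter
	;; block is assembled as
	;;     bytes  0..11   nonce + ESP IV (read from [p_IV])
	;;     bytes 12..15   block counter 0x00000001 (big endian)
	;; matching the RFC 3686 style initial counter block; it is then
	;; byte-swapped so that paddd can increment it per block. A tail of
	;; fewer than 16 bytes is detected here and handled at "last:" below:
	;; one extra keystream block is generated into a 16-byte stack scratch
	;; buffer and only num_bytes bytes are copied in and out.
	;;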
jnz last do_return2: ; don't return updated IV ; pshufb xcounter, xbyteswap ; movdqu [p_IV], xcounter ret last: ;; Code dealing with the partial block cases ; reserve 16 byte aligned buffer on the stack mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _rsp_save], rax ; save SP ; copy input bytes into scratch buffer memcpy_sse_16_1 p_tmp, p_in, num_bytes, tmp, rax ; Encryption of a single partial block (p_tmp) pshufb xcounter, xbyteswap movdqa xdata0, xcounter pxor xdata0, [p_keys + 16*0] %assign i 1 %rep 9 aesenc xdata0, [p_keys + 16*i] %assign i (i+1) %endrep ; created keystream aesenclast xdata0, [p_keys + 16*i] ; xor keystream with the message (scratch) pxor xdata0, [p_tmp] movdqa [p_tmp], xdata0 ; copy result into the output buffer memcpy_sse_16_1 p_out, p_tmp, num_bytes, tmp, rax ; remove the stack frame mov rsp, [rsp + _rsp_save] ; original SP jmp do_return2 iv_is_16_bytes: ; Read 16 byte IV: Nonce + ESP IV + block counter (BE) movdqu xcounter, [p_IV] jmp bswap_iv intel-ipsec-mb-0.48/sse/aes192_cbc_dec_by4_sse.asm000066400000000000000000000267411321406316400216410ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ; routine to do AES cbc decrypt on 16n bytes doing AES by 4 ; XMM registers are clobbered. 
Saving/restoring must be done at a higher level ; void aes_cbc_dec_192_sse(void *in, ; UINT128 *IV, ; UINT128 keys[13], // +1 over key length ; void *out, ; UINT64 len_bytes); ; ; arg 1: IN: pointer to input (cipher text) ; arg 2: IV: pointer to IV ; arg 3: KEYS: pointer to keys ; arg 4: OUT: pointer to output (plain text) ; arg 5: LEN: length in bytes (multiple of 16) ; %include "os.asm" %define MOVDQ movdqu %ifdef LINUX %define IN rdi %define IV rsi %define KEYS rdx %define OUT rcx %define LEN r8 %else %define IN rcx %define IV rdx %define KEYS r8 %define OUT r9 %define LEN r10 %endif %define IDX rax %define TMP IDX %define XDATA0 xmm0 %define XDATA1 xmm1 %define XDATA2 xmm2 %define XDATA3 xmm3 %define XKEY0 xmm4 %define XKEY2 xmm5 %define XKEY4 xmm6 %define XKEY6 xmm7 %define XKEY10 xmm8 %define XIV xmm9 %define XSAVED0 xmm10 %define XSAVED1 xmm11 %define XSAVED2 xmm12 %define XSAVED3 xmm13 %define XKEY_A xmm14 %define XKEY_B xmm15 %define IV_TMP XSAVED3 section .text MKGLOBAL(aes_cbc_dec_192_sse,function,internal) aes_cbc_dec_192_sse: %ifndef LINUX mov LEN, [rsp + 8*5] %endif mov TMP, LEN and TMP, 3*16 jz initial_4 cmp TMP, 2*16 jb initial_1 ja initial_3 initial_2: ; load cipher text movdqu XDATA0, [IN + 0*16] movdqu XDATA1, [IN + 1*16] movdqa XKEY0, [KEYS + 0*16] ; save cipher text movdqa XSAVED0, XDATA0 movdqa XIV, XDATA1 pxor XDATA0, XKEY0 ; 0. ARK pxor XDATA1, XKEY0 movdqa XKEY2, [KEYS + 2*16] aesdec XDATA0, [KEYS + 1*16] ; 1. DEC aesdec XDATA1, [KEYS + 1*16] mov IDX, 2*16 aesdec XDATA0, XKEY2 ; 2. DEC aesdec XDATA1, XKEY2 movdqa XKEY4, [KEYS + 4*16] aesdec XDATA0, [KEYS + 3*16] ; 3. DEC aesdec XDATA1, [KEYS + 3*16] movdqu IV_TMP, [IV] aesdec XDATA0, XKEY4 ; 4. DEC aesdec XDATA1, XKEY4 movdqa XKEY6, [KEYS + 6*16] aesdec XDATA0, [KEYS + 5*16] ; 5. DEC aesdec XDATA1, [KEYS + 5*16] aesdec XDATA0, XKEY6 ; 6. DEC aesdec XDATA1, XKEY6 movdqa XKEY_B, [KEYS + 8*16] aesdec XDATA0, [KEYS + 7*16] ; 7. DEC aesdec XDATA1, [KEYS + 7*16] aesdec XDATA0, XKEY_B ; 8. DEC aesdec XDATA1, XKEY_B movdqa XKEY10, [KEYS + 10*16] aesdec XDATA0, [KEYS + 9*16] ; 9. DEC aesdec XDATA1, [KEYS + 9*16] aesdec XDATA0, XKEY10 ; 10. DEC aesdec XDATA1, XKEY10 aesdec XDATA0, [KEYS + 11*16] ; 11. DEC aesdec XDATA1, [KEYS + 11*16] aesdeclast XDATA0, [KEYS + 12*16] ; 12. DEC aesdeclast XDATA1, [KEYS + 12*16] pxor XDATA0, IV_TMP pxor XDATA1, XSAVED0 movdqu [OUT + 0*16], XDATA0 movdqu [OUT + 1*16], XDATA1 cmp LEN, 2*16 je done jmp main_loop align 16 initial_1: ; load cipher text movdqu XDATA0, [IN + 0*16] movdqa XKEY0, [KEYS + 0*16] ; save cipher text movdqa XIV, XDATA0 pxor XDATA0, XKEY0 ; 0. ARK movdqa XKEY2, [KEYS + 2*16] aesdec XDATA0, [KEYS + 1*16] ; 1. DEC mov IDX, 1*16 aesdec XDATA0, XKEY2 ; 2. DEC movdqa XKEY4, [KEYS + 4*16] aesdec XDATA0, [KEYS + 3*16] ; 3. DEC movdqu IV_TMP, [IV] aesdec XDATA0, XKEY4 ; 4. DEC movdqa XKEY6, [KEYS + 6*16] aesdec XDATA0, [KEYS + 5*16] ; 5. DEC aesdec XDATA0, XKEY6 ; 6. DEC movdqa XKEY_B, [KEYS + 8*16] aesdec XDATA0, [KEYS + 7*16] ; 7. DEC aesdec XDATA0, XKEY_B ; 8. DEC movdqa XKEY10, [KEYS + 10*16] aesdec XDATA0, [KEYS + 9*16] ; 9. DEC aesdec XDATA0, XKEY10 ; 10. DEC aesdec XDATA0, [KEYS + 11*16] ; 11. DEC aesdeclast XDATA0, [KEYS + 12*16] ; 12. 
DEC pxor XDATA0, IV_TMP movdqu [OUT + 0*16], XDATA0 cmp LEN, 1*16 je done jmp main_loop initial_3: ; load cipher text movdqu XDATA0, [IN + 0*16] movdqu XDATA1, [IN + 1*16] movdqu XDATA2, [IN + 2*16] movdqa XKEY0, [KEYS + 0*16] ; save cipher text movdqa XSAVED0, XDATA0 movdqa XSAVED1, XDATA1 movdqa XIV, XDATA2 movdqa XKEY_A, [KEYS + 1*16] pxor XDATA0, XKEY0 ; 0. ARK pxor XDATA1, XKEY0 pxor XDATA2, XKEY0 movdqa XKEY2, [KEYS + 2*16] aesdec XDATA0, XKEY_A ; 1. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A movdqa XKEY_A, [KEYS + 3*16] mov IDX, 3*16 aesdec XDATA0, XKEY2 ; 2. DEC aesdec XDATA1, XKEY2 aesdec XDATA2, XKEY2 movdqa XKEY4, [KEYS + 4*16] aesdec XDATA0, XKEY_A ; 3. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A movdqa XKEY_A, [KEYS + 5*16] movdqu IV_TMP, [IV] aesdec XDATA0, XKEY4 ; 4. DEC aesdec XDATA1, XKEY4 aesdec XDATA2, XKEY4 movdqa XKEY6, [KEYS + 6*16] aesdec XDATA0, XKEY_A ; 5. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A movdqa XKEY_A, [KEYS + 7*16] aesdec XDATA0, XKEY6 ; 6. DEC aesdec XDATA1, XKEY6 aesdec XDATA2, XKEY6 movdqa XKEY_B, [KEYS + 8*16] aesdec XDATA0, XKEY_A ; 7. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A movdqa XKEY_A, [KEYS + 9*16] aesdec XDATA0, XKEY_B ; 8. DEC aesdec XDATA1, XKEY_B aesdec XDATA2, XKEY_B movdqa XKEY10, [KEYS + 10*16] aesdec XDATA0, XKEY_A ; 9. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A movdqa XKEY_A, [KEYS + 11*16] aesdec XDATA0, XKEY10 ; 10. DEC aesdec XDATA1, XKEY10 aesdec XDATA2, XKEY10 movdqa XKEY_B, [KEYS + 12*16] aesdec XDATA0, XKEY_A ; 11. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A movdqa XKEY_A, [KEYS + 13*16] aesdeclast XDATA0, XKEY_B ; 12. DEC aesdeclast XDATA1, XKEY_B aesdeclast XDATA2, XKEY_B pxor XDATA0, IV_TMP pxor XDATA1, XSAVED0 pxor XDATA2, XSAVED1 movdqu [OUT + 0*16], XDATA0 movdqu [OUT + 1*16], XDATA1 movdqu [OUT + 2*16], XDATA2 cmp LEN, 3*16 je done jmp main_loop align 16 initial_4: ; load cipher text movdqu XDATA0, [IN + 0*16] movdqu XDATA1, [IN + 1*16] movdqu XDATA2, [IN + 2*16] movdqu XDATA3, [IN + 3*16] movdqa XKEY0, [KEYS + 0*16] ; save cipher text movdqa XSAVED0, XDATA0 movdqa XSAVED1, XDATA1 movdqa XSAVED2, XDATA2 movdqa XIV, XDATA3 movdqa XKEY_A, [KEYS + 1*16] pxor XDATA0, XKEY0 ; 0. ARK pxor XDATA1, XKEY0 pxor XDATA2, XKEY0 pxor XDATA3, XKEY0 movdqa XKEY2, [KEYS + 2*16] aesdec XDATA0, XKEY_A ; 1. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A aesdec XDATA3, XKEY_A movdqa XKEY_A, [KEYS + 3*16] mov IDX, 4*16 aesdec XDATA0, XKEY2 ; 2. DEC aesdec XDATA1, XKEY2 aesdec XDATA2, XKEY2 aesdec XDATA3, XKEY2 movdqa XKEY4, [KEYS + 4*16] aesdec XDATA0, XKEY_A ; 3. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A aesdec XDATA3, XKEY_A movdqa XKEY_A, [KEYS + 5*16] movdqu IV_TMP, [IV] aesdec XDATA0, XKEY4 ; 4. DEC aesdec XDATA1, XKEY4 aesdec XDATA2, XKEY4 aesdec XDATA3, XKEY4 movdqa XKEY6, [KEYS + 6*16] aesdec XDATA0, XKEY_A ; 5. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A aesdec XDATA3, XKEY_A movdqa XKEY_A, [KEYS + 7*16] aesdec XDATA0, XKEY6 ; 6. DEC aesdec XDATA1, XKEY6 aesdec XDATA2, XKEY6 aesdec XDATA3, XKEY6 movdqa XKEY_B, [KEYS + 8*16] aesdec XDATA0, XKEY_A ; 7. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A aesdec XDATA3, XKEY_A movdqa XKEY_A, [KEYS + 9*16] aesdec XDATA0, XKEY_B ; 8. DEC aesdec XDATA1, XKEY_B aesdec XDATA2, XKEY_B aesdec XDATA3, XKEY_B movdqa XKEY10, [KEYS + 10*16] aesdec XDATA0, XKEY_A ; 9. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A aesdec XDATA3, XKEY_A movdqa XKEY_A, [KEYS + 11*16] aesdec XDATA0, XKEY10 ; 10. 
DEC aesdec XDATA1, XKEY10 aesdec XDATA2, XKEY10 aesdec XDATA3, XKEY10 movdqa XKEY_B, [KEYS + 12*16] aesdec XDATA0, XKEY_A ; 11. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A aesdec XDATA3, XKEY_A aesdeclast XDATA0, XKEY_B ; 12. DEC aesdeclast XDATA1, XKEY_B aesdeclast XDATA2, XKEY_B aesdeclast XDATA3, XKEY_B pxor XDATA0, IV_TMP pxor XDATA1, XSAVED0 pxor XDATA2, XSAVED1 pxor XDATA3, XSAVED2 movdqu [OUT + 0*16], XDATA0 movdqu [OUT + 1*16], XDATA1 movdqu [OUT + 2*16], XDATA2 movdqu [OUT + 3*16], XDATA3 cmp LEN, 4*16 jz done jmp main_loop align 16 main_loop: ; load cipher text movdqu XDATA0, [IN + IDX + 0*16] movdqu XDATA1, [IN + IDX + 1*16] movdqu XDATA2, [IN + IDX + 2*16] movdqu XDATA3, [IN + IDX + 3*16] ; save cipher text movdqa XSAVED0, XDATA0 movdqa XSAVED1, XDATA1 movdqa XSAVED2, XDATA2 movdqa XSAVED3, XDATA3 movdqa XKEY_A, [KEYS + 1*16] pxor XDATA0, XKEY0 ; 0. ARK pxor XDATA1, XKEY0 pxor XDATA2, XKEY0 pxor XDATA3, XKEY0 add IDX, 4*16 aesdec XDATA0, XKEY_A ; 1. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A aesdec XDATA3, XKEY_A movdqa XKEY_A, [KEYS + 3*16] aesdec XDATA0, XKEY2 ; 2. DEC aesdec XDATA1, XKEY2 aesdec XDATA2, XKEY2 aesdec XDATA3, XKEY2 aesdec XDATA0, XKEY_A ; 3. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A aesdec XDATA3, XKEY_A movdqa XKEY_A, [KEYS + 5*16] aesdec XDATA0, XKEY4 ; 4. DEC aesdec XDATA1, XKEY4 aesdec XDATA2, XKEY4 aesdec XDATA3, XKEY4 aesdec XDATA0, XKEY_A ; 5. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A aesdec XDATA3, XKEY_A movdqa XKEY_A, [KEYS + 7*16] aesdec XDATA0, XKEY6 ; 6. DEC aesdec XDATA1, XKEY6 aesdec XDATA2, XKEY6 aesdec XDATA3, XKEY6 movdqa XKEY_B, [KEYS + 8*16] aesdec XDATA0, XKEY_A ; 7. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A aesdec XDATA3, XKEY_A movdqa XKEY_A, [KEYS + 9*16] aesdec XDATA0, XKEY_B ; 8. DEC aesdec XDATA1, XKEY_B aesdec XDATA2, XKEY_B aesdec XDATA3, XKEY_B aesdec XDATA0, XKEY_A ; 9. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A aesdec XDATA3, XKEY_A movdqa XKEY_A, [KEYS + 11*16] aesdec XDATA0, XKEY10 ; 10. DEC aesdec XDATA1, XKEY10 aesdec XDATA2, XKEY10 aesdec XDATA3, XKEY10 movdqa XKEY_B, [KEYS + 12*16] aesdec XDATA0, XKEY_A ; 11. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A aesdec XDATA3, XKEY_A aesdeclast XDATA0, XKEY_B ; 12. DECLAST aesdeclast XDATA1, XKEY_B aesdeclast XDATA2, XKEY_B aesdeclast XDATA3, XKEY_B pxor XDATA0, XIV pxor XDATA1, XSAVED0 pxor XDATA2, XSAVED1 pxor XDATA3, XSAVED2 movdqu [OUT + IDX + 0*16 - 4*16], XDATA0 movdqu [OUT + IDX + 1*16 - 4*16], XDATA1 movdqu [OUT + IDX + 2*16 - 4*16], XDATA2 movdqu [OUT + IDX + 3*16 - 4*16], XDATA3 movdqa XIV, XSAVED3 CMP IDX, LEN jne main_loop done: ; Don't write back IV ; movdqu [IV], XIV ret intel-ipsec-mb-0.48/sse/aes192_cntr_by4_sse.asm000066400000000000000000000172561321406316400212460ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. 
;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "memcpy.asm" ; routine to do AES192 CNTR enc/decrypt "by4" ; XMM registers are clobbered. Saving/restoring must be done at a higher level extern byteswap_const, ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4 %define CONCAT(a,b) a %+ b %define MOVDQ movdqu %define xdata0 xmm0 %define xdata1 xmm1 %define xdata2 xmm2 %define xdata3 xmm3 %define xdata4 xmm4 %define xdata5 xmm5 %define xdata6 xmm6 %define xdata7 xmm7 %define xcounter xmm8 %define xbyteswap xmm9 %define xkey0 xmm10 %define xkey4 xmm11 %define xkey8 xmm12 %define xkey12 xmm13 %define xkeyA xmm14 %define xkeyB xmm15 %ifdef LINUX %define p_in rdi %define p_IV rsi %define p_keys rdx %define p_out rcx %define num_bytes r8 %define p_ivlen r9 %else %define p_in rcx %define p_IV rdx %define p_keys r8 %define p_out r9 %define num_bytes r10 %define p_ivlen qword [rsp + 8*6] %endif %define tmp r11 %define p_tmp rsp + _buffer %macro do_aes_load 1 do_aes %1, 1 %endmacro %macro do_aes_noload 1 do_aes %1, 0 %endmacro ; do_aes num_in_par load_keys ; This increments p_in, but not p_out %macro do_aes 2 %define %%by %1 %define %%load_keys %2 %if (%%load_keys) movdqa xkey0, [p_keys + 0*16] %endif movdqa xdata0, xcounter pshufb xdata0, xbyteswap %assign i 1 %rep (%%by - 1) movdqa CONCAT(xdata,i), xcounter paddd CONCAT(xdata,i), [rel CONCAT(ddq_add_,i)] pshufb CONCAT(xdata,i), xbyteswap %assign i (i + 1) %endrep movdqa xkeyA, [p_keys + 1*16] pxor xdata0, xkey0 paddd xcounter, [rel CONCAT(ddq_add_,%%by)] %assign i 1 %rep (%%by - 1) pxor CONCAT(xdata,i), xkey0 %assign i (i + 1) %endrep movdqa xkeyB, [p_keys + 2*16] %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkeyA ; key 1 %assign i (i+1) %endrep movdqa xkeyA, [p_keys + 3*16] %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkeyB ; key 2 %assign i (i+1) %endrep add p_in, 16*%%by %if (%%load_keys) movdqa xkey4, [p_keys + 4*16] %endif %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkeyA ; key 3 %assign i (i+1) %endrep movdqa xkeyA, [p_keys + 5*16] %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkey4 ; key 4 %assign i (i+1) %endrep movdqa xkeyB, [p_keys + 6*16] %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkeyA ; key 5 %assign i (i+1) %endrep movdqa xkeyA, [p_keys + 7*16] %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkeyB ; key 6 %assign i (i+1) %endrep %if (%%load_keys) movdqa xkey8, [p_keys + 8*16] %endif %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkeyA ; key 7 %assign i (i+1) %endrep movdqa xkeyA, [p_keys + 9*16] %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkey8 ; key 8 %assign i (i+1) %endrep movdqa xkeyB, [p_keys + 10*16] %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkeyA ; key 9 %assign i (i+1) %endrep movdqa xkeyA, [p_keys + 11*16] %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkeyB ; key 10 
%assign i (i+1) %endrep %if (%%load_keys) movdqa xkey12, [p_keys + 12*16] %endif %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkeyA ; key 11 %assign i (i+1) %endrep %assign i 0 %rep %%by aesenclast CONCAT(xdata,i), xkey12 ; key 12 %assign i (i+1) %endrep %assign i 0 %rep (%%by / 2) %assign j (i+1) MOVDQ xkeyA, [p_in + i*16 - 16*%%by] MOVDQ xkeyB, [p_in + j*16 - 16*%%by] pxor CONCAT(xdata,i), xkeyA pxor CONCAT(xdata,j), xkeyB %assign i (i+2) %endrep %if (i < %%by) MOVDQ xkeyA, [p_in + i*16 - 16*%%by] pxor CONCAT(xdata,i), xkeyA %endif %assign i 0 %rep %%by MOVDQ [p_out + i*16], CONCAT(xdata,i) %assign i (i+1) %endrep %endmacro struc STACK _buffer: resq 2 _rsp_save: resq 1 endstruc ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; section .text ;; aes_cntr_192_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len) align 32 MKGLOBAL(aes_cntr_192_sse,function,internal) aes_cntr_192_sse: %ifndef LINUX mov num_bytes, [rsp + 8*5] %endif movdqa xbyteswap, [rel byteswap_const] test p_ivlen, 16 jnz iv_is_16_bytes ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001 mov DWORD(tmp), 0x01000000 pinsrq xcounter, [p_IV], 0 pinsrd xcounter, [p_IV + 8], 2 pinsrd xcounter, DWORD(tmp), 3 bswap_iv: pshufb xcounter, xbyteswap mov tmp, num_bytes and tmp, 3*16 jz chk ; x4 > or < 15 (not 3 lines) ; 1 <= tmp <= 3 cmp tmp, 2*16 jg eq3 je eq2 eq1: do_aes_load 1 add p_out, 1*16 jmp chk eq2: do_aes_load 2 add p_out, 2*16 jmp chk eq3: do_aes_load 3 add p_out, 3*16 ; fall through to chk chk: and num_bytes, ~(3*16) jz do_return2 cmp num_bytes, 16 jb last ; process multiples of 4 blocks movdqa xkey0, [p_keys + 0*16] movdqa xkey4, [p_keys + 4*16] movdqa xkey8, [p_keys + 8*16] movdqa xkey12, [p_keys + 12*16] jmp main_loop2 align 32 main_loop2: ; num_bytes is a multiple of 4 and >0 do_aes_noload 4 add p_out, 4*16 sub num_bytes, 4*16 cmp num_bytes, 4*16 jae main_loop2 test num_bytes, 15 ; partial bytes to be processed? 
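	;;
	;; Editor's note -- descriptive comment, not part of the original
	;; source. This is the AES-192 variant of the CTR routine: the do_aes
	;; macro above issues 11 aesenc rounds plus a final aesenclast
	;; (12 rounds total), and main_loop2 keeps round keys 0, 4, 8 and 12
	;; resident in xkey0/xkey4/xkey8/xkey12 across iterations, re-loading
	;; the remaining round keys from [p_keys] on every pass (the AES-128
	;; version caches keys 0, 3, 6 and 9 instead). The partial-block tail
	;; at "last:" mirrors the AES-128 routine, only with 11 aesenc steps
	;; before the aesenclast.
	;;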
jnz last do_return2: ; don't return updated IV ; pshufb xcounter, xbyteswap ; movdqu [p_IV], xcounter ret last: ;; Code dealing with the partial block cases ; reserve 16 byte aligned buffer on stack mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _rsp_save], rax ; save SP ; copy input bytes into scratch buffer memcpy_sse_16_1 p_tmp, p_in, num_bytes, tmp, rax ; Encryption of a single partial block (p_tmp) pshufb xcounter, xbyteswap movdqa xdata0, xcounter pxor xdata0, [p_keys + 16*0] %assign i 1 %rep 11 aesenc xdata0, [p_keys + 16*i] %assign i (i+1) %endrep ; created keystream aesenclast xdata0, [p_keys + 16*i] ; xor keystream with the message (scratch) pxor xdata0, [p_tmp] movdqa [p_tmp], xdata0 ; copy result into the output buffer memcpy_sse_16_1 p_out, p_tmp, num_bytes, tmp, rax ; remove the stack frame mov rsp, [rsp + _rsp_save] ; original SP jmp do_return2 iv_is_16_bytes: ; Read 16 byte IV: Nonce + ESP IV + block counter (BE) movdqu xcounter, [p_IV] jmp bswap_iv intel-ipsec-mb-0.48/sse/aes256_cbc_dec_by4_sse.asm000066400000000000000000000306071321406316400216360ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ; routine to do AES cbc decrypt on 16n bytes doing AES by 4 ; XMM registers are clobbered. 
Saving/restoring must be done at a higher level ; void aes_cbc_dec_256_sse(void *in, ; UINT128 *IV, ; UINT128 keys[15], ; void *out, ; UINT64 len_bytes); ; ; arg 1: rcx: pointer to input (cipher text) ; arg 2: rdx: pointer to IV ; arg 3: r8: pointer to keys ; arg 4: r9: pointer to output (plain text) ; arg 5: sp: length in bytes (multiple of 16) ; %include "os.asm" %define MOVDQ movdqu %ifdef LINUX %define IN rdi %define IV rsi %define KEYS rdx %define OUT rcx %define LEN r8 %else %define IN rcx %define IV rdx %define KEYS r8 %define OUT r9 %define LEN r10 %endif %define IDX rax %define TMP IDX %define XDATA0 xmm0 %define XDATA1 xmm1 %define XDATA2 xmm2 %define XDATA3 xmm3 %define XKEY0 xmm4 %define XKEY2 xmm5 %define XKEY4 xmm6 %define XKEY6 xmm7 %define XKEY10 xmm8 %define XIV xmm9 %define XSAVED0 xmm10 %define XSAVED1 xmm11 %define XSAVED2 xmm12 %define XSAVED3 xmm13 %define XKEY_A xmm14 %define XKEY_B xmm15 %define IV_TMP XSAVED3 section .text MKGLOBAL(aes_cbc_dec_256_sse,function,internal) aes_cbc_dec_256_sse: %ifndef LINUX mov LEN, [rsp + 8*5] %endif mov TMP, LEN and TMP, 3*16 jz initial_4 cmp TMP, 2*16 jb initial_1 ja initial_3 initial_2: ; load cipher text movdqu XDATA0, [IN + 0*16] movdqu XDATA1, [IN + 1*16] movdqa XKEY0, [KEYS + 0*16] ; save cipher text movdqa XSAVED0, XDATA0 movdqa XIV, XDATA1 pxor XDATA0, XKEY0 ; 0. ARK pxor XDATA1, XKEY0 movdqa XKEY2, [KEYS + 2*16] aesdec XDATA0, [KEYS + 1*16] ; 1. DEC aesdec XDATA1, [KEYS + 1*16] mov IDX, 2*16 aesdec XDATA0, XKEY2 ; 2. DEC aesdec XDATA1, XKEY2 movdqa XKEY4, [KEYS + 4*16] aesdec XDATA0, [KEYS + 3*16] ; 3. DEC aesdec XDATA1, [KEYS + 3*16] movdqu IV_TMP, [IV] aesdec XDATA0, XKEY4 ; 4. DEC aesdec XDATA1, XKEY4 movdqa XKEY6, [KEYS + 6*16] aesdec XDATA0, [KEYS + 5*16] ; 5. DEC aesdec XDATA1, [KEYS + 5*16] aesdec XDATA0, XKEY6 ; 6. DEC aesdec XDATA1, XKEY6 movdqa XKEY_B, [KEYS + 8*16] aesdec XDATA0, [KEYS + 7*16] ; 7. DEC aesdec XDATA1, [KEYS + 7*16] aesdec XDATA0, XKEY_B ; 8. DEC aesdec XDATA1, XKEY_B movdqa XKEY10, [KEYS + 10*16] aesdec XDATA0, [KEYS + 9*16] ; 9. DEC aesdec XDATA1, [KEYS + 9*16] aesdec XDATA0, XKEY10 ; 10. DEC aesdec XDATA1, XKEY10 aesdec XDATA0, [KEYS + 11*16] ; 11. DEC aesdec XDATA1, [KEYS + 11*16] aesdec XDATA0, [KEYS + 12*16] ; 12. DEC aesdec XDATA1, [KEYS + 12*16] aesdec XDATA0, [KEYS + 13*16] ; 13. DEC aesdec XDATA1, [KEYS + 13*16] aesdeclast XDATA0, [KEYS + 14*16] ; 14. DEC aesdeclast XDATA1, [KEYS + 14*16] pxor XDATA0, IV_TMP pxor XDATA1, XSAVED0 movdqu [OUT + 0*16], XDATA0 movdqu [OUT + 1*16], XDATA1 cmp LEN, 2*16 je done jmp main_loop align 16 initial_1: ; load cipher text movdqu XDATA0, [IN + 0*16] movdqa XKEY0, [KEYS + 0*16] ; save cipher text movdqa XIV, XDATA0 pxor XDATA0, XKEY0 ; 0. ARK movdqa XKEY2, [KEYS + 2*16] aesdec XDATA0, [KEYS + 1*16] ; 1. DEC mov IDX, 1*16 aesdec XDATA0, XKEY2 ; 2. DEC movdqa XKEY4, [KEYS + 4*16] aesdec XDATA0, [KEYS + 3*16] ; 3. DEC movdqu IV_TMP, [IV] aesdec XDATA0, XKEY4 ; 4. DEC movdqa XKEY6, [KEYS + 6*16] aesdec XDATA0, [KEYS + 5*16] ; 5. DEC aesdec XDATA0, XKEY6 ; 6. DEC movdqa XKEY_B, [KEYS + 8*16] aesdec XDATA0, [KEYS + 7*16] ; 7. DEC aesdec XDATA0, XKEY_B ; 8. DEC movdqa XKEY10, [KEYS + 10*16] aesdec XDATA0, [KEYS + 9*16] ; 9. DEC aesdec XDATA0, XKEY10 ; 10. DEC aesdec XDATA0, [KEYS + 11*16] ; 11. DEC aesdec XDATA0, [KEYS + 12*16] ; 12. DEC aesdec XDATA0, [KEYS + 13*16] ; 13. DEC aesdeclast XDATA0, [KEYS + 14*16] ; 14. 
DEC pxor XDATA0, IV_TMP movdqu [OUT + 0*16], XDATA0 cmp LEN, 1*16 je done jmp main_loop initial_3: ; load cipher text movdqu XDATA0, [IN + 0*16] movdqu XDATA1, [IN + 1*16] movdqu XDATA2, [IN + 2*16] movdqa XKEY0, [KEYS + 0*16] ; save cipher text movdqa XSAVED0, XDATA0 movdqa XSAVED1, XDATA1 movdqa XIV, XDATA2 movdqa XKEY_A, [KEYS + 1*16] pxor XDATA0, XKEY0 ; 0. ARK pxor XDATA1, XKEY0 pxor XDATA2, XKEY0 movdqa XKEY2, [KEYS + 2*16] aesdec XDATA0, XKEY_A ; 1. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A movdqa XKEY_A, [KEYS + 3*16] mov IDX, 3*16 aesdec XDATA0, XKEY2 ; 2. DEC aesdec XDATA1, XKEY2 aesdec XDATA2, XKEY2 movdqa XKEY4, [KEYS + 4*16] aesdec XDATA0, XKEY_A ; 3. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A movdqa XKEY_A, [KEYS + 5*16] movdqu IV_TMP, [IV] aesdec XDATA0, XKEY4 ; 4. DEC aesdec XDATA1, XKEY4 aesdec XDATA2, XKEY4 movdqa XKEY6, [KEYS + 6*16] aesdec XDATA0, XKEY_A ; 5. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A movdqa XKEY_A, [KEYS + 7*16] aesdec XDATA0, XKEY6 ; 6. DEC aesdec XDATA1, XKEY6 aesdec XDATA2, XKEY6 movdqa XKEY_B, [KEYS + 8*16] aesdec XDATA0, XKEY_A ; 7. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A movdqa XKEY_A, [KEYS + 9*16] aesdec XDATA0, XKEY_B ; 8. DEC aesdec XDATA1, XKEY_B aesdec XDATA2, XKEY_B movdqa XKEY10, [KEYS + 10*16] aesdec XDATA0, XKEY_A ; 9. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A movdqa XKEY_A, [KEYS + 11*16] aesdec XDATA0, XKEY10 ; 10. DEC aesdec XDATA1, XKEY10 aesdec XDATA2, XKEY10 movdqa XKEY_B, [KEYS + 12*16] aesdec XDATA0, XKEY_A ; 11. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A movdqa XKEY_A, [KEYS + 13*16] aesdec XDATA0, XKEY_B ; 12. DEC aesdec XDATA1, XKEY_B aesdec XDATA2, XKEY_B movdqa XKEY_B, [KEYS + 14*16] aesdec XDATA0, XKEY_A ; 13. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A aesdeclast XDATA0, XKEY_B ; 14. DEC aesdeclast XDATA1, XKEY_B aesdeclast XDATA2, XKEY_B pxor XDATA0, IV_TMP pxor XDATA1, XSAVED0 pxor XDATA2, XSAVED1 movdqu [OUT + 0*16], XDATA0 movdqu [OUT + 1*16], XDATA1 movdqu [OUT + 2*16], XDATA2 cmp LEN, 3*16 je done jmp main_loop align 16 initial_4: ; load cipher text movdqu XDATA0, [IN + 0*16] movdqu XDATA1, [IN + 1*16] movdqu XDATA2, [IN + 2*16] movdqu XDATA3, [IN + 3*16] movdqa XKEY0, [KEYS + 0*16] ; save cipher text movdqa XSAVED0, XDATA0 movdqa XSAVED1, XDATA1 movdqa XSAVED2, XDATA2 movdqa XIV, XDATA3 movdqa XKEY_A, [KEYS + 1*16] pxor XDATA0, XKEY0 ; 0. ARK pxor XDATA1, XKEY0 pxor XDATA2, XKEY0 pxor XDATA3, XKEY0 movdqa XKEY2, [KEYS + 2*16] aesdec XDATA0, XKEY_A ; 1. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A aesdec XDATA3, XKEY_A movdqa XKEY_A, [KEYS + 3*16] mov IDX, 4*16 aesdec XDATA0, XKEY2 ; 2. DEC aesdec XDATA1, XKEY2 aesdec XDATA2, XKEY2 aesdec XDATA3, XKEY2 movdqa XKEY4, [KEYS + 4*16] aesdec XDATA0, XKEY_A ; 3. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A aesdec XDATA3, XKEY_A movdqa XKEY_A, [KEYS + 5*16] movdqu IV_TMP, [IV] aesdec XDATA0, XKEY4 ; 4. DEC aesdec XDATA1, XKEY4 aesdec XDATA2, XKEY4 aesdec XDATA3, XKEY4 movdqa XKEY6, [KEYS + 6*16] aesdec XDATA0, XKEY_A ; 5. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A aesdec XDATA3, XKEY_A movdqa XKEY_A, [KEYS + 7*16] aesdec XDATA0, XKEY6 ; 6. DEC aesdec XDATA1, XKEY6 aesdec XDATA2, XKEY6 aesdec XDATA3, XKEY6 movdqa XKEY_B, [KEYS + 8*16] aesdec XDATA0, XKEY_A ; 7. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A aesdec XDATA3, XKEY_A movdqa XKEY_A, [KEYS + 9*16] aesdec XDATA0, XKEY_B ; 8. DEC aesdec XDATA1, XKEY_B aesdec XDATA2, XKEY_B aesdec XDATA3, XKEY_B movdqa XKEY10, [KEYS + 10*16] aesdec XDATA0, XKEY_A ; 9. 
DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A aesdec XDATA3, XKEY_A movdqa XKEY_A, [KEYS + 11*16] aesdec XDATA0, XKEY10 ; 10. DEC aesdec XDATA1, XKEY10 aesdec XDATA2, XKEY10 aesdec XDATA3, XKEY10 movdqa XKEY_B, [KEYS + 12*16] aesdec XDATA0, XKEY_A ; 11. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A aesdec XDATA3, XKEY_A movdqa XKEY_A, [KEYS + 13*16] aesdec XDATA0, XKEY_B ; 12. DEC aesdec XDATA1, XKEY_B aesdec XDATA2, XKEY_B aesdec XDATA3, XKEY_B movdqa XKEY_B, [KEYS + 14*16] aesdec XDATA0, XKEY_A ; 13. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A aesdec XDATA3, XKEY_A aesdeclast XDATA0, XKEY_B ; 14. DEC aesdeclast XDATA1, XKEY_B aesdeclast XDATA2, XKEY_B aesdeclast XDATA3, XKEY_B pxor XDATA0, IV_TMP pxor XDATA1, XSAVED0 pxor XDATA2, XSAVED1 pxor XDATA3, XSAVED2 movdqu [OUT + 0*16], XDATA0 movdqu [OUT + 1*16], XDATA1 movdqu [OUT + 2*16], XDATA2 movdqu [OUT + 3*16], XDATA3 cmp LEN, 4*16 jz done jmp main_loop align 16 main_loop: ; load cipher text movdqu XDATA0, [IN + IDX + 0*16] movdqu XDATA1, [IN + IDX + 1*16] movdqu XDATA2, [IN + IDX + 2*16] movdqu XDATA3, [IN + IDX + 3*16] ; save cipher text movdqa XSAVED0, XDATA0 movdqa XSAVED1, XDATA1 movdqa XSAVED2, XDATA2 movdqa XSAVED3, XDATA3 movdqa XKEY_A, [KEYS + 1*16] pxor XDATA0, XKEY0 ; 0. ARK pxor XDATA1, XKEY0 pxor XDATA2, XKEY0 pxor XDATA3, XKEY0 add IDX, 4*16 aesdec XDATA0, XKEY_A ; 1. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A aesdec XDATA3, XKEY_A movdqa XKEY_A, [KEYS + 3*16] aesdec XDATA0, XKEY2 ; 2. DEC aesdec XDATA1, XKEY2 aesdec XDATA2, XKEY2 aesdec XDATA3, XKEY2 aesdec XDATA0, XKEY_A ; 3. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A aesdec XDATA3, XKEY_A movdqa XKEY_A, [KEYS + 5*16] aesdec XDATA0, XKEY4 ; 4. DEC aesdec XDATA1, XKEY4 aesdec XDATA2, XKEY4 aesdec XDATA3, XKEY4 aesdec XDATA0, XKEY_A ; 5. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A aesdec XDATA3, XKEY_A movdqa XKEY_A, [KEYS + 7*16] aesdec XDATA0, XKEY6 ; 6. DEC aesdec XDATA1, XKEY6 aesdec XDATA2, XKEY6 aesdec XDATA3, XKEY6 movdqa XKEY_B, [KEYS + 8*16] aesdec XDATA0, XKEY_A ; 7. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A aesdec XDATA3, XKEY_A movdqa XKEY_A, [KEYS + 9*16] aesdec XDATA0, XKEY_B ; 8. DEC aesdec XDATA1, XKEY_B aesdec XDATA2, XKEY_B aesdec XDATA3, XKEY_B aesdec XDATA0, XKEY_A ; 9. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A aesdec XDATA3, XKEY_A movdqa XKEY_A, [KEYS + 11*16] aesdec XDATA0, XKEY10 ; 10. DEC aesdec XDATA1, XKEY10 aesdec XDATA2, XKEY10 aesdec XDATA3, XKEY10 movdqa XKEY_B, [KEYS + 12*16] aesdec XDATA0, XKEY_A ; 11. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A aesdec XDATA3, XKEY_A movdqa XKEY_A, [KEYS + 13*16] aesdec XDATA0, XKEY_B ; 12. DEC aesdec XDATA1, XKEY_B aesdec XDATA2, XKEY_B aesdec XDATA3, XKEY_B movdqa XKEY_B, [KEYS + 14*16] aesdec XDATA0, XKEY_A ; 13. DEC aesdec XDATA1, XKEY_A aesdec XDATA2, XKEY_A aesdec XDATA3, XKEY_A aesdeclast XDATA0, XKEY_B ; 14. 
DEC aesdeclast XDATA1, XKEY_B aesdeclast XDATA2, XKEY_B aesdeclast XDATA3, XKEY_B pxor XDATA0, XIV pxor XDATA1, XSAVED0 pxor XDATA2, XSAVED1 pxor XDATA3, XSAVED2 movdqu [OUT + IDX + 0*16 - 4*16], XDATA0 movdqu [OUT + IDX + 1*16 - 4*16], XDATA1 movdqu [OUT + IDX + 2*16 - 4*16], XDATA2 movdqu [OUT + IDX + 3*16 - 4*16], XDATA3 movdqa XIV, XSAVED3 CMP IDX, LEN jne main_loop done: ; Don't write back IV ; movdqu [IV], XIV ret intel-ipsec-mb-0.48/sse/aes256_cntr_by4_sse.asm000066400000000000000000000176261321406316400212500ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "memcpy.asm" ; routine to do AES256 CNTR enc/decrypt "by4" ; XMM registers are clobbered. 
Saving/restoring must be done at a higher level extern byteswap_const, ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4 %define CONCAT(a,b) a %+ b %define MOVDQ movdqu %define xdata0 xmm0 %define xdata1 xmm1 %define xdata2 xmm2 %define xdata3 xmm3 %define xdata4 xmm4 %define xdata5 xmm5 %define xdata6 xmm6 %define xdata7 xmm7 %define xcounter xmm8 %define xbyteswap xmm9 %define xkey0 xmm10 %define xkey4 xmm11 %define xkey8 xmm12 %define xkey12 xmm13 %define xkeyA xmm14 %define xkeyB xmm15 %ifdef LINUX %define p_in rdi %define p_IV rsi %define p_keys rdx %define p_out rcx %define num_bytes r8 %define p_ivlen r9 %else %define p_in rcx %define p_IV rdx %define p_keys r8 %define p_out r9 %define num_bytes r10 %define p_ivlen qword [rsp + 8*6] %endif %define tmp r11 %define p_tmp rsp + _buffer %macro do_aes_load 1 do_aes %1, 1 %endmacro %macro do_aes_noload 1 do_aes %1, 0 %endmacro ; do_aes num_in_par load_keys ; This increments p_in, but not p_out %macro do_aes 2 %define %%by %1 %define %%load_keys %2 %if (%%load_keys) movdqa xkey0, [p_keys + 0*16] %endif movdqa xdata0, xcounter pshufb xdata0, xbyteswap %assign i 1 %rep (%%by - 1) movdqa CONCAT(xdata,i), xcounter paddd CONCAT(xdata,i), [rel CONCAT(ddq_add_,i)] pshufb CONCAT(xdata,i), xbyteswap %assign i (i + 1) %endrep movdqa xkeyA, [p_keys + 1*16] pxor xdata0, xkey0 paddd xcounter, [rel CONCAT(ddq_add_,%%by)] %assign i 1 %rep (%%by - 1) pxor CONCAT(xdata,i), xkey0 %assign i (i + 1) %endrep movdqa xkeyB, [p_keys + 2*16] %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkeyA ; key 1 %assign i (i+1) %endrep movdqa xkeyA, [p_keys + 3*16] %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkeyB ; key 2 %assign i (i+1) %endrep add p_in, 16*%%by %if (%%load_keys) movdqa xkey4, [p_keys + 4*16] %endif %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkeyA ; key 3 %assign i (i+1) %endrep movdqa xkeyA, [p_keys + 5*16] %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkey4 ; key 4 %assign i (i+1) %endrep movdqa xkeyB, [p_keys + 6*16] %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkeyA ; key 5 %assign i (i+1) %endrep movdqa xkeyA, [p_keys + 7*16] %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkeyB ; key 6 %assign i (i+1) %endrep %if (%%load_keys) movdqa xkey8, [p_keys + 8*16] %endif %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkeyA ; key 7 %assign i (i+1) %endrep movdqa xkeyA, [p_keys + 9*16] %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkey8 ; key 8 %assign i (i+1) %endrep movdqa xkeyB, [p_keys + 10*16] %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkeyA ; key 9 %assign i (i+1) %endrep movdqa xkeyA, [p_keys + 11*16] %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkeyB ; key 10 %assign i (i+1) %endrep %if (%%load_keys) movdqa xkey12, [p_keys + 12*16] %endif %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkeyA ; key 11 %assign i (i+1) %endrep movdqa xkeyA, [p_keys + 13*16] %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkey12 ; key 12 %assign i (i+1) %endrep movdqa xkeyB, [p_keys + 14*16] %assign i 0 %rep %%by aesenc CONCAT(xdata,i), xkeyA ; key 13 %assign i (i+1) %endrep %assign i 0 %rep %%by aesenclast CONCAT(xdata,i), xkeyB ; key 14 %assign i (i+1) %endrep %assign i 0 %rep (%%by / 2) %assign j (i+1) MOVDQ xkeyA, [p_in + i*16 - 16*%%by] MOVDQ xkeyB, [p_in + j*16 - 16*%%by] pxor CONCAT(xdata,i), xkeyA pxor CONCAT(xdata,j), xkeyB %assign i (i+2) %endrep %if (i < %%by) MOVDQ xkeyA, [p_in + i*16 - 16*%%by] pxor CONCAT(xdata,i), xkeyA %endif %assign i 0 %rep %%by MOVDQ [p_out + i*16], CONCAT(xdata,i) %assign i (i+1) %endrep %endmacro struc STACK _buffer: resq 2 _rsp_save: resq 1 
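;; (_buffer above is 16 bytes of scratch used for the final partial block;
;;  _rsp_save holds the caller's RSP while the stack is realigned to 16 bytes
;;  at the "last" label below)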
endstruc ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; section .text ;; aes_cntr_256_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len) align 32 MKGLOBAL(aes_cntr_256_sse,function,internal) aes_cntr_256_sse: %ifndef LINUX mov num_bytes, [rsp + 8*5] %endif movdqa xbyteswap, [rel byteswap_const] test p_ivlen, 16 jnz iv_is_16_bytes ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001 mov DWORD(tmp), 0x01000000 pinsrq xcounter, [p_IV], 0 pinsrd xcounter, [p_IV + 8], 2 pinsrd xcounter, DWORD(tmp), 3 bswap_iv: pshufb xcounter, xbyteswap mov tmp, num_bytes and tmp, 3*16 jz chk ; x4 > or < 15 (not 3 lines) ; 1 <= tmp <= 3 cmp tmp, 2*16 jg eq3 je eq2 eq1: do_aes_load 1 add p_out, 1*16 jmp chk eq2: do_aes_load 2 add p_out, 2*16 jmp chk eq3: do_aes_load 3 add p_out, 3*16 ; fall through to chk chk: and num_bytes, ~(3*16) jz do_return2 cmp num_bytes, 16 jb last ; process multiples of 4 blocks movdqa xkey0, [p_keys + 0*16] movdqa xkey4, [p_keys + 4*16] movdqa xkey8, [p_keys + 8*16] movdqa xkey12, [p_keys + 12*16] jmp main_loop2 align 32 main_loop2: ; num_bytes is a multiple of 4 and >0 do_aes_noload 4 add p_out, 4*16 sub num_bytes, 4*16 cmp num_bytes, 4*16 jae main_loop2 test num_bytes, 15 ; partial bytes to be processed? jnz last do_return2: ; don't return updated IV ; pshufb xcounter, xbyteswap ; movdqu [p_IV], xcounter ret last: ;; Code dealing with the partial block cases ; reserve 16 byte aligned buffer on stack mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _rsp_save], rax ; save SP ; copy input bytes into scratch buffer memcpy_sse_16_1 p_tmp, p_in, num_bytes, tmp, rax ; Encryption of a single partial block (p_tmp) pshufb xcounter, xbyteswap movdqa xdata0, xcounter pxor xdata0, [p_keys + 16*0] %assign i 1 %rep 13 aesenc xdata0, [p_keys + 16*i] %assign i (i+1) %endrep ; created keystream aesenclast xdata0, [p_keys + 16*i] ; xor keystream with the message (scratch) pxor xdata0, [p_tmp] movdqa [p_tmp], xdata0 ; copy result into the output buffer memcpy_sse_16_1 p_out, p_tmp, num_bytes, tmp, rax ; remove the stack frame mov rsp, [rsp + _rsp_save] ; original SP jmp do_return2 iv_is_16_bytes: ; Read 16 byte IV: Nonce + ESP IV + block counter (BE) movdqu xcounter, [p_IV] jmp bswap_iv intel-ipsec-mb-0.48/sse/aes_cbc_enc_128_x4.asm000066400000000000000000000250171321406316400207670ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. 
;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ;;; Routine to do a 128 bit CBC AES encryption / CBC-MAC digest computation ;;; processes 4 buffers at a time, single data structure as input ;;; Updates In and Out pointers at end %include "os.asm" %include "mb_mgr_datastruct.asm" %define MOVDQ movdqu ;; assume buffers not aligned %macro pxor2 2 MOVDQ XTMP, %2 pxor %1, XTMP %endm ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; struct AES_ARGS_x8 { ;; void* in[8]; ;; void* out[8]; ;; UINT128* keys[8]; ;; UINT128 IV[8]; ;; } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; void aes_cbc_enc_128_x4(AES_ARGS_x8 *args, UINT64 len); ;; arg 1: ARG : addr of AES_ARGS_x8 structure ;; arg 2: LEN : len (in units of bytes) struc STACK _gpr_save: resq 8 endstruc %ifdef LINUX %define arg1 rdi %define arg2 rsi %define arg3 rdx %define arg4 rcx %else %define arg1 rcx %define arg2 rdx %define arg3 rdi ;r8 %define arg4 rsi ;r9 %endif %define ARG arg1 %define LEN arg2 %define IDX rax %define IN0 r8 %define KEYS0 rbx %define IN1 r10 %define KEYS1 arg3 %define IN2 r12 %define KEYS2 arg4 %define IN3 r14 %define KEYS3 rbp %ifndef CBC_MAC ;; No cipher text write back for CBC-MAC %define OUT0 r9 %define OUT1 r11 %define OUT2 r13 %define OUT3 r15 %endif %define XDATA0 xmm0 %define XDATA1 xmm1 %define XDATA2 xmm2 %define XDATA3 xmm3 %define XKEY0_3 xmm4 %define XKEY0_6 [KEYS0 + 16*6] %define XTMP xmm5 %define XKEY0_9 xmm6 %define XKEY1_3 xmm7 %define XKEY1_6 xmm8 %define XKEY1_9 xmm9 %define XKEY2_3 xmm10 %define XKEY2_6 xmm11 %define XKEY2_9 xmm12 %define XKEY3_3 xmm13 %define XKEY3_6 xmm14 %define XKEY3_9 xmm15 section .text %ifdef CBC_MAC MKGLOBAL(aes128_cbc_mac_x4,function,internal) aes128_cbc_mac_x4: %else MKGLOBAL(aes_cbc_enc_128_x4,function,internal) aes_cbc_enc_128_x4: %endif sub rsp, STACK_size mov [rsp + _gpr_save + 8*0], rbp %ifdef CBC_MAC mov [rsp + _gpr_save + 8*1], rbx mov [rsp + _gpr_save + 8*2], r12 mov [rsp + _gpr_save + 8*3], r13 mov [rsp + _gpr_save + 8*4], r14 mov [rsp + _gpr_save + 8*5], r15 %ifndef LINUX mov [rsp + _gpr_save + 8*6], rsi mov [rsp + _gpr_save + 8*7], rdi %endif %endif mov IDX, 16 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; mov IN0, [ARG + _aesarg_in + 8*0] mov IN1, [ARG + _aesarg_in + 8*1] mov IN2, [ARG + _aesarg_in + 8*2] mov IN3, [ARG + _aesarg_in + 8*3] MOVDQ XDATA0, [IN0] ; load first block of plain text MOVDQ XDATA1, [IN1] ; load first block of plain text MOVDQ XDATA2, [IN2] ; load first block of plain text MOVDQ XDATA3, [IN3] ; load first block of plain text mov KEYS0, [ARG + _aesarg_keys + 8*0] mov KEYS1, [ARG + _aesarg_keys + 8*1] mov KEYS2, [ARG + _aesarg_keys + 8*2] mov KEYS3, [ARG + _aesarg_keys + 8*3] pxor XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV pxor 
XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV pxor XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV pxor XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV %ifndef CBC_MAC mov OUT0, [ARG + _aesarg_out + 8*0] mov OUT1, [ARG + _aesarg_out + 8*1] mov OUT2, [ARG + _aesarg_out + 8*2] mov OUT3, [ARG + _aesarg_out + 8*3] %endif pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC movdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key movdqa XKEY1_3, [KEYS1 + 16*3] ; load round 3 key movdqa XKEY2_3, [KEYS2 + 16*3] ; load round 3 key movdqa XKEY3_3, [KEYS3 + 16*3] ; load round 3 key aesenc XDATA0, XKEY0_3 ; 3. ENC aesenc XDATA1, XKEY1_3 ; 3. ENC aesenc XDATA2, XKEY2_3 ; 3. ENC aesenc XDATA3, XKEY3_3 ; 3. ENC aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC movdqa XKEY1_6, [KEYS1 + 16*6] ; load round 6 key movdqa XKEY2_6, [KEYS2 + 16*6] ; load round 6 key movdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key aesenc XDATA0, XKEY0_6 ; 6. ENC aesenc XDATA1, XKEY1_6 ; 6. ENC aesenc XDATA2, XKEY2_6 ; 6. ENC aesenc XDATA3, XKEY3_6 ; 6. ENC aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC movdqa XKEY0_9, [KEYS0 + 16*9] ; load round 9 key movdqa XKEY1_9, [KEYS1 + 16*9] ; load round 9 key movdqa XKEY2_9, [KEYS2 + 16*9] ; load round 9 key movdqa XKEY3_9, [KEYS3 + 16*9] ; load round 9 key aesenc XDATA0, XKEY0_9 ; 9. ENC aesenc XDATA1, XKEY1_9 ; 9. ENC aesenc XDATA2, XKEY2_9 ; 9. ENC aesenc XDATA3, XKEY3_9 ; 9. ENC aesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC aesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC aesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC aesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC %ifndef CBC_MAC MOVDQ [OUT0], XDATA0 ; write back ciphertext MOVDQ [OUT1], XDATA1 ; write back ciphertext MOVDQ [OUT2], XDATA2 ; write back ciphertext MOVDQ [OUT3], XDATA3 ; write back ciphertext %endif cmp LEN, IDX je done main_loop: pxor2 XDATA0, [IN0 + IDX] ; plaintext XOR IV pxor2 XDATA1, [IN1 + IDX] ; plaintext XOR IV pxor2 XDATA2, [IN2 + IDX] ; plaintext XOR IV pxor2 XDATA3, [IN3 + IDX] ; plaintext XOR IV pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC aesenc XDATA0, XKEY0_3 ; 3. ENC aesenc XDATA1, XKEY1_3 ; 3. ENC aesenc XDATA2, XKEY2_3 ; 3. ENC aesenc XDATA3, XKEY3_3 ; 3. 
ENC aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC aesenc XDATA0, XKEY0_6 ; 6. ENC aesenc XDATA1, XKEY1_6 ; 6. ENC aesenc XDATA2, XKEY2_6 ; 6. ENC aesenc XDATA3, XKEY3_6 ; 6. ENC aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC aesenc XDATA0, XKEY0_9 ; 9. ENC aesenc XDATA1, XKEY1_9 ; 9. ENC aesenc XDATA2, XKEY2_9 ; 9. ENC aesenc XDATA3, XKEY3_9 ; 9. ENC aesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC aesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC aesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC aesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC %ifndef CBC_MAC ;; No cipher text write back for CBC-MAC MOVDQ [OUT0 + IDX], XDATA0 ; write back ciphertext MOVDQ [OUT1 + IDX], XDATA1 ; write back ciphertext MOVDQ [OUT2 + IDX], XDATA2 ; write back ciphertext MOVDQ [OUT3 + IDX], XDATA3 ; write back ciphertext %endif add IDX, 16 cmp LEN, IDX jne main_loop done: ;; update IV / store digest for CBC-MAC movdqa [ARG + _aesarg_IV + 16*0], XDATA0 movdqa [ARG + _aesarg_IV + 16*1], XDATA1 movdqa [ARG + _aesarg_IV + 16*2], XDATA2 movdqa [ARG + _aesarg_IV + 16*3], XDATA3 ;; update IN and OUT add IN0, LEN mov [ARG + _aesarg_in + 8*0], IN0 add IN1, LEN mov [ARG + _aesarg_in + 8*1], IN1 add IN2, LEN mov [ARG + _aesarg_in + 8*2], IN2 add IN3, LEN mov [ARG + _aesarg_in + 8*3], IN3 %ifndef CBC_MAC ;; No OUT pointer updates for CBC-MAC add OUT0, LEN mov [ARG + _aesarg_out + 8*0], OUT0 add OUT1, LEN mov [ARG + _aesarg_out + 8*1], OUT1 add OUT2, LEN mov [ARG + _aesarg_out + 8*2], OUT2 add OUT3, LEN mov [ARG + _aesarg_out + 8*3], OUT3 %endif %ifdef CBC_MAC mov rbx, [rsp + _gpr_save + 8*1] mov r12, [rsp + _gpr_save + 8*2] mov r13, [rsp + _gpr_save + 8*3] mov r14, [rsp + _gpr_save + 8*4] mov r15, [rsp + _gpr_save + 8*5] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*6] mov rdi, [rsp + _gpr_save + 8*7] %endif %endif mov rbp, [rsp + _gpr_save + 8*0] add rsp, STACK_size ret intel-ipsec-mb-0.48/sse/aes_cbc_enc_192_x4.asm000066400000000000000000000240671321406316400207740ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ;;; routine to do a 192 bit CBC AES encrypt ;;; process 4 buffers at a time, single data structure as input ;;; Updates In and Out pointers at end %include "os.asm" %include "mb_mgr_datastruct.asm" %define MOVDQ movdqu ;; assume buffers not aligned %macro pxor2 2 MOVDQ XTMP, %2 pxor %1, XTMP %endm ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; struct AES_ARGS_x8 { ;; void* in[8]; ;; void* out[8]; ;; UINT128* keys[8]; ;; UINT128 IV[8]; ;; } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; void aes_cbc_enc_192_x4(AES_ARGS_x8 *args, UINT64 len); ;; arg 1: ARG : addr of AES_ARGS_x8 structure ;; arg 2: LEN : len (in units of bytes) %ifdef LINUX %define ARG rdi %define LEN rsi %define REG3 rcx %define REG4 rdx %else %define ARG rcx %define LEN rdx %define REG3 rsi %define REG4 rdi %endif %define IDX rax %define IN0 r8 %define KEYS0 rbx %define OUT0 r9 %define IN1 r10 %define KEYS1 REG3 %define OUT1 r11 %define IN2 r12 %define KEYS2 REG4 %define OUT2 r13 %define IN3 r14 %define KEYS3 rbp %define OUT3 r15 %define XDATA0 xmm0 %define XDATA1 xmm1 %define XDATA2 xmm2 %define XDATA3 xmm3 %define XKEY0_3 xmm4 %define XKEY0_6 [KEYS0 + 16*6] %define XTMP xmm5 %define XKEY0_9 xmm6 %define XKEY1_3 xmm7 %define XKEY1_6 xmm8 %define XKEY1_9 xmm9 %define XKEY2_3 xmm10 %define XKEY2_6 xmm11 %define XKEY2_9 xmm12 %define XKEY3_3 xmm13 %define XKEY3_6 xmm14 %define XKEY3_9 xmm15 section .text MKGLOBAL(aes_cbc_enc_192_x4,function,internal) aes_cbc_enc_192_x4: push rbp mov IDX, 16 mov IN0, [ARG + _aesarg_in + 8*0] mov IN1, [ARG + _aesarg_in + 8*1] mov IN2, [ARG + _aesarg_in + 8*2] mov IN3, [ARG + _aesarg_in + 8*3] ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MOVDQ XDATA0, [IN0] ; load first block of plain text MOVDQ XDATA1, [IN1] ; load first block of plain text MOVDQ XDATA2, [IN2] ; load first block of plain text MOVDQ XDATA3, [IN3] ; load first block of plain text mov KEYS0, [ARG + _aesarg_keys + 8*0] mov KEYS1, [ARG + _aesarg_keys + 8*1] mov KEYS2, [ARG + _aesarg_keys + 8*2] mov KEYS3, [ARG + _aesarg_keys + 8*3] pxor XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV pxor XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV pxor XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV pxor XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV mov OUT0, [ARG + _aesarg_out + 8*0] mov OUT1, [ARG + _aesarg_out + 8*1] mov OUT2, [ARG + _aesarg_out + 8*2] mov OUT3, [ARG + _aesarg_out + 8*3] pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC aesenc XDATA3, [KEYS3 + 16*2] ; 2. 
ENC movdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key movdqa XKEY1_3, [KEYS1 + 16*3] ; load round 3 key movdqa XKEY2_3, [KEYS2 + 16*3] ; load round 3 key movdqa XKEY3_3, [KEYS3 + 16*3] ; load round 3 key aesenc XDATA0, XKEY0_3 ; 3. ENC aesenc XDATA1, XKEY1_3 ; 3. ENC aesenc XDATA2, XKEY2_3 ; 3. ENC aesenc XDATA3, XKEY3_3 ; 3. ENC aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC movdqa XKEY1_6, [KEYS1 + 16*6] ; load round 6 key movdqa XKEY2_6, [KEYS2 + 16*6] ; load round 6 key movdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key aesenc XDATA0, XKEY0_6 ; 6. ENC aesenc XDATA1, XKEY1_6 ; 6. ENC aesenc XDATA2, XKEY2_6 ; 6. ENC aesenc XDATA3, XKEY3_6 ; 6. ENC aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC movdqa XKEY0_9, [KEYS0 + 16*9] ; load round 9 key movdqa XKEY1_9, [KEYS1 + 16*9] ; load round 9 key movdqa XKEY2_9, [KEYS2 + 16*9] ; load round 9 key movdqa XKEY3_9, [KEYS3 + 16*9] ; load round 9 key aesenc XDATA0, XKEY0_9 ; 9. ENC aesenc XDATA1, XKEY1_9 ; 9. ENC aesenc XDATA2, XKEY2_9 ; 9. ENC aesenc XDATA3, XKEY3_9 ; 9. ENC aesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC aesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC aesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC aesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC aesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC aesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC aesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC aesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC aesenclast XDATA0, [KEYS0 + 16*12] ; 12. ENC aesenclast XDATA1, [KEYS1 + 16*12] ; 12. ENC aesenclast XDATA2, [KEYS2 + 16*12] ; 12. ENC aesenclast XDATA3, [KEYS3 + 16*12] ; 12. ENC MOVDQ [OUT0], XDATA0 ; write back ciphertext MOVDQ [OUT1], XDATA1 ; write back ciphertext MOVDQ [OUT2], XDATA2 ; write back ciphertext MOVDQ [OUT3], XDATA3 ; write back ciphertext cmp LEN, IDX je done main_loop: pxor2 XDATA0, [IN0 + IDX] ; plaintext XOR IV pxor2 XDATA1, [IN1 + IDX] ; plaintext XOR IV pxor2 XDATA2, [IN2 + IDX] ; plaintext XOR IV pxor2 XDATA3, [IN3 + IDX] ; plaintext XOR IV pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC aesenc XDATA0, XKEY0_3 ; 3. ENC aesenc XDATA1, XKEY1_3 ; 3. ENC aesenc XDATA2, XKEY2_3 ; 3. ENC aesenc XDATA3, XKEY3_3 ; 3. ENC aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC aesenc XDATA0, XKEY0_6 ; 6. ENC aesenc XDATA1, XKEY1_6 ; 6. ENC aesenc XDATA2, XKEY2_6 ; 6. ENC aesenc XDATA3, XKEY3_6 ; 6. ENC aesenc XDATA0, [KEYS0 + 16*7] ; 7. 
ENC aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC aesenc XDATA0, XKEY0_9 ; 9. ENC aesenc XDATA1, XKEY1_9 ; 9. ENC aesenc XDATA2, XKEY2_9 ; 9. ENC aesenc XDATA3, XKEY3_9 ; 9. ENC aesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC aesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC aesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC aesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC aesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC aesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC aesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC aesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC aesenclast XDATA0, [KEYS0 + 16*12] ; 12. ENC aesenclast XDATA1, [KEYS1 + 16*12] ; 12. ENC aesenclast XDATA2, [KEYS2 + 16*12] ; 12. ENC aesenclast XDATA3, [KEYS3 + 16*12] ; 12. ENC MOVDQ [OUT0 + IDX], XDATA0 ; write back ciphertext MOVDQ [OUT1 + IDX], XDATA1 ; write back ciphertex MOVDQ [OUT2 + IDX], XDATA2 ; write back ciphertex MOVDQ [OUT3 + IDX], XDATA3 ; write back ciphertex add IDX, 16 cmp LEN, IDX jne main_loop done: ;; update IV movdqa [ARG + _aesarg_IV + 16*0], XDATA0 movdqa [ARG + _aesarg_IV + 16*1], XDATA1 movdqa [ARG + _aesarg_IV + 16*2], XDATA2 movdqa [ARG + _aesarg_IV + 16*3], XDATA3 ;; update IN and OUT add IN0, LEN mov [ARG + _aesarg_in + 8*0], IN0 add IN1, LEN mov [ARG + _aesarg_in + 8*1], IN1 add IN2, LEN mov [ARG + _aesarg_in + 8*2], IN2 add IN3, LEN mov [ARG + _aesarg_in + 8*3], IN3 add OUT0, LEN mov [ARG + _aesarg_out + 8*0], OUT0 add OUT1, LEN mov [ARG + _aesarg_out + 8*1], OUT1 add OUT2, LEN mov [ARG + _aesarg_out + 8*2], OUT2 add OUT3, LEN mov [ARG + _aesarg_out + 8*3], OUT3 pop rbp ret intel-ipsec-mb-0.48/sse/aes_cbc_enc_256_x4.asm000066400000000000000000000253421321406316400207720ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
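;;
;; Per lane, the x4 CBC encrypt below is ordinary CBC chaining,
;; C[i] = AES-256-Enc(keys, P[i] XOR C[i-1]) with C[-1] = IV, and the last
;; ciphertext block is written back to args->IV so the chain can be resumed.
;; A rough per-lane C sketch, assuming hypothetical helpers
;; aes256_encrypt_block() and xor_block() (neither is part of this library):
;;
;;     uint8_t prev[16], blk[16];
;;     memcpy(prev, iv, 16);
;;     for (UINT64 off = 0; off < len; off += 16) {
;;             xor_block(blk, in + off, prev);          /* P XOR previous C */
;;             aes256_encrypt_block(keys, blk, prev);   /* new C            */
;;             memcpy(out + off, prev, 16);
;;     }
;;     memcpy(iv, prev, 16);                            /* updated IV       */
;;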
;; ;;; routine to do a 256 bit CBC AES encrypt ;;; process 4 buffers at a time, single data structure as input ;;; Updates In and Out pointers at end %include "os.asm" %include "mb_mgr_datastruct.asm" %define MOVDQ movdqu ;; assume buffers not aligned %macro pxor2 2 MOVDQ XTMP, %2 pxor %1, XTMP %endm ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; struct AES_ARGS_x8 { ;; void* in[8]; ;; void* out[8]; ;; UINT128* keys[8]; ;; UINT128 IV[8]; ;; } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; void aes_cbc_enc_256_x4(AES_ARGS_x8 *args, UINT64 len); ;; arg 1: ARG : addr of AES_ARGS_x8 structure ;; arg 2: LEN : len (in units of bytes) %ifdef LINUX %define ARG rdi %define LEN rsi %define REG3 rcx %define REG4 rdx %else %define ARG rcx %define LEN rdx %define REG3 rsi %define REG4 rdi %endif %define IDX rax %define IN0 r8 %define KEYS0 rbx %define OUT0 r9 %define IN1 r10 %define KEYS1 REG3 %define OUT1 r11 %define IN2 r12 %define KEYS2 REG4 %define OUT2 r13 %define IN3 r14 %define KEYS3 rbp %define OUT3 r15 %define XDATA0 xmm0 %define XDATA1 xmm1 %define XDATA2 xmm2 %define XDATA3 xmm3 %define XKEY0_3 xmm4 %define XKEY0_6 [KEYS0 + 16*6] %define XTMP xmm5 %define XKEY0_9 xmm6 %define XKEY1_3 xmm7 %define XKEY1_6 xmm8 %define XKEY1_9 xmm9 %define XKEY2_3 xmm10 %define XKEY2_6 xmm11 %define XKEY2_9 xmm12 %define XKEY3_3 xmm13 %define XKEY3_6 xmm14 %define XKEY3_9 xmm15 section .text MKGLOBAL(aes_cbc_enc_256_x4,function,internal) aes_cbc_enc_256_x4: push rbp mov IDX, 16 mov IN0, [ARG + _aesarg_in + 8*0] mov IN1, [ARG + _aesarg_in + 8*1] mov IN2, [ARG + _aesarg_in + 8*2] mov IN3, [ARG + _aesarg_in + 8*3] ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MOVDQ XDATA0, [IN0] ; load first block of plain text MOVDQ XDATA1, [IN1] ; load first block of plain text MOVDQ XDATA2, [IN2] ; load first block of plain text MOVDQ XDATA3, [IN3] ; load first block of plain text mov KEYS0, [ARG + _aesarg_keys + 8*0] mov KEYS1, [ARG + _aesarg_keys + 8*1] mov KEYS2, [ARG + _aesarg_keys + 8*2] mov KEYS3, [ARG + _aesarg_keys + 8*3] pxor XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV pxor XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV pxor XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV pxor XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV mov OUT0, [ARG + _aesarg_out + 8*0] mov OUT1, [ARG + _aesarg_out + 8*1] mov OUT2, [ARG + _aesarg_out + 8*2] mov OUT3, [ARG + _aesarg_out + 8*3] pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC movdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key movdqa XKEY1_3, [KEYS1 + 16*3] ; load round 3 key movdqa XKEY2_3, [KEYS2 + 16*3] ; load round 3 key movdqa XKEY3_3, [KEYS3 + 16*3] ; load round 3 key aesenc XDATA0, XKEY0_3 ; 3. ENC aesenc XDATA1, XKEY1_3 ; 3. ENC aesenc XDATA2, XKEY2_3 ; 3. ENC aesenc XDATA3, XKEY3_3 ; 3. ENC aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC aesenc XDATA2, [KEYS2 + 16*5] ; 5. 
ENC aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC movdqa XKEY1_6, [KEYS1 + 16*6] ; load round 6 key movdqa XKEY2_6, [KEYS2 + 16*6] ; load round 6 key movdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key aesenc XDATA0, XKEY0_6 ; 6. ENC aesenc XDATA1, XKEY1_6 ; 6. ENC aesenc XDATA2, XKEY2_6 ; 6. ENC aesenc XDATA3, XKEY3_6 ; 6. ENC aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC movdqa XKEY0_9, [KEYS0 + 16*9] ; load round 9 key movdqa XKEY1_9, [KEYS1 + 16*9] ; load round 9 key movdqa XKEY2_9, [KEYS2 + 16*9] ; load round 9 key movdqa XKEY3_9, [KEYS3 + 16*9] ; load round 9 key aesenc XDATA0, XKEY0_9 ; 9. ENC aesenc XDATA1, XKEY1_9 ; 9. ENC aesenc XDATA2, XKEY2_9 ; 9. ENC aesenc XDATA3, XKEY3_9 ; 9. ENC aesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC aesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC aesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC aesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC aesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC aesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC aesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC aesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC aesenc XDATA0, [KEYS0 + 16*12] ; 12. ENC aesenc XDATA1, [KEYS1 + 16*12] ; 12. ENC aesenc XDATA2, [KEYS2 + 16*12] ; 12. ENC aesenc XDATA3, [KEYS3 + 16*12] ; 12. ENC aesenc XDATA0, [KEYS0 + 16*13] ; 13. ENC aesenc XDATA1, [KEYS1 + 16*13] ; 13. ENC aesenc XDATA2, [KEYS2 + 16*13] ; 13. ENC aesenc XDATA3, [KEYS3 + 16*13] ; 13. ENC aesenclast XDATA0, [KEYS0 + 16*14] ; 14. ENC aesenclast XDATA1, [KEYS1 + 16*14] ; 14. ENC aesenclast XDATA2, [KEYS2 + 16*14] ; 14. ENC aesenclast XDATA3, [KEYS3 + 16*14] ; 14. ENC MOVDQ [OUT0], XDATA0 ; write back ciphertext MOVDQ [OUT1], XDATA1 ; write back ciphertext MOVDQ [OUT2], XDATA2 ; write back ciphertext MOVDQ [OUT3], XDATA3 ; write back ciphertext cmp LEN, IDX je done main_loop: pxor2 XDATA0, [IN0 + IDX] ; plaintext XOR IV pxor2 XDATA1, [IN1 + IDX] ; plaintext XOR IV pxor2 XDATA2, [IN2 + IDX] ; plaintext XOR IV pxor2 XDATA3, [IN3 + IDX] ; plaintext XOR IV pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC aesenc XDATA0, XKEY0_3 ; 3. ENC aesenc XDATA1, XKEY1_3 ; 3. ENC aesenc XDATA2, XKEY2_3 ; 3. ENC aesenc XDATA3, XKEY3_3 ; 3. ENC aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC aesenc XDATA0, XKEY0_6 ; 6. ENC aesenc XDATA1, XKEY1_6 ; 6. ENC aesenc XDATA2, XKEY2_6 ; 6. ENC aesenc XDATA3, XKEY3_6 ; 6. ENC aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC aesenc XDATA3, [KEYS3 + 16*8] ; 8. 
ENC aesenc XDATA0, XKEY0_9 ; 9. ENC aesenc XDATA1, XKEY1_9 ; 9. ENC aesenc XDATA2, XKEY2_9 ; 9. ENC aesenc XDATA3, XKEY3_9 ; 9. ENC aesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC aesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC aesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC aesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC aesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC aesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC aesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC aesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC aesenc XDATA0, [KEYS0 + 16*12] ; 12. ENC aesenc XDATA1, [KEYS1 + 16*12] ; 12. ENC aesenc XDATA2, [KEYS2 + 16*12] ; 12. ENC aesenc XDATA3, [KEYS3 + 16*12] ; 12. ENC aesenc XDATA0, [KEYS0 + 16*13] ; 13. ENC aesenc XDATA1, [KEYS1 + 16*13] ; 13. ENC aesenc XDATA2, [KEYS2 + 16*13] ; 13. ENC aesenc XDATA3, [KEYS3 + 16*13] ; 13. ENC aesenclast XDATA0, [KEYS0 + 16*14] ; 14. ENC aesenclast XDATA1, [KEYS1 + 16*14] ; 14. ENC aesenclast XDATA2, [KEYS2 + 16*14] ; 14. ENC aesenclast XDATA3, [KEYS3 + 16*14] ; 14. ENC MOVDQ [OUT0 + IDX], XDATA0 ; write back ciphertext MOVDQ [OUT1 + IDX], XDATA1 ; write back ciphertex MOVDQ [OUT2 + IDX], XDATA2 ; write back ciphertex MOVDQ [OUT3 + IDX], XDATA3 ; write back ciphertex add IDX, 16 cmp LEN, IDX jne main_loop done: ;; update IV movdqa [ARG + _aesarg_IV + 16*0], XDATA0 movdqa [ARG + _aesarg_IV + 16*1], XDATA1 movdqa [ARG + _aesarg_IV + 16*2], XDATA2 movdqa [ARG + _aesarg_IV + 16*3], XDATA3 ;; update IN and OUT add IN0, LEN mov [ARG + _aesarg_in + 8*0], IN0 add IN1, LEN mov [ARG + _aesarg_in + 8*1], IN1 add IN2, LEN mov [ARG + _aesarg_in + 8*2], IN2 add IN3, LEN mov [ARG + _aesarg_in + 8*3], IN3 add OUT0, LEN mov [ARG + _aesarg_out + 8*0], OUT0 add OUT1, LEN mov [ARG + _aesarg_out + 8*1], OUT1 add OUT2, LEN mov [ARG + _aesarg_out + 8*2], OUT2 add OUT3, LEN mov [ARG + _aesarg_out + 8*3], OUT3 pop rbp ret intel-ipsec-mb-0.48/sse/aes_cfb_128_sse.asm000066400000000000000000000120541321406316400204010ustar00rootroot00000000000000;; ;; Copyright (c) 2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
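;;
;; The routine defined in this file computes a single CFB-128 step,
;; out[0..n-1] = in[0..n-1] XOR AES-128-Enc(keys, IV) for n <= 16, which is
;; the same operation for encryption and decryption of a partial block.
;; Illustrative C sketch (aes128_encrypt_block() is a hypothetical
;; single-block helper, not a function of this library):
;;
;;     uint8_t ks[16];
;;     aes128_encrypt_block(keys, iv, ks);      /* E(K, IV)      */
;;     for (UINT64 i = 0; i < len; i++)         /* len is 0..16  */
;;             out[i] = in[i] ^ ks[i];
;;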
;; %include "os.asm" %include "memcpy.asm" ;;; Routine to do 128 bit CFB AES encrypt/decrypt operations on one block only. ;;; It processes only one buffer at a time. ;;; It is designed to manage partial blocks of DOCSIS 3.1 SEC BPI ;; In System V AMD64 ABI ;; calle saves: RBX, RBP, R12-R15 ;; Windows x64 ABI ;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15 ;; ;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Windows clobbers: RAX R9 R10 R11 ;; Windows preserves: RBX RCX RDX RBP RSI RDI R8 R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Linux clobbers: RAX R9 R10 ;; Linux preserves: RBX RCX RDX RBP RSI RDI R8 R11 R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; ;; Linux/Windows clobbers: xmm0 ;; %ifdef LINUX %define arg1 rdi %define arg2 rsi %define arg3 rdx %define arg4 rcx %define arg5 r8 %else %define arg1 rcx %define arg2 rdx %define arg3 r8 %define arg4 r9 %define arg5 [rsp + 5*8] %endif %define OUT arg1 %define IN arg2 %define IV arg3 %define KEYS arg4 %ifdef LINUX %define LEN arg5 %else %define LEN2 arg5 %define LEN r11 %endif %define TMP0 rax %define TMP1 r10 %define PTR0 rsp + _buffer %define XDATA xmm0 section .text struc STACK _buffer: resq 2 _rsp_save: resq 1 endstruc ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; void aes_cfb_128_one(void *out, void *in, void *iv, void *keys) ;; arg 1: OUT : addr to put clear/cipher text out ;; arg 2: IN : addr to take cipher/clear text from ;; arg 3: IV : initialization vector ;; arg 4: KEYS: pointer to expanded keys structure (16 byte aligned) ;; arg 5: LEN: length of the text to encrypt/decrypt (valid range is 0 to 16) ;; ;; AES CFB128 one block encrypt/decrypt implementation. ;; The function doesn't update IV. The result of operation can be found in OUT. ;; ;; It is primarly designed to process partial block of ;; DOCSIS 3.1 AES Packet PDU Encryption (I.10) ;; ;; It process up to one block only (up to 16 bytes). ;; ;; It makes sure not to read more than LEN bytes from IN and ;; not to store more than LEN bytes to OUT. MKGLOBAL(aes_cfb_128_one_sse,function,internal) align 32 aes_cfb_128_one_sse: %ifndef LINUX mov LEN, LEN2 %endif mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _rsp_save], rax test LEN, 16 jz copy_in_lt16 movdqu XDATA, [IN] movdqa [PTR0], XDATA jmp copy_in_end copy_in_lt16: memcpy_sse_16 PTR0, IN, LEN, TMP0, TMP1 copy_in_end: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; movdqu XDATA, [IV] ; IV (or next to last block) pxor XDATA, [KEYS + 16*0] ; 0. ARK aesenc XDATA, [KEYS + 16*1] ; 1. ENC aesenc XDATA, [KEYS + 16*2] ; 2. ENC aesenc XDATA, [KEYS + 16*3] ; 3. ENC aesenc XDATA, [KEYS + 16*4] ; 4. ENC aesenc XDATA, [KEYS + 16*5] ; 5. ENC aesenc XDATA, [KEYS + 16*6] ; 6. ENC aesenc XDATA, [KEYS + 16*7] ; 7. ENC aesenc XDATA, [KEYS + 16*8] ; 8. ENC aesenc XDATA, [KEYS + 16*9] ; 9. ENC aesenclast XDATA, [KEYS + 16*10] ; 10. 
ENC pxor XDATA, [PTR0] ; plaintext/ciphertext XOR block cipher encryption test LEN, 16 jz copy_out_lt16 movdqu [OUT], XDATA jmp copy_out_end copy_out_lt16: movdqa [PTR0], XDATA memcpy_sse_16 OUT, PTR0, LEN, TMP0, TMP1 copy_out_end: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/sse/aes_xcbc_mac_128_x4.asm000066400000000000000000000213271321406316400211520ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
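;;
;; Each lane below folds complete 16-byte blocks into a running ICV,
;; ICV = AES-128-Enc(K1, ICV XOR M[i]); the XCBC-specific last-block
;; padding with K2/K3 (RFC 3566) is expected to be handled by the caller
;; and is not part of this routine.  Illustrative per-lane C sketch
;; (aes128_encrypt_block() and xor_block() are hypothetical helpers, not
;; functions of this library):
;;
;;     uint8_t tmp[16];
;;     for (UINT64 off = 0; off < len; off += 16) {
;;             xor_block(tmp, icv, in + off);            /* ICV XOR M[i] */
;;             aes128_encrypt_block(k1, tmp, icv);       /* new ICV      */
;;     }
;;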
;; ;;; routine to do 128 bit AES XCBC ;;; process 4 buffers at a time, single data structure as input ;;; Updates In pointer at end ;; clobbers all registers except for ARG1 and rbp %include "os.asm" %include "mb_mgr_datastruct.asm" %define MOVDQ movdqu ;; assume buffers not aligned %macro pxor2 2 MOVDQ XTMP, %2 pxor %1, XTMP %endm ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; struct AES_XCBC_ARGS_x8 { ;; void* in[8]; ;; UINT128* keys[8]; ;; UINT128 ICV[8]; ;; } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; void aes_xcbc_mac_128_x4(AES_XCBC_ARGS_x8 *args, UINT64 len); ;; arg 1: ARG : addr of AES_XCBC_ARGS_x8 structure ;; arg 2: LEN : len (in units of bytes) %ifdef LINUX %define ARG rdi %define LEN rsi %define REG3 rcx %define REG4 rdx %else %define ARG rcx %define LEN rdx %define REG3 rsi %define REG4 rdi %endif %define IDX rax %define IN0 r8 %define KEYS0 rbx %define OUT0 r9 %define IN1 r10 %define KEYS1 REG3 %define OUT1 r11 %define IN2 r12 %define KEYS2 REG4 %define OUT2 r13 %define IN3 r14 %define KEYS3 rbp %define OUT3 r15 %define XDATA0 xmm0 %define XDATA1 xmm1 %define XDATA2 xmm2 %define XDATA3 xmm3 %define XKEY0_3 xmm4 %define XKEY0_6 [KEYS0 + 16*6] %define XTMP xmm5 %define XKEY0_9 xmm6 %define XKEY1_3 xmm7 %define XKEY1_6 xmm8 %define XKEY1_9 xmm9 %define XKEY2_3 xmm10 %define XKEY2_6 xmm11 %define XKEY2_9 xmm12 %define XKEY3_3 xmm13 %define XKEY3_6 xmm14 %define XKEY3_9 xmm15 section .text MKGLOBAL(aes_xcbc_mac_128_x4,function,internal) aes_xcbc_mac_128_x4: push rbp mov IDX, 16 mov IN0, [ARG + _aesxcbcarg_in + 8*0] mov IN1, [ARG + _aesxcbcarg_in + 8*1] mov IN2, [ARG + _aesxcbcarg_in + 8*2] mov IN3, [ARG + _aesxcbcarg_in + 8*3] ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MOVDQ XDATA0, [IN0] ; load first block of plain text MOVDQ XDATA1, [IN1] ; load first block of plain text MOVDQ XDATA2, [IN2] ; load first block of plain text MOVDQ XDATA3, [IN3] ; load first block of plain text mov KEYS0, [ARG + _aesxcbcarg_keys + 8*0] mov KEYS1, [ARG + _aesxcbcarg_keys + 8*1] mov KEYS2, [ARG + _aesxcbcarg_keys + 8*2] mov KEYS3, [ARG + _aesxcbcarg_keys + 8*3] pxor XDATA0, [ARG + _aesxcbcarg_ICV + 16*0] ; plaintext XOR ICV pxor XDATA1, [ARG + _aesxcbcarg_ICV + 16*1] ; plaintext XOR ICV pxor XDATA2, [ARG + _aesxcbcarg_ICV + 16*2] ; plaintext XOR ICV pxor XDATA3, [ARG + _aesxcbcarg_ICV + 16*3] ; plaintext XOR ICV pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC movdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key movdqa XKEY1_3, [KEYS1 + 16*3] ; load round 3 key movdqa XKEY2_3, [KEYS2 + 16*3] ; load round 3 key movdqa XKEY3_3, [KEYS3 + 16*3] ; load round 3 key aesenc XDATA0, XKEY0_3 ; 3. ENC aesenc XDATA1, XKEY1_3 ; 3. ENC aesenc XDATA2, XKEY2_3 ; 3. ENC aesenc XDATA3, XKEY3_3 ; 3. ENC aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC aesenc XDATA3, [KEYS3 + 16*5] ; 5. 
ENC movdqa XKEY1_6, [KEYS1 + 16*6] ; load round 6 key movdqa XKEY2_6, [KEYS2 + 16*6] ; load round 6 key movdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key aesenc XDATA0, XKEY0_6 ; 6. ENC aesenc XDATA1, XKEY1_6 ; 6. ENC aesenc XDATA2, XKEY2_6 ; 6. ENC aesenc XDATA3, XKEY3_6 ; 6. ENC aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC movdqa XKEY0_9, [KEYS0 + 16*9] ; load round 9 key movdqa XKEY1_9, [KEYS1 + 16*9] ; load round 9 key movdqa XKEY2_9, [KEYS2 + 16*9] ; load round 9 key movdqa XKEY3_9, [KEYS3 + 16*9] ; load round 9 key aesenc XDATA0, XKEY0_9 ; 9. ENC aesenc XDATA1, XKEY1_9 ; 9. ENC aesenc XDATA2, XKEY2_9 ; 9. ENC aesenc XDATA3, XKEY3_9 ; 9. ENC aesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC aesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC aesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC aesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC cmp LEN, IDX je done main_loop: pxor2 XDATA0, [IN0 + IDX] ; plaintext XOR ICV pxor2 XDATA1, [IN1 + IDX] ; plaintext XOR ICV pxor2 XDATA2, [IN2 + IDX] ; plaintext XOR ICV pxor2 XDATA3, [IN3 + IDX] ; plaintext XOR ICV pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC aesenc XDATA0, XKEY0_3 ; 3. ENC aesenc XDATA1, XKEY1_3 ; 3. ENC aesenc XDATA2, XKEY2_3 ; 3. ENC aesenc XDATA3, XKEY3_3 ; 3. ENC aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC aesenc XDATA0, XKEY0_6 ; 6. ENC aesenc XDATA1, XKEY1_6 ; 6. ENC aesenc XDATA2, XKEY2_6 ; 6. ENC aesenc XDATA3, XKEY3_6 ; 6. ENC aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC aesenc XDATA0, XKEY0_9 ; 9. ENC aesenc XDATA1, XKEY1_9 ; 9. ENC aesenc XDATA2, XKEY2_9 ; 9. ENC aesenc XDATA3, XKEY3_9 ; 9. ENC aesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC aesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC aesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC aesenclast XDATA3, [KEYS3 + 16*10] ; 10. 
ENC add IDX, 16 cmp LEN, IDX jne main_loop done: ;; update ICV movdqa [ARG + _aesxcbcarg_ICV + 16*0], XDATA0 movdqa [ARG + _aesxcbcarg_ICV + 16*1], XDATA1 movdqa [ARG + _aesxcbcarg_ICV + 16*2], XDATA2 movdqa [ARG + _aesxcbcarg_ICV + 16*3], XDATA3 ;; update IN add IN0, LEN mov [ARG + _aesxcbcarg_in + 8*0], IN0 add IN1, LEN mov [ARG + _aesxcbcarg_in + 8*1], IN1 add IN2, LEN mov [ARG + _aesxcbcarg_in + 8*2], IN2 add IN3, LEN mov [ARG + _aesxcbcarg_in + 8*3], IN3 pop rbp ret intel-ipsec-mb-0.48/sse/gcm128_sse.asm000066400000000000000000000033601321406316400174260ustar00rootroot00000000000000;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright(c) 2011-2017 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions ; are met: ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in ; the documentation and/or other materials provided with the ; distribution. ; * Neither the name of Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived ; from this software without specific prior written permission. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %define GCM128_MODE 1 %include "gcm_sse.asm" intel-ipsec-mb-0.48/sse/gcm192_sse.asm000066400000000000000000000033541321406316400174320ustar00rootroot00000000000000;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright(c) 2017 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions ; are met: ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in ; the documentation and/or other materials provided with the ; distribution. ; * Neither the name of Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived ; from this software without specific prior written permission. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %define GCM192_MODE 1 %include "gcm_sse.asm" intel-ipsec-mb-0.48/sse/gcm256_sse.asm000066400000000000000000000033611321406316400174310ustar00rootroot00000000000000;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright(c) 2011-2017 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions ; are met: ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in ; the documentation and/or other materials provided with the ; distribution. ; * Neither the name of Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived ; from this software without specific prior written permission. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %define GCM256_MODE 1 %include "gcm_sse.asm" intel-ipsec-mb-0.48/sse/gcm_sse.asm000066400000000000000000002271121321406316400171760ustar00rootroot00000000000000;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright(c) 2011-2017 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions ; are met: ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in ; the documentation and/or other materials provided with the ; distribution. ; * Neither the name of Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived ; from this software without specific prior written permission. 
; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; ; Authors: ; Erdinc Ozturk ; Vinodh Gopal ; James Guilford ; ; ; References: ; This code was derived and highly optimized from the code described in paper: ; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010 ; ; For the shift-based reductions used in this code, we used the method described in paper: ; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010. ; ; ; ; ; Assumptions: ; ; ; ; iv: ; 0 1 2 3 ; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; | Salt (From the SA) | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; | Initialization Vector | ; | (This is the sequence number from IPSec header) | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; | 0x1 | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; ; ; ; AAD: ; AAD will be padded with 0 to the next 16byte multiple ; for example, assume AAD is a u32 vector ; ; if AAD is 8 bytes: ; AAD[3] = {A0, A1}; ; padded AAD in xmm register = {A1 A0 0 0} ; ; 0 1 2 3 ; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; | SPI (A1) | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; | 32-bit Sequence Number (A0) | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; | 0x0 | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; ; AAD Format with 32-bit Sequence Number ; ; if AAD is 12 bytes: ; AAD[3] = {A0, A1, A2}; ; padded AAD in xmm register = {A2 A1 A0 0} ; ; 0 1 2 3 ; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; | SPI (A2) | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; | 64-bit Extended Sequence Number {A1,A0} | ; | | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; | 0x0 | ; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ; ; AAD Format with 64-bit Extended Sequence Number ; ; ; aadLen: ; Must be a multiple of 4 bytes and from the definition of the spec. ; The code additionally supports any aadLen length. ; ; TLen: ; from the definition of the spec, TLen can only be 8, 12 or 16 bytes. ; ; poly = x^128 + x^127 + x^126 + x^121 + 1 ; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part. 
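;
; Added illustration (not part of the original notes): for the 96-bit IV
; layout shown above, the initial counter block J0 is simply the 12 IV
; bytes followed by the 32-bit value 0x00000001.  In C-like pseudo code:
;
;       uint8_t j0[16];
;       memcpy(j0, iv, 12);              /* salt + sequence number  */
;       j0[12] = j0[13] = j0[14] = 0;    /* 31 zero bits            */
;       j0[15] = 1;                      /* trailing 0x1            */
;
; GCM_INIT below builds this block with pinsrq/pinsrd on top of the ONEf
; constant (which supplies the trailing 0x00000001, per its own comment)
; and then byte-reflects it with SHUF_MASK so the counter can be advanced
; with paddd.
;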
; %include "os.asm" %include "reg_sizes.asm" %include "gcm_defines.asm" %ifndef GCM128_MODE %ifndef GCM192_MODE %ifndef GCM256_MODE %error "No GCM mode selected for gcm_sse.asm!" %endif %endif %endif %ifdef GCM128_MODE %define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ sse %define NROUNDS 9 %endif %ifdef GCM192_MODE %define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ sse %define NROUNDS 11 %endif %ifdef GCM256_MODE %define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ sse %define NROUNDS 13 %endif default rel ; need to push 4 registers into stack to maintain %define STACK_OFFSET 8*4 %define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register) %define TMP3 16*1 ; Temporary storage for AES State 3 %define TMP4 16*2 ; Temporary storage for AES State 4 %define TMP5 16*3 ; Temporary storage for AES State 5 %define TMP6 16*4 ; Temporary storage for AES State 6 %define TMP7 16*5 ; Temporary storage for AES State 7 %define TMP8 16*6 ; Temporary storage for AES State 8 %define LOCAL_STORAGE 16*7 %ifidn __OUTPUT_FORMAT__, win64 %define XMM_STORAGE 16*10 %else %define XMM_STORAGE 0 %endif %define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Utility Macros ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) ; Input: A and B (128-bits each, bit-reflected) ; Output: C = A*B*x mod poly, (i.e. >>1 ) ; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input ; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro GHASH_MUL 7 %define %%GH %1 ; 16 Bytes %define %%HK %2 ; 16 Bytes %define %%T1 %3 %define %%T2 %4 %define %%T3 %5 %define %%T4 %6 %define %%T5 %7 ; %%GH, %%HK hold the values for the two operands which are carry-less multiplied ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Karatsuba Method movdqa %%T1, %%GH pshufd %%T2, %%GH, 01001110b pshufd %%T3, %%HK, 01001110b pxor %%T2, %%GH ; %%T2 = (a1+a0) pxor %%T3, %%HK ; %%T3 = (b1+b0) pclmulqdq %%T1, %%HK, 0x11 ; %%T1 = a1*b1 pclmulqdq %%GH, %%HK, 0x00 ; %%GH = a0*b0 pclmulqdq %%T2, %%T3, 0x00 ; %%T2 = (a1+a0)*(b1+b0) pxor %%T2, %%GH pxor %%T2, %%T1 ; %%T2 = a0*b1+a1*b0 movdqa %%T3, %%T2 pslldq %%T3, 8 ; shift-L %%T3 2 DWs psrldq %%T2, 8 ; shift-R %%T2 2 DWs pxor %%GH, %%T3 pxor %%T1, %%T2 ; <%%T1:%%GH> holds the result of the carry-less multiplication of %%GH by %%HK ;first phase of the reduction movdqa %%T2, %%GH movdqa %%T3, %%GH movdqa %%T4, %%GH ; move %%GH into %%T2, %%T3, %%T4 in order to perform the three shifts independently pslld %%T2, 31 ; packed right shifting << 31 pslld %%T3, 30 ; packed right shifting shift << 30 pslld %%T4, 25 ; packed right shifting shift << 25 pxor %%T2, %%T3 ; xor the shifted versions pxor %%T2, %%T4 movdqa %%T5, %%T2 psrldq %%T5, 4 ; shift-R %%T5 1 DW pslldq %%T2, 12 ; shift-L %%T2 3 DWs pxor %%GH, %%T2 ; first phase of the reduction complete ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;second phase of the reduction movdqa %%T2,%%GH ; make 3 copies of %%GH (in in %%T2, %%T3, %%T4) for doing three shift operations movdqa %%T3,%%GH movdqa %%T4,%%GH psrld %%T2,1 ; packed left shifting >> 1 psrld %%T3,2 ; packed left shifting >> 2 psrld %%T4,7 ; packed left shifting >> 
7 pxor %%T2,%%T3 ; xor the shifted versions pxor %%T2,%%T4 pxor %%T2, %%T5 pxor %%GH, %%T2 pxor %%GH, %%T1 ; the result is in %%T1 %endmacro %macro PRECOMPUTE 8 %define %%GDATA %1 %define %%HK %2 %define %%T1 %3 %define %%T2 %4 %define %%T3 %5 %define %%T4 %6 %define %%T5 %7 %define %%T6 %8 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Haskey_i_k holds XORed values of the low and high parts of the Haskey_i movdqa %%T4, %%HK pshufd %%T1, %%HK, 01001110b pxor %%T1, %%HK movdqu [%%GDATA + HashKey_k], %%T1 GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^2<<1 mod poly movdqu [%%GDATA + HashKey_2], %%T4 ; [HashKey_2] = HashKey^2<<1 mod poly pshufd %%T1, %%T4, 01001110b pxor %%T1, %%T4 movdqu [%%GDATA + HashKey_2_k], %%T1 GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^3<<1 mod poly movdqu [%%GDATA + HashKey_3], %%T4 pshufd %%T1, %%T4, 01001110b pxor %%T1, %%T4 movdqu [%%GDATA + HashKey_3_k], %%T1 GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^4<<1 mod poly movdqu [%%GDATA + HashKey_4], %%T4 pshufd %%T1, %%T4, 01001110b pxor %%T1, %%T4 movdqu [%%GDATA + HashKey_4_k], %%T1 GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^5<<1 mod poly movdqu [%%GDATA + HashKey_5], %%T4 pshufd %%T1, %%T4, 01001110b pxor %%T1, %%T4 movdqu [%%GDATA + HashKey_5_k], %%T1 GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^6<<1 mod poly movdqu [%%GDATA + HashKey_6], %%T4 pshufd %%T1, %%T4, 01001110b pxor %%T1, %%T4 movdqu [%%GDATA + HashKey_6_k], %%T1 GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^7<<1 mod poly movdqu [%%GDATA + HashKey_7], %%T4 pshufd %%T1, %%T4, 01001110b pxor %%T1, %%T4 movdqu [%%GDATA + HashKey_7_k], %%T1 GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^8<<1 mod poly movdqu [%%GDATA + HashKey_8], %%T4 pshufd %%T1, %%T4, 01001110b pxor %%T1, %%T4 movdqu [%%GDATA + HashKey_8_k], %%T1 %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes. ; Returns 0 if data has length 0. ; Input: The input data (INPUT), that data's length (LENGTH). ; Output: The packed xmm register (OUTPUT). 
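;
; Worked example (added for clarity): with LENGTH = 11 and input bytes
; b0..b10, the first pinsrq loads b0..b7 into the low quadword, then the
; remaining bytes are read one at a time from INPUT+10 down to INPUT+8,
; so the second pinsrq deposits (b10<<16 | b9<<8 | b8) into the high
; quadword.  The result has the same register layout a full 16-byte load
; would give, with the top five bytes zeroed, but without ever reading
; past INPUT+LENGTH.
;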
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro READ_SMALL_DATA_INPUT 6 %define %%OUTPUT %1 ; %%OUTPUT is an xmm register %define %%INPUT %2 %define %%LENGTH %3 %define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers %define %%COUNTER %5 %define %%TMP1 %6 pxor %%OUTPUT, %%OUTPUT mov %%COUNTER, %%LENGTH mov %%END_READ_LOCATION, %%INPUT add %%END_READ_LOCATION, %%LENGTH xor %%TMP1, %%TMP1 cmp %%COUNTER, 8 jl %%_byte_loop_2 pinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exists je %%_done sub %%COUNTER, 8 %%_byte_loop_1: ;Read in data 1 byte at a time while data is left shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in dec %%END_READ_LOCATION mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] dec %%COUNTER jg %%_byte_loop_1 pinsrq %%OUTPUT, %%TMP1, 1 jmp %%_done %%_byte_loop_2: ;Read in data 1 byte at a time while data is left cmp %%COUNTER, 0 je %%_done shl %%TMP1, 8 ;This loop handles when no bytes were already read in dec %%END_READ_LOCATION mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] dec %%COUNTER jg %%_byte_loop_2 pinsrq %%OUTPUT, %%TMP1, 0 %%_done: %endmacro ; READ_SMALL_DATA_INPUT ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. ; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY). ; Output: The hash of the data (AAD_HASH). ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro CALC_AAD_HASH 14 %define %%A_IN %1 %define %%A_LEN %2 %define %%AAD_HASH %3 %define %%HASH_KEY %4 %define %%XTMP1 %5 ; xmm temp reg 5 %define %%XTMP2 %6 %define %%XTMP3 %7 %define %%XTMP4 %8 %define %%XTMP5 %9 ; xmm temp reg 5 %define %%T1 %10 ; temp reg 1 %define %%T2 %11 %define %%T3 %12 %define %%T4 %13 %define %%T5 %14 ; temp reg 5 mov %%T1, %%A_IN ; T1 = AAD mov %%T2, %%A_LEN ; T2 = aadLen pxor %%AAD_HASH, %%AAD_HASH cmp %%T2, 16 jl %%_get_small_AAD_block %%_get_AAD_loop16: movdqu %%XTMP1, [%%T1] ;byte-reflect the AAD data pshufb %%XTMP1, [SHUF_MASK] pxor %%AAD_HASH, %%XTMP1 GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 sub %%T2, 16 je %%_CALC_AAD_done add %%T1, 16 cmp %%T2, 16 jge %%_get_AAD_loop16 %%_get_small_AAD_block: READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5 ;byte-reflect the AAD data pshufb %%XTMP1, [SHUF_MASK] pxor %%AAD_HASH, %%XTMP1 GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 %%_CALC_AAD_done: %endmacro ; CALC_AAD_HASH ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls. ; Requires the input data be at least 1 byte long. ; Input: gcm_key_data (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_CYPH_IN), ; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET), ; and whether encoding or decoding (ENC_DEC). 
; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX ; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro PARTIAL_BLOCK 8 %define %%GDATA_KEY %1 %define %%GDATA_CTX %2 %define %%CYPH_PLAIN_OUT %3 %define %%PLAIN_CYPH_IN %4 %define %%PLAIN_CYPH_LEN %5 %define %%DATA_OFFSET %6 %define %%AAD_HASH %7 %define %%ENC_DEC %8 mov r13, [%%GDATA_CTX + PBlockLen] cmp r13, 0 je %%_partial_block_done ;Leave Macro if no partial blocks cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading jl %%_fewer_than_16_bytes XLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register jmp %%_data_read %%_fewer_than_16_bytes: lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15 mov r13, [%%GDATA_CTX + PBlockLen] %%_data_read: ;Finished reading in data movdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = ctx_data.partial_block_enc_key movdqu xmm13, [%%GDATA_KEY + HashKey] lea r12, [SHIFT_MASK] add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16) movdqu xmm2, [r12] ; get the appropriate shuffle mask pshufb xmm9, xmm2 ;shift right r13 bytes %ifidn %%ENC_DEC, DEC movdqa xmm3, xmm1 pxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn) mov r15, %%PLAIN_CYPH_LEN add r15, r13 sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block jge %%_no_extra_mask_1 ;Determine if if partial block is not being filled and shift mask accordingly sub r12, r15 %%_no_extra_mask_1: movdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9 pand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 pand xmm3, xmm1 pshufb xmm3, [SHUF_MASK] pshufb xmm3, xmm2 pxor %%AAD_HASH, xmm3 cmp r15,0 jl %%_partial_incomplete_1 GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block xor rax,rax mov [%%GDATA_CTX + PBlockLen], rax jmp %%_dec_done %%_partial_incomplete_1: %ifidn __OUTPUT_FORMAT__, win64 mov rax, %%PLAIN_CYPH_LEN add [%%GDATA_CTX + PBlockLen], rax %else add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN %endif %%_dec_done: movdqu [%%GDATA_CTX + AadHash], %%AAD_HASH %else pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) mov r15, %%PLAIN_CYPH_LEN add r15, r13 sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block jge %%_no_extra_mask_2 ;Determine if if partial block is not being filled and shift mask accordingly sub r12, r15 %%_no_extra_mask_2: movdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9 pand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 pshufb xmm9, [SHUF_MASK] pshufb xmm9, xmm2 pxor %%AAD_HASH, xmm9 cmp r15,0 jl %%_partial_incomplete_2 GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block xor rax,rax mov [%%GDATA_CTX + PBlockLen], rax jmp %%_encode_done %%_partial_incomplete_2: %ifidn __OUTPUT_FORMAT__, win64 mov rax, %%PLAIN_CYPH_LEN add [%%GDATA_CTX + PBlockLen], rax %else add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN %endif %%_encode_done: movdqu [%%GDATA_CTX + AadHash], %%AAD_HASH pshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext pshufb xmm9, xmm2 %endif ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; output encrypted Bytes cmp r15,0 jl %%_partial_fill mov 
r12, r13 mov r13, 16 sub r13, r12 ; Set r13 to be the number of bytes to write out jmp %%_count_set %%_partial_fill: mov r13, %%PLAIN_CYPH_LEN %%_count_set: movq rax, xmm9 cmp r13, 8 jle %%_less_than_8_bytes_left mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax add %%DATA_OFFSET, 8 psrldq xmm9, 8 movq rax, xmm9 sub r13, 8 %%_less_than_8_bytes_left: mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al add %%DATA_OFFSET, 1 shr rax, 8 sub r13, 1 jne %%_less_than_8_bytes_left ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %%_partial_block_done: %endmacro ; PARTIAL_BLOCK ; if a = number of total plaintext bytes ; b = floor(a/16) ; %%num_initial_blocks = b mod 8; ; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext ; %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified ; Updated AAD_HASH is returned in %%T3 %macro INITIAL_BLOCKS 24 %define %%GDATA_KEY %1 %define %%GDATA_CTX %2 %define %%CYPH_PLAIN_OUT %3 %define %%PLAIN_CYPH_IN %4 %define %%LENGTH %5 %define %%DATA_OFFSET %6 %define %%num_initial_blocks %7 ; can be 0, 1, 2, 3, 4, 5, 6 or 7 %define %%T1 %8 %define %%HASH_KEY %9 %define %%T3 %10 %define %%T4 %11 %define %%T5 %12 %define %%CTR %13 %define %%XMM1 %14 %define %%XMM2 %15 %define %%XMM3 %16 %define %%XMM4 %17 %define %%XMM5 %18 %define %%XMM6 %19 %define %%XMM7 %20 %define %%XMM8 %21 %define %%T6 %22 %define %%T_key %23 %define %%ENC_DEC %24 %assign i (8-%%num_initial_blocks) movdqu reg(i), %%XMM8 ; move AAD_HASH to temp reg ; start AES for %%num_initial_blocks blocks movdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0 %assign i (9-%%num_initial_blocks) %rep %%num_initial_blocks paddd %%CTR, [ONE] ; INCR Y0 movdqa reg(i), %%CTR pshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap %assign i (i+1) %endrep movdqu %%T_key, [%%GDATA_KEY+16*0] %assign i (9-%%num_initial_blocks) %rep %%num_initial_blocks pxor reg(i),%%T_key %assign i (i+1) %endrep %assign j 1 %rep NROUNDS ; encrypt N blocks with 13 key rounds (11 for GCM192) movdqu %%T_key, [%%GDATA_KEY+16*j] %assign i (9-%%num_initial_blocks) %rep %%num_initial_blocks aesenc reg(i),%%T_key %assign i (i+1) %endrep %assign j (j+1) %endrep movdqu %%T_key, [%%GDATA_KEY+16*j] ; encrypt with last (14th) key round (12 for GCM192) %assign i (9-%%num_initial_blocks) %rep %%num_initial_blocks aesenclast reg(i),%%T_key %assign i (i+1) %endrep %assign i (9-%%num_initial_blocks) %rep %%num_initial_blocks XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] pxor reg(i), %%T1 XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks add %%DATA_OFFSET, 16 %ifidn %%ENC_DEC, DEC movdqa reg(i), %%T1 %endif pshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations %assign i (i+1) %endrep %assign i (8-%%num_initial_blocks) %assign j (9-%%num_initial_blocks) %rep %%num_initial_blocks pxor reg(j), reg(i) GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks %assign i (i+1) %assign j (j+1) %endrep ; %%XMM8 has the current Hash Value movdqa %%T3, %%XMM8 cmp %%LENGTH, 128 jl %%_initial_blocks_done ; no need for precomputed constants ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Haskey_i_k holds XORed values of the low and high parts of the Haskey_i paddd %%CTR, [ONE] ; INCR Y0 movdqa %%XMM1, %%CTR pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap paddd %%CTR, [ONE] ; INCR Y0 movdqa %%XMM2, %%CTR pshufb %%XMM2, [SHUF_MASK] ; 
perform a 16Byte swap paddd %%CTR, [ONE] ; INCR Y0 movdqa %%XMM3, %%CTR pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap paddd %%CTR, [ONE] ; INCR Y0 movdqa %%XMM4, %%CTR pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap paddd %%CTR, [ONE] ; INCR Y0 movdqa %%XMM5, %%CTR pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap paddd %%CTR, [ONE] ; INCR Y0 movdqa %%XMM6, %%CTR pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap paddd %%CTR, [ONE] ; INCR Y0 movdqa %%XMM7, %%CTR pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap paddd %%CTR, [ONE] ; INCR Y0 movdqa %%XMM8, %%CTR pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap movdqu %%T_key, [%%GDATA_KEY+16*0] pxor %%XMM1, %%T_key pxor %%XMM2, %%T_key pxor %%XMM3, %%T_key pxor %%XMM4, %%T_key pxor %%XMM5, %%T_key pxor %%XMM6, %%T_key pxor %%XMM7, %%T_key pxor %%XMM8, %%T_key %assign i 1 %rep NROUNDS ; do early (13) rounds (11 for GCM192) movdqu %%T_key, [%%GDATA_KEY+16*i] aesenc %%XMM1, %%T_key aesenc %%XMM2, %%T_key aesenc %%XMM3, %%T_key aesenc %%XMM4, %%T_key aesenc %%XMM5, %%T_key aesenc %%XMM6, %%T_key aesenc %%XMM7, %%T_key aesenc %%XMM8, %%T_key %assign i (i+1) %endrep movdqu %%T_key, [%%GDATA_KEY+16*i] ; do final key round aesenclast %%XMM1, %%T_key aesenclast %%XMM2, %%T_key aesenclast %%XMM3, %%T_key aesenclast %%XMM4, %%T_key aesenclast %%XMM5, %%T_key aesenclast %%XMM6, %%T_key aesenclast %%XMM7, %%T_key aesenclast %%XMM8, %%T_key XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0] pxor %%XMM1, %%T1 XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1 %ifidn %%ENC_DEC, DEC movdqa %%XMM1, %%T1 %endif XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1] pxor %%XMM2, %%T1 XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2 %ifidn %%ENC_DEC, DEC movdqa %%XMM2, %%T1 %endif XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2] pxor %%XMM3, %%T1 XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3 %ifidn %%ENC_DEC, DEC movdqa %%XMM3, %%T1 %endif XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3] pxor %%XMM4, %%T1 XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4 %ifidn %%ENC_DEC, DEC movdqa %%XMM4, %%T1 %endif XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4] pxor %%XMM5, %%T1 XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5 %ifidn %%ENC_DEC, DEC movdqa %%XMM5, %%T1 %endif XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5] pxor %%XMM6, %%T1 XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6 %ifidn %%ENC_DEC, DEC movdqa %%XMM6, %%T1 %endif XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6] pxor %%XMM7, %%T1 XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7 %ifidn %%ENC_DEC, DEC movdqa %%XMM7, %%T1 %endif XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7] pxor %%XMM8, %%T1 XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8 %ifidn %%ENC_DEC, DEC movdqa %%XMM8, %%T1 %endif add %%DATA_OFFSET, 128 pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap pxor %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %%_initial_blocks_done: %endmacro ; encrypt 8 blocks at a time ; ghash the 8 previously encrypted ciphertext blocks 
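; (note added for clarity) the GHASH work is interleaved with the AES work:
; each Karatsuba pclmulqdq step on one of the eight saved ciphertext blocks
; is issued between aesenc rounds of the eight new counter blocks, so the
; multiplies largely hide behind the AES round latency instead of running
; as a separate pass over the data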
; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified ; %%DATA_OFFSET is the data offset value %macro GHASH_8_ENCRYPT_8_PARALLEL 22 %define %%GDATA %1 %define %%CYPH_PLAIN_OUT %2 %define %%PLAIN_CYPH_IN %3 %define %%DATA_OFFSET %4 %define %%T1 %5 %define %%T2 %6 %define %%T3 %7 %define %%T4 %8 %define %%T5 %9 %define %%T6 %10 %define %%CTR %11 %define %%XMM1 %12 %define %%XMM2 %13 %define %%XMM3 %14 %define %%XMM4 %15 %define %%XMM5 %16 %define %%XMM6 %17 %define %%XMM7 %18 %define %%XMM8 %19 %define %%T7 %20 %define %%loop_idx %21 %define %%ENC_DEC %22 movdqa %%T7, %%XMM1 movdqu [rsp + TMP2], %%XMM2 movdqu [rsp + TMP3], %%XMM3 movdqu [rsp + TMP4], %%XMM4 movdqu [rsp + TMP5], %%XMM5 movdqu [rsp + TMP6], %%XMM6 movdqu [rsp + TMP7], %%XMM7 movdqu [rsp + TMP8], %%XMM8 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Karatsuba Method movdqa %%T4, %%T7 pshufd %%T6, %%T7, 01001110b pxor %%T6, %%T7 %ifidn %%loop_idx, in_order paddd %%CTR, [ONE] ; INCR CNT %else paddd %%CTR, [ONEf] ; INCR CNT %endif movdqu %%T5, [%%GDATA + HashKey_8] pclmulqdq %%T4, %%T5, 0x11 ; %%T1 = a1*b1 pclmulqdq %%T7, %%T5, 0x00 ; %%T7 = a0*b0 movdqu %%T5, [%%GDATA + HashKey_8_k] pclmulqdq %%T6, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) movdqa %%XMM1, %%CTR %ifidn %%loop_idx, in_order paddd %%CTR, [ONE] ; INCR CNT movdqa %%XMM2, %%CTR paddd %%CTR, [ONE] ; INCR CNT movdqa %%XMM3, %%CTR paddd %%CTR, [ONE] ; INCR CNT movdqa %%XMM4, %%CTR paddd %%CTR, [ONE] ; INCR CNT movdqa %%XMM5, %%CTR paddd %%CTR, [ONE] ; INCR CNT movdqa %%XMM6, %%CTR paddd %%CTR, [ONE] ; INCR CNT movdqa %%XMM7, %%CTR paddd %%CTR, [ONE] ; INCR CNT movdqa %%XMM8, %%CTR pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap %else paddd %%CTR, [ONEf] ; INCR CNT movdqa %%XMM2, %%CTR paddd %%CTR, [ONEf] ; INCR CNT movdqa %%XMM3, %%CTR paddd %%CTR, [ONEf] ; INCR CNT movdqa %%XMM4, %%CTR paddd %%CTR, [ONEf] ; INCR CNT movdqa %%XMM5, %%CTR paddd %%CTR, [ONEf] ; INCR CNT movdqa %%XMM6, %%CTR paddd %%CTR, [ONEf] ; INCR CNT movdqa %%XMM7, %%CTR paddd %%CTR, [ONEf] ; INCR CNT movdqa %%XMM8, %%CTR %endif ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; movdqu %%T1, [%%GDATA + 16*0] pxor %%XMM1, %%T1 pxor %%XMM2, %%T1 pxor %%XMM3, %%T1 pxor %%XMM4, %%T1 pxor %%XMM5, %%T1 pxor %%XMM6, %%T1 pxor %%XMM7, %%T1 pxor %%XMM8, %%T1 ;; %%XMM6, %%T5 hold the values for the two operands which are carry-less multiplied ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Karatsuba Method movdqu %%T1, [rsp + TMP2] movdqa %%T3, %%T1 pshufd %%T2, %%T3, 01001110b pxor %%T2, %%T3 movdqu %%T5, [%%GDATA + HashKey_7] pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 movdqu %%T5, [%%GDATA + HashKey_7_k] pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part pxor %%T7, %%T3 pxor %%T6, %%T2 movdqu %%T1, [%%GDATA + 16*1] aesenc %%XMM1, %%T1 aesenc %%XMM2, %%T1 aesenc %%XMM3, %%T1 aesenc %%XMM4, %%T1 aesenc %%XMM5, %%T1 aesenc %%XMM6, %%T1 aesenc %%XMM7, 
%%T1 aesenc %%XMM8, %%T1 movdqu %%T1, [%%GDATA + 16*2] aesenc %%XMM1, %%T1 aesenc %%XMM2, %%T1 aesenc %%XMM3, %%T1 aesenc %%XMM4, %%T1 aesenc %%XMM5, %%T1 aesenc %%XMM6, %%T1 aesenc %%XMM7, %%T1 aesenc %%XMM8, %%T1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Karatsuba Method movdqu %%T1, [rsp + TMP3] movdqa %%T3, %%T1 pshufd %%T2, %%T3, 01001110b pxor %%T2, %%T3 movdqu %%T5, [%%GDATA + HashKey_6] pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 movdqu %%T5, [%%GDATA + HashKey_6_k] pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part pxor %%T7, %%T3 pxor %%T6, %%T2 movdqu %%T1, [%%GDATA + 16*3] aesenc %%XMM1, %%T1 aesenc %%XMM2, %%T1 aesenc %%XMM3, %%T1 aesenc %%XMM4, %%T1 aesenc %%XMM5, %%T1 aesenc %%XMM6, %%T1 aesenc %%XMM7, %%T1 aesenc %%XMM8, %%T1 movdqu %%T1, [rsp + TMP4] movdqa %%T3, %%T1 pshufd %%T2, %%T3, 01001110b pxor %%T2, %%T3 movdqu %%T5, [%%GDATA + HashKey_5] pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 movdqu %%T5, [%%GDATA + HashKey_5_k] pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part pxor %%T7, %%T3 pxor %%T6, %%T2 movdqu %%T1, [%%GDATA + 16*4] aesenc %%XMM1, %%T1 aesenc %%XMM2, %%T1 aesenc %%XMM3, %%T1 aesenc %%XMM4, %%T1 aesenc %%XMM5, %%T1 aesenc %%XMM6, %%T1 aesenc %%XMM7, %%T1 aesenc %%XMM8, %%T1 movdqu %%T1, [%%GDATA + 16*5] aesenc %%XMM1, %%T1 aesenc %%XMM2, %%T1 aesenc %%XMM3, %%T1 aesenc %%XMM4, %%T1 aesenc %%XMM5, %%T1 aesenc %%XMM6, %%T1 aesenc %%XMM7, %%T1 aesenc %%XMM8, %%T1 movdqu %%T1, [rsp + TMP5] movdqa %%T3, %%T1 pshufd %%T2, %%T3, 01001110b pxor %%T2, %%T3 movdqu %%T5, [%%GDATA + HashKey_4] pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 movdqu %%T5, [%%GDATA + HashKey_4_k] pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part pxor %%T7, %%T3 pxor %%T6, %%T2 movdqu %%T1, [%%GDATA + 16*6] aesenc %%XMM1, %%T1 aesenc %%XMM2, %%T1 aesenc %%XMM3, %%T1 aesenc %%XMM4, %%T1 aesenc %%XMM5, %%T1 aesenc %%XMM6, %%T1 aesenc %%XMM7, %%T1 aesenc %%XMM8, %%T1 movdqu %%T1, [rsp + TMP6] movdqa %%T3, %%T1 pshufd %%T2, %%T3, 01001110b pxor %%T2, %%T3 movdqu %%T5, [%%GDATA + HashKey_3] pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 movdqu %%T5, [%%GDATA + HashKey_3_k] pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part pxor %%T7, %%T3 pxor %%T6, %%T2 movdqu %%T1, [%%GDATA + 16*7] aesenc %%XMM1, %%T1 aesenc %%XMM2, %%T1 aesenc %%XMM3, %%T1 aesenc %%XMM4, %%T1 aesenc %%XMM5, %%T1 aesenc %%XMM6, %%T1 aesenc %%XMM7, %%T1 aesenc %%XMM8, %%T1 movdqu %%T1, [rsp + TMP7] movdqa %%T3, %%T1 pshufd %%T2, %%T3, 01001110b pxor %%T2, %%T3 movdqu %%T5, [%%GDATA + HashKey_2] pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 movdqu %%T5, [%%GDATA + HashKey_2_k] pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part pxor %%T7, %%T3 pxor %%T6, %%T2 movdqu %%T1, [%%GDATA + 16*8] aesenc %%XMM1, %%T1 aesenc %%XMM2, %%T1 aesenc %%XMM3, %%T1 aesenc %%XMM4, %%T1 aesenc %%XMM5, %%T1 aesenc %%XMM6, %%T1 aesenc %%XMM7, %%T1 aesenc %%XMM8, %%T1 ;; %%XMM8, %%T5 hold the values for the 
two operands which are carry-less multiplied ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Karatsuba Method movdqu %%T1, [rsp + TMP8] movdqa %%T3, %%T1 pshufd %%T2, %%T3, 01001110b pxor %%T2, %%T3 movdqu %%T5, [%%GDATA + HashKey] pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 movdqu %%T5, [%%GDATA + HashKey_k] pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) pxor %%T7, %%T3 pxor %%T4, %%T1 movdqu %%T1, [%%GDATA + 16*9] aesenc %%XMM1, %%T1 aesenc %%XMM2, %%T1 aesenc %%XMM3, %%T1 aesenc %%XMM4, %%T1 aesenc %%XMM5, %%T1 aesenc %%XMM6, %%T1 aesenc %%XMM7, %%T1 aesenc %%XMM8, %%T1 %ifdef GCM128_MODE movdqu %%T5, [%%GDATA + 16*10] %endif %ifdef GCM192_MODE movdqu %%T1, [%%GDATA + 16*10] aesenc %%XMM1, %%T1 aesenc %%XMM2, %%T1 aesenc %%XMM3, %%T1 aesenc %%XMM4, %%T1 aesenc %%XMM5, %%T1 aesenc %%XMM6, %%T1 aesenc %%XMM7, %%T1 aesenc %%XMM8, %%T1 movdqu %%T1, [%%GDATA + 16*11] aesenc %%XMM1, %%T1 aesenc %%XMM2, %%T1 aesenc %%XMM3, %%T1 aesenc %%XMM4, %%T1 aesenc %%XMM5, %%T1 aesenc %%XMM6, %%T1 aesenc %%XMM7, %%T1 aesenc %%XMM8, %%T1 movdqu %%T5, [%%GDATA + 16*12] ; finish last key round %endif %ifdef GCM256_MODE movdqu %%T1, [%%GDATA + 16*10] aesenc %%XMM1, %%T1 aesenc %%XMM2, %%T1 aesenc %%XMM3, %%T1 aesenc %%XMM4, %%T1 aesenc %%XMM5, %%T1 aesenc %%XMM6, %%T1 aesenc %%XMM7, %%T1 aesenc %%XMM8, %%T1 movdqu %%T1, [%%GDATA + 16*11] aesenc %%XMM1, %%T1 aesenc %%XMM2, %%T1 aesenc %%XMM3, %%T1 aesenc %%XMM4, %%T1 aesenc %%XMM5, %%T1 aesenc %%XMM6, %%T1 aesenc %%XMM7, %%T1 aesenc %%XMM8, %%T1 movdqu %%T1, [%%GDATA + 16*12] aesenc %%XMM1, %%T1 aesenc %%XMM2, %%T1 aesenc %%XMM3, %%T1 aesenc %%XMM4, %%T1 aesenc %%XMM5, %%T1 aesenc %%XMM6, %%T1 aesenc %%XMM7, %%T1 aesenc %%XMM8, %%T1 movdqu %%T1, [%%GDATA + 16*13] aesenc %%XMM1, %%T1 aesenc %%XMM2, %%T1 aesenc %%XMM3, %%T1 aesenc %%XMM4, %%T1 aesenc %%XMM5, %%T1 aesenc %%XMM6, %%T1 aesenc %%XMM7, %%T1 aesenc %%XMM8, %%T1 movdqu %%T5, [%%GDATA + 16*14] ; finish last key round %endif %assign i 0 %assign j 1 %rep 8 XLDR %%T1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] %ifidn %%ENC_DEC, DEC movdqa %%T3, %%T1 %endif pxor %%T1, %%T5 aesenclast reg(j), %%T1 ; XMM1:XMM8 XSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], reg(j) ; Write to the Output buffer %ifidn %%ENC_DEC, DEC movdqa reg(j), %%T3 %endif %assign i (i+1) %assign j (j+1) %endrep pxor %%T2, %%T6 pxor %%T2, %%T4 pxor %%T2, %%T7 movdqa %%T3, %%T2 pslldq %%T3, 8 ; shift-L %%T3 2 DWs psrldq %%T2, 8 ; shift-R %%T2 2 DWs pxor %%T7, %%T3 pxor %%T4, %%T2 ; accumulate the results in %%T4:%%T7 ;first phase of the reduction movdqa %%T2, %%T7 movdqa %%T3, %%T7 movdqa %%T1, %%T7 ; move %%T7 into %%T2, %%T3, %%T1 in order to perform the three shifts independently pslld %%T2, 31 ; packed right shifting << 31 pslld %%T3, 30 ; packed right shifting shift << 30 pslld %%T1, 25 ; packed right shifting shift << 25 pxor %%T2, %%T3 ; xor the shifted versions pxor %%T2, %%T1 movdqa %%T5, %%T2 psrldq %%T5, 4 ; shift-R %%T5 1 DW pslldq %%T2, 12 ; shift-L %%T2 3 DWs pxor %%T7, %%T2 ; first phase of the reduction complete ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap pshufb 
%%XMM7, [SHUF_MASK] ; perform a 16Byte swap pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap ;second phase of the reduction movdqa %%T2,%%T7 ; make 3 copies of %%T7 (in in %%T2, %%T3, %%T1) for doing three shift operations movdqa %%T3,%%T7 movdqa %%T1,%%T7 psrld %%T2,1 ; packed left shifting >> 1 psrld %%T3,2 ; packed left shifting >> 2 psrld %%T1,7 ; packed left shifting >> 7 pxor %%T2,%%T3 ; xor the shifted versions pxor %%T2,%%T1 pxor %%T2, %%T5 pxor %%T7, %%T2 pxor %%T7, %%T4 ; the result is in %%T4 pxor %%XMM1, %%T7 %endmacro ; GHASH the last 4 ciphertext blocks. %macro GHASH_LAST_8 16 %define %%GDATA %1 %define %%T1 %2 %define %%T2 %3 %define %%T3 %4 %define %%T4 %5 %define %%T5 %6 %define %%T6 %7 %define %%T7 %8 %define %%XMM1 %9 %define %%XMM2 %10 %define %%XMM3 %11 %define %%XMM4 %12 %define %%XMM5 %13 %define %%XMM6 %14 %define %%XMM7 %15 %define %%XMM8 %16 ; Karatsuba Method movdqa %%T6, %%XMM1 pshufd %%T2, %%XMM1, 01001110b pxor %%T2, %%XMM1 movdqu %%T5, [%%GDATA + HashKey_8] pclmulqdq %%T6, %%T5, 0x11 ; %%T6 = a1*b1 pclmulqdq %%XMM1, %%T5, 0x00 ; %%XMM1 = a0*b0 movdqu %%T4, [%%GDATA + HashKey_8_k] pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) movdqa %%T7, %%XMM1 movdqa %%XMM1, %%T2 ; result in %%T6, %%T7, %%XMM1 ; Karatsuba Method movdqa %%T1, %%XMM2 pshufd %%T2, %%XMM2, 01001110b pxor %%T2, %%XMM2 movdqu %%T5, [%%GDATA + HashKey_7] pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 pclmulqdq %%XMM2, %%T5, 0x00 ; %%XMM2 = a0*b0 movdqu %%T4, [%%GDATA + HashKey_7_k] pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) pxor %%T6, %%T1 pxor %%T7, %%XMM2 pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 ; Karatsuba Method movdqa %%T1, %%XMM3 pshufd %%T2, %%XMM3, 01001110b pxor %%T2, %%XMM3 movdqu %%T5, [%%GDATA + HashKey_6] pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 pclmulqdq %%XMM3, %%T5, 0x00 ; %%XMM3 = a0*b0 movdqu %%T4, [%%GDATA + HashKey_6_k] pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) pxor %%T6, %%T1 pxor %%T7, %%XMM3 pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 ; Karatsuba Method movdqa %%T1, %%XMM4 pshufd %%T2, %%XMM4, 01001110b pxor %%T2, %%XMM4 movdqu %%T5, [%%GDATA + HashKey_5] pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 pclmulqdq %%XMM4, %%T5, 0x00 ; %%XMM3 = a0*b0 movdqu %%T4, [%%GDATA + HashKey_5_k] pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) pxor %%T6, %%T1 pxor %%T7, %%XMM4 pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 ; Karatsuba Method movdqa %%T1, %%XMM5 pshufd %%T2, %%XMM5, 01001110b pxor %%T2, %%XMM5 movdqu %%T5, [%%GDATA + HashKey_4] pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 pclmulqdq %%XMM5, %%T5, 0x00 ; %%XMM3 = a0*b0 movdqu %%T4, [%%GDATA + HashKey_4_k] pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) pxor %%T6, %%T1 pxor %%T7, %%XMM5 pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 ; Karatsuba Method movdqa %%T1, %%XMM6 pshufd %%T2, %%XMM6, 01001110b pxor %%T2, %%XMM6 movdqu %%T5, [%%GDATA + HashKey_3] pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 pclmulqdq %%XMM6, %%T5, 0x00 ; %%XMM3 = a0*b0 movdqu %%T4, [%%GDATA + HashKey_3_k] pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) pxor %%T6, %%T1 pxor %%T7, %%XMM6 pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 ; Karatsuba Method movdqa %%T1, %%XMM7 pshufd %%T2, %%XMM7, 01001110b pxor %%T2, %%XMM7 movdqu %%T5, [%%GDATA + HashKey_2] pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 pclmulqdq %%XMM7, %%T5, 0x00 ; %%XMM3 = a0*b0 movdqu %%T4, [%%GDATA + HashKey_2_k] pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) pxor %%T6, %%T1 pxor 
%%T7, %%XMM7 pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 ; Karatsuba Method movdqa %%T1, %%XMM8 pshufd %%T2, %%XMM8, 01001110b pxor %%T2, %%XMM8 movdqu %%T5, [%%GDATA + HashKey] pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 pclmulqdq %%XMM8, %%T5, 0x00 ; %%XMM4 = a0*b0 movdqu %%T4, [%%GDATA + HashKey_k] pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) pxor %%T6, %%T1 pxor %%T7, %%XMM8 pxor %%T2, %%XMM1 pxor %%T2, %%T6 pxor %%T2, %%T7 ; middle section of the temp results combined as in Karatsuba algorithm movdqa %%T4, %%T2 pslldq %%T4, 8 ; shift-L %%T4 2 DWs psrldq %%T2, 8 ; shift-R %%T2 2 DWs pxor %%T7, %%T4 pxor %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications ;first phase of the reduction movdqa %%T2, %%T7 movdqa %%T3, %%T7 movdqa %%T4, %%T7 ; move %%T7 into %%T2, %%T3, %%T4 in order to perform the three shifts independently pslld %%T2, 31 ; packed right shifting << 31 pslld %%T3, 30 ; packed right shifting shift << 30 pslld %%T4, 25 ; packed right shifting shift << 25 pxor %%T2, %%T3 ; xor the shifted versions pxor %%T2, %%T4 movdqa %%T1, %%T2 psrldq %%T1, 4 ; shift-R %%T1 1 DW pslldq %%T2, 12 ; shift-L %%T2 3 DWs pxor %%T7, %%T2 ; first phase of the reduction complete ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;second phase of the reduction movdqa %%T2,%%T7 ; make 3 copies of %%T7 (in in %%T2, %%T3, %%T4) for doing three shift operations movdqa %%T3,%%T7 movdqa %%T4,%%T7 psrld %%T2,1 ; packed left shifting >> 1 psrld %%T3,2 ; packed left shifting >> 2 psrld %%T4,7 ; packed left shifting >> 7 pxor %%T2,%%T3 ; xor the shifted versions pxor %%T2,%%T4 pxor %%T2, %%T1 pxor %%T7, %%T2 pxor %%T6, %%T7 ; the result is in %%T6 %endmacro ; Encryption of a single block %macro ENCRYPT_SINGLE_BLOCK 3 %define %%GDATA %1 %define %%ST %2 %define %%T1 %3 movdqu %%T1, [%%GDATA+16*0] pxor %%ST, %%T1 %assign i 1 %rep NROUNDS movdqu %%T1, [%%GDATA+16*i] aesenc %%ST, %%T1 %assign i (i+1) %endrep movdqu %%T1, [%%GDATA+16*i] aesenclast %%ST, %%T1 %endmacro ;; Start of Stack Setup %macro FUNC_SAVE 0 ;; Required for Update/GCM_ENC ;the number of pushes must equal STACK_OFFSET push r12 push r13 push r14 push r15 mov r14, rsp sub rsp, VARIABLE_OFFSET and rsp, ~63 %ifidn __OUTPUT_FORMAT__, win64 ; xmm6:xmm15 need to be maintained for Windows movdqu [rsp + LOCAL_STORAGE + 0*16],xmm6 movdqu [rsp + LOCAL_STORAGE + 1*16],xmm7 movdqu [rsp + LOCAL_STORAGE + 2*16],xmm8 movdqu [rsp + LOCAL_STORAGE + 3*16],xmm9 movdqu [rsp + LOCAL_STORAGE + 4*16],xmm10 movdqu [rsp + LOCAL_STORAGE + 5*16],xmm11 movdqu [rsp + LOCAL_STORAGE + 6*16],xmm12 movdqu [rsp + LOCAL_STORAGE + 7*16],xmm13 movdqu [rsp + LOCAL_STORAGE + 8*16],xmm14 movdqu [rsp + LOCAL_STORAGE + 9*16],xmm15 %endif %endmacro %macro FUNC_RESTORE 0 %ifidn __OUTPUT_FORMAT__, win64 movdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16] movdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16] movdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16] movdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16] movdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16] movdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16] movdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16] movdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16] movdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16] movdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16] %endif ;; Required for Update/GCM_ENC mov rsp, r14 pop r15 pop r14 pop r13 pop r12 %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; GCM_INIT initializes a gcm_context_data struct 
to prepare for encoding/decoding. ; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV, ; Additional Authentication data (A_IN), Additional Data length (A_LEN). ; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA. ; Clobbers rax, r10-r13 and xmm0-xmm6 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro GCM_INIT 5 %define %%GDATA_KEY %1 %define %%GDATA_CTX %2 %define %%IV %3 %define %%A_IN %4 %define %%A_LEN %5 %define %%AAD_HASH xmm0 %define %%SUBHASH xmm1 movdqu %%SUBHASH, [%%GDATA_KEY + HashKey] CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax pxor xmm2, xmm3 mov r10, %%A_LEN movdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash mov [%%GDATA_CTX + AadLen], r10 ; ctx_data.aad_length = aad_length xor r10, r10 mov [%%GDATA_CTX + InLen], r10 ; ctx_data.in_length = 0 mov [%%GDATA_CTX + PBlockLen], r10 ; ctx_data.partial_block_length = 0 movdqu [%%GDATA_CTX + PBlockEncKey], xmm2 ; ctx_data.partial_block_enc_key = 0 mov r10, %%IV movdqa xmm2, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001 pinsrq xmm2, [r10], 0 pinsrd xmm2, [r10+8], 2 movdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv pshufb xmm2, [SHUF_MASK] movdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data ; struct has been initialized by GCM_INIT. ; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA. ; Input: gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_CYPH_IN), ; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC) ; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX ; Clobbers rax, r10-r15, and xmm0-xmm15 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro GCM_ENC_DEC 6 %define %%GDATA_KEY %1 %define %%GDATA_CTX %2 %define %%CYPH_PLAIN_OUT %3 %define %%PLAIN_CYPH_IN %4 %define %%PLAIN_CYPH_LEN %5 %define %%ENC_DEC %6 %define %%DATA_OFFSET r11 ; Macro flow: ; calculate the number of 16byte blocks in the message ; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted' ; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left' ; if there is a block of less tahn 16 bytes process it '%%_zero_cipher_left .. 
%%_multiple_of_16_bytes' cmp %%PLAIN_CYPH_LEN, 0 je %%_multiple_of_16_bytes xor %%DATA_OFFSET, %%DATA_OFFSET %ifidn __OUTPUT_FORMAT__, win64 mov r12, %%PLAIN_CYPH_LEN add [%%GDATA_CTX + InLen], r12 ;Update length of data processed %else add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN ;Update length of data processed %endif movdqu xmm13, [%%GDATA_KEY + HashKey] ; xmm13 = HashKey movdqu xmm8, [%%GDATA_CTX + AadHash] PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC mov r13, %%PLAIN_CYPH_LEN ; save the number of bytes of plaintext/ciphertext sub r13, %%DATA_OFFSET mov r10, r13 ;save the amount of data left to process in r10 and r13, -16 ; r13 = r13 - (r13 mod 16) mov r12, r13 shr r12, 4 and r12, 7 jz %%_initial_num_blocks_is_0 cmp r12, 7 je %%_initial_num_blocks_is_7 cmp r12, 6 je %%_initial_num_blocks_is_6 cmp r12, 5 je %%_initial_num_blocks_is_5 cmp r12, 4 je %%_initial_num_blocks_is_4 cmp r12, 3 je %%_initial_num_blocks_is_3 cmp r12, 2 je %%_initial_num_blocks_is_2 jmp %%_initial_num_blocks_is_1 %%_initial_num_blocks_is_7: INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC sub r13, 16*7 jmp %%_initial_blocks_encrypted %%_initial_num_blocks_is_6: INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC sub r13, 16*6 jmp %%_initial_blocks_encrypted %%_initial_num_blocks_is_5: INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC sub r13, 16*5 jmp %%_initial_blocks_encrypted %%_initial_num_blocks_is_4: INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC sub r13, 16*4 jmp %%_initial_blocks_encrypted %%_initial_num_blocks_is_3: INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC sub r13, 16*3 jmp %%_initial_blocks_encrypted %%_initial_num_blocks_is_2: INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC sub r13, 16*2 jmp %%_initial_blocks_encrypted %%_initial_num_blocks_is_1: INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC sub r13, 16 jmp %%_initial_blocks_encrypted %%_initial_num_blocks_is_0: INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC %%_initial_blocks_encrypted: cmp r13, 0 je %%_zero_cipher_left sub r13, 128 je %%_eight_cipher_left movd r15d, xmm9 and r15d, 255 pshufb xmm9, [SHUF_MASK] %%_encrypt_by_8_new: cmp r15d, 255-8 jg %%_encrypt_by_8 add r15b, 8 
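;; Note (annotation added for clarity; a sketch of the intent, not original text):
;; r15d tracks the low byte of the current counter block. If incrementing it by 8
;; would overflow that byte, control takes the %%_encrypt_by_8 path, which
;; re-shuffles the counter so the carry propagates across the full counter before
;; calling the parallel routine with in_order; otherwise the cheaper out_order
;; path below is used with the counter kept in its already-shuffled form.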
GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC add %%DATA_OFFSET, 128 sub r13, 128 jne %%_encrypt_by_8_new pshufb xmm9, [SHUF_MASK] jmp %%_eight_cipher_left %%_encrypt_by_8: pshufb xmm9, [SHUF_MASK] add r15b, 8 GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC pshufb xmm9, [SHUF_MASK] add %%DATA_OFFSET, 128 sub r13, 128 jne %%_encrypt_by_8_new pshufb xmm9, [SHUF_MASK] %%_eight_cipher_left: GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8 %%_zero_cipher_left: movdqu [%%GDATA_CTX + AadHash], xmm14 movdqu [%%GDATA_CTX + CurCount], xmm9 mov r13, r10 and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16) je %%_multiple_of_16_bytes mov [%%GDATA_CTX + PBlockLen], r13 ; my_ctx.data.partial_blck_length = r13 ; handle the last <16 Byte block seperately paddd xmm9, [ONE] ; INCR CNT to get Yn movdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx.data.current_counter = xmm9 pshufb xmm9, [SHUF_MASK] ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9, xmm2 ; E(K, Yn) movdqu [%%GDATA_CTX + PBlockEncKey], xmm9 ; my_ctx_data.partial_block_enc_key = xmm9 cmp %%PLAIN_CYPH_LEN, 16 jge %%_large_enough_update lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax lea r12, [SHIFT_MASK + 16] sub r12, r13 jmp %%_data_read %%_large_enough_update: sub %%DATA_OFFSET, 16 add %%DATA_OFFSET, r13 movdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block sub %%DATA_OFFSET, r13 add %%DATA_OFFSET, 16 lea r12, [SHIFT_MASK + 16] sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16) movdqu xmm2, [r12] ; get the appropriate shuffle mask pshufb xmm1, xmm2 ; shift right 16-r13 bytes %%_data_read: %ifidn %%ENC_DEC, DEC movdqa xmm2, xmm1 pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) movdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9 pand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9 pand xmm2, xmm1 pshufb xmm2, [SHUF_MASK] pxor xmm14, xmm2 movdqu [%%GDATA_CTX + AadHash], xmm14 %else pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) movdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9 pand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9 pshufb xmm9, [SHUF_MASK] pxor xmm14, xmm9 movdqu [%%GDATA_CTX + AadHash], xmm14 pshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext %endif ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; output r13 Bytes movq rax, xmm9 cmp r13, 8 jle %%_less_than_8_bytes_left mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax add %%DATA_OFFSET, 8 psrldq xmm9, 8 movq rax, xmm9 sub r13, 8 %%_less_than_8_bytes_left: mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al add %%DATA_OFFSET, 1 shr rax, 8 sub r13, 1 jne %%_less_than_8_bytes_left ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %%_multiple_of_16_bytes: %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; GCM_COMPLETE Finishes Encyrption/Decryption of last partial block after GCM_UPDATE finishes. ; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data * (GDATA_CTX) and ; whether encoding or decoding (ENC_DEC). 
; Output: Authorization Tag (AUTH_TAG) and Authorization Tag length (AUTH_TAG_LEN) ; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %macro GCM_COMPLETE 5 %define %%GDATA_KEY %1 %define %%GDATA_CTX %2 %define %%AUTH_TAG %3 %define %%AUTH_TAG_LEN %4 %define %%ENC_DEC %5 %define %%PLAIN_CYPH_LEN rax mov r12, [%%GDATA_CTX + PBlockLen] ; r12 = aadLen (number of bytes) movdqu xmm14, [%%GDATA_CTX + AadHash] movdqu xmm13, [%%GDATA_KEY + HashKey] cmp r12, 0 je %%_partial_done GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block movdqu [%%GDATA_CTX + AadHash], xmm14 %%_partial_done: mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes) mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen] shl r12, 3 ; convert into number of bits movd xmm15, r12d ; len(A) in xmm15 shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*128) movq xmm1, %%PLAIN_CYPH_LEN pslldq xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000 pxor xmm15, xmm1 ; xmm15 = len(A)||len(C) pxor xmm14, xmm15 GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation pshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap movdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0 ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9, xmm2 ; E(K, Y0) pxor xmm9, xmm14 %%_return_T: mov r10, %%AUTH_TAG ; r10 = authTag mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len cmp r11, 16 je %%_T_16 cmp r11, 12 je %%_T_12 %%_T_8: movq rax, xmm9 mov [r10], rax jmp %%_return_T_done %%_T_12: movq rax, xmm9 mov [r10], rax psrldq xmm9, 8 movd eax, xmm9 mov [r10 + 8], eax jmp %%_return_T_done %%_T_16: movdqu [r10], xmm9 %%_return_T_done: %endmacro ;GCM_COMPLETE ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void aes_gcm_precomp_128_sse / aes_gcm_precomp_192_sse / aes_gcm_precomp_256_sse ; (struct gcm_key_data *key_data); ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MKGLOBAL(FN_NAME(precomp,_),function,) FN_NAME(precomp,_): push r12 push r13 push r14 push r15 mov r14, rsp sub rsp, VARIABLE_OFFSET and rsp, ~63 ; align rsp to 64 bytes %ifidn __OUTPUT_FORMAT__, win64 ; only xmm6 needs to be maintained movdqu [rsp + LOCAL_STORAGE + 0*16],xmm6 %endif pxor xmm6, xmm6 ENCRYPT_SINGLE_BLOCK arg1, xmm6, xmm2 ; xmm6 = HashKey pshufb xmm6, [SHUF_MASK] ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;; movdqa xmm2, xmm6 psllq xmm6, 1 psrlq xmm2, 63 movdqa xmm1, xmm2 pslldq xmm2, 8 psrldq xmm1, 8 por xmm6, xmm2 ;reduction pshufd xmm2, xmm1, 00100100b pcmpeqd xmm2, [TWOONE] pand xmm2, [POLY] pxor xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; movdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 %ifidn __OUTPUT_FORMAT__, win64 movdqu xmm6, [rsp + LOCAL_STORAGE + 0*16] %endif mov rsp, r14 pop r15 pop r14 pop r13 pop r12 ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void aes_gcm_init_128_sse / aes_gcm_init_192_sse / aes_gcm_init_256_sse ( ; const struct gcm_key_data *key_data, ; struct gcm_context_data *context_data, ; u8 *iv, ; const u8 *aad, ; u64 aad_len); ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MKGLOBAL(FN_NAME(init,_),function,) FN_NAME(init,_): push r12 push r13 %ifidn __OUTPUT_FORMAT__, win64 push r14 push r15 mov 
r14, rsp ; xmm6:xmm15 need to be maintained for Windows sub rsp, 1*16 movdqu [rsp + 0*16], xmm6 %endif GCM_INIT arg1, arg2, arg3, arg4, arg5 %ifidn __OUTPUT_FORMAT__, win64 movdqu xmm6 , [rsp + 0*16] mov rsp, r14 pop r15 pop r14 %endif pop r13 pop r12 ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void aes_gcm_enc_128_update_sse / aes_gcm_enc_192_update_sse / aes_gcm_enc_256_update_sse ; const struct gcm_key_data *key_data, ; struct gcm_context_data *context_data, ; u8 *out, ; const u8 *in, ; u64 plaintext_len); ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MKGLOBAL(FN_NAME(enc,_update_),function,) FN_NAME(enc,_update_): FUNC_SAVE GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC FUNC_RESTORE ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void aes_gcm_dec_256_update_sse / aes_gcm_dec_192_update_sse / aes_gcm_dec_256_update_sse ; const struct gcm_key_data *key_data, ; struct gcm_context_data *context_data, ; u8 *out, ; const u8 *in, ; u64 plaintext_len); ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MKGLOBAL(FN_NAME(dec,_update_),function,) FN_NAME(dec,_update_): FUNC_SAVE GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC FUNC_RESTORE ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void aes_gcm_enc_128_finalize_sse / aes_gcm_enc_192_finalize_sse / aes_gcm_enc_256_finalize_sse ; const struct gcm_key_data *key_data, ; struct gcm_context_data *context_data, ; u8 *auth_tag, ; u64 auth_tag_len); ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MKGLOBAL(FN_NAME(enc,_finalize_),function,) FN_NAME(enc,_finalize_): push r12 %ifidn __OUTPUT_FORMAT__, win64 ; xmm6:xmm15 need to be maintained for Windows sub rsp, 5*16 movdqu [rsp + 0*16],xmm6 movdqu [rsp + 1*16],xmm9 movdqu [rsp + 2*16],xmm11 movdqu [rsp + 3*16],xmm14 movdqu [rsp + 4*16],xmm15 %endif GCM_COMPLETE arg1, arg2, arg3, arg4, ENC %ifidn __OUTPUT_FORMAT__, win64 movdqu xmm15 , [rsp + 4*16] movdqu xmm14 , [rsp+ 3*16] movdqu xmm11 , [rsp + 2*16] movdqu xmm9 , [rsp + 1*16] movdqu xmm6 , [rsp + 0*16] add rsp, 5*16 %endif pop r12 ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void aes_gcm_dec_128_finalize_sse / aes_gcm_dec_192_finalize_sse / aes_gcm_dec_256_finalize_sse ; const struct gcm_key_data *key_data, ; struct gcm_context_data *context_data, ; u8 *auth_tag, ; u64 auth_tag_len); ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MKGLOBAL(FN_NAME(dec,_finalize_),function,) FN_NAME(dec,_finalize_): push r12 %ifidn __OUTPUT_FORMAT__, win64 ; xmm6:xmm15 need to be maintained for Windows sub rsp, 5*16 movdqu [rsp + 0*16],xmm6 movdqu [rsp + 1*16],xmm9 movdqu [rsp + 2*16],xmm11 movdqu [rsp + 3*16],xmm14 movdqu [rsp + 4*16],xmm15 %endif GCM_COMPLETE arg1, arg2, arg3, arg4, DEC %ifidn __OUTPUT_FORMAT__, win64 movdqu xmm15 , [rsp + 4*16] movdqu xmm14 , [rsp+ 3*16] movdqu xmm11 , [rsp + 2*16] movdqu xmm9 , [rsp + 1*16] movdqu xmm6 , [rsp + 0*16] add rsp, 5*16 %endif pop r12 ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void aes_gcm_enc_128_sse / aes_gcm_enc_192_sse / aes_gcm_enc_256_sse ; const struct gcm_key_data *key_data, ; struct gcm_context_data *context_data, ; u8 
*out, ; const u8 *in, ; u64 plaintext_len, ; u8 *iv, ; const u8 *aad, ; u64 aad_len, ; u8 *auth_tag, ; u64 auth_tag_len); ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MKGLOBAL(FN_NAME(enc,_),function,) FN_NAME(enc,_): FUNC_SAVE GCM_INIT arg1, arg2, arg6, arg7, arg8 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC GCM_COMPLETE arg1, arg2, arg9, arg10, ENC FUNC_RESTORE ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void aes_gcm_dec_128_sse / aes_gcm_dec_192_sse / aes_gcm_dec_256_sse ; const struct gcm_key_data *key_data, ; struct gcm_context_data *context_data, ; u8 *out, ; const u8 *in, ; u64 plaintext_len, ; u8 *iv, ; const u8 *aad, ; u64 aad_len, ; u8 *auth_tag, ; u64 auth_tag_len); ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MKGLOBAL(FN_NAME(dec,_),function,) FN_NAME(dec,_): FUNC_SAVE GCM_INIT arg1, arg2, arg6, arg7, arg8 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC GCM_COMPLETE arg1, arg2, arg9, arg10, DEC FUNC_RESTORE ret intel-ipsec-mb-0.48/sse/mb_mgr_aes192_flush_sse.asm000066400000000000000000000032561321406316400221610ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %define AES_CBC_ENC_X4 aes_cbc_enc_192_x4 %define FLUSH_JOB_AES_ENC flush_job_aes192_enc_sse %include "mb_mgr_aes_flush_sse.asm" intel-ipsec-mb-0.48/sse/mb_mgr_aes192_submit_sse.asm000066400000000000000000000032611321406316400223370ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. 
;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %define AES_CBC_ENC_X4 aes_cbc_enc_192_x4 %define SUBMIT_JOB_AES_ENC submit_job_aes192_enc_sse %include "mb_mgr_aes_submit_sse.asm" intel-ipsec-mb-0.48/sse/mb_mgr_aes256_flush_sse.asm000066400000000000000000000032561321406316400221620ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;; %define AES_CBC_ENC_X4 aes_cbc_enc_256_x4 %define FLUSH_JOB_AES_ENC flush_job_aes256_enc_sse %include "mb_mgr_aes_flush_sse.asm" intel-ipsec-mb-0.48/sse/mb_mgr_aes256_submit_sse.asm000066400000000000000000000032611321406316400223400ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %define AES_CBC_ENC_X4 aes_cbc_enc_256_x4 %define SUBMIT_JOB_AES_ENC submit_job_aes256_enc_sse %include "mb_mgr_aes_submit_sse.asm" intel-ipsec-mb-0.48/sse/mb_mgr_aes_flush_sse.asm000066400000000000000000000132041321406316400217170ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" %ifndef AES_CBC_ENC_X4 %define AES_CBC_ENC_X4 aes_cbc_enc_128_x4 %define FLUSH_JOB_AES_ENC flush_job_aes128_enc_sse %endif ; void AES_CBC_ENC_X4(AES_ARGS_x8 *args, UINT64 len_in_bytes); extern AES_CBC_ENC_X4 section .data default rel align 16 len_masks: ;ddq 0x0000000000000000000000000000FFFF dq 0x000000000000FFFF, 0x0000000000000000 ;ddq 0x000000000000000000000000FFFF0000 dq 0x00000000FFFF0000, 0x0000000000000000 ;ddq 0x00000000000000000000FFFF00000000 dq 0x0000FFFF00000000, 0x0000000000000000 ;ddq 0x0000000000000000FFFF000000000000 dq 0xFFFF000000000000, 0x0000000000000000 one: dq 1 two: dq 2 three: dq 3 section .text %define APPEND(a,b) a %+ b %ifdef LINUX %define arg1 rdi %define arg2 rsi %else %define arg1 rcx %define arg2 rdx %endif %define state arg1 %define job arg2 %define len2 arg2 %define job_rax rax %if 1 %define unused_lanes rbx %define tmp1 rbx %define good_lane rdx %define iv rdx %define tmp2 rax ; idx needs to be in rbp %define tmp rbp %define idx rbp %define tmp3 r8 %endif ; STACK_SPACE needs to be an odd multiple of 8 ; This routine and its callee clobbers all GPRs struc STACK _gpr_save: resq 8 _rsp_save: resq 1 endstruc ; JOB* FLUSH_JOB_AES_ENC(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job) ; arg 1 : state ; arg 2 : job MKGLOBAL(FLUSH_JOB_AES_ENC,function,internal) FLUSH_JOB_AES_ENC: mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _gpr_save + 8*2], r12 mov [rsp + _gpr_save + 8*3], r13 mov [rsp + _gpr_save + 8*4], r14 mov [rsp + _gpr_save + 8*5], r15 %ifndef LINUX mov [rsp + _gpr_save + 8*6], rsi mov [rsp + _gpr_save + 8*7], rdi %endif mov [rsp + _rsp_save], rax ; original SP ; check for empty mov unused_lanes, [state + _aes_unused_lanes] bt unused_lanes, 32+7 jc return_null ; find a lane with a non-null job xor good_lane, good_lane cmp qword [state + _aes_job_in_lane + 1*8], 0 cmovne good_lane, [rel one] cmp qword [state + _aes_job_in_lane + 2*8], 0 cmovne good_lane, [rel two] cmp qword [state + _aes_job_in_lane + 3*8], 0 cmovne good_lane, [rel three] ; copy good_lane to empty lanes mov tmp1, [state + _aes_args_in + good_lane*8] mov tmp2, [state + _aes_args_out + good_lane*8] mov tmp3, [state + _aes_args_keys + good_lane*8] shl good_lane, 4 ; multiply by 16 movdqa xmm2, [state + _aes_args_IV + good_lane] movdqa xmm0, [state + _aes_lens] %assign I 0 %rep 4 cmp qword [state + _aes_job_in_lane + I*8], 0 jne APPEND(skip_,I) mov [state + _aes_args_in + I*8], tmp1 mov [state + _aes_args_out + I*8], tmp2 mov [state + _aes_args_keys + I*8], tmp3 movdqa [state + _aes_args_IV + I*16], xmm2 por xmm0, [rel len_masks + 16*I] APPEND(skip_,I): %assign I (I+1) %endrep ; Find min length phminposuw xmm1, xmm0 pextrw len2, xmm1, 0 ; min value pextrw idx, xmm1, 1 ; min index (0...3) cmp len2, 0 je len_is_0 pshuflw xmm1, xmm1, 0 psubw xmm0, xmm1 movdqa [state + _aes_lens], xmm0 ; "state" and 
"args" are the same address, arg1 ; len is arg2 call AES_CBC_ENC_X4 ; state and idx are intact len_is_0: ; process completed job "idx" mov job_rax, [state + _aes_job_in_lane + idx*8] ; Don't write back IV ; mov iv, [job_rax + _iv] mov unused_lanes, [state + _aes_unused_lanes] mov qword [state + _aes_job_in_lane + idx*8], 0 or dword [job_rax + _status], STS_COMPLETED_AES shl unused_lanes, 8 or unused_lanes, idx ; shl idx, 4 ; multiply by 16 mov [state + _aes_unused_lanes], unused_lanes ; movdqa xmm0, [state + _aes_args_IV + idx] ; movdqu [iv], xmm0 return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov r12, [rsp + _gpr_save + 8*2] mov r13, [rsp + _gpr_save + 8*3] mov r14, [rsp + _gpr_save + 8*4] mov r15, [rsp + _gpr_save + 8*5] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*6] mov rdi, [rsp + _gpr_save + 8*7] %endif mov rsp, [rsp + _rsp_save] ; original SP ret return_null: xor job_rax, job_rax jmp return intel-ipsec-mb-0.48/sse/mb_mgr_aes_submit_sse.asm000066400000000000000000000116411321406316400221040ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" %ifndef AES_CBC_ENC_X4 %define AES_CBC_ENC_X4 aes_cbc_enc_128_x4 %define SUBMIT_JOB_AES_ENC submit_job_aes128_enc_sse %endif ; void AES_CBC_ENC_X4(AES_ARGS_x8 *args, UINT64 len_in_bytes); extern AES_CBC_ENC_X4 %ifdef LINUX %define arg1 rdi %define arg2 rsi %else %define arg1 rcx %define arg2 rdx %endif %define state arg1 %define job arg2 %define len2 arg2 %define job_rax rax %if 1 ; idx needs to be in rbp %define len rbp %define idx rbp %define tmp rbp %define lane r8 %define iv r9 %define unused_lanes rbx %endif ; STACK_SPACE needs to be an odd multiple of 8 ; This routine and its callee clobbers all GPRs struc STACK _gpr_save: resq 8 _rsp_save: resq 1 endstruc ; JOB* SUBMIT_JOB_AES_ENC(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job) ; arg 1 : state ; arg 2 : job MKGLOBAL(SUBMIT_JOB_AES_ENC,function,internal) SUBMIT_JOB_AES_ENC: mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _gpr_save + 8*2], r12 mov [rsp + _gpr_save + 8*3], r13 mov [rsp + _gpr_save + 8*4], r14 mov [rsp + _gpr_save + 8*5], r15 %ifndef LINUX mov [rsp + _gpr_save + 8*6], rsi mov [rsp + _gpr_save + 8*7], rdi %endif mov [rsp + _rsp_save], rax ; original SP mov unused_lanes, [state + _aes_unused_lanes] movzx lane, BYTE(unused_lanes) shr unused_lanes, 8 mov len, [job + _msg_len_to_cipher_in_bytes] and len, -16 ; DOCSIS may pass size unaligned to block size mov iv, [job + _iv] mov [state + _aes_unused_lanes], unused_lanes mov [state + _aes_job_in_lane + lane*8], job mov [state + _aes_lens + 2*lane], WORD(len) mov tmp, [job + _src] add tmp, [job + _cipher_start_src_offset_in_bytes] movdqu xmm0, [iv] mov [state + _aes_args_in + lane*8], tmp mov tmp, [job + _aes_enc_key_expanded] mov [state + _aes_args_keys + lane*8], tmp mov tmp, [job + _dst] mov [state + _aes_args_out + lane*8], tmp shl lane, 4 ; multiply by 16 movdqa [state + _aes_args_IV + lane], xmm0 cmp unused_lanes, 0xff jne return_null ; Find min length movdqa xmm0, [state + _aes_lens] phminposuw xmm1, xmm0 pextrw len2, xmm1, 0 ; min value pextrw idx, xmm1, 1 ; min index (0...3) cmp len2, 0 je len_is_0 pshuflw xmm1, xmm1, 0 psubw xmm0, xmm1 movdqa [state + _aes_lens], xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call AES_CBC_ENC_X4 ; state and idx are intact len_is_0: ; process completed job "idx" mov job_rax, [state + _aes_job_in_lane + idx*8] ; Don't write back IV ; mov iv, [job_rax + _iv] mov unused_lanes, [state + _aes_unused_lanes] mov qword [state + _aes_job_in_lane + idx*8], 0 or dword [job_rax + _status], STS_COMPLETED_AES shl unused_lanes, 8 or unused_lanes, idx ; shl idx, 4 ; multiply by 16 mov [state + _aes_unused_lanes], unused_lanes ; movdqa xmm0, [state + _aes_args_IV + idx] ; movdqu [iv], xmm0 return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov r12, [rsp + _gpr_save + 8*2] mov r13, [rsp + _gpr_save + 8*3] mov r14, [rsp + _gpr_save + 8*4] mov r15, [rsp + _gpr_save + 8*5] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*6] mov rdi, [rsp + _gpr_save + 8*7] %endif mov rsp, [rsp + _rsp_save] ; original SP ret return_null: xor job_rax, job_rax jmp return intel-ipsec-mb-0.48/sse/mb_mgr_aes_xcbc_flush_sse.asm000066400000000000000000000145341321406316400227250ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that 
the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" %ifndef AES_XCBC_X4 %define AES_XCBC_X4 aes_xcbc_mac_128_x4 %define FLUSH_JOB_AES_XCBC flush_job_aes_xcbc_sse %endif ; void AES_XCBC_X4(AES_XCBC_ARGS_x8 *args, UINT64 len_in_bytes); extern AES_XCBC_X4 section .data default rel align 16 len_masks: ;ddq 0x0000000000000000000000000000FFFF dq 0x000000000000FFFF, 0x0000000000000000 ;ddq 0x000000000000000000000000FFFF0000 dq 0x00000000FFFF0000, 0x0000000000000000 ;ddq 0x00000000000000000000FFFF00000000 dq 0x0000FFFF00000000, 0x0000000000000000 ;ddq 0x0000000000000000FFFF000000000000 dq 0xFFFF000000000000, 0x0000000000000000 one: dq 1 two: dq 2 three: dq 3 section .text %define APPEND(a,b) a %+ b %ifdef LINUX %define arg1 rdi %define arg2 rsi %else %define arg1 rcx %define arg2 rdx %endif %define state arg1 %define job arg2 %define len2 arg2 %define job_rax rax %if 1 %define unused_lanes rbx %define tmp1 rbx %define icv rdx %define tmp2 rax ; idx needs to be in rbp %define tmp r10 %define idx rbp %define tmp3 r8 %define lane_data r9 %endif ; STACK_SPACE needs to be an odd multiple of 8 ; This routine and its callee clobbers all GPRs struc STACK _gpr_save: resq 8 _rsp_save: resq 1 endstruc ; JOB* FLUSH_JOB_AES_XCBC(MB_MGR_AES_XCBC_OOO *state, JOB_AES_HMAC *job) ; arg 1 : state ; arg 2 : job MKGLOBAL(FLUSH_JOB_AES_XCBC,function,internal) FLUSH_JOB_AES_XCBC: mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _gpr_save + 8*2], r12 mov [rsp + _gpr_save + 8*3], r13 mov [rsp + _gpr_save + 8*4], r14 mov [rsp + _gpr_save + 8*5], r15 %ifndef LINUX mov [rsp + _gpr_save + 8*6], rsi mov [rsp + _gpr_save + 8*7], rdi %endif mov [rsp + _rsp_save], rax ; original SP ; check for empty mov unused_lanes, [state + _aes_xcbc_unused_lanes] bt unused_lanes, 32+7 jc return_null ; find a lane with a non-null job xor idx, idx cmp qword [state + _aes_xcbc_ldata + 1 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 cmovne idx, [rel one] cmp qword [state + _aes_xcbc_ldata + 2 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 cmovne idx, [rel two] cmp qword [state + 
_aes_xcbc_ldata + 3 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 cmovne idx, [rel three] copy_lane_data: ; copy idx to empty lanes mov tmp1, [state + _aes_xcbc_args_in + idx*8] mov tmp3, [state + _aes_xcbc_args_keys + idx*8] shl idx, 4 ; multiply by 16 movdqa xmm2, [state + _aes_xcbc_args_ICV + idx] movdqa xmm0, [state + _aes_xcbc_lens] %assign I 0 %rep 4 cmp qword [state + _aes_xcbc_ldata + I * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 jne APPEND(skip_,I) mov [state + _aes_xcbc_args_in + I*8], tmp1 mov [state + _aes_xcbc_args_keys + I*8], tmp3 movdqa [state + _aes_xcbc_args_ICV + I*16], xmm2 por xmm0, [rel len_masks + 16*I] APPEND(skip_,I): %assign I (I+1) %endrep movdqa [state + _aes_xcbc_lens], xmm0 ; Find min length phminposuw xmm1, xmm0 pextrw len2, xmm1, 0 ; min value pextrw idx, xmm1, 1 ; min index (0...3) cmp len2, 0 je len_is_0 pshuflw xmm1, xmm1, 0 psubw xmm0, xmm1 movdqa [state + _aes_xcbc_lens], xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call AES_XCBC_X4 ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _XCBC_LANE_DATA_size lea lane_data, [state + _aes_xcbc_ldata + lane_data] cmp dword [lane_data + _xcbc_final_done], 0 jne end_loop mov dword [lane_data + _xcbc_final_done], 1 mov word [state + _aes_xcbc_lens + 2*idx], 16 lea tmp, [lane_data + _xcbc_final_block] mov [state + _aes_xcbc_args_in + 8*idx], tmp jmp copy_lane_data end_loop: mov job_rax, [lane_data + _xcbc_job_in_lane] mov icv, [job_rax + _auth_tag_output] mov unused_lanes, [state + _aes_xcbc_unused_lanes] mov qword [lane_data + _xcbc_job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC shl unused_lanes, 8 or unused_lanes, idx shl idx, 4 ; multiply by 16 mov [state + _aes_xcbc_unused_lanes], unused_lanes ; copy 12 bytes movdqa xmm0, [state + _aes_xcbc_args_ICV + idx] movq [icv], xmm0 pextrd [icv + 8], xmm0, 2 return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov r12, [rsp + _gpr_save + 8*2] mov r13, [rsp + _gpr_save + 8*3] mov r14, [rsp + _gpr_save + 8*4] mov r15, [rsp + _gpr_save + 8*5] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*6] mov rdi, [rsp + _gpr_save + 8*7] %endif mov rsp, [rsp + _rsp_save] ; original SP ret return_null: xor job_rax, job_rax jmp return intel-ipsec-mb-0.48/sse/mb_mgr_aes_xcbc_submit_sse.asm000066400000000000000000000157611321406316400231120ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" %include "memcpy.asm" %ifndef AES_XCBC_X4 %define AES_XCBC_X4 aes_xcbc_mac_128_x4 %define SUBMIT_JOB_AES_XCBC submit_job_aes_xcbc_sse %endif ; void AES_XCBC_X4(AES_XCBC_ARGS_x8 *args, UINT64 len_in_bytes); extern AES_XCBC_X4 section .data default rel align 16 x80: ;ddq 0x00000000000000000000000000000080 dq 0x0000000000000080, 0x0000000000000000 section .text %ifdef LINUX %define arg1 rdi %define arg2 rsi %else %define arg1 rcx %define arg2 rdx %endif %define state arg1 %define job arg2 %define len2 arg2 %define job_rax rax %if 1 ; idx needs to be in rbp %define idx rbp %define last_len rbp %define lane r8 %define icv r9 %define p2 r9 %define tmp r10 %define len r11 %define lane_data r12 %define p r13 %define tmp2 r14 %define unused_lanes rbx %endif ; STACK_SPACE needs to be an odd multiple of 8 ; This routine and its callee clobbers all GPRs struc STACK _gpr_save: resq 8 _rsp_save: resq 1 endstruc ; JOB* SUBMIT_JOB_AES_XCBC(MB_MGR_AES_XCBC_OOO *state, JOB_AES_HMAC *job) ; arg 1 : state ; arg 2 : job MKGLOBAL(SUBMIT_JOB_AES_XCBC,function,internal) SUBMIT_JOB_AES_XCBC: mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _gpr_save + 8*2], r12 mov [rsp + _gpr_save + 8*3], r13 mov [rsp + _gpr_save + 8*4], r14 mov [rsp + _gpr_save + 8*5], r15 %ifndef LINUX mov [rsp + _gpr_save + 8*6], rsi mov [rsp + _gpr_save + 8*7], rdi %endif mov [rsp + _rsp_save], rax ; original SP mov unused_lanes, [state + _aes_xcbc_unused_lanes] movzx lane, BYTE(unused_lanes) shr unused_lanes, 8 imul lane_data, lane, _XCBC_LANE_DATA_size lea lane_data, [state + _aes_xcbc_ldata + lane_data] mov [state + _aes_xcbc_unused_lanes], unused_lanes mov len, [job + _msg_len_to_hash_in_bytes] mov [lane_data + _xcbc_job_in_lane], job mov dword [lane_data + _xcbc_final_done], 0 mov tmp, [job + _k1_expanded] mov [state + _aes_xcbc_args_keys + lane*8], tmp mov p, [job + _src] add p, [job + _hash_start_src_offset_in_bytes] mov last_len, len cmp len, 16 jle small_buffer mov [state + _aes_xcbc_args_in + lane*8], p add p, len ; set point to end of data and last_len, 15 ; Check lsbs of msg len jnz slow_copy ; if not 16B mult, do slow copy fast_copy: movdqu xmm0, [p - 16] ; load last block M[n] mov tmp, [job + _k2] ; load K2 address movdqu xmm1, [tmp] ; load K2 pxor xmm0, xmm1 ; M[n] XOR K2 movdqa [lane_data + _xcbc_final_block], xmm0 sub len, 16 ; take last block off length end_fast_copy: mov [state + _aes_xcbc_lens + 2*lane], WORD(len) pxor xmm0, xmm0 shl lane, 4 ; multiply by 16 movdqa [state + _aes_xcbc_args_ICV + lane], xmm0 cmp unused_lanes, 0xff jne return_null start_loop: ; Find min length movdqa xmm0, [state + _aes_xcbc_lens] phminposuw xmm1, xmm0 pextrw len2, xmm1, 0 ; min value pextrw idx, xmm1, 1 ; min index (0...3) cmp len2, 0 je len_is_0 pshuflw xmm1, xmm1, 0 psubw xmm0, xmm1 movdqa [state + _aes_xcbc_lens], xmm0 ; "state" and "args" 
are the same address, arg1 ; len is arg2 call AES_XCBC_X4 ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _XCBC_LANE_DATA_size lea lane_data, [state + _aes_xcbc_ldata + lane_data] cmp dword [lane_data + _xcbc_final_done], 0 jne end_loop mov dword [lane_data + _xcbc_final_done], 1 mov word [state + _aes_xcbc_lens + 2*idx], 16 lea tmp, [lane_data + _xcbc_final_block] mov [state + _aes_xcbc_args_in + 8*idx], tmp jmp start_loop end_loop: ; process completed job "idx" mov job_rax, [lane_data + _xcbc_job_in_lane] mov icv, [job_rax + _auth_tag_output] mov unused_lanes, [state + _aes_xcbc_unused_lanes] mov qword [lane_data + _xcbc_job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC shl unused_lanes, 8 or unused_lanes, idx shl idx, 4 ; multiply by 16 mov [state + _aes_xcbc_unused_lanes], unused_lanes ; copy 12 bytes movdqa xmm0, [state + _aes_xcbc_args_ICV + idx] movq [icv], xmm0 pextrd [icv + 8], xmm0, 2 return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov r12, [rsp + _gpr_save + 8*2] mov r13, [rsp + _gpr_save + 8*3] mov r14, [rsp + _gpr_save + 8*4] mov r15, [rsp + _gpr_save + 8*5] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*6] mov rdi, [rsp + _gpr_save + 8*7] %endif mov rsp, [rsp + _rsp_save] ; original SP ret small_buffer: ; For buffers <= 16 Bytes ; The input data is set to final block lea tmp, [lane_data + _xcbc_final_block] ; final block mov [state + _aes_xcbc_args_in + lane*8], tmp add p, len ; set point to end of data cmp len, 16 je fast_copy slow_copy: and len, ~15 ; take final block off len sub p, last_len ; adjust data pointer lea p2, [lane_data + _xcbc_final_block + 16] ; upper part of final sub p2, last_len ; adjust data pointer backwards memcpy_sse_16_1 p2, p, last_len, tmp, tmp2 movdqa xmm0, [rel x80] ; fill reg with padding movdqu [lane_data + _xcbc_final_block + 16], xmm0 ; add padding movdqu xmm0, [p2] ; load final block to process mov tmp, [job + _k3] ; load K3 address movdqu xmm1, [tmp] ; load K3 pxor xmm0, xmm1 ; M[n] XOR K3 movdqu [lane_data + _xcbc_final_block], xmm0 ; write final block jmp end_fast_copy return_null: xor job_rax, job_rax jmp return intel-ipsec-mb-0.48/sse/mb_mgr_hmac_flush_ni_sse.asm000066400000000000000000000170021321406316400225450ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ;; In System V AMD64 ABI ;; calle saves: RBX, RBP, R12-R15 ;; Windows x64 ABI ;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15 ;; ;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Windows clobbers: RAX RCX RDX R8 ;; Windows preserves: RBX RBP RSI RDI R9 R10 R11 R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Linux clobbers: RAX RSI RDI R8 ;; Linux preserves: RBX RCX RDX RBP R9 R10 R11 R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; ;; Linux/Windows clobbers: xmm0 - xmm15 ;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" ;%define DO_DBGPRINT %include "dbgprint.asm" extern sha1_ni section .data default rel align 16 byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 dq 0x0405060700010203, 0x0c0d0e0f08090a0b one: dq 1 section .text %ifdef LINUX %define arg1 rdi %define arg2 rsi %else %define arg1 rcx %define arg2 rdx %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbx, rbp, r12-r15 %define idx rbp %define unused_lanes rbx %define lane_data rbx %define tmp2 rbx %define job_rax rax %define tmp1 rax %define size_offset rax %define tmp rax %define start_offset rax %define tmp3 arg1 %define extra_blocks arg2 %define p arg2 %define tmp4 r8 %define p2 r8 ; This routine clobbers rbx, rbp struc STACK _gpr_save: resq 4 _rsp_save: resq 1 endstruc %define APPEND(a,b) a %+ b ; JOB* flush_job_hmac_ni_sse(MB_MGR_HMAC_SHA_1_OOO *state) ; arg 1 : state MKGLOBAL(flush_job_hmac_ni_sse,function,internal) flush_job_hmac_ni_sse: mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp %ifndef LINUX mov [rsp + _gpr_save + 8*2], rsi mov [rsp + _gpr_save + 8*3], rdi %endif mov [rsp + _rsp_save], rax ; original SP DBGPRINTL "enter sha1-ni-sse flush" mov unused_lanes, [state + _unused_lanes] bt unused_lanes, 16+7 jc return_null ; find a lane with a non-null job, assume it is 0 then check 1 xor idx, idx cmp qword [state + _ldata + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 cmovne idx, [rel one] DBGPRINTL64 "idx:", idx copy_lane_data: ; copy valid lane (idx) to empty lanes mov tmp, [state + _args_data_ptr + PTR_SZ*idx] movzx len2, word [state + _lens + idx*2] DBGPRINTL64 "ptr", tmp ; there are only two lanes so if one is empty it is easy to determine which one xor idx, 1 mov [state + _args_data_ptr + PTR_SZ*idx], tmp xor idx, 1 ; No need to find min length - only two lanes available cmp len2, 0 je len_is_0 ; Set length on both lanes to 0 mov dword [state + _lens], 0 ; "state" and "args" are the same address, arg1 ; len is arg2 call sha1_ni ; state is intact len_is_0: ; process completed job "idx" imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword 
[lane_data + _outer_done], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done], 1 mov DWORD(size_offset), [lane_data + _size_offset] mov qword [lane_data + _extra_block + size_offset], 0 mov word [state + _lens + 2*idx], 1 DBGPRINTL64 "outer-block-index", idx lea tmp, [lane_data + _outer_block] DBGPRINTL64 "outer block ptr:", tmp mov [state + _args_data_ptr + PTR_SZ*idx], tmp ;; idx determines which column ;; read off from consecutive rows %if SHA1NI_DIGEST_ROW_SIZE != 20 %error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!" %endif lea p2, [idx + idx*4] movdqu xmm0, [state + _args_digest + p2*4] pshufb xmm0, [rel byteswap] mov DWORD(tmp), [state + _args_digest + p2*4 + 4*SHA1_DIGEST_WORD_SIZE] bswap DWORD(tmp) movdqa [lane_data + _outer_block], xmm0 mov [lane_data + _outer_block + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) DBGPRINTL_XMM "sha1 outer hash input words[0-3]", xmm0 DBGPRINTL64 "sha1 outer hash input word 4", tmp mov job, [lane_data + _job_in_lane] mov tmp, [job + _auth_key_xor_opad] movdqu xmm0, [tmp] mov DWORD(tmp), [tmp + 4*SHA1_DIGEST_WORD_SIZE] movdqu [state + _args_digest + p2*4], xmm0 mov [state + _args_digest + p2*4 + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) jmp copy_lane_data align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset] DBGPRINTL64 "extra blocks-start offset", start_offset mov [state + _lens + 2*idx], WORD(extra_blocks) DBGPRINTL64 "extra blocks-len", extra_blocks lea tmp, [lane_data + _extra_block + start_offset] DBGPRINTL64 "extra block ptr", tmp mov [state + _args_data_ptr + PTR_SZ*idx], tmp mov dword [lane_data + _extra_blocks], 0 jmp copy_lane_data return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane] mov qword [lane_data + _job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC mov unused_lanes, [state + _unused_lanes] shl unused_lanes, 8 or unused_lanes, idx mov [state + _unused_lanes], unused_lanes mov p, [job_rax + _auth_tag_output] ; copy 12 bytes %if SHA1NI_DIGEST_ROW_SIZE != 20 %error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!" %endif lea idx, [idx + idx*4] mov DWORD(tmp2), [state + _args_digest + idx*4 + 0*SHA1_DIGEST_WORD_SIZE] mov DWORD(tmp4), [state + _args_digest + idx*4 + 1*SHA1_DIGEST_WORD_SIZE] mov DWORD(tmp3), [state + _args_digest + idx*4 + 2*SHA1_DIGEST_WORD_SIZE] bswap DWORD(tmp2) bswap DWORD(tmp4) bswap DWORD(tmp3) mov [p + 0*4], DWORD(tmp2) mov [p + 1*4], DWORD(tmp4) mov [p + 2*4], DWORD(tmp3) return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*2] mov rdi, [rsp + _gpr_save + 8*3] %endif mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/sse/mb_mgr_hmac_flush_sse.asm000066400000000000000000000172441321406316400220670ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. 
;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" ;%define DO_DBGPRINT %include "dbgprint.asm" extern sha1_mult_sse section .data default rel align 16 byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 dq 0x0405060700010203, 0x0c0d0e0f08090a0b x80: ;ddq 0x00000000000000000000000000000080 dq 0x0000000000000080, 0x0000000000000000 x00: ;ddq 0x00000000000000000000000000000000 dq 0x0000000000000000, 0x0000000000000000 len_masks: ;ddq 0x0000000000000000000000000000FFFF dq 0x000000000000FFFF, 0x0000000000000000 ;ddq 0x000000000000000000000000FFFF0000 dq 0x00000000FFFF0000, 0x0000000000000000 ;ddq 0x00000000000000000000FFFF00000000 dq 0x0000FFFF00000000, 0x0000000000000000 ;ddq 0x0000000000000000FFFF000000000000 dq 0xFFFF000000000000, 0x0000000000000000 one: dq 1 two: dq 2 three: dq 3 section .text %if 1 %ifdef LINUX %define arg1 rdi %define arg2 rsi %else %define arg1 rcx %define arg2 rdx %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbx, rbp, r12-r15 %define idx rbp %define unused_lanes rbx %define lane_data rbx %define tmp2 rbx %define job_rax rax %define tmp1 rax %define size_offset rax %define tmp rax %define start_offset rax %define tmp3 arg1 %define extra_blocks arg2 %define p arg2 %define tmp4 r8 %endif ; This routine clobbers rbx, rbp struc STACK _gpr_save: resq 2 _rsp_save: resq 1 endstruc %define APPEND(a,b) a %+ b ; JOB* flush_job_hmac_sse(MB_MGR_HMAC_SHA_1_OOO *state) ; arg 1 : state MKGLOBAL(flush_job_hmac_sse,function,internal) flush_job_hmac_sse: mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _rsp_save], rax ; original SP DBGPRINTL "enter sha1-sse flush" mov unused_lanes, [state + _unused_lanes] bt unused_lanes, 32+7 jc return_null ; find a lane with a non-null job xor idx, idx cmp qword [state + _ldata + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 cmovne idx, [rel one] cmp qword [state + _ldata + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 cmovne idx, [rel two] cmp qword [state + _ldata + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 cmovne idx, [rel three] copy_lane_data: ; copy valid lane (idx) to empty lanes movdqa xmm0, [state + _lens] mov tmp, [state + _args_data_ptr + PTR_SZ*idx] %assign I 0 %rep 4 cmp qword [state + _ldata + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 jne APPEND(skip_,I) mov [state + _args_data_ptr + PTR_SZ*I], tmp por xmm0, [rel len_masks + 16*I] APPEND(skip_,I): %assign I (I+1) %endrep movdqa [state + 
_lens], xmm0 phminposuw xmm1, xmm0 pextrw len2, xmm1, 0 ; min value pextrw idx, xmm1, 1 ; min index (0...3) cmp len2, 0 je len_is_0 pshuflw xmm1, xmm1, 0 psubw xmm0, xmm1 movdqa [state + _lens], xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call sha1_mult_sse ; state is intact len_is_0: ; process completed job "idx" imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done], 1 mov DWORD(size_offset), [lane_data + _size_offset] mov qword [lane_data + _extra_block + size_offset], 0 mov word [state + _lens + 2*idx], 1 lea tmp, [lane_data + _outer_block] mov job, [lane_data + _job_in_lane] mov [state + _args_data_ptr + PTR_SZ*idx], tmp ;; idx determines which column ;; read off from consecutive rows movd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] pinsrd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1 pinsrd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2 pinsrd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3 pshufb xmm0, [rel byteswap] mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE] bswap DWORD(tmp) movdqa [lane_data + _outer_block], xmm0 mov [lane_data + _outer_block + 4*4], DWORD(tmp) DBGPRINTL_XMM "sha1 outer hash input words[0-3]", xmm0 DBGPRINTL64 "sha1 outer hash input word 4", tmp mov tmp, [job + _auth_key_xor_opad] movdqu xmm0, [tmp] mov DWORD(tmp), [tmp + 4*4] movd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0 pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1 pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2 pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3 mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp) jmp copy_lane_data align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset] mov [state + _lens + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr + PTR_SZ*idx], tmp mov dword [lane_data + _extra_blocks], 0 jmp copy_lane_data return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane] mov qword [lane_data + _job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC mov unused_lanes, [state + _unused_lanes] shl unused_lanes, 8 or unused_lanes, idx mov [state + _unused_lanes], unused_lanes mov p, [job_rax + _auth_tag_output] ; copy 12 bytes mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] mov DWORD(tmp4), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE] mov DWORD(tmp3), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE] bswap DWORD(tmp2) bswap DWORD(tmp4) bswap DWORD(tmp3) mov [p + 0*4], DWORD(tmp2) mov [p + 1*4], DWORD(tmp4) mov [p + 2*4], DWORD(tmp3) return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/sse/mb_mgr_hmac_md5_flush_sse.asm000066400000000000000000000205151321406316400226270ustar00rootroot00000000000000;; ;; Copyright (c) 
2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" extern md5_x4x2_sse section .data default rel align 16 dupw: ;ddq 0x01000100010001000100010001000100 dq 0x0100010001000100, 0x0100010001000100 len_masks: ;ddq 0x0000000000000000000000000000FFFF dq 0x000000000000FFFF, 0x0000000000000000 ;ddq 0x000000000000000000000000FFFF0000 dq 0x00000000FFFF0000, 0x0000000000000000 ;ddq 0x00000000000000000000FFFF00000000 dq 0x0000FFFF00000000, 0x0000000000000000 ;ddq 0x0000000000000000FFFF000000000000 dq 0xFFFF000000000000, 0x0000000000000000 ;ddq 0x000000000000FFFF0000000000000000 dq 0x0000000000000000, 0x000000000000FFFF ;ddq 0x00000000FFFF00000000000000000000 dq 0x0000000000000000, 0x00000000FFFF0000 ;ddq 0x0000FFFF000000000000000000000000 dq 0x0000000000000000, 0x0000FFFF00000000 ;ddq 0xFFFF0000000000000000000000000000 dq 0x0000000000000000, 0xFFFF000000000000 one: dq 1 two: dq 2 three: dq 3 four: dq 4 five: dq 5 six: dq 6 seven: dq 7 section .text %if 1 %ifdef LINUX %define arg1 rdi %define arg2 rsi %else %define arg1 rcx %define arg2 rdx %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbp %define idx rbp ; unused_lanes must be in rax-rdx %define unused_lanes rbx %define lane_data rbx %define tmp2 rbx %define job_rax rax %define tmp1 rax %define size_offset rax %define tmp rax %define start_offset rax %define tmp3 arg1 %define extra_blocks arg2 %define p arg2 %define tmp4 r8 %endif ; This routine and/or the called routine clobbers all GPRs struc STACK _gpr_save: resq 8 _rsp_save: resq 1 endstruc %define APPEND(a,b) a %+ b ; JOB* flush_job_hmac_md5_sse(MB_MGR_HMAC_MD5_OOO *state) ; arg 1 : rcx : state MKGLOBAL(flush_job_hmac_md5_sse,function,internal) flush_job_hmac_md5_sse: mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _gpr_save + 8*2], r12 mov [rsp + _gpr_save + 8*3], r13 mov [rsp + _gpr_save + 8*4], r14 mov [rsp + _gpr_save + 8*5], r15 %ifndef LINUX 
mov [rsp + _gpr_save + 8*6], rsi mov [rsp + _gpr_save + 8*7], rdi %endif mov [rsp + _rsp_save], rax ; original SP mov unused_lanes, [state + _unused_lanes_md5] bt unused_lanes, 32+3 jc return_null ; find a lane with a non-null job xor idx, idx cmp qword [state + _ldata_md5 + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 cmovne idx, [rel one] cmp qword [state + _ldata_md5 + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 cmovne idx, [rel two] cmp qword [state + _ldata_md5 + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 cmovne idx, [rel three] cmp qword [state + _ldata_md5 + 4 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 cmovne idx, [rel four] cmp qword [state + _ldata_md5 + 5 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 cmovne idx, [rel five] cmp qword [state + _ldata_md5 + 6 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 cmovne idx, [rel six] cmp qword [state + _ldata_md5 + 7 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 cmovne idx, [rel seven] copy_lane_data: ; copy good lane (idx) to empty lanes movdqa xmm0, [state + _lens_md5] mov tmp, [state + _args_data_ptr_md5 + PTR_SZ*idx] %assign I 0 %rep 8 cmp qword [state + _ldata_md5 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 jne APPEND(skip_,I) mov [state + _args_data_ptr_md5 + PTR_SZ*I], tmp por xmm0, [rel len_masks + 16*I] APPEND(skip_,I): %assign I (I+1) %endrep movdqa [state + _lens_md5], xmm0 phminposuw xmm1, xmm0 pextrw len2, xmm1, 0 ; min value pextrw idx, xmm1, 1 ; min index (0...3) cmp len2, 0 je len_is_0 pshufb xmm1, [rel dupw] ; duplicate words across all lanes psubw xmm0, xmm1 movdqa [state + _lens_md5], xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call md5_x4x2_sse ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata_md5 + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done], 1 mov DWORD(size_offset), [lane_data + _size_offset] mov qword [lane_data + _extra_block + size_offset], 0 mov word [state + _lens_md5 + 2*idx], 1 lea tmp, [lane_data + _outer_block] mov job, [lane_data + _job_in_lane] mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp movd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1 pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2 pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3 ; pshufb xmm0, [byteswap wrt rip] movdqa [lane_data + _outer_block], xmm0 mov tmp, [job + _auth_key_xor_opad] movdqu xmm0, [tmp] movd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0 pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1 pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2 pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3 jmp copy_lane_data align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset] mov [state + _lens_md5 + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp mov dword [lane_data + _extra_blocks], 0 jmp copy_lane_data return_null: xor job_rax, job_rax jmp return align 16 
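;; end_loop below retires the completed job in lane "idx":
;;  - the lane's _job_in_lane pointer is cleared and STS_COMPLETED_HMAC is
;;    OR-ed into the job status
;;  - the 4-bit lane index is pushed back onto the _unused_lanes_md5 free
;;    list, which is kept as packed nibbles in a GPR, i.e. roughly
;;        unused_lanes = (unused_lanes << 4) | idx;
;;  - the first 12 bytes of the lane's MD5 digest column are copied to
;;    _auth_tag_output; no byte swap is needed because MD5 state is already
;;    little endian (hence the bswap instructions left commented out below).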
end_loop: mov job_rax, [lane_data + _job_in_lane] mov qword [lane_data + _job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC mov unused_lanes, [state + _unused_lanes_md5] shl unused_lanes, 4 or unused_lanes, idx mov [state + _unused_lanes_md5], unused_lanes mov p, [job_rax + _auth_tag_output] ; copy 12 bytes mov DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] mov DWORD(tmp4), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE] mov DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE] ; bswap DWORD(tmp2) ; bswap DWORD(tmp4) ; bswap DWORD(tmp3) mov [p + 0*4], DWORD(tmp2) mov [p + 1*4], DWORD(tmp4) mov [p + 2*4], DWORD(tmp3) return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov r12, [rsp + _gpr_save + 8*2] mov r13, [rsp + _gpr_save + 8*3] mov r14, [rsp + _gpr_save + 8*4] mov r15, [rsp + _gpr_save + 8*5] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*6] mov rdi, [rsp + _gpr_save + 8*7] %endif mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/sse/mb_mgr_hmac_md5_submit_sse.asm000066400000000000000000000240521321406316400230110ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "memcpy.asm" %include "reg_sizes.asm" extern md5_x4x2_sse section .data default rel align 16 ;byteswap: ddq 0x0c0d0e0f08090a0b0405060700010203 dupw: ;ddq 0x01000100010001000100010001000100 dq 0x0100010001000100, 0x0100010001000100 section .text %if 1 %ifdef LINUX %define arg1 rdi %define arg2 rsi %define reg3 rcx %define reg4 rdx %else %define arg1 rcx %define arg2 rdx %define reg3 rdi %define reg4 rsi %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbp %define last_len rbp %define idx rbp %define p r11 %define start_offset r11 %define unused_lanes rbx %define tmp4 rbx %define job_rax rax %define len rax %define size_offset reg3 %define tmp2 reg3 %define lane reg4 %define tmp3 reg4 %define extra_blocks r8 %define tmp r9 %define p2 r9 %define lane_data r10 %endif ; This routine and/or the called routine clobbers all GPRs struc STACK _gpr_save: resq 8 _rsp_save: resq 1 endstruc ; JOB* submit_job_hmac_md5_sse(MB_MGR_HMAC_MD5_OOO *state, JOB_AES_HMAC *job) ; arg 1 : rcx : state ; arg 2 : rdx : job MKGLOBAL(submit_job_hmac_md5_sse,function,internal) submit_job_hmac_md5_sse: mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _gpr_save + 8*2], r12 mov [rsp + _gpr_save + 8*3], r13 mov [rsp + _gpr_save + 8*4], r14 mov [rsp + _gpr_save + 8*5], r15 %ifndef LINUX mov [rsp + _gpr_save + 8*6], rsi mov [rsp + _gpr_save + 8*7], rdi %endif mov [rsp + _rsp_save], rax ; original SP mov unused_lanes, [state + _unused_lanes_md5] mov lane, unused_lanes and lane, 0xF shr unused_lanes, 4 imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata_md5 + lane_data] mov [state + _unused_lanes_md5], unused_lanes mov len, [job + _msg_len_to_hash_in_bytes] mov tmp, len shr tmp, 6 ; divide by 64, len in terms of blocks mov [lane_data + _job_in_lane], job mov dword [lane_data + _outer_done], 0 mov [state + _lens_md5 + 2*lane], WORD(tmp) mov last_len, len and last_len, 63 lea extra_blocks, [last_len + 9 + 63] shr extra_blocks, 6 mov [lane_data + _extra_blocks], DWORD(extra_blocks) mov p, [job + _src] add p, [job + _hash_start_src_offset_in_bytes] mov [state + _args_data_ptr_md5 + PTR_SZ*lane], p cmp len, 64 jb copy_lt64 fast_copy: add p, len movdqu xmm0, [p - 64 + 0*16] movdqu xmm1, [p - 64 + 1*16] movdqu xmm2, [p - 64 + 2*16] movdqu xmm3, [p - 64 + 3*16] movdqa [lane_data + _extra_block + 0*16], xmm0 movdqa [lane_data + _extra_block + 1*16], xmm1 movdqa [lane_data + _extra_block + 2*16], xmm2 movdqa [lane_data + _extra_block + 3*16], xmm3 end_fast_copy: mov size_offset, extra_blocks shl size_offset, 6 sub size_offset, last_len add size_offset, 64-8 mov [lane_data + _size_offset], DWORD(size_offset) mov start_offset, 64 sub start_offset, last_len mov [lane_data + _start_offset], DWORD(start_offset) lea tmp, [8*64 + 8*len] ; bswap tmp mov [lane_data + _extra_block + size_offset], tmp mov tmp, [job + _auth_key_xor_ipad] movdqu xmm0, [tmp] movd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 0*MD5_DIGEST_ROW_SIZE], xmm0 pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1 pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2 pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3 test len, ~63 jnz ge64_bytes lt64_bytes: mov [state + _lens_md5 + 2*lane], WORD(extra_blocks) lea tmp, 
[lane_data + _extra_block + start_offset] mov [state + _args_data_ptr_md5 + PTR_SZ*lane], tmp mov dword [lane_data + _extra_blocks], 0 ge64_bytes: cmp unused_lanes, 0xf jne return_null jmp start_loop align 16 start_loop: ; Find min length movdqa xmm0, [state + _lens_md5] phminposuw xmm1, xmm0 pextrw len2, xmm1, 0 ; min value pextrw idx, xmm1, 1 ; min index (0...3) cmp len2, 0 je len_is_0 pshufb xmm1, [rel dupw] ; duplicate words across all lanes psubw xmm0, xmm1 movdqa [state + _lens_md5], xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call md5_x4x2_sse ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata_md5 + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done], 1 mov DWORD(size_offset), [lane_data + _size_offset] mov qword [lane_data + _extra_block + size_offset], 0 mov word [state + _lens_md5 + 2*idx], 1 lea tmp, [lane_data + _outer_block] mov job, [lane_data + _job_in_lane] mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp movd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1 pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2 pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3 ; pshufb xmm0, [rel byteswap] movdqa [lane_data + _outer_block], xmm0 mov tmp, [job + _auth_key_xor_opad] movdqu xmm0, [tmp] movd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0 pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1 pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2 pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3 jmp start_loop align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset] mov [state + _lens_md5 + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp mov dword [lane_data + _extra_blocks], 0 jmp start_loop align 16 copy_lt64: ;; less than one message block of data ;; beginning of source block ;; destination extrablock but backwards by len from where 0x80 pre-populated ;; p2 clobbers unused_lanes, undo before exiting lea p2, [lane_data + _extra_block + 64] sub p2, len memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 mov unused_lanes, [state + _unused_lanes_md5] jmp end_fast_copy return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane] mov unused_lanes, [state + _unused_lanes_md5] mov qword [lane_data + _job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC shl unused_lanes, 4 or unused_lanes, idx mov [state + _unused_lanes_md5], unused_lanes mov p, [job_rax + _auth_tag_output] ; copy 12 bytes mov DWORD(tmp), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] mov DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE] mov DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE] mov [p + 0*4], DWORD(tmp) mov [p + 1*4], DWORD(tmp2) mov [p + 2*4], DWORD(tmp3) return: mov rbx, [rsp + _gpr_save + 
8*0] mov rbp, [rsp + _gpr_save + 8*1] mov r12, [rsp + _gpr_save + 8*2] mov r13, [rsp + _gpr_save + 8*3] mov r14, [rsp + _gpr_save + 8*4] mov r15, [rsp + _gpr_save + 8*5] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*6] mov rdi, [rsp + _gpr_save + 8*7] %endif mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/sse/mb_mgr_hmac_sha_224_flush_ni_sse.asm000066400000000000000000000031531321406316400237710ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %define SHA224 %include "mb_mgr_hmac_sha_256_flush_ni_sse.asm" intel-ipsec-mb-0.48/sse/mb_mgr_hmac_sha_224_flush_sse.asm000066400000000000000000000032221321406316400233000ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %define FUNC flush_job_hmac_sha_224_sse %define SHA224 %include "mb_mgr_hmac_sha_256_flush_sse.asm" intel-ipsec-mb-0.48/sse/mb_mgr_hmac_sha_224_submit_ni_sse.asm000066400000000000000000000031541321406316400241540ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %define SHA224 %include "mb_mgr_hmac_sha_256_submit_ni_sse.asm" intel-ipsec-mb-0.48/sse/mb_mgr_hmac_sha_224_submit_sse.asm000066400000000000000000000032241321406316400234640ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %define FUNC submit_job_hmac_sha_224_sse %define SHA224 %include "mb_mgr_hmac_sha_256_submit_sse.asm" intel-ipsec-mb-0.48/sse/mb_mgr_hmac_sha_256_flush_ni_sse.asm000066400000000000000000000162051321406316400240000ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
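;;
;; The SHA-NI flush code in this file tracks only two lanes, so no
;; phminposuw-based minimum-length search is needed: it picks whichever lane
;; still holds a job (lane 0 is assumed first, then lane 1 is checked),
;; mirrors that lane's data pointer into the idle lane so both lanes stay
;; valid, and calls sha256_ni until the outstanding job completes.
;; A rough C-style sketch of the lane selection (illustrative only, field
;; names simplified):
;;      idx = (ldata_sha256[1].job_in_lane != NULL) ? 1 : 0;
;;      args_data_ptr_sha256[idx ^ 1] = args_data_ptr_sha256[idx];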
;; ;; In System V AMD64 ABI ;; calle saves: RBX, RBP, R12-R15 ;; Windows x64 ABI ;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15 ;; ;; Linux/Windows clobbers: xmm0 - xmm15 ;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" ;%define DO_DBGPRINT %include "dbgprint.asm" extern sha256_ni %ifdef LINUX %define arg1 rdi %define arg2 rsi %else %define arg1 rcx %define arg2 rdx %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbx, rbp, r13-r15 %define idx rbp %define unused_lanes rbx %define lane_data rbx %define tmp2 rbx %define job_rax rax %define tmp1 rax %define size_offset rax %define tmp rax %define start_offset rax %define tmp3 arg1 %define extra_blocks arg2 %define p arg2 %define tmp4 r8 %define tmp5 r9 %define tmp6 r10 %define bswap_xmm4 xmm4 struc STACK _gpr_save: resq 4 ;rbx, rbp, rsi (win), rdi (win) _rsp_save: resq 1 endstruc section .data default rel align 16 byteswap: dq 0x0405060700010203 dq 0x0c0d0e0f08090a0b one: dq 1 section .text %ifdef SHA224 ;; JOB* flush_job_hmac_sha_224_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state) ;; arg1 : state MKGLOBAL(flush_job_hmac_sha_224_ni_sse,function,internal) flush_job_hmac_sha_224_ni_sse: %else ;; JOB* flush_job_hmac_sha_256_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state) ;; arg1 : state MKGLOBAL(flush_job_hmac_sha_256_ni_sse,function,internal) flush_job_hmac_sha_256_ni_sse: %endif mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp %ifndef LINUX mov [rsp + _gpr_save + 8*2], rsi mov [rsp + _gpr_save + 8*3], rdi %endif mov [rsp + _rsp_save], rax ; original SP DBGPRINTL "enter sha256-ni-sse flush" mov unused_lanes, [state + _unused_lanes_sha256] bt unused_lanes, 16+7 jc return_null ; find a lane with a non-null job, assume it is 0 then check 1 xor idx, idx cmp qword [state + _ldata_sha256 + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 cmovne idx, [rel one] DBGPRINTL64 "idx:", idx copy_lane_data: ; copy idx to empty lanes mov tmp, [state + _args_data_ptr_sha256 + PTR_SZ*idx] xor len2, len2 mov WORD(len2), word [state + _lens_sha256 + idx*2] ; there are only two lanes so if one is empty it is easy to determine which one xor idx, 1 mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp xor idx, 1 ; No need to find min length - only two lanes available cmp len2, 0 je len_is_0 ; set length on both lanes to 0 mov dword [state + _lens_sha256], 0 ; "state" and "args" are the same address, arg1 ; len is arg2 call sha256_ni ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata_sha256 + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks] cmp extra_blocks, 0 jne proc_extra_blocks movdqa bswap_xmm4, [rel byteswap] cmp dword [lane_data + _outer_done], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done], 1 mov DWORD(size_offset), [lane_data + _size_offset] mov qword [lane_data + _extra_block + size_offset], 0 mov word [state + _lens_sha256 + 2*idx], 1 lea tmp, [lane_data + _outer_block] mov job, [lane_data + _job_in_lane] mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp %if SHA256NI_DIGEST_ROW_SIZE != 32 %error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!" 
%endif lea tmp4, [idx*8] ; x8 here + scale factor x4 below give x32 movdqu xmm0, [state + _args_digest_sha256 + tmp4*4] movdqu xmm1, [state + _args_digest_sha256 + tmp4*4 + 4*4] pshufb xmm0, bswap_xmm4 pshufb xmm1, bswap_xmm4 movdqa [lane_data + _outer_block], xmm0 movdqa [lane_data + _outer_block + 4*4], xmm1 %ifdef SHA224 ;; overwrite top 4 bytes with 0x80 mov dword [lane_data + _outer_block + 7*4], 0x80 %endif DBGPRINTL "sha256 outer hash input words:" DBGPRINT_XMM xmm0 DBGPRINT_XMM xmm1 mov tmp, [job + _auth_key_xor_opad] movdqu xmm0, [tmp] movdqu xmm1, [tmp + 4*4] DBGPRINTL64 "auth_key_xor_opad", tmp movdqu [state + _args_digest_sha256 + tmp4*4], xmm0 movdqu [state + _args_digest_sha256 + tmp4*4 + 4*4], xmm1 DBGPRINTL "new digest args" DBGPRINT_XMM xmm0 DBGPRINT_XMM xmm1 jmp copy_lane_data align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset] mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp mov dword [lane_data + _extra_blocks], 0 jmp copy_lane_data return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane] mov qword [lane_data + _job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC mov unused_lanes, [state + _unused_lanes_sha256] shl unused_lanes, 8 or unused_lanes, idx mov [state + _unused_lanes_sha256], unused_lanes mov p, [job_rax + _auth_tag_output] ; copy 16 bytes for SHA256, 14 bytes for SHA224 %if SHA256NI_DIGEST_ROW_SIZE != 32 %error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!" %endif shl idx, 5 movdqu xmm0, [state + _args_digest_sha256 + idx] pshufb xmm0, bswap_xmm4 %ifdef SHA224 ;; SHA224 movq [p + 0*4], xmm0 pextrd [p + 2*4], xmm0, 2 pextrw [p + 3*4], xmm0, 6 %else ;; SHA256 movdqu [p], xmm0 %endif DBGPRINTL "auth_tag_output:" DBGPRINT_XMM xmm0 return: DBGPRINTL "exit sha256-ni-sse flush" mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*2] mov rdi, [rsp + _gpr_save + 8*3] %endif mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/sse/mb_mgr_hmac_sha_256_flush_sse.asm000066400000000000000000000200321321406316400233030ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" extern sha_256_mult_sse section .data default rel align 16 byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 dq 0x0405060700010203, 0x0c0d0e0f08090a0b len_masks: ;ddq 0x0000000000000000000000000000FFFF dq 0x000000000000FFFF, 0x0000000000000000 ;ddq 0x000000000000000000000000FFFF0000 dq 0x00000000FFFF0000, 0x0000000000000000 ;ddq 0x00000000000000000000FFFF00000000 dq 0x0000FFFF00000000, 0x0000000000000000 ;ddq 0x0000000000000000FFFF000000000000 dq 0xFFFF000000000000, 0x0000000000000000 one: dq 1 two: dq 2 three: dq 3 section .text %ifndef FUNC %define FUNC flush_job_hmac_sha_256_sse %endif %if 1 %ifdef LINUX %define arg1 rdi %define arg2 rsi %else %define arg1 rcx %define arg2 rdx %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbx, rbp, r13-r15 %define idx rbp %define unused_lanes rbx %define lane_data rbx %define tmp2 rbx %define job_rax rax %define tmp1 rax %define size_offset rax %define tmp rax %define start_offset rax %define tmp3 arg1 %define extra_blocks arg2 %define p arg2 %define tmp4 r8 %define tmp5 r9 %define tmp6 r10 %endif ; This routine clobbers rbx, rbp; called routine also clobbers r12 struc STACK _gpr_save: resq 3 _rsp_save: resq 1 endstruc %define APPEND(a,b) a %+ b ; JOB* FUNC(MB_MGR_HMAC_SHA_256_OOO *state) ; arg 1 : rcx : state MKGLOBAL(FUNC,function,internal) FUNC: mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _gpr_save + 8*2], r12 mov [rsp + _rsp_save], rax ; original SP mov unused_lanes, [state + _unused_lanes_sha256] bt unused_lanes, 32+7 jc return_null ; find a lane with a non-null job xor idx, idx cmp qword [state + _ldata_sha256 + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 cmovne idx, [rel one] cmp qword [state + _ldata_sha256 + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 cmovne idx, [rel two] cmp qword [state + _ldata_sha256 + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 cmovne idx, [rel three] copy_lane_data: ; copy idx to empty lanes movdqa xmm0, [state + _lens_sha256] mov tmp, [state + _args_data_ptr_sha256 + 8*idx] %assign I 0 %rep 4 cmp qword [state + _ldata_sha256 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 jne APPEND(skip_,I) mov [state + _args_data_ptr_sha256 + 8*I], tmp por xmm0, [rel len_masks + 16*I] APPEND(skip_,I): %assign I (I+1) %endrep movdqa [state + _lens_sha256], xmm0 phminposuw xmm1, xmm0 pextrw len2, xmm1, 0 ; min value pextrw idx, xmm1, 1 ; min index (0...3) cmp len2, 0 je len_is_0 pshuflw xmm1, xmm1, 0 psubw xmm0, xmm1 movdqa [state + _lens_sha256], xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call sha_256_mult_sse ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata_sha256 + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks] cmp extra_blocks, 0 jne 
proc_extra_blocks cmp dword [lane_data + _outer_done], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done], 1 mov DWORD(size_offset), [lane_data + _size_offset] mov qword [lane_data + _extra_block + size_offset], 0 mov word [state + _lens_sha256 + 2*idx], 1 lea tmp, [lane_data + _outer_block] mov job, [lane_data + _job_in_lane] mov [state + _args_data_ptr_sha256 + 8*idx], tmp movd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] pinsrd xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1 pinsrd xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2 pinsrd xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3 pshufb xmm0, [rel byteswap] movd xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE] pinsrd xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1 pinsrd xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2 %ifndef SHA224 pinsrd xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3 %endif pshufb xmm1, [rel byteswap] movdqa [lane_data + _outer_block], xmm0 movdqa [lane_data + _outer_block + 4*4], xmm1 %ifdef SHA224 mov dword [lane_data + _outer_block + 7*4], 0x80 %endif mov tmp, [job + _auth_key_xor_opad] movdqu xmm0, [tmp] movdqu xmm1, [tmp + 4*4] movd [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0 pextrd [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1 pextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2 pextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3 movd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1 pextrd [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1 pextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2 pextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3 jmp copy_lane_data align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset] mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr_sha256 + 8*idx], tmp mov dword [lane_data + _extra_blocks], 0 jmp copy_lane_data return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane] mov qword [lane_data + _job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC mov unused_lanes, [state + _unused_lanes_sha256] shl unused_lanes, 8 or unused_lanes, idx mov [state + _unused_lanes_sha256], unused_lanes mov p, [job_rax + _auth_tag_output] ; copy 14 bytes for SHA224 and 16 bytes for SHA256 mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE] mov DWORD(tmp6), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE] mov DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE] bswap DWORD(tmp2) bswap DWORD(tmp4) bswap DWORD(tmp6) bswap DWORD(tmp5) mov [p + 0*4], DWORD(tmp2) mov [p + 1*4], DWORD(tmp4) mov [p + 2*4], DWORD(tmp6) %ifdef SHA224 mov [p + 3*4], WORD(tmp5) %else mov [p + 3*4], DWORD(tmp5) %endif return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov r12, [rsp + _gpr_save + 8*2] mov rsp, [rsp + _rsp_save] ; original SP ret 
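;;
;; The flush routine above follows the same scheduling pattern as the other
;; 4-lane SSE flush functions in this directory: pick any lane that still
;; holds a job, point the empty lanes at that lane's buffer while forcing
;; their 16-bit length words to 0xFFFF (the len_masks table), then repeatedly
;; select the lane with the minimum length, subtract that length from all
;; lanes and run the multi-buffer SHA-256 core until the selected job is
;; done. A rough C sketch of the minimum-length step, using the SSE4.1
;; intrinsic that phminposuw maps to (illustrative only, not part of the
;; library API; field names simplified):
;;
;;      #include <smmintrin.h>
;;      __m128i lens = _mm_load_si128((const __m128i *)lens_sha256);
;;      __m128i min  = _mm_minpos_epu16(lens);  /* [15:0]=min len, [18:16]=lane */
;;      unsigned len2 = (unsigned)_mm_extract_epi16(min, 0);
;;      unsigned idx  = (unsigned)_mm_extract_epi16(min, 1);
;;      if (len2 != 0) {
;;              lens = _mm_sub_epi16(lens, _mm_shufflelo_epi16(min, 0));
;;              _mm_store_si128((__m128i *)lens_sha256, lens);
;;              sha_256_mult_sse(state, len2); /* advance all lanes by len2 blocks */
;;      }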
intel-ipsec-mb-0.48/sse/mb_mgr_hmac_sha_256_submit_ni_sse.asm000066400000000000000000000227711321406316400241670ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ;; In System V AMD64 ABI ;; calle saves: RBX, RBP, R12-R15 ;; Windows x64 ABI ;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15 ;; ;; Linux/Windows clobbers: xmm0 - xmm15 ;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" %include "memcpy.asm" ;%define DO_DBGPRINT %include "dbgprint.asm" extern sha256_ni %ifdef LINUX %define arg1 rdi %define arg2 rsi %define reg3 rcx %define reg4 rdx %else %define arg1 rcx %define arg2 rdx %define reg3 rdi %define reg4 rsi %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbx, rbp, r13-r15 %define last_len rbp %define idx rbp %define p r11 %define start_offset r11 %define unused_lanes rbx %define tmp4 rbx %define job_rax rax %define len rax %define size_offset reg3 %define tmp2 reg3 %define lane reg4 %define extra_blocks r8 %define tmp r9 %define p2 r9 %define lane_data r10 %define bswap_xmm4 xmm4 struc STACK _gpr_save: resq 4 ; rbx, rbp, rsi (win), rdi (win) _rsp_save: resq 1 endstruc section .data default rel align 16 byteswap: dq 0x0405060700010203 dq 0x0c0d0e0f08090a0b section .text %ifdef SHA224 ; JOB* submit_job_hmac_sha_224_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job) ; arg 1 : state ; arg 2 : job MKGLOBAL(submit_job_hmac_sha_224_ni_sse,function,internal) submit_job_hmac_sha_224_ni_sse: %else ; JOB* submit_job_hmac_sha_256_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job) ; arg 1 : state ; arg 2 : job MKGLOBAL(submit_job_hmac_sha_256_ni_sse,function,internal) submit_job_hmac_sha_256_ni_sse: %endif mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp %ifndef LINUX mov [rsp + _gpr_save + 8*2], rsi mov [rsp + _gpr_save + 8*3], rdi %endif mov [rsp + _rsp_save], rax ; original SP DBGPRINTL "enter sha256-ni-sse submit" mov unused_lanes, [state + 
_unused_lanes_sha256] movzx lane, BYTE(unused_lanes) DBGPRINTL64 "lane: ", lane shr unused_lanes, 8 imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size ; SHA1 & SHA256 lane data is the same lea lane_data, [state + _ldata_sha256 + lane_data] mov [state + _unused_lanes_sha256], unused_lanes mov len, [job + _msg_len_to_hash_in_bytes] DBGPRINTL64 "length: ", len mov tmp, len shr tmp, 6 ; divide by 64, len in terms of blocks mov [lane_data + _job_in_lane], job mov dword [lane_data + _outer_done], 0 mov [state + _lens_sha256 + 2*lane], WORD(tmp) mov last_len, len and last_len, 63 lea extra_blocks, [last_len + 9 + 63] shr extra_blocks, 6 mov [lane_data + _extra_blocks], DWORD(extra_blocks) mov p, [job + _src] add p, [job + _hash_start_src_offset_in_bytes] mov [state + _args_data_ptr_sha256 + 8*lane], p cmp len, 64 jb copy_lt64 fast_copy: add p, len movdqu xmm0, [p - 64 + 0*16] movdqu xmm1, [p - 64 + 1*16] movdqu xmm2, [p - 64 + 2*16] movdqu xmm3, [p - 64 + 3*16] movdqa [lane_data + _extra_block + 0*16], xmm0 movdqa [lane_data + _extra_block + 1*16], xmm1 movdqa [lane_data + _extra_block + 2*16], xmm2 movdqa [lane_data + _extra_block + 3*16], xmm3 end_fast_copy: mov size_offset, extra_blocks shl size_offset, 6 sub size_offset, last_len add size_offset, 64-8 mov [lane_data + _size_offset], DWORD(size_offset) mov start_offset, 64 sub start_offset, last_len mov [lane_data + _start_offset], DWORD(start_offset) lea tmp, [8*64 + 8*len] bswap tmp mov [lane_data + _extra_block + size_offset], tmp mov tmp, [job + _auth_key_xor_ipad] movdqu xmm0, [tmp] movdqu xmm1, [tmp + 4*4] %if SHA256NI_DIGEST_ROW_SIZE != 32 %error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!" %endif lea tmp, [lane*8] ; x8 here plus x4 scale factor give x32 movdqu [state + _args_digest_sha256 + tmp*4], xmm0 movdqu [state + _args_digest_sha256 + tmp*4 + 4*4], xmm1 DBGPRINTL "args digest:" DBGPRINT_XMM xmm0 DBGPRINT_XMM xmm1 test len, ~63 jnz ge64_bytes lt64_bytes: mov [state + _lens_sha256 + 2*lane], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr_sha256 + 8*lane], tmp mov dword [lane_data + _extra_blocks], 0 ge64_bytes: cmp unused_lanes, 0xff jne return_null jmp start_loop align 16 start_loop: ; Find min length - only two lanes available xor len2, len2 mov tmp, 0x10000 mov WORD(len2), word [state + _lens_sha256 + 0*2] ; [0:15] - lane 0 length, [16:31] - lane index (0) mov WORD(tmp), word [state + _lens_sha256 + 1*2] ; [0:15] - lane 1 length, [16:31] - lane index (1) cmp WORD(len2), WORD(tmp) cmovg DWORD(len2), DWORD(tmp) ; move if lane 0 length is greater than lane 1 length mov idx, len2 ; retrieve index & length from [16:31] and [0:15] bit fields shr DWORD(idx), 16 and DWORD(len2), 0xffff je len_is_0 sub word [state + _lens_sha256 + 0*2], WORD(len2) sub word [state + _lens_sha256 + 1*2], WORD(len2) ; "state" and "args" are the same address, arg1 ; len is arg2 call sha256_ni ; state is intact len_is_0: ; process completed job "idx" imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata_sha256 + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks] cmp extra_blocks, 0 jne proc_extra_blocks movdqa bswap_xmm4, [rel byteswap] cmp dword [lane_data + _outer_done], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done], 1 mov DWORD(size_offset), [lane_data + _size_offset] mov qword [lane_data + _extra_block + size_offset], 0 mov word [state + _lens_sha256 + 2*idx], 1 lea tmp, [lane_data + _outer_block] mov job, [lane_data + _job_in_lane] 
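;; The instructions below set up the outer HMAC pass for lane "idx": the lane
;; is pointed at the 64-byte _outer_block (a single block, hence the length
;; word was set to 1 above), the byte-swapped inner digest is stored into
;; that block (for SHA-224 the top word is then overwritten with the 0x80
;; end-of-message marker), and the lane's working digest is reloaded with the
;; precomputed opad state from _auth_key_xor_opad, so the next sha256_ni call
;; produces the outer hash H((key ^ opad) || inner_digest).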
mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp %if SHA256NI_DIGEST_ROW_SIZE != 32 %error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!" %endif lea tmp4, [idx*8] ; x8 here + scale factor x4 below give x32 movdqu xmm0, [state + _args_digest_sha256 + tmp4*4] movdqu xmm1, [state + _args_digest_sha256 + tmp4*4 + 4*4] pshufb xmm0, bswap_xmm4 pshufb xmm1, bswap_xmm4 movdqa [lane_data + _outer_block], xmm0 movdqa [lane_data + _outer_block + 4*4], xmm1 %ifdef SHA224 ;; overwrite top 4 bytes with 0x80 mov dword [lane_data + _outer_block + 7*4], 0x80 %endif mov tmp, [job + _auth_key_xor_opad] movdqu xmm0, [tmp] movdqu xmm1, [tmp + 4*4] movdqu [state + _args_digest_sha256 + tmp4*4], xmm0 movdqu [state + _args_digest_sha256 + tmp4*4 + 4*4], xmm1 jmp start_loop align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset] mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp mov dword [lane_data + _extra_blocks], 0 jmp start_loop align 16 copy_lt64: ;; less than one message block of data ;; beginning of source block ;; destination extrablock but backwards by len from where 0x80 pre-populated ;; p2 clobbers unused_lanes, undo before exit lea p2, [lane_data + _extra_block + 64] sub p2, len memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 mov unused_lanes, [state + _unused_lanes_sha256] jmp end_fast_copy return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane] mov unused_lanes, [state + _unused_lanes_sha256] mov qword [lane_data + _job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC shl unused_lanes, 8 or unused_lanes, idx mov [state + _unused_lanes_sha256], unused_lanes mov p, [job_rax + _auth_tag_output] ; copy 16 bytes for SHA256, 14 for SHA224 %if SHA256NI_DIGEST_ROW_SIZE != 32 %error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!" %endif shl idx, 5 movdqu xmm0, [state + _args_digest_sha256 + idx] pshufb xmm0, bswap_xmm4 %ifdef SHA224 ;; SHA224 movq [p + 0*4], xmm0 pextrd [p + 2*4], xmm0, 2 pextrw [p + 3*4], xmm0, 6 %else ;; SHA256 movdqu [p], xmm0 %endif return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*2] mov rdi, [rsp + _gpr_save + 8*3] %endif mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/sse/mb_mgr_hmac_sha_256_submit_sse.asm000066400000000000000000000240301321406316400234670ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. 
;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" %include "memcpy.asm" extern sha_256_mult_sse section .data default rel align 16 byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 dq 0x0405060700010203, 0x0c0d0e0f08090a0b section .text %ifndef FUNC %define FUNC submit_job_hmac_sha_256_sse %endif %if 1 %ifdef LINUX %define arg1 rdi %define arg2 rsi %define reg3 rcx %define reg4 rdx %else %define arg1 rcx %define arg2 rdx %define reg3 rdi %define reg4 rsi %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbx, rbp, r13-r15 %define last_len rbp %define idx rbp %define p r11 %define start_offset r11 %define unused_lanes rbx %define tmp4 rbx %define job_rax rax %define len rax %define size_offset reg3 %define tmp2 reg3 %define lane reg4 %define tmp3 reg4 %define extra_blocks r8 %define tmp r9 %define p2 r9 %define lane_data r10 %endif ; This routine clobbers rbx, rbp, rsi, rdi; called routine also clobbers r12 struc STACK _gpr_save: resq 5 _rsp_save: resq 1 endstruc ; JOB* FUNC(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job) ; arg 1 : rcx : state ; arg 2 : rdx : job MKGLOBAL(FUNC,function,internal) FUNC: mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _gpr_save + 8*2], r12 %ifndef LINUX mov [rsp + _gpr_save + 8*3], rsi mov [rsp + _gpr_save + 8*4], rdi %endif mov [rsp + _rsp_save], rax ; original SP mov unused_lanes, [state + _unused_lanes_sha256] movzx lane, BYTE(unused_lanes) shr unused_lanes, 8 imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata_sha256 + lane_data] mov [state + _unused_lanes_sha256], unused_lanes mov len, [job + _msg_len_to_hash_in_bytes] mov tmp, len shr tmp, 6 ; divide by 64, len in terms of blocks mov [lane_data + _job_in_lane], job mov dword [lane_data + _outer_done], 0 mov [state + _lens_sha256 + 2*lane], WORD(tmp) mov last_len, len and last_len, 63 lea extra_blocks, [last_len + 9 + 63] shr extra_blocks, 6 mov [lane_data + _extra_blocks], DWORD(extra_blocks) mov p, [job + _src] add p, [job + _hash_start_src_offset_in_bytes] mov [state + _args_data_ptr_sha256 + 8*lane], p cmp len, 64 jb copy_lt64 fast_copy: add p, len movdqu xmm0, [p - 64 + 0*16] movdqu xmm1, [p - 64 + 1*16] movdqu xmm2, [p - 64 + 2*16] movdqu xmm3, [p - 64 + 3*16] movdqa [lane_data + _extra_block + 0*16], xmm0 movdqa [lane_data + _extra_block + 1*16], xmm1 movdqa [lane_data + _extra_block + 2*16], xmm2 movdqa [lane_data + _extra_block + 3*16], xmm3 end_fast_copy: mov size_offset, extra_blocks shl size_offset, 6 sub size_offset, last_len add size_offset, 64-8 mov [lane_data + _size_offset], DWORD(size_offset) mov start_offset, 64 sub 
start_offset, last_len mov [lane_data + _start_offset], DWORD(start_offset) lea tmp, [8*64 + 8*len] bswap tmp mov [lane_data + _extra_block + size_offset], tmp mov tmp, [job + _auth_key_xor_ipad] movdqu xmm0, [tmp] movdqu xmm1, [tmp + 4*4] movd [state + _args_digest_sha256 + 4*lane + 0*SHA256_DIGEST_ROW_SIZE], xmm0 pextrd [state + _args_digest_sha256 + 4*lane + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1 pextrd [state + _args_digest_sha256 + 4*lane + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2 pextrd [state + _args_digest_sha256 + 4*lane + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3 movd [state + _args_digest_sha256 + 4*lane + 4*SHA256_DIGEST_ROW_SIZE], xmm1 pextrd [state + _args_digest_sha256 + 4*lane + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1 pextrd [state + _args_digest_sha256 + 4*lane + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2 pextrd [state + _args_digest_sha256 + 4*lane + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3 test len, ~63 jnz ge64_bytes lt64_bytes: mov [state + _lens_sha256 + 2*lane], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr_sha256 + 8*lane], tmp mov dword [lane_data + _extra_blocks], 0 ge64_bytes: cmp unused_lanes, 0xff jne return_null jmp start_loop align 16 start_loop: ; Find min length movdqa xmm0, [state + _lens_sha256] phminposuw xmm1, xmm0 pextrw len2, xmm1, 0 ; min value pextrw idx, xmm1, 1 ; min index (0...3) cmp len2, 0 je len_is_0 pshuflw xmm1, xmm1, 0 psubw xmm0, xmm1 movdqa [state + _lens_sha256], xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call sha_256_mult_sse ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata_sha256 + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done], 1 mov DWORD(size_offset), [lane_data + _size_offset] mov qword [lane_data + _extra_block + size_offset], 0 mov word [state + _lens_sha256 + 2*idx], 1 lea tmp, [lane_data + _outer_block] mov job, [lane_data + _job_in_lane] mov [state + _args_data_ptr_sha256 + 8*idx], tmp movd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] pinsrd xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1 pinsrd xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2 pinsrd xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3 pshufb xmm0, [rel byteswap] movd xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE] pinsrd xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1 pinsrd xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2 %ifndef SHA224 pinsrd xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3 %endif pshufb xmm1, [rel byteswap] movdqa [lane_data + _outer_block], xmm0 movdqa [lane_data + _outer_block + 4*4], xmm1 %ifdef SHA224 mov dword [lane_data + _outer_block + 7*4], 0x80 %endif mov tmp, [job + _auth_key_xor_opad] movdqu xmm0, [tmp] movdqu xmm1, [tmp + 4*4] movd [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0 pextrd [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1 pextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2 pextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3 movd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1 pextrd [state + 
_args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1 pextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2 pextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3 jmp start_loop align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset] mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr_sha256 + 8*idx], tmp mov dword [lane_data + _extra_blocks], 0 jmp start_loop align 16 copy_lt64: ;; less than one message block of data ;; beginning of source block ;; destination extrablock but backwards by len from where 0x80 pre-populated ;; p2 clobbers unused_lanes, undo before exit lea p2, [lane_data + _extra_block + 64] sub p2, len memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 mov unused_lanes, [state + _unused_lanes_sha256] jmp end_fast_copy return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane] mov unused_lanes, [state + _unused_lanes_sha256] mov qword [lane_data + _job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC shl unused_lanes, 8 or unused_lanes, idx mov [state + _unused_lanes_sha256], unused_lanes mov p, [job_rax + _auth_tag_output] ; copy 14 bytes for SHA224 and 16 bytes for SHA256 mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE] mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE] mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE] bswap DWORD(tmp) bswap DWORD(tmp2) bswap DWORD(tmp3) bswap DWORD(tmp4) mov [p + 0*4], DWORD(tmp) mov [p + 1*4], DWORD(tmp2) mov [p + 2*4], DWORD(tmp3) %ifdef SHA224 mov [p + 3*4], WORD(tmp4) %else mov [p + 3*4], DWORD(tmp4) %endif return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov r12, [rsp + _gpr_save + 8*2] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*3] mov rdi, [rsp + _gpr_save + 8*4] %endif mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/sse/mb_mgr_hmac_sha_384_flush_sse.asm000066400000000000000000000032411321406316400233100ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %define FUNC flush_job_hmac_sha_384_sse %define SHA_X_DIGEST_SIZE 384 %include "mb_mgr_hmac_sha_512_flush_sse.asm" intel-ipsec-mb-0.48/sse/mb_mgr_hmac_sha_384_submit_sse.asm000066400000000000000000000032431321406316400234740ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %define FUNC submit_job_hmac_sha_384_sse %define SHA_X_DIGEST_SIZE 384 %include "mb_mgr_hmac_sha_512_submit_sse.asm" intel-ipsec-mb-0.48/sse/mb_mgr_hmac_sha_512_flush_sse.asm000066400000000000000000000157101321406316400233050ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. 
;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" extern sha512_x2_sse section .data default rel align 16 byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607 dq 0x0001020304050607, 0x08090a0b0c0d0e0f len_masks: ;ddq 0x0000000000000000000000000000FFFF dq 0x000000000000FFFF, 0x0000000000000000 ;ddq 0x000000000000000000000000FFFF0000 dq 0x00000000FFFF0000, 0x0000000000000000 one: dq 1 section .text %ifndef FUNC %define FUNC flush_job_hmac_sha_512_sse %define SHA_X_DIGEST_SIZE 512 %endif %if 1 %ifdef LINUX %define arg1 rdi %define arg2 rsi %else %define arg1 rcx %define arg2 rdx %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbx, rbp, r12-r15 %define idx rbp %define unused_lanes rbx %define lane_data rbx %define tmp2 rbx %define job_rax rax %define tmp1 rax %define size_offset rax %define tmp rax %define start_offset rax %define tmp3 arg1 %define extra_blocks arg2 %define p arg2 %define tmp4 r8 %define tmp5 r9 %define tmp6 r10 %endif ; This routine clobbers rbx, rbp struc STACK _gpr_save: resq 2 _rsp_save: resq 1 endstruc %define APPEND(a,b) a %+ b ; JOB* FUNC(MB_MGR_HMAC_SHA_512_OOO *state) ; arg 1 : rcx : state MKGLOBAL(FUNC,function,internal) FUNC: mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp mov [rsp + _rsp_save], rax ; original SP mov unused_lanes, [state + _unused_lanes_sha512] bt unused_lanes, 16+7 jc return_null ; find a lane with a non-null job xor idx, idx cmp qword [state + _ldata_sha512 + 1 * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0 cmovne idx, [rel one] copy_lane_data: ; copy good lane (idx) to empty lanes movdqa xmm0, [state + _lens_sha512] mov tmp, [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*idx] %assign I 0 %rep 2 cmp qword [state + _ldata_sha512 + I * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0 jne APPEND(skip_,I) mov [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*I], tmp por xmm0, [rel len_masks + 16*I] APPEND(skip_,I): %assign I (I+1) %endrep movdqa [state + _lens_sha512], xmm0 phminposuw xmm1, xmm0 pextrw len2, xmm1, 0 ; min value pextrw idx, xmm1, 1 ; min index (0...3) cmp len2, 0 je len_is_0 pshuflw xmm1, xmm1, 0xA0 psubw xmm0, xmm1 movdqa [state + _lens_sha512], xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call sha512_x2_sse ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _SHA512_LANE_DATA_size lea lane_data, [state + _ldata_sha512 + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done_sha512], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done_sha512], 1 
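;; Second (outer) HMAC pass for this lane, performed by the code below:
;; - zero the 8-byte length field previously written into the extra block,
;; - set the lane length to a single SHA-512 block,
;; - point the lane data pointer at the outer block, which is filled with the
;;   byte-swapped inner digest,
;; - reload the lane digest registers with the opad-xored key, so the next
;;   call to sha512_x2_sse produces the final authentication tag.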
mov DWORD(size_offset), [lane_data + _size_offset_sha512] mov qword [lane_data + _extra_block_sha512 + size_offset], 0 mov word [state + _lens_sha512 + 2*idx], 1 lea tmp, [lane_data + _outer_block_sha512] mov job, [lane_data + _job_in_lane_sha512] mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp %assign I 0 %rep (SHA_X_DIGEST_SIZE / (8*16)) movq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I)*SHA512_DIGEST_ROW_SIZE] pinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1) *SHA512_DIGEST_ROW_SIZE], 1 pshufb xmm0, [rel byteswap] movdqa [lane_data + _outer_block_sha512 + I*16], xmm0 %assign I (I+1) %endrep mov tmp, [job + _auth_key_xor_opad] %assign I 0 %rep 4 movdqu xmm0, [tmp + I * 16] movq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE], xmm0 pextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1 %assign I (I+1) %endrep jmp copy_lane_data align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset_sha512] mov [state + _lens_sha512 + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block_sha512 + start_offset] mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp mov dword [lane_data + _extra_blocks_sha512], 0 jmp copy_lane_data return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane_sha512] mov qword [lane_data + _job_in_lane_sha512], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC mov unused_lanes, [state + _unused_lanes_sha512] shl unused_lanes, 8 or unused_lanes, idx mov [state + _unused_lanes_sha512], unused_lanes mov p, [job_rax + _auth_tag_output] ; below is the code for both SHA512 & SHA384. SHA512=32 bytes and SHA384=24 bytes mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE] mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE] mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE] %if (SHA_X_DIGEST_SIZE != 384) mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE] %endif bswap QWORD(tmp2) bswap QWORD(tmp4) bswap QWORD(tmp6) %if (SHA_X_DIGEST_SIZE != 384) bswap QWORD(tmp5) %endif mov [p + 0*8], QWORD(tmp2) mov [p + 1*8], QWORD(tmp4) mov [p + 2*8], QWORD(tmp6) %if (SHA_X_DIGEST_SIZE != 384) mov [p + 3*8], QWORD(tmp5) %endif return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/sse/mb_mgr_hmac_sha_512_submit_sse.asm000066400000000000000000000223361321406316400234710ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. 
;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" %include "memcpy.asm" extern sha512_x2_sse section .data default rel align 16 byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607 dq 0x0001020304050607, 0x08090a0b0c0d0e0f section .text %ifndef FUNC %define FUNC submit_job_hmac_sha_512_sse %define SHA_X_DIGEST_SIZE 512 %endif %if 1 %ifdef LINUX %define arg1 rdi %define arg2 rsi %define reg3 rcx %define reg4 rdx %else %define arg1 rcx %define arg2 rdx %define reg3 rdi %define reg4 rsi %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbx, rbp, r12-r15 %define last_len rbp %define idx rbp %define p r11 %define start_offset r11 %define unused_lanes rbx %define tmp4 rbx %define job_rax rax %define len rax %define size_offset reg3 %define tmp2 reg3 %define lane reg4 %define tmp3 reg4 %define extra_blocks r8 %define tmp r9 %define p2 r9 %define lane_data r10 %endif ; This routine clobbers rbx, rbp, rsi, rdi struc STACK _gpr_save: resq 4 _rsp_save: resq 1 endstruc ; JOB* FUNC(MB_MGR_HMAC_SHA_512_OOO *state, JOB_AES_HMAC *job) ; arg 1 : rcx : state ; arg 2 : rdx : job MKGLOBAL(FUNC,function,internal) FUNC: mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp %ifndef LINUX mov [rsp + _gpr_save + 8*2], rsi mov [rsp + _gpr_save + 8*3], rdi %endif mov [rsp + _rsp_save], rax ; original SP mov unused_lanes, [state + _unused_lanes_sha512] movzx lane, BYTE(unused_lanes) shr unused_lanes, 8 imul lane_data, lane, _SHA512_LANE_DATA_size lea lane_data, [state + _ldata_sha512+ lane_data] mov [state + _unused_lanes_sha512], unused_lanes mov len, [job + _msg_len_to_hash_in_bytes] mov tmp, len shr tmp, 7 ; divide by 128, len in terms of sha512 blocks mov [lane_data + _job_in_lane_sha512], job mov dword [lane_data + _outer_done_sha512], 0 mov [state + _lens_sha512 + 2*lane], WORD(tmp) ; 2 is word size in bytes mov last_len, len and last_len, 127 lea extra_blocks, [last_len + 17 + 127] shr extra_blocks, 7 mov [lane_data + _extra_blocks_sha512], DWORD(extra_blocks) mov p, [job + _src] add p, [job + _hash_start_src_offset_in_bytes] mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], p cmp len, 128 jb copy_lt128 fast_copy: add p, len %assign I 0 %rep 2 movdqu xmm0, [p - 128 + I*4*16 + 0*16] movdqu xmm1, [p - 128 + I*4*16 + 1*16] movdqu xmm2, [p - 128 + I*4*16 + 2*16] movdqu xmm3, [p - 128 + I*4*16 + 3*16] movdqa [lane_data + _extra_block_sha512 + I*4*16 + 0*16], xmm0 movdqa [lane_data + _extra_block_sha512 + I*4*16 + 1*16], xmm1 movdqa [lane_data + _extra_block_sha512 + I*4*16 + 2*16], xmm2 movdqa [lane_data + _extra_block_sha512 + I*4*16 + 3*16], xmm3 %assign I (I+1) %endrep end_fast_copy: mov 
size_offset, extra_blocks shl size_offset, 7 sub size_offset, last_len add size_offset, 128-8 mov [lane_data + _size_offset_sha512], DWORD(size_offset) mov start_offset, 128 sub start_offset, last_len mov [lane_data + _start_offset_sha512], DWORD(start_offset) lea tmp, [8*128 + 8*len] bswap tmp mov [lane_data + _extra_block_sha512 + size_offset], tmp mov tmp, [job + _auth_key_xor_ipad] %assign I 0 %rep 4 movdqu xmm0, [tmp + I * 2 * SHA512_DIGEST_WORD_SIZE] movq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I)*SHA512_DIGEST_ROW_SIZE], xmm0 pextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1 %assign I (I+1) %endrep test len, ~127 jnz ge128_bytes lt128_bytes: mov [state + _lens_sha512 + 2*lane], WORD(extra_blocks) lea tmp, [lane_data + _extra_block_sha512 + start_offset] mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], tmp ;; 8 to hold a UINT8 mov dword [lane_data + _extra_blocks_sha512], 0 ge128_bytes: cmp unused_lanes, 0xff jne return_null jmp start_loop align 16 start_loop: ; Find min length movdqa xmm0, [state + _lens_sha512] phminposuw xmm1, xmm0 pextrw DWORD(len2), xmm1, 0 ; min value pextrw DWORD(idx), xmm1, 1 ; min index (0...1) cmp len2, 0 je len_is_0 pshuflw xmm1, xmm1, 0XA0 psubw xmm0, xmm1 movdqa [state + _lens_sha512], xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call sha512_x2_sse ; state and idx are intact len_is_0: ; process completed job "idx" imul lane_data, idx, _SHA512_LANE_DATA_size lea lane_data, [state + _ldata_sha512 + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done_sha512], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done_sha512], 1 mov DWORD(size_offset), [lane_data + _size_offset_sha512] mov qword [lane_data + _extra_block_sha512 + size_offset], 0 mov word [state + _lens_sha512 + 2*idx], 1 lea tmp, [lane_data + _outer_block_sha512] mov job, [lane_data + _job_in_lane_sha512] mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp %assign I 0 %rep (SHA_X_DIGEST_SIZE / (8 * 16)) movq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I)*SHA512_DIGEST_ROW_SIZE] pinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], 1 pshufb xmm0, [rel byteswap] movdqa [lane_data + _outer_block_sha512 + I*16], xmm0 %assign I (I+1) %endrep mov tmp, [job + _auth_key_xor_opad] %assign I 0 %rep 4 movdqu xmm0, [tmp + I*16] movq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE], xmm0 pextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1 %assign I (I+1) %endrep jmp start_loop align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset_sha512] mov [state + _lens_sha512 + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block_sha512 + start_offset] mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp mov dword [lane_data + _extra_blocks_sha512], 0 jmp start_loop align 16 copy_lt128: ;; less than one message block of data ;; beginning of source block ;; destination extra block but backwards by len from where 0x80 pre-populated lea p2, [lane_data + _extra_block + 128] sub p2, len memcpy_sse_128_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 mov unused_lanes, [state + _unused_lanes_sha512] jmp end_fast_copy return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data 
+ _job_in_lane_sha512] mov unused_lanes, [state + _unused_lanes_sha512] mov qword [lane_data + _job_in_lane_sha512], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC shl unused_lanes, 8 or unused_lanes, idx mov [state + _unused_lanes_sha512], unused_lanes mov p, [job_rax + _auth_tag_output] ; below is the code for both SHA512 & SHA384. SHA512=32 bytes and SHA384=24 bytes mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE] mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE] mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE] %if (SHA_X_DIGEST_SIZE != 384) mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE] ; this line of code will run only for SHA512 %endif bswap QWORD(tmp) bswap QWORD(tmp2) bswap QWORD(tmp3) %if (SHA_X_DIGEST_SIZE != 384) bswap QWORD(tmp4) %endif mov [p + 0*8], QWORD(tmp) mov [p + 1*8], QWORD(tmp2) mov [p + 2*8], QWORD(tmp3) %if (SHA_X_DIGEST_SIZE != 384) mov [p + 3*8], QWORD(tmp4) %endif return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*2] mov rdi, [rsp + _gpr_save + 8*3] %endif mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/sse/mb_mgr_hmac_submit_ni_sse.asm000066400000000000000000000252101321406316400227270ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;; ;; In System V AMD64 ABI ;; calle saves: RBX, RBP, R12-R15 ;; Windows x64 ABI ;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15 ;; ;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Windows clobbers: RAX RCX RDX R8 R9 R10 R11 ;; Windows preserves: RBX RBP RSI RDI R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Linux clobbers: RAX RCX RDX RSI RDI R8 R9 R10 R11 ;; Linux preserves: RBX RBP R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; ;; Linux/Windows clobbers: xmm0 - xmm15 ;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" %include "memcpy.asm" ;%define DO_DBGPRINT %include "dbgprint.asm" extern sha1_ni section .data default rel align 16 byteswap: dq 0x0405060700010203 dq 0x0c0d0e0f08090a0b section .text %ifdef LINUX %define arg1 rdi %define arg2 rsi %define reg3 rcx %define reg4 rdx %else %define arg1 rcx %define arg2 rdx %define reg3 rdi %define reg4 rsi %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbx, rbp, r12-r15 %define last_len rbp %define idx rbp %define p4 rbp %define p r11 %define start_offset r11 %define unused_lanes rbx %define tmp4 rbx %define p3 rbx %define job_rax rax %define len rax %define size_offset reg3 %define tmp2 reg3 %define lane reg4 %define tmp3 reg4 %define extra_blocks r8 %define tmp r9 %define p2 r9 %define lane_data r10 struc STACK _gpr_save: resq 4 _rsp_save: resq 1 endstruc ; JOB* submit_job_hmac_ni_sse(MB_MGR_HMAC_SHA_1_OOO *state, JOB_AES_HMAC *job) ; arg 1 : rcx : state ; arg 2 : rdx : job MKGLOBAL(submit_job_hmac_ni_sse,function,internal) submit_job_hmac_ni_sse: mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp %ifndef LINUX mov [rsp + _gpr_save + 8*2], rsi mov [rsp + _gpr_save + 8*3], rdi %endif mov [rsp + _rsp_save], rax ; original SP DBGPRINTL "enter sha1-ni-sse submit" mov unused_lanes, [state + _unused_lanes] movzx lane, BYTE(unused_lanes) DBGPRINTL64 "lane: ", lane shr unused_lanes, 8 imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata + lane_data] mov [state + _unused_lanes], unused_lanes mov len, [job + _msg_len_to_hash_in_bytes] DBGPRINTL64 "length: ", len mov tmp, len shr tmp, 6 ; divide by 64, len in terms of blocks mov [lane_data + _job_in_lane], job mov dword [lane_data + _outer_done], 0 mov [state + _lens + 2*lane], WORD(tmp) mov last_len, len and last_len, 63 lea extra_blocks, [last_len + 9 + 63] shr extra_blocks, 6 mov [lane_data + _extra_blocks], DWORD(extra_blocks) mov p, [job + _src] add p, [job + _hash_start_src_offset_in_bytes] DBGPRINTL64 "src pointer + offset:", p mov [state + _args_data_ptr + PTR_SZ*lane], p cmp len, 64 jb copy_lt64 fast_copy: add p, len movdqu xmm0, [p - 64 + 0*16] movdqu xmm1, [p - 64 + 1*16] movdqu xmm2, [p - 64 + 2*16] movdqu xmm3, [p - 64 + 3*16] movdqa [lane_data + _extra_block + 0*16], xmm0 movdqa [lane_data + _extra_block + 1*16], xmm1 movdqa [lane_data + _extra_block + 2*16], xmm2 movdqa [lane_data + _extra_block + 3*16], xmm3 end_fast_copy: mov size_offset, extra_blocks shl size_offset, 6 sub size_offset, last_len add size_offset, 64-8 mov [lane_data + _size_offset], DWORD(size_offset) mov start_offset, 64 sub start_offset, last_len mov [lane_data + _start_offset], DWORD(start_offset) lea tmp, [8*64 + 8*len] bswap tmp mov [lane_data + _extra_block + size_offset], tmp mov 
tmp, [job + _auth_key_xor_ipad] movdqu xmm0, [tmp] mov DWORD(tmp), [tmp + 4*SHA1_DIGEST_WORD_SIZE] %if SHA1NI_DIGEST_ROW_SIZE != 20 %error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!" %endif lea p4, [lane + lane*4] movdqu [state + _args_digest + p4*4 + 0*SHA1_DIGEST_WORD_SIZE], xmm0 mov [state + _args_digest + p4*4 + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) test len, ~63 jnz ge64_bytes lt64_bytes: mov [state + _lens + 2*lane], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr + PTR_SZ*lane], tmp mov dword [lane_data + _extra_blocks], 0 ge64_bytes: cmp unused_lanes, 0xff jne return_null jmp start_loop align 16 start_loop: ; Find min length - only two lanes available xor len2, len2 mov p3, 0x10000 mov WORD(len2), word [state + _lens + 0*2] ; [0:15] - lane 0 length, [16:31] - lane index (0) mov WORD(p3), word [state + _lens + 1*2] ; [0:15] - lane 1 length, [16:31] - lane index (1) cmp WORD(len2), WORD(p3) cmovg DWORD(len2), DWORD(p3) ; move if lane 0 length is greater than lane 1 length mov idx, len2 ; retrieve index & length from [16:31] and [0:15] bit fields shr DWORD(idx), 16 and DWORD(len2), 0xffff je len_is_0 sub word [state + _lens + 0*2], WORD(len2) sub word [state + _lens + 1*2], WORD(len2) ; "state" and "args" are the same address, arg1 ; len is arg2 call sha1_ni ; state is intact len_is_0: ; process completed job "idx" imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done], 1 mov DWORD(size_offset), [lane_data + _size_offset] mov qword [lane_data + _extra_block + size_offset], 0 mov word [state + _lens + 2*idx], 1 lea tmp, [lane_data + _outer_block] mov job, [lane_data + _job_in_lane] mov [state + _args_data_ptr + PTR_SZ*idx], tmp %if SHA1NI_DIGEST_ROW_SIZE != 20 %error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!" 
%endif lea p3, [idx + idx*4] movdqu xmm0, [state + _args_digest + p3*4 + 0*SHA1_DIGEST_WORD_SIZE] pshufb xmm0, [rel byteswap] mov DWORD(tmp), [state + _args_digest + p3*4 + 4*SHA1_DIGEST_WORD_SIZE] bswap DWORD(tmp) movdqa [lane_data + _outer_block], xmm0 mov [lane_data + _outer_block + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) mov tmp, [job + _auth_key_xor_opad] movdqu xmm0, [tmp] mov DWORD(tmp), [tmp + 4*SHA1_DIGEST_WORD_SIZE] movdqu [state + _args_digest + p3*4 + 0*SHA1_DIGEST_WORD_SIZE], xmm0 mov [state + _args_digest + p3*4 + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) jmp start_loop align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset] mov [state + _lens + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr + PTR_SZ*idx], tmp mov dword [lane_data + _extra_blocks], 0 jmp start_loop align 16 copy_lt64: ;; less than one message block of data ;; beginning of source block ;; destination extrablock but backwards by len from where 0x80 pre-populated lea p2, [lane_data + _extra_block + 64] sub p2, len memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 mov unused_lanes, [state + _unused_lanes] jmp end_fast_copy return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane] mov unused_lanes, [state + _unused_lanes] mov qword [lane_data + _job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC shl unused_lanes, 8 or unused_lanes, idx mov [state + _unused_lanes], unused_lanes mov p, [job_rax + _auth_tag_output] ; copy 12 bytes %if SHA1NI_DIGEST_ROW_SIZE != 20 %error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!" %endif lea idx, [idx + 4*idx] mov DWORD(tmp), [state + _args_digest + idx*4 + 0*SHA1_DIGEST_WORD_SIZE] mov DWORD(tmp2), [state + _args_digest + idx*4 + 1*SHA1_DIGEST_WORD_SIZE] mov DWORD(tmp3), [state + _args_digest + idx*4 + 2*SHA1_DIGEST_WORD_SIZE] bswap DWORD(tmp) bswap DWORD(tmp2) bswap DWORD(tmp3) mov [p + 0*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) mov [p + 1*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2) mov [p + 2*SHA1_DIGEST_WORD_SIZE], DWORD(tmp3) return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*2] mov rdi, [rsp + _gpr_save + 8*3] %endif mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/sse/mb_mgr_hmac_submit_sse.asm000066400000000000000000000243271321406316400222510ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; %include "os.asm" %include "job_aes_hmac.asm" %include "mb_mgr_datastruct.asm" %include "reg_sizes.asm" %include "memcpy.asm" ;%define DO_DBGPRINT %include "dbgprint.asm" extern sha1_mult_sse section .data default rel align 16 byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 dq 0x0405060700010203, 0x0c0d0e0f08090a0b section .text %if 1 %ifdef LINUX %define arg1 rdi %define arg2 rsi %define reg3 rcx %define reg4 rdx %else %define arg1 rcx %define arg2 rdx %define reg3 rdi %define reg4 rsi %endif %define state arg1 %define job arg2 %define len2 arg2 ; idx needs to be in rbx, rbp, r12-r15 %define last_len rbp %define idx rbp %define p r11 %define start_offset r11 %define unused_lanes rbx %define tmp4 rbx %define job_rax rax %define len rax %define size_offset reg3 %define tmp2 reg3 %define lane reg4 %define tmp3 reg4 %define extra_blocks r8 %define tmp r9 %define p2 r9 %define lane_data r10 %endif ; This routine clobbers rdi, rsi, rbx, rbp struc STACK _gpr_save: resq 4 _rsp_save: resq 1 endstruc ; JOB* submit_job_hmac_sse(MB_MGR_HMAC_SHA_1_OOO *state, JOB_AES_HMAC *job) ; arg 1 : rcx : state ; arg 2 : rdx : job MKGLOBAL(submit_job_hmac_sse,function, internal) submit_job_hmac_sse: mov rax, rsp sub rsp, STACK_size and rsp, -16 mov [rsp + _gpr_save + 8*0], rbx mov [rsp + _gpr_save + 8*1], rbp %ifndef LINUX mov [rsp + _gpr_save + 8*2], rsi mov [rsp + _gpr_save + 8*3], rdi %endif mov [rsp + _rsp_save], rax ; original SP DBGPRINTL "enter sha1-sse submit" mov unused_lanes, [state + _unused_lanes] movzx lane, BYTE(unused_lanes) shr unused_lanes, 8 imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata + lane_data] mov [state + _unused_lanes], unused_lanes mov len, [job + _msg_len_to_hash_in_bytes] mov tmp, len shr tmp, 6 ; divide by 64, len in terms of blocks mov [lane_data + _job_in_lane], job mov dword [lane_data + _outer_done], 0 mov [state + _lens + 2*lane], WORD(tmp) mov last_len, len and last_len, 63 lea extra_blocks, [last_len + 9 + 63] shr extra_blocks, 6 mov [lane_data + _extra_blocks], DWORD(extra_blocks) mov p, [job + _src] add p, [job + _hash_start_src_offset_in_bytes] mov [state + _args_data_ptr + PTR_SZ*lane], p cmp len, 64 jb copy_lt64 fast_copy: add p, len movdqu xmm0, [p - 64 + 0*16] movdqu xmm1, [p - 64 + 1*16] movdqu xmm2, [p - 64 + 2*16] movdqu xmm3, [p - 64 + 3*16] movdqa [lane_data + _extra_block + 0*16], xmm0 movdqa [lane_data + _extra_block + 1*16], xmm1 movdqa [lane_data + _extra_block + 2*16], xmm2 movdqa [lane_data + _extra_block + 3*16], xmm3 end_fast_copy: mov size_offset, extra_blocks shl size_offset, 6 sub size_offset, last_len add size_offset, 64-8 mov [lane_data + _size_offset], DWORD(size_offset) mov start_offset, 64 sub start_offset, last_len mov [lane_data + _start_offset], DWORD(start_offset) lea tmp, [8*64 + 8*len] bswap tmp mov [lane_data + _extra_block + size_offset], tmp mov tmp, [job + _auth_key_xor_ipad] movdqu xmm0, [tmp] mov DWORD(tmp), [tmp + 4*4] movd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 
0*SHA1_DIGEST_ROW_SIZE], xmm0 pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1 pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2 pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3 mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp) test len, ~63 jnz ge64_bytes lt64_bytes: mov [state + _lens + 2*lane], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr + PTR_SZ*lane], tmp mov dword [lane_data + _extra_blocks], 0 ge64_bytes: cmp unused_lanes, 0xff jne return_null jmp start_loop align 16 start_loop: ; Find min length movdqa xmm0, [state + _lens] phminposuw xmm1, xmm0 pextrw len2, xmm1, 0 ; min value pextrw idx, xmm1, 1 ; min index (0...3) cmp len2, 0 je len_is_0 pshuflw xmm1, xmm1, 0 psubw xmm0, xmm1 movdqa [state + _lens], xmm0 ; "state" and "args" are the same address, arg1 ; len is arg2 call sha1_mult_sse ; state is intact len_is_0: ; process completed job "idx" imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size lea lane_data, [state + _ldata + lane_data] mov DWORD(extra_blocks), [lane_data + _extra_blocks] cmp extra_blocks, 0 jne proc_extra_blocks cmp dword [lane_data + _outer_done], 0 jne end_loop proc_outer: mov dword [lane_data + _outer_done], 1 mov DWORD(size_offset), [lane_data + _size_offset] mov qword [lane_data + _extra_block + size_offset], 0 mov word [state + _lens + 2*idx], 1 lea tmp, [lane_data + _outer_block] mov job, [lane_data + _job_in_lane] mov [state + _args_data_ptr + PTR_SZ*idx], tmp movd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] pinsrd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1 pinsrd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2 pinsrd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3 pshufb xmm0, [rel byteswap] mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE] bswap DWORD(tmp) movdqa [lane_data + _outer_block], xmm0 mov [lane_data + _outer_block + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) mov tmp, [job + _auth_key_xor_opad] movdqu xmm0, [tmp] mov DWORD(tmp), [tmp + 4*4] movd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0 pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1 pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2 pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3 mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp) jmp start_loop align 16 proc_extra_blocks: mov DWORD(start_offset), [lane_data + _start_offset] mov [state + _lens + 2*idx], WORD(extra_blocks) lea tmp, [lane_data + _extra_block + start_offset] mov [state + _args_data_ptr + PTR_SZ*idx], tmp mov dword [lane_data + _extra_blocks], 0 jmp start_loop align 16 copy_lt64: ;; less than one message block of data ;; beginning of source block ;; destination extrablock but backwards by len from where 0x80 pre-populated lea p2, [lane_data + _extra_block + 64] sub p2, len memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 mov unused_lanes, [state + _unused_lanes] jmp end_fast_copy return_null: xor job_rax, job_rax jmp return align 16 end_loop: mov job_rax, [lane_data + _job_in_lane] mov unused_lanes, [state 
+ _unused_lanes] mov qword [lane_data + _job_in_lane], 0 or dword [job_rax + _status], STS_COMPLETED_HMAC shl unused_lanes, 8 or unused_lanes, idx mov [state + _unused_lanes], unused_lanes mov p, [job_rax + _auth_tag_output] ; copy 12 bytes mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE] mov DWORD(tmp3), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE] bswap DWORD(tmp) bswap DWORD(tmp2) bswap DWORD(tmp3) mov [p + 0*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) mov [p + 1*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2) mov [p + 2*SHA1_DIGEST_WORD_SIZE], DWORD(tmp3) return: mov rbx, [rsp + _gpr_save + 8*0] mov rbp, [rsp + _gpr_save + 8*1] %ifndef LINUX mov rsi, [rsp + _gpr_save + 8*2] mov rdi, [rsp + _gpr_save + 8*3] %endif mov rsp, [rsp + _rsp_save] ; original SP ret intel-ipsec-mb-0.48/sse/mb_mgr_sse.c000066400000000000000000000632531321406316400173410ustar00rootroot00000000000000/******************************************************************************* Copyright (c) 2012-2017, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/ #include #include #include #include "os.h" #ifdef __WIN32 #include #endif #include "mb_mgr.h" #include "save_xmms.h" #include "asm.h" #ifndef NO_GCM #include "gcm_defines.h" #endif #include "des.h" JOB_AES_HMAC *submit_job_aes128_enc_sse(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_aes128_enc_sse(MB_MGR_AES_OOO *state); JOB_AES_HMAC *submit_job_aes192_enc_sse(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_aes192_enc_sse(MB_MGR_AES_OOO *state); JOB_AES_HMAC *submit_job_aes256_enc_sse(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_aes256_enc_sse(MB_MGR_AES_OOO *state); JOB_AES_HMAC *submit_job_hmac_sse(MB_MGR_HMAC_SHA_1_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_hmac_sse(MB_MGR_HMAC_SHA_1_OOO *state); JOB_AES_HMAC *submit_job_hmac_ni_sse(MB_MGR_HMAC_SHA_1_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_hmac_ni_sse(MB_MGR_HMAC_SHA_1_OOO *state); JOB_AES_HMAC *submit_job_hmac_sha_224_sse(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_hmac_sha_224_sse(MB_MGR_HMAC_SHA_256_OOO *state); JOB_AES_HMAC *submit_job_hmac_sha_224_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_hmac_sha_224_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state); JOB_AES_HMAC *submit_job_hmac_sha_256_sse(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_hmac_sha_256_sse(MB_MGR_HMAC_SHA_256_OOO *state); JOB_AES_HMAC *submit_job_hmac_sha_256_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_hmac_sha_256_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state); JOB_AES_HMAC *submit_job_hmac_sha_384_sse(MB_MGR_HMAC_SHA_512_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_hmac_sha_384_sse(MB_MGR_HMAC_SHA_512_OOO *state); JOB_AES_HMAC *submit_job_hmac_sha_512_sse(MB_MGR_HMAC_SHA_512_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_hmac_sha_512_sse(MB_MGR_HMAC_SHA_512_OOO *state); JOB_AES_HMAC *submit_job_hmac_md5_sse(MB_MGR_HMAC_MD5_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_hmac_md5_sse(MB_MGR_HMAC_MD5_OOO *state); JOB_AES_HMAC *submit_job_aes_xcbc_sse(MB_MGR_AES_XCBC_OOO *state, JOB_AES_HMAC *job); JOB_AES_HMAC *flush_job_aes_xcbc_sse(MB_MGR_AES_XCBC_OOO *state); #define SAVE_XMMS save_xmms #define RESTORE_XMMS restore_xmms #define SUBMIT_JOB_AES128_ENC submit_job_aes128_enc_sse #define SUBMIT_JOB_AES128_DEC submit_job_aes128_dec_sse #define FLUSH_JOB_AES128_ENC flush_job_aes128_enc_sse #define SUBMIT_JOB_AES192_ENC submit_job_aes192_enc_sse #define SUBMIT_JOB_AES192_DEC submit_job_aes192_dec_sse #define FLUSH_JOB_AES192_ENC flush_job_aes192_enc_sse #define SUBMIT_JOB_AES256_ENC submit_job_aes256_enc_sse #define SUBMIT_JOB_AES256_DEC submit_job_aes256_dec_sse #define FLUSH_JOB_AES256_ENC flush_job_aes256_enc_sse #define SUBMIT_JOB_HMAC submit_job_hmac_sse #define FLUSH_JOB_HMAC flush_job_hmac_sse #define SUBMIT_JOB_HMAC_NI submit_job_hmac_ni_sse #define FLUSH_JOB_HMAC_NI flush_job_hmac_ni_sse #define SUBMIT_JOB_HMAC_SHA_224 submit_job_hmac_sha_224_sse #define FLUSH_JOB_HMAC_SHA_224 flush_job_hmac_sha_224_sse #define SUBMIT_JOB_HMAC_SHA_224_NI submit_job_hmac_sha_224_ni_sse #define FLUSH_JOB_HMAC_SHA_224_NI flush_job_hmac_sha_224_ni_sse #define SUBMIT_JOB_HMAC_SHA_256 submit_job_hmac_sha_256_sse #define FLUSH_JOB_HMAC_SHA_256 flush_job_hmac_sha_256_sse #define SUBMIT_JOB_HMAC_SHA_256_NI submit_job_hmac_sha_256_ni_sse #define FLUSH_JOB_HMAC_SHA_256_NI 
flush_job_hmac_sha_256_ni_sse #define SUBMIT_JOB_HMAC_SHA_384 submit_job_hmac_sha_384_sse #define FLUSH_JOB_HMAC_SHA_384 flush_job_hmac_sha_384_sse #define SUBMIT_JOB_HMAC_SHA_512 submit_job_hmac_sha_512_sse #define FLUSH_JOB_HMAC_SHA_512 flush_job_hmac_sha_512_sse #define SUBMIT_JOB_HMAC_MD5 submit_job_hmac_md5_sse #define FLUSH_JOB_HMAC_MD5 flush_job_hmac_md5_sse #define SUBMIT_JOB_AES_XCBC submit_job_aes_xcbc_sse #define FLUSH_JOB_AES_XCBC flush_job_aes_xcbc_sse #define SUBMIT_JOB_AES128_CNTR submit_job_aes128_cntr_sse #define SUBMIT_JOB_AES192_CNTR submit_job_aes192_cntr_sse #define SUBMIT_JOB_AES256_CNTR submit_job_aes256_cntr_sse #define AES_CBC_DEC_128 aes_cbc_dec_128_sse #define AES_CBC_DEC_192 aes_cbc_dec_192_sse #define AES_CBC_DEC_256 aes_cbc_dec_256_sse #define AES_CNTR_128 aes_cntr_128_sse #define AES_CNTR_192 aes_cntr_192_sse #define AES_CNTR_256 aes_cntr_256_sse #ifndef NO_GCM #define AES_GCM_DEC_128 aes_gcm_dec_128_sse #define AES_GCM_ENC_128 aes_gcm_enc_128_sse #define AES_GCM_DEC_192 aes_gcm_dec_192_sse #define AES_GCM_ENC_192 aes_gcm_enc_192_sse #define AES_GCM_DEC_256 aes_gcm_dec_256_sse #define AES_GCM_ENC_256 aes_gcm_enc_256_sse #endif /* NO_GCM */ /* ====================================================================== */ #define SUBMIT_JOB submit_job_sse #define FLUSH_JOB flush_job_sse #define SUBMIT_JOB_NOCHECK submit_job_nocheck_sse #define SUBMIT_JOB_AES128_DEC submit_job_aes128_dec_sse #define SUBMIT_JOB_AES192_DEC submit_job_aes192_dec_sse #define SUBMIT_JOB_AES256_DEC submit_job_aes256_dec_sse #define QUEUE_SIZE queue_size_sse /* ====================================================================== */ #define SUBMIT_JOB_AES_ENC SUBMIT_JOB_AES_ENC_SSE #define FLUSH_JOB_AES_ENC FLUSH_JOB_AES_ENC_SSE #define SUBMIT_JOB_AES_DEC SUBMIT_JOB_AES_DEC_SSE #define SUBMIT_JOB_HASH SUBMIT_JOB_HASH_SSE #define FLUSH_JOB_HASH FLUSH_JOB_HASH_SSE /* ====================================================================== */ #define AES_CFB_128_ONE aes_cfb_128_one_sse void aes128_cbc_mac_x4(AES_ARGS_x8 *args, uint64_t len); #define AES128_CBC_MAC aes128_cbc_mac_x4 #define FLUSH_JOB_AES_CCM_AUTH flush_job_aes_ccm_auth_arch #define SUBMIT_JOB_AES_CCM_AUTH submit_job_aes_ccm_auth_arch #define AES_CCM_MAX_JOBS 4 /* ====================================================================== */ /* Variable to decide between SIMD or SHAxNI OOO scheduler selection. */ enum SHA_EXTENSION_USAGE sse_sha_ext_usage = SHA_EXT_DETECT; /* * Used to decide if SHA1/SHA256 SIMD or SHA1NI OOO scheduler should be * called. 
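 * The default SHA_EXT_DETECT value is resolved once in init_mb_mgr_sse():
 * sha_extensions_supported() queries CPUID (leaf 7, EBX bit 29) and the
 * variable is set to SHA_EXT_PRESENT or SHA_EXT_NOT_PRESENT accordingly,
 * so later job submissions can select the SHAxNI or SIMD out-of-order
 * scheduler without probing the CPU again.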
*/ #define HASH_USE_SHAEXT sse_sha_ext_usage /* ====================================================================== */ struct cpuid_regs { UINT32 eax; UINT32 ebx; UINT32 ecx; UINT32 edx; }; /* * A C wrapper for CPUID opcode * * Parameters: * [in] leaf - CPUID leaf number (EAX) * [in] subleaf - CPUID sub-leaf number (ECX) * [out] out - registers structure to store results of CPUID into */ static void __mbcpuid(const unsigned leaf, const unsigned subleaf, struct cpuid_regs *out) { #ifdef _WIN32 /* Windows */ int regs[4]; __cpuidex(regs, leaf, subleaf); out->eax = regs[0]; out->ebx = regs[1]; out->ecx = regs[2]; out->edx = regs[3]; #else /* Linux */ #ifdef __x86_64__ asm volatile("mov %4, %%eax\n\t" "mov %5, %%ecx\n\t" "cpuid\n\t" "mov %%eax, %0\n\t" "mov %%ebx, %1\n\t" "mov %%ecx, %2\n\t" "mov %%edx, %3\n\t" : "=g" (out->eax), "=g" (out->ebx), "=g" (out->ecx), "=g" (out->edx) : "g" (leaf), "g" (subleaf) : "%eax", "%ebx", "%ecx", "%edx"); #else asm volatile("push %%ebx\n\t" "mov %4, %%eax\n\t" "mov %5, %%ecx\n\t" "cpuid\n\t" "mov %%eax, %0\n\t" "mov %%ebx, %1\n\t" "mov %%ecx, %2\n\t" "mov %%edx, %3\n\t" "pop %%ebx\n\t" : "=g" (out->eax), "=g" (out->ebx), "=g" (out->ecx), "=g" (out->edx) : "g" (leaf), "g" (subleaf) : "%eax", "%ecx", "%edx"); #endif #endif /* Linux */ } /* * Uses CPUID instruction to detected presence of SHA extensions. * * Return value: * 0 - SHA extensions not present * 1 - SHA extensions present */ static int sha_extensions_supported(void) { struct cpuid_regs r; /* Check highest leaf number. If less then 7 then SHA not supported. */ __mbcpuid(0x0, 0x0, &r); if (r.eax < 0x7) return 0; /* Check presence of SHA extensions in the extended feature flags */ __mbcpuid(0x7, 0x0, &r); if (r.ebx & (1 << 29)) return 1; return 0; } void init_mb_mgr_sse(MB_MGR *state) { unsigned int j; UINT8 *p; #ifdef HASH_USE_SHAEXT switch (HASH_USE_SHAEXT) { case SHA_EXT_PRESENT: break; case SHA_EXT_NOT_PRESENT: break; case SHA_EXT_DETECT: default: if (sha_extensions_supported()) HASH_USE_SHAEXT = SHA_EXT_PRESENT; else HASH_USE_SHAEXT = SHA_EXT_NOT_PRESENT; break; } #endif /* HASH_USE_SHAEXT */ /* Init AES out-of-order fields */ state->aes128_ooo.lens[0] = 0; state->aes128_ooo.lens[1] = 0; state->aes128_ooo.lens[2] = 0; state->aes128_ooo.lens[3] = 0; state->aes128_ooo.lens[4] = 0xFFFF; state->aes128_ooo.lens[5] = 0xFFFF; state->aes128_ooo.lens[6] = 0xFFFF; state->aes128_ooo.lens[7] = 0xFFFF; state->aes128_ooo.unused_lanes = 0xFF03020100; state->aes128_ooo.job_in_lane[0] = NULL; state->aes128_ooo.job_in_lane[1] = NULL; state->aes128_ooo.job_in_lane[2] = NULL; state->aes128_ooo.job_in_lane[3] = NULL; state->aes192_ooo.lens[0] = 0; state->aes192_ooo.lens[1] = 0; state->aes192_ooo.lens[2] = 0; state->aes192_ooo.lens[3] = 0; state->aes192_ooo.lens[4] = 0xFFFF; state->aes192_ooo.lens[5] = 0xFFFF; state->aes192_ooo.lens[6] = 0xFFFF; state->aes192_ooo.lens[7] = 0xFFFF; state->aes192_ooo.unused_lanes = 0xFF03020100; state->aes192_ooo.job_in_lane[0] = NULL; state->aes192_ooo.job_in_lane[1] = NULL; state->aes192_ooo.job_in_lane[2] = NULL; state->aes192_ooo.job_in_lane[3] = NULL; state->aes256_ooo.lens[0] = 0; state->aes256_ooo.lens[1] = 0; state->aes256_ooo.lens[2] = 0; state->aes256_ooo.lens[3] = 0; state->aes256_ooo.lens[4] = 0xFFFF; state->aes256_ooo.lens[5] = 0xFFFF; state->aes256_ooo.lens[6] = 0xFFFF; state->aes256_ooo.lens[7] = 0xFFFF; state->aes256_ooo.unused_lanes = 0xFF03020100; state->aes256_ooo.job_in_lane[0] = NULL; state->aes256_ooo.job_in_lane[1] = NULL; state->aes256_ooo.job_in_lane[2] = NULL; 
state->aes256_ooo.job_in_lane[3] = NULL; /* DOCSIS SEC BPI uses same settings as AES128 CBC */ state->docsis_sec_ooo.lens[0] = 0; state->docsis_sec_ooo.lens[1] = 0; state->docsis_sec_ooo.lens[2] = 0; state->docsis_sec_ooo.lens[3] = 0; state->docsis_sec_ooo.lens[4] = 0xFFFF; state->docsis_sec_ooo.lens[5] = 0xFFFF; state->docsis_sec_ooo.lens[6] = 0xFFFF; state->docsis_sec_ooo.lens[7] = 0xFFFF; state->docsis_sec_ooo.unused_lanes = 0xFF03020100; state->docsis_sec_ooo.job_in_lane[0] = NULL; state->docsis_sec_ooo.job_in_lane[1] = NULL; state->docsis_sec_ooo.job_in_lane[2] = NULL; state->docsis_sec_ooo.job_in_lane[3] = NULL; /* Init HMAC/SHA1 out-of-order fields */ state->hmac_sha_1_ooo.lens[0] = 0; state->hmac_sha_1_ooo.lens[1] = 0; state->hmac_sha_1_ooo.lens[2] = 0; state->hmac_sha_1_ooo.lens[3] = 0; state->hmac_sha_1_ooo.lens[4] = 0xFFFF; state->hmac_sha_1_ooo.lens[5] = 0xFFFF; state->hmac_sha_1_ooo.lens[6] = 0xFFFF; state->hmac_sha_1_ooo.lens[7] = 0xFFFF; state->hmac_sha_1_ooo.unused_lanes = 0xFF03020100; for (j = 0; j < SSE_NUM_SHA1_LANES; j++) { state->hmac_sha_1_ooo.ldata[j].job_in_lane = NULL; state->hmac_sha_1_ooo.ldata[j].extra_block[64] = 0x80; memset(state->hmac_sha_1_ooo.ldata[j].extra_block + 65, 0x00, 64+7); p = state->hmac_sha_1_ooo.ldata[j].outer_block; memset(p + 5*4 + 1, 0x00, 64 - 5*4 - 1 - 2); p[5*4] = 0x80; p[64-2] = 0x02; p[64-1] = 0xA0; } #ifdef HASH_USE_SHAEXT if (HASH_USE_SHAEXT == SHA_EXT_PRESENT) { /* Init HMAC/SHA1 NI out-of-order fields */ state->hmac_sha_1_ooo.lens[0] = 0; state->hmac_sha_1_ooo.lens[1] = 0; state->hmac_sha_1_ooo.lens[2] = 0xFFFF; state->hmac_sha_1_ooo.lens[3] = 0xFFFF; state->hmac_sha_1_ooo.lens[4] = 0xFFFF; state->hmac_sha_1_ooo.lens[5] = 0xFFFF; state->hmac_sha_1_ooo.lens[6] = 0xFFFF; state->hmac_sha_1_ooo.lens[7] = 0xFFFF; state->hmac_sha_1_ooo.unused_lanes = 0xFF0100; } #endif /* HASH_USE_SHAEXT */ /* Init HMAC/SHA224 out-of-order fields */ state->hmac_sha_224_ooo.lens[0] = 0; state->hmac_sha_224_ooo.lens[1] = 0; state->hmac_sha_224_ooo.lens[2] = 0; state->hmac_sha_224_ooo.lens[3] = 0; state->hmac_sha_224_ooo.lens[4] = 0xFFFF; state->hmac_sha_224_ooo.lens[5] = 0xFFFF; state->hmac_sha_224_ooo.lens[6] = 0xFFFF; state->hmac_sha_224_ooo.lens[7] = 0xFFFF; state->hmac_sha_224_ooo.unused_lanes = 0xFF03020100; for (j = 0; j < SSE_NUM_SHA256_LANES; j++) { state->hmac_sha_224_ooo.ldata[j].job_in_lane = NULL; state->hmac_sha_224_ooo.ldata[j].extra_block[64] = 0x80; memset(state->hmac_sha_224_ooo.ldata[j].extra_block + 65, 0x00, 64+7); p = state->hmac_sha_224_ooo.ldata[j].outer_block; memset(p + 8*4 + 1, 0x00, 64 - 8*4 - 1 - 2); p[7*4] = 0x80; /* digest 7 words long */ p[64-2] = 0x02; /* length in little endian = 0x02E0 */ p[64-1] = 0xE0; } #ifdef HASH_USE_SHAEXT if (HASH_USE_SHAEXT == SHA_EXT_PRESENT) { /* Init HMAC/SHA224 NI out-of-order fields */ state->hmac_sha_224_ooo.lens[0] = 0; state->hmac_sha_224_ooo.lens[1] = 0; state->hmac_sha_224_ooo.lens[2] = 0xFFFF; state->hmac_sha_224_ooo.lens[3] = 0xFFFF; state->hmac_sha_224_ooo.lens[4] = 0xFFFF; state->hmac_sha_224_ooo.lens[5] = 0xFFFF; state->hmac_sha_224_ooo.lens[6] = 0xFFFF; state->hmac_sha_224_ooo.lens[7] = 0xFFFF; state->hmac_sha_224_ooo.unused_lanes = 0xFF0100; } #endif /* HASH_USE_SHAEXT */ /* Init HMAC/SHA_256 out-of-order fields */ state->hmac_sha_256_ooo.lens[0] = 0; state->hmac_sha_256_ooo.lens[1] = 0; state->hmac_sha_256_ooo.lens[2] = 0; state->hmac_sha_256_ooo.lens[3] = 0; state->hmac_sha_256_ooo.lens[4] = 0xFFFF; state->hmac_sha_256_ooo.lens[5] = 0xFFFF; state->hmac_sha_256_ooo.lens[6] = 
0xFFFF; state->hmac_sha_256_ooo.lens[7] = 0xFFFF; state->hmac_sha_256_ooo.unused_lanes = 0xFF03020100; for (j = 0; j < SSE_NUM_SHA256_LANES; j++) { state->hmac_sha_256_ooo.ldata[j].job_in_lane = NULL; state->hmac_sha_256_ooo.ldata[j].extra_block[64] = 0x80; memset(state->hmac_sha_256_ooo.ldata[j].extra_block + 65, 0x00, 64+7); p = state->hmac_sha_256_ooo.ldata[j].outer_block; memset(p + 8*4 + 1, 0x00, 64 - 8*4 - 1 - 2); /* digest is 8*4 bytes long */ p[8*4] = 0x80; p[64-2] = 0x03; /* length of (opad (64*8) bits + 256 bits) * in hex is 0x300 */ p[64-1] = 0x00; } #ifdef HASH_USE_SHAEXT if (HASH_USE_SHAEXT == SHA_EXT_PRESENT) { /* Init HMAC/SHA256 NI out-of-order fields */ state->hmac_sha_256_ooo.lens[0] = 0; state->hmac_sha_256_ooo.lens[1] = 0; state->hmac_sha_256_ooo.lens[2] = 0xFFFF; state->hmac_sha_256_ooo.lens[3] = 0xFFFF; state->hmac_sha_256_ooo.lens[4] = 0xFFFF; state->hmac_sha_256_ooo.lens[5] = 0xFFFF; state->hmac_sha_256_ooo.lens[6] = 0xFFFF; state->hmac_sha_256_ooo.lens[7] = 0xFFFF; state->hmac_sha_256_ooo.unused_lanes = 0xFF0100; } #endif /* HASH_USE_SHAEXT */ /* Init HMAC/SHA384 out-of-order fields */ state->hmac_sha_384_ooo.lens[0] = 0; state->hmac_sha_384_ooo.lens[1] = 0; state->hmac_sha_384_ooo.lens[2] = 0xFFFF; state->hmac_sha_384_ooo.lens[3] = 0xFFFF; state->hmac_sha_384_ooo.lens[4] = 0xFFFF; state->hmac_sha_384_ooo.lens[5] = 0xFFFF; state->hmac_sha_384_ooo.lens[6] = 0xFFFF; state->hmac_sha_384_ooo.lens[7] = 0xFFFF; state->hmac_sha_384_ooo.unused_lanes = 0xFF0100; for (j = 0; j < SSE_NUM_SHA512_LANES; j++) { MB_MGR_HMAC_SHA_512_OOO *ctx = &state->hmac_sha_384_ooo; ctx->ldata[j].job_in_lane = NULL; ctx->ldata[j].extra_block[SHA_384_BLOCK_SIZE] = 0x80; memset(ctx->ldata[j].extra_block + (SHA_384_BLOCK_SIZE + 1), 0x00, SHA_384_BLOCK_SIZE + 7); p = ctx->ldata[j].outer_block; memset(p + SHA384_DIGEST_SIZE_IN_BYTES + 1, 0x00, /* special end point because this length is constant */ SHA_384_BLOCK_SIZE - SHA384_DIGEST_SIZE_IN_BYTES - 1 - 2); p[SHA384_DIGEST_SIZE_IN_BYTES] = 0x80; /* mark the end */ /* * hmac outer block length always of fixed size, it is OKey * length, a whole message block length, 1024 bits, with padding * plus the length of the inner digest, which is 384 bits * 1408 bits == 0x0580. The input message block needs to be * converted to big endian within the sha implementation * before use. 
*/ p[SHA_384_BLOCK_SIZE - 2] = 0x05; p[SHA_384_BLOCK_SIZE - 1] = 0x80; } /* Init HMAC/SHA512 out-of-order fields */ state->hmac_sha_512_ooo.lens[0] = 0; state->hmac_sha_512_ooo.lens[1] = 0; state->hmac_sha_512_ooo.lens[2] = 0xFFFF; state->hmac_sha_512_ooo.lens[3] = 0xFFFF; state->hmac_sha_512_ooo.lens[4] = 0xFFFF; state->hmac_sha_512_ooo.lens[5] = 0xFFFF; state->hmac_sha_512_ooo.lens[6] = 0xFFFF; state->hmac_sha_512_ooo.lens[7] = 0xFFFF; state->hmac_sha_512_ooo.unused_lanes = 0xFF0100; for (j = 0; j < SSE_NUM_SHA512_LANES; j++) { MB_MGR_HMAC_SHA_512_OOO *ctx = &state->hmac_sha_512_ooo; ctx->ldata[j].job_in_lane = NULL; ctx->ldata[j].extra_block[SHA_512_BLOCK_SIZE] = 0x80; memset(ctx->ldata[j].extra_block + (SHA_512_BLOCK_SIZE + 1), 0x00, SHA_512_BLOCK_SIZE + 7); p = ctx->ldata[j].outer_block; memset(p + SHA512_DIGEST_SIZE_IN_BYTES + 1, 0x00, /* special end point because this length is constant */ SHA_512_BLOCK_SIZE - SHA512_DIGEST_SIZE_IN_BYTES - 1 - 2); p[SHA512_DIGEST_SIZE_IN_BYTES] = 0x80; /* mark the end */ /* * hmac outer block length always of fixed size, it is OKey * length, a whole message block length, 1024 bits, with padding * plus the length of the inner digest, which is 512 bits * 1536 bits == 0x600. The input message block needs to be * converted to big endian within the sha implementation * before use. */ p[SHA_512_BLOCK_SIZE - 2] = 0x06; p[SHA_512_BLOCK_SIZE - 1] = 0x00; } /* Init HMAC/MD5 out-of-order fields */ state->hmac_md5_ooo.lens[0] = 0; state->hmac_md5_ooo.lens[1] = 0; state->hmac_md5_ooo.lens[2] = 0; state->hmac_md5_ooo.lens[3] = 0; state->hmac_md5_ooo.lens[4] = 0; state->hmac_md5_ooo.lens[5] = 0; state->hmac_md5_ooo.lens[6] = 0; state->hmac_md5_ooo.lens[7] = 0; state->hmac_md5_ooo.lens[8] = 0xFFFF; state->hmac_md5_ooo.lens[9] = 0xFFFF; state->hmac_md5_ooo.lens[10] = 0xFFFF; state->hmac_md5_ooo.lens[11] = 0xFFFF; state->hmac_md5_ooo.lens[12] = 0xFFFF; state->hmac_md5_ooo.lens[13] = 0xFFFF; state->hmac_md5_ooo.lens[14] = 0xFFFF; state->hmac_md5_ooo.lens[15] = 0xFFFF; state->hmac_md5_ooo.unused_lanes = 0xF76543210; for (j = 0; j < SSE_NUM_MD5_LANES; j++) { state->hmac_md5_ooo.ldata[j].job_in_lane = NULL; state->hmac_md5_ooo.ldata[j].extra_block[64] = 0x80; memset(state->hmac_md5_ooo.ldata[j].extra_block + 65, 0x00, 64 + 7); p = state->hmac_md5_ooo.ldata[j].outer_block; memset(p + (5 * 4) + 1, 0x00, 64 - (5 * 4) - 1 - 2); p[4*4] = 0x80; p[64-7] = 0x02; p[64-8] = 0x80; } /* Init AES/XCBC OOO fields */ state->aes_xcbc_ooo.lens[0] = 0; state->aes_xcbc_ooo.lens[1] = 0; state->aes_xcbc_ooo.lens[2] = 0; state->aes_xcbc_ooo.lens[3] = 0; state->aes_xcbc_ooo.lens[4] = 0xFFFF; state->aes_xcbc_ooo.lens[5] = 0xFFFF; state->aes_xcbc_ooo.lens[6] = 0xFFFF; state->aes_xcbc_ooo.lens[7] = 0xFFFF; state->aes_xcbc_ooo.unused_lanes = 0xFF03020100; for (j = 0; j < 4; j++) { state->aes_xcbc_ooo.ldata[j].job_in_lane = NULL; state->aes_xcbc_ooo.ldata[j].final_block[16] = 0x80; memset(state->aes_xcbc_ooo.ldata[j].final_block + 17, 0x00, 15); } /* Init AES-CCM auth out-of-order fields */ for (j = 0; j < 4; j++) { state->aes_ccm_ooo.init_done[j] = 0; state->aes_ccm_ooo.lens[j] = 0; state->aes_ccm_ooo.job_in_lane[j] = NULL; } state->aes_ccm_ooo.unused_lanes = 0xF3210; /* Init "in order" components */ state->next_job = 0; state->earliest_job = -1; /* set SSE handlers */ state->get_next_job = get_next_job_sse; state->submit_job = submit_job_sse; state->submit_job_nocheck = submit_job_nocheck_sse; state->get_completed_job = get_completed_job_sse; state->flush_job = flush_job_sse; state->queue_size = 
queue_size_sse; state->keyexp_128 = aes_keyexp_128_sse; state->keyexp_192 = aes_keyexp_192_sse; state->keyexp_256 = aes_keyexp_256_sse; } #include "mb_mgr_code.h" intel-ipsec-mb-0.48/sse/md5_x4x2_sse.asm000066400000000000000000000774461321406316400200170ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
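The handlers wired into the MB_MGR structure at the end of init_mb_mgr_sse() above form the multi-buffer job interface. The sketch below shows one plausible way an application could drive that interface; it is illustrative only. The process_jobs_sse() wrapper, the "mb_mgr.h" header name and the placeholder comment are assumptions, while the manager members (get_next_job, submit_job, get_completed_job, flush_job), the JOB_AES_HMAC job type and the STS_COMPLETED status come from the library's public API and the initialization code above.

#include <stddef.h>
#include "mb_mgr.h"  /* assumed public header providing MB_MGR, JOB_AES_HMAC, STS_COMPLETED */

/* Minimal usage sketch: submit jobs through the handlers set by init_mb_mgr_sse() */
static void process_jobs_sse(MB_MGR *mgr, unsigned num_jobs)
{
        JOB_AES_HMAC *job;
        unsigned i;

        init_mb_mgr_sse(mgr);

        for (i = 0; i < num_jobs; i++) {
                job = mgr->get_next_job(mgr);
                /* ... fill in cipher/hash parameters here:
                 * keys, IV, src/dst pointers, lengths, chain order ... */
                job = mgr->submit_job(mgr);
                while (job != NULL) {
                        if (job->status != STS_COMPLETED) {
                                /* handle error */
                        }
                        job = mgr->get_completed_job(mgr);
                }
        }

        /* drain jobs still held inside the out-of-order managers */
        while ((job = mgr->flush_job(mgr)) != NULL) {
                if (job->status != STS_COMPLETED) {
                        /* handle error */
                }
        }
}

Because the SSE kernels batch several lanes per algorithm, submit_job() may return NULL many times before a batch fills; the flush_job() loop at the end is what forces any partially filled lanes to complete.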
;; ;; code to compute octal MD5 using SSE ;; Stack must be aligned to 16 bytes before call ;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15 ;; Windows preserves: rcx rbp ;; ;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15 ;; Linux preserves: rdi rbp ;; ;; clobbers xmm0-15 %include "os.asm" %include "mb_mgr_datastruct.asm" section .data align=64 default rel align 64 MKGLOBAL(MD5_TABLE,data,internal) MD5_TABLE: dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478 dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756 dd 0x242070db, 0x242070db, 0x242070db, 0x242070db dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613 dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501 dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8 dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1 dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122 dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193 dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821 dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562 dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340 dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51 dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d dd 0x02441453, 0x02441453, 0x02441453, 0x02441453 dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681 dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8 dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6 dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6 dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87 dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905 dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8 dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9 dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942 dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681 dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122 dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44 dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9 dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60 dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70 dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6 dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085 dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05 dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039 dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5 dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8 dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665 dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244 dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97 dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7 dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039 dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3 dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92 dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1 dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0 dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314 dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1 dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82 dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235 
dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391 ONES: dd 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff section .text %ifdef LINUX ;; Linux Registers %define arg1 rdi %define arg2 rsi %define mem1 rcx %define mem2 rdx %else %define arg1 rcx %define arg2 rdx %define mem1 rdi %define mem2 rsi %endif ;; rbp is not clobbered %define inp0 r8 %define inp1 r9 %define inp2 r10 %define inp3 r11 %define inp4 r12 %define inp5 r13 %define inp6 r14 %define inp7 r15 %define TBL rax %define IDX rbx %define A xmm0 %define B xmm1 %define C xmm2 %define D xmm3 %define E xmm4 ; tmp %define F xmm5 ; tmp %define A2 xmm6 %define B2 xmm7 %define C2 xmm8 %define D2 xmm9 %define FUN E %define TMP F %define FUN2 xmm10 %define TMP2 xmm11 %define T0 xmm10 %define T1 xmm11 %define T2 xmm12 %define T3 xmm13 %define T4 xmm14 %define T5 xmm15 ; Stack Layout ; ; 470 DD2 ; 460 CC2 ; 450 BB2 ; 440 AA2 ; 430 DD ; 420 CC ; 410 BB ; 400 AA ; ; 3F0 data2[15] for lanes 7...4 \ ; ... \ ; 300 data2[0] for lanes 7...4 \ ; 2F0 data2[15] for lanes 3...0 > mem block 2 ; ... / ; 210 data2[1] for lanes 3...0 / ; 200 data2[0] for lanes 3...0 / ; ; 1F0 data1[15] for lanes 7...4 \ ; ... \ ; 100 data1[0] for lanes 7...4 \ ; F0 data1[15] for lanes 3...0 > mem block 1 ; ... / ; 10 data1[1] for lanes 3...0 / ; 0 data1[0] for lanes 3...0 / ; stack size must be an odd multiple of 8 bytes in size struc STACK _DATA: reso 2*2*16 ; 2 blocks * 2 sets of lanes * 16 regs _DIGEST: reso 8 ; stores AA-DD, AA2-DD2 resb 8 ; for alignment endstruc %define STACK_SIZE STACK_size %define AA rsp + _DIGEST + 16*0 %define BB rsp + _DIGEST + 16*1 %define CC rsp + _DIGEST + 16*2 %define DD rsp + _DIGEST + 16*3 %define AA2 rsp + _DIGEST + 16*4 %define BB2 rsp + _DIGEST + 16*5 %define CC2 rsp + _DIGEST + 16*6 %define DD2 rsp + _DIGEST + 16*7 ;; ;; MD5 left rotations (number of bits) ;; rot11 equ 7 rot12 equ 12 rot13 equ 17 rot14 equ 22 rot21 equ 5 rot22 equ 9 rot23 equ 14 rot24 equ 20 rot31 equ 4 rot32 equ 11 rot33 equ 16 rot34 equ 23 rot41 equ 6 rot42 equ 10 rot43 equ 15 rot44 equ 21 ; transpose r0, r1, r2, r3, t0, t1 ; "transpose" data in {r0..r3} using temps {t0..t3} ; Input looks like: {r0 r1 r2 r3} ; r0 = {a3 a2 a1 a0} ; r1 = {b3 b2 b1 b0} ; r2 = {c3 c2 c1 c0} ; r3 = {d3 d2 d1 d0} ; ; output looks like: {t0 r1 r0 r3} ; t0 = {d0 c0 b0 a0} ; r1 = {d1 c1 b1 a1} ; r0 = {d2 c2 b2 a2} ; r3 = {d3 c3 b3 a3} ; %macro TRANSPOSE 6 %define %%r0 %1 %define %%r1 %2 %define %%r2 %3 %define %%r3 %4 %define %%t0 %5 %define %%t1 %6 movdqa %%t0, %%r0 shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} movdqa %%t1, %%r2 shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} movdqa %%r1, %%t0 shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} movdqa %%r3, %%r0 shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2} shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} %endmacro ;; ;; Magic functions defined in RFC 1321 ;; ; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z)))) %macro MAGIC_F 4 %define %%F %1 %define %%X %2 %define %%Y %3 %define %%Z %4 movdqa %%F,%%Z pxor %%F,%%Y pand %%F,%%X pxor %%F,%%Z %endmacro ; macro MAGIC_G F,X,Y,Z ;; F = F((Z),(X),(Y)) %macro MAGIC_G 4 %define %%F %1 %define %%X %2 %define %%Y %3 %define %%Z %4 MAGIC_F %%F,%%Z,%%X,%%Y %endmacro ; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z)) %macro MAGIC_H 4 %define %%F %1 %define %%X %2 %define %%Y %3 %define %%Z %4 movdqa %%F,%%Z pxor %%F,%%Y pxor %%F,%%X 
%endmacro ; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z))) %macro MAGIC_I 4 %define %%F %1 %define %%X %2 %define %%Y %3 %define %%Z %4 movdqa %%F,%%Z pxor %%F,[rel ONES] ; pnot %%F por %%F,%%X pxor %%F,%%Y %endmacro ; PROLD reg, imm, tmp %macro PROLD 3 %define %%reg %1 %define %%imm %2 %define %%tmp %3 movdqa %%tmp, %%reg psrld %%tmp, (32-%%imm) pslld %%reg, %%imm por %%reg, %%tmp %endmacro ;; ;; single MD5 step ;; ;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot) ;; ; macro MD5_STEP1 MAGIC_FUN, A,B,C,D, A2,B2,C3,D2, FUN, TMP, data, MD5const, nrot %macro MD5_STEP1 14 %define %%MAGIC_FUN %1 %define %%A %2 %define %%B %3 %define %%C %4 %define %%D %5 %define %%A2 %6 %define %%B2 %7 %define %%C2 %8 %define %%D2 %9 %define %%FUN %10 %define %%TMP %11 %define %%data %12 %define %%MD5const %13 %define %%nrot %14 paddd %%A, %%MD5const paddd %%A2, %%MD5const paddd %%A, [%%data] paddd %%A2, [%%data + 16*16] %%MAGIC_FUN %%FUN, %%B,%%C,%%D paddd %%A, %%FUN %%MAGIC_FUN %%FUN, %%B2,%%C2,%%D2 paddd %%A2, %%FUN PROLD %%A,%%nrot, %%TMP PROLD %%A2,%%nrot, %%TMP paddd %%A, %%B paddd %%A2, %%B2 %endmacro ;; ;; single MD5 step ;; ;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot) ;; ; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C3,D2, FUN, TMP, FUN2, TMP2, data, ; MD5const, nrot %macro MD5_STEP 16 %define %%MAGIC_FUN %1 %define %%A %2 %define %%B %3 %define %%C %4 %define %%D %5 %define %%A2 %6 %define %%B2 %7 %define %%C2 %8 %define %%D2 %9 %define %%FUN %10 %define %%TMP %11 %define %%FUN2 %12 %define %%TMP2 %13 %define %%data %14 %define %%MD5const %15 %define %%nrot %16 paddd %%A, %%MD5const paddd %%A2, %%MD5const paddd %%A, [%%data] paddd %%A2, [%%data + 16*16] %%MAGIC_FUN %%FUN, %%B,%%C,%%D %%MAGIC_FUN %%FUN2, %%B2,%%C2,%%D2 paddd %%A, %%FUN paddd %%A2, %%FUN2 PROLD %%A,%%nrot, %%TMP PROLD %%A2,%%nrot, %%TMP2 paddd %%A, %%B paddd %%A2, %%B2 %endmacro ; void md5_x4x2_sse(MD5_ARGS *args, UINT64 num_blks) ; arg 1 : pointer to MD5_ARGS structure ; arg 2 : number of blocks (>=1) ; align 32 MKGLOBAL(md5_x4x2_sse,function,internal) md5_x4x2_sse: sub rsp, STACK_SIZE ;; each row of transposed digests is split into 2 parts, the right half stored in A, and left half in A2 ;; Initialize digests movdqa A,[arg1 + 0*16 + 0*MD5_DIGEST_ROW_SIZE] movdqa B,[arg1 + 0*16 + 1*MD5_DIGEST_ROW_SIZE] movdqa C,[arg1 + 0*16 + 2*MD5_DIGEST_ROW_SIZE] movdqa D,[arg1 + 0*16 + 3*MD5_DIGEST_ROW_SIZE] ;; Initialize digests movdqa A2,[arg1 + 1*16 + 0*MD5_DIGEST_ROW_SIZE] movdqa B2,[arg1 + 1*16 + 1*MD5_DIGEST_ROW_SIZE] movdqa C2,[arg1 + 1*16 + 2*MD5_DIGEST_ROW_SIZE] movdqa D2,[arg1 + 1*16 + 3*MD5_DIGEST_ROW_SIZE] lea TBL, [rel MD5_TABLE] ;; load input pointers mov inp0,[arg1+_data_ptr_md5 +0*PTR_SZ] mov inp1,[arg1+_data_ptr_md5 +1*PTR_SZ] mov inp2,[arg1+_data_ptr_md5 +2*PTR_SZ] mov inp3,[arg1+_data_ptr_md5 +3*PTR_SZ] mov inp4,[arg1+_data_ptr_md5 +4*PTR_SZ] mov inp5,[arg1+_data_ptr_md5 +5*PTR_SZ] mov inp6,[arg1+_data_ptr_md5 +6*PTR_SZ] mov inp7,[arg1+_data_ptr_md5 +7*PTR_SZ] xor IDX, IDX ; Make ping-pong pointers to the two memory blocks mov mem1, rsp lea mem2, [rsp + 16*16*2] ;; Load first block of data and save back to stack %assign I 0 %rep 4 movdqu T2,[inp0+IDX+I*16] movdqu T1,[inp1+IDX+I*16] movdqu T4,[inp2+IDX+I*16] movdqu T3,[inp3+IDX+I*16] TRANSPOSE T2, T1, T4, T3, T0, T5 movdqa [mem1+(I*4+0)*16],T0 movdqa [mem1+(I*4+1)*16],T1 movdqa [mem1+(I*4+2)*16],T2 movdqa [mem1+(I*4+3)*16],T3 movdqu T2,[inp4+IDX+I*16] movdqu T1,[inp5+IDX+I*16] movdqu T4,[inp6+IDX+I*16] movdqu T3,[inp7+IDX+I*16] TRANSPOSE T2, T1, T4, T3, T0, T5 movdqa 
[mem1+(I*4+0)*16 + 16*16],T0 movdqa [mem1+(I*4+1)*16 + 16*16],T1 movdqa [mem1+(I*4+2)*16 + 16*16],T2 movdqa [mem1+(I*4+3)*16 + 16*16],T3 %assign I (I+1) %endrep lloop: ; save old digests movdqa [AA], A movdqa [BB], B movdqa [CC], C movdqa [DD], D ; save old digests movdqa [AA2], A2 movdqa [BB2], B2 movdqa [CC2], C2 movdqa [DD2], D2 add IDX, 4*16 sub arg2, 1 je lastblock MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+ 0*16], rot11 MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 1*16, [TBL+ 1*16], rot12 MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+ 2*16], rot13 MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 3*16, [TBL+ 3*16], rot14 MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+ 4*16], rot11 MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 5*16, [TBL+ 5*16], rot12 MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+ 6*16], rot13 MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 7*16, [TBL+ 7*16], rot14 %assign I 0 movdqu T2,[inp0+IDX+I*16] movdqu T1,[inp1+IDX+I*16] movdqu T4,[inp2+IDX+I*16] movdqu T3,[inp3+IDX+I*16] TRANSPOSE T2, T1, T4, T3, T0, T5 movdqa [mem2+(I*4+0)*16],T0 movdqa [mem2+(I*4+1)*16],T1 movdqa [mem2+(I*4+2)*16],T2 movdqa [mem2+(I*4+3)*16],T3 MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+ 8*16], rot11 MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 9*16, [TBL+ 9*16], rot12 MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+10*16], rot13 MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +11*16, [TBL+11*16], rot14 MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+12*16], rot11 MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +13*16, [TBL+13*16], rot12 MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+14*16], rot13 MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +15*16, [TBL+15*16], rot14 movdqu T2,[inp4+IDX+I*16] movdqu T1,[inp5+IDX+I*16] movdqu T4,[inp6+IDX+I*16] movdqu T3,[inp7+IDX+I*16] TRANSPOSE T2, T1, T4, T3, T0, T5 movdqa [mem2+(I*4+0)*16 + 16*16],T0 movdqa [mem2+(I*4+1)*16 + 16*16],T1 movdqa [mem2+(I*4+2)*16 + 16*16],T2 movdqa [mem2+(I*4+3)*16 + 16*16],T3 %assign I (I+1) MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+16*16], rot21 MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 6*16, [TBL+17*16], rot22 MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+18*16], rot23 MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 0*16, [TBL+19*16], rot24 MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+20*16], rot21 MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +10*16, [TBL+21*16], rot22 MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+22*16], rot23 MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 4*16, [TBL+23*16], rot24 movdqu T2,[inp0+IDX+I*16] movdqu T1,[inp1+IDX+I*16] movdqu T4,[inp2+IDX+I*16] movdqu T3,[inp3+IDX+I*16] TRANSPOSE T2, T1, T4, T3, T0, T5 movdqa [mem2+(I*4+0)*16],T0 movdqa [mem2+(I*4+1)*16],T1 movdqa [mem2+(I*4+2)*16],T2 movdqa [mem2+(I*4+3)*16],T3 MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+24*16], rot21 MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +14*16, [TBL+25*16], rot22 MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+26*16], rot23 MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 8*16, [TBL+27*16], rot24 MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+28*16], 
rot21 MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 2*16, [TBL+29*16], rot22 MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+30*16], rot23 MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +12*16, [TBL+31*16], rot24 movdqu T2,[inp4+IDX+I*16] movdqu T1,[inp5+IDX+I*16] movdqu T4,[inp6+IDX+I*16] movdqu T3,[inp7+IDX+I*16] TRANSPOSE T2, T1, T4, T3, T0, T5 movdqa [mem2+(I*4+0)*16 + 16*16],T0 movdqa [mem2+(I*4+1)*16 + 16*16],T1 movdqa [mem2+(I*4+2)*16 + 16*16],T2 movdqa [mem2+(I*4+3)*16 + 16*16],T3 %assign I (I+1) MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+32*16], rot31 MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 8*16, [TBL+33*16], rot32 MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+34*16], rot33 MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +14*16, [TBL+35*16], rot34 MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+36*16], rot31 MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 4*16, [TBL+37*16], rot32 MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+38*16], rot33 MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +10*16, [TBL+39*16], rot34 movdqu T2,[inp0+IDX+I*16] movdqu T1,[inp1+IDX+I*16] movdqu T4,[inp2+IDX+I*16] movdqu T3,[inp3+IDX+I*16] TRANSPOSE T2, T1, T4, T3, T0, T5 movdqa [mem2+(I*4+0)*16],T0 movdqa [mem2+(I*4+1)*16],T1 movdqa [mem2+(I*4+2)*16],T2 movdqa [mem2+(I*4+3)*16],T3 MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+40*16], rot31 MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 0*16, [TBL+41*16], rot32 MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+42*16], rot33 MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 6*16, [TBL+43*16], rot34 MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+44*16], rot31 MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +12*16, [TBL+45*16], rot32 MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+46*16], rot33 MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 2*16, [TBL+47*16], rot34 movdqu T2,[inp4+IDX+I*16] movdqu T1,[inp5+IDX+I*16] movdqu T4,[inp6+IDX+I*16] movdqu T3,[inp7+IDX+I*16] TRANSPOSE T2, T1, T4, T3, T0, T5 movdqa [mem2+(I*4+0)*16 + 16*16],T0 movdqa [mem2+(I*4+1)*16 + 16*16],T1 movdqa [mem2+(I*4+2)*16 + 16*16],T2 movdqa [mem2+(I*4+3)*16 + 16*16],T3 %assign I (I+1) MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+48*16], rot41 MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 7*16, [TBL+49*16], rot42 MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+50*16], rot43 MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 5*16, [TBL+51*16], rot44 MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+52*16], rot41 MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 3*16, [TBL+53*16], rot42 MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+54*16], rot43 MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 1*16, [TBL+55*16], rot44 movdqu T2,[inp0+IDX+I*16] movdqu T1,[inp1+IDX+I*16] movdqu T4,[inp2+IDX+I*16] movdqu T3,[inp3+IDX+I*16] TRANSPOSE T2, T1, T4, T3, T0, T5 movdqa [mem2+(I*4+0)*16],T0 movdqa [mem2+(I*4+1)*16],T1 movdqa [mem2+(I*4+2)*16],T2 movdqa [mem2+(I*4+3)*16],T3 MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+56*16], rot41 MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +15*16, [TBL+57*16], rot42 MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+58*16], 
rot43 MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +13*16, [TBL+59*16], rot44 MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+60*16], rot41 MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +11*16, [TBL+61*16], rot42 MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+62*16], rot43 MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 9*16, [TBL+63*16], rot44 movdqu T2,[inp4+IDX+I*16] movdqu T1,[inp5+IDX+I*16] movdqu T4,[inp6+IDX+I*16] movdqu T3,[inp7+IDX+I*16] TRANSPOSE T2, T1, T4, T3, T0, T5 movdqa [mem2+(I*4+0)*16 + 16*16],T0 movdqa [mem2+(I*4+1)*16 + 16*16],T1 movdqa [mem2+(I*4+2)*16 + 16*16],T2 movdqa [mem2+(I*4+3)*16 + 16*16],T3 %assign I (I+1) paddd A,[AA] paddd B,[BB] paddd C,[CC] paddd D,[DD] paddd A2,[AA2] paddd B2,[BB2] paddd C2,[CC2] paddd D2,[DD2] ; swap mem1 and mem2 xchg mem1, mem2 jmp lloop lastblock: MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+ 0*16], rot11 MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+ 1*16], rot12 MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+ 2*16], rot13 MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+ 3*16], rot14 MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+ 4*16], rot11 MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+ 5*16], rot12 MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+ 6*16], rot13 MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+ 7*16], rot14 MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+ 8*16], rot11 MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+ 9*16], rot12 MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+10*16], rot13 MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+11*16], rot14 MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+12*16], rot11 MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+13*16], rot12 MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+14*16], rot13 MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+15*16], rot14 MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+16*16], rot21 MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+17*16], rot22 MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+18*16], rot23 MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+19*16], rot24 MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+20*16], rot21 MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+21*16], rot22 MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+22*16], rot23 MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+23*16], rot24 MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+24*16], rot21 MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+25*16], rot22 MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+26*16], rot23 MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+27*16], rot24 MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+28*16], rot21 
MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+29*16], rot22 MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+30*16], rot23 MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+31*16], rot24 MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+32*16], rot31 MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+33*16], rot32 MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+34*16], rot33 MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+35*16], rot34 MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+36*16], rot31 MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+37*16], rot32 MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+38*16], rot33 MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+39*16], rot34 MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+40*16], rot31 MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+41*16], rot32 MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+42*16], rot33 MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+43*16], rot34 MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+44*16], rot31 MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+45*16], rot32 MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+46*16], rot33 MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+47*16], rot34 MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+48*16], rot41 MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+49*16], rot42 MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+50*16], rot43 MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+51*16], rot44 MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+52*16], rot41 MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+53*16], rot42 MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+54*16], rot43 MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+55*16], rot44 MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+56*16], rot41 MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+57*16], rot42 MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+58*16], rot43 MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+59*16], rot44 MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+60*16], rot41 MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+61*16], rot42 MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+62*16], rot43 MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+63*16], rot44 paddd A,[AA] paddd B,[BB] paddd C,[CC] paddd D,[DD] paddd A2,[AA2] paddd B2,[BB2] paddd C2,[CC2] paddd D2,[DD2] ; write out digests movdqu [arg1 + 0*16 + 0*MD5_DIGEST_ROW_SIZE], A movdqu [arg1 + 0*16 + 1*MD5_DIGEST_ROW_SIZE], B movdqu [arg1 + 0*16 + 2*MD5_DIGEST_ROW_SIZE], C movdqu [arg1 + 0*16 + 3*MD5_DIGEST_ROW_SIZE], D movdqu 
[arg1 + 1*16 + 0*MD5_DIGEST_ROW_SIZE], A2 movdqu [arg1 + 1*16 + 1*MD5_DIGEST_ROW_SIZE], B2 movdqu [arg1 + 1*16 + 2*MD5_DIGEST_ROW_SIZE], C2 movdqu [arg1 + 1*16 + 3*MD5_DIGEST_ROW_SIZE], D2 ;; update input pointers add inp0, IDX add inp1, IDX add inp2, IDX add inp3, IDX add inp4, IDX add inp5, IDX add inp6, IDX add inp7, IDX mov [arg1 +_data_ptr_md5 + 0*PTR_SZ], inp0 mov [arg1 +_data_ptr_md5 + 1*PTR_SZ], inp1 mov [arg1 +_data_ptr_md5 + 2*PTR_SZ], inp2 mov [arg1 +_data_ptr_md5 + 3*PTR_SZ], inp3 mov [arg1 +_data_ptr_md5 + 4*PTR_SZ], inp4 mov [arg1 +_data_ptr_md5 + 5*PTR_SZ], inp5 mov [arg1 +_data_ptr_md5 + 6*PTR_SZ], inp6 mov [arg1 +_data_ptr_md5 + 7*PTR_SZ], inp7 ;;;;;;;;;;;;;;;; ;; Postamble add rsp, STACK_SIZE ret intel-ipsec-mb-0.48/sse/sha1_mult_sse.asm000066400000000000000000000244101321406316400203210ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
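For reference, each 32-bit SIMD lane of the MD5_STEP/MD5_STEP1 macros in the kernel above computes the scalar round operation documented in its comments, A = B + ROL32((A + MAGIC(B,C,D) + data + const), nrot), with MAGIC_F taken from RFC 1321. A plain C model of one F-round is sketched below; the helper names (rol32, md5_magic_f, md5_step_f) are illustrative and not part of the library.

#include <stdint.h>

/* rotate left by n bits (1 <= n <= 31) */
static inline uint32_t rol32(uint32_t x, unsigned n)
{
        return (x << n) | (x >> (32 - n));
}

/* F = (Z ^ (X & (Y ^ Z))) -- same expression as the MAGIC_F macro */
static inline uint32_t md5_magic_f(uint32_t x, uint32_t y, uint32_t z)
{
        return z ^ (x & (y ^ z));
}

/* one MD5 F-round for a single lane:
 * A = B + ROL32(A + F(B,C,D) + data + k, nrot)
 */
static inline uint32_t md5_step_f(uint32_t a, uint32_t b, uint32_t c,
                                  uint32_t d, uint32_t data, uint32_t k,
                                  unsigned nrot)
{
        return b + rol32(a + md5_magic_f(b, c, d) + data + k, nrot);
}

The assembly performs exactly this arithmetic with paddd/pxor/pand plus the PROLD rotate macro, but on four lanes at once (and twice over for the A/A2 register sets).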
;; %include "os.asm" ;%define DO_DBGPRINT %include "dbgprint.asm" %include "mb_mgr_datastruct.asm" section .data default rel align 16 PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 dq 0x0405060700010203, 0x0c0d0e0f08090a0b K00_19: ;ddq 0x5A8279995A8279995A8279995A827999 dq 0x5A8279995A827999, 0x5A8279995A827999 K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 section .text ;; code to compute quad SHA1 using SSE ;; derived from ...\sha1_multiple\sha1_quad4.asm ;; variation of sha1_mult2.asm : clobbers all xmm regs, rcx left intact ;; rbx, rsi, rdi, rbp, r12-r15 left intact ;; This version is not safe to call from C/C++ ;; Stack must be aligned to 16 bytes before call ;; Windows clobbers: rax rdx r8 r9 r10 r11 ;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15 ;; ;; Linux clobbers: rax rsi r8 r9 r10 r11 ;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15 ;; ;; clobbers xmm0-15 ; transpose r0, r1, r2, r3, t0, t1 ; "transpose" data in {r0..r3} using temps {t0..t3} ; Input looks like: {r0 r1 r2 r3} ; r0 = {a3 a2 a1 a0} ; r1 = {b3 b2 b1 b0} ; r2 = {c3 c2 c1 c0} ; r3 = {d3 d2 d1 d0} ; ; output looks like: {t0 r1 r0 r3} ; t0 = {d0 c0 b0 a0} ; r1 = {d1 c1 b1 a1} ; r0 = {d2 c2 b2 a2} ; r3 = {d3 c3 b3 a3} ; %macro TRANSPOSE 6 %define %%r0 %1 %define %%r1 %2 %define %%r2 %3 %define %%r3 %4 %define %%t0 %5 %define %%t1 %6 movaps %%t0, %%r0 ; t0 = {a3 a2 a1 a0} shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} movaps %%t1, %%r2 ; t1 = {c3 c2 c1 c0} shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} movaps %%r1, %%t0 ; r1 = {b1 b0 a1 a0} shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} movaps %%r3, %%r0 ; r3 = {b3 b2 a3 a2} shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2} shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} %endmacro ;; ;; Magic functions defined in FIPS 180-1 ;; ; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D))) %macro MAGIC_F0 5 %define %%regF %1 %define %%regB %2 %define %%regC %3 %define %%regD %4 %define %%regT %5 movdqa %%regF,%%regC pxor %%regF,%%regD pand %%regF,%%regB pxor %%regF,%%regD %endmacro ; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D) %macro MAGIC_F1 5 %define %%regF %1 %define %%regB %2 %define %%regC %3 %define %%regD %4 %define %%regT %5 movdqa %%regF,%%regD pxor %%regF,%%regC pxor %%regF,%%regB %endmacro ; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D)) %macro MAGIC_F2 5 %define %%regF %1 %define %%regB %2 %define %%regC %3 %define %%regD %4 %define %%regT %5 movdqa %%regF,%%regB movdqa %%regT,%%regB por %%regF,%%regC pand %%regT,%%regC pand %%regF,%%regD por %%regF,%%regT %endmacro ; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D) %macro MAGIC_F3 5 %define %%regF %1 %define %%regB %2 %define %%regC %3 %define %%regD %4 %define %%regT %5 MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT %endmacro ; PROLD reg, imm, tmp %macro PROLD 3 %define %%reg %1 %define %%imm %2 %define %%tmp %3 movdqa %%tmp, %%reg pslld %%reg, %%imm psrld %%tmp, (32-%%imm) por %%reg, %%tmp %endmacro %macro SHA1_STEP_00_15 10 %define %%regA %1 %define %%regB %2 %define %%regC %3 %define %%regD %4 %define %%regE %5 %define %%regT %6 %define %%regF %7 %define %%memW %8 %define %%immCNT %9 %define %%MAGIC %10 paddd %%regE,%%immCNT paddd 
%%regE,[rsp + (%%memW * 16)] movdqa %%regT,%%regA PROLD %%regT,5, %%regF paddd %%regE,%%regT %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) PROLD %%regB,30, %%regT paddd %%regE,%%regF %endmacro %macro SHA1_STEP_16_79 10 %define %%regA %1 %define %%regB %2 %define %%regC %3 %define %%regD %4 %define %%regE %5 %define %%regT %6 %define %%regF %7 %define %%memW %8 %define %%immCNT %9 %define %%MAGIC %10 paddd %%regE,%%immCNT movdqa W14, [rsp + ((%%memW - 14) & 15) * 16] pxor W16, W14 pxor W16, [rsp + ((%%memW - 8) & 15) * 16] pxor W16, [rsp + ((%%memW - 3) & 15) * 16] movdqa %%regF, W16 pslld W16, 1 psrld %%regF, (32-1) por %%regF, W16 ROTATE_W movdqa [rsp + ((%%memW - 0) & 15) * 16],%%regF paddd %%regE,%%regF movdqa %%regT,%%regA PROLD %%regT,5, %%regF paddd %%regE,%%regT %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) PROLD %%regB,30, %%regT paddd %%regE,%%regF %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; FRAMESZ must be an odd multiple of 8 %define FRAMESZ 16*16 + 8 %define MOVPS movdqu %ifdef LINUX %define arg1 rdi %define arg2 rsi %else %define arg1 rcx %define arg2 rdx %endif %define inp0 r8 %define inp1 r9 %define inp2 r10 %define inp3 r11 %define IDX rax %define A xmm0 %define B xmm1 %define C xmm2 %define D xmm3 %define E xmm4 %define F xmm5 ; tmp %define G xmm6 ; tmp %define TMP G %define FUN F %define K xmm7 %define AA xmm8 %define BB xmm9 %define CC xmm10 %define DD xmm11 %define EE xmm12 %define T0 xmm6 %define T1 xmm7 %define T2 xmm8 %define T3 xmm9 %define T4 xmm10 %define T5 xmm11 %define W14 xmm13 %define W15 xmm14 %define W16 xmm15 %macro ROTATE_ARGS 0 %xdefine TMP_ E %xdefine E D %xdefine D C %xdefine C B %xdefine B A %xdefine A TMP_ %endm %macro ROTATE_W 0 %xdefine TMP_ W16 %xdefine W16 W15 %xdefine W15 W14 %xdefine W14 TMP_ %endm align 32 ; XMM registers are clobbered. 
Saving/restoring must be done at a higher level ; void sha1_mult_sse(SHA1_ARGS *args, UINT32 size_in_blocks); ; arg 1 : rcx : pointer to args ; arg 2 : rdx : size (in blocks) ;; assumed to be >= 1 MKGLOBAL(sha1_mult_sse,function,internal) sha1_mult_sse: sub rsp, FRAMESZ ;; Initialize digests movdqa A, [arg1 + 0*SHA1_DIGEST_ROW_SIZE] movdqa B, [arg1 + 1*SHA1_DIGEST_ROW_SIZE] movdqa C, [arg1 + 2*SHA1_DIGEST_ROW_SIZE] movdqa D, [arg1 + 3*SHA1_DIGEST_ROW_SIZE] movdqa E, [arg1 + 4*SHA1_DIGEST_ROW_SIZE] DBGPRINTL_XMM "Sha1-SSE Incoming transposed digest", A, B, C, D, E ;; load input pointers mov inp0,[arg1 + _data_ptr_sha1 + 0*PTR_SZ] mov inp1,[arg1 + _data_ptr_sha1 + 1*PTR_SZ] mov inp2,[arg1 + _data_ptr_sha1 + 2*PTR_SZ] mov inp3,[arg1 + _data_ptr_sha1 + 3*PTR_SZ] DBGPRINTL64 "Sha1-SSE Incoming data ptrs", inp0, inp1, inp2, inp3 xor IDX, IDX lloop: movdqa F, [rel PSHUFFLE_BYTE_FLIP_MASK] %assign I 0 %rep 4 MOVPS T2,[inp0+IDX] MOVPS T1,[inp1+IDX] MOVPS T4,[inp2+IDX] MOVPS T3,[inp3+IDX] TRANSPOSE T2, T1, T4, T3, T0, T5 DBGPRINTL_XMM "sha1 incoming data", T0, T1, T2, T3 pshufb T0, F movdqa [rsp+(I*4+0)*16],T0 pshufb T1, F movdqa [rsp+(I*4+1)*16],T1 pshufb T2, F movdqa [rsp+(I*4+2)*16],T2 pshufb T3, F movdqa [rsp+(I*4+3)*16],T3 add IDX, 4*4 %assign I (I+1) %endrep ; save old digests movdqa AA, A movdqa BB, B movdqa CC, C movdqa DD, D movdqa EE, E ;; ;; perform 0-79 steps ;; movdqa K, [rel K00_19] ;; do rounds 0...15 %assign I 0 %rep 16 SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 ROTATE_ARGS %assign I (I+1) %endrep ;; do rounds 16...19 movdqa W16, [rsp + ((16 - 16) & 15) * 16] movdqa W15, [rsp + ((16 - 15) & 15) * 16] %rep 4 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 ROTATE_ARGS %assign I (I+1) %endrep ;; do rounds 20...39 movdqa K, [rel K20_39] %rep 20 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1 ROTATE_ARGS %assign I (I+1) %endrep ;; do rounds 40...59 movdqa K, [rel K40_59] %rep 20 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2 ROTATE_ARGS %assign I (I+1) %endrep ;; do rounds 60...79 movdqa K, [rel K60_79] %rep 20 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3 ROTATE_ARGS %assign I (I+1) %endrep paddd A,AA paddd B,BB paddd C,CC paddd D,DD paddd E,EE sub arg2, 1 jne lloop ; write out digests movdqa [arg1 + 0*SHA1_DIGEST_ROW_SIZE], A movdqa [arg1 + 1*SHA1_DIGEST_ROW_SIZE], B movdqa [arg1 + 2*SHA1_DIGEST_ROW_SIZE], C movdqa [arg1 + 3*SHA1_DIGEST_ROW_SIZE], D movdqa [arg1 + 4*SHA1_DIGEST_ROW_SIZE], E DBGPRINTL_XMM "Sha1 Outgoing transposed digest", A, B, C, D, E ; update input pointers add inp0, IDX mov [arg1 + _data_ptr_sha1 + 0*PTR_SZ], inp0 add inp1, IDX mov [arg1 + _data_ptr_sha1 + 1*PTR_SZ], inp1 add inp2, IDX mov [arg1 + _data_ptr_sha1 + 2*PTR_SZ], inp2 add inp3, IDX mov [arg1 + _data_ptr_sha1 + 3*PTR_SZ], inp3 DBGPRINTL64 "Sha1-sse outgoing data ptrs", inp0, inp1, inp2, inp3 ;;;;;;;;;;;;;;;; ;; Postamble add rsp, FRAMESZ ret intel-ipsec-mb-0.48/sse/sha1_ni_x2_sse.asm000066400000000000000000000273011321406316400203610ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. 
;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ;; Stack must be aligned to 32 bytes before call ;; ;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Windows clobbers: RDX R10 R11 ;; Windows preserves: RAX RBX RCX RBP RSI RDI R8 R9 R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Linux clobbers: RDI R10 R11 ;; Linux preserves: RAX RBX RCX RDX RBP RSI R8 R9 R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; ;; Linux/Windows clobbers: xmm0 - xmm15 %include "os.asm" ;%define DO_DBGPRINT %include "dbgprint.asm" %include "mb_mgr_datastruct.asm" %ifdef LINUX %define arg1 rdi %define arg2 rsi %define arg3 rcx %define arg4 rdx %else %define arg1 rcx %define arg2 rdx %define arg3 rdi %define arg4 rsi %endif %define args arg1 %define NUM_BLKS arg2 ; reso = resdq => 16 bytes struc frame .ABCD_SAVE reso 1 .E_SAVE reso 1 .ABCD_SAVEb reso 1 .E_SAVEb reso 1 .align resq 1 endstruc %define INP r10 %define INPb r11 %define ABCD xmm0 %define E0 xmm1 ; Need two E's b/c they ping pong %define E1 xmm2 %define MSG0 xmm3 %define MSG1 xmm4 %define MSG2 xmm5 %define MSG3 xmm6 %define ABCDb xmm7 %define E0b xmm8 ; Need two E's b/c they ping pong %define E1b xmm9 %define MSG0b xmm10 %define MSG1b xmm11 %define MSG2b xmm12 %define MSG3b xmm13 %define SHUF_MASK xmm14 %define E_MASK xmm15 section .data default rel align 64 PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x000102030405060708090a0b0c0d0e0f dq 0x08090a0b0c0d0e0f, 0x0001020304050607 UPPER_WORD_MASK: ;ddq 0xFFFFFFFF000000000000000000000000 dq 0x0000000000000000, 0xFFFFFFFF00000000 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; void sha1_ni(SHA1_ARGS *args, UINT32 size_in_blocks) ;; arg1 : pointer to args ;; arg2 : size (in blocks) ;; assumed to be >= 1 section .text MKGLOBAL(sha1_ni,function,internal) align 32 sha1_ni: sub rsp, frame_size DBGPRINTL "enter sha1-ni-x2" shl NUM_BLKS, 6 ; convert to bytes jz done_hash ;; load input pointers mov INP, [args + _data_ptr_sha1 + 0*PTR_SZ] DBGPRINTL64 "jobA: pointer", INP mov INPb, [args + _data_ptr_sha1 + 1*PTR_SZ] add NUM_BLKS, INP ; pointer to end of data block -> loop exit condition ;; load initial digest movdqu ABCD, [args + 0*SHA1NI_DIGEST_ROW_SIZE] 
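;; Note on the digest layout handled just below: the digest words are stored
;; in memory as A,B,C,D,E per lane, while the SHA-NI round instructions
;; (sha1rnds4/sha1nexte) operate on an xmm register with A in the most
;; significant dword. The pshufd with 0x1B therefore reverses the dword
;; order after loading, and the fifth word (E) is kept in the top dword of
;; its own register (pinsrd index 3, later masked with UPPER_WORD_MASK).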
pxor E0, E0 pinsrd E0, [args + 0*SHA1NI_DIGEST_ROW_SIZE + 4*SHA1_DIGEST_WORD_SIZE], 3 pshufd ABCD, ABCD, 0x1B DBGPRINTL_XMM "jobA: digest in words[0-3]", ABCD DBGPRINTL_XMM "jobA: digest in word 4", E0 movdqu ABCDb, [args + 1*SHA1NI_DIGEST_ROW_SIZE] pxor E0b, E0b pinsrd E0b, [args + 1*SHA1NI_DIGEST_ROW_SIZE + 4*SHA1_DIGEST_WORD_SIZE], 3 pshufd ABCDb, ABCDb, 0x1B movdqa SHUF_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK] movdqa E_MASK, [rel UPPER_WORD_MASK] DBGPRINTL "jobA data:" loop0: ;; Copy digests movdqa [rsp + frame.ABCD_SAVE], ABCD movdqa [rsp + frame.E_SAVE], E0 movdqa [rsp + frame.ABCD_SAVEb], ABCDb movdqa [rsp + frame.E_SAVEb], E0b ;; Only needed if not using sha1nexte for rounds 0-3 pand E0, E_MASK pand E0b, E_MASK ;; Needed if using sha1nexte for rounds 0-3 ;; Need to rotate E right by 30 ;movdqa E1, E0 ;psrld E0, 30 ;pslld E1, 2 ;pxor E0, E1 ;; Rounds 0-3 movdqu MSG0, [INP + 0*16] pshufb MSG0, SHUF_MASK DBGPRINT_XMM MSG0 ;sha1nexte E0, MSG0 paddd E0, MSG0 ; instead of sha1nexte movdqa E1, ABCD sha1rnds4 ABCD, E0, 0 movdqu MSG0b, [INPb + 0*16] pshufb MSG0b, SHUF_MASK ;sha1nexte E0b, MSG0b paddd E0b, MSG0b ; instead of sha1nexte movdqa E1b, ABCDb sha1rnds4 ABCDb, E0b, 0 ;; Rounds 4-7 movdqu MSG1, [INP + 1*16] pshufb MSG1, SHUF_MASK DBGPRINT_XMM MSG1 sha1nexte E1, MSG1 movdqa E0, ABCD sha1rnds4 ABCD, E1, 0 sha1msg1 MSG0, MSG1 movdqu MSG1b, [INPb + 1*16] pshufb MSG1b, SHUF_MASK sha1nexte E1b, MSG1b movdqa E0b, ABCDb sha1rnds4 ABCDb, E1b, 0 sha1msg1 MSG0b, MSG1b ;; Rounds 8-11 movdqu MSG2, [INP + 2*16] pshufb MSG2, SHUF_MASK DBGPRINT_XMM MSG2 sha1nexte E0, MSG2 movdqa E1, ABCD sha1rnds4 ABCD, E0, 0 sha1msg1 MSG1, MSG2 pxor MSG0, MSG2 movdqu MSG2b, [INPb + 2*16] pshufb MSG2b, SHUF_MASK sha1nexte E0b, MSG2b movdqa E1b, ABCDb sha1rnds4 ABCDb, E0b, 0 sha1msg1 MSG1b, MSG2b pxor MSG0b, MSG2b ;; Rounds 12-15 movdqu MSG3, [INP + 3*16] pshufb MSG3, SHUF_MASK DBGPRINT_XMM MSG3 sha1nexte E1, MSG3 movdqa E0, ABCD sha1msg2 MSG0, MSG3 sha1rnds4 ABCD, E1, 0 sha1msg1 MSG2, MSG3 pxor MSG1, MSG3 movdqu MSG3b, [INPb + 3*16] pshufb MSG3b, SHUF_MASK sha1nexte E1b, MSG3b movdqa E0b, ABCDb sha1msg2 MSG0b, MSG3b sha1rnds4 ABCDb, E1b, 0 sha1msg1 MSG2b, MSG3b pxor MSG1b, MSG3b ;; Rounds 16-19 sha1nexte E0, MSG0 movdqa E1, ABCD sha1msg2 MSG1, MSG0 sha1rnds4 ABCD, E0, 0 sha1msg1 MSG3, MSG0 pxor MSG2, MSG0 sha1nexte E0b, MSG0b movdqa E1b, ABCDb sha1msg2 MSG1b, MSG0b sha1rnds4 ABCDb, E0b, 0 sha1msg1 MSG3b, MSG0b pxor MSG2b, MSG0b ;; Rounds 20-23 sha1nexte E1, MSG1 movdqa E0, ABCD sha1msg2 MSG2, MSG1 sha1rnds4 ABCD, E1, 1 sha1msg1 MSG0, MSG1 pxor MSG3, MSG1 sha1nexte E1b, MSG1b movdqa E0b, ABCDb sha1msg2 MSG2b, MSG1b sha1rnds4 ABCDb, E1b, 1 sha1msg1 MSG0b, MSG1b pxor MSG3b, MSG1b ;; Rounds 24-27 sha1nexte E0, MSG2 movdqa E1, ABCD sha1msg2 MSG3, MSG2 sha1rnds4 ABCD, E0, 1 sha1msg1 MSG1, MSG2 pxor MSG0, MSG2 sha1nexte E0b, MSG2b movdqa E1b, ABCDb sha1msg2 MSG3b, MSG2b sha1rnds4 ABCDb, E0b, 1 sha1msg1 MSG1b, MSG2b pxor MSG0b, MSG2b ;; Rounds 28-31 sha1nexte E1, MSG3 movdqa E0, ABCD sha1msg2 MSG0, MSG3 sha1rnds4 ABCD, E1, 1 sha1msg1 MSG2, MSG3 pxor MSG1, MSG3 sha1nexte E1b, MSG3b movdqa E0b, ABCDb sha1msg2 MSG0b, MSG3b sha1rnds4 ABCDb, E1b, 1 sha1msg1 MSG2b, MSG3b pxor MSG1b, MSG3b ;; Rounds 32-35 sha1nexte E0, MSG0 movdqa E1, ABCD sha1msg2 MSG1, MSG0 sha1rnds4 ABCD, E0, 1 sha1msg1 MSG3, MSG0 pxor MSG2, MSG0 sha1nexte E0b, MSG0b movdqa E1b, ABCDb sha1msg2 MSG1b, MSG0b sha1rnds4 ABCDb, E0b, 1 sha1msg1 MSG3b, MSG0b pxor MSG2b, MSG0b ;; Rounds 36-39 sha1nexte E1, MSG1 movdqa E0, ABCD sha1msg2 MSG2, MSG1 sha1rnds4 ABCD, E1, 1 
sha1msg1 MSG0, MSG1 pxor MSG3, MSG1 sha1nexte E1b, MSG1b movdqa E0b, ABCDb sha1msg2 MSG2b, MSG1b sha1rnds4 ABCDb, E1b, 1 sha1msg1 MSG0b, MSG1b pxor MSG3b, MSG1b ;; Rounds 40-43 sha1nexte E0, MSG2 movdqa E1, ABCD sha1msg2 MSG3, MSG2 sha1rnds4 ABCD, E0, 2 sha1msg1 MSG1, MSG2 pxor MSG0, MSG2 sha1nexte E0b, MSG2b movdqa E1b, ABCDb sha1msg2 MSG3b, MSG2b sha1rnds4 ABCDb, E0b, 2 sha1msg1 MSG1b, MSG2b pxor MSG0b, MSG2b ;; Rounds 44-47 sha1nexte E1, MSG3 movdqa E0, ABCD sha1msg2 MSG0, MSG3 sha1rnds4 ABCD, E1, 2 sha1msg1 MSG2, MSG3 pxor MSG1, MSG3 sha1nexte E1b, MSG3b movdqa E0b, ABCDb sha1msg2 MSG0b, MSG3b sha1rnds4 ABCDb, E1b, 2 sha1msg1 MSG2b, MSG3b pxor MSG1b, MSG3b ;; Rounds 48-51 sha1nexte E0, MSG0 movdqa E1, ABCD sha1msg2 MSG1, MSG0 sha1rnds4 ABCD, E0, 2 sha1msg1 MSG3, MSG0 pxor MSG2, MSG0 sha1nexte E0b, MSG0b movdqa E1b, ABCDb sha1msg2 MSG1b, MSG0b sha1rnds4 ABCDb, E0b, 2 sha1msg1 MSG3b, MSG0b pxor MSG2b, MSG0b ;; Rounds 52-55 sha1nexte E1, MSG1 movdqa E0, ABCD sha1msg2 MSG2, MSG1 sha1rnds4 ABCD, E1, 2 sha1msg1 MSG0, MSG1 pxor MSG3, MSG1 sha1nexte E1b, MSG1b movdqa E0b, ABCDb sha1msg2 MSG2b, MSG1b sha1rnds4 ABCDb, E1b, 2 sha1msg1 MSG0b, MSG1b pxor MSG3b, MSG1b ;; Rounds 56-59 sha1nexte E0, MSG2 movdqa E1, ABCD sha1msg2 MSG3, MSG2 sha1rnds4 ABCD, E0, 2 sha1msg1 MSG1, MSG2 pxor MSG0, MSG2 sha1nexte E0b, MSG2b movdqa E1b, ABCDb sha1msg2 MSG3b, MSG2b sha1rnds4 ABCDb, E0b, 2 sha1msg1 MSG1b, MSG2b pxor MSG0b, MSG2b ;; Rounds 60-63 sha1nexte E1, MSG3 movdqa E0, ABCD sha1msg2 MSG0, MSG3 sha1rnds4 ABCD, E1, 3 sha1msg1 MSG2, MSG3 pxor MSG1, MSG3 sha1nexte E1b, MSG3b movdqa E0b, ABCDb sha1msg2 MSG0b, MSG3b sha1rnds4 ABCDb, E1b, 3 sha1msg1 MSG2b, MSG3b pxor MSG1b, MSG3b ;; Rounds 64-67 sha1nexte E0, MSG0 movdqa E1, ABCD sha1msg2 MSG1, MSG0 sha1rnds4 ABCD, E0, 3 sha1msg1 MSG3, MSG0 pxor MSG2, MSG0 sha1nexte E0b, MSG0b movdqa E1b, ABCDb sha1msg2 MSG1b, MSG0b sha1rnds4 ABCDb, E0b, 3 sha1msg1 MSG3b, MSG0b pxor MSG2b, MSG0b ;; Rounds 68-71 sha1nexte E1, MSG1 movdqa E0, ABCD sha1msg2 MSG2, MSG1 sha1rnds4 ABCD, E1, 3 pxor MSG3, MSG1 sha1nexte E1b, MSG1b movdqa E0b, ABCDb sha1msg2 MSG2b, MSG1b sha1rnds4 ABCDb, E1b, 3 pxor MSG3b, MSG1b ;; Rounds 72-75 sha1nexte E0, MSG2 movdqa E1, ABCD sha1msg2 MSG3, MSG2 sha1rnds4 ABCD, E0, 3 sha1nexte E0b, MSG2b movdqa E1b, ABCDb sha1msg2 MSG3b, MSG2b sha1rnds4 ABCDb, E0b, 3 ;; Rounds 76-79 sha1nexte E1, MSG3 movdqa E0, ABCD sha1rnds4 ABCD, E1, 3 sha1nexte E1b, MSG3b movdqa E0b, ABCDb sha1rnds4 ABCDb, E1b, 3 ;; Need to rotate E left by 30 movdqa E1, E0 pslld E0, 30 psrld E1, 2 pxor E0, E1 movdqa E1b, E0b pslld E0b, 30 psrld E1b, 2 pxor E0b, E1b paddd ABCD, [rsp + frame.ABCD_SAVE] paddd E0, [rsp + frame.E_SAVE] paddd ABCDb, [rsp + frame.ABCD_SAVEb] paddd E0b, [rsp + frame.E_SAVEb] add INP, 64 add INPb, 64 cmp INP, NUM_BLKS jne loop0 ;; write out digests pshufd ABCD, ABCD, 0x1B movdqu [args + 0*SHA1NI_DIGEST_ROW_SIZE], ABCD pextrd [args + 0*SHA1NI_DIGEST_ROW_SIZE + 4*SHA1_DIGEST_WORD_SIZE], E0, 3 DBGPRINTL_XMM "jobA: digest out words[0-3]", ABCD DBGPRINTL_XMM "jobA: digest out word 4", E0 pshufd ABCDb, ABCDb, 0x1B movdqu [args + 1*SHA1NI_DIGEST_ROW_SIZE], ABCDb pextrd [args + 1*SHA1NI_DIGEST_ROW_SIZE + 4*SHA1_DIGEST_WORD_SIZE], E0b, 3 ;; update input pointers mov [args + _data_ptr_sha1 + 0*PTR_SZ], INP mov [args + _data_ptr_sha1 + 1*PTR_SZ], INPb done_hash: add rsp, frame_size ret intel-ipsec-mb-0.48/sse/sha1_one_block_sse.asm000066400000000000000000000235201321406316400212740ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and 
use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ; SHA1 code, hybrid, rolled, interleaved ; Uses SSE instructions %include "os.asm" section .data default rel align 16 PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 dq 0x0405060700010203, 0x0c0d0e0f08090a0b K00_19: ;ddq 0x5A8279995A8279995A8279995A827999 dq 0x5A8279995A827999, 0x5A8279995A827999 K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 section .text %define MOVDQ movdqu ;; assume buffers not aligned %ifdef LINUX %define INP rdi ; 1st arg %define CTX rsi ; 2nd arg %define REG3 ecx %define REG4 edx %else %define INP rcx ; 1st arg %define CTX rdx ; 2nd arg %define REG3 edi %define REG4 esi %endif %define FRAMESZ 3*16 + 1*8 %define _RSP FRAMESZ-1*8 + rsp %define a eax %define b ebx %define c REG3 %define d REG4 %define e r8d %define T1 r9d %define f r10d %define RND r11d %define g r12d %define h r13d %define XTMP0 xmm0 %define XTMP1 xmm1 %define XK xmm2 %xdefine X0 xmm3 %xdefine X1 xmm4 %xdefine X2 xmm5 %xdefine X3 xmm6 %xdefine X4 xmm7 %define XFER xmm8 %define SZ 4 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros %macro rotate_Xs 0 %xdefine X_ X0 %xdefine X0 X1 %xdefine X1 X2 %xdefine X2 X3 %xdefine X3 X4 %xdefine X4 X_ %endmacro %macro ROTATE_ARGS 0 %xdefine TMP_ h %xdefine h g %xdefine g f %xdefine f e %xdefine e d %xdefine d c %xdefine c b %xdefine b a %xdefine a TMP_ %endm ;; Magic functions defined in FIPS 180-1 ;; ; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D))) %macro MAGIC_F0 5 %define %%regF %1 %define %%regB %2 %define %%regC %3 %define %%regD %4 %define %%regT %5 mov %%regF,%%regC xor %%regF,%%regD and %%regF,%%regB xor %%regF,%%regD %endmacro ; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D) %macro MAGIC_F1 5 %define %%regF %1 %define %%regB %2 %define %%regC %3 %define %%regD %4 %define %%regT %5 mov %%regF,%%regD xor %%regF,%%regC xor %%regF,%%regB %endmacro ; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) 
| (B & D) | (C & D)) %macro MAGIC_F2 5 %define %%regF %1 %define %%regB %2 %define %%regC %3 %define %%regD %4 %define %%regT %5 mov %%regF,%%regB mov %%regT,%%regB or %%regF,%%regC and %%regT,%%regC and %%regF,%%regD or %%regF,%%regT %endmacro ; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D) %macro MAGIC_F3 5 %define %%regF %1 %define %%regB %2 %define %%regC %3 %define %%regD %4 %define %%regT %5 MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT %endmacro ;; input is T1 %macro ROUND 1 %define %%MAGIC %1 add e,T1 mov T1,a rol T1,5 add e,T1 %%MAGIC h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) rol b,30 add h,e ROTATE_ARGS %endmacro %macro do_4i 1 movdqa XFER, XK paddd XFER, X0 pextrd T1, XFER, 0 ;ROUND %1 add e,T1 ;SCHEDULE_4 movdqa XTMP0, X1 palignr XTMP0, X0, 8 ; XTMP0 = W[-14] mov T1,a movdqa XTMP1, X2 rol T1,5 pxor XTMP1, X0 ; XTMP1 = W[-8] ^ W[-16] add e,T1 pxor XTMP0, XTMP1 ; XTMP0 = W[-8] ^ W[-14] ^ W[-16] %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) ;; Finish low half movdqa X4, X3 rol b,30 psrldq X4, 4 ; X4 = W[-3] {xxBA} add h,e ROTATE_ARGS pextrd T1, XFER, 1 ;ROUND %1 add e,T1 pxor X4, XTMP0 ; mov T1,a movdqa XTMP1, X4 rol T1,5 ;; rotate X4 left 1 psrld XTMP1, (32-1) add e,T1 pslld X4, 1 %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) pxor X4, XTMP1 ; X4 = W[0] {xxBA} rol b,30 add h,e ROTATE_ARGS pextrd T1, XFER, 2 ;ROUND %1 add e,T1 movdqa XTMP1, X4 mov T1,a ;; Finish high half palignr XTMP1, X3, 4 ; XTMP1 = w[-3] {DCxx} rol T1,5 add e,T1 pxor XTMP0, XTMP1 %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) ;; rotate XTMP0 left 1 movdqa XTMP1, XTMP0 psrld XTMP1, (32-1) rol b,30 add h,e ROTATE_ARGS pextrd T1, XFER, 3 ;ROUND %1 add e,T1 mov T1,a pslld XTMP0, 1 rol T1,5 add e,T1 pxor XTMP0, XTMP1 ; XTMP0 = W[0] {DCxx} %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) ;; COMBINE HALVES shufps X4, XTMP0, 11100100b ; X4 = X[0] {DCBA} rol b,30 add h,e rotate_Xs ROTATE_ARGS %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; void sha1_one_block_sse(void *input_data, UINT32 digest[8] ;; arg 1 : rcx : pointer to input data ;; arg 2 : rdx : pointer to digest MKGLOBAL(sha1_one_block_sse,function,) align 32 sha1_one_block_sse: push rbx push rsi push rdi push r12 push r13 ;; byte swap first 16 dwords movdqa XTMP0, [rel PSHUFFLE_BYTE_FLIP_MASK] mov rax,rsp ; copy rsp MOVDQ X0, [INP + 0*16] sub rsp,FRAMESZ MOVDQ X1, [INP + 1*16] and rsp,-64 ; align stack frame movdqa [rsp + 0 * 16], xmm6 movdqa [rsp + 1 * 16], xmm7 movdqa [rsp + 2 * 16], xmm8 MOVDQ X2, [INP + 2*16] mov [_RSP],rax ; save copy of rsp MOVDQ X3, [INP + 3*16] ;; load initial digest mov a,0x67452301 pshufb X0, XTMP0 mov b,0xefcdab89 pshufb X1, XTMP0 mov c,0x98badcfe pshufb X2, XTMP0 mov d,0x10325476 pshufb X3, XTMP0 mov e,0xc3d2e1f0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; do rounds 00-19 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; movdqa XK, [rel K00_19] mov RND, 3 ROTATE_ARGS ROTATE_ARGS ROTATE_ARGS ROTATE_ARGS rotate_Xs rotate_Xs rotate_Xs rotate_Xs jmp loop1_5 align 16 loop1: do_4i MAGIC_F0 loop1_5: do_4i MAGIC_F0 rotate_Xs rotate_Xs rotate_Xs rotate_Xs movdqa X0, X2 movdqa X2, X4 movdqa X4, X1 movdqa X1, X3 sub RND, 1 jne loop1 rotate_Xs rotate_Xs rotate_Xs rotate_Xs ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; end rounds 00-19 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; do rounds 20-39 
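;;
;; Note: rounds 00-19 above and each 20-round group below share the same
;; rolled loop shape: RND is preloaded with 3 and execution jumps into the
;; middle of the loop, so the do_4i macro (4 rounds each, plus the message
;; schedule update) runs 1 + 2 + 2 = 5 times per group, i.e. 20 rounds with
;; one K constant and one MAGIC_Fx function.  Illustrative flow only
;; (labels are placeholders, not assembled here):
;;
;;         mov   RND, 3
;;         jmp   groupN_5          ; first pass runs a single do_4i
;; groupN:
;;         do_4i MAGIC_Fx          ; 4 rounds
;; groupN_5:
;;         do_4i MAGIC_Fx          ; 4 rounds
;;         ...reorder X0..X4 for the next schedule words...
;;         sub   RND, 1
;;         jne   groupN            ; 5 x do_4i = 20 rounds total
;;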
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; movdqa XK, [rel K20_39] mov RND, 3 ROTATE_ARGS ROTATE_ARGS ROTATE_ARGS ROTATE_ARGS rotate_Xs rotate_Xs rotate_Xs rotate_Xs jmp loop2_5 align 16 loop2: do_4i MAGIC_F1 loop2_5: do_4i MAGIC_F1 rotate_Xs rotate_Xs rotate_Xs rotate_Xs movdqa X0, X2 movdqa X2, X4 movdqa X4, X1 movdqa X1, X3 sub RND, 1 jne loop2 rotate_Xs rotate_Xs rotate_Xs rotate_Xs ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; end rounds 20-39 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; do rounds 40-59 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; movdqa XK, [rel K40_59] mov RND, 3 ROTATE_ARGS ROTATE_ARGS ROTATE_ARGS ROTATE_ARGS rotate_Xs rotate_Xs rotate_Xs rotate_Xs jmp loop3_5 align 16 loop3: do_4i MAGIC_F2 loop3_5: do_4i MAGIC_F2 rotate_Xs rotate_Xs rotate_Xs rotate_Xs movdqa X0, X2 movdqa X2, X4 movdqa X4, X1 movdqa X1, X3 sub RND, 1 jne loop3 rotate_Xs rotate_Xs rotate_Xs rotate_Xs ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; end rounds 40-59 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; do rounds 60-79 movdqa XK, [rel K60_79] do_4i MAGIC_F3 movdqa XFER, XK paddd XFER, X0 pextrd T1, XFER, 0 ROUND MAGIC_F3 pextrd T1, XFER, 1 ROUND MAGIC_F3 pextrd T1, XFER, 2 ROUND MAGIC_F3 pextrd T1, XFER, 3 ROUND MAGIC_F3 movdqa XFER, XK paddd XFER, X1 pextrd T1, XFER, 0 ROUND MAGIC_F3 pextrd T1, XFER, 1 ROUND MAGIC_F3 pextrd T1, XFER, 2 ROUND MAGIC_F3 pextrd T1, XFER, 3 ROUND MAGIC_F3 movdqa XFER, XK paddd XFER, X2 pextrd T1, XFER, 0 ROUND MAGIC_F3 pextrd T1, XFER, 1 ROUND MAGIC_F3 pextrd T1, XFER, 2 ROUND MAGIC_F3 pextrd T1, XFER, 3 ROUND MAGIC_F3 movdqa XFER, XK paddd XFER, X3 pextrd T1, XFER, 0 ROUND MAGIC_F3 pextrd T1, XFER, 1 ROUND MAGIC_F3 pextrd T1, XFER, 2 ROUND MAGIC_F3 pextrd T1, XFER, 3 ROUND MAGIC_F3 add a,0x67452301 mov [SZ*0 + CTX], a add b,0xefcdab89 mov [SZ*1 + CTX], b add c,0x98badcfe mov [SZ*2 + CTX], c add d,0x10325476 mov [SZ*3 + CTX], d add e,0xc3d2e1f0 mov [SZ*4 + CTX], e movdqa xmm8, [rsp + 2 * 16] movdqa xmm7, [rsp + 1 * 16] movdqa xmm6, [rsp + 0 * 16] mov rsp,[_RSP] pop r13 pop r12 pop rdi pop rsi pop rbx ret intel-ipsec-mb-0.48/sse/sha224_one_block_sse.asm000066400000000000000000000036571321406316400214540ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ; This code schedules 1 blocks at a time, with 4 lanes per block ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %define H0 0xc1059ed8 %define H1 0x367cd507 %define H2 0x3070dd17 %define H3 0xf70e5939 %define H4 0xffc00b31 %define H5 0x68581511 %define H6 0x64f98fa7 %define H7 0xbefa4fa4 %define FUNC sha224_one_block_sse %include "sha256_one_block_sse.asm" intel-ipsec-mb-0.48/sse/sha256_ni_x2_sse.asm000066400000000000000000000455401321406316400205420ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
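;;
;; Overview: this file processes two independent buffers ("job A"/"job B")
;; in lock-step with the SHA extensions, which helps cover the latency of
;; sha256rnds2.  STATE0/STATE1 hold the working digest in the ABEF/CDGH
;; order the instruction expects (the loads below convert from the
;; DCBA/HGFE storage order), and MSG must be xmm0 because sha256rnds2
;; takes it as an implicit operand.  Each sha256rnds2 performs two rounds
;; using the two dword W+K values in the low half of MSG; pshufd with 0x0E
;; moves the upper pair down for the next two rounds, so every 16-byte
;; chunk of the schedule drives four rounds.  A minimal single-lane sketch
;; of that pattern (illustrative only; the real loop interleaves both
;; lanes and the sha256msg1/sha256msg2 schedule updates):
;;
;;      movdqu      MSG, [INP + 0*16]
;;      pshufb      MSG, SHUF_MASK                ; to big-endian dwords
;;      movdqa      MSGTMP0, MSG                  ; keep W[0..3] for the schedule
;;      paddd       MSG, [SHA256CONSTANTS + 0*16] ; W + K
;;      sha256rnds2 STATE1, STATE0, MSG           ; rounds 0-1
;;      pshufd      MSG, MSG, 0x0E                ; high W+K pair -> low
;;      sha256rnds2 STATE0, STATE1, MSG           ; rounds 2-3
;;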
;; ;; Stack must be aligned to 32 bytes before call ;; ;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Windows clobbers: RCX RDX RSI RDI R11 ;; Windows preserves: RAX RBX RBP R8 R9 R10 R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; Linux clobbers: RCX RDX RSI RDI R11 ;; Linux preserves: RAX RBX RBP R8 R9 R10 R12 R13 R14 R15 ;; ----------------------------------------------------------- ;; ;; Linux/Windows clobbers: xmm0 - xmm15 %include "os.asm" ;%define DO_DBGPRINT %include "dbgprint.asm" %include "mb_mgr_datastruct.asm" ; resdq = res0 => 16 bytes struc frame .ABEF_SAVE reso 1 .CDGH_SAVE reso 1 .ABEF_SAVEb reso 1 .CDGH_SAVEb reso 1 .align resq 1 endstruc %ifdef LINUX %define arg1 rdi %define arg2 rsi %define arg3 rcx %define arg4 rdx %else %define arg1 rcx %define arg2 rdx %define arg3 rdi %define arg4 rsi %endif %define args arg1 %define NUM_BLKS arg2 %define INP arg3 %define INPb arg4 %define SHA256CONSTANTS r11 ;; MSG MUST be xmm0 (implicit argument) %define MSG xmm0 %define STATE0 xmm1 %define STATE1 xmm2 %define MSGTMP0 xmm3 %define MSGTMP1 xmm4 %define MSGTMP2 xmm5 %define MSGTMP3 xmm6 %define MSGTMP4 xmm7 %define STATE0b xmm8 %define STATE1b xmm9 %define MSGTMP0b xmm10 %define MSGTMP1b xmm11 %define MSGTMP2b xmm12 %define MSGTMP3b xmm13 %define MSGTMP xmm14 %define SHUF_MASK xmm15 section .data default rel align 64 K256: dd 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 dd 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 dd 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 dd 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 dd 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc dd 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da dd 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 dd 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 dd 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 dd 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 dd 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 dd 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 dd 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 dd 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 dd 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 dd 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; void sha256_ni(SHA256_ARGS *args, UINT32 size_in_blocks) ;; arg1 : pointer to args ;; arg2 : size (in blocks) ;; assumed to be >= 1 section .text MKGLOBAL(sha256_ni,function,internal) align 32 sha256_ni: sub rsp, frame_size DBGPRINTL "enter sha256-ni-x2" shl NUM_BLKS, 6 ; convert to bytes jz done_hash DBGPRINTL64 "jobA/B byte size:", NUM_BLKS ;; load input pointers mov INP, [args + _data_ptr_sha256 + 0*PTR_SZ] mov INPb, [args + _data_ptr_sha256 + 1*PTR_SZ] add NUM_BLKS, INP ; pointer to end of data ;; load initial digest ;; Probably need to reorder these appropriately ;; DCBA, HGFE -> ABEF, CDGH movdqu STATE0, [args + 0*SHA256NI_DIGEST_ROW_SIZE] movdqu STATE1, [args + 0*SHA256NI_DIGEST_ROW_SIZE + 16] movdqu STATE0b, [args + 1*SHA256NI_DIGEST_ROW_SIZE] movdqu STATE1b, [args + 1*SHA256NI_DIGEST_ROW_SIZE + 16] DBGPRINTL "jobA digest in:" DBGPRINT_XMM STATE0 DBGPRINT_XMM STATE1 DBGPRINTL "jobB digest in:" 
DBGPRINT_XMM STATE0b DBGPRINT_XMM STATE1b pshufd STATE0, STATE0, 0xB1 ; CDAB pshufd STATE1, STATE1, 0x1B ; EFGH movdqa MSGTMP4, STATE0 pshufd STATE0b, STATE0b, 0xB1 ; CDAB pshufd STATE1b, STATE1b, 0x1B ; EFGH movdqa MSGTMP, STATE0b palignr STATE0, STATE1, 8 ; ABEF palignr STATE0b, STATE1b, 8 ; ABEF pblendw STATE1, MSGTMP4, 0xF0 ; CDGH pblendw STATE1b, MSGTMP, 0xF0 ; CDGH lea SHA256CONSTANTS,[rel K256] movdqa SHUF_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK] %ifdef DO_DBGPRINT ;; prin buffer A push r10 push NUM_BLKS DBGPRINTL "jobA data:" xor r10, r10 sub NUM_BLKS, INP .loop_dbgA: movdqu MSG, [INP + r10 + 0*16] DBGPRINT_XMM MSG movdqu MSG, [INP + r10 + 1*16] DBGPRINT_XMM MSG movdqu MSG, [INP + r10 + 2*16] DBGPRINT_XMM MSG movdqu MSG, [INP + r10 + 3*16] DBGPRINT_XMM MSG add r10, 64 cmp NUM_BLKS, r10 jne .loop_dbgA pop NUM_BLKS pop r10 %endif %ifdef DO_DBGPRINT ;; prin buffer B push r10 push NUM_BLKS DBGPRINTL "jobB data:" xor r10, r10 sub NUM_BLKS, INP .loop_dbgB: movdqu MSG, [INPb + r10 + 0*16] DBGPRINT_XMM MSG movdqu MSG, [INPb + r10 + 1*16] DBGPRINT_XMM MSG movdqu MSG, [INPb + r10 + 2*16] DBGPRINT_XMM MSG movdqu MSG, [INPb + r10 + 3*16] DBGPRINT_XMM MSG add r10, 64 cmp NUM_BLKS, r10 jne .loop_dbgB pop NUM_BLKS pop r10 %endif .loop0: ;; Save digests movdqa [rsp + frame.ABEF_SAVE], STATE0 movdqa [rsp + frame.CDGH_SAVE], STATE1 movdqa [rsp + frame.ABEF_SAVEb], STATE0b movdqa [rsp + frame.CDGH_SAVEb], STATE1b ;; Rounds 0-3 movdqu MSG, [INP + 0*16] pshufb MSG, SHUF_MASK movdqa MSGTMP0, MSG paddd MSG, [SHA256CONSTANTS + 0*16] sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument pshufd MSG, MSG, 0x0E sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument movdqu MSG, [INPb + 0*16] pshufb MSG, SHUF_MASK movdqa MSGTMP0b, MSG paddd MSG, [SHA256CONSTANTS + 0*16] sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument pshufd MSG, MSG, 0x0E sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument ;; Rounds 4-7 movdqu MSG, [INP + 1*16] pshufb MSG, SHUF_MASK movdqa MSGTMP1, MSG paddd MSG, [SHA256CONSTANTS + 1*16] sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument pshufd MSG, MSG, 0x0E sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument movdqu MSG, [INPb + 1*16] pshufb MSG, SHUF_MASK movdqa MSGTMP1b, MSG paddd MSG, [SHA256CONSTANTS + 1*16] sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument pshufd MSG, MSG, 0x0E sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument sha256msg1 MSGTMP0, MSGTMP1 sha256msg1 MSGTMP0b, MSGTMP1b ;; Rounds 8-11 movdqu MSG, [INP + 2*16] pshufb MSG, SHUF_MASK movdqa MSGTMP2, MSG paddd MSG, [SHA256CONSTANTS + 2*16] sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument pshufd MSG, MSG, 0x0E sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument movdqu MSG, [INPb + 2*16] pshufb MSG, SHUF_MASK movdqa MSGTMP2b, MSG paddd MSG, [SHA256CONSTANTS + 2*16] sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument pshufd MSG, MSG, 0x0E sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument sha256msg1 MSGTMP1, MSGTMP2 sha256msg1 MSGTMP1b, MSGTMP2b ;; Rounds 12-15 movdqu MSG, [INP + 3*16] pshufb MSG, SHUF_MASK movdqa MSGTMP3, MSG paddd MSG, [SHA256CONSTANTS + 3*16] sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument movdqa MSGTMP, MSGTMP3 palignr MSGTMP, MSGTMP2, 4 paddd MSGTMP0, MSGTMP sha256msg2 MSGTMP0, MSGTMP3 pshufd MSG, MSG, 0x0E sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument movdqu MSG, [INPb + 3*16] pshufb MSG, SHUF_MASK movdqa MSGTMP3b, MSG paddd MSG, [SHA256CONSTANTS + 3*16] sha256rnds2 STATE1b, STATE0b, MSG 
; MSG is implicit argument movdqa MSGTMP, MSGTMP3b palignr MSGTMP, MSGTMP2b, 4 paddd MSGTMP0b, MSGTMP sha256msg2 MSGTMP0b, MSGTMP3b pshufd MSG, MSG, 0x0E sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument sha256msg1 MSGTMP2, MSGTMP3 sha256msg1 MSGTMP2b, MSGTMP3b ;; Rounds 16-19 movdqa MSG, MSGTMP0 paddd MSG, [SHA256CONSTANTS + 4*16] sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument movdqa MSGTMP, MSGTMP0 palignr MSGTMP, MSGTMP3, 4 paddd MSGTMP1, MSGTMP sha256msg2 MSGTMP1, MSGTMP0 pshufd MSG, MSG, 0x0E sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument movdqa MSG, MSGTMP0b paddd MSG, [SHA256CONSTANTS + 4*16] sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument movdqa MSGTMP, MSGTMP0b palignr MSGTMP, MSGTMP3b, 4 paddd MSGTMP1b, MSGTMP sha256msg2 MSGTMP1b, MSGTMP0b pshufd MSG, MSG, 0x0E sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument sha256msg1 MSGTMP3, MSGTMP0 sha256msg1 MSGTMP3b, MSGTMP0b ;; Rounds 20-23 movdqa MSG, MSGTMP1 paddd MSG, [SHA256CONSTANTS + 5*16] sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument movdqa MSGTMP, MSGTMP1 palignr MSGTMP, MSGTMP0, 4 paddd MSGTMP2, MSGTMP sha256msg2 MSGTMP2, MSGTMP1 pshufd MSG, MSG, 0x0E sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument movdqa MSG, MSGTMP1b paddd MSG, [SHA256CONSTANTS + 5*16] sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument movdqa MSGTMP, MSGTMP1b palignr MSGTMP, MSGTMP0b, 4 paddd MSGTMP2b, MSGTMP sha256msg2 MSGTMP2b, MSGTMP1b pshufd MSG, MSG, 0x0E sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument sha256msg1 MSGTMP0, MSGTMP1 sha256msg1 MSGTMP0b, MSGTMP1b ;; Rounds 24-27 movdqa MSG, MSGTMP2 paddd MSG, [SHA256CONSTANTS + 6*16] sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument movdqa MSGTMP, MSGTMP2 palignr MSGTMP, MSGTMP1, 4 paddd MSGTMP3, MSGTMP sha256msg2 MSGTMP3, MSGTMP2 pshufd MSG, MSG, 0x0E sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument movdqa MSG, MSGTMP2b paddd MSG, [SHA256CONSTANTS + 6*16] sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument movdqa MSGTMP, MSGTMP2b palignr MSGTMP, MSGTMP1b, 4 paddd MSGTMP3b, MSGTMP sha256msg2 MSGTMP3b, MSGTMP2b pshufd MSG, MSG, 0x0E sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument sha256msg1 MSGTMP1, MSGTMP2 sha256msg1 MSGTMP1b, MSGTMP2b ;; Rounds 28-31 movdqa MSG, MSGTMP3 paddd MSG, [SHA256CONSTANTS + 7*16] sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument movdqa MSGTMP, MSGTMP3 palignr MSGTMP, MSGTMP2, 4 paddd MSGTMP0, MSGTMP sha256msg2 MSGTMP0, MSGTMP3 pshufd MSG, MSG, 0x0E sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument movdqa MSG, MSGTMP3b paddd MSG, [SHA256CONSTANTS + 7*16] sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument movdqa MSGTMP, MSGTMP3b palignr MSGTMP, MSGTMP2b, 4 paddd MSGTMP0b, MSGTMP sha256msg2 MSGTMP0b, MSGTMP3b pshufd MSG, MSG, 0x0E sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument sha256msg1 MSGTMP2, MSGTMP3 sha256msg1 MSGTMP2b, MSGTMP3b ;; Rounds 32-35 movdqa MSG, MSGTMP0 paddd MSG, [SHA256CONSTANTS + 8*16] sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument movdqa MSGTMP, MSGTMP0 palignr MSGTMP, MSGTMP3, 4 paddd MSGTMP1, MSGTMP sha256msg2 MSGTMP1, MSGTMP0 pshufd MSG, MSG, 0x0E sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument movdqa MSG, MSGTMP0b paddd MSG, [SHA256CONSTANTS + 8*16] sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument movdqa MSGTMP, MSGTMP0b palignr MSGTMP, MSGTMP3b, 4 paddd MSGTMP1b, MSGTMP sha256msg2 MSGTMP1b, MSGTMP0b pshufd MSG, MSG, 0x0E sha256rnds2 STATE0b, 
STATE1b, MSG ; MSG is implicit argument sha256msg1 MSGTMP3, MSGTMP0 sha256msg1 MSGTMP3b, MSGTMP0b ;; Rounds 36-39 movdqa MSG, MSGTMP1 paddd MSG, [SHA256CONSTANTS + 9*16] sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument movdqa MSGTMP, MSGTMP1 palignr MSGTMP, MSGTMP0, 4 paddd MSGTMP2, MSGTMP sha256msg2 MSGTMP2, MSGTMP1 pshufd MSG, MSG, 0x0E sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument movdqa MSG, MSGTMP1b paddd MSG, [SHA256CONSTANTS + 9*16] sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument movdqa MSGTMP, MSGTMP1b palignr MSGTMP, MSGTMP0b, 4 paddd MSGTMP2b, MSGTMP sha256msg2 MSGTMP2b, MSGTMP1b pshufd MSG, MSG, 0x0E sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument sha256msg1 MSGTMP0, MSGTMP1 sha256msg1 MSGTMP0b, MSGTMP1b ;; Rounds 40-43 movdqa MSG, MSGTMP2 paddd MSG, [SHA256CONSTANTS + 10*16] sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument movdqa MSGTMP, MSGTMP2 palignr MSGTMP, MSGTMP1, 4 paddd MSGTMP3, MSGTMP sha256msg2 MSGTMP3, MSGTMP2 pshufd MSG, MSG, 0x0E sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument movdqa MSG, MSGTMP2b paddd MSG, [SHA256CONSTANTS + 10*16] sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument movdqa MSGTMP, MSGTMP2b palignr MSGTMP, MSGTMP1b, 4 paddd MSGTMP3b, MSGTMP sha256msg2 MSGTMP3b, MSGTMP2b pshufd MSG, MSG, 0x0E sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument sha256msg1 MSGTMP1, MSGTMP2 sha256msg1 MSGTMP1b, MSGTMP2b ;; Rounds 44-47 movdqa MSG, MSGTMP3 paddd MSG, [SHA256CONSTANTS + 11*16] sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument movdqa MSGTMP, MSGTMP3 palignr MSGTMP, MSGTMP2, 4 paddd MSGTMP0, MSGTMP sha256msg2 MSGTMP0, MSGTMP3 pshufd MSG, MSG, 0x0E sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument movdqa MSG, MSGTMP3b paddd MSG, [SHA256CONSTANTS + 11*16] sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument movdqa MSGTMP, MSGTMP3b palignr MSGTMP, MSGTMP2b, 4 paddd MSGTMP0b, MSGTMP sha256msg2 MSGTMP0b, MSGTMP3b pshufd MSG, MSG, 0x0E sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument sha256msg1 MSGTMP2, MSGTMP3 sha256msg1 MSGTMP2b, MSGTMP3b ;; Rounds 48-51 movdqa MSG, MSGTMP0 paddd MSG, [SHA256CONSTANTS + 12*16] sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument movdqa MSGTMP, MSGTMP0 palignr MSGTMP, MSGTMP3, 4 paddd MSGTMP1, MSGTMP sha256msg2 MSGTMP1, MSGTMP0 pshufd MSG, MSG, 0x0E sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument movdqa MSG, MSGTMP0b paddd MSG, [SHA256CONSTANTS + 12*16] sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument movdqa MSGTMP, MSGTMP0b palignr MSGTMP, MSGTMP3b, 4 paddd MSGTMP1b, MSGTMP sha256msg2 MSGTMP1b, MSGTMP0b pshufd MSG, MSG, 0x0E sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument sha256msg1 MSGTMP3, MSGTMP0 sha256msg1 MSGTMP3b, MSGTMP0b ;; Rounds 52-55 movdqa MSG, MSGTMP1 paddd MSG, [SHA256CONSTANTS + 13*16] sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument movdqa MSGTMP, MSGTMP1 palignr MSGTMP, MSGTMP0, 4 paddd MSGTMP2, MSGTMP sha256msg2 MSGTMP2, MSGTMP1 pshufd MSG, MSG, 0x0E sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument movdqa MSG, MSGTMP1b paddd MSG, [SHA256CONSTANTS + 13*16] sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument movdqa MSGTMP, MSGTMP1b palignr MSGTMP, MSGTMP0b, 4 paddd MSGTMP2b, MSGTMP sha256msg2 MSGTMP2b, MSGTMP1b pshufd MSG, MSG, 0x0E sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument ;; Rounds 56-59 movdqa MSG, MSGTMP2 paddd MSG, [SHA256CONSTANTS + 14*16] sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit 
argument movdqa MSGTMP, MSGTMP2 palignr MSGTMP, MSGTMP1, 4 paddd MSGTMP3, MSGTMP sha256msg2 MSGTMP3, MSGTMP2 pshufd MSG, MSG, 0x0E sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument movdqa MSG, MSGTMP2b paddd MSG, [SHA256CONSTANTS + 14*16] sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument movdqa MSGTMP, MSGTMP2b palignr MSGTMP, MSGTMP1b, 4 paddd MSGTMP3b, MSGTMP sha256msg2 MSGTMP3b, MSGTMP2b pshufd MSG, MSG, 0x0E sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument ;; Rounds 60-63 movdqa MSG, MSGTMP3 paddd MSG, [SHA256CONSTANTS + 15*16] sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument pshufd MSG, MSG, 0x0E sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument movdqa MSG, MSGTMP3b paddd MSG, [SHA256CONSTANTS + 15*16] sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument pshufd MSG, MSG, 0x0E sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument paddd STATE0, [rsp + frame.ABEF_SAVE] paddd STATE1, [rsp + frame.CDGH_SAVE] paddd STATE0b, [rsp + frame.ABEF_SAVEb] paddd STATE1b, [rsp + frame.CDGH_SAVEb] add INP, 64 add INPb, 64 cmp INP, NUM_BLKS jne .loop0 ;; update data pointers mov [args + _data_ptr_sha256 + 0*PTR_SZ], INP mov [args + _data_ptr_sha256 + 1*PTR_SZ], INPb ; Reorder for writeback pshufd STATE0, STATE0, 0x1B ; FEBA pshufd STATE1, STATE1, 0xB1 ; DCHG movdqa MSGTMP4, STATE0 pshufd STATE0b, STATE0b, 0x1B ; FEBA pshufd STATE1b, STATE1b, 0xB1 ; DCHG movdqa MSGTMP, STATE0b pblendw STATE0, STATE1, 0xF0 ; DCBA pblendw STATE0b, STATE1b, 0xF0 ; DCBA palignr STATE1, MSGTMP4, 8 ; HGFE palignr STATE1b, MSGTMP, 8 ; HGFE ;; update digests movdqu [args + 0*SHA256NI_DIGEST_ROW_SIZE + 0*16], STATE0 movdqu [args + 0*SHA256NI_DIGEST_ROW_SIZE + 1*16], STATE1 movdqu [args + 1*SHA256NI_DIGEST_ROW_SIZE + 0*16], STATE0b movdqu [args + 1*SHA256NI_DIGEST_ROW_SIZE + 1*16], STATE1b DBGPRINTL "jobA digest out:" DBGPRINT_XMM STATE0 DBGPRINT_XMM STATE1 DBGPRINTL "jobB digest out:" DBGPRINT_XMM STATE0b DBGPRINT_XMM STATE1b done_hash: DBGPRINTL "exit sha256-ni-x2" add rsp, frame_size ret intel-ipsec-mb-0.48/sse/sha256_one_block_sse.asm000066400000000000000000000335621321406316400214570ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ; This code schedules 1 blocks at a time, with 4 lanes per block ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %include "os.asm" section .data default rel align 64 K256: dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 dq 0x0405060700010203, 0x0c0d0e0f08090a0b ; shuffle xBxA -> 00BA _SHUF_00BA: ;ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100 dq 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF ; shuffle xDxC -> DC00 _SHUF_DC00: ;ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF dq 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 section .text %define MOVDQ movdqu ;; assume buffers not aligned ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros ; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask ; Load xmm with mem and byte swap each dword %macro COPY_XMM_AND_BSWAP 3 MOVDQ %1, %2 pshufb %1, %3 %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %define X0 xmm4 %define X1 xmm5 %define X2 xmm6 %define X3 xmm7 %define XTMP0 xmm0 %define XTMP1 xmm1 %define XTMP2 xmm2 %define XTMP3 xmm3 %define XTMP4 xmm8 %define XFER xmm9 %define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA %define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00 %define BYTE_FLIP_MASK xmm12 %ifdef LINUX %define CTX rsi ; 2nd arg %define INP rdi ; 1st arg %define SRND rdi ; clobbers INP %define c ecx %define d r8d %define e edx %else %define CTX rdx ; 2nd arg %define INP rcx ; 1st arg %define SRND rcx ; clobbers INP %define c edi %define d esi %define e r8d %endif %define TBL rbp %define a eax %define b ebx %define f r9d %define g r10d %define h r11d %define y0 r13d %define y1 r14d %define y2 r15d struc STACK %ifndef LINUX _XMM_SAVE: reso 7 %endif _XFER: reso 1 endstruc %ifndef H0 %define H0 0x6a09e667 %define H1 0xbb67ae85 %define H2 0x3c6ef372 %define H3 0xa54ff53a %define H4 0x510e527f %define H5 0x9b05688c %define H6 0x1f83d9ab %define H7 0x5be0cd19 %define FUNC sha256_one_block_sse %endif ; rotate_Xs ; Rotate values of symbols X0...X3 %macro rotate_Xs 0 %xdefine X_ X0 %xdefine X0 X1 %xdefine X1 X2 %xdefine X2 X3 %xdefine X3 X_ %endm ; ROTATE_ARGS ; Rotate values of symbols a...h %macro ROTATE_ARGS 0 %xdefine TMP_ h %xdefine h g %xdefine g f %xdefine f e %xdefine e d %xdefine d c %xdefine c b %xdefine b a %xdefine a TMP_ %endm %macro FOUR_ROUNDS_AND_SCHED 0 ;; compute s0 four at a time and s1 
two at a time ;; compute W[-16] + W[-7] 4 at a time movdqa XTMP0, X3 mov y0, e ; y0 = e ror y0, (25-11) ; y0 = e >> (25-11) mov y1, a ; y1 = a palignr XTMP0, X2, 4 ; XTMP0 = W[-7] ror y1, (22-13) ; y1 = a >> (22-13) xor y0, e ; y0 = e ^ (e >> (25-11)) mov y2, f ; y2 = f ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) movdqa XTMP1, X1 xor y1, a ; y1 = a ^ (a >> (22-13) xor y2, g ; y2 = f^g paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16] xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) and y2, e ; y2 = (f^g)&e ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) ;; compute s0 palignr XTMP1, X0, 4 ; XTMP1 = W[-15] xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) xor y2, g ; y2 = CH = ((f^g)&e)^g movdqa XTMP2, XTMP1 ; XTMP2 = W[-15] ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) add y2, y0 ; y2 = S1 + CH add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH movdqa XTMP3, XTMP1 ; XTMP3 = W[-15] mov y0, a ; y0 = a add h, y2 ; h = h + S1 + CH + k + w mov y2, a ; y2 = a pslld XTMP1, (32-7) or y0, c ; y0 = a|c add d, h ; d = d + h + S1 + CH + k + w and y2, c ; y2 = a&c psrld XTMP2, 7 and y0, b ; y0 = (a|c)&b add h, y1 ; h = h + S1 + CH + k + w + S0 por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ ROTATE_ARGS movdqa XTMP2, XTMP3 ; XTMP2 = W[-15] mov y0, e ; y0 = e mov y1, a ; y1 = a movdqa XTMP4, XTMP3 ; XTMP4 = W[-15] ror y0, (25-11) ; y0 = e >> (25-11) xor y0, e ; y0 = e ^ (e >> (25-11)) mov y2, f ; y2 = f ror y1, (22-13) ; y1 = a >> (22-13) pslld XTMP3, (32-18) xor y1, a ; y1 = a ^ (a >> (22-13) ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) xor y2, g ; y2 = f^g psrld XTMP2, 18 ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) and y2, e ; y2 = (f^g)&e ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) pxor XTMP1, XTMP3 xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) xor y2, g ; y2 = CH = ((f^g)&e)^g psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3 add y2, y0 ; y2 = S1 + CH add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 mov y0, a ; y0 = a add h, y2 ; h = h + S1 + CH + k + w mov y2, a ; y2 = a pxor XTMP1, XTMP4 ; XTMP1 = s0 or y0, c ; y0 = a|c add d, h ; d = d + h + S1 + CH + k + w and y2, c ; y2 = a&c ;; compute low s1 pshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA} and y0, b ; y0 = (a|c)&b add h, y1 ; h = h + S1 + CH + k + w + S0 paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ ROTATE_ARGS movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA} mov y0, e ; y0 = e mov y1, a ; y1 = a ror y0, (25-11) ; y0 = e >> (25-11) movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA} xor y0, e ; y0 = e ^ (e >> (25-11)) ror y1, (22-13) ; y1 = a >> (22-13) mov y2, f ; y2 = f xor y1, a ; y1 = a ^ (a >> (22-13) ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA} xor y2, g ; y2 = f^g psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA} xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) and y2, e ; y2 = (f^g)&e psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA} ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) xor y2, g ; y2 = CH = ((f^g)&e)^g ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) pxor XTMP2, XTMP3 add y2, y0 ; y2 = S1 + CH ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) add y2, 
[rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA} mov y0, a ; y0 = a add h, y2 ; h = h + S1 + CH + k + w mov y2, a ; y2 = a pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA} or y0, c ; y0 = a|c add d, h ; d = d + h + S1 + CH + k + w and y2, c ; y2 = a&c paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]} and y0, b ; y0 = (a|c)&b add h, y1 ; h = h + S1 + CH + k + w + S0 ;; compute high s1 pshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC} or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ ROTATE_ARGS movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC} mov y0, e ; y0 = e ror y0, (25-11) ; y0 = e >> (25-11) mov y1, a ; y1 = a movdqa X0, XTMP2 ; X0 = W[-2] {DDCC} ror y1, (22-13) ; y1 = a >> (22-13) xor y0, e ; y0 = e ^ (e >> (25-11)) mov y2, f ; y2 = f ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC} xor y1, a ; y1 = a ^ (a >> (22-13) xor y2, g ; y2 = f^g psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC} xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) and y2, e ; y2 = (f^g)&e ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC} xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) xor y2, g ; y2 = CH = ((f^g)&e)^g pxor XTMP2, XTMP3 ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) add y2, y0 ; y2 = S1 + CH add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH pxor X0, XTMP2 ; X0 = s1 {xDxC} mov y0, a ; y0 = a add h, y2 ; h = h + S1 + CH + k + w mov y2, a ; y2 = a pshufb X0, SHUF_DC00 ; X0 = s1 {DC00} or y0, c ; y0 = a|c add d, h ; d = d + h + S1 + CH + k + w and y2, c ; y2 = a&c paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]} and y0, b ; y0 = (a|c)&b add h, y1 ; h = h + S1 + CH + k + w + S0 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ ROTATE_ARGS rotate_Xs %endm ;; input is [rsp + _XFER + %1 * 4] %macro DO_ROUND 1 mov y0, e ; y0 = e ror y0, (25-11) ; y0 = e >> (25-11) mov y1, a ; y1 = a xor y0, e ; y0 = e ^ (e >> (25-11)) ror y1, (22-13) ; y1 = a >> (22-13) mov y2, f ; y2 = f xor y1, a ; y1 = a ^ (a >> (22-13) ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) xor y2, g ; y2 = f^g xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) and y2, e ; y2 = (f^g)&e xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) xor y2, g ; y2 = CH = ((f^g)&e)^g add y2, y0 ; y2 = S1 + CH ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH mov y0, a ; y0 = a add h, y2 ; h = h + S1 + CH + k + w mov y2, a ; y2 = a or y0, c ; y0 = a|c add d, h ; d = d + h + S1 + CH + k + w and y2, c ; y2 = a&c and y0, b ; y0 = (a|c)&b add h, y1 ; h = h + S1 + CH + k + w + S0 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ ROTATE_ARGS %endm ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; void FUNC(void *input_data, UINT32 digest[8]) ;; arg 1 : pointer to input data ;; arg 2 : pointer to digest section .text MKGLOBAL(FUNC,function,) align 32 FUNC: push rbx %ifndef LINUX push rsi push rdi %endif push rbp push r13 push r14 push r15 sub rsp,STACK_size %ifndef LINUX movdqa [rsp + _XMM_SAVE + 0*16],xmm6 movdqa [rsp + _XMM_SAVE + 1*16],xmm7 movdqa [rsp + _XMM_SAVE + 2*16],xmm8 movdqa [rsp + _XMM_SAVE + 3*16],xmm9 movdqa [rsp + _XMM_SAVE + 
4*16],xmm10 movdqa [rsp + _XMM_SAVE + 5*16],xmm11 movdqa [rsp + _XMM_SAVE + 6*16],xmm12 %endif ;; load initial digest mov a,H0 mov b,H1 mov c,H2 mov d,H3 mov e,H4 mov f,H5 mov g,H6 mov h,H7 movdqa BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK] movdqa SHUF_00BA, [rel _SHUF_00BA] movdqa SHUF_DC00, [rel _SHUF_DC00] lea TBL,[rel K256] ;; byte swap first 16 dwords COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK ;; schedule 48 input dwords, by doing 3 rounds of 16 each mov SRND, 3 align 16 loop1: movdqa XFER, [TBL + 0*16] paddd XFER, X0 movdqa [rsp + _XFER], XFER FOUR_ROUNDS_AND_SCHED movdqa XFER, [TBL + 1*16] paddd XFER, X0 movdqa [rsp + _XFER], XFER FOUR_ROUNDS_AND_SCHED movdqa XFER, [TBL + 2*16] paddd XFER, X0 movdqa [rsp + _XFER], XFER FOUR_ROUNDS_AND_SCHED movdqa XFER, [TBL + 3*16] paddd XFER, X0 movdqa [rsp + _XFER], XFER add TBL, 4*16 FOUR_ROUNDS_AND_SCHED sub SRND, 1 jne loop1 mov SRND, 2 loop2: paddd X0, [TBL + 0*16] movdqa [rsp + _XFER], X0 DO_ROUND 0 DO_ROUND 1 DO_ROUND 2 DO_ROUND 3 paddd X1, [TBL + 1*16] movdqa [rsp + _XFER], X1 add TBL, 2*16 DO_ROUND 0 DO_ROUND 1 DO_ROUND 2 DO_ROUND 3 movdqa X0, X2 movdqa X1, X3 sub SRND, 1 jne loop2 add a,H0 add b,H1 add c,H2 add d,H3 add e,H4 add f,H5 add g,H6 mov [4*0 + CTX],a mov [4*1 + CTX],b mov [4*2 + CTX],c mov [4*3 + CTX],d mov [4*4 + CTX],e mov [4*5 + CTX],f mov [4*6 + CTX],g add h,H7 mov [4*7 + CTX],h done_hash: %ifndef LINUX movdqa xmm6,[rsp + _XMM_SAVE + 0*16] movdqa xmm7,[rsp + _XMM_SAVE + 1*16] movdqa xmm8,[rsp + _XMM_SAVE + 2*16] movdqa xmm9,[rsp + _XMM_SAVE + 3*16] movdqa xmm10,[rsp + _XMM_SAVE + 4*16] movdqa xmm11,[rsp + _XMM_SAVE + 5*16] movdqa xmm12,[rsp + _XMM_SAVE + 6*16] %endif add rsp, STACK_size pop r15 pop r14 pop r13 pop rbp %ifndef LINUX pop rdi pop rsi %endif pop rbx ret intel-ipsec-mb-0.48/sse/sha384_one_block_sse.asm000066400000000000000000000037571321406316400214640ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ; This code schedules 1 blocks at a time, with 4 lanes per block ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %define H0 0xcbbb9d5dc1059ed8 %define H1 0x629a292a367cd507 %define H2 0x9159015a3070dd17 %define H3 0x152fecd8f70e5939 %define H4 0x67332667ffc00b31 %define H5 0x8eb44a8768581511 %define H6 0xdb0c2e0d64f98fa7 %define H7 0x47b5481dbefa4fa4 %define FUNC sha384_one_block_sse %include "sha512_one_block_sse.asm" intel-ipsec-mb-0.48/sse/sha512_one_block_sse.asm000066400000000000000000000306231321406316400214450ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
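;;
;; Note: sha384_one_block_sse.asm above only overrides H0..H7 with the
;; SHA-384 initial hash values (and FUNC) before including this file, the
;; same trick sha224_one_block_sse.asm uses with the SHA-256 block code;
;; the compression function is identical and the shorter digest is
;; presumably truncated by the caller.  For reference, the functions the
;; macros below compute (FIPS 180-4 notation, 64-bit values):
;;
;;   Sigma0(a) = ROTR(a,28) ^ ROTR(a,34) ^ ROTR(a,39)
;;   Sigma1(e) = ROTR(e,14) ^ ROTR(e,18) ^ ROTR(e,41)
;;   sigma0(x) = ROTR(x, 1) ^ ROTR(x, 8) ^ SHR(x,7)
;;   sigma1(x) = ROTR(x,19) ^ ROTR(x,61) ^ SHR(x,6)
;;   Ch(e,f,g)  = (e & f) ^ (~e & g)
;;   Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c)
;;
;; TWO_ROUNDS_AND_SCHED interleaves two of these scalar rounds with the
;; SSE computation of the next two message-schedule qwords.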
;; ; This code schedules 1 blocks at a time, with 4 lanes per block ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %include "os.asm" %define MOVDQ movdqu ;; assume buffers not aligned %ifndef FUNC %define FUNC sha512_one_block_sse %endif ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros ; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask ; Load xmm with mem and byte swap each dword %macro COPY_XMM_AND_BSWAP 3 MOVDQ %1, %2 pshufb %1, %3 %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %define X0 xmm4 %define X1 xmm5 %define X2 xmm6 %define X3 xmm7 %define X4 xmm8 %define X5 xmm9 %define X6 xmm10 %define X7 xmm11 %define XTMP0 xmm0 %define XTMP1 xmm1 %define XTMP2 xmm2 %define XTMP3 xmm3 %define XFER xmm13 %define BYTE_FLIP_MASK xmm12 %ifdef LINUX %define CTX rsi ; 2nd arg %define INP rdi ; 1st arg %define SRND rdi ; clobbers INP %define c rcx %define d r8 %define e rdx %else %define CTX rdx ; 2nd arg %define INP rcx ; 1st arg %define SRND rcx ; clobbers INP %define c rdi %define d rsi %define e r8 %endif %define TBL rbp %define a rax %define b rbx %define f r9 %define g r10 %define h r11 %define y0 r13 %define y1 r14 %define y2 r15 struc STACK %ifndef LINUX _XMM_SAVE: reso 8 %endif _XFER: reso 1 endstruc %ifndef H0 %define H0 0x6a09e667f3bcc908 %define H1 0xbb67ae8584caa73b %define H2 0x3c6ef372fe94f82b %define H3 0xa54ff53a5f1d36f1 %define H4 0x510e527fade682d1 %define H5 0x9b05688c2b3e6c1f %define H6 0x1f83d9abfb41bd6b %define H7 0x5be0cd19137e2179 %endif ; rotate_Xs ; Rotate values of symbols X0...X7 %macro rotate_Xs 0 %xdefine X_ X0 %xdefine X0 X1 %xdefine X1 X2 %xdefine X2 X3 %xdefine X3 X4 %xdefine X4 X5 %xdefine X5 X6 %xdefine X6 X7 %xdefine X7 X_ %endm ; ROTATE_ARGS ; Rotate values of symbols a...h %macro ROTATE_ARGS 0 %xdefine TMP_ h %xdefine h g %xdefine g f %xdefine f e %xdefine e d %xdefine d c %xdefine c b %xdefine b a %xdefine a TMP_ %endm %macro TWO_ROUNDS_AND_SCHED 0 ;; compute s0 four at a time and s1 two at a time ;; compute W[-16] + W[-7] 4 at a time movdqa XTMP0, X5 mov y0, e ; y0 = e mov y1, a ; y1 = a ror y0, (41-18) ; y0 = e >> (41-18) palignr XTMP0, X4, 8 ; XTMP0 = W[-7] xor y0, e ; y0 = e ^ (e >> (41-18)) mov y2, f ; y2 = f ror y1, (39-34) ; y1 = a >> (39-34) xor y1, a ; y1 = a ^ (a >> (39-34) movdqa XTMP1, X1 ror y0, (18-14) ; y0 = (e >> (18-14)) ^ (e >> (41-14)) xor y2, g ; y2 = f^g paddq XTMP0, X0 ; XTMP0 = W[-7] + W[-16] ror y1, (34-28) ; y1 = (a >> (34-28)) ^ (a >> (39-28)) xor y0, e ; y0 = e ^ (e >> (18-14)) ^ (e >> (41-14)) and y2, e ; y2 = (f^g)&e ;; compute s0 palignr XTMP1, X0, 8 ; XTMP1 = W[-15] xor y1, a ; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28)) xor y2, g ; y2 = CH = ((f^g)&e)^g movdqa XTMP2, XTMP1 ; XTMP2 = W[-15] ror y0, 14 ; y0 = S1 = (e>>14) & (e>>18) ^ (e>>41) add y2, y0 ; y2 = S1 + CH add y2, [rsp + _XFER + 0*8] ; y2 = k + w + S1 + CH ror y1, 28 ; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39) movdqa XTMP3, XTMP1 ; XTMP3 = W[-15] mov y0, a ; y0 = a add h, y2 ; h = h + S1 + CH + k + w psllq XTMP1, (64-1) mov y2, a ; y2 = a or y0, c ; y0 = a|c psrlq XTMP2, 1 add d, h ; d = d + t1 and y2, c ; y2 = a&c por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 1 and y0, b ; y0 = (a|c)&b add h, y1 ; h = t1 + S0 movdqa XTMP2, XTMP3 ; XTMP2 = W[-15] psrlq XTMP2, 8 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) add h, y0 ; h = t1 + S0 + MAJ movdqa X0, XTMP3 ; X0 = W[-15] psllq XTMP3, (64-8) ROTATE_ARGS pxor XTMP1, XTMP3 psrlq X0, 7 ; X0 = W[-15] >> 7 mov y0, e ; y0 = e mov y1, a ; y1 = a pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 1 ^ W[-15] ror 8 ror y0, (41-18) ; y0 = e >> (41-18) 
xor y0, e ; y0 = e ^ (e >> (41-18)) mov y2, f ; y2 = f pxor XTMP1, X0 ; XTMP1 = s0 ror y1, (39-34) ; y1 = a >> (39-34) xor y1, a ; y1 = a ^ (a >> (39-34) ;; compute s1 movdqa XTMP2, X7 ; XTMP2 = W[-2] ror y0, (18-14) ; y0 = (e >> (18-14)) ^ (e >> (41-14)) xor y2, g ; y2 = f^g paddq XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0 ror y1, (34-28) ; y1 = (a >> (34-28)) ^ (a >> (39-28)) xor y0, e ; y0 = e ^ (e >> (18-14)) ^ (e >> (41-14)) movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] movdqa X0, XTMP2 ; X0 = W[-2] and y2, e ; y2 = (f^g)&e ror y0, 14 ; y0 = S1 = (e>>14) & (e>>18) ^ (e>>41) xor y1, a ; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28)) psllq XTMP3, (64-19) xor y2, g ; y2 = CH = ((f^g)&e)^g add y2, y0 ; y2 = S1 + CH add y2, [rsp + _XFER + 1*8] ; y2 = k + w + S1 + CH psrlq X0, 19 ror y1, 28 ; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39) mov y0, a ; y0 = a add h, y2 ; h = h + S1 + CH + k + w por XTMP3, X0 ; XTMP3 = W[-2] ror 19 mov y2, a ; y2 = a or y0, c ; y0 = a|c movdqa X0, XTMP2 ; X0 = W[-2] movdqa XTMP1, XTMP2 ; XTMP1 = W[-2] add d, h ; d = d + t1 and y2, c ; y2 = a&c psllq X0, (64-61) and y0, b ; y0 = (a|c)&b add h, y1 ; h = t1 + S0 psrlq XTMP1, 61 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) add h, y0 ; h = t1 + S0 + MAJ por X0, XTMP1 ; X0 = W[-2] ror 61 psrlq XTMP2, 6 ; XTMP2 = W[-2] >> 6 pxor XTMP2, XTMP3 pxor X0, XTMP2 ; X0 = s1 paddq X0, XTMP0 ; X0 = {W[1], W[0]} ROTATE_ARGS rotate_Xs %endm ;; input is [rsp + _XFER + %1 * 8] %macro DO_ROUND 1 mov y0, e ; y0 = e ror y0, (41-18) ; y0 = e >> (41-18) mov y1, a ; y1 = a xor y0, e ; y0 = e ^ (e >> (41-18)) ror y1, (39-34) ; y1 = a >> (39-34) mov y2, f ; y2 = f xor y1, a ; y1 = a ^ (a >> (39-34) ror y0, (18-14) ; y0 = (e >> (18-14)) ^ (e >> (41-14)) xor y2, g ; y2 = f^g xor y0, e ; y0 = e ^ (e >> (18-14)) ^ (e >> (25-6)) ror y1, (34-28) ; y1 = (a >> (34-28)) ^ (a >> (39-28)) and y2, e ; y2 = (f^g)&e xor y1, a ; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28)) ror y0, 14 ; y0 = S1 = (e>>14) & (e>>18) ^ (e>>41) xor y2, g ; y2 = CH = ((f^g)&e)^g add y2, y0 ; y2 = S1 + CH ror y1, 28 ; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39) add y2, [rsp + _XFER + %1*8] ; y2 = k + w + S1 + CH mov y0, a ; y0 = a add h, y2 ; h = h + S1 + CH + k + w mov y2, a ; y2 = a or y0, c ; y0 = a|c add d, h ; d = d + t1 and y2, c ; y2 = a&c and y0, b ; y0 = (a|c)&b add h, y1 ; h = t1 + S0 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) add h, y0 ; h = t1 + S0 + MAJ ROTATE_ARGS %endm section .data default rel align 64 K512: dq 0x428a2f98d728ae22,0x7137449123ef65cd dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc dq 0x3956c25bf348b538,0x59f111f1b605d019 dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118 dq 0xd807aa98a3030242,0x12835b0145706fbe dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1 dq 0x9bdc06a725c71235,0xc19bf174cf692694 dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3 dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483 dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 dq 0x983e5152ee66dfab,0xa831c66d2db43210 dq 0xb00327c898fb213f,0xbf597fc7beef0ee4 dq 0xc6e00bf33da88fc2,0xd5a79147930aa725 dq 0x06ca6351e003826f,0x142929670a0e6e70 dq 0x27b70a8546d22ffc,0x2e1b21385c26c926 dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df dq 0x650a73548baf63de,0x766a0abb3c77b2a8 dq 0x81c2c92e47edaee6,0x92722c851482353b dq 0xa2bfe8a14cf10364,0xa81a664bbc423001 dq 0xc24b8b70d0f89791,0xc76c51a30654be30 dq 0xd192e819d6ef5218,0xd69906245565a910 dq 0xf40e35855771202a,0x106aa07032bbd1b8 dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53 dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb dq 
0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 dq 0x748f82ee5defb2fc,0x78a5636f43172f60 dq 0x84c87814a1f0ab72,0x8cc702081a6439ec dq 0x90befffa23631e28,0xa4506cebde82bde9 dq 0xbef9a3f7b2c67915,0xc67178f2e372532b dq 0xca273eceea26619c,0xd186b8c721c0c207 dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 dq 0x06f067aa72176fba,0x0a637dc5a2c898a6 dq 0x113f9804bef90dae,0x1b710b35131c471b dq 0x28db77f523047d84,0x32caab7b40c72493 dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817 h0: dq H0 h1: dq H1 h2: dq H2 h3: dq H3 h4: dq H4 h5: dq H5 h6: dq H6 h7: dq H7 align 16 PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x08090a0b0c0d0e0f0001020304050607 dq 0x0001020304050607, 0x08090a0b0c0d0e0f ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; void FUNC(void *input_data, UINT64 digest[8]) ;; arg 1 : pointer to input data ;; arg 2 : pointer to digest section .text MKGLOBAL(FUNC,function,) align 32 FUNC: push rbx %ifndef LINUX push rsi push rdi %endif push rbp push r13 push r14 push r15 sub rsp,STACK_size %ifndef LINUX movdqa [rsp + _XMM_SAVE + 0*16],xmm6 movdqa [rsp + _XMM_SAVE + 1*16],xmm7 movdqa [rsp + _XMM_SAVE + 2*16],xmm8 movdqa [rsp + _XMM_SAVE + 3*16],xmm9 movdqa [rsp + _XMM_SAVE + 4*16],xmm10 movdqa [rsp + _XMM_SAVE + 5*16],xmm11 movdqa [rsp + _XMM_SAVE + 6*16],xmm12 movdqa [rsp + _XMM_SAVE + 7*16],xmm13 %endif ;; load initial digest mov a,[rel h0] mov b,[rel h1] mov c,[rel h2] mov d,[rel h3] mov e,[rel h4] mov f,[rel h5] mov g,[rel h6] mov h,[rel h7] movdqa BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK] lea TBL,[rel K512] ;; byte swap first 16 qwords COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK COPY_XMM_AND_BSWAP X4, [INP + 4*16], BYTE_FLIP_MASK COPY_XMM_AND_BSWAP X5, [INP + 5*16], BYTE_FLIP_MASK COPY_XMM_AND_BSWAP X6, [INP + 6*16], BYTE_FLIP_MASK COPY_XMM_AND_BSWAP X7, [INP + 7*16], BYTE_FLIP_MASK ;; schedule 64 input qwords, by doing 4 iterations of 16 rounds mov SRND, 4 align 16 loop1: %assign i 0 %rep 7 movdqa XFER, X0 paddq XFER, [TBL + i*16] movdqa [rsp + _XFER], XFER TWO_ROUNDS_AND_SCHED %assign i (i+1) %endrep movdqa XFER, X0 paddq XFER, [TBL + 7*16] movdqa [rsp + _XFER], XFER add TBL, 8*16 TWO_ROUNDS_AND_SCHED sub SRND, 1 jne loop1 mov SRND, 2 jmp loop2a loop2: movdqa X0, X4 movdqa X1, X5 movdqa X2, X6 movdqa X3, X7 loop2a: paddq X0, [TBL + 0*16] movdqa [rsp + _XFER], X0 DO_ROUND 0 DO_ROUND 1 paddq X1, [TBL + 1*16] movdqa [rsp + _XFER], X1 DO_ROUND 0 DO_ROUND 1 paddq X2, [TBL + 2*16] movdqa [rsp + _XFER], X2 DO_ROUND 0 DO_ROUND 1 paddq X3, [TBL + 3*16] movdqa [rsp + _XFER], X3 add TBL, 4*16 DO_ROUND 0 DO_ROUND 1 sub SRND, 1 jne loop2 add a,[rel h0] add b,[rel h1] add c,[rel h2] add d,[rel h3] add e,[rel h4] add f,[rel h5] add g,[rel h6] mov [8*0 + CTX],a mov [8*1 + CTX],b mov [8*2 + CTX],c mov [8*3 + CTX],d mov [8*4 + CTX],e mov [8*5 + CTX],f mov [8*6 + CTX],g add h,[rel h7] mov [8*7 + CTX],h done_hash: %ifndef LINUX movdqa xmm6,[rsp + _XMM_SAVE + 0*16] movdqa xmm7,[rsp + _XMM_SAVE + 1*16] movdqa xmm8,[rsp + _XMM_SAVE + 2*16] movdqa xmm9,[rsp + _XMM_SAVE + 3*16] movdqa xmm10,[rsp + _XMM_SAVE + 4*16] movdqa xmm11,[rsp + _XMM_SAVE + 5*16] movdqa xmm12,[rsp + _XMM_SAVE + 6*16] movdqa xmm13,[rsp + _XMM_SAVE + 7*16] %endif add rsp, STACK_size pop r15 pop r14 pop r13 pop rbp %ifndef LINUX pop rdi pop 
rsi %endif pop rbx ret intel-ipsec-mb-0.48/sse/sha512_x2_sse.asm000066400000000000000000000305771321406316400200530ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. ;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ;; code to compute SHA512 by-2 using SSE ;; outer calling routine takes care of save and restore of XMM registers ;; Logic designed/laid out by JDG ;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15 ;; Stack must be aligned to 16 bytes before call ;; Windows clobbers: rax rdx r8 r9 r10 r11 ;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15 ;; ;; Linux clobbers: rax rsi r8 r9 r10 r11 ;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15 ;; ;; clobbers xmm0-15 %include "os.asm" %include "mb_mgr_datastruct.asm" ;%define DO_DBGPRINT %include "dbgprint.asm" section .data default rel align 64 MKGLOBAL(K512_2,data,internal) K512_2: dq 0x428a2f98d728ae22, 0x428a2f98d728ae22 dq 0x7137449123ef65cd, 0x7137449123ef65cd dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc dq 0x3956c25bf348b538, 0x3956c25bf348b538 dq 0x59f111f1b605d019, 0x59f111f1b605d019 dq 0x923f82a4af194f9b, 0x923f82a4af194f9b dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118 dq 0xd807aa98a3030242, 0xd807aa98a3030242 dq 0x12835b0145706fbe, 0x12835b0145706fbe dq 0x243185be4ee4b28c, 0x243185be4ee4b28c dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2 dq 0x72be5d74f27b896f, 0x72be5d74f27b896f dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1 dq 0x9bdc06a725c71235, 0x9bdc06a725c71235 dq 0xc19bf174cf692694, 0xc19bf174cf692694 dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2 dq 0xefbe4786384f25e3, 0xefbe4786384f25e3 dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5 dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65 dq 0x2de92c6f592b0275, 0x2de92c6f592b0275 dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483 dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4 dq 0x76f988da831153b5, 0x76f988da831153b5 dq 0x983e5152ee66dfab, 0x983e5152ee66dfab dq 0xa831c66d2db43210, 0xa831c66d2db43210 dq 0xb00327c898fb213f, 0xb00327c898fb213f dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4 dq 0xc6e00bf33da88fc2, 
0xc6e00bf33da88fc2 dq 0xd5a79147930aa725, 0xd5a79147930aa725 dq 0x06ca6351e003826f, 0x06ca6351e003826f dq 0x142929670a0e6e70, 0x142929670a0e6e70 dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc dq 0x2e1b21385c26c926, 0x2e1b21385c26c926 dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed dq 0x53380d139d95b3df, 0x53380d139d95b3df dq 0x650a73548baf63de, 0x650a73548baf63de dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8 dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6 dq 0x92722c851482353b, 0x92722c851482353b dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364 dq 0xa81a664bbc423001, 0xa81a664bbc423001 dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791 dq 0xc76c51a30654be30, 0xc76c51a30654be30 dq 0xd192e819d6ef5218, 0xd192e819d6ef5218 dq 0xd69906245565a910, 0xd69906245565a910 dq 0xf40e35855771202a, 0xf40e35855771202a dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8 dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8 dq 0x1e376c085141ab53, 0x1e376c085141ab53 dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99 dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8 dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63 dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373 dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3 dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc dq 0x78a5636f43172f60, 0x78a5636f43172f60 dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72 dq 0x8cc702081a6439ec, 0x8cc702081a6439ec dq 0x90befffa23631e28, 0x90befffa23631e28 dq 0xa4506cebde82bde9, 0xa4506cebde82bde9 dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915 dq 0xc67178f2e372532b, 0xc67178f2e372532b dq 0xca273eceea26619c, 0xca273eceea26619c dq 0xd186b8c721c0c207, 0xd186b8c721c0c207 dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178 dq 0x06f067aa72176fba, 0x06f067aa72176fba dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6 dq 0x113f9804bef90dae, 0x113f9804bef90dae dq 0x1b710b35131c471b, 0x1b710b35131c471b dq 0x28db77f523047d84, 0x28db77f523047d84 dq 0x32caab7b40c72493, 0x32caab7b40c72493 dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc dq 0x431d67c49c100d4c, 0x431d67c49c100d4c dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6 dq 0x597f299cfc657e2a, 0x597f299cfc657e2a dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec dq 0x6c44198c4a475817, 0x6c44198c4a475817 PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x08090a0b0c0d0e0f0001020304050607 dq 0x0001020304050607, 0x08090a0b0c0d0e0f section .text %ifdef LINUX ; Linux definitions %define arg1 rdi %define arg2 rsi %else ; Windows definitions %define arg1 rcx %define arg2 rdx %endif ; Common definitions %define STATE arg1 %define INP_SIZE arg2 %define IDX rax %define ROUND r8 %define TBL r11 %define inp0 r9 %define inp1 r10 %define a xmm0 %define b xmm1 %define c xmm2 %define d xmm3 %define e xmm4 %define f xmm5 %define g xmm6 %define h xmm7 %define a0 xmm8 %define a1 xmm9 %define a2 xmm10 %define TT0 xmm14 %define TT1 xmm13 %define TT2 xmm12 %define TT3 xmm11 %define TT4 xmm10 %define TT5 xmm9 %define T1 xmm14 %define TMP xmm15 %define SZ2 2*SHA512_DIGEST_WORD_SIZE ; Size of one vector register %define ROUNDS 80*SZ2 ; Define stack usage struc STACK _DATA: resb SZ2 * 16 _DIGEST: resb SZ2 * NUM_SHA512_DIGEST_WORDS resb 8 ; for alignment, must be odd multiple of 8 endstruc %define MOVPD movupd ; transpose r0, r1, t0 ; Input looks like {r0 r1} ; r0 = {a1 a0} ; r1 = {b1 b0} ; ; output looks like ; r0 = {b0, a0} ; t0 = {b1, a1} %macro TRANSPOSE 3 %define %%r0 %1 %define %%r1 %2 %define %%t0 %3 movapd %%t0, %%r0 ; t0 = a1 a0 shufpd %%r0, %%r1, 00b ; r0 = b0 a0 shufpd %%t0, %%r1, 11b ; t0 = b1 a1 %endm %macro ROTATE_ARGS 0 %xdefine TMP_ h %xdefine h g %xdefine g f %xdefine f e %xdefine e d %xdefine d c 
%xdefine c b %xdefine b a %xdefine a TMP_ %endm ; PRORQ reg, imm, tmp ; packed-rotate-right-double ; does a rotate by doing two shifts and an or %macro PRORQ 3 %define %%reg %1 %define %%imm %2 %define %%tmp %3 movdqa %%tmp, %%reg psllq %%tmp, (64-(%%imm)) psrlq %%reg, %%imm por %%reg, %%tmp %endmacro ; PRORQ dst/src, amt %macro PRORQ 2 PRORQ %1, %2, TMP %endmacro ;; arguments passed implicitly in preprocessor symbols i, a...h %macro ROUND_00_15 2 %define %%T1 %1 %define %%i %2 movdqa a0, e ; sig1: a0 = e movdqa a1, e ; sig1: s1 = e PRORQ a0, (18-14) ; sig1: a0 = (e >> 4) movdqa a2, f ; ch: a2 = f pxor a2, g ; ch: a2 = f^g pand a2, e ; ch: a2 = (f^g)&e pxor a2, g ; a2 = ch PRORQ a1, 41 ; sig1: a1 = (e >> 41) movdqa [SZ2*(%%i&0xf) + rsp],%%T1 paddq %%T1,[TBL + ROUND] ; T1 = W + K pxor a0, e ; sig1: a0 = e ^ (e >> 5) PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18) paddq h, a2 ; h = h + ch movdqa a2, a ; sig0: a2 = a PRORQ a2, (34-28) ; sig0: a2 = (a >> 6) paddq h, %%T1 ; h = h + ch + W + K pxor a0, a1 ; a0 = sigma1 movdqa a1, a ; sig0: a1 = a movdqa %%T1, a ; maj: T1 = a PRORQ a1, 39 ; sig0: a1 = (a >> 39) pxor %%T1, c ; maj: T1 = a^c add ROUND, SZ2 ; ROUND++ pand %%T1, b ; maj: T1 = (a^c)&b paddq h, a0 paddq d, h pxor a2, a ; sig0: a2 = a ^ (a >> 11) PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34) pxor a2, a1 ; a2 = sig0 movdqa a1, a ; maj: a1 = a pand a1, c ; maj: a1 = a&c por a1, %%T1 ; a1 = maj paddq h, a1 ; h = h + ch + W + K + maj paddq h, a2 ; h = h + ch + W + K + maj + sigma0 ROTATE_ARGS %endm ;; arguments passed implicitly in preprocessor symbols i, a...h %macro ROUND_16_XX 2 %define %%T1 %1 %define %%i %2 movdqa %%T1, [SZ2*((%%i-15)&0xf) + rsp] movdqa a1, [SZ2*((%%i-2)&0xf) + rsp] movdqa a0, %%T1 PRORQ %%T1, 8-1 movdqa a2, a1 PRORQ a1, 61-19 pxor %%T1, a0 PRORQ %%T1, 1 pxor a1, a2 PRORQ a1, 19 psrlq a0, 7 pxor %%T1, a0 psrlq a2, 6 pxor a1, a2 paddq %%T1, [SZ2*((%%i-16)&0xf) + rsp] paddq a1, [SZ2*((%%i-7)&0xf) + rsp] paddq %%T1, a1 ROUND_00_15 %%T1, %%i %endm ;; SHA512_ARGS: ;; UINT128 digest[8]; // transposed digests ;; UINT8 *data_ptr[2]; ;; ;; void sha512_x2_sse(SHA512_ARGS *args, UINT64 num_blocks); ;; arg 1 : STATE : pointer args ;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1) ;; MKGLOBAL(sha512_x2_sse,function,internal) align 32 sha512_x2_sse: ; general registers preserved in outer calling routine ; outer calling routine saves all the XMM registers sub rsp, STACK_size ;; Load the pre-transposed incoming digest. 
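	;; The MB_MGR keeps the SHA-512 digests transposed: row i of the
	;; args structure holds digest word i for every lane, with rows
	;; spaced SHA512_DIGEST_ROW_SIZE bytes apart.  This 2-lane SSE
	;; variant reads only the first 16 bytes of each row, so each XMM
	;; register loaded below holds one digest word for both lanes
	;; (lane 0 in the low qword, lane 1 in the high qword), the same
	;; layout the TRANSPOSE macro produces for the message qwords.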
movdqa a,[STATE + 0 * SHA512_DIGEST_ROW_SIZE] movdqa b,[STATE + 1 * SHA512_DIGEST_ROW_SIZE] movdqa c,[STATE + 2 * SHA512_DIGEST_ROW_SIZE] movdqa d,[STATE + 3 * SHA512_DIGEST_ROW_SIZE] movdqa e,[STATE + 4 * SHA512_DIGEST_ROW_SIZE] movdqa f,[STATE + 5 * SHA512_DIGEST_ROW_SIZE] movdqa g,[STATE + 6 * SHA512_DIGEST_ROW_SIZE] movdqa h,[STATE + 7 * SHA512_DIGEST_ROW_SIZE] DBGPRINTL_XMM "incoming transposed sha512 digest", a, b, c, d, e, f, g, h lea TBL,[rel K512_2] ;; load the address of each of the 2 message lanes ;; getting ready to transpose input onto stack mov inp0,[STATE + _data_ptr_sha512 +0*PTR_SZ] mov inp1,[STATE + _data_ptr_sha512 +1*PTR_SZ] xor IDX, IDX lloop: xor ROUND, ROUND DBGPRINTL64 "lloop enter INP_SIZE ", INP_SIZE DBGPRINTL64 " IDX = ", IDX ;; save old digest movdqa [rsp + _DIGEST + 0*SZ2], a movdqa [rsp + _DIGEST + 1*SZ2], b movdqa [rsp + _DIGEST + 2*SZ2], c movdqa [rsp + _DIGEST + 3*SZ2], d movdqa [rsp + _DIGEST + 4*SZ2], e movdqa [rsp + _DIGEST + 5*SZ2], f movdqa [rsp + _DIGEST + 6*SZ2], g movdqa [rsp + _DIGEST + 7*SZ2], h DBGPRINTL "incoming data[" %assign i 0 %rep 8 ;; load up the shuffler for little-endian to big-endian format movdqa TMP, [rel PSHUFFLE_BYTE_FLIP_MASK] MOVPD TT0,[inp0+IDX+i*16] ;; double precision is 64 bits MOVPD TT2,[inp1+IDX+i*16] DBGPRINTL_XMM "input message block", TT0 TRANSPOSE TT0, TT2, TT1 pshufb TT0, TMP pshufb TT1, TMP ROUND_00_15 TT0,(i*2+0) ROUND_00_15 TT1,(i*2+1) %assign i (i+1) %endrep DBGPRINTL "]" add IDX, 8 * 16 ;; increment by a message block %assign i (i*4) jmp Lrounds_16_xx align 16 Lrounds_16_xx: %rep 16 ROUND_16_XX T1, i %assign i (i+1) %endrep cmp ROUND,ROUNDS jb Lrounds_16_xx ;; add old digest paddq a, [rsp + _DIGEST + 0*SZ2] paddq b, [rsp + _DIGEST + 1*SZ2] paddq c, [rsp + _DIGEST + 2*SZ2] paddq d, [rsp + _DIGEST + 3*SZ2] paddq e, [rsp + _DIGEST + 4*SZ2] paddq f, [rsp + _DIGEST + 5*SZ2] paddq g, [rsp + _DIGEST + 6*SZ2] paddq h, [rsp + _DIGEST + 7*SZ2] sub INP_SIZE, 1 ;; unit is blocks jne lloop ; write back to memory (state object) the transposed digest movdqa [STATE + 0*SHA512_DIGEST_ROW_SIZE],a movdqa [STATE + 1*SHA512_DIGEST_ROW_SIZE],b movdqa [STATE + 2*SHA512_DIGEST_ROW_SIZE],c movdqa [STATE + 3*SHA512_DIGEST_ROW_SIZE],d movdqa [STATE + 4*SHA512_DIGEST_ROW_SIZE],e movdqa [STATE + 5*SHA512_DIGEST_ROW_SIZE],f movdqa [STATE + 6*SHA512_DIGEST_ROW_SIZE],g movdqa [STATE + 7*SHA512_DIGEST_ROW_SIZE],h DBGPRINTL_XMM "exit transposed digest ", a, b, c, d, e, f, g, h ; update input pointers add inp0, IDX mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0 add inp1, IDX mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1 ;;;;;;;;;;;;;;;; ;; Postamble add rsp, STACK_size DBGPRINTL "====================== exit sha512_x2_sse code =====================\n" ret intel-ipsec-mb-0.48/sse/sha_256_mult_sse.asm000066400000000000000000000306551321406316400206440ustar00rootroot00000000000000;; ;; Copyright (c) 2012-2017, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: ;; ;; * Redistributions of source code must retain the above copyright notice, ;; this list of conditions and the following disclaimer. ;; * Redistributions in binary form must reproduce the above copyright ;; notice, this list of conditions and the following disclaimer in the ;; documentation and/or other materials provided with the distribution. 
;; * Neither the name of Intel Corporation nor the names of its contributors ;; may be used to endorse or promote products derived from this software ;; without specific prior written permission. ;; ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; ;; code to compute quad SHA256 using SSE ;; outer calling routine takes care of save and restore of XMM registers ;; Logic designed/laid out by JDG ;; Stack must be aligned to 16 bytes before call ;; Windows clobbers: rax rbx rdx r8 r9 r10 r11 r12 ;; Windows preserves: rcx rsi rdi rbp r12 r14 r15 ;; ;; Linux clobbers: rax rbx rsi r8 r9 r10 r11 r12 ;; Linux preserves: rcx rdx rdi rbp r13 r14 r15 ;; ;; clobbers xmm0-15 %include "os.asm" %include "mb_mgr_datastruct.asm" ;%define DO_DBGPRINT %include "dbgprint.asm" section .data default rel align 64 MKGLOBAL(K256_4,data,internal) K256_4: dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 dq 0x7137449171374491, 0x7137449171374491 dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 dq 0x3956c25b3956c25b, 0x3956c25b3956c25b dq 0x59f111f159f111f1, 0x59f111f159f111f1 dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 dq 0x12835b0112835b01, 0x12835b0112835b01 dq 0x243185be243185be, 0x243185be243185be dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc dq 0x76f988da76f988da, 0x76f988da76f988da dq 0x983e5152983e5152, 0x983e5152983e5152 dq 0xa831c66da831c66d, 0xa831c66da831c66d dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 dq 0x06ca635106ca6351, 0x06ca635106ca6351 dq 0x1429296714292967, 0x1429296714292967 dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc dq 0x53380d1353380d13, 0x53380d1353380d13 dq 0x650a7354650a7354, 0x650a7354650a7354 dq 0x766a0abb766a0abb, 0x766a0abb766a0abb dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e dq 0x92722c8592722c85, 0x92722c8592722c85 dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 dq 0xa81a664ba81a664b, 0xa81a664ba81a664b dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 dq 0xd192e819d192e819, 0xd192e819d192e819 dq 0xd6990624d6990624, 0xd6990624d6990624 dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 dq 
0x106aa070106aa070, 0x106aa070106aa070 dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 dq 0x1e376c081e376c08, 0x1e376c081e376c08 dq 0x2748774c2748774c, 0x2748774c2748774c dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 dq 0x748f82ee748f82ee, 0x748f82ee748f82ee dq 0x78a5636f78a5636f, 0x78a5636f78a5636f dq 0x84c8781484c87814, 0x84c8781484c87814 dq 0x8cc702088cc70208, 0x8cc702088cc70208 dq 0x90befffa90befffa, 0x90befffa90befffa dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 dq 0x0405060700010203, 0x0c0d0e0f08090a0b section .text %ifdef LINUX ; Linux definitions %define arg1 rdi %define arg2 rsi %else ; Windows definitions %define arg1 rcx %define arg2 rdx %endif ; Common definitions %define STATE arg1 %define INP_SIZE arg2 %define IDX rax %define ROUND rbx %define TBL r12 %define inp0 r8 %define inp1 r9 %define inp2 r10 %define inp3 r11 %define a xmm0 %define b xmm1 %define c xmm2 %define d xmm3 %define e xmm4 %define f xmm5 %define g xmm6 %define h xmm7 %define a0 xmm8 %define a1 xmm9 %define a2 xmm10 %define TT0 xmm14 %define TT1 xmm13 %define TT2 xmm12 %define TT3 xmm11 %define TT4 xmm10 %define TT5 xmm9 %define T1 xmm14 %define TMP xmm15 %define SZ4 4*SHA256_DIGEST_WORD_SIZE ; Size of one vector register %define ROUNDS 64*SZ4 ; Define stack usage struc STACK _DATA: resb SZ4 * 16 _DIGEST: resb SZ4 * NUM_SHA256_DIGEST_WORDS resb 8 ; for alignment, must be odd multiple of 8 endstruc %define MOVPS movups ; transpose r0, r1, r2, r3, t0, t1 ; "transpose" data in {r0..r3} using temps {t0..t3} ; Input looks like: {r0 r1 r2 r3} ; r0 = {a3 a2 a1 a0} ; r1 = {b3 b2 b1 b0} ; r2 = {c3 c2 c1 c0} ; r3 = {d3 d2 d1 d0} ; ; output looks like: {t0 r1 r0 r3} ; t0 = {d0 c0 b0 a0} ; r1 = {d1 c1 b1 a1} ; r0 = {d2 c2 b2 a2} ; r3 = {d3 c3 b3 a3} ; %macro TRANSPOSE 6 %define %%r0 %1 %define %%r1 %2 %define %%r2 %3 %define %%r3 %4 %define %%t0 %5 %define %%t1 %6 movaps %%t0, %%r0 ; t0 = {a3 a2 a1 a0} shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} movaps %%t1, %%r2 ; t1 = {c3 c2 c1 c0} shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} movaps %%r1, %%t0 ; r1 = {b1 b0 a1 a0} shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} movaps %%r3, %%r0 ; r3 = {b3 b2 a3 a2} shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2} shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} %endmacro %macro ROTATE_ARGS 0 %xdefine TMP_ h %xdefine h g %xdefine g f %xdefine f e %xdefine e d %xdefine d c %xdefine c b %xdefine b a %xdefine a TMP_ %endm ; PRORD reg, imm, tmp %macro PRORD 3 %define %%reg %1 %define %%imm %2 %define %%tmp %3 movdqa %%tmp, %%reg psrld %%reg, %%imm pslld %%tmp, (32-(%%imm)) por %%reg, %%tmp %endmacro %macro PRORD 2 PRORD %1, %2, TMP %endmacro ;; arguments passed implicitly in preprocessor symbols i, a...h %macro ROUND_00_15 2 %define %%T1 %1 %define %%i %2 movdqa a0, e ; sig1: a0 = e movdqa a1, e ; sig1: s1 = e PRORD a0, (11-6) ; sig1: a0 = (e >> 5) movdqa a2, f ; ch: a2 = f pxor a2, g ; ch: a2 = f^g pand a2, e ; ch: a2 = (f^g)&e pxor a2, g ; a2 = ch PRORD a1, 25 ; sig1: a1 = (e >> 25) movdqa [SZ4*(%%i&0xf) + rsp],%%T1 paddd %%T1,[TBL + ROUND] ; T1 = W + K pxor a0, e ; sig1: a0 = e ^ (e >> 5) PRORD a0, 6 ; sig1: a0 = (e 
>> 6) ^ (e >> 11) paddd h, a2 ; h = h + ch movdqa a2, a ; sig0: a2 = a PRORD a2, (13-2) ; sig0: a2 = (a >> 11) paddd h, %%T1 ; h = h + ch + W + K pxor a0, a1 ; a0 = sigma1 movdqa a1, a ; sig0: a1 = a movdqa %%T1, a ; maj: T1 = a PRORD a1, 22 ; sig0: a1 = (a >> 22) pxor %%T1, c ; maj: T1 = a^c add ROUND, SZ4 ; ROUND++ pand %%T1, b ; maj: T1 = (a^c)&b paddd h, a0 paddd d, h pxor a2, a ; sig0: a2 = a ^ (a >> 11) PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13) pxor a2, a1 ; a2 = sig0 movdqa a1, a ; maj: a1 = a pand a1, c ; maj: a1 = a&c por a1, %%T1 ; a1 = maj paddd h, a1 ; h = h + ch + W + K + maj paddd h, a2 ; h = h + ch + W + K + maj + sigma0 ROTATE_ARGS %endm ;; arguments passed implicitly in preprocessor symbols i, a...h %macro ROUND_16_XX 2 %define %%T1 %1 %define %%i %2 movdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp] movdqa a1, [SZ4*((%%i-2)&0xf) + rsp] movdqa a0, %%T1 PRORD %%T1, 18-7 movdqa a2, a1 PRORD a1, 19-17 pxor %%T1, a0 PRORD %%T1, 7 pxor a1, a2 PRORD a1, 17 psrld a0, 3 pxor %%T1, a0 psrld a2, 10 pxor a1, a2 paddd %%T1, [SZ4*((%%i-16)&0xf) + rsp] paddd a1, [SZ4*((%%i-7)&0xf) + rsp] paddd %%T1, a1 ROUND_00_15 %%T1, %%i %endm ;; SHA256_ARGS: ;; UINT128 digest[8]; // transposed digests ;; UINT8 *data_ptr[4]; ;; ;; void sha_256_mult_sse(SHA256_ARGS *args, UINT64 num_blocks); ;; arg 1 : STATE : pointer args ;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1) ;; MKGLOBAL(sha_256_mult_sse,function,internal) align 32 sha_256_mult_sse: ; general registers preserved in outer calling routine ; outer calling routine saves all the XMM registers sub rsp, STACK_size ;; Load the pre-transposed incoming digest. movdqa a,[STATE + 0 * SHA256_DIGEST_ROW_SIZE ] movdqa b,[STATE + 1 * SHA256_DIGEST_ROW_SIZE ] movdqa c,[STATE + 2 * SHA256_DIGEST_ROW_SIZE ] movdqa d,[STATE + 3 * SHA256_DIGEST_ROW_SIZE ] movdqa e,[STATE + 4 * SHA256_DIGEST_ROW_SIZE ] movdqa f,[STATE + 5 * SHA256_DIGEST_ROW_SIZE ] movdqa g,[STATE + 6 * SHA256_DIGEST_ROW_SIZE ] movdqa h,[STATE + 7 * SHA256_DIGEST_ROW_SIZE ] DBGPRINTL_XMM "incoming transposed sha256 digest", a, b, c, d, e, f, g, h lea TBL,[rel K256_4] ;; load the address of each of the 4 message lanes ;; getting ready to transpose input onto stack mov inp0,[STATE + _data_ptr_sha256 + 0*PTR_SZ] mov inp1,[STATE + _data_ptr_sha256 + 1*PTR_SZ] mov inp2,[STATE + _data_ptr_sha256 + 2*PTR_SZ] mov inp3,[STATE + _data_ptr_sha256 + 3*PTR_SZ] DBGPRINTL64 "incoming input data ptrs ", inp0, inp1, inp2, inp3 xor IDX, IDX lloop: xor ROUND, ROUND ;; save old digest movdqa [rsp + _DIGEST + 0*SZ4], a movdqa [rsp + _DIGEST + 1*SZ4], b movdqa [rsp + _DIGEST + 2*SZ4], c movdqa [rsp + _DIGEST + 3*SZ4], d movdqa [rsp + _DIGEST + 4*SZ4], e movdqa [rsp + _DIGEST + 5*SZ4], f movdqa [rsp + _DIGEST + 6*SZ4], g movdqa [rsp + _DIGEST + 7*SZ4], h %assign i 0 %rep 4 movdqa TMP, [rel PSHUFFLE_BYTE_FLIP_MASK] MOVPS TT2,[inp0+IDX+i*16] MOVPS TT1,[inp1+IDX+i*16] MOVPS TT4,[inp2+IDX+i*16] MOVPS TT3,[inp3+IDX+i*16] TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5 pshufb TT0, TMP pshufb TT1, TMP pshufb TT2, TMP pshufb TT3, TMP ROUND_00_15 TT0,(i*4+0) ROUND_00_15 TT1,(i*4+1) ROUND_00_15 TT2,(i*4+2) ROUND_00_15 TT3,(i*4+3) %assign i (i+1) %endrep add IDX, 4*4*4 %assign i (i*4) jmp Lrounds_16_xx align 16 Lrounds_16_xx: %rep 16 ROUND_16_XX T1, i %assign i (i+1) %endrep cmp ROUND,ROUNDS jb Lrounds_16_xx ;; add old digest paddd a, [rsp + _DIGEST + 0*SZ4] paddd b, [rsp + _DIGEST + 1*SZ4] paddd c, [rsp + _DIGEST + 2*SZ4] paddd d, [rsp + _DIGEST + 3*SZ4] paddd e, [rsp + _DIGEST + 4*SZ4] paddd f, [rsp + _DIGEST + 5*SZ4] paddd g, 
[rsp + _DIGEST + 6*SZ4] paddd h, [rsp + _DIGEST + 7*SZ4] sub INP_SIZE, 1 ;; unit is blocks jne lloop ; write back to memory (state object) the transposed digest movdqa [STATE+0*SHA256_DIGEST_ROW_SIZE ],a movdqa [STATE+1*SHA256_DIGEST_ROW_SIZE ],b movdqa [STATE+2*SHA256_DIGEST_ROW_SIZE ],c movdqa [STATE+3*SHA256_DIGEST_ROW_SIZE ],d movdqa [STATE+4*SHA256_DIGEST_ROW_SIZE ],e movdqa [STATE+5*SHA256_DIGEST_ROW_SIZE ],f movdqa [STATE+6*SHA256_DIGEST_ROW_SIZE ],g movdqa [STATE+7*SHA256_DIGEST_ROW_SIZE ],h DBGPRINTL_XMM "updated transposed sha256 digest", a, b, c, d, e, f, g, h ; update input pointers add inp0, IDX mov [STATE + _data_ptr_sha256 + 0*8], inp0 add inp1, IDX mov [STATE + _data_ptr_sha256 + 1*8], inp1 add inp2, IDX mov [STATE + _data_ptr_sha256 + 2*8], inp2 add inp3, IDX mov [STATE + _data_ptr_sha256 + 3*8], inp3 DBGPRINTL64 "updated input data ptrs ", inp0, inp1, inp2, inp3 ;;;;;;;;;;;;;;;; ;; Postamble add rsp, STACK_size ; outer calling routine restores XMM and other GP registers ret intel-ipsec-mb-0.48/win_x64.mak000066400000000000000000000224111321406316400162450ustar00rootroot00000000000000# # Copyright (c) 2017, Intel Corporation # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of Intel Corporation nor the names of its contributors # may be used to endorse or promote products derived from this software # without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # Available build options: # DEBUG=y - this option will produce library fit for debugging # SHARED=y - this option will produce shared library (DLL) !ifdef SHARED LIBNAME = libIPSec_MB.dll !else LIBNAME = libIPSec_MB.lib !endif OBJ_DIR = obj !ifdef DEBUG DCFLAGS = /Od /DDEBUG /Z7 DAFLAGS = -gcv8 DLFLAGS = /DEBUG !else DCFLAGS = /O2 /Oi DAFLAGS = DLFLAGS = /RELEASE !endif CC = cl CFLAGS = $(EXTRA_CFLAGS) $(DCFLAGS) /I. 
/Iinclude \ /nologo /Y- /W3 /WX- /Gm- /fp:precise /EHsc LIB_TOOL = lib LIBFLAGS = /nologo /machine:X64 /nodefaultlib LINK_TOOL = link LINKFLAGS = $(DLFLAGS) /nologo /machine:X64 AS = nasm AFLAGS = $(DAFLAGS) -fwin64 -Xvc -DWIN_ABI -Iinclude/ -I./ -Iavx/ -Iavx2/ -Iavx512/ -Isse/ lib_objs1 = \ $(OBJ_DIR)\aes128_cbc_dec_by4_sse.obj \ $(OBJ_DIR)\aes128_cbc_dec_by8_avx.obj \ $(OBJ_DIR)\aes128_cntr_by4_sse.obj \ $(OBJ_DIR)\aes128_cntr_by8_avx.obj \ $(OBJ_DIR)\aes128_ecbenc_x3.obj \ $(OBJ_DIR)\aes192_cbc_dec_by4_sse.obj \ $(OBJ_DIR)\aes192_cbc_dec_by8_avx.obj \ $(OBJ_DIR)\aes192_cntr_by4_sse.obj \ $(OBJ_DIR)\aes192_cntr_by8_avx.obj \ $(OBJ_DIR)\aes256_cbc_dec_by4_sse.obj \ $(OBJ_DIR)\aes256_cbc_dec_by8_avx.obj \ $(OBJ_DIR)\aes256_cntr_by4_sse.obj \ $(OBJ_DIR)\aes256_cntr_by8_avx.obj \ $(OBJ_DIR)\aes_cfb_128_sse.obj \ $(OBJ_DIR)\aes_cfb_128_avx.obj \ $(OBJ_DIR)\aes128_cbc_mac_x4.obj \ $(OBJ_DIR)\aes128_cbc_mac_x8.obj \ $(OBJ_DIR)\aes_cbc_enc_128_x4.obj \ $(OBJ_DIR)\aes_cbc_enc_128_x8.obj \ $(OBJ_DIR)\aes_cbc_enc_192_x4.obj \ $(OBJ_DIR)\aes_cbc_enc_192_x8.obj \ $(OBJ_DIR)\aes_cbc_enc_256_x4.obj \ $(OBJ_DIR)\aes_cbc_enc_256_x8.obj \ $(OBJ_DIR)\aes_keyexp_128.obj \ $(OBJ_DIR)\aes_keyexp_192.obj \ $(OBJ_DIR)\aes_keyexp_256.obj \ $(OBJ_DIR)\aes_xcbc_mac_128_x4.obj \ $(OBJ_DIR)\aes_xcbc_mac_128_x8.obj \ $(OBJ_DIR)\md5_x4x2_avx.obj \ $(OBJ_DIR)\md5_x4x2_sse.obj \ $(OBJ_DIR)\md5_x8x2_avx2.obj \ $(OBJ_DIR)\save_xmms.obj \ $(OBJ_DIR)\sha1_mult_avx.obj \ $(OBJ_DIR)\sha1_mult_sse.obj \ $(OBJ_DIR)\sha1_ni_x2_sse.obj \ $(OBJ_DIR)\sha1_one_block_avx.obj \ $(OBJ_DIR)\sha1_one_block_sse.obj \ $(OBJ_DIR)\sha1_x8_avx2.obj \ $(OBJ_DIR)\sha1_x16_avx512.obj \ $(OBJ_DIR)\sha224_one_block_avx.obj \ $(OBJ_DIR)\sha224_one_block_sse.obj \ $(OBJ_DIR)\sha256_oct_avx2.obj \ $(OBJ_DIR)\sha256_one_block_avx.obj \ $(OBJ_DIR)\sha256_one_block_sse.obj \ $(OBJ_DIR)\sha256_ni_x2_sse.obj \ $(OBJ_DIR)\sha256_x16_avx512.obj \ $(OBJ_DIR)\sha384_one_block_avx.obj \ $(OBJ_DIR)\sha384_one_block_sse.obj \ $(OBJ_DIR)\sha512_one_block_avx.obj \ $(OBJ_DIR)\sha512_one_block_sse.obj \ $(OBJ_DIR)\sha512_x2_avx.obj \ $(OBJ_DIR)\sha512_x2_sse.obj \ $(OBJ_DIR)\sha512_x4_avx2.obj \ $(OBJ_DIR)\sha512_x8_avx512.obj \ $(OBJ_DIR)\sha_256_mult_avx.obj \ $(OBJ_DIR)\sha_256_mult_sse.obj \ $(OBJ_DIR)\aes_xcbc_expand_key.obj \ $(OBJ_DIR)\md5_one_block.obj \ $(OBJ_DIR)\des_key.obj \ $(OBJ_DIR)\des_basic.obj \ $(OBJ_DIR)\des_x16_avx512.obj lib_objs2 = \ $(OBJ_DIR)\mb_mgr_aes192_flush_avx.obj \ $(OBJ_DIR)\mb_mgr_aes192_flush_sse.obj \ $(OBJ_DIR)\mb_mgr_aes192_submit_avx.obj \ $(OBJ_DIR)\mb_mgr_aes192_submit_sse.obj \ $(OBJ_DIR)\mb_mgr_aes256_flush_avx.obj \ $(OBJ_DIR)\mb_mgr_aes256_flush_sse.obj \ $(OBJ_DIR)\mb_mgr_aes256_submit_avx.obj \ $(OBJ_DIR)\mb_mgr_aes256_submit_sse.obj \ $(OBJ_DIR)\mb_mgr_aes_flush_avx.obj \ $(OBJ_DIR)\mb_mgr_aes_flush_sse.obj \ $(OBJ_DIR)\mb_mgr_aes_submit_avx.obj \ $(OBJ_DIR)\mb_mgr_aes_submit_sse.obj \ $(OBJ_DIR)\mb_mgr_aes_xcbc_flush_avx.obj \ $(OBJ_DIR)\mb_mgr_aes_xcbc_flush_sse.obj \ $(OBJ_DIR)\mb_mgr_aes_xcbc_submit_avx.obj \ $(OBJ_DIR)\mb_mgr_aes_xcbc_submit_sse.obj \ $(OBJ_DIR)\mb_mgr_hmac_flush_avx.obj \ $(OBJ_DIR)\mb_mgr_hmac_flush_avx2.obj \ $(OBJ_DIR)\mb_mgr_hmac_flush_sse.obj \ $(OBJ_DIR)\mb_mgr_hmac_flush_ni_sse.obj \ $(OBJ_DIR)\mb_mgr_hmac_flush_avx512.obj \ $(OBJ_DIR)\mb_mgr_hmac_md5_flush_avx.obj \ $(OBJ_DIR)\mb_mgr_hmac_md5_flush_avx2.obj \ $(OBJ_DIR)\mb_mgr_hmac_md5_flush_sse.obj \ $(OBJ_DIR)\mb_mgr_hmac_md5_submit_avx.obj \ $(OBJ_DIR)\mb_mgr_hmac_md5_submit_avx2.obj \ $(OBJ_DIR)\mb_mgr_hmac_md5_submit_sse.obj \ 
$(OBJ_DIR)\mb_mgr_hmac_sha_224_flush_avx.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_224_flush_avx2.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_224_flush_avx512.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_224_flush_sse.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_224_flush_ni_sse.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_224_submit_avx.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_224_submit_avx2.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_224_submit_avx512.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_224_submit_sse.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_224_submit_ni_sse.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_256_flush_avx.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_256_flush_avx2.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_256_flush_avx512.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_256_flush_sse.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_256_flush_ni_sse.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_256_submit_avx.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_256_submit_avx2.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_256_submit_avx512.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_256_submit_sse.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_256_submit_ni_sse.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_384_flush_avx.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_384_flush_avx2.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_384_flush_avx512.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_384_flush_sse.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_384_submit_avx.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_384_submit_avx2.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_384_submit_avx512.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_384_submit_sse.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_512_flush_avx.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_512_flush_avx2.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_512_flush_avx512.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_512_flush_sse.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_512_submit_avx.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_512_submit_avx2.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_512_submit_avx512.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha_512_submit_sse.obj \ $(OBJ_DIR)\mb_mgr_hmac_submit_avx.obj \ $(OBJ_DIR)\mb_mgr_hmac_submit_avx2.obj \ $(OBJ_DIR)\mb_mgr_hmac_submit_sse.obj \ $(OBJ_DIR)\mb_mgr_hmac_submit_ni_sse.obj \ $(OBJ_DIR)\mb_mgr_hmac_submit_avx512.obj \ $(OBJ_DIR)\mb_mgr_avx.obj \ $(OBJ_DIR)\mb_mgr_avx2.obj \ $(OBJ_DIR)\mb_mgr_avx512.obj \ $(OBJ_DIR)\mb_mgr_des_avx512.obj \ $(OBJ_DIR)\mb_mgr_sse.obj gcm_objs = \ $(OBJ_DIR)\gcm128_sse.obj \ $(OBJ_DIR)\gcm128_avx_gen2.obj \ $(OBJ_DIR)\gcm128_avx_gen4.obj \ $(OBJ_DIR)\gcm192_sse.obj \ $(OBJ_DIR)\gcm192_avx_gen2.obj \ $(OBJ_DIR)\gcm192_avx_gen4.obj \ $(OBJ_DIR)\gcm256_sse.obj \ $(OBJ_DIR)\gcm256_avx_gen2.obj \ $(OBJ_DIR)\gcm256_avx_gen4.obj !ifdef NO_GCM all_objs = $(lib_objs1) $(lib_objs2) CFLAGS = $(CFLAGS) -DNO_GCM !else all_objs = $(lib_objs1) $(lib_objs2) $(gcm_objs) !endif all: $(LIBNAME) $(LIBNAME): $(all_objs) !ifdef SHARED $(LINK_TOOL) $(LINKFLAGS) /DLL /DEF:libIPSec_MB.def /OUT:$@ $(all_objs) !else $(LIB_TOOL) $(LIBFLAGS) /out:$@ $(all_objs) !endif $(all_objs): $(OBJ_DIR) {.\}.c{$(OBJ_DIR)}.obj: $(CC) /Fo$@ /c $(CFLAGS) $< {.\}.asm{$(OBJ_DIR)}.obj: $(AS) -o $@ $(AFLAGS) $< {sse\}.c{$(OBJ_DIR)}.obj: $(CC) /Fo$@ /c $(CFLAGS) $< {sse\}.asm{$(OBJ_DIR)}.obj: $(AS) -o $@ $(AFLAGS) $< {avx\}.c{$(OBJ_DIR)}.obj: $(CC) /Fo$@ /c $(CFLAGS) $< {avx\}.asm{$(OBJ_DIR)}.obj: $(AS) -o $@ $(AFLAGS) $< {avx2\}.c{$(OBJ_DIR)}.obj: $(CC) /Fo$@ /c $(CFLAGS) $< {avx2\}.asm{$(OBJ_DIR)}.obj: $(AS) -o $@ $(AFLAGS) $< {avx512\}.c{$(OBJ_DIR)}.obj: $(CC) /Fo$@ /c $(CFLAGS) $< {avx512\}.asm{$(OBJ_DIR)}.obj: $(AS) -o $@ $(AFLAGS) $< {include\}.asm{$(OBJ_DIR)}.obj: $(AS) -o $@ $(AFLAGS) $< $(OBJ_DIR): mkdir $(OBJ_DIR) clean: del /q $(lib_objs1) del /q $(lib_objs2) del /q $(gcm_objs) del /q $(LIBNAME).*
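# Example usage (assumes an x64 Visual Studio developer command prompt with
# nasm available on the PATH):
#   nmake /f win_x64.mak                  # static library, release build
#   nmake /f win_x64.mak SHARED=y         # shared library (DLL)
#   nmake /f win_x64.mak DEBUG=y          # build fit for debugging
#   nmake /f win_x64.mak NO_GCM=1         # build without GCM support
#   nmake /f win_x64.mak clean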