diff --git a/.wakemanifest b/.wakemanifest index b9493019d..a3a835ab1 100644 --- a/.wakemanifest +++ b/.wakemanifest @@ -58,7 +58,7 @@ tools/wake-hash/hash.wake tools/wake-migrate/wake-migrate.wake tools/wake-unit/wake-unit.wake tools/wake/wake.wake -vendor/blake2/blake2.wake +vendor/blake3/blake3.wake vendor/emscripten.wake vendor/fuse.wake vendor/git.wake diff --git a/LICENSE b/LICENSE index c2958af47..89f0b4e70 100644 --- a/LICENSE +++ b/LICENSE @@ -4,7 +4,7 @@ Each file includes a header which describes the relevant license. In broad strokes: wake LICENSE.Apache2 everything not in vendor/ - BLAKE2 LICENSE.CC0 vendor/blake2/ + BLAKE3 LICENSE.CC0 vendor/blake3/ gopt LICENSE.TFL vendor/gopt/ lemon public domain vendor/lemon/ SipHash LICENSE.CC0 vendor/siphash/ diff --git a/Makefile b/Makefile index 40b315d96..2b9f5d14d 100644 --- a/Makefile +++ b/Makefile @@ -28,14 +28,45 @@ COMMON_CPP := $(foreach dir,$(COMMON_DIRS),$(wildcard $(dir)/*.cpp)) COMMON_OBJS := src/json/jlexer.o \ $(patsubst %.cpp,%.o,$(COMMON_CPP)) $(patsubst %.c,%.o,$(COMMON_C)) +# BLAKE3 platform detection +UNAME_M := $(shell uname -m) + +# BLAKE3 C files (always compiled) +BLAKE3_C_COMMON := vendor/blake3/blake3.c vendor/blake3/blake3_portable.c vendor/blake3/blake3_dispatch.c + +# BLAKE3 platform-specific files and flags +ifeq ($(UNAME_M),x86_64) + # x86_64: Use SSE2/SSE4.1/AVX2/AVX-512 assembly implementations + BLAKE3_ASM := vendor/blake3/blake3_sse2_x86-64_unix.S \ + vendor/blake3/blake3_sse41_x86-64_unix.S \ + vendor/blake3/blake3_avx2_x86-64_unix.S \ + vendor/blake3/blake3_avx512_x86-64_unix.S + BLAKE3_C := $(BLAKE3_C_COMMON) + BLAKE3_CFLAGS := +else ifeq ($(UNAME_M),aarch64) + # ARM64: Use NEON implementation (requires BLAKE3_USE_NEON=1 for dispatch) + BLAKE3_ASM := + BLAKE3_C := $(BLAKE3_C_COMMON) vendor/blake3/blake3_neon.c + BLAKE3_CFLAGS := -DBLAKE3_USE_NEON=1 +else + # Other platforms: Use portable implementation only + BLAKE3_ASM := + BLAKE3_C := $(BLAKE3_C_COMMON) + BLAKE3_CFLAGS := +endif + +BLAKE3_ASM_OBJS := $(patsubst %.S,%.o,$(BLAKE3_ASM)) +BLAKE3_OBJS := $(patsubst %.c,%.o,$(BLAKE3_C)) $(BLAKE3_ASM_OBJS) + WAKE_DIRS := $(COMMON_DIRS) src/dst src/optimizer src/parser src/runtime src/types src/wcl tools/wake + WAKE_C := $(foreach dir,$(WAKE_DIRS),$(wildcard $(dir)/*.c)) \ - vendor/blake2/blake2b-ref.c vendor/utf8proc/utf8proc.c \ + $(BLAKE3_C) vendor/utf8proc/utf8proc.c \ vendor/siphash/siphash.c vendor/whereami/whereami.c \ vendor/gopt/gopt.c vendor/gopt/gopt-errors.c vendor/gopt/gopt-arg.c WAKE_CPP := $(foreach dir,$(WAKE_DIRS),$(wildcard $(dir)/*.cpp)) WAKE_OBJS := src/parser/lexer.o src/parser/parser.o src/json/jlexer.o \ - $(patsubst %.cpp,%.o,$(WAKE_CPP)) $(patsubst %.c,%.o,$(WAKE_C)) + $(patsubst %.cpp,%.o,$(WAKE_CPP)) $(patsubst %.c,%.o,$(WAKE_C)) $(BLAKE3_ASM_OBJS) WAKE_ENV := WAKE_PATH=$(shell dirname $(shell which $(firstword $(CC)))) @@ -104,10 +135,10 @@ bin/wakebox: tools/wakebox/main.cpp src/wakefs/*.cpp vendor/gopt/*.c $(COMMON_O lib/wake/fuse-waked: tools/fuse-waked/main.cpp $(COMMON_OBJS) $(CXX) $(CFLAGS) $(LOCAL_CFLAGS) $(FUSE_CFLAGS) $(CXX_VERSION) $^ -o $@ $(LDFLAGS) $(CORE_LDFLAGS) $(FUSE_LDFLAGS) -lib/wake/shim-wake: tools/shim-wake/main.o vendor/blake2/blake2b-ref.o src/wcl/filepath.o +lib/wake/shim-wake: tools/shim-wake/main.o $(BLAKE3_OBJS) src/wcl/filepath.o $(CXX) $(CFLAGS) -o $@ $^ $(LDFLAGS) $(CORE_LDFLAGS) -lib/wake/wake-hash: tools/wake-hash/main.o vendor/blake2/blake2b-ref.o $(COMMON_OBJS) +lib/wake/wake-hash: tools/wake-hash/main.o $(BLAKE3_OBJS) $(COMMON_OBJS) $(CXX) $(CFLAGS) -o $@ $^ $(LOCAL_CFLAGS) $(CXX_VERSION) $(LDFLAGS) $(CORE_LDFLAGS) bin/wake-migrate: tools/wake-migrate/main.o $(COMMON_OBJS) @@ -119,6 +150,14 @@ bin/wake-migrate: tools/wake-migrate/main.o $(COMMON_OBJS) %.o: %.c $(filter-out src/version.h,$(wildcard */*.h)) $(CC) $(CFLAGS) $(LOCAL_CFLAGS) -o $@ -c $< +# C rule for BLAKE3 files (needs BLAKE3_CFLAGS for NEON on aarch64) +vendor/blake3/%.o: vendor/blake3/%.c + $(CC) $(CFLAGS) $(LOCAL_CFLAGS) $(BLAKE3_CFLAGS) -o $@ -c $< + +# Assembly rule for BLAKE3 SIMD implementations +vendor/blake3/%.o: vendor/blake3/%.S + $(CC) $(CFLAGS) -o $@ -c $< + # Rely on wake to recreate this file if re2c is available %.cpp: %.cpp.gz gzip -dc $^ > $@.tmp diff --git a/debian/copyright b/debian/copyright index 091decb33..a0263c764 100644 --- a/debian/copyright +++ b/debian/copyright @@ -15,9 +15,9 @@ Copyright: 2012-2016 Jean-Philippe Aumasson 2012-2014 Daniel J. Bernstein License: CC0 -Files: shim/blake2* -Copyright: 2012 Samuel Neves -License: CC0 +Files: vendor/blake3/* +Copyright: 2019-2020 Samuel Neves and Jack O'Connor +License: CC0 or Apache-2.0 Files: utf8proc/* Copyright: 2018 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors. diff --git a/src/runtime/runtime.wake b/src/runtime/runtime.wake index 589e64840..2e2bd2beb 100644 --- a/src/runtime/runtime.wake +++ b/src/runtime/runtime.wake @@ -22,5 +22,5 @@ target runtime variant = @here variant (optimizer, dst, types, json, util, Nil) - (util, gmp, sqlite3, re2, ncurses, blake2, Nil) + (util, gmp, sqlite3, re2, ncurses, blake3, Nil) Nil diff --git a/src/runtime/schema.h b/src/runtime/schema.h index 892ea91c8..6f7168598 100644 --- a/src/runtime/schema.h +++ b/src/runtime/schema.h @@ -1,7 +1,7 @@ #ifndef WAKE_SCHEMA_H #define WAKE_SCHEMA_H -#define SCHEMA_VERSION "9" +#define SCHEMA_VERSION "10" // Increment the SCHEMA_VERSION every time the below string changes. // Also add migrations to the wake-migration tool if needed. diff --git a/src/runtime/string.cpp b/src/runtime/string.cpp index 9361892a2..4421a49c8 100644 --- a/src/runtime/string.cpp +++ b/src/runtime/string.cpp @@ -29,7 +29,7 @@ #include #include -#include "blake2/blake2.h" +#include "blake3/blake3.h" #include "gc.h" #include "json/utf8.h" #include "prim.h" @@ -756,11 +756,11 @@ static PRIMTYPE(type_hash_str) { return args.size() == 1 && args[0]->unify(Data::typeString) && out->unify(Data::typeString); } -static void blake2b(const std::string &str, uint64_t (*out)[4]) { - blake2b_state S; - blake2b_init(&S, sizeof(*out)); - blake2b_update(&S, reinterpret_cast(str.data()), str.size()); - blake2b_final(&S, reinterpret_cast(*out), sizeof(*out)); +static void blake3(const std::string &str, uint64_t (*out)[4]) { + blake3_hasher hasher; + blake3_hasher_init(&hasher); + blake3_hasher_update(&hasher, reinterpret_cast(str.data()), str.size()); + blake3_hasher_finalize(&hasher, reinterpret_cast(*out), sizeof(*out)); } static PRIMFN(prim_hash_str) { @@ -768,7 +768,7 @@ static PRIMFN(prim_hash_str) { STRING(str, 0); uint64_t hash[4]; - blake2b(str->as_str(), &hash); + blake3(str->as_str(), &hash); auto hex = wcl::to_hex(&hash); RETURN(String::alloc(runtime.heap, std::move(hex))); diff --git a/tools/shim-wake/main.cpp b/tools/shim-wake/main.cpp index 56199fd6e..7dbd8e1dd 100644 --- a/tools/shim-wake/main.cpp +++ b/tools/shim-wake/main.cpp @@ -31,7 +31,7 @@ #include #include -#include "blake2/blake2.h" +#include "blake3/blake3.h" #include "compat/nofollow.h" #include "wcl/filepath.h" @@ -45,7 +45,7 @@ static int do_hash_dir() { } static int do_hash_link(const char *link) { - blake2b_state S; + blake3_hasher hasher; uint8_t hash[HASH_BYTES]; constexpr ssize_t len = 8192; char buffer[len]; @@ -61,9 +61,9 @@ static int do_hash_link(const char *link) { return 1; } - blake2b_init(&S, sizeof(hash)); - blake2b_update(&S, (uint8_t *)buffer, out); - blake2b_final(&S, &hash[0], sizeof(hash)); + blake3_hasher_init(&hasher); + blake3_hasher_update(&hasher, (uint8_t *)buffer, out); + blake3_hasher_finalize(&hasher, &hash[0], sizeof(hash)); for (int i = 0; i < (int)sizeof(hash); ++i) printf("%02x", hash[i]); printf("\n"); @@ -72,13 +72,14 @@ static int do_hash_link(const char *link) { } static int do_hash_file(const char *file, int fd) { - blake2b_state S; + blake3_hasher hasher; uint8_t hash[HASH_BYTES], buffer[8192]; ssize_t got; - blake2b_init(&S, sizeof(hash)); - while ((got = read(fd, &buffer[0], sizeof(buffer))) > 0) blake2b_update(&S, &buffer[0], got); - blake2b_final(&S, &hash[0], sizeof(hash)); + blake3_hasher_init(&hasher); + while ((got = read(fd, &buffer[0], sizeof(buffer))) > 0) + blake3_hasher_update(&hasher, &buffer[0], got); + blake3_hasher_finalize(&hasher, &hash[0], sizeof(hash)); if (got < 0) { fprintf(stderr, "shim hash read(%s): %s\n", file, strerror(errno)); diff --git a/tools/shim-wake/shim.wake b/tools/shim-wake/shim.wake index c23e97c4f..fccb91bc9 100644 --- a/tools/shim-wake/shim.wake +++ b/tools/shim-wake/shim.wake @@ -19,4 +19,4 @@ from wake import _ from gcc_wake import _ target buildShim variant: Result (List Path) Error = - tool @here Nil variant "lib/wake/shim-wake" (blake2, compat, wcl, Nil) Nil Nil + tool @here Nil variant "lib/wake/shim-wake" (blake3, compat, wcl, Nil) Nil Nil diff --git a/tools/wake-hash/hash.wake b/tools/wake-hash/hash.wake index 92df69fc2..4f3f15daf 100644 --- a/tools/wake-hash/hash.wake +++ b/tools/wake-hash/hash.wake @@ -19,4 +19,4 @@ from wake import _ from gcc_wake import _ target buildHash variant: Result (List Path) Error = - tool @here Nil variant "lib/wake/wake-hash" (blake2, compat, Nil) Nil Nil + tool @here Nil variant "lib/wake/wake-hash" (blake3, compat, Nil) Nil Nil diff --git a/tools/wake-hash/main.cpp b/tools/wake-hash/main.cpp index 76d8617c0..f922b8d3d 100644 --- a/tools/wake-hash/main.cpp +++ b/tools/wake-hash/main.cpp @@ -40,7 +40,7 @@ #include #include -#include "blake2/blake2.h" +#include "blake3/blake3.h" #include "compat/nofollow.h" #include "wcl/optional.h" #include "wcl/unique_fd.h" @@ -130,7 +130,7 @@ static wcl::optional hash_exotic() { static wcl::optional hash_dir() { return wcl::some(Hash256()); } static wcl::optional hash_link(const char* link) { - blake2b_state S; + blake3_hasher hasher; uint8_t hash[HASH_BYTES]; std::vector buffer(8192, 0); @@ -147,21 +147,22 @@ static wcl::optional hash_link(const char* link) { buffer.resize(2 * buffer.size(), 0); } - blake2b_init(&S, sizeof(hash)); - blake2b_update(&S, reinterpret_cast(buffer.data()), buffer.size()); - blake2b_final(&S, &hash[0], sizeof(hash)); + blake3_hasher_init(&hasher); + blake3_hasher_update(&hasher, reinterpret_cast(buffer.data()), buffer.size()); + blake3_hasher_finalize(&hasher, &hash[0], sizeof(hash)); return wcl::some(Hash256::from_hash(&hash)); } static wcl::optional hash_file(const char* file, int fd) { - blake2b_state S; + blake3_hasher hasher; uint8_t hash[HASH_BYTES], buffer[8192]; ssize_t got; - blake2b_init(&S, sizeof(hash)); - while ((got = read(fd, &buffer[0], sizeof(buffer))) > 0) blake2b_update(&S, &buffer[0], got); - blake2b_final(&S, &hash[0], sizeof(hash)); + blake3_hasher_init(&hasher); + while ((got = read(fd, &buffer[0], sizeof(buffer))) > 0) + blake3_hasher_update(&hasher, &buffer[0], got); + blake3_hasher_finalize(&hasher, &hash[0], sizeof(hash)); if (got < 0) { std::cerr << "wake-hash read(" << file << "): " << strerror(errno) << std::endl; diff --git a/tools/wake-migrate/main.cpp b/tools/wake-migrate/main.cpp index 0c476289b..a20b24dbd 100644 --- a/tools/wake-migrate/main.cpp +++ b/tools/wake-migrate/main.cpp @@ -190,6 +190,22 @@ static std::vector get_migrations() { }, "Convert runner_status from INTEGER to TEXT"}, + // Version 9 -> 10: Hash algorithm changed from BLAKE2b to BLAKE3 + // Migration is not supported - user must delete wake.db and reinitialize + {9, 10, + [](sqlite3* db) -> bool { + (void)db; // unused + std::cerr << std::endl; + std::cerr << "Wake has been upgraded from BLAKE2b to BLAKE3 hashing." << std::endl; + std::cerr << "Your existing wake.db is incompatible and must be deleted." << std::endl; + std::cerr << std::endl; + std::cerr << "Please run:" << std::endl; + std::cerr << " rm wake.db && wake --init ." << std::endl; + std::cerr << std::endl; + return false; + }, + "Hash algorithm changed from BLAKE2b to BLAKE3 - manual deletion required"}, + }; } diff --git a/vendor/blake2/blake2-impl.h b/vendor/blake2/blake2-impl.h deleted file mode 100644 index 801f1ae7b..000000000 --- a/vendor/blake2/blake2-impl.h +++ /dev/null @@ -1,160 +0,0 @@ -/* - BLAKE2 reference source code package - optimized C implementations - - Written in 2012 by Samuel Neves - - To the extent possible under law, the author(s) have dedicated all copyright - and related and neighboring rights to this software to the public domain - worldwide. This software is distributed without any warranty. - - You should have received a copy of the CC0 Public Domain Dedication along with - this software. If not, see . -*/ -#pragma once -#ifndef __BLAKE2_IMPL_H__ -#define __BLAKE2_IMPL_H__ - -#if defined(_WIN32) || defined(WIN32) -#include -#endif - -#include -#include -#include -#include "config.h" - -#define BLAKE2_IMPL_CAT(x,y) x ## y -#define BLAKE2_IMPL_EVAL(x,y) BLAKE2_IMPL_CAT(x,y) -#define BLAKE2_IMPL_NAME(fun) BLAKE2_IMPL_EVAL(fun, SUFFIX) - -static inline uint32_t load32( const void *src ) -{ -#if defined(NATIVE_LITTLE_ENDIAN) - uint32_t w; - memcpy( &w, src, sizeof( w ) ); - return w; -#else - const uint8_t *p = ( uint8_t * )src; - uint32_t w = *p++; - w |= ( uint32_t )( *p++ ) << 8; - w |= ( uint32_t )( *p++ ) << 16; - w |= ( uint32_t )( *p++ ) << 24; - return w; -#endif -} - -static inline uint64_t load64( const void *src ) -{ -#if defined(NATIVE_LITTLE_ENDIAN) - uint64_t w; - memcpy( &w, src, sizeof( w ) ); - return w; -#else - const uint8_t *p = ( uint8_t * )src; - uint64_t w = *p++; - w |= ( uint64_t )( *p++ ) << 8; - w |= ( uint64_t )( *p++ ) << 16; - w |= ( uint64_t )( *p++ ) << 24; - w |= ( uint64_t )( *p++ ) << 32; - w |= ( uint64_t )( *p++ ) << 40; - w |= ( uint64_t )( *p++ ) << 48; - w |= ( uint64_t )( *p++ ) << 56; - return w; -#endif -} - -static inline void store32( void *dst, uint32_t w ) -{ -#if defined(NATIVE_LITTLE_ENDIAN) - memcpy( dst, &w, sizeof( w ) ); -#else - uint8_t *p = ( uint8_t * )dst; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; -#endif -} - -static inline void store64( void *dst, uint64_t w ) -{ -#if defined(NATIVE_LITTLE_ENDIAN) - memcpy( dst, &w, sizeof( w ) ); -#else - uint8_t *p = ( uint8_t * )dst; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; -#endif -} - -static inline uint64_t load48( const void *src ) -{ - const uint8_t *p = ( const uint8_t * )src; - uint64_t w = *p++; - w |= ( uint64_t )( *p++ ) << 8; - w |= ( uint64_t )( *p++ ) << 16; - w |= ( uint64_t )( *p++ ) << 24; - w |= ( uint64_t )( *p++ ) << 32; - w |= ( uint64_t )( *p++ ) << 40; - return w; -} - -static inline void store48( void *dst, uint64_t w ) -{ - uint8_t *p = ( uint8_t * )dst; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; -} - -static inline uint32_t rotl32( const uint32_t w, const unsigned c ) -{ - return ( w << c ) | ( w >> ( 32 - c ) ); -} - -static inline uint64_t rotl64( const uint64_t w, const unsigned c ) -{ - return ( w << c ) | ( w >> ( 64 - c ) ); -} - -static inline uint32_t rotr32( const uint32_t w, const unsigned c ) -{ - return ( w >> c ) | ( w << ( 32 - c ) ); -} - -static inline uint64_t rotr64( const uint64_t w, const unsigned c ) -{ - return ( w >> c ) | ( w << ( 64 - c ) ); -} - -/* prevents compiler optimizing out memset() */ -static inline void secure_zero_memory(void *v, size_t n) -{ -#if defined(_WIN32) || defined(WIN32) - SecureZeroMemory(v, n); -#else -// prioritize first the general C11 call -#if defined(HAVE_MEMSET_S) - memset_s(v, n, 0, n); -#elif defined(HAVE_EXPLICIT_BZERO) - explicit_bzero(v, n); -#elif defined(HAVE_EXPLICIT_MEMSET) - explicit_memset(v, 0, n); -#else - memset(v, 0, n); - __asm__ __volatile__("" :: "r"(v) : "memory"); -#endif -#endif -} - -#endif - diff --git a/vendor/blake2/blake2.h b/vendor/blake2/blake2.h deleted file mode 100644 index 5ca17f610..000000000 --- a/vendor/blake2/blake2.h +++ /dev/null @@ -1,182 +0,0 @@ -/* - BLAKE2 reference source code package - optimized C implementations - - Written in 2012 by Samuel Neves - - To the extent possible under law, the author(s) have dedicated all copyright - and related and neighboring rights to this software to the public domain - worldwide. This software is distributed without any warranty. - - You should have received a copy of the CC0 Public Domain Dedication along with - this software. If not, see . -*/ -#pragma once -#ifndef __BLAKE2_H__ -#define __BLAKE2_H__ - -#include -#include - -#if defined(_WIN32) || defined(__CYGWIN__) - #define BLAKE2_DLL_IMPORT __declspec(dllimport) - #define BLAKE2_DLL_EXPORT __declspec(dllexport) - #define BLAKE2_DLL_PRIVATE -#elif __GNUC__ >= 4 - #define BLAKE2_DLL_IMPORT __attribute__ ((visibility ("default"))) - #define BLAKE2_DLL_EXPORT __attribute__ ((visibility ("default"))) - #define BLAKE2_DLL_PRIVATE __attribute__ ((visibility ("hidden"))) -#else - #define BLAKE2_DLL_IMPORT - #define BLAKE2_DLL_EXPORT - #define BLAKE2_DLL_PRIVATE -#endif - -#if defined(BLAKE2_DLL) - #if defined(BLAKE2_DLL_EXPORTS) // defined if we are building the DLL - #define BLAKE2_API BLAKE2_DLL_EXPORT - #else - #define BLAKE2_API BLAKE2_DLL_IMPORT - #endif - #define BLAKE2_PRIVATE BLAKE2_DLL_PRIVATE // must only be used by hidden logic -#else - #define BLAKE2_API - #define BLAKE2_PRIVATE -#endif - -#if defined(__cplusplus) -extern "C" { -#elif defined(_MSC_VER) && !defined(inline) -#define inline __inline -#endif - - enum blake2s_constant - { - BLAKE2S_BLOCKBYTES = 64, - BLAKE2S_OUTBYTES = 32, - BLAKE2S_KEYBYTES = 32, - BLAKE2S_SALTBYTES = 8, - BLAKE2S_PERSONALBYTES = 8 - }; - - enum blake2b_constant - { - BLAKE2B_BLOCKBYTES = 128, - BLAKE2B_OUTBYTES = 64, - BLAKE2B_KEYBYTES = 64, - BLAKE2B_SALTBYTES = 16, - BLAKE2B_PERSONALBYTES = 16 - }; - -#pragma pack(push, 1) - typedef struct __blake2s_param - { - uint8_t digest_length; // 1 - uint8_t key_length; // 2 - uint8_t fanout; // 3 - uint8_t depth; // 4 - uint32_t leaf_length; // 8 - uint8_t node_offset[6];// 14 - uint8_t node_depth; // 15 - uint8_t inner_length; // 16 - // uint8_t reserved[0]; - uint8_t salt[BLAKE2S_SALTBYTES]; // 24 - uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32 - } blake2s_param; - - typedef struct __blake2s_state - { - uint32_t h[8]; - uint32_t t[2]; - uint32_t f[2]; - uint8_t buf[2 * BLAKE2S_BLOCKBYTES]; - uint32_t buflen; - uint8_t outlen; - uint8_t last_node; - } blake2s_state; - - typedef struct __blake2b_param - { - uint8_t digest_length; // 1 - uint8_t key_length; // 2 - uint8_t fanout; // 3 - uint8_t depth; // 4 - uint32_t leaf_length; // 8 - uint64_t node_offset; // 16 - uint8_t node_depth; // 17 - uint8_t inner_length; // 18 - uint8_t reserved[14]; // 32 - uint8_t salt[BLAKE2B_SALTBYTES]; // 48 - uint8_t personal[BLAKE2B_PERSONALBYTES]; // 64 - } blake2b_param; - - typedef struct __blake2b_state - { - uint64_t h[8]; - uint64_t t[2]; - uint64_t f[2]; - uint8_t buf[2 * BLAKE2B_BLOCKBYTES]; - uint32_t buflen; - uint8_t outlen; - uint8_t last_node; - } blake2b_state; - - typedef struct __blake2sp_state - { - blake2s_state S[8][1]; - blake2s_state R[1]; - uint8_t buf[8 * BLAKE2S_BLOCKBYTES]; - uint32_t buflen; - uint8_t outlen; - } blake2sp_state; - - typedef struct __blake2bp_state - { - blake2b_state S[4][1]; - blake2b_state R[1]; - uint8_t buf[4 * BLAKE2B_BLOCKBYTES]; - uint32_t buflen; - uint8_t outlen; - } blake2bp_state; -#pragma pack(pop) - - // Streaming API - BLAKE2_API int blake2s_init( blake2s_state *S, size_t outlen ); - BLAKE2_API int blake2s_init_key( blake2s_state *S, size_t outlen, const void *key, size_t keylen ); - BLAKE2_API int blake2s_init_param( blake2s_state *S, const blake2s_param *P ); - BLAKE2_API int blake2s_update( blake2s_state *S, const uint8_t *in, size_t inlen ); - BLAKE2_API int blake2s_final( blake2s_state *S, uint8_t *out, size_t outlen ); - - BLAKE2_API int blake2b_init( blake2b_state *S, size_t outlen ); - BLAKE2_API int blake2b_init_key( blake2b_state *S, size_t outlen, const void *key, size_t keylen ); - BLAKE2_API int blake2b_init_param( blake2b_state *S, const blake2b_param *P ); - BLAKE2_API int blake2b_update( blake2b_state *S, const uint8_t *in, size_t inlen ); - BLAKE2_API int blake2b_final( blake2b_state *S, uint8_t *out, size_t outlen ); - - BLAKE2_API int blake2sp_init( blake2sp_state *S, size_t outlen ); - BLAKE2_API int blake2sp_init_key( blake2sp_state *S, size_t outlen, const void *key, size_t keylen ); - BLAKE2_API int blake2sp_update( blake2sp_state *S, const uint8_t *in, size_t inlen ); - BLAKE2_API int blake2sp_final( blake2sp_state *S, uint8_t *out, size_t outlen ); - - BLAKE2_API int blake2bp_init( blake2bp_state *S, size_t outlen ); - BLAKE2_API int blake2bp_init_key( blake2bp_state *S, size_t outlen, const void *key, size_t keylen ); - BLAKE2_API int blake2bp_update( blake2bp_state *S, const uint8_t *in, size_t inlen ); - BLAKE2_API int blake2bp_final( blake2bp_state *S, uint8_t *out, size_t outlen ); - - // Simple API - BLAKE2_API int blake2s( uint8_t *out, const void *in, const void *key, size_t outlen, size_t inlen, size_t keylen ); - BLAKE2_API int blake2b( uint8_t *out, const void *in, const void *key, size_t outlen, size_t inlen, size_t keylen ); - - BLAKE2_API int blake2sp( uint8_t *out, const void *in, const void *key, size_t outlen, size_t inlen, size_t keylen ); - BLAKE2_API int blake2bp( uint8_t *out, const void *in, const void *key, size_t outlen, size_t inlen, size_t keylen ); - - static inline int blake2( uint8_t *out, const void *in, const void *key, size_t outlen, size_t inlen, size_t keylen ) - { - return blake2b( out, in, key, outlen, inlen, keylen ); - } - -#if defined(__cplusplus) -} -#endif - -#endif - diff --git a/vendor/blake2/blake2.wake b/vendor/blake2/blake2.wake deleted file mode 100644 index 7d3ef84f8..000000000 --- a/vendor/blake2/blake2.wake +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright 2019 SiFive, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You should have received a copy of LICENSE.Apache2 along with -# this software. If not, you may obtain a copy at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -package build_wake - -from wake import _ - -target blake2 variant = - vendor @here variant "1.0" `.*\.h` `.*\.c` diff --git a/vendor/blake2/blake2b-ref.c b/vendor/blake2/blake2b-ref.c deleted file mode 100644 index 46e143e80..000000000 --- a/vendor/blake2/blake2b-ref.c +++ /dev/null @@ -1,315 +0,0 @@ -/* - BLAKE2 reference source code package - reference C implementations - - Written in 2012 by Samuel Neves - - To the extent possible under law, the author(s) have dedicated all copyright - and related and neighboring rights to this software to the public domain - worldwide. This software is distributed without any warranty. - - You should have received a copy of the CC0 Public Domain Dedication along with - this software. If not, see . -*/ - -#include -#include -#include - -#include "blake2.h" -#include "blake2-impl.h" - -static const uint64_t blake2b_IV[8] = -{ - 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, - 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, - 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, - 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL -}; - -static const uint8_t blake2b_sigma[12][16] = -{ - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } -}; - - -static inline int blake2b_set_lastnode( blake2b_state *S ) -{ - S->f[1] = ~0ULL; - return 0; -} - -/* Some helper functions, not necessarily useful */ -static inline int blake2b_set_lastblock( blake2b_state *S ) -{ - if( S->last_node ) blake2b_set_lastnode( S ); - - S->f[0] = ~0ULL; - return 0; -} - -static inline int blake2b_increment_counter( blake2b_state *S, const uint64_t inc ) -{ - S->t[0] += inc; - S->t[1] += ( S->t[0] < inc ); - return 0; -} - -static inline int blake2b_init0( blake2b_state *S ) -{ - memset( S, 0, sizeof( blake2b_state ) ); - - for( int i = 0; i < 8; ++i ) S->h[i] = blake2b_IV[i]; - - return 0; -} - -#define blake2b_init BLAKE2_IMPL_NAME(blake2b_init) -#define blake2b_init_param BLAKE2_IMPL_NAME(blake2b_init_param) -#define blake2b_init_key BLAKE2_IMPL_NAME(blake2b_init_key) -#define blake2b_update BLAKE2_IMPL_NAME(blake2b_update) -#define blake2b_final BLAKE2_IMPL_NAME(blake2b_final) -#define blake2b BLAKE2_IMPL_NAME(blake2b) - -#if defined(__cplusplus) -extern "C" { -#endif - int blake2b_init( blake2b_state *S, size_t outlen ); - int blake2b_init_param( blake2b_state *S, const blake2b_param *P ); - int blake2b_init_key( blake2b_state *S, size_t outlen, const void *key, size_t keylen ); - int blake2b_update( blake2b_state *S, const uint8_t *in, size_t inlen ); - int blake2b_final( blake2b_state *S, uint8_t *out, size_t outlen ); - int blake2b( uint8_t *out, const void *in, const void *key, size_t outlen, size_t inlen, size_t keylen ); -#if defined(__cplusplus) -} -#endif - -/* init xors IV with input parameter block */ -int blake2b_init_param( blake2b_state *S, const blake2b_param *P ) -{ - blake2b_init0( S ); - uint8_t *p = ( uint8_t * )( P ); - - /* IV XOR ParamBlock */ - for( size_t i = 0; i < 8; ++i ) - S->h[i] ^= load64( p + sizeof( S->h[i] ) * i ); - - S->outlen = P->digest_length; - return 0; -} - - - -int blake2b_init( blake2b_state *S, size_t outlen ) -{ - blake2b_param P[1]; - - if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1; - - P->digest_length = ( uint8_t ) outlen; - P->key_length = 0; - P->fanout = 1; - P->depth = 1; - store32( &P->leaf_length, 0 ); - store64( &P->node_offset, 0 ); - P->node_depth = 0; - P->inner_length = 0; - memset( P->reserved, 0, sizeof( P->reserved ) ); - memset( P->salt, 0, sizeof( P->salt ) ); - memset( P->personal, 0, sizeof( P->personal ) ); - return blake2b_init_param( S, P ); -} - - -int blake2b_init_key( blake2b_state *S, size_t outlen, const void *key, size_t keylen ) -{ - blake2b_param P[1]; - - if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1; - - if ( !key || !keylen || keylen > BLAKE2B_KEYBYTES ) return -1; - - P->digest_length = ( uint8_t ) outlen; - P->key_length = ( uint8_t ) keylen; - P->fanout = 1; - P->depth = 1; - store32( &P->leaf_length, 0 ); - store64( &P->node_offset, 0 ); - P->node_depth = 0; - P->inner_length = 0; - memset( P->reserved, 0, sizeof( P->reserved ) ); - memset( P->salt, 0, sizeof( P->salt ) ); - memset( P->personal, 0, sizeof( P->personal ) ); - - if( blake2b_init_param( S, P ) < 0 ) return -1; - - { - uint8_t block[BLAKE2B_BLOCKBYTES]; - memset( block, 0, BLAKE2B_BLOCKBYTES ); - memcpy( block, key, keylen ); - blake2b_update( S, block, BLAKE2B_BLOCKBYTES ); - secure_zero_memory( block, BLAKE2B_BLOCKBYTES ); /* Burn the key from stack */ - } - return 0; -} - -static int blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] ) -{ - uint64_t m[16]; - uint64_t v[16]; - size_t i; - - for( i = 0; i < 16; ++i ) - m[i] = load64( block + i * sizeof( m[i] ) ); - - for( i = 0; i < 8; ++i ) - v[i] = S->h[i]; - - v[ 8] = blake2b_IV[0]; - v[ 9] = blake2b_IV[1]; - v[10] = blake2b_IV[2]; - v[11] = blake2b_IV[3]; - v[12] = S->t[0] ^ blake2b_IV[4]; - v[13] = S->t[1] ^ blake2b_IV[5]; - v[14] = S->f[0] ^ blake2b_IV[6]; - v[15] = S->f[1] ^ blake2b_IV[7]; -#define G(r,i,a,b,c,d) \ - do { \ - a = a + b + m[blake2b_sigma[r][2*i+0]]; \ - d = rotr64(d ^ a, 32); \ - c = c + d; \ - b = rotr64(b ^ c, 24); \ - a = a + b + m[blake2b_sigma[r][2*i+1]]; \ - d = rotr64(d ^ a, 16); \ - c = c + d; \ - b = rotr64(b ^ c, 63); \ - } while(0) -#define ROUND(r) \ - do { \ - G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ - G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ - G(r,2,v[ 2],v[ 6],v[10],v[14]); \ - G(r,3,v[ 3],v[ 7],v[11],v[15]); \ - G(r,4,v[ 0],v[ 5],v[10],v[15]); \ - G(r,5,v[ 1],v[ 6],v[11],v[12]); \ - G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ - G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ - } while(0) - ROUND( 0 ); - ROUND( 1 ); - ROUND( 2 ); - ROUND( 3 ); - ROUND( 4 ); - ROUND( 5 ); - ROUND( 6 ); - ROUND( 7 ); - ROUND( 8 ); - ROUND( 9 ); - ROUND( 10 ); - ROUND( 11 ); - - for( i = 0; i < 8; ++i ) - S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; - -#undef G -#undef ROUND - return 0; -} - - -int blake2b_update( blake2b_state *S, const uint8_t *in, size_t inlen ) -{ - while( inlen > 0 ) - { - uint32_t left = S->buflen; - uint32_t fill = 2 * BLAKE2B_BLOCKBYTES - left; - - if( inlen > fill ) - { - memcpy( S->buf + left, in, fill ); // Fill buffer - S->buflen += fill; - blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES ); - blake2b_compress( S, S->buf ); // Compress - memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, BLAKE2B_BLOCKBYTES ); // Shift buffer left - S->buflen -= BLAKE2B_BLOCKBYTES; - in += fill; - inlen -= fill; - } - else // inlen <= fill - { - memcpy( S->buf + left, in, inlen ); - S->buflen += ( uint32_t ) inlen; // Be lazy, do not compress - in += inlen; - inlen -= inlen; - } - } - - return 0; -} - -int blake2b_final( blake2b_state *S, uint8_t *out, size_t outlen ) -{ - uint8_t buffer[BLAKE2B_OUTBYTES]; - size_t i; - - if(S->outlen != outlen) return -1; - - if( S->buflen > BLAKE2B_BLOCKBYTES ) - { - blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES ); - blake2b_compress( S, S->buf ); - S->buflen -= BLAKE2B_BLOCKBYTES; - memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, S->buflen ); - } - - blake2b_increment_counter( S, S->buflen ); - blake2b_set_lastblock( S ); - memset( S->buf + S->buflen, 0, 2 * BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */ - blake2b_compress( S, S->buf ); - - for( i = 0; i < 8; ++i ) /* Output full hash to temp buffer */ - store64( buffer + sizeof( S->h[i] ) * i, S->h[i] ); - - memcpy( out, buffer, outlen ); - return 0; -} - -int blake2b( uint8_t *out, const void *in, const void *key, size_t outlen, size_t inlen, size_t keylen ) -{ - blake2b_state S[1]; - - /* Verify parameters */ - if ( NULL == in && inlen > 0 ) return -1; - - if ( NULL == out ) return -1; - - if( NULL == key && keylen > 0 ) return -1; - - if( !outlen || outlen > BLAKE2B_OUTBYTES ) return -1; - - if( keylen > BLAKE2B_KEYBYTES ) return -1; - - if( keylen > 0 ) - { - if( blake2b_init_key( S, outlen, key, keylen ) < 0 ) return -1; - } - else - { - if( blake2b_init( S, outlen ) < 0 ) return -1; - } - - if( blake2b_update( S, ( uint8_t * )in, inlen ) < 0 ) return -1; - return blake2b_final( S, out, outlen ); -} - - diff --git a/vendor/blake2/config.h b/vendor/blake2/config.h deleted file mode 100644 index feee1544f..000000000 --- a/vendor/blake2/config.h +++ /dev/null @@ -1 +0,0 @@ -#define SUFFIX diff --git a/vendor/blake2/LICENSE.CC0 b/vendor/blake3/LICENSE similarity index 94% rename from vendor/blake2/LICENSE.CC0 rename to vendor/blake3/LICENSE index 0e259d42c..237911b8c 100644 --- a/vendor/blake2/LICENSE.CC0 +++ b/vendor/blake3/LICENSE @@ -1,3 +1,8 @@ +This work is released into the public domain with CC0 1.0. Alternatively, it is +licensed under the Apache License 2.0. + +------------------------------------------------------------------------------- + Creative Commons Legal Code CC0 1.0 Universal @@ -25,9 +30,9 @@ of later claims of infringement build upon, modify, incorporate in other works, reuse and redistribute as freely as possible in any form whatsoever and for any purposes, including without limitation commercial purposes. These owners may contribute to the Commons to promote the ideal of a free -culture and the further production of creative, cultural and scientific -works, or to gain reputation or greater distribution for their Work in -part through the use and efforts of others. +culture and the further production of creative and cultural works, or to +gain reputation or greater distribution for their Work in part through the +use and efforts of others. For these and/or other purposes and motivations, and without any expectation of additional consideration or compensation, the person @@ -119,3 +124,4 @@ express Statement of Purpose. d. Affirmer understands and acknowledges that Creative Commons is not a party to this document and has no duty or obligation with respect to this CC0 or use of the Work. + diff --git a/vendor/blake3/blake3.c b/vendor/blake3/blake3.c new file mode 100644 index 000000000..74fc2e73b --- /dev/null +++ b/vendor/blake3/blake3.c @@ -0,0 +1,650 @@ +#include +#include +#include + +#include "blake3.h" +#include "blake3_impl.h" + +const char *blake3_version(void) { return BLAKE3_VERSION_STRING; } + +INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8], + uint8_t flags) { + memcpy(self->cv, key, BLAKE3_KEY_LEN); + self->chunk_counter = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + self->buf_len = 0; + self->blocks_compressed = 0; + self->flags = flags; +} + +INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8], + uint64_t chunk_counter) { + memcpy(self->cv, key, BLAKE3_KEY_LEN); + self->chunk_counter = chunk_counter; + self->blocks_compressed = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + self->buf_len = 0; +} + +INLINE size_t chunk_state_len(const blake3_chunk_state *self) { + return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) + + ((size_t)self->buf_len); +} + +INLINE size_t chunk_state_fill_buf(blake3_chunk_state *self, + const uint8_t *input, size_t input_len) { + size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len); + if (take > input_len) { + take = input_len; + } + uint8_t *dest = self->buf + ((size_t)self->buf_len); + memcpy(dest, input, take); + self->buf_len += (uint8_t)take; + return take; +} + +INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state *self) { + if (self->blocks_compressed == 0) { + return CHUNK_START; + } else { + return 0; + } +} + +typedef struct { + uint32_t input_cv[8]; + uint64_t counter; + uint8_t block[BLAKE3_BLOCK_LEN]; + uint8_t block_len; + uint8_t flags; +} output_t; + +INLINE output_t make_output(const uint32_t input_cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + output_t ret; + memcpy(ret.input_cv, input_cv, 32); + memcpy(ret.block, block, BLAKE3_BLOCK_LEN); + ret.block_len = block_len; + ret.counter = counter; + ret.flags = flags; + return ret; +} + +// Chaining values within a given chunk (specifically the compress_in_place +// interface) are represented as words. This avoids unnecessary bytes<->words +// conversion overhead in the portable implementation. However, the hash_many +// interface handles both user input and parent node blocks, so it accepts +// bytes. For that reason, chaining values in the CV stack are represented as +// bytes. +INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) { + uint32_t cv_words[8]; + memcpy(cv_words, self->input_cv, 32); + blake3_compress_in_place(cv_words, self->block, self->block_len, + self->counter, self->flags); + store_cv_words(cv, cv_words); +} + +INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out, + size_t out_len) { + if (out_len == 0) { + return; + } + uint64_t output_block_counter = seek / 64; + size_t offset_within_block = seek % 64; + uint8_t wide_buf[64]; + if(offset_within_block) { + blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf); + const size_t available_bytes = 64 - offset_within_block; + const size_t bytes = out_len > available_bytes ? available_bytes : out_len; + memcpy(out, wide_buf + offset_within_block, bytes); + out += bytes; + out_len -= bytes; + output_block_counter += 1; + } + if(out_len / 64) { + blake3_xof_many(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, out, out_len / 64); + } + output_block_counter += out_len / 64; + out += out_len & -64; + out_len -= out_len & -64; + if(out_len) { + blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf); + memcpy(out, wide_buf, out_len); + } +} + +INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input, + size_t input_len) { + if (self->buf_len > 0) { + size_t take = chunk_state_fill_buf(self, input, input_len); + input += take; + input_len -= take; + if (input_len > 0) { + blake3_compress_in_place( + self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter, + self->flags | chunk_state_maybe_start_flag(self)); + self->blocks_compressed += 1; + self->buf_len = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + } + } + + while (input_len > BLAKE3_BLOCK_LEN) { + blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN, + self->chunk_counter, + self->flags | chunk_state_maybe_start_flag(self)); + self->blocks_compressed += 1; + input += BLAKE3_BLOCK_LEN; + input_len -= BLAKE3_BLOCK_LEN; + } + + chunk_state_fill_buf(self, input, input_len); +} + +INLINE output_t chunk_state_output(const blake3_chunk_state *self) { + uint8_t block_flags = + self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END; + return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter, + block_flags); +} + +INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], + const uint32_t key[8], uint8_t flags) { + return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT); +} + +// Given some input larger than one chunk, return the number of bytes that +// should go in the left subtree. This is the largest power-of-2 number of +// chunks that leaves at least 1 byte for the right subtree. +INLINE size_t left_subtree_len(size_t input_len) { + // Subtract 1 to reserve at least one byte for the right side. input_len + // should always be greater than BLAKE3_CHUNK_LEN. + size_t full_chunks = (input_len - 1) / BLAKE3_CHUNK_LEN; + return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN; +} + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time +// on a single thread. Write out the chunk chaining values and return the +// number of chunks hashed. These chunks are never the root and never empty; +// those cases use a different codepath. +INLINE size_t compress_chunks_parallel(const uint8_t *input, size_t input_len, + const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, + uint8_t *out) { +#if defined(BLAKE3_TESTING) + assert(0 < input_len); + assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN); +#endif + + const uint8_t *chunks_array[MAX_SIMD_DEGREE]; + size_t input_position = 0; + size_t chunks_array_len = 0; + while (input_len - input_position >= BLAKE3_CHUNK_LEN) { + chunks_array[chunks_array_len] = &input[input_position]; + input_position += BLAKE3_CHUNK_LEN; + chunks_array_len += 1; + } + + blake3_hash_many(chunks_array, chunks_array_len, + BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, + true, flags, CHUNK_START, CHUNK_END, out); + + // Hash the remaining partial chunk, if there is one. Note that the empty + // chunk (meaning the empty message) is a different codepath. + if (input_len > input_position) { + uint64_t counter = chunk_counter + (uint64_t)chunks_array_len; + blake3_chunk_state chunk_state; + chunk_state_init(&chunk_state, key, flags); + chunk_state.chunk_counter = counter; + chunk_state_update(&chunk_state, &input[input_position], + input_len - input_position); + output_t output = chunk_state_output(&chunk_state); + output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]); + return chunks_array_len + 1; + } else { + return chunks_array_len; + } +} + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time +// on a single thread. Write out the parent chaining values and return the +// number of parents hashed. (If there's an odd input chaining value left over, +// return it as an additional output.) These parents are never the root and +// never empty; those cases use a different codepath. +INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values, + size_t num_chaining_values, + const uint32_t key[8], uint8_t flags, + uint8_t *out) { +#if defined(BLAKE3_TESTING) + assert(2 <= num_chaining_values); + assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2); +#endif + + const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2]; + size_t parents_array_len = 0; + while (num_chaining_values - (2 * parents_array_len) >= 2) { + parents_array[parents_array_len] = + &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN]; + parents_array_len += 1; + } + + blake3_hash_many(parents_array, parents_array_len, 1, key, + 0, // Parents always use counter 0. + false, flags | PARENT, + 0, // Parents have no start flags. + 0, // Parents have no end flags. + out); + + // If there's an odd child left over, it becomes an output. + if (num_chaining_values > 2 * parents_array_len) { + memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], + &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], + BLAKE3_OUT_LEN); + return parents_array_len + 1; + } else { + return parents_array_len; + } +} + +// The wide helper function returns (writes out) an array of chaining values +// and returns the length of that array. The number of chaining values returned +// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, +// if the input is shorter than that many chunks. The reason for maintaining a +// wide array of chaining values going back up the tree, is to allow the +// implementation to hash as many parents in parallel as possible. +// +// As a special case when the SIMD degree is 1, this function will still return +// at least 2 outputs. This guarantees that this function doesn't perform the +// root compression. (If it did, it would use the wrong flags, and also we +// wouldn't be able to implement extendable output.) Note that this function is +// not used when the whole input is only 1 chunk long; that's a different +// codepath. +// +// Why not just have the caller split the input on the first update(), instead +// of implementing this special rule? Because we don't want to limit SIMD or +// multi-threading parallelism for that update(). +size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len, + const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, + uint8_t *out, bool use_tbb) { + // Note that the single chunk case does *not* bump the SIMD degree up to 2 + // when it is 1. If this implementation adds multi-threading in the future, + // this gives us the option of multi-threading even the 2-chunk case, which + // can help performance on smaller platforms. + if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN) { + return compress_chunks_parallel(input, input_len, key, chunk_counter, flags, + out); + } + + // With more than simd_degree chunks, we need to recurse. Start by dividing + // the input into left and right subtrees. (Note that this is only optimal + // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree + // of 3 or something, we'll need a more complicated strategy.) + size_t left_input_len = left_subtree_len(input_len); + size_t right_input_len = input_len - left_input_len; + const uint8_t *right_input = &input[left_input_len]; + uint64_t right_chunk_counter = + chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN); + + // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to + // account for the special case of returning 2 outputs when the SIMD degree + // is 1. + uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t degree = blake3_simd_degree(); + if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) { + // The special case: We always use a degree of at least two, to make + // sure there are two outputs. Except, as noted above, at the chunk + // level, where we allow degree=1. (Note that the 1-chunk-input case is + // a different codepath.) + degree = 2; + } + uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; + + // Recurse! + size_t left_n = -1; + size_t right_n = -1; + +#if defined(BLAKE3_USE_TBB) + blake3_compress_subtree_wide_join_tbb( + key, flags, use_tbb, + // left-hand side + input, left_input_len, chunk_counter, cv_array, &left_n, + // right-hand side + right_input, right_input_len, right_chunk_counter, right_cvs, &right_n); +#else + left_n = blake3_compress_subtree_wide( + input, left_input_len, key, chunk_counter, flags, cv_array, use_tbb); + right_n = blake3_compress_subtree_wide(right_input, right_input_len, key, + right_chunk_counter, flags, right_cvs, + use_tbb); +#endif // BLAKE3_USE_TBB + + // The special case again. If simd_degree=1, then we'll have left_n=1 and + // right_n=1. Rather than compressing them into a single output, return + // them directly, to make sure we always have at least two outputs. + if (left_n == 1) { + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); + return 2; + } + + // Otherwise, do one layer of parent node compression. + size_t num_chaining_values = left_n + right_n; + return compress_parents_parallel(cv_array, num_chaining_values, key, flags, + out); +} + +// Hash a subtree with compress_subtree_wide(), and then condense the resulting +// list of chaining values down to a single parent node. Don't compress that +// last parent node, however. Instead, return its message bytes (the +// concatenated chaining values of its children). This is necessary when the +// first call to update() supplies a complete subtree, because the topmost +// parent node of that subtree could end up being the root. It's also necessary +// for extended output in the general case. +// +// As with compress_subtree_wide(), this function is not used on inputs of 1 +// chunk or less. That's a different codepath. +INLINE void +compress_subtree_to_parent_node(const uint8_t *input, size_t input_len, + const uint32_t key[8], uint64_t chunk_counter, + uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN], + bool use_tbb) { +#if defined(BLAKE3_TESTING) + assert(input_len > BLAKE3_CHUNK_LEN); +#endif + + uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, + chunk_counter, flags, cv_array, use_tbb); + assert(num_cvs <= MAX_SIMD_DEGREE_OR_2); + // The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because + // as we just asserted, num_cvs will always be <=2 in that case. But GCC + // (particularly GCC 8.5) can't tell that it never executes, and if NDEBUG is + // set then it emits incorrect warnings here. We tried a few different + // hacks to silence these, but in the end our hacks just produced different + // warnings (see https://github.com/BLAKE3-team/BLAKE3/pull/380). Out of + // desperation, we ifdef out this entire loop when we know it's not needed. +#if MAX_SIMD_DEGREE_OR_2 > 2 + // If MAX_SIMD_DEGREE_OR_2 is greater than 2 and there's enough input, + // compress_subtree_wide() returns more than 2 chaining values. Condense + // them into 2 by forming parent nodes repeatedly. + uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; + while (num_cvs > 2) { + num_cvs = + compress_parents_parallel(cv_array, num_cvs, key, flags, out_array); + memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); + } +#endif + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); +} + +INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8], + uint8_t flags) { + memcpy(self->key, key, BLAKE3_KEY_LEN); + chunk_state_init(&self->chunk, key, flags); + self->cv_stack_len = 0; +} + +void blake3_hasher_init(blake3_hasher *self) { hasher_init_base(self, IV, 0); } + +void blake3_hasher_init_keyed(blake3_hasher *self, + const uint8_t key[BLAKE3_KEY_LEN]) { + uint32_t key_words[8]; + load_key_words(key, key_words); + hasher_init_base(self, key_words, KEYED_HASH); +} + +void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, + size_t context_len) { + blake3_hasher context_hasher; + hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT); + blake3_hasher_update(&context_hasher, context, context_len); + uint8_t context_key[BLAKE3_KEY_LEN]; + blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN); + uint32_t context_key_words[8]; + load_key_words(context_key, context_key_words); + hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL); +} + +void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) { + blake3_hasher_init_derive_key_raw(self, context, strlen(context)); +} + +// As described in hasher_push_cv() below, we do "lazy merging", delaying +// merges until right before the next CV is about to be added. This is +// different from the reference implementation. Another difference is that we +// aren't always merging 1 chunk at a time. Instead, each CV might represent +// any power-of-two number of chunks, as long as the smaller-above-larger stack +// order is maintained. Instead of the "count the trailing 0-bits" algorithm +// described in the spec, we use a "count the total number of 1-bits" variant +// that doesn't require us to retain the subtree size of the CV on top of the +// stack. The principle is the same: each CV that should remain in the stack is +// represented by a 1-bit in the total number of chunks (or bytes) so far. +INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) { + size_t post_merge_stack_len = (size_t)popcnt(total_len); + while (self->cv_stack_len > post_merge_stack_len) { + uint8_t *parent_node = + &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN]; + output_t output = parent_output(parent_node, self->key, self->chunk.flags); + output_chaining_value(&output, parent_node); + self->cv_stack_len -= 1; + } +} + +// In reference_impl.rs, we merge the new CV with existing CVs from the stack +// before pushing it. We can do that because we know more input is coming, so +// we know none of the merges are root. +// +// This setting is different. We want to feed as much input as possible to +// compress_subtree_wide(), without setting aside anything for the chunk_state. +// If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once +// as a single subtree, if at all possible. +// +// This leads to two problems: +// 1) This 64 KiB input might be the only call that ever gets made to update. +// In this case, the root node of the 64 KiB subtree would be the root node +// of the whole tree, and it would need to be ROOT finalized. We can't +// compress it until we know. +// 2) This 64 KiB input might complete a larger tree, whose root node is +// similarly going to be the root of the whole tree. For example, maybe +// we have 196 KiB (that is, 128 + 64) hashed so far. We can't compress the +// node at the root of the 256 KiB subtree until we know how to finalize it. +// +// The second problem is solved with "lazy merging". That is, when we're about +// to add a CV to the stack, we don't merge it with anything first, as the +// reference impl does. Instead we do merges using the *previous* CV that was +// added, which is sitting on top of the stack, and we put the new CV +// (unmerged) on top of the stack afterwards. This guarantees that we never +// merge the root node until finalize(). +// +// Solving the first problem requires an additional tool, +// compress_subtree_to_parent_node(). That function always returns the top +// *two* chaining values of the subtree it's compressing. We then do lazy +// merging with each of them separately, so that the second CV will always +// remain unmerged. (That also helps us support extendable output when we're +// hashing an input all-at-once.) +INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN], + uint64_t chunk_counter) { + hasher_merge_cv_stack(self, chunk_counter); + memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv, + BLAKE3_OUT_LEN); + self->cv_stack_len += 1; +} + +INLINE void blake3_hasher_update_base(blake3_hasher *self, const void *input, + size_t input_len, bool use_tbb) { + // Explicitly checking for zero avoids causing UB by passing a null pointer + // to memcpy. This comes up in practice with things like: + // std::vector v; + // blake3_hasher_update(&hasher, v.data(), v.size()); + if (input_len == 0) { + return; + } + + const uint8_t *input_bytes = (const uint8_t *)input; + + // If we have some partial chunk bytes in the internal chunk_state, we need + // to finish that chunk first. + if (chunk_state_len(&self->chunk) > 0) { + size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk); + if (take > input_len) { + take = input_len; + } + chunk_state_update(&self->chunk, input_bytes, take); + input_bytes += take; + input_len -= take; + // If we've filled the current chunk and there's more coming, finalize this + // chunk and proceed. In this case we know it's not the root. + if (input_len > 0) { + output_t output = chunk_state_output(&self->chunk); + uint8_t chunk_cv[32]; + output_chaining_value(&output, chunk_cv); + hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter); + chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1); + } else { + return; + } + } + + // Now the chunk_state is clear, and we have more input. If there's more than + // a single chunk (so, definitely not the root chunk), hash the largest whole + // subtree we can, with the full benefits of SIMD (and maybe in the future, + // multi-threading) parallelism. Two restrictions: + // - The subtree has to be a power-of-2 number of chunks. Only subtrees along + // the right edge can be incomplete, and we don't know where the right edge + // is going to be until we get to finalize(). + // - The subtree must evenly divide the total number of chunks up until this + // point (if total is not 0). If the current incomplete subtree is only + // waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have + // to complete the current subtree first. + // Because we might need to break up the input to form powers of 2, or to + // evenly divide what we already have, this part runs in a loop. + while (input_len > BLAKE3_CHUNK_LEN) { + size_t subtree_len = round_down_to_power_of_2(input_len); + uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN; + // Shrink the subtree_len until it evenly divides the count so far. We know + // that subtree_len itself is a power of 2, so we can use a bitmasking + // trick instead of an actual remainder operation. (Note that if the caller + // consistently passes power-of-2 inputs of the same size, as is hopefully + // typical, this loop condition will always fail, and subtree_len will + // always be the full length of the input.) + // + // An aside: We don't have to shrink subtree_len quite this much. For + // example, if count_so_far is 1, we could pass 2 chunks to + // compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still + // get the right answer in the end, and we might get to use 2-way SIMD + // parallelism. The problem with this optimization, is that it gets us + // stuck always hashing 2 chunks. The total number of chunks will remain + // odd, and we'll never graduate to higher degrees of parallelism. See + // https://github.com/BLAKE3-team/BLAKE3/issues/69. + while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) { + subtree_len /= 2; + } + // The shrunken subtree_len might now be 1 chunk long. If so, hash that one + // chunk by itself. Otherwise, compress the subtree into a pair of CVs. + uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN; + if (subtree_len <= BLAKE3_CHUNK_LEN) { + blake3_chunk_state chunk_state; + chunk_state_init(&chunk_state, self->key, self->chunk.flags); + chunk_state.chunk_counter = self->chunk.chunk_counter; + chunk_state_update(&chunk_state, input_bytes, subtree_len); + output_t output = chunk_state_output(&chunk_state); + uint8_t cv[BLAKE3_OUT_LEN]; + output_chaining_value(&output, cv); + hasher_push_cv(self, cv, chunk_state.chunk_counter); + } else { + // This is the high-performance happy path, though getting here depends + // on the caller giving us a long enough input. + uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; + compress_subtree_to_parent_node(input_bytes, subtree_len, self->key, + self->chunk.chunk_counter, + self->chunk.flags, cv_pair, use_tbb); + hasher_push_cv(self, cv_pair, self->chunk.chunk_counter); + hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN], + self->chunk.chunk_counter + (subtree_chunks / 2)); + } + self->chunk.chunk_counter += subtree_chunks; + input_bytes += subtree_len; + input_len -= subtree_len; + } + + // If there's any remaining input less than a full chunk, add it to the chunk + // state. In that case, also do a final merge loop to make sure the subtree + // stack doesn't contain any unmerged pairs. The remaining input means we + // know these merges are non-root. This merge loop isn't strictly necessary + // here, because hasher_push_chunk_cv already does its own merge loop, but it + // simplifies blake3_hasher_finalize below. + if (input_len > 0) { + chunk_state_update(&self->chunk, input_bytes, input_len); + hasher_merge_cv_stack(self, self->chunk.chunk_counter); + } +} + +void blake3_hasher_update(blake3_hasher *self, const void *input, + size_t input_len) { + bool use_tbb = false; + blake3_hasher_update_base(self, input, input_len, use_tbb); +} + +#if defined(BLAKE3_USE_TBB) +void blake3_hasher_update_tbb(blake3_hasher *self, const void *input, + size_t input_len) { + bool use_tbb = true; + blake3_hasher_update_base(self, input, input_len, use_tbb); +} +#endif // BLAKE3_USE_TBB + +void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, + size_t out_len) { + blake3_hasher_finalize_seek(self, 0, out, out_len); +} + +void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, + uint8_t *out, size_t out_len) { + // Explicitly checking for zero avoids causing UB by passing a null pointer + // to memcpy. This comes up in practice with things like: + // std::vector v; + // blake3_hasher_finalize(&hasher, v.data(), v.size()); + if (out_len == 0) { + return; + } + + // If the subtree stack is empty, then the current chunk is the root. + if (self->cv_stack_len == 0) { + output_t output = chunk_state_output(&self->chunk); + output_root_bytes(&output, seek, out, out_len); + return; + } + // If there are any bytes in the chunk state, finalize that chunk and do a + // roll-up merge between that chunk hash and every subtree in the stack. In + // this case, the extra merge loop at the end of blake3_hasher_update + // guarantees that none of the subtrees in the stack need to be merged with + // each other first. Otherwise, if there are no bytes in the chunk state, + // then the top of the stack is a chunk hash, and we start the merge from + // that. + output_t output; + size_t cvs_remaining; + if (chunk_state_len(&self->chunk) > 0) { + cvs_remaining = self->cv_stack_len; + output = chunk_state_output(&self->chunk); + } else { + // There are always at least 2 CVs in the stack in this case. + cvs_remaining = self->cv_stack_len - 2; + output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key, + self->chunk.flags); + } + while (cvs_remaining > 0) { + cvs_remaining -= 1; + uint8_t parent_block[BLAKE3_BLOCK_LEN]; + memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32); + output_chaining_value(&output, &parent_block[32]); + output = parent_output(parent_block, self->key, self->chunk.flags); + } + output_root_bytes(&output, seek, out, out_len); +} + +void blake3_hasher_reset(blake3_hasher *self) { + chunk_state_reset(&self->chunk, self->key, 0); + self->cv_stack_len = 0; +} diff --git a/vendor/blake3/blake3.h b/vendor/blake3/blake3.h new file mode 100644 index 000000000..2557487e0 --- /dev/null +++ b/vendor/blake3/blake3.h @@ -0,0 +1,83 @@ +#ifndef BLAKE3_H +#define BLAKE3_H + +#include +#include + +#if !defined(BLAKE3_API) +# if defined(_WIN32) || defined(__CYGWIN__) +# if defined(BLAKE3_DLL) +# if defined(BLAKE3_DLL_EXPORTS) +# define BLAKE3_API __declspec(dllexport) +# else +# define BLAKE3_API __declspec(dllimport) +# endif +# define BLAKE3_PRIVATE +# else +# define BLAKE3_API +# define BLAKE3_PRIVATE +# endif +# elif __GNUC__ >= 4 +# define BLAKE3_API __attribute__((visibility("default"))) +# define BLAKE3_PRIVATE __attribute__((visibility("hidden"))) +# else +# define BLAKE3_API +# define BLAKE3_PRIVATE +# endif +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define BLAKE3_VERSION_STRING "1.8.2" +#define BLAKE3_KEY_LEN 32 +#define BLAKE3_OUT_LEN 32 +#define BLAKE3_BLOCK_LEN 64 +#define BLAKE3_CHUNK_LEN 1024 +#define BLAKE3_MAX_DEPTH 54 + +// This struct is a private implementation detail. It has to be here because +// it's part of the blake3_hasher structure defined below. +typedef struct { + uint32_t cv[8]; + uint64_t chunk_counter; + uint8_t buf[BLAKE3_BLOCK_LEN]; + uint8_t buf_len; + uint8_t blocks_compressed; + uint8_t flags; +} blake3_chunk_state; + +typedef struct { + uint32_t key[8]; + blake3_chunk_state chunk; + uint8_t cv_stack_len; + // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, + // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk + // requires a 4th entry, rather than merging everything down to 1, because we + // don't know whether more input is coming. This is different from how the + // reference implementation does things. + uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; +} blake3_hasher; + +BLAKE3_API const char *blake3_version(void); +BLAKE3_API void blake3_hasher_init(blake3_hasher *self); +BLAKE3_API void blake3_hasher_init_keyed(blake3_hasher *self, + const uint8_t key[BLAKE3_KEY_LEN]); +BLAKE3_API void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context); +BLAKE3_API void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, + size_t context_len); +BLAKE3_API void blake3_hasher_update(blake3_hasher *self, const void *input, + size_t input_len); +BLAKE3_API void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, + size_t out_len); +BLAKE3_API void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, + uint8_t *out, size_t out_len); +BLAKE3_API void blake3_hasher_reset(blake3_hasher *self); + +#ifdef __cplusplus +} +#endif + +#endif /* BLAKE3_H */ + diff --git a/vendor/blake3/blake3.wake b/vendor/blake3/blake3.wake new file mode 100644 index 000000000..40424d41a --- /dev/null +++ b/vendor/blake3/blake3.wake @@ -0,0 +1,81 @@ +# Copyright 2019 SiFive, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You should have received a copy of LICENSE.Apache2 along with +# this software. If not, you may obtain a copy at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +package build_wake + +from wake import _ +from gcc_wake import _ + +# Compile an assembly file to an object file +def compileAsm (variant: String) (asmFile: Path): Result (List Path) Error = + def obj = replace `\.S$` ".{variant}.o" asmFile.getPathName + def cmdline = "cc", "-c", asmFile.getPathName, "-o", obj, Nil + + makeExecPlan cmdline (asmFile, Nil) + | setPlanLabel "asm {asmFile.getPathName}" + | runJobWith defaultRunner + | getJobOutputs + +# Get the list of SIMD assembly files based on architecture +def blake3AsmFiles: List String = + if machine ==~ "x86_64" then + "blake3_sse2_x86-64_unix.S", + "blake3_sse41_x86-64_unix.S", + "blake3_avx2_x86-64_unix.S", + "blake3_avx512_x86-64_unix.S", + Nil + else + Nil + +# Get the regex for C files to compile based on architecture +# On x86_64: compile all C files except blake3_neon.c (ARM-only) +# On ARM: compile all C files (including blake3_neon.c) +def blake3CFilesRegex: RegExp = + if machine ==~ "x86_64" then + `blake3(_dispatch|_portable)?\.c` + else if machine ==~ "aarch64" then + `blake3(_dispatch|_portable|_neon)?\.c` + else + # Fallback: just portable implementation + `blake3(_dispatch|_portable)?\.c` + +# Get extra C flags based on architecture +# On aarch64: need -DBLAKE3_USE_NEON=1 for the dispatch to use NEON +def blake3ExtraCFlags: List String = + if machine ==~ "aarch64" then + "-DBLAKE3_USE_NEON=1", Nil + else + Nil + +target blake3 variant = + def Pair _ cVariant = variant + + # Compile C files with architecture-specific flags (e.g., -DBLAKE3_USE_NEON=1 on aarch64) + require Pass cLib = vendor @here variant "1.0" `.*\.h` blake3CFilesRegex blake3ExtraCFlags + + # Compile architecture-specific assembly files + require Pass asmPaths = + blake3AsmFiles + | map ("{@here}/{_}") + | map source + | findFail + + require Pass asmObjs = + map (compileAsm cVariant) asmPaths + | findFail + + # Combine C objects with assembly objects + Pass (cLib | setSysLibObjects (asmObjs.flatten ++ cLib.getSysLibObjects)) + diff --git a/vendor/blake3/blake3_avx2_x86-64_unix.S b/vendor/blake3/blake3_avx2_x86-64_unix.S new file mode 100644 index 000000000..812bb8568 --- /dev/null +++ b/vendor/blake3/blake3_avx2_x86-64_unix.S @@ -0,0 +1,1815 @@ +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +#endif + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include() +#include +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global _blake3_hash_many_avx2 +.global blake3_hash_many_avx2 +#ifdef __APPLE__ +.text +#else +.section .text +#endif + .p2align 6 +_blake3_hash_many_avx2: +blake3_hash_many_avx2: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 680 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9d + vmovd xmm0, r9d + vpbroadcastd ymm0, xmm0 + vmovdqa ymmword ptr [rsp+0x280], ymm0 + vpand ymm1, ymm0, ymmword ptr [ADD0+rip] + vpand ymm2, ymm0, ymmword ptr [ADD1+rip] + vmovdqa ymmword ptr [rsp+0x220], ymm2 + vmovd xmm2, r8d + vpbroadcastd ymm2, xmm2 + vpaddd ymm2, ymm2, ymm1 + vmovdqa ymmword ptr [rsp+0x240], ymm2 + vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm1, ymm2 + shr r8, 32 + vmovd xmm3, r8d + vpbroadcastd ymm3, xmm3 + vpsubd ymm3, ymm3, ymm2 + vmovdqa ymmword ptr [rsp+0x260], ymm3 + shl rdx, 6 + mov qword ptr [rsp+0x2A0], rdx + cmp rsi, 8 + jc 3f +2: + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x2A0] + cmove eax, ebx + mov dword ptr [rsp+0x200], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x20], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x40], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x60], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x80], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0xA0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0xC0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0xE0], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x100], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x120], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x140], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x160], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x180], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x1A0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x1C0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x1E0], ymm11 + vpbroadcastd ymm15, dword ptr [rsp+0x200] + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm0, ymmword ptr [rsp+0x240] + vpxor ymm13, ymm1, ymmword ptr [rsp+0x260] + vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip] + vpxor ymm15, ymm3, ymm15 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip] + vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip] + vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip] + vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip] + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x38] + jne 9b + mov rbx, qword ptr [rbp+0x50] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups ymmword ptr [rbx+0x60], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa ymm0, ymmword ptr [rsp+0x220] + vpaddd ymm1, ymm0, ymmword ptr [rsp+0x240] + vmovdqa ymmword ptr [rsp+0x240], ymm1 + vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm0, ymm2 + vmovdqa ymm0, ymmword ptr [rsp+0x260] + vpsubd ymm2, ymm0, ymm2 + vmovdqa ymmword ptr [rsp+0x260], ymm2 + add rdi, 64 + add rbx, 256 + mov qword ptr [rbp+0x50], rbx + sub rsi, 8 + cmp rsi, 8 + jnc 2b + test rsi, rsi + jnz 3f +4: + vzeroupper + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + mov rbx, qword ptr [rbp+0x50] + mov r15, qword ptr [rsp+0x2A0] + movzx r13d, byte ptr [rbp+0x38] + movzx r12d, byte ptr [rbp+0x48] + test rsi, 0x4 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovdqa ymm8, ymm0 + vmovdqa ymm9, ymm1 + vbroadcasti128 ymm12, xmmword ptr [rsp+0x240] + vbroadcasti128 ymm13, xmmword ptr [rsp+0x260] + vpunpckldq ymm14, ymm12, ymm13 + vpunpckhdq ymm15, ymm12, ymm13 + vpermq ymm14, ymm14, 0x50 + vpermq ymm15, ymm15, 0x50 + vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + vpblendd ymm14, ymm14, ymm12, 0x44 + vpblendd ymm15, ymm15, ymm12, 0x44 + vmovdqa ymmword ptr [rsp], ymm14 + vmovdqa ymmword ptr [rsp+0x20], ymm15 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vmovups ymm2, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm2, ymm3, 136 + vshufps ymm5, ymm2, ymm3, 221 + vmovups ymm2, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm2, ymm3, 136 + vshufps ymm7, ymm2, ymm3, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + vmovups ymm10, ymmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01 + vshufps ymm12, ymm10, ymm11, 136 + vshufps ymm13, ymm10, ymm11, 221 + vmovups ymm10, ymmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01 + vshufps ymm14, ymm10, ymm11, 136 + vshufps ymm15, ymm10, ymm11, 221 + vpshufd ymm14, ymm14, 0x93 + vpshufd ymm15, ymm15, 0x93 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + vpbroadcastd ymm2, dword ptr [rsp+0x200] + vmovdqa ymm3, ymmword ptr [rsp] + vmovdqa ymm11, ymmword ptr [rsp+0x20] + vpblendd ymm3, ymm3, ymm2, 0x88 + vpblendd ymm11, ymm11, ymm2, 0x88 + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa ymm10, ymm2 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm8, ymm8, ymm12 + vmovdqa ymmword ptr [rsp+0x40], ymm4 + nop + vmovdqa ymmword ptr [rsp+0x60], ymm12 + nop + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vmovdqa ymmword ptr [rsp+0x80], ymm5 + vmovdqa ymmword ptr [rsp+0xA0], ymm13 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm8, ymm8, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpshufd ymm10, ymm10, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm8, ymm8, ymm15 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm8, ymm8, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x93 + vpshufd ymm10, ymm10, 0x93 + dec al + je 9f + vmovdqa ymm4, ymmword ptr [rsp+0x40] + vmovdqa ymm5, ymmword ptr [rsp+0x80] + vshufps ymm12, ymm4, ymm5, 214 + vpshufd ymm13, ymm4, 0x0F + vpshufd ymm4, ymm12, 0x39 + vshufps ymm12, ymm6, ymm7, 250 + vpblendd ymm13, ymm13, ymm12, 0xAA + vpunpcklqdq ymm12, ymm7, ymm5 + vpblendd ymm12, ymm12, ymm6, 0x88 + vpshufd ymm12, ymm12, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymmword ptr [rsp+0x40], ymm13 + vmovdqa ymmword ptr [rsp+0x80], ymm12 + vmovdqa ymm12, ymmword ptr [rsp+0x60] + vmovdqa ymm13, ymmword ptr [rsp+0xA0] + vshufps ymm5, ymm12, ymm13, 214 + vpshufd ymm6, ymm12, 0x0F + vpshufd ymm12, ymm5, 0x39 + vshufps ymm5, ymm14, ymm15, 250 + vpblendd ymm6, ymm6, ymm5, 0xAA + vpunpcklqdq ymm5, ymm15, ymm13 + vpblendd ymm5, ymm5, ymm14, 0x88 + vpshufd ymm5, ymm5, 0x78 + vpunpckhdq ymm13, ymm13, ymm15 + vpunpckldq ymm14, ymm14, ymm13 + vpshufd ymm15, ymm14, 0x1E + vmovdqa ymm13, ymm6 + vmovdqa ymm14, ymm5 + vmovdqa ymm5, ymmword ptr [rsp+0x40] + vmovdqa ymm6, ymmword ptr [rsp+0x80] + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + vpxor ymm8, ymm8, ymm10 + vpxor ymm9, ymm9, ymm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqu xmmword ptr [rbx+0x40], xmm8 + vmovdqu xmmword ptr [rbx+0x50], xmm9 + vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01 + vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01 + vmovaps xmm8, xmmword ptr [rsp+0x280] + vmovaps xmm0, xmmword ptr [rsp+0x240] + vmovaps xmm1, xmmword ptr [rsp+0x250] + vmovaps xmm2, xmmword ptr [rsp+0x260] + vmovaps xmm3, xmmword ptr [rsp+0x270] + vblendvps xmm0, xmm0, xmm1, xmm8 + vblendvps xmm2, xmm2, xmm3, xmm8 + vmovaps xmmword ptr [rsp+0x240], xmm0 + vmovaps xmmword ptr [rsp+0x260], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test rsi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp+0x240] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x260], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x244] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x264], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + vbroadcasti128 ymm14, xmmword ptr [ROT16+rip] + vbroadcasti128 ymm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x200] + vpblendd ymm3, ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovaps ymm8, ymmword ptr [rsp+0x280] + vmovaps ymm0, ymmword ptr [rsp+0x240] + vmovups ymm1, ymmword ptr [rsp+0x248] + vmovaps ymm2, ymmword ptr [rsp+0x260] + vmovups ymm3, ymmword ptr [rsp+0x268] + vblendvps ymm0, ymm0, ymm1, ymm8 + vblendvps ymm2, ymm2, ymm3, ymm8 + vmovaps ymmword ptr [rsp+0x240], ymm0 + vmovaps ymmword ptr [rsp+0x260], ymm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test rsi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + vmovd xmm3, dword ptr [rsp+0x240] + vpinsrd xmm3, xmm3, dword ptr [rsp+0x260], 1 + vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm14, xmmword ptr [ROT16+rip] + vmovdqa xmm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa xmm3, xmm13 + vpinsrd xmm3, xmm3, eax, 3 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b + + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 +ADD0: + .long 0, 1, 2, 3, 4, 5, 6, 7 +ADD1: + .long 8, 8, 8, 8, 8, 8, 8, 8 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 +ROT16: + .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +ROT8: + .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A + diff --git a/vendor/blake3/blake3_avx512_x86-64_unix.S b/vendor/blake3/blake3_avx512_x86-64_unix.S new file mode 100644 index 000000000..9642e413a --- /dev/null +++ b/vendor/blake3/blake3_avx512_x86-64_unix.S @@ -0,0 +1,4824 @@ +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +#endif + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include() +#include +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global _blake3_hash_many_avx512 +.global blake3_hash_many_avx512 +.global blake3_compress_in_place_avx512 +.global _blake3_compress_in_place_avx512 +.global blake3_compress_xof_avx512 +.global _blake3_compress_xof_avx512 +.global blake3_xof_many_avx512 +.global _blake3_xof_many_avx512 + +#ifdef __APPLE__ +.text +#else +.section .text +#endif +.p2align 6 +_blake3_hash_many_avx512: +blake3_hash_many_avx512: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 144 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9 + kmovw k1, r9d + vmovd xmm0, r8d + vpbroadcastd ymm0, xmm0 + shr r8, 32 + vmovd xmm1, r8d + vpbroadcastd ymm1, xmm1 + vmovdqa ymm4, ymm1 + vmovdqa ymm5, ymm1 + vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip] + vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip] + vpcmpltud k2, ymm2, ymm0 + vpcmpltud k3, ymm3, ymm0 + vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8} + vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8} + knotw k2, k1 + vmovdqa32 ymm2 {k2}, ymm0 + vmovdqa32 ymm3 {k2}, ymm0 + vmovdqa32 ymm4 {k2}, ymm1 + vmovdqa32 ymm5 {k2}, ymm1 + vmovdqa ymmword ptr [rsp], ymm2 + vmovdqa ymmword ptr [rsp+0x1*0x20], ymm3 + vmovdqa ymmword ptr [rsp+0x2*0x20], ymm4 + vmovdqa ymmword ptr [rsp+0x3*0x20], ymm5 + shl rdx, 6 + mov qword ptr [rsp+0x80], rdx + cmp rsi, 16 + jc 3f +2: + vpbroadcastd zmm0, dword ptr [rcx] + vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4] + vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4] + vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4] + vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4] + vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4] + vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4] + vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm8, zmm16, zmm17 + vpunpckhqdq zmm9, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm10, zmm18, zmm19 + vpunpckhqdq zmm11, zmm18, zmm19 + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] + vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm12, zmm16, zmm17 + vpunpckhqdq zmm13, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm14, zmm18, zmm19 + vpunpckhqdq zmm15, zmm18, zmm19 + vmovdqa32 zmm27, zmmword ptr [INDEX0+rip] + vmovdqa32 zmm31, zmmword ptr [INDEX1+rip] + vshufps zmm16, zmm8, zmm10, 136 + vshufps zmm17, zmm12, zmm14, 136 + vmovdqa32 zmm20, zmm16 + vpermt2d zmm16, zmm27, zmm17 + vpermt2d zmm20, zmm31, zmm17 + vshufps zmm17, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm21, zmm17 + vpermt2d zmm17, zmm27, zmm30 + vpermt2d zmm21, zmm31, zmm30 + vshufps zmm18, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm22, zmm18 + vpermt2d zmm18, zmm27, zmm8 + vpermt2d zmm22, zmm31, zmm8 + vshufps zmm19, zmm9, zmm11, 221 + vshufps zmm8, zmm13, zmm15, 221 + vmovdqa32 zmm23, zmm19 + vpermt2d zmm19, zmm27, zmm8 + vpermt2d zmm23, zmm31, zmm8 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm8, zmm24, zmm25 + vpunpckhqdq zmm9, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm10, zmm24, zmm25 + vpunpckhqdq zmm11, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm12, zmm24, zmm25 + vpunpckhqdq zmm13, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm14, zmm24, zmm25 + vpunpckhqdq zmm15, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vshufps zmm24, zmm8, zmm10, 136 + vshufps zmm30, zmm12, zmm14, 136 + vmovdqa32 zmm28, zmm24 + vpermt2d zmm24, zmm27, zmm30 + vpermt2d zmm28, zmm31, zmm30 + vshufps zmm25, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm29, zmm25 + vpermt2d zmm25, zmm27, zmm30 + vpermt2d zmm29, zmm31, zmm30 + vshufps zmm26, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm30, zmm26 + vpermt2d zmm26, zmm27, zmm8 + vpermt2d zmm30, zmm31, zmm8 + vshufps zmm8, zmm9, zmm11, 221 + vshufps zmm10, zmm13, zmm15, 221 + vpermi2d zmm27, zmm8, zmm10 + vpermi2d zmm31, zmm8, zmm10 + vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa32 zmm12, zmmword ptr [rsp] + vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40] + vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip] + vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4] + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm24 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm23 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm27 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm21 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm28 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm26 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm22 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm31 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpxord zmm0, zmm0, zmm8 + vpxord zmm1, zmm1, zmm9 + vpxord zmm2, zmm2, zmm10 + vpxord zmm3, zmm3, zmm11 + vpxord zmm4, zmm4, zmm12 + vpxord zmm5, zmm5, zmm13 + vpxord zmm6, zmm6, zmm14 + vpxord zmm7, zmm7, zmm15 + movzx eax, byte ptr [rbp+0x38] + jne 9b + mov rbx, qword ptr [rbp+0x50] + vpunpckldq zmm16, zmm0, zmm1 + vpunpckhdq zmm17, zmm0, zmm1 + vpunpckldq zmm18, zmm2, zmm3 + vpunpckhdq zmm19, zmm2, zmm3 + vpunpckldq zmm20, zmm4, zmm5 + vpunpckhdq zmm21, zmm4, zmm5 + vpunpckldq zmm22, zmm6, zmm7 + vpunpckhdq zmm23, zmm6, zmm7 + vpunpcklqdq zmm0, zmm16, zmm18 + vpunpckhqdq zmm1, zmm16, zmm18 + vpunpcklqdq zmm2, zmm17, zmm19 + vpunpckhqdq zmm3, zmm17, zmm19 + vpunpcklqdq zmm4, zmm20, zmm22 + vpunpckhqdq zmm5, zmm20, zmm22 + vpunpcklqdq zmm6, zmm21, zmm23 + vpunpckhqdq zmm7, zmm21, zmm23 + vshufi32x4 zmm16, zmm0, zmm4, 0x88 + vshufi32x4 zmm17, zmm1, zmm5, 0x88 + vshufi32x4 zmm18, zmm2, zmm6, 0x88 + vshufi32x4 zmm19, zmm3, zmm7, 0x88 + vshufi32x4 zmm20, zmm0, zmm4, 0xDD + vshufi32x4 zmm21, zmm1, zmm5, 0xDD + vshufi32x4 zmm22, zmm2, zmm6, 0xDD + vshufi32x4 zmm23, zmm3, zmm7, 0xDD + vshufi32x4 zmm0, zmm16, zmm17, 0x88 + vshufi32x4 zmm1, zmm18, zmm19, 0x88 + vshufi32x4 zmm2, zmm20, zmm21, 0x88 + vshufi32x4 zmm3, zmm22, zmm23, 0x88 + vshufi32x4 zmm4, zmm16, zmm17, 0xDD + vshufi32x4 zmm5, zmm18, zmm19, 0xDD + vshufi32x4 zmm6, zmm20, zmm21, 0xDD + vshufi32x4 zmm7, zmm22, zmm23, 0xDD + vmovdqu32 zmmword ptr [rbx], zmm0 + vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1 + vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2 + vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3 + vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4 + vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5 + vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6 + vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7 + vmovdqa32 zmm0, zmmword ptr [rsp] + vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40] + vmovdqa32 zmm2, zmm0 + vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16} + vpcmpltud k2, zmm2, zmm0 + vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16} + vmovdqa32 zmmword ptr [rsp], zmm2 + vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1 + add rdi, 128 + add rbx, 512 + mov qword ptr [rbp+0x50], rbx + sub rsi, 16 + cmp rsi, 16 + jnc 2b + test rsi, rsi + jnz 3f +4: + vzeroupper + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 6 +3: + test esi, 0x8 + je 3f + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx + xor edx, edx +2: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm16, ymm12, ymm14, 136 + vshufps ymm17, ymm12, ymm14, 221 + vshufps ymm18, ymm13, ymm15, 136 + vshufps ymm19, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm20, ymm12, ymm14, 136 + vshufps ymm21, ymm12, ymm14, 221 + vshufps ymm22, ymm13, ymm15, 136 + vshufps ymm23, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm24, ymm12, ymm14, 136 + vshufps ymm25, ymm12, ymm14, 221 + vshufps ymm26, ymm13, ymm15, 136 + vshufps ymm27, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm28, ymm12, ymm14, 136 + vshufps ymm29, ymm12, ymm14, 221 + vshufps ymm30, ymm13, ymm15, 136 + vshufps ymm31, ymm13, ymm15, 221 + vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa ymm12, ymmword ptr [rsp] + vmovdqa ymm13, ymmword ptr [rsp+0x40] + vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip] + vpbroadcastd ymm15, dword ptr [rsp+0x88] + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm24 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm23 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm27 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm21 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm28 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm26 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm22 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm31 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x38] + jne 2b + mov rbx, qword ptr [rbp+0x50] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups ymmword ptr [rbx+0x60], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa ymm0, ymmword ptr [rsp] + vmovdqa ymm2, ymmword ptr [rsp+0x2*0x20] + vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20] + vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20] + vmovdqa ymmword ptr [rsp], ymm0 + vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2 + add rbx, 256 + mov qword ptr [rbp+0x50], rbx + add rdi, 64 + sub rsi, 8 +3: + mov rbx, qword ptr [rbp+0x50] + mov r15, qword ptr [rsp+0x80] + movzx r13, byte ptr [rbp+0x38] + movzx r12, byte ptr [rbp+0x48] + test esi, 0x4 + je 3f + vbroadcasti32x4 zmm0, xmmword ptr [rcx] + vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10] + vmovdqa xmm12, xmmword ptr [rsp] + vmovdqa xmm13, xmmword ptr [rsp+0x4*0x10] + vpunpckldq xmm14, xmm12, xmm13 + vpunpckhdq xmm15, xmm12, xmm13 + vpermq ymm14, ymm14, 0xDC + vpermq ymm15, ymm15, 0xDC + vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip] + vinserti64x4 zmm13, zmm14, ymm15, 0x01 + mov eax, 17476 + kmovw k2, eax + vpblendmd zmm13 {k2}, zmm13, zmm12 + vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov eax, 43690 + kmovw k3, eax + mov eax, 34952 + kmovw k4, eax + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x88], eax + vmovdqa32 zmm2, zmm15 + vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4] + vpblendmd zmm3 {k4}, zmm13, zmm8 + vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01 + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02 + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03 + vmovups zmm9, zmmword ptr [r8+rdx-0x30] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01 + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02 + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03 + vshufps zmm4, zmm8, zmm9, 136 + vshufps zmm5, zmm8, zmm9, 221 + vmovups zmm8, zmmword ptr [r8+rdx-0x20] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01 + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02 + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03 + vmovups zmm9, zmmword ptr [r8+rdx-0x10] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01 + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02 + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03 + vshufps zmm6, zmm8, zmm9, 136 + vshufps zmm7, zmm8, zmm9, 221 + vpshufd zmm6, zmm6, 0x93 + vpshufd zmm7, zmm7, 0x93 + mov al, 7 +9: + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 0x93 + vpshufd zmm3, zmm3, 0x4E + vpshufd zmm2, zmm2, 0x39 + vpaddd zmm0, zmm0, zmm6 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm7 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 0x39 + vpshufd zmm3, zmm3, 0x4E + vpshufd zmm2, zmm2, 0x93 + dec al + jz 9f + vshufps zmm8, zmm4, zmm5, 214 + vpshufd zmm9, zmm4, 0x0F + vpshufd zmm4, zmm8, 0x39 + vshufps zmm8, zmm6, zmm7, 250 + vpblendmd zmm9 {k3}, zmm9, zmm8 + vpunpcklqdq zmm8, zmm7, zmm5 + vpblendmd zmm8 {k4}, zmm8, zmm6 + vpshufd zmm8, zmm8, 0x78 + vpunpckhdq zmm5, zmm5, zmm7 + vpunpckldq zmm6, zmm6, zmm5 + vpshufd zmm7, zmm6, 0x1E + vmovdqa32 zmm5, zmm9 + vmovdqa32 zmm6, zmm8 + jmp 9b +9: + vpxord zmm0, zmm0, zmm2 + vpxord zmm1, zmm1, zmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02 + vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02 + vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03 + vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03 + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+0x40] + vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10] + vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+0x40], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test esi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x4] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x88], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x88] + vpblendd ymm3, ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+0x4*0x10] + vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8] + vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + vmovd xmm14, dword ptr [rsp] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vpinsrd xmm3, xmm14, eax, 3 + vmovdqa xmm2, xmm15 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b +.p2align 6 +_blake3_compress_in_place_avx512: +blake3_compress_in_place_avx512: + _CET_ENDBR + vmovdqu xmm0, xmmword ptr [rdi] + vmovdqu xmm1, xmmword ptr [rdi+0x10] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + vmovq xmm3, rcx + vmovq xmm4, rdx + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovups xmm8, xmmword ptr [rsi] + vmovups xmm9, xmmword ptr [rsi+0x10] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rsi+0x20] + vmovups xmm9, xmmword ptr [rsi+0x30] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vmovdqu xmmword ptr [rdi], xmm0 + vmovdqu xmmword ptr [rdi+0x10], xmm1 + ret + +.p2align 6 +_blake3_compress_xof_avx512: +blake3_compress_xof_avx512: + _CET_ENDBR + vmovdqu xmm0, xmmword ptr [rdi] + vmovdqu xmm1, xmmword ptr [rdi+0x10] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + vmovq xmm3, rcx + vmovq xmm4, rdx + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovups xmm8, xmmword ptr [rsi] + vmovups xmm9, xmmword ptr [rsi+0x10] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rsi+0x20] + vmovups xmm9, xmmword ptr [rsi+0x30] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vpxor xmm2, xmm2, [rdi] + vpxor xmm3, xmm3, [rdi+0x10] + vmovdqu xmmword ptr [r9], xmm0 + vmovdqu xmmword ptr [r9+0x10], xmm1 + vmovdqu xmmword ptr [r9+0x20], xmm2 + vmovdqu xmmword ptr [r9+0x30], xmm3 + ret + +.p2align 6 +blake3_xof_many_avx512: +_blake3_xof_many_avx512: + _CET_ENDBR + mov r10,QWORD PTR [rsp+0x8] + cmp r10,0x1 + ja 2f + vmovdqu xmm0,XMMWORD PTR [rdi] + vmovdqu xmm1,XMMWORD PTR [rdi+0x10] + movzx eax,r8b + movzx edx,dl + shl rax,0x20 + add rdx,rax + vmovq xmm3,rcx + vmovq xmm4,rdx + vpunpcklqdq xmm3,xmm3,xmm4 + vmovaps xmm2,XMMWORD PTR [BLAKE3_IV+rip] + vmovups xmm8,XMMWORD PTR [rsi] + vmovups xmm9,XMMWORD PTR [rsi+0x10] + vshufps xmm4,xmm8,xmm9,0x88 + vshufps xmm5,xmm8,xmm9,0xdd + vmovups xmm8,XMMWORD PTR [rsi+0x20] + vmovups xmm9,XMMWORD PTR [rsi+0x30] + vshufps xmm6,xmm8,xmm9,0x88 + vshufps xmm7,xmm8,xmm9,0xdd + vpshufd xmm6,xmm6,0x93 + vpshufd xmm7,xmm7,0x93 + mov al,0x7 +3: + vpaddd xmm0,xmm0,xmm4 + vpaddd xmm0,xmm0,xmm1 + vpxord xmm3,xmm3,xmm0 + vprord xmm3,xmm3,0x10 + vpaddd xmm2,xmm2,xmm3 + vpxord xmm1,xmm1,xmm2 + vprord xmm1,xmm1,0xc + vpaddd xmm0,xmm0,xmm5 + vpaddd xmm0,xmm0,xmm1 + vpxord xmm3,xmm3,xmm0 + vprord xmm3,xmm3,0x8 + vpaddd xmm2,xmm2,xmm3 + vpxord xmm1,xmm1,xmm2 + vprord xmm1,xmm1,0x7 + vpshufd xmm0,xmm0,0x93 + vpshufd xmm3,xmm3,0x4e + vpshufd xmm2,xmm2,0x39 + vpaddd xmm0,xmm0,xmm6 + vpaddd xmm0,xmm0,xmm1 + vpxord xmm3,xmm3,xmm0 + vprord xmm3,xmm3,0x10 + vpaddd xmm2,xmm2,xmm3 + vpxord xmm1,xmm1,xmm2 + vprord xmm1,xmm1,0xc + vpaddd xmm0,xmm0,xmm7 + vpaddd xmm0,xmm0,xmm1 + vpxord xmm3,xmm3,xmm0 + vprord xmm3,xmm3,0x8 + vpaddd xmm2,xmm2,xmm3 + vpxord xmm1,xmm1,xmm2 + vprord xmm1,xmm1,0x7 + vpshufd xmm0,xmm0,0x39 + vpshufd xmm3,xmm3,0x4e + vpshufd xmm2,xmm2,0x93 + dec al + je 3f + vshufps xmm8,xmm4,xmm5,0xd6 + vpshufd xmm9,xmm4,0xf + vpshufd xmm4,xmm8,0x39 + vshufps xmm8,xmm6,xmm7,0xfa + vpblendd xmm9,xmm9,xmm8,0xaa + vpunpcklqdq xmm8,xmm7,xmm5 + vpblendd xmm8,xmm8,xmm6,0x88 + vpshufd xmm8,xmm8,0x78 + vpunpckhdq xmm5,xmm5,xmm7 + vpunpckldq xmm6,xmm6,xmm5 + vpshufd xmm7,xmm6,0x1e + vmovdqa xmm5,xmm9 + vmovdqa xmm6,xmm8 + jmp 3b +3: + vpxor xmm0,xmm0,xmm2 + vpxor xmm1,xmm1,xmm3 + vpxor xmm2,xmm2,XMMWORD PTR [rdi] + vpxor xmm3,xmm3,XMMWORD PTR [rdi+0x10] + vmovdqu XMMWORD PTR [r9],xmm0 + vmovdqu XMMWORD PTR [r9+0x10],xmm1 + vmovdqu XMMWORD PTR [r9+0x20],xmm2 + vmovdqu XMMWORD PTR [r9+0x30],xmm3 + ret +.p2align 6 +2: + push rbp + mov rbp,rsp + sub rsp,0x90 + and rsp,0xffffffffffffffc0 + vpbroadcastd zmm0,ecx + shr rcx,0x20 + vpbroadcastd zmm1,ecx + vpaddd zmm2,zmm0,ZMMWORD PTR [ADD0+rip] + vpcmpltud k1,zmm2,zmm0 + vpaddd zmm1{k1},zmm1,DWORD PTR [ADD1+rip]{1to16} + vmovdqa32 ZMMWORD PTR [rsp],zmm2 + vmovdqa32 ZMMWORD PTR [rsp+0x40],zmm1 + cmp r10,0x10 + jb 2f +3: + vpbroadcastd zmm16,DWORD PTR [rsi] + vpbroadcastd zmm17,DWORD PTR [rsi+0x4] + vpbroadcastd zmm18,DWORD PTR [rsi+0x8] + vpbroadcastd zmm19,DWORD PTR [rsi+0xc] + vpbroadcastd zmm20,DWORD PTR [rsi+0x10] + vpbroadcastd zmm21,DWORD PTR [rsi+0x14] + vpbroadcastd zmm22,DWORD PTR [rsi+0x18] + vpbroadcastd zmm23,DWORD PTR [rsi+0x1c] + vpbroadcastd zmm24,DWORD PTR [rsi+0x20] + vpbroadcastd zmm25,DWORD PTR [rsi+0x24] + vpbroadcastd zmm26,DWORD PTR [rsi+0x28] + vpbroadcastd zmm27,DWORD PTR [rsi+0x2c] + vpbroadcastd zmm28,DWORD PTR [rsi+0x30] + vpbroadcastd zmm29,DWORD PTR [rsi+0x34] + vpbroadcastd zmm30,DWORD PTR [rsi+0x38] + vpbroadcastd zmm31,DWORD PTR [rsi+0x3c] + vpbroadcastd zmm0,DWORD PTR [rdi] + vpbroadcastd zmm1,DWORD PTR [rdi+0x4] + vpbroadcastd zmm2,DWORD PTR [rdi+0x8] + vpbroadcastd zmm3,DWORD PTR [rdi+0xc] + vpbroadcastd zmm4,DWORD PTR [rdi+0x10] + vpbroadcastd zmm5,DWORD PTR [rdi+0x14] + vpbroadcastd zmm6,DWORD PTR [rdi+0x18] + vpbroadcastd zmm7,DWORD PTR [rdi+0x1c] + vpbroadcastd zmm8,DWORD PTR [BLAKE3_IV_0+rip] + vpbroadcastd zmm9,DWORD PTR [BLAKE3_IV_1+rip] + vpbroadcastd zmm10,DWORD PTR [BLAKE3_IV_2+rip] + vpbroadcastd zmm11,DWORD PTR [BLAKE3_IV_3+rip] + vmovdqa32 zmm12,ZMMWORD PTR [rsp] + vmovdqa32 zmm13,ZMMWORD PTR [rsp+0x40] + vpbroadcastd zmm14,edx + vpbroadcastd zmm15,r8d + vpaddd zmm0,zmm0,zmm16 + vpaddd zmm1,zmm1,zmm18 + vpaddd zmm2,zmm2,zmm20 + vpaddd zmm3,zmm3,zmm22 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vprord zmm15,zmm15,0x10 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0xc + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vpaddd zmm0,zmm0,zmm17 + vpaddd zmm1,zmm1,zmm19 + vpaddd zmm2,zmm2,zmm21 + vpaddd zmm3,zmm3,zmm23 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vprord zmm15,zmm15,0x8 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0x7 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vpaddd zmm0,zmm0,zmm24 + vpaddd zmm1,zmm1,zmm26 + vpaddd zmm2,zmm2,zmm28 + vpaddd zmm3,zmm3,zmm30 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x10 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vprord zmm4,zmm4,0xc + vpaddd zmm0,zmm0,zmm25 + vpaddd zmm1,zmm1,zmm27 + vpaddd zmm2,zmm2,zmm29 + vpaddd zmm3,zmm3,zmm31 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x8 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vprord zmm4,zmm4,0x7 + vpaddd zmm0,zmm0,zmm18 + vpaddd zmm1,zmm1,zmm19 + vpaddd zmm2,zmm2,zmm23 + vpaddd zmm3,zmm3,zmm20 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vprord zmm15,zmm15,0x10 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0xc + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vpaddd zmm0,zmm0,zmm22 + vpaddd zmm1,zmm1,zmm26 + vpaddd zmm2,zmm2,zmm16 + vpaddd zmm3,zmm3,zmm29 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vprord zmm15,zmm15,0x8 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0x7 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vpaddd zmm0,zmm0,zmm17 + vpaddd zmm1,zmm1,zmm28 + vpaddd zmm2,zmm2,zmm25 + vpaddd zmm3,zmm3,zmm31 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x10 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vprord zmm4,zmm4,0xc + vpaddd zmm0,zmm0,zmm27 + vpaddd zmm1,zmm1,zmm21 + vpaddd zmm2,zmm2,zmm30 + vpaddd zmm3,zmm3,zmm24 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x8 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vprord zmm4,zmm4,0x7 + vpaddd zmm0,zmm0,zmm19 + vpaddd zmm1,zmm1,zmm26 + vpaddd zmm2,zmm2,zmm29 + vpaddd zmm3,zmm3,zmm23 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vprord zmm15,zmm15,0x10 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0xc + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vpaddd zmm0,zmm0,zmm20 + vpaddd zmm1,zmm1,zmm28 + vpaddd zmm2,zmm2,zmm18 + vpaddd zmm3,zmm3,zmm30 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vprord zmm15,zmm15,0x8 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0x7 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vpaddd zmm0,zmm0,zmm22 + vpaddd zmm1,zmm1,zmm25 + vpaddd zmm2,zmm2,zmm27 + vpaddd zmm3,zmm3,zmm24 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x10 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vprord zmm4,zmm4,0xc + vpaddd zmm0,zmm0,zmm21 + vpaddd zmm1,zmm1,zmm16 + vpaddd zmm2,zmm2,zmm31 + vpaddd zmm3,zmm3,zmm17 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x8 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vprord zmm4,zmm4,0x7 + vpaddd zmm0,zmm0,zmm26 + vpaddd zmm1,zmm1,zmm28 + vpaddd zmm2,zmm2,zmm30 + vpaddd zmm3,zmm3,zmm29 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vprord zmm15,zmm15,0x10 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0xc + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vpaddd zmm0,zmm0,zmm23 + vpaddd zmm1,zmm1,zmm25 + vpaddd zmm2,zmm2,zmm19 + vpaddd zmm3,zmm3,zmm31 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vprord zmm15,zmm15,0x8 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0x7 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vpaddd zmm0,zmm0,zmm20 + vpaddd zmm1,zmm1,zmm27 + vpaddd zmm2,zmm2,zmm21 + vpaddd zmm3,zmm3,zmm17 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x10 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vprord zmm4,zmm4,0xc + vpaddd zmm0,zmm0,zmm16 + vpaddd zmm1,zmm1,zmm18 + vpaddd zmm2,zmm2,zmm24 + vpaddd zmm3,zmm3,zmm22 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x8 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vprord zmm4,zmm4,0x7 + vpaddd zmm0,zmm0,zmm28 + vpaddd zmm1,zmm1,zmm25 + vpaddd zmm2,zmm2,zmm31 + vpaddd zmm3,zmm3,zmm30 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vprord zmm15,zmm15,0x10 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0xc + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vpaddd zmm0,zmm0,zmm29 + vpaddd zmm1,zmm1,zmm27 + vpaddd zmm2,zmm2,zmm26 + vpaddd zmm3,zmm3,zmm24 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vprord zmm15,zmm15,0x8 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0x7 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vpaddd zmm0,zmm0,zmm23 + vpaddd zmm1,zmm1,zmm21 + vpaddd zmm2,zmm2,zmm16 + vpaddd zmm3,zmm3,zmm22 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x10 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vprord zmm4,zmm4,0xc + vpaddd zmm0,zmm0,zmm18 + vpaddd zmm1,zmm1,zmm19 + vpaddd zmm2,zmm2,zmm17 + vpaddd zmm3,zmm3,zmm20 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x8 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vprord zmm4,zmm4,0x7 + vpaddd zmm0,zmm0,zmm25 + vpaddd zmm1,zmm1,zmm27 + vpaddd zmm2,zmm2,zmm24 + vpaddd zmm3,zmm3,zmm31 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vprord zmm15,zmm15,0x10 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0xc + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vpaddd zmm0,zmm0,zmm30 + vpaddd zmm1,zmm1,zmm21 + vpaddd zmm2,zmm2,zmm28 + vpaddd zmm3,zmm3,zmm17 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vprord zmm15,zmm15,0x8 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0x7 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vpaddd zmm0,zmm0,zmm29 + vpaddd zmm1,zmm1,zmm16 + vpaddd zmm2,zmm2,zmm18 + vpaddd zmm3,zmm3,zmm20 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x10 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vprord zmm4,zmm4,0xc + vpaddd zmm0,zmm0,zmm19 + vpaddd zmm1,zmm1,zmm26 + vpaddd zmm2,zmm2,zmm22 + vpaddd zmm3,zmm3,zmm23 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x8 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vprord zmm4,zmm4,0x7 + vpaddd zmm0,zmm0,zmm27 + vpaddd zmm1,zmm1,zmm21 + vpaddd zmm2,zmm2,zmm17 + vpaddd zmm3,zmm3,zmm24 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vprord zmm15,zmm15,0x10 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0xc + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vpaddd zmm0,zmm0,zmm31 + vpaddd zmm1,zmm1,zmm16 + vpaddd zmm2,zmm2,zmm25 + vpaddd zmm3,zmm3,zmm22 + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm1,zmm1,zmm5 + vpaddd zmm2,zmm2,zmm6 + vpaddd zmm3,zmm3,zmm7 + vpxord zmm12,zmm12,zmm0 + vpxord zmm13,zmm13,zmm1 + vpxord zmm14,zmm14,zmm2 + vpxord zmm15,zmm15,zmm3 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vprord zmm15,zmm15,0x8 + vpaddd zmm8,zmm8,zmm12 + vpaddd zmm9,zmm9,zmm13 + vpaddd zmm10,zmm10,zmm14 + vpaddd zmm11,zmm11,zmm15 + vpxord zmm4,zmm4,zmm8 + vpxord zmm5,zmm5,zmm9 + vpxord zmm6,zmm6,zmm10 + vpxord zmm7,zmm7,zmm11 + vprord zmm4,zmm4,0x7 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vpaddd zmm0,zmm0,zmm30 + vpaddd zmm1,zmm1,zmm18 + vpaddd zmm2,zmm2,zmm19 + vpaddd zmm3,zmm3,zmm23 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x10 + vprord zmm12,zmm12,0x10 + vprord zmm13,zmm13,0x10 + vprord zmm14,zmm14,0x10 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0xc + vprord zmm6,zmm6,0xc + vprord zmm7,zmm7,0xc + vprord zmm4,zmm4,0xc + vpaddd zmm0,zmm0,zmm26 + vpaddd zmm1,zmm1,zmm28 + vpaddd zmm2,zmm2,zmm20 + vpaddd zmm3,zmm3,zmm29 + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm1,zmm1,zmm6 + vpaddd zmm2,zmm2,zmm7 + vpaddd zmm3,zmm3,zmm4 + vpxord zmm15,zmm15,zmm0 + vpxord zmm12,zmm12,zmm1 + vpxord zmm13,zmm13,zmm2 + vpxord zmm14,zmm14,zmm3 + vprord zmm15,zmm15,0x8 + vprord zmm12,zmm12,0x8 + vprord zmm13,zmm13,0x8 + vprord zmm14,zmm14,0x8 + vpaddd zmm10,zmm10,zmm15 + vpaddd zmm11,zmm11,zmm12 + vpaddd zmm8,zmm8,zmm13 + vpaddd zmm9,zmm9,zmm14 + vpxord zmm5,zmm5,zmm10 + vpxord zmm6,zmm6,zmm11 + vpxord zmm7,zmm7,zmm8 + vpxord zmm4,zmm4,zmm9 + vprord zmm5,zmm5,0x7 + vprord zmm6,zmm6,0x7 + vprord zmm7,zmm7,0x7 + vprord zmm4,zmm4,0x7 + vpxord zmm0,zmm0,zmm8 + vpxord zmm1,zmm1,zmm9 + vpxord zmm2,zmm2,zmm10 + vpxord zmm3,zmm3,zmm11 + vpxord zmm4,zmm4,zmm12 + vpxord zmm5,zmm5,zmm13 + vpxord zmm6,zmm6,zmm14 + vpxord zmm7,zmm7,zmm15 + vpxord zmm8,zmm8,DWORD PTR [rdi]{1to16} + vpxord zmm9,zmm9,DWORD PTR [rdi+0x4]{1to16} + vpxord zmm10,zmm10,DWORD PTR [rdi+0x8]{1to16} + vpxord zmm11,zmm11,DWORD PTR [rdi+0xc]{1to16} + vpxord zmm12,zmm12,DWORD PTR [rdi+0x10]{1to16} + vpxord zmm13,zmm13,DWORD PTR [rdi+0x14]{1to16} + vpxord zmm14,zmm14,DWORD PTR [rdi+0x18]{1to16} + vpxord zmm15,zmm15,DWORD PTR [rdi+0x1c]{1to16} + vpunpckldq zmm16,zmm0,zmm1 + vpunpckhdq zmm17,zmm0,zmm1 + vpunpckldq zmm18,zmm2,zmm3 + vpunpckhdq zmm19,zmm2,zmm3 + vpunpckldq zmm20,zmm4,zmm5 + vpunpckhdq zmm21,zmm4,zmm5 + vpunpckldq zmm22,zmm6,zmm7 + vpunpckhdq zmm23,zmm6,zmm7 + vpunpckldq zmm24,zmm8,zmm9 + vpunpckhdq zmm25,zmm8,zmm9 + vpunpckldq zmm26,zmm10,zmm11 + vpunpckhdq zmm27,zmm10,zmm11 + vpunpckldq zmm28,zmm12,zmm13 + vpunpckhdq zmm29,zmm12,zmm13 + vpunpckldq zmm30,zmm14,zmm15 + vpunpckhdq zmm31,zmm14,zmm15 + vpunpcklqdq zmm0,zmm16,zmm18 + vpunpckhqdq zmm1,zmm16,zmm18 + vpunpcklqdq zmm2,zmm17,zmm19 + vpunpckhqdq zmm3,zmm17,zmm19 + vpunpcklqdq zmm4,zmm20,zmm22 + vpunpckhqdq zmm5,zmm20,zmm22 + vpunpcklqdq zmm6,zmm21,zmm23 + vpunpckhqdq zmm7,zmm21,zmm23 + vpunpcklqdq zmm8,zmm24,zmm26 + vpunpckhqdq zmm9,zmm24,zmm26 + vpunpcklqdq zmm10,zmm25,zmm27 + vpunpckhqdq zmm11,zmm25,zmm27 + vpunpcklqdq zmm12,zmm28,zmm30 + vpunpckhqdq zmm13,zmm28,zmm30 + vpunpcklqdq zmm14,zmm29,zmm31 + vpunpckhqdq zmm15,zmm29,zmm31 + vshufi32x4 zmm16,zmm0,zmm4,0x88 + vshufi32x4 zmm17,zmm1,zmm5,0x88 + vshufi32x4 zmm18,zmm2,zmm6,0x88 + vshufi32x4 zmm19,zmm3,zmm7,0x88 + vshufi32x4 zmm20,zmm0,zmm4,0xdd + vshufi32x4 zmm21,zmm1,zmm5,0xdd + vshufi32x4 zmm22,zmm2,zmm6,0xdd + vshufi32x4 zmm23,zmm3,zmm7,0xdd + vshufi32x4 zmm24,zmm8,zmm12,0x88 + vshufi32x4 zmm25,zmm9,zmm13,0x88 + vshufi32x4 zmm26,zmm10,zmm14,0x88 + vshufi32x4 zmm27,zmm11,zmm15,0x88 + vshufi32x4 zmm28,zmm8,zmm12,0xdd + vshufi32x4 zmm29,zmm9,zmm13,0xdd + vshufi32x4 zmm30,zmm10,zmm14,0xdd + vshufi32x4 zmm31,zmm11,zmm15,0xdd + vshufi32x4 zmm0,zmm16,zmm24,0x88 + vshufi32x4 zmm1,zmm17,zmm25,0x88 + vshufi32x4 zmm2,zmm18,zmm26,0x88 + vshufi32x4 zmm3,zmm19,zmm27,0x88 + vshufi32x4 zmm4,zmm20,zmm28,0x88 + vshufi32x4 zmm5,zmm21,zmm29,0x88 + vshufi32x4 zmm6,zmm22,zmm30,0x88 + vshufi32x4 zmm7,zmm23,zmm31,0x88 + vshufi32x4 zmm8,zmm16,zmm24,0xdd + vshufi32x4 zmm9,zmm17,zmm25,0xdd + vshufi32x4 zmm10,zmm18,zmm26,0xdd + vshufi32x4 zmm11,zmm19,zmm27,0xdd + vshufi32x4 zmm12,zmm20,zmm28,0xdd + vshufi32x4 zmm13,zmm21,zmm29,0xdd + vshufi32x4 zmm14,zmm22,zmm30,0xdd + vshufi32x4 zmm15,zmm23,zmm31,0xdd + vmovdqu32 ZMMWORD PTR [r9],zmm0 + vmovdqu32 ZMMWORD PTR [r9+0x40],zmm1 + vmovdqu32 ZMMWORD PTR [r9+0x80],zmm2 + vmovdqu32 ZMMWORD PTR [r9+0xc0],zmm3 + vmovdqu32 ZMMWORD PTR [r9+0x100],zmm4 + vmovdqu32 ZMMWORD PTR [r9+0x140],zmm5 + vmovdqu32 ZMMWORD PTR [r9+0x180],zmm6 + vmovdqu32 ZMMWORD PTR [r9+0x1c0],zmm7 + vmovdqu32 ZMMWORD PTR [r9+0x200],zmm8 + vmovdqu32 ZMMWORD PTR [r9+0x240],zmm9 + vmovdqu32 ZMMWORD PTR [r9+0x280],zmm10 + vmovdqu32 ZMMWORD PTR [r9+0x2c0],zmm11 + vmovdqu32 ZMMWORD PTR [r9+0x300],zmm12 + vmovdqu32 ZMMWORD PTR [r9+0x340],zmm13 + vmovdqu32 ZMMWORD PTR [r9+0x380],zmm14 + vmovdqu32 ZMMWORD PTR [r9+0x3c0],zmm15 + vmovdqa32 zmm0,ZMMWORD PTR [rsp] + vmovdqa32 zmm1,ZMMWORD PTR [rsp+0x40] + vpaddd zmm2,zmm0,DWORD PTR [ADD16+rip]{1to16} + vpcmpltud k1,zmm2,zmm0 + vpaddd zmm1{k1},zmm1,DWORD PTR [ADD1+rip]{1to16} + vmovdqa32 ZMMWORD PTR [rsp],zmm2 + vmovdqa32 ZMMWORD PTR [rsp+0x40],zmm1 + add r9,0x400 + sub r10,0x10 + cmp r10,0x10 + jae 3b + test r10,r10 + jne 2f +9: + vzeroupper + mov rsp,rbp + pop rbp + ret +2: + test r10,0x8 + je 2f + vpbroadcastd ymm16,DWORD PTR [rsi] + vpbroadcastd ymm17,DWORD PTR [rsi+0x4] + vpbroadcastd ymm18,DWORD PTR [rsi+0x8] + vpbroadcastd ymm19,DWORD PTR [rsi+0xc] + vpbroadcastd ymm20,DWORD PTR [rsi+0x10] + vpbroadcastd ymm21,DWORD PTR [rsi+0x14] + vpbroadcastd ymm22,DWORD PTR [rsi+0x18] + vpbroadcastd ymm23,DWORD PTR [rsi+0x1c] + vpbroadcastd ymm24,DWORD PTR [rsi+0x20] + vpbroadcastd ymm25,DWORD PTR [rsi+0x24] + vpbroadcastd ymm26,DWORD PTR [rsi+0x28] + vpbroadcastd ymm27,DWORD PTR [rsi+0x2c] + vpbroadcastd ymm28,DWORD PTR [rsi+0x30] + vpbroadcastd ymm29,DWORD PTR [rsi+0x34] + vpbroadcastd ymm30,DWORD PTR [rsi+0x38] + vpbroadcastd ymm31,DWORD PTR [rsi+0x3c] + vpbroadcastd ymm0,DWORD PTR [rdi] + vpbroadcastd ymm1,DWORD PTR [rdi+0x4] + vpbroadcastd ymm2,DWORD PTR [rdi+0x8] + vpbroadcastd ymm3,DWORD PTR [rdi+0xc] + vpbroadcastd ymm4,DWORD PTR [rdi+0x10] + vpbroadcastd ymm5,DWORD PTR [rdi+0x14] + vpbroadcastd ymm6,DWORD PTR [rdi+0x18] + vpbroadcastd ymm7,DWORD PTR [rdi+0x1c] + vpbroadcastd ymm8,DWORD PTR [BLAKE3_IV_0+rip] + vpbroadcastd ymm9,DWORD PTR [BLAKE3_IV_1+rip] + vpbroadcastd ymm10,DWORD PTR [BLAKE3_IV_2+rip] + vpbroadcastd ymm11,DWORD PTR [BLAKE3_IV_3+rip] + vmovdqa ymm12,YMMWORD PTR [rsp] + vmovdqa ymm13,YMMWORD PTR [rsp+0x40] + vpbroadcastd ymm14,edx + vpbroadcastd ymm15,r8d + vpaddd ymm0,ymm0,ymm16 + vpaddd ymm1,ymm1,ymm18 + vpaddd ymm2,ymm2,ymm20 + vpaddd ymm3,ymm3,ymm22 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vprord ymm15,ymm15,0x10 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0xc + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vpaddd ymm0,ymm0,ymm17 + vpaddd ymm1,ymm1,ymm19 + vpaddd ymm2,ymm2,ymm21 + vpaddd ymm3,ymm3,ymm23 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vprord ymm15,ymm15,0x8 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0x7 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vpaddd ymm0,ymm0,ymm24 + vpaddd ymm1,ymm1,ymm26 + vpaddd ymm2,ymm2,ymm28 + vpaddd ymm3,ymm3,ymm30 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x10 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vprord ymm4,ymm4,0xc + vpaddd ymm0,ymm0,ymm25 + vpaddd ymm1,ymm1,ymm27 + vpaddd ymm2,ymm2,ymm29 + vpaddd ymm3,ymm3,ymm31 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x8 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vprord ymm4,ymm4,0x7 + vpaddd ymm0,ymm0,ymm18 + vpaddd ymm1,ymm1,ymm19 + vpaddd ymm2,ymm2,ymm23 + vpaddd ymm3,ymm3,ymm20 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vprord ymm15,ymm15,0x10 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0xc + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vpaddd ymm0,ymm0,ymm22 + vpaddd ymm1,ymm1,ymm26 + vpaddd ymm2,ymm2,ymm16 + vpaddd ymm3,ymm3,ymm29 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vprord ymm15,ymm15,0x8 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0x7 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vpaddd ymm0,ymm0,ymm17 + vpaddd ymm1,ymm1,ymm28 + vpaddd ymm2,ymm2,ymm25 + vpaddd ymm3,ymm3,ymm31 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x10 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vprord ymm4,ymm4,0xc + vpaddd ymm0,ymm0,ymm27 + vpaddd ymm1,ymm1,ymm21 + vpaddd ymm2,ymm2,ymm30 + vpaddd ymm3,ymm3,ymm24 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x8 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vprord ymm4,ymm4,0x7 + vpaddd ymm0,ymm0,ymm19 + vpaddd ymm1,ymm1,ymm26 + vpaddd ymm2,ymm2,ymm29 + vpaddd ymm3,ymm3,ymm23 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vprord ymm15,ymm15,0x10 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0xc + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vpaddd ymm0,ymm0,ymm20 + vpaddd ymm1,ymm1,ymm28 + vpaddd ymm2,ymm2,ymm18 + vpaddd ymm3,ymm3,ymm30 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vprord ymm15,ymm15,0x8 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0x7 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vpaddd ymm0,ymm0,ymm22 + vpaddd ymm1,ymm1,ymm25 + vpaddd ymm2,ymm2,ymm27 + vpaddd ymm3,ymm3,ymm24 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x10 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vprord ymm4,ymm4,0xc + vpaddd ymm0,ymm0,ymm21 + vpaddd ymm1,ymm1,ymm16 + vpaddd ymm2,ymm2,ymm31 + vpaddd ymm3,ymm3,ymm17 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x8 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vprord ymm4,ymm4,0x7 + vpaddd ymm0,ymm0,ymm26 + vpaddd ymm1,ymm1,ymm28 + vpaddd ymm2,ymm2,ymm30 + vpaddd ymm3,ymm3,ymm29 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vprord ymm15,ymm15,0x10 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0xc + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vpaddd ymm0,ymm0,ymm23 + vpaddd ymm1,ymm1,ymm25 + vpaddd ymm2,ymm2,ymm19 + vpaddd ymm3,ymm3,ymm31 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vprord ymm15,ymm15,0x8 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0x7 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vpaddd ymm0,ymm0,ymm20 + vpaddd ymm1,ymm1,ymm27 + vpaddd ymm2,ymm2,ymm21 + vpaddd ymm3,ymm3,ymm17 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x10 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vprord ymm4,ymm4,0xc + vpaddd ymm0,ymm0,ymm16 + vpaddd ymm1,ymm1,ymm18 + vpaddd ymm2,ymm2,ymm24 + vpaddd ymm3,ymm3,ymm22 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x8 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vprord ymm4,ymm4,0x7 + vpaddd ymm0,ymm0,ymm28 + vpaddd ymm1,ymm1,ymm25 + vpaddd ymm2,ymm2,ymm31 + vpaddd ymm3,ymm3,ymm30 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vprord ymm15,ymm15,0x10 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0xc + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vpaddd ymm0,ymm0,ymm29 + vpaddd ymm1,ymm1,ymm27 + vpaddd ymm2,ymm2,ymm26 + vpaddd ymm3,ymm3,ymm24 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vprord ymm15,ymm15,0x8 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0x7 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vpaddd ymm0,ymm0,ymm23 + vpaddd ymm1,ymm1,ymm21 + vpaddd ymm2,ymm2,ymm16 + vpaddd ymm3,ymm3,ymm22 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x10 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vprord ymm4,ymm4,0xc + vpaddd ymm0,ymm0,ymm18 + vpaddd ymm1,ymm1,ymm19 + vpaddd ymm2,ymm2,ymm17 + vpaddd ymm3,ymm3,ymm20 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x8 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vprord ymm4,ymm4,0x7 + vpaddd ymm0,ymm0,ymm25 + vpaddd ymm1,ymm1,ymm27 + vpaddd ymm2,ymm2,ymm24 + vpaddd ymm3,ymm3,ymm31 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vprord ymm15,ymm15,0x10 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0xc + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vpaddd ymm0,ymm0,ymm30 + vpaddd ymm1,ymm1,ymm21 + vpaddd ymm2,ymm2,ymm28 + vpaddd ymm3,ymm3,ymm17 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vprord ymm15,ymm15,0x8 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0x7 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vpaddd ymm0,ymm0,ymm29 + vpaddd ymm1,ymm1,ymm16 + vpaddd ymm2,ymm2,ymm18 + vpaddd ymm3,ymm3,ymm20 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x10 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vprord ymm4,ymm4,0xc + vpaddd ymm0,ymm0,ymm19 + vpaddd ymm1,ymm1,ymm26 + vpaddd ymm2,ymm2,ymm22 + vpaddd ymm3,ymm3,ymm23 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x8 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vprord ymm4,ymm4,0x7 + vpaddd ymm0,ymm0,ymm27 + vpaddd ymm1,ymm1,ymm21 + vpaddd ymm2,ymm2,ymm17 + vpaddd ymm3,ymm3,ymm24 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vprord ymm15,ymm15,0x10 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0xc + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vpaddd ymm0,ymm0,ymm31 + vpaddd ymm1,ymm1,ymm16 + vpaddd ymm2,ymm2,ymm25 + vpaddd ymm3,ymm3,ymm22 + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm3,ymm3,ymm7 + vpxord ymm12,ymm12,ymm0 + vpxord ymm13,ymm13,ymm1 + vpxord ymm14,ymm14,ymm2 + vpxord ymm15,ymm15,ymm3 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vprord ymm15,ymm15,0x8 + vpaddd ymm8,ymm8,ymm12 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm11,ymm11,ymm15 + vpxord ymm4,ymm4,ymm8 + vpxord ymm5,ymm5,ymm9 + vpxord ymm6,ymm6,ymm10 + vpxord ymm7,ymm7,ymm11 + vprord ymm4,ymm4,0x7 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vpaddd ymm0,ymm0,ymm30 + vpaddd ymm1,ymm1,ymm18 + vpaddd ymm2,ymm2,ymm19 + vpaddd ymm3,ymm3,ymm23 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x10 + vprord ymm12,ymm12,0x10 + vprord ymm13,ymm13,0x10 + vprord ymm14,ymm14,0x10 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0xc + vprord ymm6,ymm6,0xc + vprord ymm7,ymm7,0xc + vprord ymm4,ymm4,0xc + vpaddd ymm0,ymm0,ymm26 + vpaddd ymm1,ymm1,ymm28 + vpaddd ymm2,ymm2,ymm20 + vpaddd ymm3,ymm3,ymm29 + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm1,ymm1,ymm6 + vpaddd ymm2,ymm2,ymm7 + vpaddd ymm3,ymm3,ymm4 + vpxord ymm15,ymm15,ymm0 + vpxord ymm12,ymm12,ymm1 + vpxord ymm13,ymm13,ymm2 + vpxord ymm14,ymm14,ymm3 + vprord ymm15,ymm15,0x8 + vprord ymm12,ymm12,0x8 + vprord ymm13,ymm13,0x8 + vprord ymm14,ymm14,0x8 + vpaddd ymm10,ymm10,ymm15 + vpaddd ymm11,ymm11,ymm12 + vpaddd ymm8,ymm8,ymm13 + vpaddd ymm9,ymm9,ymm14 + vpxord ymm5,ymm5,ymm10 + vpxord ymm6,ymm6,ymm11 + vpxord ymm7,ymm7,ymm8 + vpxord ymm4,ymm4,ymm9 + vprord ymm5,ymm5,0x7 + vprord ymm6,ymm6,0x7 + vprord ymm7,ymm7,0x7 + vprord ymm4,ymm4,0x7 + vpxor ymm0,ymm0,ymm8 + vpxor ymm1,ymm1,ymm9 + vpxor ymm2,ymm2,ymm10 + vpxor ymm3,ymm3,ymm11 + vpxor ymm4,ymm4,ymm12 + vpxor ymm5,ymm5,ymm13 + vpxor ymm6,ymm6,ymm14 + vpxor ymm7,ymm7,ymm15 + vpxord ymm8,ymm8,DWORD PTR [rdi]{1to8} + vpxord ymm9,ymm9,DWORD PTR [rdi+0x4]{1to8} + vpxord ymm10,ymm10,DWORD PTR [rdi+0x8]{1to8} + vpxord ymm11,ymm11,DWORD PTR [rdi+0xc]{1to8} + vpxord ymm12,ymm12,DWORD PTR [rdi+0x10]{1to8} + vpxord ymm13,ymm13,DWORD PTR [rdi+0x14]{1to8} + vpxord ymm14,ymm14,DWORD PTR [rdi+0x18]{1to8} + vpxord ymm15,ymm15,DWORD PTR [rdi+0x1c]{1to8} + vpunpckldq ymm16,ymm0,ymm1 + vpunpckhdq ymm17,ymm0,ymm1 + vpunpckldq ymm18,ymm2,ymm3 + vpunpckhdq ymm19,ymm2,ymm3 + vpunpckldq ymm20,ymm4,ymm5 + vpunpckhdq ymm21,ymm4,ymm5 + vpunpckldq ymm22,ymm6,ymm7 + vpunpckhdq ymm23,ymm6,ymm7 + vpunpckldq ymm24,ymm8,ymm9 + vpunpckhdq ymm25,ymm8,ymm9 + vpunpckldq ymm26,ymm10,ymm11 + vpunpckhdq ymm27,ymm10,ymm11 + vpunpckldq ymm28,ymm12,ymm13 + vpunpckhdq ymm29,ymm12,ymm13 + vpunpckldq ymm30,ymm14,ymm15 + vpunpckhdq ymm31,ymm14,ymm15 + vpunpcklqdq ymm0,ymm16,ymm18 + vpunpckhqdq ymm1,ymm16,ymm18 + vpunpcklqdq ymm2,ymm17,ymm19 + vpunpckhqdq ymm3,ymm17,ymm19 + vpunpcklqdq ymm4,ymm20,ymm22 + vpunpckhqdq ymm5,ymm20,ymm22 + vpunpcklqdq ymm6,ymm21,ymm23 + vpunpckhqdq ymm7,ymm21,ymm23 + vpunpcklqdq ymm8,ymm24,ymm26 + vpunpckhqdq ymm9,ymm24,ymm26 + vpunpcklqdq ymm10,ymm25,ymm27 + vpunpckhqdq ymm11,ymm25,ymm27 + vpunpcklqdq ymm12,ymm28,ymm30 + vpunpckhqdq ymm13,ymm28,ymm30 + vpunpcklqdq ymm14,ymm29,ymm31 + vpunpckhqdq ymm15,ymm29,ymm31 + vshufi32x4 ymm16,ymm0,ymm4,0x0 + vshufi32x4 ymm17,ymm8,ymm12,0x0 + vshufi32x4 ymm18,ymm1,ymm5,0x0 + vshufi32x4 ymm19,ymm9,ymm13,0x0 + vshufi32x4 ymm20,ymm2,ymm6,0x0 + vshufi32x4 ymm21,ymm10,ymm14,0x0 + vshufi32x4 ymm22,ymm3,ymm7,0x0 + vshufi32x4 ymm23,ymm11,ymm15,0x0 + vshufi32x4 ymm24,ymm0,ymm4,0x3 + vshufi32x4 ymm25,ymm8,ymm12,0x3 + vshufi32x4 ymm26,ymm1,ymm5,0x3 + vshufi32x4 ymm27,ymm9,ymm13,0x3 + vshufi32x4 ymm28,ymm2,ymm6,0x3 + vshufi32x4 ymm29,ymm10,ymm14,0x3 + vshufi32x4 ymm30,ymm3,ymm7,0x3 + vshufi32x4 ymm31,ymm11,ymm15,0x3 + vmovdqu32 YMMWORD PTR [r9],ymm16 + vmovdqu32 YMMWORD PTR [r9+0x20],ymm17 + vmovdqu32 YMMWORD PTR [r9+0x40],ymm18 + vmovdqu32 YMMWORD PTR [r9+0x60],ymm19 + vmovdqu32 YMMWORD PTR [r9+0x80],ymm20 + vmovdqu32 YMMWORD PTR [r9+0xa0],ymm21 + vmovdqu32 YMMWORD PTR [r9+0xc0],ymm22 + vmovdqu32 YMMWORD PTR [r9+0xe0],ymm23 + vmovdqu32 YMMWORD PTR [r9+0x100],ymm24 + vmovdqu32 YMMWORD PTR [r9+0x120],ymm25 + vmovdqu32 YMMWORD PTR [r9+0x140],ymm26 + vmovdqu32 YMMWORD PTR [r9+0x160],ymm27 + vmovdqu32 YMMWORD PTR [r9+0x180],ymm28 + vmovdqu32 YMMWORD PTR [r9+0x1a0],ymm29 + vmovdqu32 YMMWORD PTR [r9+0x1c0],ymm30 + vmovdqu32 YMMWORD PTR [r9+0x1e0],ymm31 + vmovdqa ymm0,YMMWORD PTR [rsp+0x20] + vmovdqa ymm1,YMMWORD PTR [rsp+0x60] + vmovdqa YMMWORD PTR [rsp],ymm0 + vmovdqa YMMWORD PTR [rsp+0x40],ymm1 + add r9,0x200 + sub r10,0x8 +2: + test r10,0x4 + je 2f + vbroadcasti32x4 zmm0,XMMWORD PTR [rdi] + vbroadcasti32x4 zmm1,XMMWORD PTR [rdi+0x10] + vbroadcasti32x4 zmm2,XMMWORD PTR [BLAKE3_IV+rip] + vmovdqa xmm12,XMMWORD PTR [rsp] + vmovdqa xmm13,XMMWORD PTR [rsp+0x40] + vpunpckldq xmm14,xmm12,xmm13 + vpunpckhdq xmm15,xmm12,xmm13 + vpermq ymm14,ymm14,0xdc + vpermq ymm15,ymm15,0xdc + vpbroadcastd zmm12,edx + vinserti64x4 zmm13,zmm14,ymm15,0x1 + mov eax,0x4444 + kmovw k2,eax + vpblendmd zmm13{k2},zmm13,zmm12 + vpbroadcastd zmm15,r8d + mov eax,0x8888 + kmovw k4,eax + vpblendmd zmm3{k4},zmm13,zmm15 + mov eax,0xaaaa + kmovw k3,eax + vbroadcasti32x4 zmm8,XMMWORD PTR [rsi] + vbroadcasti32x4 zmm9,XMMWORD PTR [rsi+0x10] + vshufps zmm4,zmm8,zmm9,0x88 + vshufps zmm5,zmm8,zmm9,0xdd + vbroadcasti32x4 zmm8,XMMWORD PTR [rsi+0x20] + vbroadcasti32x4 zmm9,XMMWORD PTR [rsi+0x30] + vshufps zmm6,zmm8,zmm9,0x88 + vshufps zmm7,zmm8,zmm9,0xdd + vpshufd zmm6,zmm6,0x93 + vpshufd zmm7,zmm7,0x93 + mov al,0x7 +3: + vpaddd zmm0,zmm0,zmm4 + vpaddd zmm0,zmm0,zmm1 + vpxord zmm3,zmm3,zmm0 + vprord zmm3,zmm3,0x10 + vpaddd zmm2,zmm2,zmm3 + vpxord zmm1,zmm1,zmm2 + vprord zmm1,zmm1,0xc + vpaddd zmm0,zmm0,zmm5 + vpaddd zmm0,zmm0,zmm1 + vpxord zmm3,zmm3,zmm0 + vprord zmm3,zmm3,0x8 + vpaddd zmm2,zmm2,zmm3 + vpxord zmm1,zmm1,zmm2 + vprord zmm1,zmm1,0x7 + vpshufd zmm0,zmm0,0x93 + vpshufd zmm3,zmm3,0x4e + vpshufd zmm2,zmm2,0x39 + vpaddd zmm0,zmm0,zmm6 + vpaddd zmm0,zmm0,zmm1 + vpxord zmm3,zmm3,zmm0 + vprord zmm3,zmm3,0x10 + vpaddd zmm2,zmm2,zmm3 + vpxord zmm1,zmm1,zmm2 + vprord zmm1,zmm1,0xc + vpaddd zmm0,zmm0,zmm7 + vpaddd zmm0,zmm0,zmm1 + vpxord zmm3,zmm3,zmm0 + vprord zmm3,zmm3,0x8 + vpaddd zmm2,zmm2,zmm3 + vpxord zmm1,zmm1,zmm2 + vprord zmm1,zmm1,0x7 + vpshufd zmm0,zmm0,0x39 + vpshufd zmm3,zmm3,0x4e + vpshufd zmm2,zmm2,0x93 + dec al + je 3f + vshufps zmm8,zmm4,zmm5,0xd6 + vpshufd zmm9,zmm4,0xf + vpshufd zmm4,zmm8,0x39 + vshufps zmm8,zmm6,zmm7,0xfa + vpblendmd zmm9{k3},zmm9,zmm8 + vpunpcklqdq zmm8,zmm7,zmm5 + vpblendmd zmm8{k4},zmm8,zmm6 + vpshufd zmm8,zmm8,0x78 + vpunpckhdq zmm5,zmm5,zmm7 + vpunpckldq zmm6,zmm6,zmm5 + vpshufd zmm7,zmm6,0x1e + vmovdqa32 zmm5,zmm9 + vmovdqa32 zmm6,zmm8 + jmp 3b +3: + vpxord zmm0,zmm0,zmm2 + vpxord zmm1,zmm1,zmm3 + vbroadcasti32x4 zmm8,XMMWORD PTR [rdi] + vbroadcasti32x4 zmm9,XMMWORD PTR [rdi+0x10] + vpxord zmm2,zmm2,zmm8 + vpxord zmm3,zmm3,zmm9 + vmovdqu XMMWORD PTR [r9],xmm0 + vmovdqu XMMWORD PTR [r9+0x10],xmm1 + vmovdqu XMMWORD PTR [r9+0x20],xmm2 + vmovdqu XMMWORD PTR [r9+0x30],xmm3 + vextracti128 XMMWORD PTR [r9+0x40],ymm0,0x1 + vextracti128 XMMWORD PTR [r9+0x50],ymm1,0x1 + vextracti128 XMMWORD PTR [r9+0x60],ymm2,0x1 + vextracti128 XMMWORD PTR [r9+0x70],ymm3,0x1 + vextracti32x4 XMMWORD PTR [r9+0x80],zmm0,0x2 + vextracti32x4 XMMWORD PTR [r9+0x90],zmm1,0x2 + vextracti32x4 XMMWORD PTR [r9+0xa0],zmm2,0x2 + vextracti32x4 XMMWORD PTR [r9+0xb0],zmm3,0x2 + vextracti32x4 XMMWORD PTR [r9+0xc0],zmm0,0x3 + vextracti32x4 XMMWORD PTR [r9+0xd0],zmm1,0x3 + vextracti32x4 XMMWORD PTR [r9+0xe0],zmm2,0x3 + vextracti32x4 XMMWORD PTR [r9+0xf0],zmm3,0x3 + vmovdqa xmm0,XMMWORD PTR [rsp+0x10] + vmovdqa xmm1,XMMWORD PTR [rsp+0x50] + vmovdqa XMMWORD PTR [rsp],xmm0 + vmovdqa XMMWORD PTR [rsp+0x40],xmm1 + add r9,0x100 + sub r10,0x4 +2: + test r10,0x2 + je 2f + vbroadcasti128 ymm0,XMMWORD PTR [rdi] + vbroadcasti128 ymm1,XMMWORD PTR [rdi+0x10] + vmovd xmm13,DWORD PTR [rsp] + vpinsrd xmm13,xmm13,DWORD PTR [rsp+0x40],0x1 + vpinsrd xmm13,xmm13,edx,0x2 + vmovd xmm14,DWORD PTR [rsp+0x4] + vpinsrd xmm14,xmm14,DWORD PTR [rsp+0x44],0x1 + vpinsrd xmm14,xmm14,edx,0x2 + vinserti128 ymm13,ymm13,xmm14,0x1 + vbroadcasti128 ymm2,XMMWORD PTR [BLAKE3_IV+rip] + vpbroadcastd ymm8,r8d + vpblendd ymm3,ymm13,ymm8,0x88 + vbroadcasti128 ymm8,XMMWORD PTR [rsi] + vbroadcasti128 ymm9,XMMWORD PTR [rsi+0x10] + vshufps ymm4,ymm8,ymm9,0x88 + vshufps ymm5,ymm8,ymm9,0xdd + vbroadcasti128 ymm8,XMMWORD PTR [rsi+0x20] + vbroadcasti128 ymm9,XMMWORD PTR [rsi+0x30] + vshufps ymm6,ymm8,ymm9,0x88 + vshufps ymm7,ymm8,ymm9,0xdd + vpshufd ymm6,ymm6,0x93 + vpshufd ymm7,ymm7,0x93 + mov al,0x7 +3: + vpaddd ymm0,ymm0,ymm4 + vpaddd ymm0,ymm0,ymm1 + vpxord ymm3,ymm3,ymm0 + vprord ymm3,ymm3,0x10 + vpaddd ymm2,ymm2,ymm3 + vpxord ymm1,ymm1,ymm2 + vprord ymm1,ymm1,0xc + vpaddd ymm0,ymm0,ymm5 + vpaddd ymm0,ymm0,ymm1 + vpxord ymm3,ymm3,ymm0 + vprord ymm3,ymm3,0x8 + vpaddd ymm2,ymm2,ymm3 + vpxord ymm1,ymm1,ymm2 + vprord ymm1,ymm1,0x7 + vpshufd ymm0,ymm0,0x93 + vpshufd ymm3,ymm3,0x4e + vpshufd ymm2,ymm2,0x39 + vpaddd ymm0,ymm0,ymm6 + vpaddd ymm0,ymm0,ymm1 + vpxord ymm3,ymm3,ymm0 + vprord ymm3,ymm3,0x10 + vpaddd ymm2,ymm2,ymm3 + vpxord ymm1,ymm1,ymm2 + vprord ymm1,ymm1,0xc + vpaddd ymm0,ymm0,ymm7 + vpaddd ymm0,ymm0,ymm1 + vpxord ymm3,ymm3,ymm0 + vprord ymm3,ymm3,0x8 + vpaddd ymm2,ymm2,ymm3 + vpxord ymm1,ymm1,ymm2 + vprord ymm1,ymm1,0x7 + vpshufd ymm0,ymm0,0x39 + vpshufd ymm3,ymm3,0x4e + vpshufd ymm2,ymm2,0x93 + dec al + je 3f + vshufps ymm8,ymm4,ymm5,0xd6 + vpshufd ymm9,ymm4,0xf + vpshufd ymm4,ymm8,0x39 + vshufps ymm8,ymm6,ymm7,0xfa + vpblendd ymm9,ymm9,ymm8,0xaa + vpunpcklqdq ymm8,ymm7,ymm5 + vpblendd ymm8,ymm8,ymm6,0x88 + vpshufd ymm8,ymm8,0x78 + vpunpckhdq ymm5,ymm5,ymm7 + vpunpckldq ymm6,ymm6,ymm5 + vpshufd ymm7,ymm6,0x1e + vmovdqa ymm5,ymm9 + vmovdqa ymm6,ymm8 + jmp 3b +3: + vpxor ymm0,ymm0,ymm2 + vpxor ymm1,ymm1,ymm3 + vbroadcasti128 ymm8,XMMWORD PTR [rdi] + vbroadcasti128 ymm9,XMMWORD PTR [rdi+0x10] + vpxor ymm2,ymm2,ymm8 + vpxor ymm3,ymm3,ymm9 + vmovdqu XMMWORD PTR [r9],xmm0 + vmovdqu XMMWORD PTR [r9+0x10],xmm1 + vmovdqu XMMWORD PTR [r9+0x20],xmm2 + vmovdqu XMMWORD PTR [r9+0x30],xmm3 + vextracti128 XMMWORD PTR [r9+0x40],ymm0,0x1 + vextracti128 XMMWORD PTR [r9+0x50],ymm1,0x1 + vextracti128 XMMWORD PTR [r9+0x60],ymm2,0x1 + vextracti128 XMMWORD PTR [r9+0x70],ymm3,0x1 + vmovdqu xmm0,XMMWORD PTR [rsp+0x8] + vmovdqu xmm1,XMMWORD PTR [rsp+0x48] + vmovdqa XMMWORD PTR [rsp],xmm0 + vmovdqa XMMWORD PTR [rsp+0x40],xmm1 + add r9,0x80 + sub r10,0x2 +2: + test r10,0x1 + je 9b + vmovdqu xmm0,XMMWORD PTR [rdi] + vmovdqu xmm1,XMMWORD PTR [rdi+0x10] + vmovd xmm14,DWORD PTR [rsp] + vpinsrd xmm14,xmm14,DWORD PTR [rsp+0x40],0x1 + vpinsrd xmm14,xmm14,edx,0x2 + vmovdqa xmm2,XMMWORD PTR [BLAKE3_IV+rip] + vpinsrd xmm3,xmm14,r8d,0x3 + vmovups xmm8,XMMWORD PTR [rsi] + vmovups xmm9,XMMWORD PTR [rsi+0x10] + vshufps xmm4,xmm8,xmm9,0x88 + vshufps xmm5,xmm8,xmm9,0xdd + vmovups xmm8,XMMWORD PTR [rsi+0x20] + vmovups xmm9,XMMWORD PTR [rsi+0x30] + vshufps xmm6,xmm8,xmm9,0x88 + vshufps xmm7,xmm8,xmm9,0xdd + vpshufd xmm6,xmm6,0x93 + vpshufd xmm7,xmm7,0x93 + mov al,0x7 +3: + vpaddd xmm0,xmm0,xmm4 + vpaddd xmm0,xmm0,xmm1 + vpxord xmm3,xmm3,xmm0 + vprord xmm3,xmm3,0x10 + vpaddd xmm2,xmm2,xmm3 + vpxord xmm1,xmm1,xmm2 + vprord xmm1,xmm1,0xc + vpaddd xmm0,xmm0,xmm5 + vpaddd xmm0,xmm0,xmm1 + vpxord xmm3,xmm3,xmm0 + vprord xmm3,xmm3,0x8 + vpaddd xmm2,xmm2,xmm3 + vpxord xmm1,xmm1,xmm2 + vprord xmm1,xmm1,0x7 + vpshufd xmm0,xmm0,0x93 + vpshufd xmm3,xmm3,0x4e + vpshufd xmm2,xmm2,0x39 + vpaddd xmm0,xmm0,xmm6 + vpaddd xmm0,xmm0,xmm1 + vpxord xmm3,xmm3,xmm0 + vprord xmm3,xmm3,0x10 + vpaddd xmm2,xmm2,xmm3 + vpxord xmm1,xmm1,xmm2 + vprord xmm1,xmm1,0xc + vpaddd xmm0,xmm0,xmm7 + vpaddd xmm0,xmm0,xmm1 + vpxord xmm3,xmm3,xmm0 + vprord xmm3,xmm3,0x8 + vpaddd xmm2,xmm2,xmm3 + vpxord xmm1,xmm1,xmm2 + vprord xmm1,xmm1,0x7 + vpshufd xmm0,xmm0,0x39 + vpshufd xmm3,xmm3,0x4e + vpshufd xmm2,xmm2,0x93 + dec al + je 3f + vshufps xmm8,xmm4,xmm5,0xd6 + vpshufd xmm9,xmm4,0xf + vpshufd xmm4,xmm8,0x39 + vshufps xmm8,xmm6,xmm7,0xfa + vpblendd xmm9,xmm9,xmm8,0xaa + vpunpcklqdq xmm8,xmm7,xmm5 + vpblendd xmm8,xmm8,xmm6,0x88 + vpshufd xmm8,xmm8,0x78 + vpunpckhdq xmm5,xmm5,xmm7 + vpunpckldq xmm6,xmm6,xmm5 + vpshufd xmm7,xmm6,0x1e + vmovdqa xmm5,xmm9 + vmovdqa xmm6,xmm8 + jmp 3b +3: + vpxor xmm0,xmm0,xmm2 + vpxor xmm1,xmm1,xmm3 + vpxor xmm2,xmm2,XMMWORD PTR [rdi] + vpxor xmm3,xmm3,XMMWORD PTR [rdi+0x10] + vmovdqu XMMWORD PTR [r9],xmm0 + vmovdqu XMMWORD PTR [r9+0x10],xmm1 + vmovdqu XMMWORD PTR [r9+0x20],xmm2 + vmovdqu XMMWORD PTR [r9+0x30],xmm3 + jmp 9b + + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 +INDEX0: + .long 0, 1, 2, 3, 16, 17, 18, 19 + .long 8, 9, 10, 11, 24, 25, 26, 27 +INDEX1: + .long 4, 5, 6, 7, 20, 21, 22, 23 + .long 12, 13, 14, 15, 28, 29, 30, 31 +ADD0: + .long 0, 1, 2, 3, 4, 5, 6, 7 + .long 8, 9, 10, 11, 12, 13, 14, 15 +ADD1: .long 1 + +ADD16: .long 16 +BLAKE3_BLOCK_LEN: + .long 64 +.p2align 6 +BLAKE3_IV: +BLAKE3_IV_0: + .long 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A diff --git a/vendor/blake3/blake3_dispatch.c b/vendor/blake3/blake3_dispatch.c new file mode 100644 index 000000000..14dfbbe0c --- /dev/null +++ b/vendor/blake3/blake3_dispatch.c @@ -0,0 +1,332 @@ +#include +#include +#include + +#include "blake3_impl.h" + +#if defined(_MSC_VER) +#include +#endif + +#if defined(IS_X86) +#if defined(_MSC_VER) +#include +#elif defined(__GNUC__) +#include +#else +#undef IS_X86 /* Unimplemented! */ +#endif +#endif + +#if !defined(BLAKE3_ATOMICS) +#if defined(__has_include) +#if __has_include() && !defined(_MSC_VER) +#define BLAKE3_ATOMICS 1 +#else +#define BLAKE3_ATOMICS 0 +#endif /* __has_include() && !defined(_MSC_VER) */ +#else +#define BLAKE3_ATOMICS 0 +#endif /* defined(__has_include) */ +#endif /* BLAKE3_ATOMICS */ + +#if BLAKE3_ATOMICS +#define ATOMIC_INT _Atomic int +#define ATOMIC_LOAD(x) x +#define ATOMIC_STORE(x, y) x = y +#elif defined(_MSC_VER) +#define ATOMIC_INT LONG +#define ATOMIC_LOAD(x) InterlockedOr(&x, 0) +#define ATOMIC_STORE(x, y) InterlockedExchange(&x, y) +#else +#define ATOMIC_INT int +#define ATOMIC_LOAD(x) x +#define ATOMIC_STORE(x, y) x = y +#endif + +#define MAYBE_UNUSED(x) (void)((x)) + +#if defined(IS_X86) +static uint64_t xgetbv(void) { +#if defined(_MSC_VER) + return _xgetbv(0); +#else + uint32_t eax = 0, edx = 0; + __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0)); + return ((uint64_t)edx << 32) | eax; +#endif +} + +static void cpuid(uint32_t out[4], uint32_t id) { +#if defined(_MSC_VER) + __cpuid((int *)out, id); +#elif defined(__i386__) || defined(_M_IX86) + __asm__ __volatile__("movl %%ebx, %1\n" + "cpuid\n" + "xchgl %1, %%ebx\n" + : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id)); +#else + __asm__ __volatile__("cpuid\n" + : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id)); +#endif +} + +static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) { +#if defined(_MSC_VER) + __cpuidex((int *)out, id, sid); +#elif defined(__i386__) || defined(_M_IX86) + __asm__ __volatile__("movl %%ebx, %1\n" + "cpuid\n" + "xchgl %1, %%ebx\n" + : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id), "c"(sid)); +#else + __asm__ __volatile__("cpuid\n" + : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) + : "a"(id), "c"(sid)); +#endif +} + + +enum cpu_feature { + SSE2 = 1 << 0, + SSSE3 = 1 << 1, + SSE41 = 1 << 2, + AVX = 1 << 3, + AVX2 = 1 << 4, + AVX512F = 1 << 5, + AVX512VL = 1 << 6, + /* ... */ + UNDEFINED = 1 << 30 +}; + +#if !defined(BLAKE3_TESTING) +static /* Allow the variable to be controlled manually for testing */ +#endif + ATOMIC_INT g_cpu_features = UNDEFINED; + +#if !defined(BLAKE3_TESTING) +static +#endif + enum cpu_feature + get_cpu_features(void) { + + /* If TSAN detects a data race here, try compiling with -DBLAKE3_ATOMICS=1 */ + enum cpu_feature features = ATOMIC_LOAD(g_cpu_features); + if (features != UNDEFINED) { + return features; + } else { +#if defined(IS_X86) + uint32_t regs[4] = {0}; + uint32_t *eax = ®s[0], *ebx = ®s[1], *ecx = ®s[2], *edx = ®s[3]; + (void)edx; + features = 0; + cpuid(regs, 0); + const int max_id = *eax; + cpuid(regs, 1); +#if defined(__amd64__) || defined(_M_X64) + features |= SSE2; +#else + if (*edx & (1UL << 26)) + features |= SSE2; +#endif + if (*ecx & (1UL << 9)) + features |= SSSE3; + if (*ecx & (1UL << 19)) + features |= SSE41; + + if (*ecx & (1UL << 27)) { // OSXSAVE + const uint64_t mask = xgetbv(); + if ((mask & 6) == 6) { // SSE and AVX states + if (*ecx & (1UL << 28)) + features |= AVX; + if (max_id >= 7) { + cpuidex(regs, 7, 0); + if (*ebx & (1UL << 5)) + features |= AVX2; + if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm + if (*ebx & (1UL << 31)) + features |= AVX512VL; + if (*ebx & (1UL << 16)) + features |= AVX512F; + } + } + } + } + ATOMIC_STORE(g_cpu_features, features); + return features; +#else + /* How to detect NEON? */ + return 0; +#endif + } +} +#endif + +void blake3_compress_in_place(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { +#if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); +#if !defined(BLAKE3_NO_AVX512) + if (features & AVX512VL) { + blake3_compress_in_place_avx512(cv, block, block_len, counter, flags); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE41) + if (features & SSE41) { + blake3_compress_in_place_sse41(cv, block, block_len, counter, flags); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE2) + if (features & SSE2) { + blake3_compress_in_place_sse2(cv, block, block_len, counter, flags); + return; + } +#endif +#endif + blake3_compress_in_place_portable(cv, block, block_len, counter, flags); +} + +void blake3_compress_xof(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[64]) { +#if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); +#if !defined(BLAKE3_NO_AVX512) + if (features & AVX512VL) { + blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE41) + if (features & SSE41) { + blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE2) + if (features & SSE2) { + blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out); + return; + } +#endif +#endif + blake3_compress_xof_portable(cv, block, block_len, counter, flags, out); +} + + +void blake3_xof_many(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[64], size_t outblocks) { + if (outblocks == 0) { + // The current assembly implementation always outputs at least 1 block. + return; + } +#if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); +#if !defined(_WIN32) && !defined(__CYGWIN__) && !defined(BLAKE3_NO_AVX512) + if (features & AVX512VL) { + blake3_xof_many_avx512(cv, block, block_len, counter, flags, out, outblocks); + return; + } +#endif +#endif + for(size_t i = 0; i < outblocks; ++i) { + blake3_compress_xof(cv, block, block_len, counter + i, flags, out + 64*i); + } +} + +void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { +#if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); +#if !defined(BLAKE3_NO_AVX512) + if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { + blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); + return; + } +#endif +#if !defined(BLAKE3_NO_AVX2) + if (features & AVX2) { + blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE41) + if (features & SSE41) { + blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); + return; + } +#endif +#if !defined(BLAKE3_NO_SSE2) + if (features & SSE2) { + blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); + return; + } +#endif +#endif + +#if BLAKE3_USE_NEON == 1 + blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + return; +#endif + + blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, + out); +} + +// The dynamically detected SIMD degree of the current platform. +size_t blake3_simd_degree(void) { +#if defined(IS_X86) + const enum cpu_feature features = get_cpu_features(); + MAYBE_UNUSED(features); +#if !defined(BLAKE3_NO_AVX512) + if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) { + return 16; + } +#endif +#if !defined(BLAKE3_NO_AVX2) + if (features & AVX2) { + return 8; + } +#endif +#if !defined(BLAKE3_NO_SSE41) + if (features & SSE41) { + return 4; + } +#endif +#if !defined(BLAKE3_NO_SSE2) + if (features & SSE2) { + return 4; + } +#endif +#endif +#if BLAKE3_USE_NEON == 1 + return 4; +#endif + return 1; +} diff --git a/vendor/blake3/blake3_impl.h b/vendor/blake3/blake3_impl.h new file mode 100644 index 000000000..cf1689899 --- /dev/null +++ b/vendor/blake3/blake3_impl.h @@ -0,0 +1,167 @@ +#ifndef BLAKE3_IMPL_H +#define BLAKE3_IMPL_H + +#include +#include +#include +#include +#include + +#include "blake3.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// internal flags +enum blake3_flags { + CHUNK_START = 1 << 0, + CHUNK_END = 1 << 1, + PARENT = 1 << 2, + ROOT = 1 << 3, + KEYED_HASH = 1 << 4, + DERIVE_KEY_CONTEXT = 1 << 5, + DERIVE_KEY_MATERIAL = 1 << 6, +}; + +#if defined(_MSC_VER) +#define INLINE static __forceinline +#else +#define INLINE static inline __attribute__((always_inline)) +#endif + +// Use portable implementation only - no SIMD +#define MAX_SIMD_DEGREE 1 +#define MAX_SIMD_DEGREE_OR_2 2 + +static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, + 0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL, + 0x1F83D9ABUL, 0x5BE0CD19UL}; + +static const uint8_t MSG_SCHEDULE[7][16] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}, + {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1}, + {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6}, + {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4}, + {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7}, + {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}, +}; + +/* Find index of the highest set bit */ +static unsigned int highest_one(uint64_t x) { +#if defined(__GNUC__) || defined(__clang__) + return 63 ^ (unsigned int)__builtin_clzll(x); +#else + unsigned int c = 0; + if(x & 0xffffffff00000000ULL) { x >>= 32; c += 32; } + if(x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; } + if(x & 0x000000000000ff00ULL) { x >>= 8; c += 8; } + if(x & 0x00000000000000f0ULL) { x >>= 4; c += 4; } + if(x & 0x000000000000000cULL) { x >>= 2; c += 2; } + if(x & 0x0000000000000002ULL) { c += 1; } + return c; +#endif +} + +INLINE unsigned int popcnt(uint64_t x) { +#if defined(__GNUC__) || defined(__clang__) + return (unsigned int)__builtin_popcountll(x); +#else + unsigned int count = 0; + while (x != 0) { + count += 1; + x &= x - 1; + } + return count; +#endif +} + +INLINE uint64_t round_down_to_power_of_2(uint64_t x) { + return 1ULL << highest_one(x | 1); +} + +INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; } +INLINE uint32_t counter_high(uint64_t counter) { return (uint32_t)(counter >> 32); } + +INLINE uint32_t load32(const void *src) { + const uint8_t *p = (const uint8_t *)src; + return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | + ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); +} + +INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], uint32_t key_words[8]) { + key_words[0] = load32(&key[0 * 4]); + key_words[1] = load32(&key[1 * 4]); + key_words[2] = load32(&key[2 * 4]); + key_words[3] = load32(&key[3 * 4]); + key_words[4] = load32(&key[4 * 4]); + key_words[5] = load32(&key[5 * 4]); + key_words[6] = load32(&key[6 * 4]); + key_words[7] = load32(&key[7 * 4]); +} + +INLINE void store32(void *dst, uint32_t w) { + uint8_t *p = (uint8_t *)dst; + p[0] = (uint8_t)(w >> 0); + p[1] = (uint8_t)(w >> 8); + p[2] = (uint8_t)(w >> 16); + p[3] = (uint8_t)(w >> 24); +} + +INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) { + store32(&bytes_out[0 * 4], cv_words[0]); + store32(&bytes_out[1 * 4], cv_words[1]); + store32(&bytes_out[2 * 4], cv_words[2]); + store32(&bytes_out[3 * 4], cv_words[3]); + store32(&bytes_out[4 * 4], cv_words[4]); + store32(&bytes_out[5 * 4], cv_words[5]); + store32(&bytes_out[6 * 4], cv_words[6]); + store32(&bytes_out[7 * 4], cv_words[7]); +} + +void blake3_compress_in_place(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +void blake3_compress_xof(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[64]); + +void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +void blake3_xof_many(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t *out, size_t outblocks); + +size_t blake3_simd_degree(void); + +// Portable implementation declarations +void blake3_compress_in_place_portable(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +void blake3_compress_xof_portable(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]); + +void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out); + +#ifdef __cplusplus +} +#endif + +#endif /* BLAKE3_IMPL_H */ + diff --git a/vendor/blake3/blake3_neon.c b/vendor/blake3/blake3_neon.c new file mode 100644 index 000000000..794ea80f4 --- /dev/null +++ b/vendor/blake3/blake3_neon.c @@ -0,0 +1,367 @@ +#include "blake3_impl.h" + +#include + +#ifdef __ARM_BIG_ENDIAN +#error "This implementation only supports little-endian ARM." +// It might be that all we need for big-endian support here is to get the loads +// and stores right, but step zero would be finding a way to test it in CI. +#endif + +INLINE uint32x4_t loadu_128(const uint8_t src[16]) { + // vld1q_u32 has alignment requirements. Don't use it. + return vreinterpretq_u32_u8(vld1q_u8(src)); +} + +INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) { + // vst1q_u32 has alignment requirements. Don't use it. + vst1q_u8(dest, vreinterpretq_u8_u32(src)); +} + +INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) { + return vaddq_u32(a, b); +} + +INLINE uint32x4_t xor_128(uint32x4_t a, uint32x4_t b) { + return veorq_u32(a, b); +} + +INLINE uint32x4_t set1_128(uint32_t x) { return vld1q_dup_u32(&x); } + +INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + uint32_t array[4] = {a, b, c, d}; + return vld1q_u32(array); +} + +INLINE uint32x4_t rot16_128(uint32x4_t x) { + // The straightforward implementation would be two shifts and an or, but that's + // slower on microarchitectures we've tested. See + // https://github.com/BLAKE3-team/BLAKE3/pull/319. + // return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16)); + return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x))); +} + +INLINE uint32x4_t rot12_128(uint32x4_t x) { + // See comment in rot16_128. + // return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12)); + return vsriq_n_u32(vshlq_n_u32(x, 32-12), x, 12); +} + +INLINE uint32x4_t rot8_128(uint32x4_t x) { + // See comment in rot16_128. + // return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8)); +#if defined(__clang__) + return vreinterpretq_u32_u8(__builtin_shufflevector(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12)); +#elif __GNUC__ * 10000 + __GNUC_MINOR__ * 100 >=40700 + static const uint8x16_t r8 = {1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12}; + return vreinterpretq_u32_u8(__builtin_shuffle(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), r8)); +#else + return vsriq_n_u32(vshlq_n_u32(x, 32-8), x, 8); +#endif +} + +INLINE uint32x4_t rot7_128(uint32x4_t x) { + // See comment in rot16_128. + // return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7)); + return vsriq_n_u32(vshlq_n_u32(x, 32-7), x, 7); +} + +// TODO: compress_neon + +// TODO: hash2_neon + +/* + * ---------------------------------------------------------------------------- + * hash4_neon + * ---------------------------------------------------------------------------- + */ + +INLINE void round_fn4(uint32x4_t v[16], uint32x4_t m[16], size_t r) { + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]); + v[0] = add_128(v[0], v[4]); + v[1] = add_128(v[1], v[5]); + v[2] = add_128(v[2], v[6]); + v[3] = add_128(v[3], v[7]); + v[12] = xor_128(v[12], v[0]); + v[13] = xor_128(v[13], v[1]); + v[14] = xor_128(v[14], v[2]); + v[15] = xor_128(v[15], v[3]); + v[12] = rot16_128(v[12]); + v[13] = rot16_128(v[13]); + v[14] = rot16_128(v[14]); + v[15] = rot16_128(v[15]); + v[8] = add_128(v[8], v[12]); + v[9] = add_128(v[9], v[13]); + v[10] = add_128(v[10], v[14]); + v[11] = add_128(v[11], v[15]); + v[4] = xor_128(v[4], v[8]); + v[5] = xor_128(v[5], v[9]); + v[6] = xor_128(v[6], v[10]); + v[7] = xor_128(v[7], v[11]); + v[4] = rot12_128(v[4]); + v[5] = rot12_128(v[5]); + v[6] = rot12_128(v[6]); + v[7] = rot12_128(v[7]); + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]); + v[0] = add_128(v[0], v[4]); + v[1] = add_128(v[1], v[5]); + v[2] = add_128(v[2], v[6]); + v[3] = add_128(v[3], v[7]); + v[12] = xor_128(v[12], v[0]); + v[13] = xor_128(v[13], v[1]); + v[14] = xor_128(v[14], v[2]); + v[15] = xor_128(v[15], v[3]); + v[12] = rot8_128(v[12]); + v[13] = rot8_128(v[13]); + v[14] = rot8_128(v[14]); + v[15] = rot8_128(v[15]); + v[8] = add_128(v[8], v[12]); + v[9] = add_128(v[9], v[13]); + v[10] = add_128(v[10], v[14]); + v[11] = add_128(v[11], v[15]); + v[4] = xor_128(v[4], v[8]); + v[5] = xor_128(v[5], v[9]); + v[6] = xor_128(v[6], v[10]); + v[7] = xor_128(v[7], v[11]); + v[4] = rot7_128(v[4]); + v[5] = rot7_128(v[5]); + v[6] = rot7_128(v[6]); + v[7] = rot7_128(v[7]); + + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]); + v[0] = add_128(v[0], v[5]); + v[1] = add_128(v[1], v[6]); + v[2] = add_128(v[2], v[7]); + v[3] = add_128(v[3], v[4]); + v[15] = xor_128(v[15], v[0]); + v[12] = xor_128(v[12], v[1]); + v[13] = xor_128(v[13], v[2]); + v[14] = xor_128(v[14], v[3]); + v[15] = rot16_128(v[15]); + v[12] = rot16_128(v[12]); + v[13] = rot16_128(v[13]); + v[14] = rot16_128(v[14]); + v[10] = add_128(v[10], v[15]); + v[11] = add_128(v[11], v[12]); + v[8] = add_128(v[8], v[13]); + v[9] = add_128(v[9], v[14]); + v[5] = xor_128(v[5], v[10]); + v[6] = xor_128(v[6], v[11]); + v[7] = xor_128(v[7], v[8]); + v[4] = xor_128(v[4], v[9]); + v[5] = rot12_128(v[5]); + v[6] = rot12_128(v[6]); + v[7] = rot12_128(v[7]); + v[4] = rot12_128(v[4]); + v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); + v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]); + v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]); + v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]); + v[0] = add_128(v[0], v[5]); + v[1] = add_128(v[1], v[6]); + v[2] = add_128(v[2], v[7]); + v[3] = add_128(v[3], v[4]); + v[15] = xor_128(v[15], v[0]); + v[12] = xor_128(v[12], v[1]); + v[13] = xor_128(v[13], v[2]); + v[14] = xor_128(v[14], v[3]); + v[15] = rot8_128(v[15]); + v[12] = rot8_128(v[12]); + v[13] = rot8_128(v[13]); + v[14] = rot8_128(v[14]); + v[10] = add_128(v[10], v[15]); + v[11] = add_128(v[11], v[12]); + v[8] = add_128(v[8], v[13]); + v[9] = add_128(v[9], v[14]); + v[5] = xor_128(v[5], v[10]); + v[6] = xor_128(v[6], v[11]); + v[7] = xor_128(v[7], v[8]); + v[4] = xor_128(v[4], v[9]); + v[5] = rot7_128(v[5]); + v[6] = rot7_128(v[6]); + v[7] = rot7_128(v[7]); + v[4] = rot7_128(v[4]); +} + +INLINE void transpose_vecs_128(uint32x4_t vecs[4]) { + // Individually transpose the four 2x2 sub-matrices in each corner. + uint32x4x2_t rows01 = vtrnq_u32(vecs[0], vecs[1]); + uint32x4x2_t rows23 = vtrnq_u32(vecs[2], vecs[3]); + + // Swap the top-right and bottom-left 2x2s (which just got transposed). + vecs[0] = + vcombine_u32(vget_low_u32(rows01.val[0]), vget_low_u32(rows23.val[0])); + vecs[1] = + vcombine_u32(vget_low_u32(rows01.val[1]), vget_low_u32(rows23.val[1])); + vecs[2] = + vcombine_u32(vget_high_u32(rows01.val[0]), vget_high_u32(rows23.val[0])); + vecs[3] = + vcombine_u32(vget_high_u32(rows01.val[1]), vget_high_u32(rows23.val[1])); +} + +INLINE void transpose_msg_vecs4(const uint8_t *const *inputs, + size_t block_offset, uint32x4_t out[16]) { + out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(uint32x4_t)]); + out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(uint32x4_t)]); + out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(uint32x4_t)]); + out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(uint32x4_t)]); + out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(uint32x4_t)]); + out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(uint32x4_t)]); + out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(uint32x4_t)]); + out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(uint32x4_t)]); + out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(uint32x4_t)]); + out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(uint32x4_t)]); + out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(uint32x4_t)]); + out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(uint32x4_t)]); + out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(uint32x4_t)]); + out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(uint32x4_t)]); + out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(uint32x4_t)]); + out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(uint32x4_t)]); + transpose_vecs_128(&out[0]); + transpose_vecs_128(&out[4]); + transpose_vecs_128(&out[8]); + transpose_vecs_128(&out[12]); +} + +INLINE void load_counters4(uint64_t counter, bool increment_counter, + uint32x4_t *out_low, uint32x4_t *out_high) { + uint64_t mask = (increment_counter ? ~0 : 0); + *out_low = set4( + counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)), + counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3))); + *out_high = set4( + counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)), + counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3))); +} + +static void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, + uint8_t *out) { + uint32x4_t h_vecs[8] = { + set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), + set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), + }; + uint32x4_t counter_low_vec, counter_high_vec; + load_counters4(counter, increment_counter, &counter_low_vec, + &counter_high_vec); + uint8_t block_flags = flags | flags_start; + + for (size_t block = 0; block < blocks; block++) { + if (block + 1 == blocks) { + block_flags |= flags_end; + } + uint32x4_t block_len_vec = set1_128(BLAKE3_BLOCK_LEN); + uint32x4_t block_flags_vec = set1_128(block_flags); + uint32x4_t msg_vecs[16]; + transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs); + + uint32x4_t v[16] = { + h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], + h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], + set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]), + counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, + }; + round_fn4(v, msg_vecs, 0); + round_fn4(v, msg_vecs, 1); + round_fn4(v, msg_vecs, 2); + round_fn4(v, msg_vecs, 3); + round_fn4(v, msg_vecs, 4); + round_fn4(v, msg_vecs, 5); + round_fn4(v, msg_vecs, 6); + h_vecs[0] = xor_128(v[0], v[8]); + h_vecs[1] = xor_128(v[1], v[9]); + h_vecs[2] = xor_128(v[2], v[10]); + h_vecs[3] = xor_128(v[3], v[11]); + h_vecs[4] = xor_128(v[4], v[12]); + h_vecs[5] = xor_128(v[5], v[13]); + h_vecs[6] = xor_128(v[6], v[14]); + h_vecs[7] = xor_128(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs_128(&h_vecs[0]); + transpose_vecs_128(&h_vecs[4]); + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. + storeu_128(h_vecs[0], &out[0 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[4], &out[1 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[1], &out[2 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[5], &out[3 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[2], &out[4 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[6], &out[5 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[3], &out[6 * sizeof(uint32x4_t)]); + storeu_128(h_vecs[7], &out[7 * sizeof(uint32x4_t)]); +} + +/* + * ---------------------------------------------------------------------------- + * hash_many_neon + * ---------------------------------------------------------------------------- + */ + +void blake3_compress_in_place_portable(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +INLINE void hash_one_neon(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, uint8_t flags_end, + uint8_t out[BLAKE3_OUT_LEN]) { + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + // TODO: Implement compress_neon. However note that according to + // https://github.com/BLAKE2/BLAKE2/commit/7965d3e6e1b4193438b8d3a656787587d2579227, + // compress_neon might not be any faster than compress_portable. + blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, + block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + memcpy(out, cv, BLAKE3_OUT_LEN); +} + +void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { + while (num_inputs >= 4) { + blake3_hash4_neon(inputs, blocks, key, counter, increment_counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += 4; + } + inputs += 4; + num_inputs -= 4; + out = &out[4 * BLAKE3_OUT_LEN]; + } + while (num_inputs > 0) { + hash_one_neon(inputs[0], blocks, key, counter, flags, flags_start, + flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} diff --git a/vendor/blake3/blake3_portable.c b/vendor/blake3/blake3_portable.c new file mode 100644 index 000000000..1c6288882 --- /dev/null +++ b/vendor/blake3/blake3_portable.c @@ -0,0 +1,160 @@ +#include "blake3_impl.h" + +INLINE uint32_t rotr32(uint32_t w, uint32_t c) { + return (w >> c) | (w << (32 - c)); +} + +INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d, + uint32_t x, uint32_t y) { + state[a] = state[a] + state[b] + x; + state[d] = rotr32(state[d] ^ state[a], 16); + state[c] = state[c] + state[d]; + state[b] = rotr32(state[b] ^ state[c], 12); + state[a] = state[a] + state[b] + y; + state[d] = rotr32(state[d] ^ state[a], 8); + state[c] = state[c] + state[d]; + state[b] = rotr32(state[b] ^ state[c], 7); +} + +INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) { + // Select the message schedule based on the round. + const uint8_t *schedule = MSG_SCHEDULE[round]; + + // Mix the columns. + g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); + g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); + g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); + g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); + + // Mix the diagonals. + g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); + g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); + g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); + g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); +} + +INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags) { + uint32_t block_words[16]; + block_words[0] = load32(block + 4 * 0); + block_words[1] = load32(block + 4 * 1); + block_words[2] = load32(block + 4 * 2); + block_words[3] = load32(block + 4 * 3); + block_words[4] = load32(block + 4 * 4); + block_words[5] = load32(block + 4 * 5); + block_words[6] = load32(block + 4 * 6); + block_words[7] = load32(block + 4 * 7); + block_words[8] = load32(block + 4 * 8); + block_words[9] = load32(block + 4 * 9); + block_words[10] = load32(block + 4 * 10); + block_words[11] = load32(block + 4 * 11); + block_words[12] = load32(block + 4 * 12); + block_words[13] = load32(block + 4 * 13); + block_words[14] = load32(block + 4 * 14); + block_words[15] = load32(block + 4 * 15); + + state[0] = cv[0]; + state[1] = cv[1]; + state[2] = cv[2]; + state[3] = cv[3]; + state[4] = cv[4]; + state[5] = cv[5]; + state[6] = cv[6]; + state[7] = cv[7]; + state[8] = IV[0]; + state[9] = IV[1]; + state[10] = IV[2]; + state[11] = IV[3]; + state[12] = counter_low(counter); + state[13] = counter_high(counter); + state[14] = (uint32_t)block_len; + state[15] = (uint32_t)flags; + + round_fn(state, &block_words[0], 0); + round_fn(state, &block_words[0], 1); + round_fn(state, &block_words[0], 2); + round_fn(state, &block_words[0], 3); + round_fn(state, &block_words[0], 4); + round_fn(state, &block_words[0], 5); + round_fn(state, &block_words[0], 6); +} + +void blake3_compress_in_place_portable(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + uint32_t state[16]; + compress_pre(state, cv, block, block_len, counter, flags); + cv[0] = state[0] ^ state[8]; + cv[1] = state[1] ^ state[9]; + cv[2] = state[2] ^ state[10]; + cv[3] = state[3] ^ state[11]; + cv[4] = state[4] ^ state[12]; + cv[5] = state[5] ^ state[13]; + cv[6] = state[6] ^ state[14]; + cv[7] = state[7] ^ state[15]; +} + +void blake3_compress_xof_portable(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags, uint8_t out[64]) { + uint32_t state[16]; + compress_pre(state, cv, block, block_len, counter, flags); + + store32(&out[0 * 4], state[0] ^ state[8]); + store32(&out[1 * 4], state[1] ^ state[9]); + store32(&out[2 * 4], state[2] ^ state[10]); + store32(&out[3 * 4], state[3] ^ state[11]); + store32(&out[4 * 4], state[4] ^ state[12]); + store32(&out[5 * 4], state[5] ^ state[13]); + store32(&out[6 * 4], state[6] ^ state[14]); + store32(&out[7 * 4], state[7] ^ state[15]); + store32(&out[8 * 4], state[8] ^ cv[0]); + store32(&out[9 * 4], state[9] ^ cv[1]); + store32(&out[10 * 4], state[10] ^ cv[2]); + store32(&out[11 * 4], state[11] ^ cv[3]); + store32(&out[12 * 4], state[12] ^ cv[4]); + store32(&out[13 * 4], state[13] ^ cv[5]); + store32(&out[14 * 4], state[14] ^ cv[6]); + store32(&out[15 * 4], state[15] ^ cv[7]); +} + +INLINE void hash_one_portable(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) { + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, + block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + store_cv_words(out, cv); +} + +void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, + size_t blocks, const uint32_t key[8], + uint64_t counter, bool increment_counter, + uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) { + while (num_inputs > 0) { + hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start, + flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} + diff --git a/vendor/blake3/blake3_sse2_x86-64_unix.S b/vendor/blake3/blake3_sse2_x86-64_unix.S new file mode 100644 index 000000000..99f033fef --- /dev/null +++ b/vendor/blake3/blake3_sse2_x86-64_unix.S @@ -0,0 +1,2291 @@ +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +#endif + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include() +#include +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global blake3_hash_many_sse2 +.global _blake3_hash_many_sse2 +.global blake3_compress_in_place_sse2 +.global _blake3_compress_in_place_sse2 +.global blake3_compress_xof_sse2 +.global _blake3_compress_xof_sse2 +#ifdef __APPLE__ +.text +#else +.section .text +#endif + .p2align 6 +_blake3_hash_many_sse2: +blake3_hash_many_sse2: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 360 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 0x00 + movdqa xmmword ptr [rsp+0x130], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0+rip] + pand xmm0, xmmword ptr [ADD1+rip] + movdqa xmmword ptr [rsp+0x150], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 0x00 + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+0x110], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 0x00 + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + mov rbx, qword ptr [rbp+0x50] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+0x38] + movzx r12d, byte ptr [rbp+0x48] + cmp rsi, 4 + jc 3f +2: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 0x00 + pshufd xmm1, xmm3, 0x55 + pshufd xmm2, xmm3, 0xAA + pshufd xmm3, xmm3, 0xFF + movdqu xmm7, xmmword ptr [rcx+0x10] + pshufd xmm4, xmm7, 0x00 + pshufd xmm5, xmm7, 0x55 + pshufd xmm6, xmm7, 0xAA + pshufd xmm7, xmm7, 0xFF + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +9: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-0x40] + movdqu xmm9, xmmword ptr [r9+rdx-0x40] + movdqu xmm10, xmmword ptr [r10+rdx-0x40] + movdqu xmm11, xmmword ptr [r11+rdx-0x40] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+0x10], xmm9 + movdqa xmmword ptr [rsp+0x20], xmm12 + movdqa xmmword ptr [rsp+0x30], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x30] + movdqu xmm9, xmmword ptr [r9+rdx-0x30] + movdqu xmm10, xmmword ptr [r10+rdx-0x30] + movdqu xmm11, xmmword ptr [r11+rdx-0x30] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x40], xmm8 + movdqa xmmword ptr [rsp+0x50], xmm9 + movdqa xmmword ptr [rsp+0x60], xmm12 + movdqa xmmword ptr [rsp+0x70], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x20] + movdqu xmm9, xmmword ptr [r9+rdx-0x20] + movdqu xmm10, xmmword ptr [r10+rdx-0x20] + movdqu xmm11, xmmword ptr [r11+rdx-0x20] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x80], xmm8 + movdqa xmmword ptr [rsp+0x90], xmm9 + movdqa xmmword ptr [rsp+0xA0], xmm12 + movdqa xmmword ptr [rsp+0xB0], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x10] + movdqu xmm9, xmmword ptr [r9+rdx-0x10] + movdqu xmm10, xmmword ptr [r10+rdx-0x10] + movdqu xmm11, xmmword ptr [r11+rdx-0x10] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0xC0], xmm8 + movdqa xmmword ptr [rsp+0xD0], xmm9 + movdqa xmmword ptr [rsp+0xE0], xmm12 + movdqa xmmword ptr [rsp+0xF0], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] + movdqa xmm12, xmmword ptr [rsp+0x110] + movdqa xmm13, xmmword ptr [rsp+0x120] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + movd xmm15, eax + pshufd xmm15, xmm15, 0x00 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x80] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x70] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xB0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x50] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xC0] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xA0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0x60] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xF0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne 9b + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+0x20], xmm1 + movdqu xmmword ptr [rbx+0x40], xmm9 + movdqu xmmword ptr [rbx+0x60], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+0x10], xmm4 + movdqu xmmword ptr [rbx+0x30], xmm5 + movdqu xmmword ptr [rbx+0x50], xmm9 + movdqu xmmword ptr [rbx+0x70], xmm7 + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+0x150] + movdqa xmmword ptr [rsp+0x110], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+0x120] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+0x120], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc 2b + test rsi, rsi + jnz 3f +4: + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + test esi, 0x2 + je 3f + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+0x110] + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+0x114] + movd xmm13, dword ptr [rsp+0x124] + punpckldq xmm14, xmm13 + movaps xmmword ptr [rsp+0x10], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 0x93 + movups xmm12, xmmword ptr [r9+rdx-0x40] + movups xmm13, xmmword ptr [r9+rdx-0x30] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-0x20] + movups xmm15, xmmword ptr [r9+rdx-0x10] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 0x93 + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 0x93 + shl rax, 0x20 + or rax, 0x40 + movq xmm3, rax + movdqa xmmword ptr [rsp+0x20], xmm3 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+0x10] + punpcklqdq xmm3, xmmword ptr [rsp+0x20] + punpcklqdq xmm11, xmmword ptr [rsp+0x20] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+0x20], xmm4 + movaps xmmword ptr [rsp+0x30], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+0x40], xmm5 + movaps xmmword ptr [rsp+0x50], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x93 + pshufd xmm8, xmm8, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x39 + pshufd xmm10, xmm10, 0x39 + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x39 + pshufd xmm8, xmm8, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x93 + pshufd xmm10, xmm10, 0x93 + dec al + je 9f + movdqa xmm12, xmmword ptr [rsp+0x20] + movdqa xmm5, xmmword ptr [rsp+0x40] + pshufd xmm13, xmm12, 0x0F + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 0x39 + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm13, xmm12 + movdqa xmmword ptr [rsp+0x20], xmm13 + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + movdqa xmm13, xmm6 + pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm12, xmm13 + pshufd xmm12, xmm12, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmmword ptr [rsp+0x40], xmm12 + movdqa xmm5, xmmword ptr [rsp+0x30] + movdqa xmm13, xmmword ptr [rsp+0x50] + pshufd xmm6, xmm5, 0x0F + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 0x39 + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm6, xmm5 + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + movdqa xmmword ptr [rsp+0x30], xmm2 + movdqa xmm2, xmm14 + pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm5, xmm2 + movdqa xmm2, xmmword ptr [rsp+0x30] + pshufd xmm5, xmm5, 0x78 + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 0x1E + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+0x20] + movdqa xmm6, xmmword ptr [rsp+0x40] + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + movups xmmword ptr [rbx+0x20], xmm8 + movups xmmword ptr [rbx+0x30], xmm9 + mov eax, dword ptr [rsp+0x130] + neg eax + mov r10d, dword ptr [rsp+0x110+8*rax] + mov r11d, dword ptr [rsp+0x120+8*rax] + mov dword ptr [rsp+0x110], r10d + mov dword ptr [rsp+0x120], r11d + add rdi, 16 + add rbx, 64 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movd xmm13, dword ptr [rsp+0x110] + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl rax, 32 + or rax, 64 + movq xmm12, rax + movdqa xmm3, xmm13 + punpcklqdq xmm3, xmm12 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.p2align 6 +blake3_compress_in_place_sse2: +_blake3_compress_in_place_sse2: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl r8, 32 + add rdx, r8 + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rdi], xmm0 + movups xmmword ptr [rdi+0x10], xmm1 + ret + +.p2align 6 +blake3_compress_xof_sse2: +_blake3_compress_xof_sse2: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + movdqu xmm4, xmmword ptr [rdi] + movdqu xmm5, xmmword ptr [rdi+0x10] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r9], xmm0 + movups xmmword ptr [r9+0x10], xmm1 + movups xmmword ptr [r9+0x20], xmm2 + movups xmmword ptr [r9+0x30], xmm3 + ret + + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85 + .long 0x3C6EF372, 0xA54FF53A +ADD0: + .long 0, 1, 2, 3 +ADD1: + .long 4, 4, 4, 4 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 64, 64, 64, 64 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +PBLENDW_0x33_MASK: + .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 +PBLENDW_0xCC_MASK: + .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF +PBLENDW_0x3F_MASK: + .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 +PBLENDW_0xC0_MASK: + .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF diff --git a/vendor/blake3/blake3_sse41_x86-64_unix.S b/vendor/blake3/blake3_sse41_x86-64_unix.S new file mode 100644 index 000000000..a3ff64269 --- /dev/null +++ b/vendor/blake3/blake3_sse41_x86-64_unix.S @@ -0,0 +1,2028 @@ +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +#endif + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include() +#include +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global blake3_hash_many_sse41 +.global _blake3_hash_many_sse41 +.global blake3_compress_in_place_sse41 +.global _blake3_compress_in_place_sse41 +.global blake3_compress_xof_sse41 +.global _blake3_compress_xof_sse41 +#ifdef __APPLE__ +.text +#else +.section .text +#endif + .p2align 6 +_blake3_hash_many_sse41: +blake3_hash_many_sse41: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 360 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 0x00 + movdqa xmmword ptr [rsp+0x130], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0+rip] + pand xmm0, xmmword ptr [ADD1+rip] + movdqa xmmword ptr [rsp+0x150], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 0x00 + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+0x110], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 0x00 + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + mov rbx, qword ptr [rbp+0x50] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+0x38] + movzx r12d, byte ptr [rbp+0x48] + cmp rsi, 4 + jc 3f +2: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 0x00 + pshufd xmm1, xmm3, 0x55 + pshufd xmm2, xmm3, 0xAA + pshufd xmm3, xmm3, 0xFF + movdqu xmm7, xmmword ptr [rcx+0x10] + pshufd xmm4, xmm7, 0x00 + pshufd xmm5, xmm7, 0x55 + pshufd xmm6, xmm7, 0xAA + pshufd xmm7, xmm7, 0xFF + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +9: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-0x40] + movdqu xmm9, xmmword ptr [r9+rdx-0x40] + movdqu xmm10, xmmword ptr [r10+rdx-0x40] + movdqu xmm11, xmmword ptr [r11+rdx-0x40] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+0x10], xmm9 + movdqa xmmword ptr [rsp+0x20], xmm12 + movdqa xmmword ptr [rsp+0x30], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x30] + movdqu xmm9, xmmword ptr [r9+rdx-0x30] + movdqu xmm10, xmmword ptr [r10+rdx-0x30] + movdqu xmm11, xmmword ptr [r11+rdx-0x30] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x40], xmm8 + movdqa xmmword ptr [rsp+0x50], xmm9 + movdqa xmmword ptr [rsp+0x60], xmm12 + movdqa xmmword ptr [rsp+0x70], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x20] + movdqu xmm9, xmmword ptr [r9+rdx-0x20] + movdqu xmm10, xmmword ptr [r10+rdx-0x20] + movdqu xmm11, xmmword ptr [r11+rdx-0x20] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x80], xmm8 + movdqa xmmword ptr [rsp+0x90], xmm9 + movdqa xmmword ptr [rsp+0xA0], xmm12 + movdqa xmmword ptr [rsp+0xB0], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x10] + movdqu xmm9, xmmword ptr [r9+rdx-0x10] + movdqu xmm10, xmmword ptr [r10+rdx-0x10] + movdqu xmm11, xmmword ptr [r11+rdx-0x10] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0xC0], xmm8 + movdqa xmmword ptr [rsp+0xD0], xmm9 + movdqa xmmword ptr [rsp+0xE0], xmm12 + movdqa xmmword ptr [rsp+0xF0], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] + movdqa xmm12, xmmword ptr [rsp+0x110] + movdqa xmm13, xmmword ptr [rsp+0x120] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + movd xmm15, eax + pshufd xmm15, xmm15, 0x00 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x80] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x70] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xB0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x50] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xC0] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xA0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0x60] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xF0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne 9b + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+0x20], xmm1 + movdqu xmmword ptr [rbx+0x40], xmm9 + movdqu xmmword ptr [rbx+0x60], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+0x10], xmm4 + movdqu xmmword ptr [rbx+0x30], xmm5 + movdqu xmmword ptr [rbx+0x50], xmm9 + movdqu xmmword ptr [rbx+0x70], xmm7 + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+0x150] + movdqa xmmword ptr [rsp+0x110], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+0x120] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+0x120], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc 2b + test rsi, rsi + jnz 3f +4: + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + test esi, 0x2 + je 3f + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+0x110] + pinsrd xmm13, dword ptr [rsp+0x120], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+0x114] + pinsrd xmm14, dword ptr [rsp+0x124], 1 + pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmmword ptr [rsp+0x10], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 0x93 + movups xmm12, xmmword ptr [r9+rdx-0x40] + movups xmm13, xmmword ptr [r9+rdx-0x30] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-0x20] + movups xmm15, xmmword ptr [r9+rdx-0x10] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 0x93 + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 0x93 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+0x10] + pinsrd xmm3, eax, 3 + pinsrd xmm11, eax, 3 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+0x20], xmm4 + movaps xmmword ptr [rsp+0x30], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm12, xmmword ptr [ROT16+rip] + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+0x40], xmm5 + movaps xmmword ptr [rsp+0x50], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm13, xmmword ptr [ROT8+rip] + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x93 + pshufd xmm8, xmm8, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x39 + pshufd xmm10, xmm10, 0x39 + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x39 + pshufd xmm8, xmm8, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x93 + pshufd xmm10, xmm10, 0x93 + dec al + je 9f + movdqa xmm12, xmmword ptr [rsp+0x20] + movdqa xmm5, xmmword ptr [rsp+0x40] + pshufd xmm13, xmm12, 0x0F + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 0x39 + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pblendw xmm13, xmm12, 0xCC + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + pblendw xmm12, xmm6, 0xC0 + pshufd xmm12, xmm12, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmmword ptr [rsp+0x20], xmm13 + movdqa xmmword ptr [rsp+0x40], xmm12 + movdqa xmm5, xmmword ptr [rsp+0x30] + movdqa xmm13, xmmword ptr [rsp+0x50] + pshufd xmm6, xmm5, 0x0F + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 0x39 + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pblendw xmm6, xmm5, 0xCC + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + pblendw xmm5, xmm14, 0xC0 + pshufd xmm5, xmm5, 0x78 + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 0x1E + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+0x20] + movdqa xmm6, xmmword ptr [rsp+0x40] + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + movups xmmword ptr [rbx+0x20], xmm8 + movups xmmword ptr [rbx+0x30], xmm9 + movdqa xmm0, xmmword ptr [rsp+0x130] + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm2, xmmword ptr [rsp+0x120] + movdqu xmm3, xmmword ptr [rsp+0x118] + movdqu xmm4, xmmword ptr [rsp+0x128] + blendvps xmm1, xmm3, xmm0 + blendvps xmm2, xmm4, xmm0 + movdqa xmmword ptr [rsp+0x110], xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + add rdi, 16 + add rbx, 64 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movd xmm13, dword ptr [rsp+0x110] + pinsrd xmm13, dword ptr [rsp+0x120], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm3, xmm13 + pinsrd xmm3, eax, 3 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.p2align 6 +blake3_compress_in_place_sse41: +_blake3_compress_in_place_sse41: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl r8, 32 + add rdx, r8 + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rdi], xmm0 + movups xmmword ptr [rdi+0x10], xmm1 + ret + +.p2align 6 +blake3_compress_xof_sse41: +_blake3_compress_xof_sse41: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + movdqu xmm4, xmmword ptr [rdi] + movdqu xmm5, xmmword ptr [rdi+0x10] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r9], xmm0 + movups xmmword ptr [r9+0x10], xmm1 + movups xmmword ptr [r9+0x20], xmm2 + movups xmmword ptr [r9+0x30], xmm3 + ret + + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85 + .long 0x3C6EF372, 0xA54FF53A +ROT16: + .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +ROT8: + .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 +ADD0: + .long 0, 1, 2, 3 +ADD1: + .long 4, 4, 4, 4 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 64, 64, 64, 64 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 diff --git a/vendor/gopt/gopt.wake b/vendor/gopt/gopt.wake index 075b0cce8..52262434b 100644 --- a/vendor/gopt/gopt.wake +++ b/vendor/gopt/gopt.wake @@ -18,4 +18,4 @@ package build_wake from wake import _ target gopt variant = - vendor @here variant "10.0" `.*\.h` `gopt(|-errors|-arg).c` + vendor @here variant "10.0" `.*\.h` `gopt(|-errors|-arg).c` Nil diff --git a/vendor/siphash/siphash.wake b/vendor/siphash/siphash.wake index 69799e6cd..0180ac134 100644 --- a/vendor/siphash/siphash.wake +++ b/vendor/siphash/siphash.wake @@ -18,4 +18,4 @@ package build_wake from wake import _ target siphash variant = - vendor @here variant "1.0" `.*\.h` `.*\.c` + vendor @here variant "1.0" `.*\.h` `.*\.c` Nil diff --git a/vendor/utf8proc/utf8proc.wake b/vendor/utf8proc/utf8proc.wake index ee71ea026..44379e5e6 100644 --- a/vendor/utf8proc/utf8proc.wake +++ b/vendor/utf8proc/utf8proc.wake @@ -21,4 +21,4 @@ from gcc_wake import _ # Prefer to use system utf8proc if available def utf8proc variant = match (pkgConfig "utf8proc") Some x -> Pass x - None -> vendor @here variant "2.2.0" `.*\.h|utf8proc_data\.c` `utf8proc\.c` + None -> vendor @here variant "2.2.0" `.*\.h|utf8proc_data\.c` `utf8proc\.c` Nil diff --git a/vendor/vendor.wake b/vendor/vendor.wake index 0665c8281..bd95eafa6 100644 --- a/vendor/vendor.wake +++ b/vendor/vendor.wake @@ -24,13 +24,13 @@ target vendorBuildDir Unit = def vendorDir = @here -def vendor (dir: String) (variant: Pair String String) (version: String) (headers: RegExp) (files: RegExp): Result SysLib Error = +def vendor (dir: String) (variant: Pair String String) (version: String) (headers: RegExp) (files: RegExp) (extraFlags: List String): Result SysLib Error = def Pair _ cVariant = variant require Pass headerPaths = sources dir headers require Pass sourcePaths = sources dir files - def compile = compileC cVariant Nil headerPaths + def compile = compileC cVariant extraFlags headerPaths require Pass objects = map compile sourcePaths diff --git a/vendor/whereami/whereami.wake b/vendor/whereami/whereami.wake index 679d6137c..33fa3c074 100644 --- a/vendor/whereami/whereami.wake +++ b/vendor/whereami/whereami.wake @@ -18,4 +18,4 @@ package build_wake from wake import _ target whereami variant = - vendor @here variant "1.0" `.*\.h` `.*\.c` + vendor @here variant "1.0" `.*\.h` `.*\.c` Nil