From 17ab915fd36ecbbce3e5bb5ee0ab94829147fb77 Mon Sep 17 00:00:00 2001 From: bdenhollander <44237618+bdenhollander@users.noreply.github.com> Date: Fri, 10 Nov 2023 17:51:57 -0500 Subject: [PATCH] Use VkFFT from OpenMM 8.1 OpenMM 8.1 includes VkFFT.h version 1.2.33, which has some HIP speed improvements. --- CMakeLists.txt | 2 +- platforms/hip/include/vkFFT.h | 39618 -------------------------------- 2 files changed, 1 insertion(+), 39619 deletions(-) delete mode 100644 platforms/hip/include/vkFFT.h diff --git a/CMakeLists.txt b/CMakeLists.txt index bbb913e..35ebdd1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -79,7 +79,7 @@ IF(NOT EXECUTABLE_OUTPUT_PATH) ENDIF(NOT EXECUTABLE_OUTPUT_PATH) # Include OpenMM utiliity libraries -SET(OPENMM_LIB_SUBDIRS libraries/jama libraries/quern libraries/hilbert libraries/csha1 libraries/lbfgs libraries/pocketfft) +SET(OPENMM_LIB_SUBDIRS libraries/jama libraries/quern libraries/hilbert libraries/csha1 libraries/lbfgs libraries/pocketfft libraries/vkfft) FOREACH(subdir ${OPENMM_LIB_SUBDIRS}) INCLUDE_DIRECTORIES(BEFORE ${OPENMM_SOURCE_DIR}/${subdir}/include) ENDFOREACH(subdir) diff --git a/platforms/hip/include/vkFFT.h b/platforms/hip/include/vkFFT.h deleted file mode 100644 index 6c8bb69..0000000 --- a/platforms/hip/include/vkFFT.h +++ /dev/null @@ -1,39618 +0,0 @@ -// This file is part of VkFFT, a Vulkan Fast Fourier Transform library -// -// Copyright (C) 2020 - present Dmitrii Tolmachev -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// 
all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -#ifndef VKFFT_H -#define VKFFT_H - -#include -#include -#include -#include -#include -#include -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif -#include -#if(VKFFT_BACKEND==0) -#include "vulkan/vulkan.h" -#include "glslang_c_interface.h" -#elif(VKFFT_BACKEND==1) -#include -#include -#include -#include -#include -#elif(VKFFT_BACKEND==2) -#include -#include -#include -#include -#elif(VKFFT_BACKEND==3) -#ifndef CL_USE_DEPRECATED_OPENCL_1_2_APIS -#define CL_USE_DEPRECATED_OPENCL_1_2_APIS -#endif -#ifdef __APPLE__ -#include -#else -#include -#endif -#elif(VKFFT_BACKEND==4) -#include -#endif - -typedef struct { - //WHDCN layout - - //required parameters: - uint64_t FFTdim; //FFT dimensionality (1, 2 or 3) - uint64_t size[3]; // WHD -system dimensions - -#if(VKFFT_BACKEND==0) - VkPhysicalDevice* physicalDevice;//pointer to Vulkan physical device, obtained from vkEnumeratePhysicalDevices - VkDevice* device;//pointer to Vulkan device, created with vkCreateDevice - VkQueue* queue;//pointer to Vulkan queue, created with vkGetDeviceQueue - VkCommandPool* commandPool;//pointer to Vulkan command pool, created with vkCreateCommandPool - VkFence* fence;//pointer to Vulkan fence, created with vkCreateFence - uint64_t isCompilerInitialized;//specify if glslang compiler has been intialized before (0 - off, 1 - on). 
Default 0 -#elif(VKFFT_BACKEND==1) - CUdevice* device;//pointer to CUDA device, obtained from cuDeviceGet - //CUcontext* context;//pointer to CUDA context, obtained from cuDeviceGet - cudaStream_t* stream;//pointer to streams (can be more than 1), where to execute the kernels - uint64_t num_streams;//try to submit CUDA kernels in multiple streams for asynchronous execution. Default 1 -#elif(VKFFT_BACKEND==2) - hipDevice_t* device;//pointer to HIP device, obtained from hipDeviceGet - //hipCtx_t* context;//pointer to HIP context, obtained from hipDeviceGet - hipStream_t* stream;//pointer to streams (can be more than 1), where to execute the kernels - uint64_t num_streams;//try to submit HIP kernels in multiple streams for asynchronous execution. Default 1 -#elif(VKFFT_BACKEND==3) - cl_platform_id* platform;//not required - cl_device_id* device; - cl_context* context; -#elif(VKFFT_BACKEND==4) - ze_device_handle_t* device; - ze_context_handle_t* context; - ze_command_queue_handle_t* commandQueue; - uint32_t commandQueueID; -#endif - - //data parameters: - uint64_t userTempBuffer; //buffer allocated by app automatically if needed to reorder Four step algorithm. Setting to non zero value enables manual user allocation (0 - off, 1 - on) - - uint64_t bufferNum;//multiple buffer sequence storage is Vulkan only. Default 1 - uint64_t tempBufferNum;//multiple buffer sequence storage is Vulkan only. Default 1, buffer allocated by app automatically if needed to reorder Four step algorithm. Setting to non zero value enables manual user allocation - uint64_t inputBufferNum;//multiple buffer sequence storage is Vulkan only. Default 1, if isInputFormatted is enabled - uint64_t outputBufferNum;//multiple buffer sequence storage is Vulkan only. Default 1, if isOutputFormatted is enabled - uint64_t kernelNum;//multiple buffer sequence storage is Vulkan only. 
Default 1, if performConvolution is enabled - - //sizes are obligatory in Vulkan backend, optional in others - uint64_t* bufferSize;//array of buffers sizes in bytes - uint64_t* tempBufferSize;//array of temp buffers sizes in bytes. Default set to bufferSize sum, buffer allocated by app automatically if needed to reorder Four step algorithm. Setting to non zero value enables manual user allocation - uint64_t* inputBufferSize;//array of input buffers sizes in bytes, if isInputFormatted is enabled - uint64_t* outputBufferSize;//array of output buffers sizes in bytes, if isOutputFormatted is enabled - uint64_t* kernelSize;//array of kernel buffers sizes in bytes, if performConvolution is enabled - -#if(VKFFT_BACKEND==0) - VkBuffer* buffer;//pointer to array of buffers (or one buffer) used for computations - VkBuffer* tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same sum size or bigger as buffer (can be split in multiple). Default 0. Setting to non zero value enables manual user allocation - VkBuffer* inputBuffer;//pointer to array of input buffers (or one buffer) used to read data from if isInputFormatted is enabled - VkBuffer* outputBuffer;//pointer to array of output buffers (or one buffer) used for write data to if isOutputFormatted is enabled - VkBuffer* kernel;//pointer to array of kernel buffers (or one buffer) used for read kernel data from if performConvolution is enabled -#elif(VKFFT_BACKEND==1) - void** buffer;//pointer to device buffer used for computations - void** tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same size as buffer. Default 0. 
Setting to non zero value enables manual user allocation - void** inputBuffer;//pointer to device buffer used to read data from if isInputFormatted is enabled - void** outputBuffer;//pointer to device buffer used to read data from if isOutputFormatted is enabled - void** kernel;//pointer to device buffer used to read kernel data from if performConvolution is enabled -#elif(VKFFT_BACKEND==2) - void** buffer;//pointer to device buffer used for computations - void** tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same size as buffer. Default 0. Setting to non zero value enables manual user allocation - void** inputBuffer;//pointer to device buffer used to read data from if isInputFormatted is enabled - void** outputBuffer;//pointer to device buffer used to read data from if isOutputFormatted is enabled - void** kernel;//pointer to device buffer used to read kernel data from if performConvolution is enabled -#elif(VKFFT_BACKEND==3) - cl_mem* buffer;//pointer to device buffer used for computations - cl_mem* tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same size as buffer. Default 0. Setting to non zero value enables manual user allocation - cl_mem* inputBuffer;//pointer to device buffer used to read data from if isInputFormatted is enabled - cl_mem* outputBuffer;//pointer to device buffer used to read data from if isOutputFormatted is enabled - cl_mem* kernel;//pointer to device buffer used to read kernel data from if performConvolution is enabled -#elif(VKFFT_BACKEND==4) - void** buffer;//pointer to device buffer used for computations - void** tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same size as buffer. Default 0. 
Setting to non zero value enables manual user allocation - void** inputBuffer;//pointer to device buffer used to read data from if isInputFormatted is enabled - void** outputBuffer;//pointer to device buffer used to read data from if isOutputFormatted is enabled - void** kernel;//pointer to device buffer used to read kernel data from if performConvolution is enabled -#endif - uint64_t bufferOffset;//specify if VkFFT has to offset the first element position inside the buffer. In bytes. Default 0 - uint64_t tempBufferOffset;//specify if VkFFT has to offset the first element position inside the temp buffer. In bytes. Default 0 - uint64_t inputBufferOffset;//specify if VkFFT has to offset the first element position inside the input buffer. In bytes. Default 0 - uint64_t outputBufferOffset;//specify if VkFFT has to offset the first element position inside the output buffer. In bytes. Default 0 - uint64_t kernelOffset;//specify if VkFFT has to offset the first element position inside the kernel. In bytes. Default 0 - uint64_t specifyOffsetsAtLaunch;//specify if offsets will be selected with launch parameters VkFFTLaunchParams (0 - off, 1 - on). Default 0 - - //optional: (default 0 if not stated otherwise) - uint64_t coalescedMemory;//in bytes, for Nvidia and AMD is equal to 32, Intel is equal 64, scaled for half precision. Gonna work regardles, but if specified by user correctly, the performance will be higher. - uint64_t aimThreads;//aim at this many threads per block. Default 128 - uint64_t numSharedBanks;//how many banks shared memory has. Default 32 - uint64_t inverseReturnToInputBuffer;//return data to the input buffer in inverse transform (0 - off, 1 - on). isInputFormatted must be enabled - uint64_t numberBatches;// N - used to perform multiple batches of initial data. Default 1 - uint64_t useUint64;// use 64-bit addressing mode in generated kernels - uint64_t omitDimension[3];//disable FFT for this dimension (0 - FFT enabled, 1 - FFT disabled). Default 0. 
Doesn't work for R2C dimension 0 for now. Doesn't work with convolutions. - uint64_t performBandwidthBoost;//try to reduce coalsesced number by a factor of X to get bigger sequence in one upload for strided axes. Default: -1 for DCT, 2 for Bluestein's algorithm (or -1 if DCT), 0 otherwise - - uint64_t doublePrecision; //perform calculations in double precision (0 - off, 1 - on). - uint64_t halfPrecision; //perform calculations in half precision (0 - off, 1 - on) - uint64_t halfPrecisionMemoryOnly; //use half precision only as input/output buffer. Input/Output have to be allocated as half, buffer/tempBuffer have to be allocated as float (out of place mode only). Specify isInputFormatted and isOutputFormatted to use (0 - off, 1 - on) - uint64_t doublePrecisionFloatMemory; //use FP64 precision for all calculations, while all memory storage is done in FP32. - - uint64_t performR2C; //perform R2C/C2R decomposition (0 - off, 1 - on) - uint64_t performDCT; //perform DCT transformation (X - DCT type, 1-4) - uint64_t disableMergeSequencesR2C; //disable merging of two real sequences to reduce calculations (0 - off, 1 - on) - uint64_t normalize; //normalize inverse transform (0 - off, 1 - on) - uint64_t disableReorderFourStep; // disables unshuffling of Four step algorithm. Requires tempbuffer allocation (0 - off, 1 - on) - uint64_t useLUT; //switches from calculating sincos to using precomputed LUT tables (0 - off, 1 - on). Configured by initialization routine - uint64_t makeForwardPlanOnly; //generate code only for forward FFT (0 - off, 1 - on) - uint64_t makeInversePlanOnly; //generate code only for inverse FFT (0 - off, 1 - on) - - uint64_t bufferStride[3];//buffer strides - default set to x - x*y - x*y*z values - uint64_t isInputFormatted; //specify if input buffer is padded - 0 - padded, 1 - not padded. 
For example if it is not padded for R2C if out-of-place mode is selected (only if numberBatches==1 and numberKernels==1) - uint64_t isOutputFormatted; //specify if output buffer is padded - 0 - padded, 1 - not padded. For example if it is not padded for R2C if out-of-place mode is selected (only if numberBatches==1 and numberKernels==1) - uint64_t inputBufferStride[3];//input buffer strides. Used if isInputFormatted is enabled. Default set to bufferStride values - uint64_t outputBufferStride[3];//output buffer strides. Used if isInputFormatted is enabled. Default set to bufferStride values - - uint64_t considerAllAxesStrided;//will create plan for nonstrided axis similar as a strided axis - used with disableReorderFourStep to get the same layout for Bluestein kernel (0 - off, 1 - on) - uint64_t keepShaderCode;//will keep shader code and print all executed shaders during the plan execution in order (0 - off, 1 - on) - uint64_t printMemoryLayout;//will print order of buffers used in shaders (0 - off, 1 - on) - - uint64_t saveApplicationToString;//will save all compiled binaries to VkFFTApplication.saveApplicationString (will be allocated by VkFFT, deallocated with deleteVkFFT call). (0 - off, 1 - on) - - uint64_t loadApplicationFromString;//will load all binaries from loadApplicationString instead of recompiling them (must be allocated by user, must contain what saveApplicationToString call generated previously in VkFFTApplication.saveApplicationString). (0 - off, 1 - on). Mutually exclusive with saveApplicationToString - void* loadApplicationString;//memory binary array through which user can load VkFFT binaries, must be provided by user if loadApplicationFromString = 1. Use rb/wb flags to load/save. - - uint64_t disableSetLocale;//disables all VkFFT attempts to set locale to C - user must ensure that VkFFT has C locale during the plan initialization. This option is needed for multithreading. Default 0. 
- - //optional Bluestein optimizations: (default 0 if not stated otherwise) - uint64_t fixMaxRadixBluestein;//controls the padding of sequences in Bluestein convolution. If specified, padded sequence will be made of up to fixMaxRadixBluestein primes. Default: 2 for CUDA and Vulkan/OpenCL/HIP up to 1048576 combined dimension FFT system, 7 for Vulkan/OpenCL/HIP past after. Min = 2, Max = 13. - uint64_t forceBluesteinSequenceSize;// force the sequence size to pad to in Bluestein's algorithm. Must be at least 2*N-1 and decomposable with primes 2-13. - uint64_t useCustomBluesteinPaddingPattern;// force the sequence sizes to pad to in Bluestein's algorithm, but on a range. This number specifies the number of elements in primeSizes and in paddedSizes arrays. primeSizes - array of non-decomposable as radix scheme sizes - 17, 23, 31 etc. - // paddedSizes - array of lengths to pad to. paddedSizes[i] will be the padding size for all non-decomposable sequences from primeSizes[i] to primeSizes[i+1] (will use default scheme after last one) - 42, 60, 64 for primeSizes before and 37+ will use default scheme (for example). Default is vendor and API-based specified in autoCustomBluesteinPaddingPattern. - uint64_t* primeSizes; // described in useCustomBluesteinPaddingPattern - uint64_t* paddedSizes; // described in useCustomBluesteinPaddingPattern - - uint64_t fixMinRaderPrimeMult;//start direct multiplication Rader's algorithm for radix primes from this number. This means that VkFFT will inline custom Rader kernels if sequence is divisible by these primes. Default is 17, as VkFFT has kernels for 2-13. If you make it less than 13, VkFFT will switch from these kernels to Rader. - uint64_t fixMaxRaderPrimeMult;//switch from Mult Rader's algorithm for radix primes from this number. Current limitation for Rader is maxThreadNum/2+1, realistically you would want to switch somewhere on 30-100 range. 
Default is vendor-specific (currently ~40) - - uint64_t fixMinRaderPrimeFFT;//start FFT convolution version of Rader for radix primes from this number. Better than direct multiplication version for almost all primes (except small ones, like 17-23 on some GPUs). Must be bigger or equal to fixMinRaderPrimeMult. Deafult 29 on AMD and 17 on other GPUs. - uint64_t fixMaxRaderPrimeFFT;//switch to Bluestein's algorithm for radix primes from this number. Switch may happen earlier if prime can't fit in shared memory. Default is 16384, which is bigger than most current GPU's shared memory. - - //optional zero padding control parameters: (default 0 if not stated otherwise) - uint64_t performZeropadding[3]; // don't read some data/perform computations if some input sequences are zeropadded for each axis (0 - off, 1 - on) - uint64_t fft_zeropad_left[3];//specify start boundary of zero block in the system for each axis - uint64_t fft_zeropad_right[3];//specify end boundary of zero block in the system for each axis - uint64_t frequencyZeroPadding; //set to 1 if zeropadding of frequency domain, default 0 - spatial zeropadding - - //optional convolution control parameters: (default 0 if not stated otherwise) - uint64_t performConvolution; //perform convolution in this application (0 - off, 1 - on). Disables reorderFourStep parameter - uint64_t conjugateConvolution;//0 off, 1 - conjugation of the sequence FFT is currently done on, 2 - conjugation of the convolution kernel - uint64_t crossPowerSpectrumNormalization;//normalize the FFT x kernel multiplication in frequency domain - uint64_t coordinateFeatures; // C - coordinate, or dimension of features vector. In matrix convolution - size of vector - uint64_t matrixConvolution; //if equal to 2 perform 2x2, if equal to 3 perform 3x3 matrix-vector convolution. 
Overrides coordinateFeatures - uint64_t symmetricKernel; //specify if kernel in 2x2 or 3x3 matrix convolution is symmetric - uint64_t numberKernels;// N - only used in convolution step - specify how many kernels were initialized before. Expands one input to multiple (batched) output - uint64_t kernelConvolution;// specify if this application is used to create kernel for convolution, so it has the same properties. performConvolution has to be set to 0 for kernel creation - - //register overutilization (experimental): (default 0 if not stated otherwise) - uint64_t registerBoost; //specify if register file size is bigger than shared memory and can be used to extend it X times (on Nvidia 256KB register file can be used instead of 32KB of shared memory, set this constant to 4 to emulate 128KB of shared memory). Default 1 - uint64_t registerBoostNonPow2; //specify if register overutilization should be used on non power of 2 sequences (0 - off, 1 - on) - uint64_t registerBoost4Step; //specify if register file overutilization should be used in big sequences (>2^14), same definition as registerBoost. Default 1 - - //not used techniques: - uint64_t swapTo3Stage4Step; //specify at which power of 2 to switch from 2 upload to 3 upload 4-step FFT, in case if making max sequence size lower than coalesced sequence helps to combat TLB misses. Default 0 - disabled. Must be at least 17 - uint64_t devicePageSize;//in KB, the size of a page on the GPU. 
Setting to 0 disables local buffer split in pages - uint64_t localPageSize;//in KB, the size to split page into if sequence spans multiple devicePageSize pages - - //automatically filled based on device info (still can be reconfigured by user): - uint64_t computeCapabilityMajor; // CUDA/HIP compute capability of the device - uint64_t computeCapabilityMinor; // CUDA/HIP compute capability of the device - uint64_t maxComputeWorkGroupCount[3]; // maxComputeWorkGroupCount from VkPhysicalDeviceLimits - uint64_t maxComputeWorkGroupSize[3]; // maxComputeWorkGroupCount from VkPhysicalDeviceLimits - uint64_t maxThreadsNum; //max number of threads from VkPhysicalDeviceLimits - uint64_t sharedMemorySizeStatic; //available for static allocation shared memory size, in bytes - uint64_t sharedMemorySize; //available for allocation shared memory size, in bytes - uint64_t sharedMemorySizePow2; //power of 2 which is less or equal to sharedMemorySize, in bytes - uint64_t warpSize; //number of threads per warp/wavefront. - uint64_t halfThreads;//Intel fix - uint64_t allocateTempBuffer; //buffer allocated by app automatically if needed to reorder Four step algorithm. Parameter to check if it has been allocated - uint64_t reorderFourStep; // unshuffle Four step algorithm. Requires tempbuffer allocation (0 - off, 1 - on). Default 1. - int64_t maxCodeLength; //specify how big can be buffer used for code generation (in char). Default 4000000 chars. - int64_t maxTempLength; //specify how big can be buffer used for intermediate string sprintfs be (in char). Default 5000 chars. If code segfaults for some reason - try increasing this number. - uint64_t autoCustomBluesteinPaddingPattern; // default value for useCustomBluesteinPaddingPattern - uint64_t useRaderUintLUT; // allocate additional LUT to store g_pow - uint64_t vendorID; // vendorID 0x10DE - NVIDIA, 0x8086 - Intel, 0x1002 - AMD, etc. 
-#if(VKFFT_BACKEND==0) - VkDeviceMemory tempBufferDeviceMemory;//Filled at app creation - VkCommandBuffer* commandBuffer;//Filled at app execution - VkMemoryBarrier* memory_barrier;//Filled at app creation -#elif(VKFFT_BACKEND==1) - cudaEvent_t* stream_event;//Filled at app creation - uint64_t streamCounter;//Filled at app creation - uint64_t streamID;//Filled at app creation -#elif(VKFFT_BACKEND==2) - hipEvent_t* stream_event;//Filled at app creation - uint64_t streamCounter;//Filled at app creation - uint64_t streamID;//Filled at app creation -#elif(VKFFT_BACKEND==3) - cl_command_queue* commandQueue; -#elif(VKFFT_BACKEND==4) - ze_command_list_handle_t* commandList;//Filled at app execution -#endif -} VkFFTConfiguration;//parameters specified at plan creation - -typedef struct { -#if(VKFFT_BACKEND==0) - VkCommandBuffer* commandBuffer;//commandBuffer to which FFT is appended - - VkBuffer* buffer;//pointer to array of buffers (or one buffer) used for computations - VkBuffer* tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same sum size or bigger as buffer (can be split in multiple). Default 0. Setting to non zero value enables manual user allocation - VkBuffer* inputBuffer;//pointer to array of input buffers (or one buffer) used to read data from if isInputFormatted is enabled - VkBuffer* outputBuffer;//pointer to array of output buffers (or one buffer) used for write data to if isOutputFormatted is enabled - VkBuffer* kernel;//pointer to array of kernel buffers (or one buffer) used for read kernel data from if performConvolution is enabled -#elif(VKFFT_BACKEND==1) - void** buffer;//pointer to device buffer used for computations - void** tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same size as buffer. Default 0. 
Setting to non zero value enables manual user allocation - void** inputBuffer;//pointer to device buffer used to read data from if isInputFormatted is enabled - void** outputBuffer;//pointer to device buffer used to read data from if isOutputFormatted is enabled - void** kernel;//pointer to device buffer used to read kernel data from if performConvolution is enabled -#elif(VKFFT_BACKEND==2) - void** buffer;//pointer to device buffer used for computations - void** tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same size as buffer. Default 0. Setting to non zero value enables manual user allocation - void** inputBuffer;//pointer to device buffer used to read data from if isInputFormatted is enabled - void** outputBuffer;//pointer to device buffer used to read data from if isOutputFormatted is enabled - void** kernel;//pointer to device buffer used to read kernel data from if performConvolution is enabled -#elif(VKFFT_BACKEND==3) - cl_command_queue* commandQueue;//commandBuffer to which FFT is appended - - cl_mem* buffer;//pointer to device buffer used for computations - cl_mem* tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same size as buffer. Default 0. Setting to non zero value enables manual user allocation - cl_mem* inputBuffer;//pointer to device buffer used to read data from if isInputFormatted is enabled - cl_mem* outputBuffer;//pointer to device buffer used to read data from if isOutputFormatted is enabled - cl_mem* kernel;//pointer to device buffer used to read kernel data from if performConvolution is enabled -#elif(VKFFT_BACKEND==4) - ze_command_list_handle_t* commandList;//commandList to which FFT is appended - - void** buffer;//pointer to device buffer used for computations - void** tempBuffer;//needed if reorderFourStep is enabled to transpose the array. Same size as buffer. Default 0. 
Setting to non zero value enables manual user allocation - void** inputBuffer;//pointer to device buffer used to read data from if isInputFormatted is enabled - void** outputBuffer;//pointer to device buffer used to read data from if isOutputFormatted is enabled - void** kernel;//pointer to device buffer used to read kernel data from if performConvolution is enabled -#endif - //following parameters can be specified during kernels launch, if specifyOffsetsAtLaunch parameter was enabled during the initializeVkFFT call - uint64_t bufferOffset;//specify if VkFFT has to offset the first element position inside the buffer. In bytes. Default 0 - uint64_t tempBufferOffset;//specify if VkFFT has to offset the first element position inside the temp buffer. In bytes. Default 0 - uint64_t inputBufferOffset;//specify if VkFFT has to offset the first element position inside the input buffer. In bytes. Default 0 - uint64_t outputBufferOffset;//specify if VkFFT has to offset the first element position inside the output buffer. In bytes. Default 0 - uint64_t kernelOffset;//specify if VkFFT has to offset the first element position inside the kernel. In bytes. 
Default 0 -} VkFFTLaunchParams;//parameters specified at plan execution -typedef enum VkFFTResult { - VKFFT_SUCCESS = 0, - VKFFT_ERROR_MALLOC_FAILED = 1, - VKFFT_ERROR_INSUFFICIENT_CODE_BUFFER = 2, - VKFFT_ERROR_INSUFFICIENT_TEMP_BUFFER = 3, - VKFFT_ERROR_PLAN_NOT_INITIALIZED = 4, - VKFFT_ERROR_NULL_TEMP_PASSED = 5, - VKFFT_ERROR_INVALID_PHYSICAL_DEVICE = 1001, - VKFFT_ERROR_INVALID_DEVICE = 1002, - VKFFT_ERROR_INVALID_QUEUE = 1003, - VKFFT_ERROR_INVALID_COMMAND_POOL = 1004, - VKFFT_ERROR_INVALID_FENCE = 1005, - VKFFT_ERROR_ONLY_FORWARD_FFT_INITIALIZED = 1006, - VKFFT_ERROR_ONLY_INVERSE_FFT_INITIALIZED = 1007, - VKFFT_ERROR_INVALID_CONTEXT = 1008, - VKFFT_ERROR_INVALID_PLATFORM = 1009, - VKFFT_ERROR_ENABLED_saveApplicationToString = 1010, - VKFFT_ERROR_EMPTY_FILE = 1011, - VKFFT_ERROR_EMPTY_FFTdim = 2001, - VKFFT_ERROR_EMPTY_size = 2002, - VKFFT_ERROR_EMPTY_bufferSize = 2003, - VKFFT_ERROR_EMPTY_buffer = 2004, - VKFFT_ERROR_EMPTY_tempBufferSize = 2005, - VKFFT_ERROR_EMPTY_tempBuffer = 2006, - VKFFT_ERROR_EMPTY_inputBufferSize = 2007, - VKFFT_ERROR_EMPTY_inputBuffer = 2008, - VKFFT_ERROR_EMPTY_outputBufferSize = 2009, - VKFFT_ERROR_EMPTY_outputBuffer = 2010, - VKFFT_ERROR_EMPTY_kernelSize = 2011, - VKFFT_ERROR_EMPTY_kernel = 2012, - VKFFT_ERROR_EMPTY_applicationString = 2013, - VKFFT_ERROR_EMPRY_useCustomBluesteinPaddingPattern_arrays = 2014, - VKFFT_ERROR_UNSUPPORTED_RADIX = 3001, - VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH = 3002, - VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2C = 3003, - VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_DCT = 3004, - VKFFT_ERROR_UNSUPPORTED_FFT_OMIT = 3005, - VKFFT_ERROR_FAILED_TO_ALLOCATE = 4001, - VKFFT_ERROR_FAILED_TO_MAP_MEMORY = 4002, - VKFFT_ERROR_FAILED_TO_ALLOCATE_COMMAND_BUFFERS = 4003, - VKFFT_ERROR_FAILED_TO_BEGIN_COMMAND_BUFFER = 4004, - VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER = 4005, - VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE = 4006, - VKFFT_ERROR_FAILED_TO_WAIT_FOR_FENCES = 4007, - VKFFT_ERROR_FAILED_TO_RESET_FENCES = 4008, - 
VKFFT_ERROR_FAILED_TO_CREATE_DESCRIPTOR_POOL = 4009, - VKFFT_ERROR_FAILED_TO_CREATE_DESCRIPTOR_SET_LAYOUT = 4010, - VKFFT_ERROR_FAILED_TO_ALLOCATE_DESCRIPTOR_SETS = 4011, - VKFFT_ERROR_FAILED_TO_CREATE_PIPELINE_LAYOUT = 4012, - VKFFT_ERROR_FAILED_SHADER_PREPROCESS = 4013, - VKFFT_ERROR_FAILED_SHADER_PARSE = 4014, - VKFFT_ERROR_FAILED_SHADER_LINK = 4015, - VKFFT_ERROR_FAILED_SPIRV_GENERATE = 4016, - VKFFT_ERROR_FAILED_TO_CREATE_SHADER_MODULE = 4017, - VKFFT_ERROR_FAILED_TO_CREATE_INSTANCE = 4018, - VKFFT_ERROR_FAILED_TO_SETUP_DEBUG_MESSENGER = 4019, - VKFFT_ERROR_FAILED_TO_FIND_PHYSICAL_DEVICE = 4020, - VKFFT_ERROR_FAILED_TO_CREATE_DEVICE = 4021, - VKFFT_ERROR_FAILED_TO_CREATE_FENCE = 4022, - VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_POOL = 4023, - VKFFT_ERROR_FAILED_TO_CREATE_BUFFER = 4024, - VKFFT_ERROR_FAILED_TO_ALLOCATE_MEMORY = 4025, - VKFFT_ERROR_FAILED_TO_BIND_BUFFER_MEMORY = 4026, - VKFFT_ERROR_FAILED_TO_FIND_MEMORY = 4027, - VKFFT_ERROR_FAILED_TO_SYNCHRONIZE = 4028, - VKFFT_ERROR_FAILED_TO_COPY = 4029, - VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM = 4030, - VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM = 4031, - VKFFT_ERROR_FAILED_TO_GET_CODE_SIZE = 4032, - VKFFT_ERROR_FAILED_TO_GET_CODE = 4033, - VKFFT_ERROR_FAILED_TO_DESTROY_PROGRAM = 4034, - VKFFT_ERROR_FAILED_TO_LOAD_MODULE = 4035, - VKFFT_ERROR_FAILED_TO_GET_FUNCTION = 4036, - VKFFT_ERROR_FAILED_TO_SET_DYNAMIC_SHARED_MEMORY = 4037, - VKFFT_ERROR_FAILED_TO_MODULE_GET_GLOBAL = 4038, - VKFFT_ERROR_FAILED_TO_LAUNCH_KERNEL = 4039, - VKFFT_ERROR_FAILED_TO_EVENT_RECORD = 4040, - VKFFT_ERROR_FAILED_TO_ADD_NAME_EXPRESSION = 4041, - VKFFT_ERROR_FAILED_TO_INITIALIZE = 4042, - VKFFT_ERROR_FAILED_TO_SET_DEVICE_ID = 4043, - VKFFT_ERROR_FAILED_TO_GET_DEVICE = 4044, - VKFFT_ERROR_FAILED_TO_CREATE_CONTEXT = 4045, - VKFFT_ERROR_FAILED_TO_CREATE_PIPELINE = 4046, - VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG = 4047, - VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_QUEUE = 4048, - VKFFT_ERROR_FAILED_TO_RELEASE_COMMAND_QUEUE = 4049, - 
VKFFT_ERROR_FAILED_TO_ENUMERATE_DEVICES = 4050, - VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE = 4051, - VKFFT_ERROR_FAILED_TO_CREATE_EVENT = 4052, - VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST = 4053, - VKFFT_ERROR_FAILED_TO_DESTROY_COMMAND_LIST = 4054, - VKFFT_ERROR_FAILED_TO_SUBMIT_BARRIER = 4055 -} VkFFTResult; - -typedef struct VkFFTRaderContainer VkFFTRaderContainer; - -struct VkFFTRaderContainer { - uint64_t prime; - uint64_t generator; - uint64_t multiplier; - uint64_t inline_rader_g_pow; - uint64_t raderUintLUToffset; - - uint64_t type; //0 - FFT, 1 - Direct multiplication - - uint64_t raderRegisters; - uint64_t rader_min_registers; - - //Direct multiplication parameters - - //FFT parameters - uint64_t registers_per_thread; - uint64_t min_registers_per_thread; - uint64_t loc_multipliers[33]; - uint64_t registers_per_thread_per_radix[33]; - uint64_t stageRadix[20]; - uint64_t numStages; - uint64_t numSubPrimes; - uint64_t stage_rader_generator[20]; - uint64_t containerFFTDim; - uint64_t containerFFTNum; - uint64_t subLogicalGroupSizeMax;//how many threads are needed per Rader transform - uint64_t RaderKernelOffsetLUT; - uint64_t RaderRadixOffsetLUT; - uint64_t RaderRadixOffsetLUTiFFT; - void* raderFFTkernel; - - struct VkFFTRaderContainer* container; -}; - -typedef struct { - uint64_t size[3]; - uint64_t localSize[3]; - uint64_t numSubgroups; - uint64_t sourceFFTSize; - uint64_t fftDim; - uint64_t inverse; - uint64_t actualInverse; - uint64_t inverseBluestein; - uint64_t zeropad[2]; - uint64_t zeropadBluestein[2]; - uint64_t axis_id; - uint64_t axis_upload_id; - uint64_t numAxisUploads; - uint64_t registers_per_thread; - uint64_t registers_per_thread_per_radix[33]; - uint64_t min_registers_per_thread; - uint64_t maxNonPow2Radix; - uint64_t usedLocRegs; - uint64_t readToRegisters; - uint64_t writeFromRegisters; - uint64_t LUT; - uint64_t raderUintLUT; - uint64_t useCoalescedLUTUploadToSM; - uint64_t useBluesteinFFT; - uint64_t reverseBluesteinMultiUpload; - uint64_t 
BluesteinConvolutionStep; - uint64_t BluesteinPreMultiplication; - uint64_t BluesteinPostMultiplication; - uint64_t startDCT3LUT; - uint64_t startDCT4LUT; - uint64_t performR2C; - uint64_t performR2CmultiUpload; - uint64_t performDCT; - uint64_t performBandwidthBoost; - uint64_t frequencyZeropadding; - uint64_t performZeropaddingFull[3]; // don't do read/write if full sequence is omitted - uint64_t performZeropaddingInput[3]; // don't read if input is zeropadded (0 - off, 1 - on) - uint64_t performZeropaddingOutput[3]; // don't write if output is zeropadded (0 - off, 1 - on) - uint64_t fft_zeropad_left_full[3]; - uint64_t fft_zeropad_left_read[3]; - uint64_t fft_zeropad_left_write[3]; - uint64_t fft_zeropad_right_full[3]; - uint64_t fft_zeropad_right_read[3]; - uint64_t fft_zeropad_right_write[3]; - uint64_t fft_zeropad_Bluestein_left_read[3]; - uint64_t fft_zeropad_Bluestein_left_write[3]; - uint64_t fft_zeropad_Bluestein_right_read[3]; - uint64_t fft_zeropad_Bluestein_right_write[3]; - uint64_t inputStride[5]; - uint64_t outputStride[5]; - uint64_t fft_dim_full; - uint64_t stageStartSize; - uint64_t firstStageStartSize; - uint64_t fft_dim_x; - uint64_t dispatchZactualFFTSize; - uint64_t numStages; - uint64_t stageRadix[33]; - uint64_t inputOffset; - uint64_t kernelOffset; - uint64_t outputOffset; - uint64_t reorderFourStep; - uint64_t pushConstantsStructSize; - uint64_t performWorkGroupShift[3]; - uint64_t performPostCompilationInputOffset; - uint64_t performPostCompilationOutputOffset; - uint64_t performPostCompilationKernelOffset; - uint64_t inputBufferBlockNum; - uint64_t inputBufferBlockSize; - uint64_t outputBufferBlockNum; - uint64_t outputBufferBlockSize; - uint64_t kernelBlockNum; - uint64_t kernelBlockSize; - uint64_t numCoordinates; - uint64_t matrixConvolution; //if equal to 2 perform 2x2, if equal to 3 perform 3x3 matrix-vector convolution. 
Overrides coordinateFeatures - uint64_t numBatches; - uint64_t numKernels; - uint64_t conjugateConvolution; - uint64_t crossPowerSpectrumNormalization; - uint64_t usedSharedMemory; - uint64_t sharedMemSize; - uint64_t sharedMemSizePow2; - uint64_t normalize; - uint64_t complexSize; - uint64_t inputNumberByteSize; - uint64_t outputNumberByteSize; - uint64_t kernelNumberByteSize; - uint64_t maxStageSumLUT; - uint64_t unroll; - uint64_t swapComputeWorkGroupID; - uint64_t convolutionStep; - uint64_t symmetricKernel; - uint64_t supportAxis; - uint64_t cacheShuffle; - uint64_t registerBoost; - uint64_t warpSize; - uint64_t numSharedBanks; - uint64_t resolveBankConflictFirstStages; - uint64_t sharedStrideBankConflictFirstStages; - uint64_t sharedStrideReadWriteConflict; - - uint64_t sharedStrideRaderFFT; - uint64_t sharedShiftRaderFFT; - - uint64_t maxSharedStride; - uint64_t axisSwapped; - uint64_t mergeSequencesR2C; - - uint64_t numBuffersBound[10]; - uint64_t convolutionBindingID; - uint64_t LUTBindingID; - uint64_t BluesteinConvolutionBindingID; - uint64_t BluesteinMultiplicationBindingID; - - uint64_t useRader; - uint64_t numRaderPrimes; - uint64_t minRaderFFTThreadNum; - VkFFTRaderContainer* raderContainer; - VkFFTRaderContainer* currentRaderContainer; - uint64_t RaderUintLUTBindingID; - - uint64_t useRaderMult; - uint64_t additionalRaderSharedSize; - uint64_t RaderKernelOffsetShared[33]; - uint64_t RaderKernelOffsetLUT[33]; - uint64_t rader_generator[33]; - uint64_t fixMinRaderPrimeMult;//start Rader algorithm for primes from this number - uint64_t fixMaxRaderPrimeMult;//switch from Rader to Bluestein algorithm for primes from this number - uint64_t fixMinRaderPrimeFFT;//start Rader algorithm for primes from this number - uint64_t fixMaxRaderPrimeFFT;//switch from Rader to Bluestein algorithm for primes from this number - - uint64_t inline_rader_g_pow; - uint64_t inline_rader_kernel; - - uint64_t raderRegisters; - uint64_t rader_min_registers; - - uint64_t 
useRaderFFT; - - uint64_t performOffsetUpdate; - uint64_t performBufferSetUpdate; - uint64_t useUint64; - uint64_t disableSetLocale; - - char** regIDs; - char* disableThreadsStart; - char* disableThreadsEnd; - char sdataID[50]; - char inoutID[50]; - char combinedID[50]; - char raderIDx[50]; - char raderIDx2[50]; - char gl_LocalInvocationID_x[50]; - char gl_LocalInvocationID_y[50]; - char gl_LocalInvocationID_z[50]; - char gl_GlobalInvocationID_x[200]; - char gl_GlobalInvocationID_y[200]; - char gl_GlobalInvocationID_z[200]; - char gl_SubgroupInvocationID[200]; - char gl_SubgroupID[200]; - char tshuffle[50]; - char sharedStride[50]; - char gl_WorkGroupSize_x[50]; - char gl_WorkGroupSize_y[50]; - char gl_WorkGroupSize_z[50]; - char gl_WorkGroupID_x[50]; - char gl_WorkGroupID_y[50]; - char gl_WorkGroupID_z[50]; - char tempReg[50]; - char vecType[30]; - char stageInvocationID[50]; - char blockInvocationID[50]; - char temp[50]; - char w[50]; - char iw[50]; - char x0[33][40]; - char locID[33][40]; - char* code0; - char* output; - char* tempStr; - int64_t tempLen; - int64_t currentLen; - int64_t maxCodeLength; - int64_t maxTempLength; - char oldLocale[100]; -} VkFFTSpecializationConstantsLayout; -typedef struct { - uint32_t dataUint32[10]; - uint64_t dataUint64[10]; - //specify what can be in layout - uint64_t performWorkGroupShift[3]; - uint64_t workGroupShift[3]; - - uint64_t performPostCompilationInputOffset; - uint64_t inputOffset; - - uint64_t performPostCompilationOutputOffset; - uint64_t outputOffset; - - uint64_t performPostCompilationKernelOffset; - uint64_t kernelOffset; - - uint64_t structSize; -} VkFFTPushConstantsLayout; - -typedef struct { - uint64_t numBindings; - uint64_t axisBlock[4]; - uint64_t groupedBatch; - VkFFTSpecializationConstantsLayout specializationConstants; - VkFFTPushConstantsLayout pushConstants; - uint64_t updatePushConstants; -#if(VKFFT_BACKEND==0) - VkBuffer* inputBuffer; - VkBuffer* outputBuffer; - VkDescriptorPool descriptorPool; - 
VkDescriptorSetLayout descriptorSetLayout; - VkDescriptorSet descriptorSet; - VkPipelineLayout pipelineLayout; - VkPipeline pipeline; - VkDeviceMemory bufferLUTDeviceMemory; - VkBuffer bufferLUT; - VkDeviceMemory bufferRaderUintLUTDeviceMemory; - VkBuffer bufferRaderUintLUT; - VkDeviceMemory* bufferBluesteinDeviceMemory; - VkDeviceMemory* bufferBluesteinFFTDeviceMemory; - VkBuffer* bufferBluestein; - VkBuffer* bufferBluesteinFFT; -#elif(VKFFT_BACKEND==1) - void** inputBuffer; - void** outputBuffer; - CUmodule VkFFTModule; - CUfunction VkFFTKernel; - void* bufferLUT; - void* bufferRaderUintLUT; - CUdeviceptr consts_addr; - void** bufferBluestein; - void** bufferBluesteinFFT; -#elif(VKFFT_BACKEND==2) - void** inputBuffer; - void** outputBuffer; - hipModule_t VkFFTModule; - hipFunction_t VkFFTKernel; - void* bufferLUT; - void* bufferRaderUintLUT; - hipDeviceptr_t consts_addr; - void** bufferBluestein; - void** bufferBluesteinFFT; -#elif(VKFFT_BACKEND==3) - cl_mem* inputBuffer; - cl_mem* outputBuffer; - cl_program program; - cl_kernel kernel; - cl_mem bufferLUT; - cl_mem bufferRaderUintLUT; - cl_mem* bufferBluestein; - cl_mem* bufferBluesteinFFT; -#elif(VKFFT_BACKEND==4) - void** inputBuffer; - void** outputBuffer; - ze_module_handle_t VkFFTModule; - ze_kernel_handle_t VkFFTKernel; - void* bufferLUT; - void* bufferRaderUintLUT; - void** bufferBluestein; - void** bufferBluesteinFFT; -#endif - - void* binary; - uint64_t binarySize; - - uint64_t bufferLUTSize; - uint64_t bufferRaderUintLUTSize; - uint64_t referenceLUT; -} VkFFTAxis; - -typedef struct { - uint64_t actualFFTSizePerAxis[3][3]; - uint64_t numAxisUploads[3]; - uint64_t axisSplit[3][4]; - VkFFTAxis axes[3][4]; - - uint64_t multiUploadR2C; - uint64_t actualPerformR2CPerAxis[3]; // automatically specified, shows if R2C is actually performed or inside FFT or as a separate step - VkFFTAxis R2Cdecomposition; - VkFFTAxis inverseBluesteinAxes[3][4]; -} VkFFTPlan; -typedef struct { - VkFFTConfiguration configuration; - 
VkFFTPlan* localFFTPlan; - VkFFTPlan* localFFTPlan_inverse; //additional inverse plan - - uint64_t actualNumBatches; - uint64_t firstAxis; - uint64_t lastAxis; - //Bluestein buffers reused among plans - uint64_t useBluesteinFFT[3]; -#if(VKFFT_BACKEND==0) - VkDeviceMemory bufferRaderUintLUTDeviceMemory[3][4]; - VkBuffer bufferRaderUintLUT[3][4]; - VkDeviceMemory bufferBluesteinDeviceMemory[3]; - VkDeviceMemory bufferBluesteinFFTDeviceMemory[3]; - VkDeviceMemory bufferBluesteinIFFTDeviceMemory[3]; - VkBuffer bufferBluestein[3]; - VkBuffer bufferBluesteinFFT[3]; - VkBuffer bufferBluesteinIFFT[3]; -#elif(VKFFT_BACKEND==1) - void* bufferRaderUintLUT[3][4]; - void* bufferBluestein[3]; - void* bufferBluesteinFFT[3]; - void* bufferBluesteinIFFT[3]; -#elif(VKFFT_BACKEND==2) - void* bufferRaderUintLUT[3][4]; - void* bufferBluestein[3]; - void* bufferBluesteinFFT[3]; - void* bufferBluesteinIFFT[3]; -#elif(VKFFT_BACKEND==3) - cl_mem bufferRaderUintLUT[3][4]; - cl_mem bufferBluestein[3]; - cl_mem bufferBluesteinFFT[3]; - cl_mem bufferBluesteinIFFT[3]; -#elif(VKFFT_BACKEND==4) - void* bufferRaderUintLUT[3][4]; - void* bufferBluestein[3]; - void* bufferBluesteinFFT[3]; - void* bufferBluesteinIFFT[3]; -#endif - uint64_t bufferRaderUintLUTSize[3][4]; - uint64_t bufferBluesteinSize[3]; - void* applicationBluesteinString[3]; - uint64_t applicationBluesteinStringSize[3]; - - uint64_t numRaderFFTPrimes; - uint64_t rader_primes[30]; - uint64_t rader_buffer_size[30]; - void* raderFFTkernel[30]; - uint64_t applicationStringOffsetRader; - - uint64_t currentApplicationStringPos; - - uint64_t applicationStringSize;//size of saveApplicationString in bytes - void* saveApplicationString;//memory array(uint32_t* for Vulkan, char* for CUDA/HIP/OpenCL) through which user can access VkFFT generated binaries. 
(will be allocated by VkFFT, deallocated with deleteVkFFT call) -} VkFFTApplication; - -static inline VkFFTResult VkAppendLine(VkFFTSpecializationConstantsLayout* sc) { - //appends code line stored in tempStr to generated code - if (sc->tempLen < 0) return VKFFT_ERROR_INSUFFICIENT_TEMP_BUFFER; - if (sc->currentLen + sc->tempLen > sc->maxCodeLength) return VKFFT_ERROR_INSUFFICIENT_CODE_BUFFER; - sc->currentLen += sprintf(sc->output + sc->currentLen, "%s", sc->tempStr); - return VKFFT_SUCCESS; -} -static inline VkFFTResult VkAppendLineFromInput(VkFFTSpecializationConstantsLayout* sc, const char* in) { - //appends code line stored in tempStr to generated code - if (sc->currentLen + (int64_t)strlen(in) > sc->maxCodeLength) return VKFFT_ERROR_INSUFFICIENT_CODE_BUFFER; - sc->currentLen += sprintf(sc->output + sc->currentLen, "%s", in); - return VKFFT_SUCCESS; -} -static inline VkFFTResult appendLicense(VkFFTSpecializationConstantsLayout* sc) { - VkFFTResult res = VKFFT_SUCCESS; - sc->tempLen = sprintf(sc->tempStr, "\ -// This file is part of VkFFT, a Vulkan Fast Fourier Transform library\n\ -//\n\ -// Copyright (C) 2020 - present Dmitrii Tolmachev \n\ -//\n\ -// Permission is hereby granted, free of charge, to any person obtaining a copy\n\ -// of this software and associated documentation files (the \"Software\"), to deal\n\ -// in the Software without restriction, including without limitation the rights\n\ -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n\ -// copies of the Software, and to permit persons to whom the Software is\n\ -// furnished to do so, subject to the following conditions:\n\ -//\n\ -// The above copyright notice and this permission notice shall be included in\n\ -// all copies or substantial portions of the Software.\n\ -//\n\ -// THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n\ -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n\ -// FITNESS FOR A PARTICULAR 
PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n\ -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n\ -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n\ -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n\ -// THE SOFTWARE.\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - return res; -} -static inline VkFFTResult VkMovComplex(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in) { - VkFFTResult res = VKFFT_SUCCESS; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s;\n", out, in); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - return res; -} -static inline VkFFTResult VkMovReal(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in) { - VkFFTResult res = VKFFT_SUCCESS; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s;\n", out, in); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - return res; -} -static inline VkFFTResult VkSharedStore(VkFFTSpecializationConstantsLayout* sc, const char* id, const char* in) { - VkFFTResult res = VKFFT_SUCCESS; - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s] = %s;\n", id, in); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - return res; -} -static inline VkFFTResult VkSharedLoad(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* id) { - VkFFTResult res = VKFFT_SUCCESS; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[%s];\n", out, id); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - return res; -} -static inline VkFFTResult VkAddReal(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) { - VkFFTResult res = VKFFT_SUCCESS; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + %s;\n", out, in_1, in_2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - return res; -} -static inline VkFFTResult 
VkAddComplex(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) { - VkFFTResult res = VKFFT_SUCCESS; -#if(VKFFT_BACKEND==2) - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + %s;\n", out, in_1, in_2); -#else - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = %s.x + %s.x;\n\ - %s.y = %s.y + %s.y;\n", out, in_1, in_2, out, in_1, in_2); -#endif - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - return res; -} -static inline VkFFTResult VkAddComplexInv(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) { - VkFFTResult res = VKFFT_SUCCESS; -#if(VKFFT_BACKEND==2) - sc->tempLen = sprintf(sc->tempStr, "\ - %s = - %s - %s;\n", out, in_1, in_2); -#else - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = - %s.x - %s.x;\n\ - %s.y = - %s.y - %s.y;\n", out, in_1, in_2, out, in_1, in_2); -#endif - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - return res; -} -static inline VkFFTResult VkSubComplex(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) { - VkFFTResult res = VKFFT_SUCCESS; -#if(VKFFT_BACKEND==2) - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s - %s;\n", out, in_1, in_2); -#else - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = %s.x - %s.x;\n\ - %s.y = %s.y - %s.y;\n", out, in_1, in_2, out, in_1, in_2); -#endif - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - return res; -} -static inline VkFFTResult VkSubReal(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) { - VkFFTResult res = VKFFT_SUCCESS; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s - %s;\n", out, in_1, in_2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - return res; -} -static inline VkFFTResult VkFMA3Complex(VkFFTSpecializationConstantsLayout* sc, const char* out_1, const char* out_2, const char* in_1, const char* in_num, const char* in_conj) { - VkFFTResult res = 
VKFFT_SUCCESS; - //sc->tempLen = sprintf(sc->tempStr, " printf(\"%%d %%f %%f %%f %%f \\n \", %s, %s.x, %s.y, %s.x, %s.y);\n\n", sc->gl_LocalInvocationID_x, in_1, in_1, in_conj, in_conj); - //res = VkAppendLine(sc); - //if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = fma(%s.x, %s.x, %s.x);\n\ - %s.y = fma(%s.y, %s.x, %s.y);\n", out_1, in_1, in_num, out_1, out_1, in_conj, in_num, out_1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = fma(%s.y, %s.y, %s.x);\n\ - %s.y = fma(%s.x, %s.y, %s.y);\n", out_2, in_1, in_num, out_2, out_2, in_conj, in_num, out_2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ - temp2.x = fma(%s.x, %s.x, %s.x);\n\ - %s.x = temp2.x;\n\ - temp2.y = fma(%s.y, %s.x, %s.y);\n\ - %s.y = temp2.y;\n", in_1, in_num, out_1, out_1, in_conj, in_num, out_1, out_1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - temp2.x = fma(%s.y, %s.y, %s.x);\n\ - %s.x = temp2.x;\n\ - temp2.y = fma(%s.x, %s.y, %s.y);\n\ - %s.y = temp2.y;\n", in_1, in_num, out_2, out_2, in_conj, in_num, out_2, out_2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res;*/ - //sc->tempLen = sprintf(sc->tempStr, " printf(\"%%d %%f %%f %%f %%f \\n \", %s, %s.x, %s.y, %s.x, %s.y);\n\n", sc->gl_LocalInvocationID_x, out_1, out_1, out_2, out_2); - //res = VkAppendLine(sc); - //if (res != VKFFT_SUCCESS) return res; - return res; -} -static inline VkFFTResult VkFMAComplex(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num, const char* in_2) { - VkFFTResult res = VKFFT_SUCCESS; - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = fma(%s.x, %s, %s.x);\n\ - %s.y = fma(%s.y, %s, %s.y);\n", out, in_1, in_num, in_2, out, in_1, in_num, in_2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - return res; -} 
-static inline VkFFTResult VkFMAReal(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num, const char* in_2) { - VkFFTResult res = VKFFT_SUCCESS; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = fma(%s, %s, %s);\n", out, in_1, in_num, in_2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - return res; -} -static inline VkFFTResult VkMulComplex(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2, const char* temp) { - VkFFTResult res = VKFFT_SUCCESS; -#if(VKFFT_BACKEND==2) - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s * %s.x + %s(-%s.y, %s.x) * %s.y;\n", out, in_1, in_2, sc->vecType, in_1, in_1, in_2); -#else - if (strcmp(out, in_1) && strcmp(out, in_2)) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = %s.x * %s.x - %s.y * %s.y;\n\ - %s.y = %s.y * %s.x + %s.x * %s.y;\n", out, in_1, in_2, in_1, in_2, out, in_1, in_2, in_1, in_2); - } - else { - if (temp) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = %s.x * %s.x - %s.y * %s.y;\n\ - %s.y = %s.y * %s.x + %s.x * %s.y;\n\ - %s = %s;\n", temp, in_1, in_2, in_1, in_2, temp, in_1, in_2, in_1, in_2, out, temp); - } - else - return VKFFT_ERROR_NULL_TEMP_PASSED; - } -#endif - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - return res; -} -static inline VkFFTResult VkMulComplexConj(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2, const char* temp) { - VkFFTResult res = VKFFT_SUCCESS; -#if(VKFFT_BACKEND==2) - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s * %s.x + %s(%s.y, -%s.x) * %s.y;\n", out, in_1, in_2, sc->vecType, in_1, in_1, in_2); -#else - if (strcmp(out, in_1) && strcmp(out, in_2)) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = %s.x * %s.x + %s.y * %s.y;\n\ - %s.y = %s.y * %s.x - %s.x * %s.y;\n", out, in_1, in_2, in_1, in_2, out, in_1, in_2, in_1, in_2); - } - else { - if (temp) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = %s.x * %s.x + %s.y 
* %s.y;\n\ - %s.y = %s.y * %s.x - %s.x * %s.y;\n\ - %s = %s;\n", temp, in_1, in_2, in_1, in_2, temp, in_1, in_2, in_1, in_2, out, temp); - } - else - return VKFFT_ERROR_NULL_TEMP_PASSED; - } -#endif - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - return res; -} -static inline VkFFTResult VkMulComplexNumber(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num) { - VkFFTResult res = VKFFT_SUCCESS; -#if(VKFFT_BACKEND==2) - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s * %s;\n", out, in_1, in_num); -#else - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = %s.x * %s;\n\ - %s.y = %s.y * %s;\n", out, in_1, in_num, out, in_1, in_num); -#endif - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - return res; -} -static inline VkFFTResult VkMulComplexNumberImag(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num, const char* temp) { - VkFFTResult res = VKFFT_SUCCESS; -#if(VKFFT_BACKEND==2) - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s(-%s.y, %s.x) * %s;\n", out, sc->vecType, in_1, in_1, in_num); -#else - if (strcmp(out, in_1)) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = - %s.y * %s;\n\ - %s.y = %s.x * %s;\n", out, in_1, in_num, out, in_1, in_num); - } - else { - if (temp) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = - %s.y * %s;\n\ - %s.y = %s.x * %s;\n\ - %s = %s;\n", temp, in_1, in_num, temp, in_1, in_num, out, temp); - } - else - return VKFFT_ERROR_NULL_TEMP_PASSED; - } -#endif - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - return res; -} -static inline VkFFTResult VkDivComplexNumber(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num) { - VkFFTResult res = VKFFT_SUCCESS; - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = %s.x / %s;\n\ - %s.y = %s.y / %s;\n", out, in_1, in_num, out, in_1, in_num); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - return res; -} - 
-static inline VkFFTResult VkMulReal(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2) { - VkFFTResult res = VKFFT_SUCCESS; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s * %s;\n", out, in_1, in_2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - return res; -} - -static inline VkFFTResult VkShuffleComplex(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2, const char* temp) { - VkFFTResult res = VKFFT_SUCCESS; -#if(VKFFT_BACKEND==2) - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + %s(-%s.y, %s.x);\n", out, in_1, sc->vecType, in_2, in_2); -#else - if (strcmp(out, in_2)) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = %s.x - %s.y;\n\ - %s.y = %s.y + %s.x;\n", out, in_1, in_2, out, in_1, in_2); - } - else { - if (temp) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = %s.x - %s.y;\n\ - %s.y = %s.x + %s.y;\n\ - %s = %s;\n", temp, in_1, in_2, temp, in_1, in_2, out, temp); - } - else - return VKFFT_ERROR_NULL_TEMP_PASSED; - } -#endif - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - return res; -} -static inline VkFFTResult VkShuffleComplexInv(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_2, const char* temp) { - VkFFTResult res = VKFFT_SUCCESS; -#if(VKFFT_BACKEND==2) - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + %s(%s.y, -%s.x);\n", out, in_1, sc->vecType, in_2, in_2); -#else - if (strcmp(out, in_2)) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = %s.x + %s.y;\n\ - %s.y = %s.y - %s.x;\n", out, in_1, in_2, out, in_1, in_2); - } - else { - if (temp) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = %s.x + %s.y;\n\ - %s.y = %s.x - %s.y;\n\ - %s = %s;\n", temp, in_1, in_2, temp, in_1, in_2, out, temp); - } - else - return VKFFT_ERROR_NULL_TEMP_PASSED; - } -#endif - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - return res; -} -static inline VkFFTResult 
VkModReal(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num) { - VkFFTResult res = VKFFT_SUCCESS; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s %% %s;\n", out, in_1, in_num); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - return res; -} -static inline VkFFTResult VkDivReal(VkFFTSpecializationConstantsLayout* sc, const char* out, const char* in_1, const char* in_num) { - VkFFTResult res = VKFFT_SUCCESS; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s / %s;\n", out, in_1, in_num); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - return res; -} -static inline VkFFTResult VkPermute(VkFFTSpecializationConstantsLayout* sc, const uint64_t* permute, const uint64_t num_elem, const uint64_t type, char** regIDs, const char* temp) { - VkFFTResult res = VKFFT_SUCCESS; - char temp_ID[33][20]; - /*uint64_t permute_complete[33]; - uint64_t num_completed = 0; - uint64_t start = 0; - uint64_t start_subcycle = 0;*/ - if (type == 0) { - for (uint64_t i = 0; i < num_elem; i++) - sprintf(temp_ID[i], "%s", sc->locID[i]); - for (uint64_t i = 0; i < num_elem; i++) - sprintf(sc->locID[i], "%s", temp_ID[permute[i]]); - /*for (uint64_t i = 0; i < num_elem; i++) { - permute_complete[i] = 0; - } - while (start != num_elem) { - if (permute_complete[start] == 0) { - if (start_subcycle == 0) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s;\n", temp, sc->locID[start]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - start_subcycle = start; - } - if (permute[start] == start_subcycle) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s;\n", sc->locID[start], temp); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s;\n", sc->locID[start], sc->locID[permute[start]]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - permute_complete[start] = 1; - start = permute[start]; - } - else { - 
start++; - start_subcycle = 0; - } - }*/ - - } - if (type == 1) { - for (uint64_t i = 0; i < num_elem; i++) - sprintf(temp_ID[i], "%s", regIDs[i]); - for (uint64_t i = 0; i < num_elem; i++) - sprintf(regIDs[i], "%s", temp_ID[permute[i]]); - /*for (uint64_t i = 0; i < num_elem; i++) { - permute_complete[i] = 0; - } - while (start != num_elem) { - if (permute_complete[start] == 0) { - if (start_subcycle == 0) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s;\n", temp, regIDs[start]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - start_subcycle = start; - } - if (permute[start] == start_subcycle) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s;\n", regIDs[start], temp); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s;\n", regIDs[start], regIDs[permute[start]]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - permute_complete[start] = 1; - start = permute[start]; - } - else { - start++; - start_subcycle = 0; - } - }*/ - } - return res; -} -static inline VkFFTResult VkSubgroupAdd(VkFFTSpecializationConstantsLayout* sc, const char* in, const char* out, const uint64_t subWarpSplit) { - VkFFTResult res = VKFFT_SUCCESS; - -#if (VKFFT_BACKEND==0) - sc->tempLen = sprintf(sc->tempStr, " %s.x = subgroupAdd(%s.x);\n", out, in); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = subgroupAdd(%s.y);\n", out, in); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#elif (VKFFT_BACKEND==1) - //v1 - /*for (int i = 1; i < sc->warpSize / subWarpSplit; i *= 2) { - sc->tempLen = sprintf(sc->tempStr, " %s.x += __shfl_xor_sync(0xffffffff, %s.x, %d);\n", out, in, i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y += __shfl_xor_sync(0xffffffff, %s.y, %d);\n", out, in, i); - res = VkAppendLine(sc); - if (res != 
VKFFT_SUCCESS) return res; - } - //v2 - for (int i = (int)sc->warpSize / 2 / subWarpSplit; i > 0; i /= 2) { - sc->tempLen = sprintf(sc->tempStr, " %s.x += __shfl_down_sync(0xffffffff, %s.x, %d);\n", out, in, i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y += __shfl_down_sync(0xffffffff, %s.y, %d);\n", out, in, i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - }*/ -#endif - return res; -} - -static inline VkFFTResult initializeVkFFT(VkFFTApplication* app, VkFFTConfiguration inputLaunchConfiguration); -static inline VkFFTResult VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTLaunchParams* launchParams); - -static inline VkFFTResult appendVersion(VkFFTSpecializationConstantsLayout* sc) { - VkFFTResult res = VKFFT_SUCCESS; -#if(VKFFT_BACKEND==0) - sc->tempLen = sprintf(sc->tempStr, "#version 450\n\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#endif - return res; -} -static inline VkFFTResult appendExtensions(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeInputMemory, const char* floatTypeOutputMemory, const char* floatTypeKernelMemory) { - VkFFTResult res = VKFFT_SUCCESS; -#if(VKFFT_BACKEND==0) - //sc->tempLen = sprintf(sc->tempStr, "#extension GL_EXT_debug_printf : require\n\n"); - //res = VkAppendLine(sc); - //if (res != VKFFT_SUCCESS) return res; - - if ((!strcmp(floatType, "double")) || (sc->useUint64)) { - sc->tempLen = sprintf(sc->tempStr, "\ -#extension GL_ARB_gpu_shader_fp64 : enable\n\ -#extension GL_ARB_gpu_shader_int64 : enable\n\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((!strcmp(floatTypeInputMemory, "half")) || (!strcmp(floatTypeOutputMemory, "half")) || (!strcmp(floatTypeKernelMemory, "half"))) { - sc->tempLen = sprintf(sc->tempStr, "#extension GL_EXT_shader_16bit_storage : require\n\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } 
-#elif(VKFFT_BACKEND==1) -#elif(VKFFT_BACKEND==2) -#ifdef VKFFT_OLD_ROCM - sc->tempLen = sprintf(sc->tempStr, "\ -#include \n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#endif -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - if ((!strcmp(floatType, "double")) || (sc->useUint64)) { - sc->tempLen = sprintf(sc->tempStr, "\ -#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } -#endif - return res; -} -static inline VkFFTResult appendLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc) { - VkFFTResult res = VKFFT_SUCCESS; -#if(VKFFT_BACKEND==0) - sc->tempLen = sprintf(sc->tempStr, "layout (local_size_x = %" PRIu64 ", local_size_y = %" PRIu64 ", local_size_z = %" PRIu64 ") in;\n", sc->localSize[0], sc->localSize[1], sc->localSize[2]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#elif(VKFFT_BACKEND==1) -#elif(VKFFT_BACKEND==2) -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) -#endif - return res; -} -static inline VkFFTResult appendConstant(VkFFTSpecializationConstantsLayout* sc, const char* type, const char* name, const char* defaultVal, const char* LFending) { - VkFFTResult res = VKFFT_SUCCESS; -#if((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sc->tempLen = sprintf(sc->tempStr, "__constant %s %s = %s%s;\n", type, name, defaultVal, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#else - sc->tempLen = sprintf(sc->tempStr, "const %s %s = %s%s;\n", type, name, defaultVal, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#endif - return res; -} -static inline VkFFTResult appendPushConstant(VkFFTSpecializationConstantsLayout* sc, const char* type, const char* name) { - VkFFTResult res = VKFFT_SUCCESS; - sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", type, name); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - return res; -} -static inline VkFFTResult 
appendBarrierVkFFT(VkFFTSpecializationConstantsLayout* sc, uint64_t numTab) { - VkFFTResult res = VKFFT_SUCCESS; - char tabs[100]; - for (uint64_t i = 0; i < numTab; i++) - sprintf(tabs, " "); -#if(VKFFT_BACKEND==0) - sc->tempLen = sprintf(sc->tempStr, "%sbarrier();\n\n", tabs); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#elif(VKFFT_BACKEND==1) - sc->tempLen = sprintf(sc->tempStr, "%s__syncthreads();\n\n", tabs); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#elif(VKFFT_BACKEND==2) - sc->tempLen = sprintf(sc->tempStr, "%s__syncthreads();\n\n", tabs); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sc->tempLen = sprintf(sc->tempStr, "%sbarrier(CLK_LOCAL_MEM_FENCE);\n\n", tabs); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#endif - return res; -} -static inline VkFFTResult appendPushConstantsVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType) { - VkFFTResult res = VKFFT_SUCCESS; - if (sc->pushConstantsStructSize == 0) - return res; -#if(VKFFT_BACKEND==0) - sc->tempLen = sprintf(sc->tempStr, "layout(push_constant) uniform PushConsts\n{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#elif(VKFFT_BACKEND==1) - sc->tempLen = sprintf(sc->tempStr, " typedef struct {\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#elif(VKFFT_BACKEND==2) - sc->tempLen = sprintf(sc->tempStr, " typedef struct {\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sc->tempLen = sprintf(sc->tempStr, " typedef struct {\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#endif - if (sc->performWorkGroupShift[0]) { - res = appendPushConstant(sc, uintType, "workGroupShiftX"); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->performWorkGroupShift[1]) { - res = appendPushConstant(sc, 
uintType, "workGroupShiftY"); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->performWorkGroupShift[2]) { - res = appendPushConstant(sc, uintType, "workGroupShiftZ"); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->performPostCompilationInputOffset) { - res = appendPushConstant(sc, uintType, "inputOffset"); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->performPostCompilationOutputOffset) { - res = appendPushConstant(sc, uintType, "outputOffset"); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->performPostCompilationKernelOffset) { - res = appendPushConstant(sc, uintType, "kernelOffset"); - if (res != VKFFT_SUCCESS) return res; - } -#if(VKFFT_BACKEND==0) - sc->tempLen = sprintf(sc->tempStr, "} consts;\n\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#elif(VKFFT_BACKEND==1) - sc->tempLen = sprintf(sc->tempStr, " }PushConsts;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " __constant__ PushConsts consts;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#elif(VKFFT_BACKEND==2) - sc->tempLen = sprintf(sc->tempStr, " }PushConsts;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " __constant__ PushConsts consts;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sc->tempLen = sprintf(sc->tempStr, " }PushConsts;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#endif - return res; -} -static inline VkFFTResult appendConstantsVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType) { - VkFFTResult res = VKFFT_SUCCESS; - char LFending[4] = ""; - char uintType_32[30]; - if (!strcmp(floatType, "float")) sprintf(LFending, "f"); -#if(VKFFT_BACKEND==0) - if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); - sprintf(uintType_32, "uint"); 
-#elif(VKFFT_BACKEND==1) - if (!strcmp(floatType, "double")) sprintf(LFending, "l"); - sprintf(uintType_32, "unsigned int"); -#elif(VKFFT_BACKEND==2) - if (!strcmp(floatType, "double")) sprintf(LFending, "l"); - sprintf(uintType_32, "unsigned int"); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sprintf(uintType_32, "unsigned int"); - //if (!strcmp(floatType, "double")) sprintf(LFending, "l"); -#endif - - res = appendConstant(sc, floatType, "loc_PI", "3.1415926535897932384626433832795", LFending); - if (res != VKFFT_SUCCESS) return res; - res = appendConstant(sc, floatType, "loc_SQRT1_2", "0.70710678118654752440084436210485", LFending); - if (res != VKFFT_SUCCESS) return res; - if (sc->useRader) { - for (uint64_t i = 0; i < sc->numRaderPrimes; i++) { - if (sc->raderContainer[i].prime > 0) { - if (sc->inline_rader_g_pow == 1) { - uint64_t g_pow = 1; -#if((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sc->tempLen = sprintf(sc->tempStr, "__constant %s g_pow_%" PRIu64 "[%" PRIu64 "]= {1", uintType_32, sc->raderContainer[i].prime, sc->raderContainer[i].prime); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#else - sc->tempLen = sprintf(sc->tempStr, "const %s g_pow_%" PRIu64 "[%" PRIu64 "]= {1", uintType_32, sc->raderContainer[i].prime, sc->raderContainer[i].prime); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#endif - for (uint64_t t = 0; t < sc->raderContainer[i].prime - 1; t++) { - g_pow = (g_pow * sc->raderContainer[i].generator) % sc->raderContainer[i].prime; - sc->tempLen = sprintf(sc->tempStr, ", %" PRIu64 "", g_pow); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "};\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->inline_rader_kernel) { -#if((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sc->tempLen = sprintf(sc->tempStr, "__constant %s r_rader_kernel_%" PRIu64 "[%" PRIu64 "]= {", floatType, sc->raderContainer[i].prime, 
sc->raderContainer[i].prime - 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#else - sc->tempLen = sprintf(sc->tempStr, "const %s r_rader_kernel_%" PRIu64 "[%" PRIu64 "]= {", floatType, sc->raderContainer[i].prime, sc->raderContainer[i].prime - 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#endif - for (uint64_t j = 0; j < (sc->raderContainer[i].prime - 1); j++) {//fix later - if (!strcmp(floatType, "double")) { - double* raderFFTKernel = (double*)sc->raderContainer[i].raderFFTkernel; - sc->tempLen = sprintf(sc->tempStr, "%.17e%s ", raderFFTKernel[2 * j] / (sc->raderContainer[i].prime - 1), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (!strcmp(floatType, "float")) { - float* raderFFTKernel = (float*)sc->raderContainer[i].raderFFTkernel; - sc->tempLen = sprintf(sc->tempStr, "%.8e%s ", raderFFTKernel[2 * j] / (sc->raderContainer[i].prime - 1), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (j < (sc->raderContainer[i].prime - 2)) { - sc->tempLen = sprintf(sc->tempStr, ", "); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "};\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } -#if((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sc->tempLen = sprintf(sc->tempStr, "__constant %s i_rader_kernel_%" PRIu64 "[%" PRIu64 "]= {", floatType, sc->raderContainer[i].prime, sc->raderContainer[i].prime - 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#else - sc->tempLen = sprintf(sc->tempStr, "const %s i_rader_kernel_%" PRIu64 "[%" PRIu64 "]= {", floatType, sc->raderContainer[i].prime, sc->raderContainer[i].prime - 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#endif - for (uint64_t j = 0; j < (sc->raderContainer[i].prime - 1); j++) {//fix later - if (!strcmp(floatType, "double")) { - double* raderFFTKernel = 
(double*)sc->raderContainer[i].raderFFTkernel; - sc->tempLen = sprintf(sc->tempStr, "%.17e%s ", raderFFTKernel[2 * j + 1] / (sc->raderContainer[i].prime - 1), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (!strcmp(floatType, "float")) { - float* raderFFTKernel = (float*)sc->raderContainer[i].raderFFTkernel; - sc->tempLen = sprintf(sc->tempStr, "%.8e%s ", raderFFTKernel[2 * j + 1] / (sc->raderContainer[i].prime - 1), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - if (j < (sc->raderContainer[i].prime - 2)) { - sc->tempLen = sprintf(sc->tempStr, ", "); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "};\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - } - } - return res; -} -static inline VkFFTResult appendSinCos20(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType) { - VkFFTResult res = VKFFT_SUCCESS; - char functionDefinitions[100] = ""; - char vecType[30]; - char LFending[4] = ""; - if (!strcmp(floatType, "float")) sprintf(LFending, "f"); -#if(VKFFT_BACKEND==0) - if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2"); - if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); - if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); -#elif(VKFFT_BACKEND==1) - if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2"); - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - if (!strcmp(floatType, "double")) sprintf(LFending, "l"); - sprintf(functionDefinitions, "__device__ static __inline__ "); -#elif(VKFFT_BACKEND==2) - if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2"); - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); 
- if (!strcmp(floatType, "double")) sprintf(LFending, "l"); - sprintf(functionDefinitions, "__device__ static __inline__ "); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2"); - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - //if (!strcmp(floatType, "double")) sprintf(LFending, "l"); - sprintf(functionDefinitions, "static __inline__ "); -#endif - res = appendConstant(sc, floatType, "loc_2_PI", "0.63661977236758134307553505349006", LFending); - if (res != VKFFT_SUCCESS) return res; - res = appendConstant(sc, floatType, "loc_PI_2", "1.5707963267948966192313216916398", LFending); - if (res != VKFFT_SUCCESS) return res; - res = appendConstant(sc, floatType, "a1", "0.99999999999999999999962122687403772", LFending); - if (res != VKFFT_SUCCESS) return res; - res = appendConstant(sc, floatType, "a3", "-0.166666666666666666637194166219637268", LFending); - if (res != VKFFT_SUCCESS) return res; - res = appendConstant(sc, floatType, "a5", "0.00833333333333333295212653322266277182", LFending); - if (res != VKFFT_SUCCESS) return res; - res = appendConstant(sc, floatType, "a7", "-0.000198412698412696489459896530659927773", LFending); - if (res != VKFFT_SUCCESS) return res; - res = appendConstant(sc, floatType, "a9", "2.75573192239364018847578909205399262e-6", LFending); - if (res != VKFFT_SUCCESS) return res; - res = appendConstant(sc, floatType, "a11", "-2.50521083781017605729370231280411712e-8", LFending); - if (res != VKFFT_SUCCESS) return res; - res = appendConstant(sc, floatType, "a13", "1.60590431721336942356660057796782021e-10", LFending); - if (res != VKFFT_SUCCESS) return res; - res = appendConstant(sc, floatType, "a15", "-7.64712637907716970380859898835680587e-13", LFending); - if (res != VKFFT_SUCCESS) return res; - res = appendConstant(sc, floatType, "a17", "2.81018528153898622636194976499656274e-15", LFending); - if (res != 
VKFFT_SUCCESS) return res; - res = appendConstant(sc, floatType, "ab", "-7.97989713648499642889739108679114937e-18", LFending); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ -%s%s sincos_20(double x)\n\ -{\n\ - //minimax coefs for sin for 0..pi/2 range\n\ - double y = abs(x * loc_2_PI);\n\ - double q = floor(y);\n\ - int quadrant = int(q);\n\ - double t = (quadrant & 1) != 0 ? 1 - y + q : y - q;\n\ - t *= loc_PI_2;\n\ - double t2 = t * t;\n\ - double r = fma(fma(fma(fma(fma(fma(fma(fma(fma(ab, t2, a17), t2, a15), t2, a13), t2, a11), t2, a9), t2, a7), t2, a5), t2, a3), t2 * t, t);\n\ - %s cos_sin;\n\ - cos_sin.x = ((quadrant == 0) || (quadrant == 3)) ? sqrt(1 - r * r) : -sqrt(1 - r * r);\n\ - r = x < 0 ? -r : r;\n\ - cos_sin.y = (quadrant & 2) != 0 ? -r : r;\n\ - return cos_sin;\n\ -}\n\n", functionDefinitions, vecType, vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - return res; -} -static inline VkFFTResult appendConversion(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeDifferent) { - VkFFTResult res = VKFFT_SUCCESS; -#if(VKFFT_BACKEND!=0) - char functionDefinitions[100] = ""; - char vecType[30]; - char vecTypeDifferent[30]; -#endif -#if(VKFFT_BACKEND==0) -#elif(VKFFT_BACKEND==1) - sprintf(functionDefinitions, "__device__ static __inline__ "); -#elif(VKFFT_BACKEND==2) - sprintf(functionDefinitions, "__device__ static __inline__ "); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sprintf(functionDefinitions, "static __inline__ "); -#endif -#if(VKFFT_BACKEND!=0) - if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2"); - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - if (!strcmp(floatTypeDifferent, "half")) sprintf(vecTypeDifferent, "f16vec2"); - if (!strcmp(floatTypeDifferent, "float")) sprintf(vecTypeDifferent, "float2"); - if (!strcmp(floatTypeDifferent, "double")) 
sprintf(vecTypeDifferent, "double2"); - sc->tempLen = sprintf(sc->tempStr, "\ -%s%s conv_%s(%s input)\n\ -{\n\ - %s ret_val;\n\ - ret_val.x = (%s) input.x;\n\ - ret_val.y = (%s) input.y;\n\ - return ret_val;\n\ -}\n\n", functionDefinitions, vecType, vecType, vecTypeDifferent, vecType, floatType, floatType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ -%s%s conv_%s(%s input)\n\ -{\n\ - %s ret_val;\n\ - ret_val.x = (%s) input.x;\n\ - ret_val.y = (%s) input.y;\n\ - return ret_val;\n\ -}\n\n", functionDefinitions, vecTypeDifferent, vecTypeDifferent, vecType, vecTypeDifferent, floatTypeDifferent, floatTypeDifferent); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#endif - return res; -} -static inline VkFFTResult appendInputLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc, uint64_t id, const char* floatTypeMemory, uint64_t inputType) { - VkFFTResult res = VKFFT_SUCCESS; - char vecType[30]; - switch (inputType) { - case 0: case 1: case 2: case 3: case 4: case 6: { -#if(VKFFT_BACKEND==0) - if (!strcmp(floatTypeMemory, "half")) { - sc->inputNumberByteSize = 2 * 2; - sprintf(vecType, "f16vec2"); - } - if (!strcmp(floatTypeMemory, "float")) { - sc->inputNumberByteSize = 2 * sizeof(float); - sprintf(vecType, "vec2"); - } - if (!strcmp(floatTypeMemory, "double")) { - sc->inputNumberByteSize = 2 * sizeof(double); - sprintf(vecType, "dvec2"); - } - if (sc->inputBufferBlockNum == 1) { - sc->tempLen = sprintf(sc->tempStr, "\ -layout(std430, binding = %" PRIu64 ") buffer DataIn{\n\ - %s inputs[%" PRIu64 "];\n\ -};\n\n", id, vecType, sc->inputBufferBlockSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ -layout(std430, binding = %" PRIu64 ") buffer DataIn{\n\ - %s inputs[%" PRIu64 "];\n\ -} inputBlocks[%" PRIu64 "];\n\n", id, vecType, sc->inputBufferBlockSize, sc->inputBufferBlockNum); - res = VkAppendLine(sc); - if (res != 
VKFFT_SUCCESS) return res; - } -#elif(VKFFT_BACKEND==1) - if (!strcmp(floatTypeMemory, "half")) { - sc->inputNumberByteSize = 2 * 2; - sprintf(vecType, "f16vec2"); - } - if (!strcmp(floatTypeMemory, "float")) { - sc->inputNumberByteSize = 2 * sizeof(float); - sprintf(vecType, "float2"); - } - if (!strcmp(floatTypeMemory, "double")) { - sc->inputNumberByteSize = 2 * sizeof(double); - sprintf(vecType, "double2"); - } -#elif(VKFFT_BACKEND==2) - if (!strcmp(floatTypeMemory, "half")) { - sc->inputNumberByteSize = 2 * 2; - sprintf(vecType, "f16vec2"); - } - if (!strcmp(floatTypeMemory, "float")) { - sc->inputNumberByteSize = 2 * sizeof(float); - sprintf(vecType, "float2"); - } - if (!strcmp(floatTypeMemory, "double")) { - sc->inputNumberByteSize = 2 * sizeof(double); - sprintf(vecType, "double2"); - } -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - if (!strcmp(floatTypeMemory, "half")) { - sc->inputNumberByteSize = 2 * 2; - sprintf(vecType, "f16vec2"); - } - if (!strcmp(floatTypeMemory, "float")) { - sc->inputNumberByteSize = 2 * sizeof(float); - sprintf(vecType, "float2"); - } - if (!strcmp(floatTypeMemory, "double")) { - sc->inputNumberByteSize = 2 * sizeof(double); - sprintf(vecType, "double2"); - } -#endif - break; - } - case 5: case 110: case 111: case 120: case 121: case 130: case 131: case 140: case 141: case 142: case 143: case 144: case 145: - { - if (!strcmp(floatTypeMemory, "half")) { - sc->inputNumberByteSize = 2; - sprintf(vecType, "float16_t"); - } - if (!strcmp(floatTypeMemory, "float")) { - sc->inputNumberByteSize = sizeof(float); - sprintf(vecType, "float"); - } - if (!strcmp(floatTypeMemory, "double")) { - sc->inputNumberByteSize = sizeof(double); - sprintf(vecType, "double"); - } -#if(VKFFT_BACKEND==0) - if (sc->inputBufferBlockNum == 1) { - sc->tempLen = sprintf(sc->tempStr, "\ -layout(std430, binding = %" PRIu64 ") buffer DataIn{\n\ - %s inputs[%" PRIu64 "];\n\ -};\n\n", id, vecType, 2 * sc->inputBufferBlockSize); - res = VkAppendLine(sc); - if (res 
!= VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ -layout(std430, binding = %" PRIu64 ") buffer DataIn{\n\ - %s inputs[%" PRIu64 "];\n\ -} inputBlocks[%" PRIu64 "];\n\n", id, vecType, 2 * sc->inputBufferBlockSize, sc->inputBufferBlockNum); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } -#endif - break; - } - } - return res; -} -static inline VkFFTResult appendOutputLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc, uint64_t id, const char* floatTypeMemory, uint64_t outputType) { - VkFFTResult res = VKFFT_SUCCESS; - char vecType[30]; - switch (outputType) { - case 0: case 1: case 2: case 3: case 4: case 5: { -#if(VKFFT_BACKEND==0) - if (!strcmp(floatTypeMemory, "half")) { - sc->outputNumberByteSize = 2 * 2; - sprintf(vecType, "f16vec2"); - } - if (!strcmp(floatTypeMemory, "float")) { - sc->outputNumberByteSize = 2 * sizeof(float); - sprintf(vecType, "vec2"); - } - if (!strcmp(floatTypeMemory, "double")) { - sc->outputNumberByteSize = 2 * sizeof(double); - sprintf(vecType, "dvec2"); - } - if (sc->outputBufferBlockNum == 1) { - sc->tempLen = sprintf(sc->tempStr, "\ -layout(std430, binding = %" PRIu64 ") buffer DataOut{\n\ - %s outputs[%" PRIu64 "];\n\ -};\n\n", id, vecType, sc->outputBufferBlockSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ -layout(std430, binding = %" PRIu64 ") buffer DataOut{\n\ - %s outputs[%" PRIu64 "];\n\ -} outputBlocks[%" PRIu64 "];\n\n", id, vecType, sc->outputBufferBlockSize, sc->outputBufferBlockNum); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } -#elif(VKFFT_BACKEND==1) - if (!strcmp(floatTypeMemory, "half")) { - sc->outputNumberByteSize = 2 * 2; - sprintf(vecType, "f16vec2"); - } - if (!strcmp(floatTypeMemory, "float")) { - sc->outputNumberByteSize = 2 * sizeof(float); - sprintf(vecType, "float2"); - } - if (!strcmp(floatTypeMemory, "double")) { - sc->outputNumberByteSize = 2 * 
sizeof(double); - sprintf(vecType, "double2"); - } -#elif(VKFFT_BACKEND==2) - if (!strcmp(floatTypeMemory, "half")) { - sc->outputNumberByteSize = 2 * 2; - sprintf(vecType, "f16vec2"); - } - if (!strcmp(floatTypeMemory, "float")) { - sc->outputNumberByteSize = 2 * sizeof(float); - sprintf(vecType, "float2"); - } - if (!strcmp(floatTypeMemory, "double")) { - sc->outputNumberByteSize = 2 * sizeof(double); - sprintf(vecType, "double2"); - } -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - if (!strcmp(floatTypeMemory, "half")) { - sc->outputNumberByteSize = 2 * 2; - sprintf(vecType, "f16vec2"); - } - if (!strcmp(floatTypeMemory, "float")) { - sc->outputNumberByteSize = 2 * sizeof(float); - sprintf(vecType, "float2"); - } - if (!strcmp(floatTypeMemory, "double")) { - sc->outputNumberByteSize = 2 * sizeof(double); - sprintf(vecType, "double2"); - } -#endif - break; - } - case 6: case 110: case 111: case 120: case 121: case 130: case 131: case 140: case 141: case 142: case 143: case 144: case 145: - { - if (!strcmp(floatTypeMemory, "half")) { - sc->outputNumberByteSize = 2; - sprintf(vecType, "float16_t"); - } - if (!strcmp(floatTypeMemory, "float")) { - sc->outputNumberByteSize = sizeof(float); - sprintf(vecType, "float"); - } - if (!strcmp(floatTypeMemory, "double")) { - sc->outputNumberByteSize = sizeof(double); - sprintf(vecType, "double"); - } -#if(VKFFT_BACKEND==0) - if (sc->outputBufferBlockNum == 1) { - sc->tempLen = sprintf(sc->tempStr, "\ -layout(std430, binding = %" PRIu64 ") buffer DataOut{\n\ - %s outputs[%" PRIu64 "];\n\ -};\n\n", id, vecType, 2 * sc->outputBufferBlockSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ -layout(std430, binding = %" PRIu64 ") buffer DataOut{\n\ - %s outputs[%" PRIu64 "];\n\ -} outputBlocks[%" PRIu64 "];\n\n", id, vecType, 2 * sc->outputBufferBlockSize, sc->outputBufferBlockNum); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } 
-#endif - break; - } - } - return res; -} -static inline VkFFTResult appendKernelLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc, uint64_t id, const char* floatTypeMemory) { - VkFFTResult res = VKFFT_SUCCESS; - char vecType[30]; -#if(VKFFT_BACKEND==0) - if (!strcmp(floatTypeMemory, "half")) { - sc->kernelNumberByteSize = 2 * 2; - sprintf(vecType, "f16vec2"); - } - if (!strcmp(floatTypeMemory, "float")) { - sc->kernelNumberByteSize = 2 * sizeof(float); - sprintf(vecType, "vec2"); - } - if (!strcmp(floatTypeMemory, "double")) { - sc->kernelNumberByteSize = 2 * sizeof(double); - sprintf(vecType, "dvec2"); - } - if (sc->kernelBlockNum == 1) { - sc->tempLen = sprintf(sc->tempStr, "\ -layout(std430, binding = %" PRIu64 ") buffer Kernel_FFT{\n\ - %s kernel_obj[%" PRIu64 "];\n\ -};\n\n", id, vecType, sc->kernelBlockSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ -layout(std430, binding = %" PRIu64 ") buffer Kernel_FFT{\n\ - %s kernel_obj[%" PRIu64 "];\n\ -} kernelBlocks[%" PRIu64 "];\n\n", id, vecType, sc->kernelBlockSize, sc->kernelBlockNum); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } -#elif(VKFFT_BACKEND==1) - if (!strcmp(floatTypeMemory, "half")) { - sc->kernelNumberByteSize = 2 * 2; - sprintf(vecType, "f16vec2"); - } - if (!strcmp(floatTypeMemory, "float")) { - sc->kernelNumberByteSize = 2 * sizeof(float); - sprintf(vecType, "float2"); - } - if (!strcmp(floatTypeMemory, "double")) { - sc->kernelNumberByteSize = 2 * sizeof(double); - sprintf(vecType, "double2"); - } -#elif(VKFFT_BACKEND==2) - if (!strcmp(floatTypeMemory, "half")) { - sc->kernelNumberByteSize = 2 * 2; - sprintf(vecType, "f16vec2"); - } - if (!strcmp(floatTypeMemory, "float")) { - sc->kernelNumberByteSize = 2 * sizeof(float); - sprintf(vecType, "float2"); - } - if (!strcmp(floatTypeMemory, "double")) { - sc->kernelNumberByteSize = 2 * sizeof(double); - sprintf(vecType, "double2"); - } 
-#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - if (!strcmp(floatTypeMemory, "half")) { - sc->kernelNumberByteSize = 2 * 2; - sprintf(vecType, "f16vec2"); - } - if (!strcmp(floatTypeMemory, "float")) { - sc->kernelNumberByteSize = 2 * sizeof(float); - sprintf(vecType, "float2"); - } - if (!strcmp(floatTypeMemory, "double")) { - sc->kernelNumberByteSize = 2 * sizeof(double); - sprintf(vecType, "double2"); - } -#endif - return res; -} -static inline VkFFTResult appendLUTLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc, uint64_t id, const char* floatType) { - VkFFTResult res = VKFFT_SUCCESS; - char vecType[30]; -#if(VKFFT_BACKEND==0) - if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); - sc->tempLen = sprintf(sc->tempStr, "\ -layout(std430, binding = %" PRIu64 ") readonly buffer DataLUT {\n\ -%s twiddleLUT[];\n\ -};\n", id, vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#elif(VKFFT_BACKEND==1) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); -#elif(VKFFT_BACKEND==2) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); -#endif - return res; -} -static inline VkFFTResult appendRaderUintLUTLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc, uint64_t id) { - VkFFTResult res = VKFFT_SUCCESS; - char uintType_32[30]; -#if(VKFFT_BACKEND==0) - sprintf(uintType_32, "uint"); - sc->tempLen = sprintf(sc->tempStr, "\ -layout(std430, binding = %" PRIu64 ") readonly buffer DataRaderUintLUT {\n\ -%s g_pow[];\n\ -};\n", id, uintType_32); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#elif(VKFFT_BACKEND==1) - sprintf(uintType_32, "unsigned 
int"); -#elif(VKFFT_BACKEND==2) - sprintf(uintType_32, "unsigned int"); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sprintf(uintType_32, "unsigned int"); -#endif - return res; -} -static inline VkFFTResult appendBluesteinLayoutVkFFT(VkFFTSpecializationConstantsLayout* sc, uint64_t id, const char* floatType) { - VkFFTResult res = VKFFT_SUCCESS; - char vecType[30]; -#if(VKFFT_BACKEND==0) - uint64_t loc_id = id; - if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); - if (sc->BluesteinConvolutionStep) { - sc->tempLen = sprintf(sc->tempStr, "\ -layout(std430, binding = %" PRIu64 ") readonly buffer DataBluesteinConvolutionKernel {\n\ -%s BluesteinConvolutionKernel[];\n\ -};\n", loc_id, vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - loc_id++; - } - if (sc->BluesteinPreMultiplication || sc->BluesteinPostMultiplication) { - sc->tempLen = sprintf(sc->tempStr, "\ -layout(std430, binding = %" PRIu64 ") readonly buffer DataBluesteinMultiplication {\n\ -%s BluesteinMultiplication[];\n\ -};\n", loc_id, vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - loc_id++; - } -#elif(VKFFT_BACKEND==1) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); -#elif(VKFFT_BACKEND==2) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); -#endif - return res; -} -static inline VkFFTResult indexInputVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* uintType, uint64_t inputType, const char* index_x, const char* index_y, const char* coordinate, const char* batchID) { - VkFFTResult res = VKFFT_SUCCESS; - switch (inputType % 1000) { - case 0: 
case 2: case 3: case 4:case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: {//single_c2c + single_c2c_strided - char inputOffset[30] = ""; - if (sc->inputOffset > 0) { - sprintf(inputOffset, "%" PRIu64 " + ", sc->inputOffset / sc->inputNumberByteSize); - } - else { - if (sc->performPostCompilationInputOffset) { - if (inputType < 1000) - sprintf(inputOffset, "consts.inputOffset + "); - else - sprintf(inputOffset, "consts.kernelOffset + "); - } - } - char shiftX[500] = ""; - if (sc->inputStride[0] == 1) - sprintf(shiftX, "(%s)", index_x); - else - sprintf(shiftX, "(%s) * %" PRIu64 "", index_x, sc->inputStride[0]); - char shiftY[500] = ""; - uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; - if (sc->size[1] > 1) { - if (sc->numAxisUploads == 1) { - if (sc->axisSwapped) { - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + (%s + consts.workGroupShiftY) * %" PRIu64 "", sc->gl_WorkGroupID_y, mult * sc->localSize[0] * sc->inputStride[1]); - else - sprintf(shiftY, " + %s * %" PRIu64 "", sc->gl_WorkGroupID_y, mult * sc->localSize[0] * sc->inputStride[1]); - } - else { - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + (%s + consts.workGroupShiftY) * %" PRIu64 "", sc->gl_WorkGroupID_y, mult * sc->localSize[1] * sc->inputStride[1]); - else - sprintf(shiftY, " + %s * %" PRIu64 "", sc->gl_WorkGroupID_y, mult * sc->localSize[1] * sc->inputStride[1]); - } - } - else { - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + (%s + consts.workGroupShiftY) * %" PRIu64 "", sc->gl_WorkGroupID_y, sc->inputStride[1]); - else - sprintf(shiftY, " + %s * %" PRIu64 "", sc->gl_WorkGroupID_y, sc->inputStride[1]); - } - } - char shiftZ[500] = ""; - if (sc->size[2] > 1) { - if (sc->numCoordinates * sc->matrixConvolution * sc->numBatches > 1) { - if (sc->performWorkGroupShift[2]) - sprintf(shiftZ, " + ((%s + consts.workGroupShiftZ * %s) %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->dispatchZactualFFTSize, 
sc->inputStride[2]); - else - sprintf(shiftZ, " + (%s %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, sc->inputStride[2]); - } - else { - if (sc->performWorkGroupShift[2]) - sprintf(shiftZ, " + (%s + consts.workGroupShiftZ * %s) * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->inputStride[2]); - else - sprintf(shiftZ, " + %s * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->inputStride[2]); - } - } - char shiftCoordinate[500] = ""; - uint64_t maxCoordinate = sc->numCoordinates * sc->matrixConvolution; - if (sc->numCoordinates * sc->matrixConvolution > 1) { - sprintf(shiftCoordinate, " + ((%s / %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, maxCoordinate, sc->inputStride[3]); - } - if ((sc->matrixConvolution > 1) && (sc->convolutionStep)) { - maxCoordinate = 1; - sprintf(shiftCoordinate, " + %s * %" PRIu64 "", coordinate, sc->inputStride[3]); - } - char shiftBatch[500] = ""; - if ((sc->numBatches > 1) || (sc->numKernels > 1)) { - if (sc->convolutionStep && (sc->numKernels > 1)) { - sprintf(shiftBatch, " + %s * %" PRIu64 "", batchID, sc->inputStride[4]); - } - else - sprintf(shiftBatch, " + (%s / %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize * maxCoordinate, sc->inputStride[4]); - } - sc->tempLen = sprintf(sc->tempStr, "%s%s%s%s%s%s", inputOffset, shiftX, shiftY, shiftZ, shiftCoordinate, shiftBatch); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - break; - } - case 1: case 111: case 121: case 131: case 141: case 143: case 145: {//grouped_c2c - char inputOffset[30] = ""; - if (sc->inputOffset > 0) { - sprintf(inputOffset, "%" PRIu64 " + ", sc->inputOffset / sc->inputNumberByteSize); - } - else { - if (sc->performPostCompilationInputOffset) { - if (inputType < 1000) - sprintf(inputOffset, "consts.inputOffset + "); - else - sprintf(inputOffset, "consts.kernelOffset + "); - } - } - char 
shiftX[500] = ""; - if (sc->inputStride[0] == 1) - sprintf(shiftX, "(%s)", index_x); - else - sprintf(shiftX, "(%s) * %" PRIu64 "", index_x, sc->inputStride[0]); - - char shiftY[500] = ""; - if (index_y) - sprintf(shiftY, " + (%s) * %" PRIu64 "", index_y, sc->inputStride[1]); - - char shiftZ[500] = ""; - if (sc->size[2] > 1) { - if (sc->numCoordinates * sc->matrixConvolution * sc->numBatches > 1) { - if (sc->performWorkGroupShift[2]) - sprintf(shiftZ, " + ((%s + consts.workGroupShiftZ * %s) %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->dispatchZactualFFTSize, sc->inputStride[2]); - else - sprintf(shiftZ, " + (%s %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, sc->inputStride[2]); - } - else { - if (sc->performWorkGroupShift[2]) - sprintf(shiftZ, " + (%s + consts.workGroupShiftZ * %s) * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->inputStride[2]); - else - sprintf(shiftZ, " + %s * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->inputStride[2]); - } - } - char shiftCoordinate[500] = ""; - uint64_t maxCoordinate = sc->numCoordinates * sc->matrixConvolution; - if (sc->numCoordinates * sc->matrixConvolution > 1) { - sprintf(shiftCoordinate, " + ((%s / %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, maxCoordinate, sc->inputStride[3]); - } - if ((sc->matrixConvolution > 1) && (sc->convolutionStep)) { - maxCoordinate = 1; - sprintf(shiftCoordinate, " + %s * %" PRIu64 "", coordinate, sc->inputStride[3]); - } - char shiftBatch[500] = ""; - if ((sc->numBatches > 1) || (sc->numKernels > 1)) { - if (sc->convolutionStep && (sc->numKernels > 1)) { - sprintf(shiftBatch, " + %s * %" PRIu64 "", batchID, sc->inputStride[4]); - } - else - sprintf(shiftBatch, " + (%s / %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize * maxCoordinate, sc->inputStride[4]); - } - sc->tempLen = 
sprintf(sc->tempStr, "%s%s%s%s%s%s", inputOffset, shiftX, shiftY, shiftZ, shiftCoordinate, shiftBatch); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - break; - } - } - return res; -} -static inline VkFFTResult indexOutputVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* uintType, uint64_t outputType, const char* index_x, const char* index_y, const char* coordinate, const char* batchID) { - VkFFTResult res = VKFFT_SUCCESS; - switch (outputType % 1000) {//single_c2c + single_c2c_strided - case 0: case 2: case 3: case 4: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: { - char outputOffset[30] = ""; - if (sc->outputOffset > 0) { - sprintf(outputOffset, "%" PRIu64 " + ", sc->outputOffset / sc->outputNumberByteSize); - } - else { - if (sc->performPostCompilationOutputOffset) { - if (outputType < 1000) - sprintf(outputOffset, "consts.outputOffset + "); - else - sprintf(outputOffset, "consts.kernelOffset + "); - } - } - char shiftX[500] = ""; - if (sc->numAxisUploads == 1) - sprintf(shiftX, "(%s)", index_x); - else - sprintf(shiftX, "(%s) * %" PRIu64 "", index_x, sc->outputStride[0]); - char shiftY[500] = ""; - uint64_t mult = (sc->mergeSequencesR2C) ? 
2 : 1; - if (sc->size[1] > 1) { - if (sc->numAxisUploads == 1) { - if (sc->axisSwapped) { - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + (%s + consts.workGroupShiftY) * %" PRIu64 "", sc->gl_WorkGroupID_y, mult * sc->localSize[0] * sc->outputStride[1]); - else - sprintf(shiftY, " + %s * %" PRIu64 "", sc->gl_WorkGroupID_y, mult * sc->localSize[0] * sc->outputStride[1]); - } - else { - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + (%s + consts.workGroupShiftY) * %" PRIu64 "", sc->gl_WorkGroupID_y, mult * sc->localSize[1] * sc->outputStride[1]); - else - sprintf(shiftY, " + %s * %" PRIu64 "", sc->gl_WorkGroupID_y, mult * sc->localSize[1] * sc->outputStride[1]); - } - } - else { - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + (%s + consts.workGroupShiftY) * %" PRIu64 "", sc->gl_WorkGroupID_y, sc->outputStride[1]); - else - sprintf(shiftY, " + %s * %" PRIu64 "", sc->gl_WorkGroupID_y, sc->outputStride[1]); - } - } - char shiftZ[500] = ""; - if (sc->size[2] > 1) { - if (sc->numCoordinates * sc->matrixConvolution * sc->numBatches > 1) { - if (sc->performWorkGroupShift[2]) - sprintf(shiftZ, " + ((%s + consts.workGroupShiftZ * %s) %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->dispatchZactualFFTSize, sc->outputStride[2]); - else - sprintf(shiftZ, " + (%s %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, sc->outputStride[2]); - } - else { - if (sc->performWorkGroupShift[2]) - sprintf(shiftZ, " + (%s + consts.workGroupShiftZ * %s) * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->outputStride[2]); - else - sprintf(shiftZ, " + %s * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->outputStride[2]); - } - } - char shiftCoordinate[500] = ""; - uint64_t maxCoordinate = sc->numCoordinates * sc->matrixConvolution; - if (sc->numCoordinates * sc->matrixConvolution > 1) { - sprintf(shiftCoordinate, " + ((%s / %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 "", 
sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, maxCoordinate, sc->outputStride[3]); - } - if ((sc->matrixConvolution > 1) && (sc->convolutionStep)) { - maxCoordinate = 1; - sprintf(shiftCoordinate, " + %s * %" PRIu64 "", coordinate, sc->outputStride[3]); - } - char shiftBatch[500] = ""; - if ((sc->numBatches > 1) || (sc->numKernels > 1)) { - if (sc->convolutionStep && (sc->numKernels > 1)) { - sprintf(shiftBatch, " + %s * %" PRIu64 "", batchID, sc->outputStride[4]); - } - else - sprintf(shiftBatch, " + (%s / %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize * maxCoordinate, sc->outputStride[4]); - } - sc->tempLen = sprintf(sc->tempStr, "%s%s%s%s%s%s", outputOffset, shiftX, shiftY, shiftZ, shiftCoordinate, shiftBatch); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - break; - } - case 1: case 111: case 121: case 131: case 141: case 143: case 145: {//grouped_c2c - char outputOffset[30] = ""; - if (sc->outputOffset > 0) { - sprintf(outputOffset, "%" PRIu64 " + ", sc->outputOffset / sc->outputNumberByteSize); - } - else { - if (sc->performPostCompilationOutputOffset) { - if (outputType < 1000) - sprintf(outputOffset, "consts.outputOffset + "); - else - sprintf(outputOffset, "consts.kernelOffset + "); - } - } - char shiftX[500] = ""; - if (sc->numAxisUploads == 1) - sprintf(shiftX, "(%s)", index_x); - else - sprintf(shiftX, "(%s) * %" PRIu64 "", index_x, sc->outputStride[0]); - char shiftY[500] = ""; - if (index_y) - sprintf(shiftY, " + (%s) * %" PRIu64 "", index_y, sc->outputStride[1]); - char shiftZ[500] = ""; - if (sc->size[2] > 1) { - if (sc->numCoordinates * sc->matrixConvolution * sc->numBatches > 1) { - if (sc->performWorkGroupShift[2]) - sprintf(shiftZ, " + ((%s + consts.workGroupShiftZ * %s) %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->dispatchZactualFFTSize, sc->outputStride[2]); - else - sprintf(shiftZ, " + (%s %% %" PRIu64 ") * %" PRIu64 "", 
sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, sc->outputStride[2]); - } - else { - if (sc->performWorkGroupShift[2]) - sprintf(shiftZ, " + (%s + consts.workGroupShiftZ * %s) * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->outputStride[2]); - else - sprintf(shiftZ, " + %s * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->outputStride[2]); - } - } - char shiftCoordinate[500] = ""; - uint64_t maxCoordinate = sc->numCoordinates * sc->matrixConvolution; - if (sc->numCoordinates * sc->matrixConvolution > 1) { - sprintf(shiftCoordinate, " + ((%s / %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize, maxCoordinate, sc->outputStride[3]); - } - if ((sc->matrixConvolution > 1) && (sc->convolutionStep)) { - maxCoordinate = 1; - sprintf(shiftCoordinate, " + %s * %" PRIu64 "", coordinate, sc->outputStride[3]); - } - char shiftBatch[500] = ""; - if ((sc->numBatches > 1) || (sc->numKernels > 1)) { - if (sc->convolutionStep && (sc->numKernels > 1)) { - sprintf(shiftBatch, " + %s * %" PRIu64 "", batchID, sc->outputStride[4]); - } - else - sprintf(shiftBatch, " + (%s / %" PRIu64 ") * %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->dispatchZactualFFTSize * maxCoordinate, sc->outputStride[4]); - } - sc->tempLen = sprintf(sc->tempStr, "%s%s%s%s%s%s", outputOffset, shiftX, shiftY, shiftZ, shiftCoordinate, shiftBatch); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - break; - - } - } - return res; -} - -static inline VkFFTResult inlineRadixKernelVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t radix, uint64_t stageSize, uint64_t stageSizeSum, double stageAngle, char** regID) { - VkFFTResult res = VKFFT_SUCCESS; - double double_PI = 3.1415926535897932384626433832795; - char vecType[30]; - char LFending[4] = ""; - if (!strcmp(floatType, "float")) sprintf(LFending, "f"); -#if(VKFFT_BACKEND==0) - if (!strcmp(floatType, "float")) 
sprintf(vecType, "vec2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); - char cosDef[20] = "cos"; - char sinDef[20] = "sin"; - if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); -#elif(VKFFT_BACKEND==1) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - char cosDef[20] = "__cosf"; - char sinDef[20] = "__sinf"; - if (!strcmp(floatType, "double")) sprintf(LFending, "l"); -#elif(VKFFT_BACKEND==2) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - char cosDef[20] = "__cosf"; - char sinDef[20] = "__sinf"; - if (!strcmp(floatType, "double")) sprintf(LFending, "l"); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - char cosDef[20] = "native_cos"; - char sinDef[20] = "native_sin"; - //if (!strcmp(floatType, "double")) sprintf(LFending, "l"); -#endif - char* temp = sc->temp; - //sprintf(temp, "loc_0"); - char* w = sc->w; - //sprintf(w, "w"); - char* iw = sc->iw; - //sprintf(iw, "iw"); - char convolutionInverse[30] = ""; - if (sc->convolutionStep) sprintf(convolutionInverse, ", %s inverse", uintType); - switch (radix) { - case 2: { - /*if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, "void radix2(inout %s temp_0, inout %s temp_1, %s LUTId) {\n", vecType, vecType, uintType); - } - else { - sc->tempLen = sprintf(sc->tempStr, "void radix2(inout %s temp_0, inout %s temp_1, %s angle) {\n", vecType, vecType, floatType); - }*/ - /*VkAppendLine(sc, " {\n"); - sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, temp); - res = VkAppendLine(sc); -if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " {\n\ - %s temp;\n", vecType);*/ - if (stageSize == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); - res = VkAppendLine(sc); - if (res != 
VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle);\n", w, cosDef); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle);\n", w, sinDef); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle);\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - res = VkMulComplex(sc, temp, regID[1], w, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[1], regID[0], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[0], regID[0], temp); - if (res != VKFFT_SUCCESS) return res; - /*VkAppendLine(sc, " }\n"); - sc->tempLen = sprintf(sc->tempStr, "\ -temp.x = temp%s.x * w.x - temp%s.y * w.y;\n\ -temp.y = temp%s.y * w.x + temp%s.x * w.y;\n\ -temp%s = temp%s - temp;\n\ -temp%s = temp%s + temp;\n\ -}\n", regID[1], regID[1], regID[1], regID[1], regID[1], regID[0], regID[0], regID[0]);*/ - break; - } - case 3: { - /* if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, "void radix3(inout %s temp_0, inout %s temp_1, inout %s temp_2, %s LUTId) {\n", vecType, vecType, vecType, uintType); - } - else { - sc->tempLen 
= sprintf(sc->tempStr, "void radix3(inout %s temp_0, inout %s temp_1, inout %s temp_2, %s angle) {\n", vecType, vecType, vecType, floatType); - }*/ - char* tf[2]; - //VkAppendLine(sc, " {\n"); - for (uint64_t i = 0; i < 2; i++) { - tf[i] = (char*)malloc(sizeof(char) * 50); - if (!tf[i]) { - for (uint64_t j = 0; j < i; j++) { - free(tf[j]); - tf[j] = 0; - } - return VKFFT_ERROR_MALLOC_FAILED; - } - } - - sprintf(tf[0], "-0.5%s", LFending); - sprintf(tf[1], "-0.8660254037844386467637231707529%s", LFending); - - /*for (uint64_t i = 0; i < 3; i++) { - sc->locID[i] = (char*)malloc(sizeof(char) * 50); - sprintf(sc->locID[i], "loc_%" PRIu64 "", i); - sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, sc->locID[i]); - res = VkAppendLine(sc); -if (res != VKFFT_SUCCESS) return res; - }*/ - if (stageSize == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 4.0 / 3.0, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 4.0 / 3.0, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = 
sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 4.0 / 3.0, 4.0 / 3.0); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 4.0 / 3.0, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - res = VkMulComplex(sc, sc->locID[2], regID[2], w, 0); - /*sc->tempLen = sprintf(sc->tempStr, "\ -loc_2.x = temp%s.x * w.x - temp%s.y * w.y;\n\ -loc_2.y = temp%s.y * w.x + temp%s.x * w.y;\n", regID[2], regID[2], regID[2], regID[2]);*/ - if (stageSize == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n", w, stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n", w, stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 / 3.0, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 / 3.0, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 / 3.0, 2.0 / 3.0); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " 
%s=sincos_20(angle*%.17e%s);\n", w, 2.0 / 3.0, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - res = VkMulComplex(sc, sc->locID[1], regID[1], w, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[1], sc->locID[1], sc->locID[2]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[2], sc->locID[1], sc->locID[2]); - if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ -temp%s = loc_1 + loc_2;\n\ -temp%s = loc_1 - loc_2;\n", regID[1], regID[2]);*/ - res = VkAddComplex(sc, sc->locID[0], regID[0], regID[1]); - if (res != VKFFT_SUCCESS) return res; - res = VkFMAComplex(sc, sc->locID[1], regID[1], tf[0], regID[0]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, sc->locID[2], regID[2], tf[1]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[0], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ -loc_0 = temp%s + temp%s;\n\ -loc_1 = temp%s - 0.5 * temp%s;\n\ -loc_2 = -0.8660254037844386467637231707529 * temp%s;\n\ -temp%s = loc_0;\n", regID[0], regID[1], regID[0], regID[1], regID[2], regID[0]);*/ - - if (stageAngle < 0) - { - res = VkShuffleComplex(sc, regID[1], sc->locID[1], sc->locID[2], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplexInv(sc, regID[2], sc->locID[1], sc->locID[2], 0); - if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ -temp%s.x = loc_1.x - loc_2.y; \n\ -temp%s.y = loc_1.y + loc_2.x; \n\ -temp%s.x = loc_1.x + loc_2.y; \n\ -temp%s.y = loc_1.y - loc_2.x; \n", regID[1], regID[1], regID[2], regID[2]);*/ - } - else { - res = VkShuffleComplexInv(sc, regID[1], sc->locID[1], sc->locID[2], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplex(sc, regID[2], sc->locID[1], sc->locID[2], 0); - if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ -temp%s.x = loc_1.x + loc_2.y; 
\n\ -temp%s.y = loc_1.y - loc_2.x; \n\ -temp%s.x = loc_1.x - loc_2.y; \n\ -temp%s.y = loc_1.y + loc_2.x; \n", regID[1], regID[1], regID[2], regID[2]);*/ - } - - //VkAppendLine(sc, " }\n"); - for (uint64_t i = 0; i < 2; i++) { - free(tf[i]); - tf[i] = 0; - //free(sc->locID[i]); - } - //free(sc->locID[2]); - break; - } - case 4: { - /*if (sc->LUT) - sc->tempLen = sprintf(sc->tempStr, "void radix4(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, %s LUTId%s) {\n", vecType, vecType, vecType, vecType, uintType, convolutionInverse); - else - sc->tempLen = sprintf(sc->tempStr, "void radix4(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, %s angle%s) {\n", vecType, vecType, vecType, vecType, floatType, convolutionInverse); - */ - //VkAppendLine(sc, " {\n"); - //sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, temp); - //res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (stageSize == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle);\n", w, cosDef); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle);\n", w, sinDef); - res = 
VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle);\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - res = VkMulComplex(sc, temp, regID[2], w, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[2], regID[0], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[0], regID[0], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplex(sc, temp, regID[3], w, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[3], regID[1], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[1], regID[1], temp); - if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ -temp.x=temp%s.x*w.x-temp%s.y*w.y;\n\ -temp.y = temp%s.y * w.x + temp%s.x * w.y;\n\ -temp%s = temp%s - temp;\n\ -temp%s = temp%s + temp;\n\n\ -temp.x=temp%s.x*w.x-temp%s.y*w.y;\n\ -temp.y = temp%s.y * w.x + temp%s.x * w.y;\n\ -temp%s = temp%s - temp;\n\ -temp%s = temp%s + temp;\n\n\ -//DIF 2nd stage with angle\n", regID[2], regID[2], regID[2], regID[2], regID[2], regID[0], regID[0], regID[0], regID[3], regID[3], regID[3], regID[3], regID[3], regID[1], regID[1], regID[1]);*/ - if (stageSize == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n", w, stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s=twiddleLUT[LUTId+%" PRIu64 "];\n", w, stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if 
(stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(0.5%s*angle);\n", w, cosDef, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(0.5%s*angle);\n", w, sinDef, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s=normalize(%s + %s(1.0, 0.0));\n", w, w, vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - res = VkMulComplex(sc, temp, regID[1], w, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[1], regID[0], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[0], regID[0], temp); - if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ -temp.x = temp%s.x * w.x - temp%s.y * w.y;\n\ -temp.y = temp%s.y * w.x + temp%s.x * w.y;\n\ -temp%s = temp%s - temp;\n\ -temp%s = temp%s + temp;\n\n", regID[1], regID[1], regID[1], regID[1], regID[1], regID[0], regID[0], regID[0]);*/ - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x;", temp, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", w, temp); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x;", temp, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", w, w); - res = VkAppendLine(sc); - 
if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", w, temp); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(-w.y, w.x);\n\n", vecType); - } - res = VkMulComplex(sc, temp, regID[3], w, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[3], regID[2], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[2], regID[2], temp); - if (res != VKFFT_SUCCESS) return res; - //res = VkMovComplex(sc, temp, regID[1]); - //if (res != VKFFT_SUCCESS) return res; - - uint64_t permute2[4] = { 0,2,1,3 }; - res = VkPermute(sc, permute2, 4, 1, regID, temp); - if (res != VKFFT_SUCCESS) return res; - - /*res = VkMovComplex(sc, regID[1], regID[2]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[2], temp); - if (res != VKFFT_SUCCESS) return res;*/ - /*VkAppendLine(sc, " }\n"); - sc->tempLen = sprintf(sc->tempStr, "\ -temp.x = temp%s.x * w.x - temp%s.y * w.y;\n\ -temp.y = temp%s.y * w.x + temp%s.x * w.y;\n\ -temp%s = temp%s - temp;\n\ -temp%s = temp%s + temp;\n\n\ -temp = temp%s;\n\ -temp%s = temp%s;\n\ -temp%s = temp;\n\ -}\n", regID[3], regID[3], regID[3], regID[3], regID[3], regID[2], regID[2], regID[2], regID[1], regID[1], regID[2], regID[2]);*/ - break; - } - case 5: { - /*if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, "void radix5(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, inout %s temp_4, %s LUTId) {\n", vecType, vecType, vecType, vecType, vecType, uintType); - } - else { - sc->tempLen = sprintf(sc->tempStr, "void radix5(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, inout %s temp_4, %s angle) {\n", vecType, vecType, vecType, vecType, vecType, floatType); - }*/ - char* tf[5]; - //VkAppendLine(sc, " {\n"); - for (uint64_t i = 0; i < 5; i++) { - tf[i] = (char*)malloc(sizeof(char) * 50); - if (!tf[i]) { - for (uint64_t j = 0; j < i; j++) { - free(tf[j]); - 
tf[j] = 0; - } - return VKFFT_ERROR_MALLOC_FAILED; - } - } - sprintf(tf[0], "-0.5%s", LFending); - sprintf(tf[1], "1.538841768587626701285145288018455%s", LFending); - sprintf(tf[2], "-0.363271264002680442947733378740309%s", LFending); - sprintf(tf[3], "-0.809016994374947424102293417182819%s", LFending); - sprintf(tf[4], "-0.587785252292473129168705954639073%s", LFending); - - /*for (uint64_t i = 0; i < 5; i++) { - sc->locID[i] = (char*)malloc(sizeof(char) * 50); - sprintf(sc->locID[i], "loc_%" PRIu64 "", i); - sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, sc->locID[i]); - res = VkAppendLine(sc); -if (res != VKFFT_SUCCESS) return res; - }*/ - /*sc->tempLen = sprintf(sc->tempStr, " {\n\ - %s loc_0;\n %s loc_1;\n %s loc_2;\n %s loc_3;\n %s loc_4;\n", vecType, vecType, vecType, vecType, vecType);*/ - - for (uint64_t i = radix - 1; i > 0; i--) { - if (stageSize == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (i == radix - 1) { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / 
radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - else { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - res = VkMulComplex(sc, sc->locID[i], regID[i], w, 0); - if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ -loc_%" PRIu64 ".x = temp%s.x * w.x - temp%s.y * 
w.y;\n\ -loc_%" PRIu64 ".y = temp%s.y * w.x + temp%s.x * w.y;\n", i, regID[i], regID[i], i, regID[i], regID[i]);*/ - } - res = VkAddComplex(sc, regID[1], sc->locID[1], sc->locID[4]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[2], sc->locID[2], sc->locID[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[3], sc->locID[2], sc->locID[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[4], sc->locID[1], sc->locID[4]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, sc->locID[3], regID[1], regID[2]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[4], regID[3], regID[4]); - if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ -temp%s = loc_1 + loc_4;\n\ -temp%s = loc_2 + loc_3;\n\ -temp%s = loc_2 - loc_3;\n\ -temp%s = loc_1 - loc_4;\n\ -loc_3 = temp%s - temp%s;\n\ -loc_4 = temp%s + temp%s;\n", regID[1], regID[2], regID[3], regID[4], regID[1], regID[2], regID[3], regID[4]);*/ - res = VkAddComplex(sc, sc->locID[0], regID[0], regID[1]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[0], sc->locID[0], regID[2]); - if (res != VKFFT_SUCCESS) return res; - res = VkFMAComplex(sc, sc->locID[1], regID[1], tf[0], regID[0]); - if (res != VKFFT_SUCCESS) return res; - res = VkFMAComplex(sc, sc->locID[2], regID[2], tf[0], regID[0]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, regID[3], regID[3], tf[1]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, regID[4], regID[4], tf[2]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, sc->locID[3], sc->locID[3], tf[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, sc->locID[4], sc->locID[4], tf[4]); - if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ -loc_0 = temp%s + temp%s + temp%s;\n\ -loc_1 = temp%s - 0.5 * temp%s;\n\ -loc_2 = temp%s - 0.5 * 
temp%s;\n\ -temp%s *= 1.538841768587626701285145288018455;\n\ -temp%s *= -0.363271264002680442947733378740309;\n\ -loc_3 *= -0.809016994374947424102293417182819;\n\ -loc_4 *= -0.587785252292473129168705954639073;\n", regID[0], regID[1], regID[2], regID[0], regID[1], regID[0], regID[2], regID[3], regID[4]);*/ - res = VkSubComplex(sc, sc->locID[1], sc->locID[1], sc->locID[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[2], sc->locID[2], sc->locID[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[3], regID[3], sc->locID[4]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[4], sc->locID[4], regID[4]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[0], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ -loc_1 -= loc_3;\n\ -loc_2 += loc_3;\n\ -loc_3 = temp%s+loc_4;\n\ -loc_4 += temp%s;\n\ -temp%s = loc_0;\n", regID[3], regID[4], regID[0]);*/ - - if (stageAngle < 0) - { - res = VkShuffleComplex(sc, regID[1], sc->locID[1], sc->locID[4], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplex(sc, regID[2], sc->locID[2], sc->locID[3], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplexInv(sc, regID[3], sc->locID[2], sc->locID[3], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplexInv(sc, regID[4], sc->locID[1], sc->locID[4], 0); - if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ -temp%s.x = loc_1.x - loc_4.y; \n\ -temp%s.y = loc_1.y + loc_4.x; \n\ -temp%s.x = loc_2.x - loc_3.y; \n\ -temp%s.y = loc_2.y + loc_3.x; \n\ -temp%s.x = loc_2.x + loc_3.y; \n\ -temp%s.y = loc_2.y - loc_3.x; \n\ -temp%s.x = loc_1.x + loc_4.y; \n\ -temp%s.y = loc_1.y - loc_4.x; \n", regID[1], regID[1], regID[2], regID[2], regID[3], regID[3], regID[4], regID[4]);*/ - } - else { - res = VkShuffleComplexInv(sc, regID[1], sc->locID[1], sc->locID[4], 0); - if (res != 
VKFFT_SUCCESS) return res; - res = VkShuffleComplexInv(sc, regID[2], sc->locID[2], sc->locID[3], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplex(sc, regID[3], sc->locID[2], sc->locID[3], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplex(sc, regID[4], sc->locID[1], sc->locID[4], 0); - if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ -temp%s.x = loc_1.x + loc_4.y; \n\ -temp%s.y = loc_1.y - loc_4.x; \n\ -temp%s.x = loc_2.x + loc_3.y; \n\ -temp%s.y = loc_2.y - loc_3.x; \n\ -temp%s.x = loc_2.x - loc_3.y; \n\ -temp%s.y = loc_2.y + loc_3.x; \n\ -temp%s.x = loc_1.x - loc_4.y; \n\ -temp%s.y = loc_1.y + loc_4.x; \n", regID[1], regID[1], regID[2], regID[2], regID[3], regID[3], regID[4], regID[4]);*/ - } - - //VkAppendLine(sc, " }\n"); - for (uint64_t i = 0; i < 5; i++) { - free(tf[i]); - tf[i] = 0; - //free(sc->locID[i]); - } - break; - } - case 6: { - char* tf[2]; - //VkAppendLine(sc, " {\n"); - for (uint64_t i = 0; i < 2; i++) { - tf[i] = (char*)malloc(sizeof(char) * 50); - if (!tf[i]) { - for (uint64_t j = 0; j < i; j++) { - free(tf[j]); - tf[j] = 0; - } - return VKFFT_ERROR_MALLOC_FAILED; - } - } - - sprintf(tf[0], "-0.5%s", LFending); - sprintf(tf[1], "-0.8660254037844386467637231707529%s", LFending); - for (uint64_t i = radix - 1; i > 0; i--) { - if (stageSize == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (i == radix - 1) { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if 
(stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - else { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), 
sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - res = VkMulComplex(sc, regID[i], regID[i], w, temp); - if (res != VKFFT_SUCCESS) return res; - } - //important - //res = VkMovComplex(sc, regID[1], sc->locID[1]); - //if (res != VKFFT_SUCCESS) return res; - - //uint64_t P = 3; - uint64_t Q = 2; - for (uint64_t i = 0; i < Q; i++) { - res = VkMovComplex(sc, sc->locID[0], regID[i]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, sc->locID[1], regID[i + Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, sc->locID[2], regID[i + 2 * Q]); - if (res != VKFFT_SUCCESS) return res; - - res = VkAddComplex(sc, regID[i + Q], sc->locID[1], sc->locID[2]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 2 * Q], sc->locID[1], sc->locID[2]); - if (res != VKFFT_SUCCESS) return res; - - res = VkAddComplex(sc, sc->locID[0], regID[i], regID[i + Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkFMAComplex(sc, sc->locID[1], regID[i + Q], tf[0], regID[i]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, sc->locID[2], regID[i + 2 * Q], tf[1]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[i], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - if (stageAngle < 0) - { - res = VkShuffleComplex(sc, regID[i + Q], sc->locID[1], sc->locID[2], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplexInv(sc, regID[i + 2 * Q], sc->locID[1], sc->locID[2], 0); - if (res != VKFFT_SUCCESS) return res; - } - else { - res = VkShuffleComplexInv(sc, regID[i + Q], sc->locID[1], sc->locID[2], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplex(sc, regID[i + 2 * Q], sc->locID[1], sc->locID[2], 0); - if (res != 
VKFFT_SUCCESS) return res; - } - } - - res = VkMovComplex(sc, temp, regID[1]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[1], regID[0], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[0], regID[0], temp); - if (res != VKFFT_SUCCESS) return res; - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = -0.5%s;\n", w, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.8660254037844386467637231707529%s;\n\n", w, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = -0.5%s;\n", w, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -0.8660254037844386467637231707529%s;\n\n", w, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - res = VkMulComplex(sc, temp, regID[3], w, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[3], regID[2], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[2], regID[2], temp); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - res = VkMulComplex(sc, temp, regID[5], w, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[5], regID[4], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[4], regID[4], temp); - if (res != VKFFT_SUCCESS) return res; - - uint64_t permute2[6] = { 0,3,4,1,2,5 }; - res = VkPermute(sc, permute2, 6, 1, regID, temp); - if (res != VKFFT_SUCCESS) return res; - - /*res = VkMovComplex(sc, temp, regID[1]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[1], regID[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[3], temp); - if (res 
!= VKFFT_SUCCESS) return res; - - res = VkMovComplex(sc, temp, regID[2]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[2], regID[4]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[4], temp); - if (res != VKFFT_SUCCESS) return res;*/ - - for (uint64_t i = 0; i < 2; i++) { - free(tf[i]); - tf[i] = 0; - } - break; - } - case 7: { - /*if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, "void radix5(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, inout %s temp_4, %s LUTId) {\n", vecType, vecType, vecType, vecType, vecType, uintType); - } - else { - sc->tempLen = sprintf(sc->tempStr, "void radix5(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, inout %s temp_4, %s angle) {\n", vecType, vecType, vecType, vecType, vecType, floatType); - }*/ - char* tf[8]; - - //VkAppendLine(sc, " {\n"); - for (uint64_t i = 0; i < 8; i++) { - tf[i] = (char*)malloc(sizeof(char) * 50); - if (!tf[i]) { - for (uint64_t j = 0; j < i; j++) { - free(tf[j]); - tf[j] = 0; - } - return VKFFT_ERROR_MALLOC_FAILED; - } - } - sprintf(tf[0], "-1.16666666666666651863693004997913%s", LFending); - sprintf(tf[1], "0.79015646852540022404554065360571%s", LFending); - sprintf(tf[2], "0.05585426728964774240049351305970%s", LFending); - sprintf(tf[3], "0.73430220123575240531721419756650%s", LFending); - if (stageAngle < 0) { - sprintf(tf[4], "0.44095855184409837868031445395900%s", LFending); - sprintf(tf[5], "0.34087293062393136944265847887436%s", LFending); - sprintf(tf[6], "-0.53396936033772524066165487965918%s", LFending); - sprintf(tf[7], "0.87484229096165666561546458979137%s", LFending); - } - else { - sprintf(tf[4], "-0.44095855184409837868031445395900%s", LFending); - sprintf(tf[5], "-0.34087293062393136944265847887436%s", LFending); - sprintf(tf[6], "0.53396936033772524066165487965918%s", LFending); - sprintf(tf[7], "-0.87484229096165666561546458979137%s", LFending); - } - /*for (uint64_t i = 0; i < 7; i++) { - 
sc->locID[i] = (char*)malloc(sizeof(char) * 50); - sprintf(sc->locID[i], "loc_%" PRIu64 "", i); - sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, sc->locID[i]); - res = VkAppendLine(sc); -if (res != VKFFT_SUCCESS) return res; - }*/ - for (uint64_t i = radix - 1; i > 0; i--) { - if (stageSize == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (i == radix - 1) { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - else { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" 
PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - res = VkMulComplex(sc, sc->locID[i], regID[i], w, 0); - if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ -loc_%" PRIu64 ".x = temp%s.x * w.x - temp%s.y * w.y;\n\ -loc_%" PRIu64 ".y = temp%s.y * w.x + temp%s.x * w.y;\n", i, regID[i], regID[i], i, regID[i], regID[i]);*/ - } - res = VkMovComplex(sc, sc->locID[0], regID[0]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[0], sc->locID[1], sc->locID[6]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[1], sc->locID[1], sc->locID[6]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[2], sc->locID[2], sc->locID[5]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[3], sc->locID[2], 
sc->locID[5]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[4], sc->locID[4], sc->locID[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[5], sc->locID[4], sc->locID[3]); - if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ -loc_0 = temp%s;\n\ -temp%s = loc_1 + loc_6;\n\ -temp%s = loc_1 - loc_6;\n\ -temp%s = loc_2 + loc_5;\n\ -temp%s = loc_2 - loc_5;\n\ -temp%s = loc_4 + loc_3;\n\ -temp%s = loc_4 - loc_3;\n", regID[0], regID[0], regID[1], regID[2], regID[3], regID[4], regID[5]);*/ - res = VkAddComplex(sc, sc->locID[5], regID[1], regID[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[5], sc->locID[5], regID[5]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[1], regID[0], regID[2]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[1], sc->locID[1], regID[4]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[0], sc->locID[0], sc->locID[1]); - if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ -loc_5 = temp%s + temp%s + temp%s;\n\ -loc_1 = temp%s + temp%s + temp%s;\n\ -loc_0 += loc_1;\n", regID[1], regID[3], regID[5], regID[0], regID[2], regID[4]);*/ - res = VkSubComplex(sc, sc->locID[2], regID[0], regID[4]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, sc->locID[3], regID[4], regID[2]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, sc->locID[4], regID[2], regID[0]); - if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ -loc_2 = temp%s - temp%s;\n\ -loc_3 = temp%s - temp%s;\n\ -loc_4 = temp%s - temp%s;\n", regID[0], regID[4], regID[4], regID[2], regID[2], regID[0]);*/ - res = VkSubComplex(sc, regID[0], regID[1], regID[5]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[2], regID[5], regID[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, 
regID[4], regID[3], regID[1]); - if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ -temp%s = temp%s - temp%s;\n\ -temp%s = temp%s - temp%s;\n\ -temp%s = temp%s - temp%s;\n", regID[0], regID[1], regID[5], regID[2], regID[5], regID[3], regID[4], regID[3], regID[1]);*/ - - res = VkMulComplexNumber(sc, sc->locID[1], sc->locID[1], tf[0]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, sc->locID[2], sc->locID[2], tf[1]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, sc->locID[3], sc->locID[3], tf[2]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, sc->locID[4], sc->locID[4], tf[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, sc->locID[5], sc->locID[5], tf[4]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, regID[0], regID[0], tf[5]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, regID[2], regID[2], tf[6]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, regID[4], regID[4], tf[7]); - if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ -loc_1 *= -1.16666666666666651863693004997913;\n\ -loc_2 *= 0.79015646852540022404554065360571;\n\ -loc_3 *= 0.05585426728964774240049351305970;\n\ -loc_4 *= 0.73430220123575240531721419756650;\n\ -loc_5 *= 0.44095855184409837868031445395900;\n\ -temp%s *= 0.34087293062393136944265847887436;\n\ -temp%s *= -0.53396936033772524066165487965918;\n\ -temp%s *= 0.87484229096165666561546458979137;\n", regID[0], regID[2], regID[4]);*/ - - res = VkSubComplex(sc, regID[5], regID[4], regID[2]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplexInv(sc, regID[6], regID[4], regID[0]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[4], regID[0], regID[2]); - if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ -temp%s = temp%s - temp%s;\n\ -temp%s = - temp%s - 
temp%s;\n\ -temp%s = temp%s + temp%s;\n", regID[5], regID[4], regID[2], regID[6], regID[4], regID[0], regID[4], regID[0], regID[2]);*/ - res = VkAddComplex(sc, regID[0], sc->locID[0], sc->locID[1]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[1], sc->locID[2], sc->locID[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[2], sc->locID[4], sc->locID[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplexInv(sc, regID[3], sc->locID[2], sc->locID[4]); - if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ -temp%s = loc_0 + loc_1;\n\ -temp%s = loc_2 + loc_3;\n\ -temp%s = loc_4 - loc_3;\n\ -temp%s = - loc_2 - loc_4;\n", regID[0], regID[1], regID[2], regID[3]);*/ - res = VkAddComplex(sc, sc->locID[1], regID[0], regID[1]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[2], regID[0], regID[2]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[3], regID[0], regID[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[4], regID[4], sc->locID[5]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[6], regID[6], sc->locID[5]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[5], sc->locID[5], regID[5]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[0], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ -loc_1 = temp%s + temp%s;\n\ -loc_2 = temp%s + temp%s;\n\ -loc_3 = temp%s + temp%s;\n\ -loc_4 = temp%s + loc_5;\n\ -loc_6 = temp%s + loc_5;\n\ -loc_5 += temp%s;\n\ -temp%s = loc_0;\n", regID[0], regID[1], regID[0], regID[2], regID[0], regID[3], regID[4], regID[6], regID[5], regID[0]);*/ - res = VkShuffleComplexInv(sc, regID[1], sc->locID[1], sc->locID[4], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplexInv(sc, regID[2], sc->locID[3], sc->locID[6], 0); - if (res != 
VKFFT_SUCCESS) return res; - res = VkShuffleComplex(sc, regID[3], sc->locID[2], sc->locID[5], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplexInv(sc, regID[4], sc->locID[2], sc->locID[5], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplex(sc, regID[5], sc->locID[3], sc->locID[6], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplex(sc, regID[6], sc->locID[1], sc->locID[4], 0); - if (res != VKFFT_SUCCESS) return res; - - /*sc->tempLen = sprintf(sc->tempStr, "\ -temp%s.x = loc_1.x + loc_4.y; \n\ -temp%s.y = loc_1.y - loc_4.x; \n\ -temp%s.x = loc_3.x + loc_6.y; \n\ -temp%s.y = loc_3.y - loc_6.x; \n\ -temp%s.x = loc_2.x - loc_5.y; \n\ -temp%s.y = loc_2.y + loc_5.x; \n\ -temp%s.x = loc_2.x + loc_5.y; \n\ -temp%s.y = loc_2.y - loc_5.x; \n\ -temp%s.x = loc_3.x - loc_6.y; \n\ -temp%s.y = loc_3.y + loc_6.x; \n\ -temp%s.x = loc_1.x - loc_4.y; \n\ -temp%s.y = loc_1.y + loc_4.x; \n", regID[1], regID[1], regID[2], regID[2], regID[3], regID[3], regID[4], regID[4], regID[5], regID[5], regID[6], regID[6]); - VkAppendLine(sc, " }\n");*/ - /*for (uint64_t i = 0; i < 7; i++) { - free(sc->locID[i]); - }*/ - for (uint64_t i = 0; i < 8; i++) { - free(tf[i]); - tf[i] = 0; - } - break; - } - case 8: { - /*if (sc->LUT) - sc->tempLen = sprintf(sc->tempStr, "void radix8(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, inout %s temp_4, inout %s temp_5, inout %s temp_6, inout %s temp_7, %s LUTId%s) {\n", vecType, vecType, vecType, vecType, vecType, vecType, vecType, vecType, uintType, convolutionInverse); - else - sc->tempLen = sprintf(sc->tempStr, "void radix8(inout %s temp_0, inout %s temp_1, inout %s temp_2, inout %s temp_3, inout %s temp_4, inout %s temp_5, inout %s temp_6, inout %s temp_7, %s angle%s) {\n", vecType, vecType, vecType, vecType, vecType, vecType, vecType, vecType, floatType, convolutionInverse); - */ - //VkAppendLine(sc, " {\n"); - /*sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, temp); - res 
= VkAppendLine(sc); -if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, iw); - res = VkAppendLine(sc); -if (res != VKFFT_SUCCESS) return res;*/ - if (stageSize == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle);\n", w, cosDef); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle);\n", w, sinDef); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle);\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle);\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - for (uint64_t i = 0; i < 4; i++) { - res = VkMulComplex(sc, temp, regID[i + 4], w, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 4], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - 
/*sc->tempLen = sprintf(sc->tempStr, "\ -temp.x=temp%s.x*w.x-temp%s.y*w.y;\n\ -temp.y = temp%s.y * w.x + temp%s.x * w.y;\n\ -temp%s = temp%s - temp;\n\ -temp%s = temp%s + temp;\n\n", regID[i + 4], regID[i + 4], regID[i + 4], regID[i + 4], regID[i + 4], regID[i + 0], regID[i + 0], regID[i + 0]);*/ - } - if (stageSize == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(0.5%s*angle);\n", w, cosDef, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(0.5%s*angle);\n", w, sinDef, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s=normalize(%s + %s(1.0, 0.0));\n", w, w, vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - for (uint64_t i = 0; i < 2; i++) { - res = VkMulComplex(sc, temp, regID[i + 2], w, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 2], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - 
if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ -temp.x=temp%s.x*w.x-temp%s.y*w.y;\n\ -temp.y = temp%s.y * w.x + temp%s.x * w.y;\n\ -temp%s = temp%s - temp;\n\ -temp%s = temp%s + temp;\n\n", regID[i + 2], regID[i + 2], regID[i + 2], regID[i + 2], regID[i + 2], regID[i + 0], regID[i + 0], regID[i + 0]);*/ - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType); - } - - for (uint64_t i = 4; i < 6; i++) { - res = VkMulComplex(sc, temp, regID[i + 2], iw, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 2], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ -temp.x = temp%s.x * iw.x - temp%s.y * iw.y;\n\ -temp.y = temp%s.y * iw.x + temp%s.x * iw.y;\n\ -temp%s = temp%s - temp;\n\ -temp%s = temp%s + temp;\n\n", regID[i + 2], regID[i + 2], regID[i + 2], regID[i + 2], regID[i + 2], regID[i + 0], regID[i + 0], regID[i + 0]);*/ - } - if (stageSize == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->LUT) { - if 
(sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, 2 * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, 2 * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(0.25%s*angle);\n", w, cosDef, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(0.25%s*angle);\n", w, sinDef, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(0.25*angle), sin(0.25*angle));\n\n", vecType); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s=normalize(%s + %s(1.0, 0.0));\n", w, w, vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - res = VkMulComplex(sc, temp, regID[1], w, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[1], regID[0], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[0], regID[0], temp); - if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ -temp.x=temp%s.x*w.x-temp%s.y*w.y;\n\ -temp.y = temp%s.y * w.x + temp%s.x * w.y;\n\ -temp%s = temp%s - temp;\n\ -temp%s = temp%s + temp;\n\n", regID[1], regID[1], regID[1], regID[1], regID[1], regID[0], regID[0], regID[0]);*/ - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", iw, w); - res = 
VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType); - } - res = VkMulComplex(sc, temp, regID[3], iw, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[3], regID[2], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[2], regID[2], temp); - if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ -temp.x = temp%s.x * iw.x - temp%s.y * iw.y;\n\ -temp.y = temp%s.y * iw.x + temp%s.x * iw.y;\n\ -temp%s = temp%s - temp;\n\ -temp%s = temp%s + temp;\n\n", regID[3], regID[3], regID[3], regID[3], regID[3], regID[2], regID[2], regID[2]);*/ - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * loc_SQRT1_2 + %s.y * loc_SQRT1_2;\n", iw, w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * loc_SQRT1_2 - %s.x * loc_SQRT1_2;\n\n", iw, w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * loc_SQRT1_2 - %s.y * loc_SQRT1_2;\n", iw, w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * loc_SQRT1_2 + %s.x * loc_SQRT1_2;\n\n", iw, w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = VkMulComplex(sc, temp, regID[5], iw, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[5], regID[4], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[4], regID[4], temp); - if 
(res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ -temp.x = temp%s.x * iw.x - temp%s.y * iw.y;\n\ -temp.y = temp%s.y * iw.x + temp%s.x * iw.y;\n\ -temp%s = temp%s - temp;\n\ -temp%s = temp%s + temp;\n\n", regID[5], regID[5], regID[5], regID[5], regID[5], regID[4], regID[4], regID[4]);*/ - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", w, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", w, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(iw.y, -iw.x);\n\n", vecType); - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", w, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", w, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(-iw.y, iw.x);\n\n", vecType); - } - res = VkMulComplex(sc, temp, regID[7], w, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[7], regID[6], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[6], regID[6], temp); - if (res != VKFFT_SUCCESS) return res; - - uint64_t permute2[8] = { 0,4,2,6,1,5,3,7 }; - res = VkPermute(sc, permute2, 8, 1, regID, temp); - if (res != VKFFT_SUCCESS) return res; - /* - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, temp, regID[1]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[1], regID[4]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[4], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, temp, regID[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[3], regID[6]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[6], temp); - if (res != VKFFT_SUCCESS) return res;*/ - 
/*sc->tempLen = sprintf(sc->tempStr, "\ -temp.x = temp%s.x * w.x - temp%s.y * w.y;\n\ -temp.y = temp%s.y * w.x + temp%s.x * w.y;\n\ -temp%s = temp%s - temp;\n\ -temp%s = temp%s + temp;\n\n\ -temp = temp%s;\n\ -temp%s = temp%s;\n\ -temp%s = temp;\n\n\ -temp = temp%s;\n\ -temp%s = temp%s;\n\ -temp%s = temp;\n\ -}\n\n", regID[7], regID[7], regID[7], regID[7], regID[7], regID[6], regID[6], regID[6], regID[1], regID[1], regID[4], regID[4], regID[3], regID[3], regID[6], regID[6]); - //VkAppendLine(sc, " }\n");*/ - - break; - } - case 9: { - char* tf[2]; - //VkAppendLine(sc, " {\n"); - for (uint64_t i = 0; i < 2; i++) { - tf[i] = (char*)malloc(sizeof(char) * 50); - if (!tf[i]) { - for (uint64_t j = 0; j < i; j++) { - free(tf[j]); - tf[j] = 0; - } - return VKFFT_ERROR_MALLOC_FAILED; - } - } - - sprintf(tf[0], "-0.5%s", LFending); - sprintf(tf[1], "-0.8660254037844386467637231707529%s", LFending); - for (uint64_t i = radix - 1; i > 0; i--) { - if (stageSize == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (i == radix - 1) { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - 
sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - else { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - res = VkMulComplex(sc, regID[i], regID[i], w, temp); - if (res != VKFFT_SUCCESS) return res; - } 
- //important - //res = VkMovComplex(sc, regID[1], sc->locID[1]); - //if (res != VKFFT_SUCCESS) return res; - //res = VkMovComplex(sc, regID[2], sc->locID[2]); - //if (res != VKFFT_SUCCESS) return res; - uint64_t P = 3; - uint64_t Q = 3; - for (uint64_t i = 0; i < Q; i++) { - res = VkMovComplex(sc, sc->locID[0], regID[i]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, sc->locID[1], regID[i + Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, sc->locID[2], regID[i + 2 * Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i + Q], sc->locID[1], sc->locID[2]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 2 * Q], sc->locID[1], sc->locID[2]); - if (res != VKFFT_SUCCESS) return res; - - res = VkAddComplex(sc, sc->locID[0], regID[i], regID[i + Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkFMAComplex(sc, sc->locID[1], regID[i + Q], tf[0], regID[i]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, sc->locID[2], regID[i + 2 * Q], tf[1]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[i], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - if (stageAngle < 0) - { - res = VkShuffleComplex(sc, regID[i + Q], sc->locID[1], sc->locID[2], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplexInv(sc, regID[i + 2 * Q], sc->locID[1], sc->locID[2], 0); - if (res != VKFFT_SUCCESS) return res; - } - else { - res = VkShuffleComplexInv(sc, regID[i + Q], sc->locID[1], sc->locID[2], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplex(sc, regID[i + 2 * Q], sc->locID[1], sc->locID[2], 0); - if (res != VKFFT_SUCCESS) return res; - } - } - - - for (uint64_t i = 0; i < P; i++) { - if (i > 0) { - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %.17e%s;\n", w, cos(2 * i * double_PI / radix), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = 
sprintf(sc->tempStr, " %s.y = %.17e%s;\n\n", w, -sin(2 * i * double_PI / radix), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %.17e%s;\n", w, cos(2 * i * double_PI / radix), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %.17e%s;\n\n", w, sin(2 * i * double_PI / radix), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = VkMulComplex(sc, sc->locID[1], regID[Q * i + 1], w, temp); - if (res != VKFFT_SUCCESS) return res; - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %.17e%s;\n", w, cos(4 * i * double_PI / radix), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %.17e%s;\n\n", w, -sin(4 * i * double_PI / radix), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %.17e%s;\n", w, cos(4 * i * double_PI / radix), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %.17e%s;\n\n", w, sin(4 * i * double_PI / radix), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = VkMulComplex(sc, sc->locID[2], regID[Q * i + 2], w, temp); - if (res != VKFFT_SUCCESS) return res; - } - else { - res = VkMovComplex(sc, sc->locID[1], regID[1]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, sc->locID[2], regID[2]); - if (res != VKFFT_SUCCESS) return res; - } - - res = VkAddComplex(sc, regID[Q * i + 1], sc->locID[1], sc->locID[2]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[Q * i + 2], sc->locID[1], sc->locID[2]); - if (res != VKFFT_SUCCESS) return res; - - res = VkAddComplex(sc, sc->locID[0], regID[Q * i], regID[Q * i + 1]); - if (res != VKFFT_SUCCESS) return res; 
- res = VkFMAComplex(sc, sc->locID[1], regID[Q * i + 1], tf[0], regID[Q * i]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, sc->locID[2], regID[Q * i + 2], tf[1]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[Q * i], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - if (stageAngle < 0) - { - res = VkShuffleComplex(sc, regID[Q * i + 1], sc->locID[1], sc->locID[2], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplexInv(sc, regID[Q * i + 2], sc->locID[1], sc->locID[2], 0); - if (res != VKFFT_SUCCESS) return res; - } - else { - res = VkShuffleComplexInv(sc, regID[Q * i + 1], sc->locID[1], sc->locID[2], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplex(sc, regID[Q * i + 2], sc->locID[1], sc->locID[2], 0); - if (res != VKFFT_SUCCESS) return res; - } - } - - uint64_t permute2[9] = { 0,3,6,1,4,7,2,5,8 }; - res = VkPermute(sc, permute2, 9, 1, regID, temp); - if (res != VKFFT_SUCCESS) return res; - - /*res = VkMovComplex(sc, temp, regID[1]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[1], regID[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[3], temp); - if (res != VKFFT_SUCCESS) return res; - - res = VkMovComplex(sc, temp, regID[2]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[2], regID[4]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[4], temp); - if (res != VKFFT_SUCCESS) return res;*/ - - for (uint64_t i = 0; i < 2; i++) { - free(tf[i]); - tf[i] = 0; - } - break; - } - case 10: { - char* tf[5]; - //VkAppendLine(sc, " {\n"); - for (uint64_t i = 0; i < 5; i++) { - tf[i] = (char*)malloc(sizeof(char) * 50); - if (!tf[i]) { - for (uint64_t j = 0; j < i; j++) { - free(tf[j]); - tf[j] = 0; - } - return VKFFT_ERROR_MALLOC_FAILED; - } - } - sprintf(tf[0], "-0.5%s", LFending); - sprintf(tf[1], "1.538841768587626701285145288018455%s", LFending); - sprintf(tf[2], 
"-0.363271264002680442947733378740309%s", LFending); - sprintf(tf[3], "-0.809016994374947424102293417182819%s", LFending); - sprintf(tf[4], "-0.587785252292473129168705954639073%s", LFending); - for (uint64_t i = radix - 1; i > 0; i--) { - if (stageSize == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (i == radix - 1) { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - else { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * 
stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - res = VkMulComplex(sc, regID[i], regID[i], w, temp); - if (res != VKFFT_SUCCESS) return res; - } - //important - //res = VkMovComplex(sc, regID[1], sc->locID[1]); - //if (res != VKFFT_SUCCESS) return res; - - uint64_t P = 5; - uint64_t Q = 2; - for (uint64_t i = 0; i < Q; i++) { - res = VkMovComplex(sc, sc->locID[0], regID[i]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, sc->locID[1], regID[i + Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, sc->locID[2], regID[i + 2 * Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, sc->locID[3], regID[i + 3 * Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, sc->locID[4], regID[i + 4 * Q]); - if (res != VKFFT_SUCCESS) return res; - - res = VkAddComplex(sc, regID[i + Q], 
sc->locID[1], sc->locID[4]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i + 2 * Q], sc->locID[2], sc->locID[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 3 * Q], sc->locID[2], sc->locID[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 4 * Q], sc->locID[1], sc->locID[4]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, sc->locID[3], regID[i + Q], regID[i + 2 * Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[4], regID[i + 3 * Q], regID[i + 4 * Q]); - if (res != VKFFT_SUCCESS) return res; - - res = VkAddComplex(sc, sc->locID[0], regID[i], regID[i + Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[0], sc->locID[0], regID[i + 2 * Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkFMAComplex(sc, sc->locID[1], regID[i + Q], tf[0], regID[i]); - if (res != VKFFT_SUCCESS) return res; - res = VkFMAComplex(sc, sc->locID[2], regID[i + 2 * Q], tf[0], regID[i]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, regID[i + 3 * Q], regID[i + 3 * Q], tf[1]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, regID[i + 4 * Q], regID[i + 4 * Q], tf[2]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, sc->locID[3], sc->locID[3], tf[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, sc->locID[4], sc->locID[4], tf[4]); - if (res != VKFFT_SUCCESS) return res; - - res = VkSubComplex(sc, sc->locID[1], sc->locID[1], sc->locID[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[2], sc->locID[2], sc->locID[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[3], regID[i + 3 * Q], sc->locID[4]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[4], sc->locID[4], regID[i + 4 * Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[i], 
sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - - if (stageAngle < 0) - { - res = VkShuffleComplex(sc, regID[i + Q], sc->locID[1], sc->locID[4], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplex(sc, regID[i + 2 * Q], sc->locID[2], sc->locID[3], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplexInv(sc, regID[i + 3 * Q], sc->locID[2], sc->locID[3], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplexInv(sc, regID[i + 4 * Q], sc->locID[1], sc->locID[4], 0); - if (res != VKFFT_SUCCESS) return res; - } - else { - res = VkShuffleComplexInv(sc, regID[i + Q], sc->locID[1], sc->locID[4], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplexInv(sc, regID[i + 2 * Q], sc->locID[2], sc->locID[3], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplex(sc, regID[i + 3 * Q], sc->locID[2], sc->locID[3], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplex(sc, regID[i + 4 * Q], sc->locID[1], sc->locID[4], 0); - if (res != VKFFT_SUCCESS) return res; - } - - } - - - for (uint64_t i = 0; i < P; i++) { - if (i > 0) { - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %.17e%s;\n", w, cos(2 * i * double_PI / radix), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %.17e%s;\n\n", w, -sin(2 * i * double_PI / radix), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %.17e%s;\n", w, cos(2 * i * double_PI / radix), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %.17e%s;\n\n", w, sin(2 * i * double_PI / radix), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = VkMulComplex(sc, temp, regID[Q * i + 1], w, 0); - } - else { - res = VkMovComplex(sc, temp, regID[Q * i + 1]); - if (res != VKFFT_SUCCESS) 
return res; - } - res = VkSubComplex(sc, regID[Q * i + 1], regID[Q * i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[Q * i], regID[Q * i], temp); - if (res != VKFFT_SUCCESS) return res; - } - - uint64_t permute2[10] = { 0, 2, 4, 6, 8, 1, 3, 5, 7, 9 }; - res = VkPermute(sc, permute2, 10, 1, regID, temp); - if (res != VKFFT_SUCCESS) return res; - - for (uint64_t i = 0; i < 5; i++) { - free(tf[i]); - tf[i] = 0; - } - break; - } - case 11: { - - char* tf[20]; - //char* tf2[4]; - //char* tf2inv[4]; - //VkAppendLine(sc, " {\n"); - for (uint64_t i = 0; i < 20; i++) { - tf[i] = (char*)malloc(sizeof(char) * 50); - if (!tf[i]) { - for (uint64_t j = 0; j < i; j++) { - free(tf[j]); - tf[j] = 0; - } - return VKFFT_ERROR_MALLOC_FAILED; - } - //tf2[i] = (char*)malloc(sizeof(char) * 50); - //tf2inv[i] = (char*)malloc(sizeof(char) * 50); - } - sprintf(tf[0], "-1.10000000000000000e+00%s", LFending); - - sprintf(tf[2], "2.53097611605958783e-01%s", LFending); - sprintf(tf[3], "-1.28820061077367898e+00%s", LFending); - sprintf(tf[4], "3.04632239669212490e-01%s", LFending); - sprintf(tf[5], "-3.91339615511917427e-01%s", LFending); - sprintf(tf[6], "-2.87102225339285022e+00%s", LFending); - sprintf(tf[7], "1.37490798661638380e+00%s", LFending); - sprintf(tf[8], "8.17178135341212419e-01%s", LFending); - sprintf(tf[9], "1.80074650644567891e+00%s", LFending); - sprintf(tf[10], "-8.59492973614497502e-01%s", LFending); - - if (stageAngle < 0) { - sprintf(tf[1], "3.31662479035539914e-01%s", LFending); - sprintf(tf[11], "-2.37347045474827967e+00%s", LFending); - sprintf(tf[12], "-2.48363930874935801e-02%s", LFending); - sprintf(tf[13], "4.74017017512828764e-01%s", LFending); - sprintf(tf[14], "7.42183927770612595e-01%s", LFending); - sprintf(tf[15], "1.40647330909460866e+00%s", LFending); - sprintf(tf[16], "-1.19136455219594772e+00%s", LFending); - sprintf(tf[17], "7.08088885039503180e-01%s", LFending); - sprintf(tf[18], "2.58908260614167995e-01%s", LFending); 
- sprintf(tf[19], "-4.99299221941104307e-02%s", LFending); - } - else { - sprintf(tf[1], "-3.31662479035539914e-01%s", LFending); - sprintf(tf[11], "2.37347045474827967e+00%s", LFending); - sprintf(tf[12], "2.48363930874935801e-02%s", LFending); - sprintf(tf[13], "-4.74017017512828764e-01%s", LFending); - sprintf(tf[14], "-7.42183927770612595e-01%s", LFending); - sprintf(tf[15], "-1.40647330909460866e+00%s", LFending); - sprintf(tf[16], "1.19136455219594772e+00%s", LFending); - sprintf(tf[17], "-7.08088885039503180e-01%s", LFending); - sprintf(tf[18], "-2.58908260614167995e-01%s", LFending); - sprintf(tf[19], "4.99299221941104307e-02%s", LFending); - } - for (uint64_t i = radix - 1; i > 0; i--) { - if (stageSize == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (i == radix - 1) { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), 
sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - else { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - res = VkMulComplex(sc, sc->locID[i], regID[i], w, 0); - if (res != VKFFT_SUCCESS) return res; - } - res = VkMovComplex(sc, sc->locID[0], regID[0]); - if (res != VKFFT_SUCCESS) return res; - uint64_t permute[11] = { 0,1,9,4,3,5,10,2,7,8,6 }; - res = VkPermute(sc, permute, 11, 0, 0, w); - if (res != VKFFT_SUCCESS) return res; - for 
(uint64_t i = 0; i < 5; i++) { - res = VkAddComplex(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 6]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 6], sc->locID[i + 1], sc->locID[i + 6]); - if (res != VKFFT_SUCCESS) return res; - } - res = VkMovComplex(sc, sc->locID[1], regID[1]); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 0; i < 4; i++) { - res = VkAddComplex(sc, sc->locID[1], sc->locID[1], regID[i + 2]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, sc->locID[i + 3], regID[i + 1], regID[5]); - if (res != VKFFT_SUCCESS) return res; - } - res = VkMovComplex(sc, sc->locID[2], regID[6]); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 0; i < 4; i++) { - res = VkAddComplex(sc, sc->locID[2], sc->locID[2], regID[i + 7]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, sc->locID[i + 7], regID[i + 6], regID[10]); - if (res != VKFFT_SUCCESS) return res; - } - - res = VkAddComplex(sc, regID[0], sc->locID[0], sc->locID[1]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, regID[1], sc->locID[1], tf[0]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumberImag(sc, regID[2], sc->locID[2], tf[1], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t k = 0; k < 2; k++) { - res = VkAddComplex(sc, regID[k * 4 + 3], sc->locID[k * 4 + 3], sc->locID[k * 4 + 5]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[k * 4 + 4], sc->locID[k * 4 + 4], sc->locID[k * 4 + 6]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[k * 4 + 5], sc->locID[k * 4 + 3], sc->locID[k * 4 + 4]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[k * 4 + 6], sc->locID[k * 4 + 5], sc->locID[k * 4 + 6]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[1], regID[k * 4 + 3], regID[k * 4 + 4]); - if (res != VKFFT_SUCCESS) return res; - - if (k == 0) { - res = 
VkMulComplexNumber(sc, sc->locID[k * 4 + 3], sc->locID[k * 4 + 3], tf[k * 9 + 2]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, sc->locID[k * 4 + 4], sc->locID[k * 4 + 4], tf[k * 9 + 3]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, regID[k * 4 + 5], regID[k * 4 + 5], tf[k * 9 + 4]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, sc->locID[k * 4 + 5], sc->locID[k * 4 + 5], tf[k * 9 + 5]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, sc->locID[k * 4 + 6], sc->locID[k * 4 + 6], tf[k * 9 + 6]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, regID[k * 4 + 6], regID[k * 4 + 6], tf[k * 9 + 7]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, regID[k * 4 + 3], regID[k * 4 + 3], tf[k * 9 + 8]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, regID[k * 4 + 4], regID[k * 4 + 4], tf[k * 9 + 9]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, sc->locID[1], sc->locID[1], tf[k * 9 + 10]); - if (res != VKFFT_SUCCESS) return res; - } - else { - res = VkMulComplexNumberImag(sc, sc->locID[k * 4 + 3], sc->locID[k * 4 + 3], tf[k * 9 + 2], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumberImag(sc, sc->locID[k * 4 + 4], sc->locID[k * 4 + 4], tf[k * 9 + 3], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumberImag(sc, regID[k * 4 + 5], regID[k * 4 + 5], tf[k * 9 + 4], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumberImag(sc, sc->locID[k * 4 + 5], sc->locID[k * 4 + 5], tf[k * 9 + 5], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumberImag(sc, sc->locID[k * 4 + 6], sc->locID[k * 4 + 6], tf[k * 9 + 6], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumberImag(sc, regID[k * 4 + 6], regID[k * 4 + 6], tf[k * 9 + 7], sc->locID[0]); - if (res != 
VKFFT_SUCCESS) return res; - res = VkMulComplexNumberImag(sc, regID[k * 4 + 3], regID[k * 4 + 3], tf[k * 9 + 8], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumberImag(sc, regID[k * 4 + 4], regID[k * 4 + 4], tf[k * 9 + 9], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumberImag(sc, sc->locID[1], sc->locID[1], tf[k * 9 + 10], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - } - - res = VkAddComplex(sc, sc->locID[k * 4 + 3], sc->locID[k * 4 + 3], regID[k * 4 + 3]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[k * 4 + 5], sc->locID[k * 4 + 5], regID[k * 4 + 3]); - if (res != VKFFT_SUCCESS) return res; - - res = VkAddComplex(sc, sc->locID[k * 4 + 4], sc->locID[k * 4 + 4], regID[k * 4 + 4]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[k * 4 + 6], sc->locID[k * 4 + 6], regID[k * 4 + 4]); - if (res != VKFFT_SUCCESS) return res; - - res = VkAddComplex(sc, regID[k * 4 + 5], regID[k * 4 + 5], sc->locID[1]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[k * 4 + 6], regID[k * 4 + 6], sc->locID[1]); - if (res != VKFFT_SUCCESS) return res; - - res = VkAddComplex(sc, regID[k * 4 + 3], sc->locID[k * 4 + 3], regID[k * 4 + 5]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[k * 4 + 4], sc->locID[k * 4 + 4], regID[k * 4 + 5]); - if (res != VKFFT_SUCCESS) return res; - - res = VkAddComplex(sc, regID[k * 4 + 5], sc->locID[k * 4 + 5], regID[k * 4 + 6]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[k * 4 + 6], sc->locID[k * 4 + 6], regID[k * 4 + 6]); - if (res != VKFFT_SUCCESS) return res; - - } - res = VkAddComplex(sc, regID[1], regID[0], regID[1]); - if (res != VKFFT_SUCCESS) return res; - - res = VkMovComplex(sc, sc->locID[5], regID[1]); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 0; i < 4; i++) { - res = VkAddComplex(sc, sc->locID[i + 1], regID[1], regID[i + 3]); - 
if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, sc->locID[5], sc->locID[5], regID[i + 3]); - if (res != VKFFT_SUCCESS) return res; - } - res = VkMovComplex(sc, sc->locID[10], regID[2]); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 0; i < 4; i++) { - res = VkAddComplex(sc, sc->locID[i + 6], regID[2], regID[i + 7]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, sc->locID[10], sc->locID[10], regID[i + 7]); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t i = 0; i < 5; i++) { - res = VkAddComplex(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 6]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 6], sc->locID[i + 1], sc->locID[i + 6]); - if (res != VKFFT_SUCCESS) return res; - } - uint64_t permute2[11] = { 0,10,1,8,7,9,4,2,3,6,5 }; - res = VkPermute(sc, permute2, 11, 1, regID, temp); - if (res != VKFFT_SUCCESS) return res; - - for (uint64_t i = 0; i < 20; i++) { - free(tf[i]); - tf[i] = 0; - } - break; - } - case 12: { - char* tf[2]; - //VkAppendLine(sc, " {\n"); - for (uint64_t i = 0; i < 2; i++) { - tf[i] = (char*)malloc(sizeof(char) * 50); - if (!tf[i]) { - for (uint64_t j = 0; j < i; j++) { - free(tf[j]); - tf[j] = 0; - } - return VKFFT_ERROR_MALLOC_FAILED; - } - } - sprintf(tf[0], "-0.5%s", LFending); - sprintf(tf[1], "-0.8660254037844386467637231707529%s", LFending); - for (uint64_t i = radix - 1; i > 0; i--) { - if (stageSize == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (i == radix - 1) { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = 
twiddleLUT[LUTId];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - else { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != 
VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - res = VkMulComplex(sc, regID[i], regID[i], w, temp); - if (res != VKFFT_SUCCESS) return res; - } - //important - //res = VkMovComplex(sc, regID[1], sc->locID[1]); - //if (res != VKFFT_SUCCESS) return res; - //res = VkMovComplex(sc, regID[2], sc->locID[2]); - //if (res != VKFFT_SUCCESS) return res; - uint64_t P = 3; - uint64_t Q = 4; - for (uint64_t i = 0; i < Q; i++) { - res = VkMovComplex(sc, sc->locID[0], regID[i]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, sc->locID[1], regID[i + Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, sc->locID[2], regID[i + 2 * Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i + Q], sc->locID[1], sc->locID[2]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 2 * Q], sc->locID[1], sc->locID[2]); - if (res != VKFFT_SUCCESS) return res; - - res = VkAddComplex(sc, sc->locID[0], regID[i], regID[i + Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkFMAComplex(sc, sc->locID[1], regID[i + Q], tf[0], regID[i]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, sc->locID[2], regID[i + 2 * Q], tf[1]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[i], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - if (stageAngle < 0) - { - res = VkShuffleComplex(sc, regID[i + Q], sc->locID[1], sc->locID[2], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplexInv(sc, regID[i + 2 * Q], sc->locID[1], sc->locID[2], 0); - if (res != VKFFT_SUCCESS) return res; - } - else { - res = VkShuffleComplexInv(sc, 
regID[i + Q], sc->locID[1], sc->locID[2], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplex(sc, regID[i + 2 * Q], sc->locID[1], sc->locID[2], 0); - if (res != VKFFT_SUCCESS) return res; - } - } - - - for (uint64_t i = 0; i < P; i++) { - for (uint64_t j = 0; j < Q; j++) { - if (i > 0) { - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %.17e%s;\n", w, cos(2 * i * j * double_PI / radix), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %.17e%s;\n\n", w, -sin(2 * i * j * double_PI / radix), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %.17e%s;\n", w, cos(2 * i * j * double_PI / radix), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %.17e%s;\n\n", w, sin(2 * i * j * double_PI / radix), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = VkMulComplex(sc, regID[Q * i + j], regID[Q * i + j], w, temp); - if (res != VKFFT_SUCCESS) return res; - } - } - res = VkMovComplex(sc, temp, regID[Q * i + 2]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[Q * i + 2], regID[Q * i], regID[Q * i + 2]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[Q * i], regID[Q * i], temp); - if (res != VKFFT_SUCCESS) return res; - - res = VkMovComplex(sc, temp, regID[Q * i + 3]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[Q * i + 3], regID[Q * i + 1], regID[Q * i + 3]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[Q * i + 1], regID[Q * i + 1], temp); - if (res != VKFFT_SUCCESS) return res; - - res = VkMovComplex(sc, temp, regID[Q * i + 1]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[Q * i + 1], regID[Q * i], regID[Q * i + 1]); - if (res != VKFFT_SUCCESS) return 
res; - res = VkAddComplex(sc, regID[Q * i], regID[Q * i], temp); - if (res != VKFFT_SUCCESS) return res; - - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", temp, regID[Q * i + 3]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", temp, regID[Q * i + 3]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", temp, regID[Q * i + 3]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", temp, regID[Q * i + 3]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = VkSubComplex(sc, regID[Q * i + 3], regID[Q * i + 2], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[Q * i + 2], regID[Q * i + 2], temp); - if (res != VKFFT_SUCCESS) return res; - } - - uint64_t permute2[12] = { 0,4,8,2,6,10,1,5,9,3,7,11 }; - res = VkPermute(sc, permute2, 12, 1, regID, temp); - if (res != VKFFT_SUCCESS) return res; - - for (uint64_t i = 0; i < 2; i++) { - free(tf[i]); - tf[i] = 0; - } - break; - } - case 13: { - - char* tf[20]; - //char* tf2[4]; - //char* tf2inv[4]; - //VkAppendLine(sc, " {\n"); - for (uint64_t i = 0; i < 20; i++) { - tf[i] = (char*)malloc(sizeof(char) * 50); - if (!tf[i]) { - for (uint64_t j = 0; j < i; j++) { - free(tf[j]); - tf[j] = 0; - } - return VKFFT_ERROR_MALLOC_FAILED; - } - //tf2[i] = (char*)malloc(sizeof(char) * 50); - //tf2inv[i] = (char*)malloc(sizeof(char) * 50); - } - sprintf(tf[0], "-1.08333333333333333e+00%s", LFending); - sprintf(tf[1], "-3.00462606288665890e-01%s", LFending); - sprintf(tf[5], "1.00707406572753300e+00%s", LFending); - sprintf(tf[6], "7.31245990975348148e-01%s", LFending); - sprintf(tf[7], "-5.79440018900960419e-01%s", LFending); - sprintf(tf[8], "5.31932498429674383e-01%s", LFending); - sprintf(tf[9], "-5.08814921720397551e-01%s", 
LFending); - sprintf(tf[10], "-7.70585890309231480e-03%s", LFending); - - if (stageAngle < 0) { - sprintf(tf[2], "-7.49279330626139051e-01%s", LFending); - sprintf(tf[3], "4.01002128321867324e-01%s", LFending); - sprintf(tf[4], "1.74138601152135891e-01%s", LFending); - sprintf(tf[11], "-2.51139331838956803e+00%s", LFending); - sprintf(tf[12], "-1.82354640868242068e+00%s", LFending); - sprintf(tf[13], "1.44497990902399609e+00%s", LFending); - sprintf(tf[14], "-1.34405691517736958e+00%s", LFending); - sprintf(tf[15], "-9.75932420775945109e-01%s", LFending); - sprintf(tf[16], "7.73329778651104860e-01%s", LFending); - sprintf(tf[17], "1.92772511678346858e+00%s", LFending); - sprintf(tf[18], "1.39973941472918284e+00%s", LFending); - sprintf(tf[19], "-1.10915484383755047e+00%s", LFending); - } - else { - sprintf(tf[2], "7.49279330626139051e-01%s", LFending); - sprintf(tf[3], "-4.01002128321867324e-01%s", LFending); - sprintf(tf[4], "-1.74138601152135891e-01%s", LFending); - sprintf(tf[11], "2.51139331838956803e+00%s", LFending); - sprintf(tf[12], "1.82354640868242068e+00%s", LFending); - sprintf(tf[13], "-1.44497990902399609e+00%s", LFending); - sprintf(tf[14], "1.34405691517736958e+00%s", LFending); - sprintf(tf[15], "9.75932420775945109e-01%s", LFending); - sprintf(tf[16], "-7.73329778651104860e-01%s", LFending); - sprintf(tf[17], "-1.92772511678346858e+00%s", LFending); - sprintf(tf[18], "-1.39973941472918284e+00%s", LFending); - sprintf(tf[19], "1.10915484383755047e+00%s", LFending); - } - for (uint64_t i = radix - 1; i > 0; i--) { - if (stageSize == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (i == radix - 1) { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); - 
res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - else { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = 
sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - res = VkMulComplex(sc, sc->locID[i], regID[i], w, 0); - if (res != VKFFT_SUCCESS) return res; - - } - res = VkMovComplex(sc, sc->locID[0], regID[0]); - if (res != VKFFT_SUCCESS) return res; - uint64_t permute[13] = { 0,1,3,9,5,2,6,12,10,4,8,11,7 }; - res = VkPermute(sc, permute, 13, 0, 0, w); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 0; i < 6; i++) { - res = VkSubComplex(sc, regID[i + 7], sc->locID[i + 1], sc->locID[i + 7]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[i + 1], sc->locID[i + 1], sc->locID[i + 7]); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t i = 0; i < 3; i++) { - res = VkAddComplex(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 4]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 4], sc->locID[i + 1], sc->locID[i + 4]); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t i = 0; i < 4; i++) { - res = VkAddComplex(sc, sc->locID[i + 1], regID[i * 3 + 1], regID[i * 3 + 2]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, sc->locID[i * 2 + 5], regID[i * 3 + 1], regID[i * 3 + 3]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[i + 1], sc->locID[i + 1], regID[i * 3 + 3]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, sc->locID[i * 2 + 6], regID[i * 3 + 2], regID[i * 3 + 3]); - if (res != VKFFT_SUCCESS) return res; - } - - res = VkAddComplex(sc, regID[0], sc->locID[0], 
sc->locID[1]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, regID[1], sc->locID[1], tf[0]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, regID[2], sc->locID[2], tf[1]); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t k = 0; k < 3; k++) { - res = VkAddComplex(sc, regID[k * 2 + 4], sc->locID[k * 2 + 3], sc->locID[k * 2 + 4]); - - if (k == 0) { - res = VkMulComplexNumberImag(sc, sc->locID[k * 2 + 3], sc->locID[k * 2 + 3], tf[k * 3 + 2], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumberImag(sc, sc->locID[k * 2 + 4], sc->locID[k * 2 + 4], tf[k * 3 + 3], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumberImag(sc, regID[k * 2 + 4], regID[k * 2 + 4], tf[k * 3 + 4], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - } - else { - res = VkMulComplexNumber(sc, sc->locID[k * 2 + 3], sc->locID[k * 2 + 3], tf[k * 3 + 2]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, sc->locID[k * 2 + 4], sc->locID[k * 2 + 4], tf[k * 3 + 3]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, regID[k * 2 + 4], regID[k * 2 + 4], tf[k * 3 + 4]); - if (res != VKFFT_SUCCESS) return res; - } - - res = VkAddComplex(sc, regID[k * 2 + 3], sc->locID[k * 2 + 3], regID[k * 2 + 4]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[k * 2 + 4], sc->locID[k * 2 + 4], regID[k * 2 + 4]); - if (res != VKFFT_SUCCESS) return res; - - } - res = VkAddComplex(sc, regID[9], sc->locID[9], sc->locID[11]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[10], sc->locID[10], sc->locID[12]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[11], sc->locID[9], sc->locID[10]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[12], sc->locID[11], sc->locID[12]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[1], regID[9], regID[10]); - if (res != 
VKFFT_SUCCESS) return res; - - res = VkMulComplexNumberImag(sc, sc->locID[9], sc->locID[9], tf[11], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumberImag(sc, sc->locID[10], sc->locID[10], tf[12], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumberImag(sc, regID[11], regID[11], tf[13], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumberImag(sc, sc->locID[11], sc->locID[11], tf[14], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumberImag(sc, sc->locID[12], sc->locID[12], tf[15], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumberImag(sc, regID[12], regID[12], tf[16], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumberImag(sc, regID[9], regID[9], tf[17], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumberImag(sc, regID[10], regID[10], tf[18], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumberImag(sc, sc->locID[1], sc->locID[1], tf[19], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - - res = VkAddComplex(sc, sc->locID[9], sc->locID[9], regID[9]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[11], sc->locID[11], regID[9]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[10], sc->locID[10], regID[10]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[12], sc->locID[12], regID[10]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[11], regID[11], sc->locID[1]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[12], regID[12], sc->locID[1]); - if (res != VKFFT_SUCCESS) return res; - - res = VkAddComplex(sc, regID[9], sc->locID[9], regID[11]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[10], sc->locID[10], regID[11]); - if (res != VKFFT_SUCCESS) return res; - res = 
VkAddComplex(sc, regID[11], sc->locID[11], regID[12]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[12], sc->locID[12], regID[12]); - if (res != VKFFT_SUCCESS) return res; - - res = VkAddComplex(sc, regID[1], regID[0], regID[1]); - if (res != VKFFT_SUCCESS) return res; - - for (uint64_t i = 0; i < 4; i++) { - res = VkAddComplex(sc, sc->locID[i * 3 + 1], regID[i + 1], regID[i * 2 + 5]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, sc->locID[i * 3 + 3], regID[i + 1], regID[i * 2 + 5]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[i * 3 + 2], regID[i + 1], regID[i * 2 + 6]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, sc->locID[i * 3 + 3], sc->locID[i * 3 + 3], regID[i * 2 + 6]); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t i = 0; i < 3; i++) { - res = VkAddComplex(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 4]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, sc->locID[i + 4], sc->locID[i + 1], sc->locID[i + 4]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, sc->locID[i + 1], regID[i + 1]); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t i = 0; i < 6; i++) { - res = VkAddComplex(sc, regID[i + 1], sc->locID[i + 1], sc->locID[i + 7]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 7], sc->locID[i + 1], sc->locID[i + 7]); - if (res != VKFFT_SUCCESS) return res; - } - uint64_t permute2[13] = { 0,12,1,10,5,3,2,8,9,11,4,7,6 }; - res = VkPermute(sc, permute2, 13, 1, regID, temp); - if (res != VKFFT_SUCCESS) return res; - - for (uint64_t i = 0; i < 20; i++) { - free(tf[i]); - tf[i] = 0; - } - break; - } - case 14: { - char* tf[8]; - - //VkAppendLine(sc, " {\n"); - for (uint64_t i = 0; i < 8; i++) { - tf[i] = (char*)malloc(sizeof(char) * 50); - if (!tf[i]) { - for (uint64_t j = 0; j < i; j++) { - free(tf[j]); - tf[j] = 0; - } - return VKFFT_ERROR_MALLOC_FAILED; - } - } - sprintf(tf[0], 
"-1.16666666666666651863693004997913%s", LFending); - sprintf(tf[1], "0.79015646852540022404554065360571%s", LFending); - sprintf(tf[2], "0.05585426728964774240049351305970%s", LFending); - sprintf(tf[3], "0.73430220123575240531721419756650%s", LFending); - if (stageAngle < 0) { - sprintf(tf[4], "0.44095855184409837868031445395900%s", LFending); - sprintf(tf[5], "0.34087293062393136944265847887436%s", LFending); - sprintf(tf[6], "-0.53396936033772524066165487965918%s", LFending); - sprintf(tf[7], "0.87484229096165666561546458979137%s", LFending); - } - else { - sprintf(tf[4], "-0.44095855184409837868031445395900%s", LFending); - sprintf(tf[5], "-0.34087293062393136944265847887436%s", LFending); - sprintf(tf[6], "0.53396936033772524066165487965918%s", LFending); - sprintf(tf[7], "-0.87484229096165666561546458979137%s", LFending); - } - for (uint64_t i = radix - 1; i > 0; i--) { - if (stageSize == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (i == radix - 1) { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 
%s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - else { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - res = VkMulComplex(sc, regID[i], regID[i], w, temp); - if (res != VKFFT_SUCCESS) return res; - } - //important - //res = VkMovComplex(sc, 
regID[1], sc->locID[1]); - //if (res != VKFFT_SUCCESS) return res; - - uint64_t P = 7; - uint64_t Q = 2; - for (uint64_t i = 0; i < Q; i++) { - res = VkMovComplex(sc, sc->locID[0], regID[i]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, sc->locID[1], regID[i + Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, sc->locID[2], regID[i + 2 * Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, sc->locID[3], regID[i + 3 * Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, sc->locID[4], regID[i + 4 * Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, sc->locID[5], regID[i + 5 * Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, sc->locID[6], regID[i + 6 * Q]); - if (res != VKFFT_SUCCESS) return res; - - res = VkAddComplex(sc, regID[i], sc->locID[1], sc->locID[6]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + Q], sc->locID[1], sc->locID[6]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i + 2 * Q], sc->locID[2], sc->locID[5]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 3 * Q], sc->locID[2], sc->locID[5]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i + 4 * Q], sc->locID[4], sc->locID[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 5 * Q], sc->locID[4], sc->locID[3]); - if (res != VKFFT_SUCCESS) return res; - - res = VkAddComplex(sc, sc->locID[5], regID[i + Q], regID[i + 3 * Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[5], sc->locID[5], regID[i + 5 * Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[1], regID[i], regID[i + 2 * Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[1], sc->locID[1], regID[i + 4 * Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[0], sc->locID[0], sc->locID[1]); - if 
(res != VKFFT_SUCCESS) return res; - - res = VkSubComplex(sc, sc->locID[2], regID[i], regID[i + 4 * Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, sc->locID[3], regID[i + 4 * Q], regID[i + 2 * Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, sc->locID[4], regID[i + 2 * Q], regID[i]); - if (res != VKFFT_SUCCESS) return res; - - res = VkSubComplex(sc, regID[i], regID[i + Q], regID[i + 5 * Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 2 * Q], regID[i + 5 * Q], regID[i + 3 * Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 4 * Q], regID[i + 3 * Q], regID[i + Q]); - if (res != VKFFT_SUCCESS) return res; - - res = VkMulComplexNumber(sc, sc->locID[1], sc->locID[1], tf[0]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, sc->locID[2], sc->locID[2], tf[1]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, sc->locID[3], sc->locID[3], tf[2]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, sc->locID[4], sc->locID[4], tf[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, sc->locID[5], sc->locID[5], tf[4]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, regID[i], regID[i], tf[5]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, regID[i + 2 * Q], regID[i + 2 * Q], tf[6]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, regID[i + 4 * Q], regID[i + 4 * Q], tf[7]); - if (res != VKFFT_SUCCESS) return res; - - res = VkSubComplex(sc, regID[i + 5 * Q], regID[i + 4 * Q], regID[i + 2 * Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplexInv(sc, regID[i + 6 * Q], regID[i + 4 * Q], regID[i]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i + 4 * Q], regID[i], regID[i + 2 * Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], sc->locID[0], sc->locID[1]); - if 
(res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i + Q], sc->locID[2], sc->locID[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 2 * Q], sc->locID[4], sc->locID[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplexInv(sc, regID[i + 3 * Q], sc->locID[2], sc->locID[4]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[1], regID[i], regID[i + Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[2], regID[i], regID[i + 2 * Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[3], regID[i], regID[i + 3 * Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[4], regID[i + 4 * Q], sc->locID[5]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[6], regID[i + 6 * Q], sc->locID[5]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[5], sc->locID[5], regID[i + 5 * Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[i], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplexInv(sc, regID[i + Q], sc->locID[1], sc->locID[4], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplexInv(sc, regID[i + 2 * Q], sc->locID[3], sc->locID[6], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplex(sc, regID[i + 3 * Q], sc->locID[2], sc->locID[5], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplexInv(sc, regID[i + 4 * Q], sc->locID[2], sc->locID[5], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplex(sc, regID[i + 5 * Q], sc->locID[3], sc->locID[6], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplex(sc, regID[i + 6 * Q], sc->locID[1], sc->locID[4], 0); - if (res != VKFFT_SUCCESS) return res; - - } - - - for (uint64_t i = 0; i < P; i++) { - if (i > 0) { - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %.17e%s;\n", w, cos(2 * i * 
double_PI / radix), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %.17e%s;\n\n", w, -sin(2 * i * double_PI / radix), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %.17e%s;\n", w, cos(2 * i * double_PI / radix), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %.17e%s;\n\n", w, sin(2 * i * double_PI / radix), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = VkMulComplex(sc, temp, regID[Q * i + 1], w, 0); - if (res != VKFFT_SUCCESS) return res; - } - else { - res = VkMovComplex(sc, temp, regID[Q * i + 1]); - if (res != VKFFT_SUCCESS) return res; - } - res = VkSubComplex(sc, regID[Q * i + 1], regID[Q * i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[Q * i], regID[Q * i], temp); - if (res != VKFFT_SUCCESS) return res; - } - - uint64_t permute2[14] = { 0,2,4,6,8,10,12,1,3,5,7,9,11,13 }; - res = VkPermute(sc, permute2, 14, 1, regID, temp); - if (res != VKFFT_SUCCESS) return res; - - for (uint64_t i = 0; i < 8; i++) { - free(tf[i]); - tf[i] = 0; - } - break; - } - case 15: { - char* tf[5]; - //VkAppendLine(sc, " {\n"); - for (uint64_t i = 0; i < 5; i++) { - tf[i] = (char*)malloc(sizeof(char) * 50); - if (!tf[i]) { - for (uint64_t j = 0; j < i; j++) { - free(tf[j]); - tf[j] = 0; - } - return VKFFT_ERROR_MALLOC_FAILED; - } - } - sprintf(tf[0], "-0.5%s", LFending); - sprintf(tf[1], "1.538841768587626701285145288018455%s", LFending); - sprintf(tf[2], "-0.363271264002680442947733378740309%s", LFending); - sprintf(tf[3], "-0.809016994374947424102293417182819%s", LFending); - sprintf(tf[4], "-0.587785252292473129168705954639073%s", LFending); - - char* tf2[2]; - //VkAppendLine(sc, " {\n"); - for (uint64_t i = 0; i < 2; i++) { - tf2[i] = (char*)malloc(sizeof(char) * 50); 
- if (!tf2[i]) { - for (uint64_t j = 0; j < i; j++) { - free(tf2[j]); - tf2[j] = 0; - } - return VKFFT_ERROR_MALLOC_FAILED; - } - } - - sprintf(tf2[0], "-0.5%s", LFending); - sprintf(tf2[1], "-0.8660254037844386467637231707529%s", LFending); - - for (uint64_t i = radix - 1; i > 0; i--) { - if (stageSize == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (i == radix - 1) { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - else { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = 
sdata[stageInvocationID+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, (radix - 1 - i) * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s);\n", w, cosDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s);\n", w, sinDef, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s);\n", w, 2.0 * i / radix, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - res = VkMulComplex(sc, regID[i], regID[i], w, temp); - if (res != VKFFT_SUCCESS) return res; - } - //important - //res = VkMovComplex(sc, regID[1], sc->locID[1]); - //if (res != VKFFT_SUCCESS) return res; - - uint64_t P = 5; - uint64_t Q = 3; - for (uint64_t i = 0; i < Q; i++) { - res = VkMovComplex(sc, sc->locID[0], regID[i]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, sc->locID[1], regID[i + Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, sc->locID[2], regID[i + 2 * Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, sc->locID[3], regID[i + 3 * Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, sc->locID[4], regID[i + 4 * Q]); - if (res != 
VKFFT_SUCCESS) return res; - - res = VkAddComplex(sc, regID[i + Q], sc->locID[1], sc->locID[4]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i + 2 * Q], sc->locID[2], sc->locID[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 3 * Q], sc->locID[2], sc->locID[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 4 * Q], sc->locID[1], sc->locID[4]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, sc->locID[3], regID[i + Q], regID[i + 2 * Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[4], regID[i + 3 * Q], regID[i + 4 * Q]); - if (res != VKFFT_SUCCESS) return res; - - res = VkAddComplex(sc, sc->locID[0], regID[i], regID[i + Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[0], sc->locID[0], regID[i + 2 * Q]); - if (res != VKFFT_SUCCESS) return res; - res = VkFMAComplex(sc, sc->locID[1], regID[i + Q], tf[0], regID[i]); - if (res != VKFFT_SUCCESS) return res; - res = VkFMAComplex(sc, sc->locID[2], regID[i + 2 * Q], tf[0], regID[i]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, regID[i + 3 * Q], regID[i + 3 * Q], tf[1]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, regID[i + 4 * Q], regID[i + 4 * Q], tf[2]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, sc->locID[3], sc->locID[3], tf[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, sc->locID[4], sc->locID[4], tf[4]); - if (res != VKFFT_SUCCESS) return res; - - res = VkSubComplex(sc, sc->locID[1], sc->locID[1], sc->locID[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[2], sc->locID[2], sc->locID[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[3], regID[i + 3 * Q], sc->locID[4]); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->locID[4], sc->locID[4], regID[i + 4 * Q]); - if 
(res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[i], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - - if (stageAngle < 0) - { - res = VkShuffleComplex(sc, regID[i + Q], sc->locID[1], sc->locID[4], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplex(sc, regID[i + 2 * Q], sc->locID[2], sc->locID[3], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplexInv(sc, regID[i + 3 * Q], sc->locID[2], sc->locID[3], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplexInv(sc, regID[i + 4 * Q], sc->locID[1], sc->locID[4], 0); - if (res != VKFFT_SUCCESS) return res; - } - else { - res = VkShuffleComplexInv(sc, regID[i + Q], sc->locID[1], sc->locID[4], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplexInv(sc, regID[i + 2 * Q], sc->locID[2], sc->locID[3], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplex(sc, regID[i + 3 * Q], sc->locID[2], sc->locID[3], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplex(sc, regID[i + 4 * Q], sc->locID[1], sc->locID[4], 0); - if (res != VKFFT_SUCCESS) return res; - } - - } - - - for (uint64_t i = 0; i < P; i++) { - if (i > 0) { - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %.17e%s;\n", w, cos(2 * i * double_PI / radix), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %.17e%s;\n\n", w, -sin(2 * i * double_PI / radix), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %.17e%s;\n", w, cos(2 * i * double_PI / radix), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %.17e%s;\n\n", w, sin(2 * i * double_PI / radix), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = VkMulComplex(sc, sc->locID[1], regID[Q * i + 1], w, temp); - if (res != 
VKFFT_SUCCESS) return res; - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %.17e%s;\n", w, cos(4 * i * double_PI / radix), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %.17e%s;\n\n", w, -sin(4 * i * double_PI / radix), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %.17e%s;\n", w, cos(4 * i * double_PI / radix), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %.17e%s;\n\n", w, sin(4 * i * double_PI / radix), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = VkMulComplex(sc, sc->locID[2], regID[Q * i + 2], w, temp); - if (res != VKFFT_SUCCESS) return res; - } - else { - res = VkMovComplex(sc, sc->locID[1], regID[1]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, sc->locID[2], regID[2]); - if (res != VKFFT_SUCCESS) return res; - } - - res = VkAddComplex(sc, regID[Q * i + 1], sc->locID[1], sc->locID[2]); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[Q * i + 2], sc->locID[1], sc->locID[2]); - if (res != VKFFT_SUCCESS) return res; - - res = VkAddComplex(sc, sc->locID[0], regID[Q * i], regID[Q * i + 1]); - if (res != VKFFT_SUCCESS) return res; - res = VkFMAComplex(sc, sc->locID[1], regID[Q * i + 1], tf2[0], regID[Q * i]); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, sc->locID[2], regID[Q * i + 2], tf2[1]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[Q * i], sc->locID[0]); - if (res != VKFFT_SUCCESS) return res; - if (stageAngle < 0) - { - res = VkShuffleComplex(sc, regID[Q * i + 1], sc->locID[1], sc->locID[2], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplexInv(sc, regID[Q * i + 2], sc->locID[1], sc->locID[2], 0); - if (res != VKFFT_SUCCESS) return res; - } - 
else { - res = VkShuffleComplexInv(sc, regID[Q * i + 1], sc->locID[1], sc->locID[2], 0); - if (res != VKFFT_SUCCESS) return res; - res = VkShuffleComplex(sc, regID[Q * i + 2], sc->locID[1], sc->locID[2], 0); - if (res != VKFFT_SUCCESS) return res; - } - } - - uint64_t permute2[15] = { 0, 3, 6, 9, 12, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14 }; - res = VkPermute(sc, permute2, 15, 1, regID, temp); - if (res != VKFFT_SUCCESS) return res; - - for (uint64_t i = 0; i < 5; i++) { - free(tf[i]); - tf[i] = 0; - } - for (uint64_t i = 0; i < 2; i++) { - free(tf2[i]); - tf2[i] = 0; - } - break; - } - case 16: { - if (res != VKFFT_SUCCESS) return res; - if (stageSize == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle);\n", w, cosDef); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle);\n", w, sinDef); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle);\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (!strcmp(floatType, "double")) { - sc->tempLen = 
sprintf(sc->tempStr, " %s = sincos_20(angle);\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - for (uint64_t i = 0; i < 8; i++) { - res = VkMulComplex(sc, temp, regID[i + 8], w, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 8], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - if (stageSize == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(0.5%s*angle);\n", w, cosDef, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(0.5%s*angle);\n", w, sinDef, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s=normalize(%s + %s(1.0, 0.0));\n", w, w, vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - for (uint64_t i = 0; i < 4; i++) { - res = VkMulComplex(sc, temp, regID[i + 4], w, 0); - if (res != VKFFT_SUCCESS) return res; 
- res = VkSubComplex(sc, regID[i + 4], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType); - } - - for (uint64_t i = 8; i < 12; i++) { - res = VkMulComplex(sc, temp, regID[i + 4], iw, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 4], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - if (stageSize == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, 2 * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, 2 * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = 
-%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(0.25%s*angle);\n", w, cosDef, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(0.25%s*angle);\n", w, sinDef, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(0.25*angle), sin(0.25*angle));\n\n", vecType); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s=normalize(%s + %s(1.0, 0.0));\n", w, w, vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - for (uint64_t i = 0; i < 2; i++) { - res = VkMulComplex(sc, temp, regID[i + 2], w, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 2], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType); - } - for (uint64_t i = 4; i < 6; i++) { - res = VkMulComplex(sc, temp, regID[i + 2], iw, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 2], regID[i], temp); - if (res != 
VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * loc_SQRT1_2 + %s.y * loc_SQRT1_2;\n", iw, w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * loc_SQRT1_2 - %s.x * loc_SQRT1_2;\n\n", iw, w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * loc_SQRT1_2 - %s.y * loc_SQRT1_2;\n", iw, w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * loc_SQRT1_2 + %s.x * loc_SQRT1_2;\n\n", iw, w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t i = 8; i < 10; i++) { - res = VkMulComplex(sc, temp, regID[i + 2], iw, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 2], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", w, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", w, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(iw.y, -iw.x);\n\n", vecType); - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", w, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", w, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(-iw.y, iw.x);\n\n", vecType); - } - for (uint64_t i = 12; i < 14; i++) { - res = VkMulComplex(sc, temp, regID[i + 2], w, 0); - if (res != 
VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 2], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - - if (stageSize == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, 3 * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, 3 * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(0.125%s*angle);\n", w, cosDef, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(0.125%s*angle);\n", w, sinDef, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(0.25*angle), sin(0.25*angle));\n\n", vecType); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s=normalize(%s + %s(1.0, 0.0));\n", w, w, vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - - for (uint64_t i = 0; i < 1; i++) { - res = VkMulComplex(sc, temp, regID[i + 1], w, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 1], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = 
VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType); - } - for (uint64_t i = 2; i < 3; i++) { - res = VkMulComplex(sc, temp, regID[i + 1], iw, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 1], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - - - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * loc_SQRT1_2 + %s.y * loc_SQRT1_2;\n", iw, w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * loc_SQRT1_2 - %s.x * loc_SQRT1_2;\n\n", iw, w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * loc_SQRT1_2 - %s.y * loc_SQRT1_2;\n", iw, w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * loc_SQRT1_2 + %s.x * loc_SQRT1_2;\n\n", iw, w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t i = 4; i < 5; i++) { - res = VkMulComplex(sc, temp, regID[i + 1], iw, 0); - if (res != VKFFT_SUCCESS) return res; - res = 
VkSubComplex(sc, regID[i + 1], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", temp, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", temp, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, iw, temp); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", temp, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", temp, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, iw, temp); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t i = 6; i < 7; i++) { - res = VkMulComplex(sc, temp, regID[i + 1], iw, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 1], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - - - for (uint64_t j = 0; j < 2; j++) { - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s + %s.y * %.17e%s;\n", iw, w, cos((2 * j + 1) * double_PI / 8), LFending, w, sin((2 * j + 1) * double_PI / 8), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s - %s.x * %.17e%s;\n\n", iw, w, cos((2 * j + 1) * double_PI / 8), LFending, w, sin((2 * j + 1) * double_PI / 8), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s - %s.y * %.17e%s;\n", iw, w, cos((2 * j + 1) * double_PI / 8), LFending, w, sin((2 * j + 1) * double_PI / 8), LFending); - res = 
VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s + %s.x * %.17e%s;\n\n", iw, w, cos((2 * j + 1) * double_PI / 8), LFending, w, sin((2 * j + 1) * double_PI / 8), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t i = 8 + 4 * j; i < 9 + 4 * j; i++) { - res = VkMulComplex(sc, temp, regID[i + 1], iw, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 1], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", temp, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", temp, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, iw, temp); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", temp, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", temp, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, iw, temp); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t i = 10 + 4 * j; i < 11 + 4 * j; i++) { - res = VkMulComplex(sc, temp, regID[i + 1], iw, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 1], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - } - - uint64_t permute2[16] = { 0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15 }; - res = VkPermute(sc, permute2, 16, 1, regID, temp); - if (res != VKFFT_SUCCESS) return res; - - /*res = VkMovComplex(sc, temp, regID[1]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, 
regID[1], regID[8]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[8], temp); - if (res != VKFFT_SUCCESS) return res; - - res = VkMovComplex(sc, temp, regID[2]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[2], regID[4]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[4], temp); - if (res != VKFFT_SUCCESS) return res; - - res = VkMovComplex(sc, temp, regID[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[3], regID[12]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[12], temp); - if (res != VKFFT_SUCCESS) return res; - - res = VkMovComplex(sc, temp, regID[5]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[5], regID[10]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[10], temp); - if (res != VKFFT_SUCCESS) return res; - - res = VkMovComplex(sc, temp, regID[7]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[7], regID[14]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[14], temp); - if (res != VKFFT_SUCCESS) return res; - - res = VkMovComplex(sc, temp, regID[11]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[11], regID[13]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[13], temp); - if (res != VKFFT_SUCCESS) return res;*/ - break; - } - case 32: { - if (res != VKFFT_SUCCESS) return res; - if (stageSize == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = 
sprintf(sc->tempStr, " %s = twiddleLUT[LUTId];\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle);\n", w, cosDef); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle);\n", w, sinDef); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle);\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle);\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - for (uint64_t i = 0; i < 16; i++) { - res = VkMulComplex(sc, temp, regID[i + 16], w, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 16], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - if (stageSize == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if 
(stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(0.5%s*angle);\n", w, cosDef, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(0.5%s*angle);\n", w, sinDef, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s=normalize(%s + %s(1.0, 0.0));\n", w, w, vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - for (uint64_t i = 0; i < 8; i++) { - res = VkMulComplex(sc, temp, regID[i + 8], w, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 8], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType); - } - - for (uint64_t i = 16; i < 24; i++) { - res = VkMulComplex(sc, temp, regID[i + 8], iw, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 8], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = 
VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - if (stageSize == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, 2 * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, 2 * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(0.25%s*angle);\n", w, cosDef, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(0.25%s*angle);\n", w, sinDef, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(0.25*angle), sin(0.25*angle));\n\n", vecType); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s=normalize(%s + %s(1.0, 0.0));\n", w, w, vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - for (uint64_t i = 0; i < 4; i++) { - res = VkMulComplex(sc, temp, regID[i + 4], w, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 4], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " 
%s.x = %s.y;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType); - } - for (uint64_t i = 8; i < 12; i++) { - res = VkMulComplex(sc, temp, regID[i + 4], iw, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 4], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * loc_SQRT1_2 + %s.y * loc_SQRT1_2;\n", iw, w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * loc_SQRT1_2 - %s.x * loc_SQRT1_2;\n\n", iw, w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * loc_SQRT1_2 - %s.y * loc_SQRT1_2;\n", iw, w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * loc_SQRT1_2 + %s.x * loc_SQRT1_2;\n\n", iw, w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t i = 16; i < 20; i++) { - res = VkMulComplex(sc, temp, regID[i + 4], iw, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 4], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != 
VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", w, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", w, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(iw.y, -iw.x);\n\n", vecType); - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", w, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", w, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(-iw.y, iw.x);\n\n", vecType); - } - for (uint64_t i = 24; i < 28; i++) { - res = VkMulComplex(sc, temp, regID[i + 4], w, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 4], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - - if (stageSize == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, 3 * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, 3 * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = 
sprintf(sc->tempStr, " %s.x = %s(0.125%s*angle);\n", w, cosDef, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(0.125%s*angle);\n", w, sinDef, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(0.25*angle), sin(0.25*angle));\n\n", vecType); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s=normalize(%s + %s(1.0, 0.0));\n", w, w, vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - - for (uint64_t i = 0; i < 2; i++) { - res = VkMulComplex(sc, temp, regID[i + 2], w, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 2], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType); - } - for (uint64_t i = 4; i < 6; i++) { - res = VkMulComplex(sc, temp, regID[i + 2], iw, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 2], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - - - if (stageAngle < 0) { - 
sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * loc_SQRT1_2 + %s.y * loc_SQRT1_2;\n", iw, w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * loc_SQRT1_2 - %s.x * loc_SQRT1_2;\n\n", iw, w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * loc_SQRT1_2 - %s.y * loc_SQRT1_2;\n", iw, w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * loc_SQRT1_2 + %s.x * loc_SQRT1_2;\n\n", iw, w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t i = 8; i < 10; i++) { - res = VkMulComplex(sc, temp, regID[i + 2], iw, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 2], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", temp, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", temp, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, iw, temp); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", temp, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", temp, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, iw, temp); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t i = 12; i < 14; i++) { - res = VkMulComplex(sc, temp, regID[i + 2], iw, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 2], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, 
regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - - - for (uint64_t j = 0; j < 2; j++) { - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s + %s.y * %.17e%s;\n", iw, w, cos((2 * j + 1) * double_PI / 8), LFending, w, sin((2 * j + 1) * double_PI / 8), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s - %s.x * %.17e%s;\n\n", iw, w, cos((2 * j + 1) * double_PI / 8), LFending, w, sin((2 * j + 1) * double_PI / 8), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s - %s.y * %.17e%s;\n", iw, w, cos((2 * j + 1) * double_PI / 8), LFending, w, sin((2 * j + 1) * double_PI / 8), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s + %s.x * %.17e%s;\n\n", iw, w, cos((2 * j + 1) * double_PI / 8), LFending, w, sin((2 * j + 1) * double_PI / 8), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t i = 16 + 8 * j; i < 18 + 8 * j; i++) { - res = VkMulComplex(sc, temp, regID[i + 2], iw, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 2], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", temp, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", temp, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, iw, temp); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", temp, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return 
res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", temp, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, iw, temp); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t i = 20 + 8 * j; i < 22 + 8 * j; i++) { - res = VkMulComplex(sc, temp, regID[i + 2], iw, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 2], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - } - - if (stageSize == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 1;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->LUT) { - if (sc->useCoalescedLUTUploadToSM) { - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[stageInvocationID+%" PRIu64 "];\n\n", w, 4 * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%" PRIu64 "];\n\n", w, 4 * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(0.0625%s*angle);\n", w, cosDef, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(0.0625%s*angle);\n", w, sinDef, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(0.25*angle), sin(0.25*angle));\n\n", vecType); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s=normalize(%s + %s(1.0, 0.0));\n", 
w, w, vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - - for (uint64_t i = 0; i < 1; i++) { - res = VkMulComplex(sc, temp, regID[i + 1], w, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 1], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(w.y, -w.x);\n\n", vecType); - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", iw, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " iw = %s(-w.y, w.x);\n\n", vecType); - } - for (uint64_t i = 2; i < 3; i++) { - res = VkMulComplex(sc, temp, regID[i + 1], iw, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 1], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - - - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * loc_SQRT1_2 + %s.y * loc_SQRT1_2;\n", iw, w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * loc_SQRT1_2 - %s.x * loc_SQRT1_2;\n\n", iw, w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * loc_SQRT1_2 - %s.y * loc_SQRT1_2;\n", iw, w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return 
res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * loc_SQRT1_2 + %s.x * loc_SQRT1_2;\n\n", iw, w, w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t i = 4; i < 5; i++) { - res = VkMulComplex(sc, temp, regID[i + 1], iw, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 1], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", temp, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", temp, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, iw, temp); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", temp, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", temp, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, iw, temp); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t i = 6; i < 7; i++) { - res = VkMulComplex(sc, temp, regID[i + 1], iw, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 1], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - - - for (uint64_t j = 0; j < 2; j++) { - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s + %s.y * %.17e%s;\n", iw, w, cos((2 * j + 1) * double_PI / 8), LFending, w, sin((2 * j + 1) * double_PI / 8), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s - %s.x * %.17e%s;\n\n", iw, w, cos((2 * j + 1) * double_PI / 8), LFending, 
w, sin((2 * j + 1) * double_PI / 8), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s - %s.y * %.17e%s;\n", iw, w, cos((2 * j + 1) * double_PI / 8), LFending, w, sin((2 * j + 1) * double_PI / 8), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s + %s.x * %.17e%s;\n\n", iw, w, cos((2 * j + 1) * double_PI / 8), LFending, w, sin((2 * j + 1) * double_PI / 8), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t i = 8 + 4 * j; i < 9 + 4 * j; i++) { - res = VkMulComplex(sc, temp, regID[i + 1], iw, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 1], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", temp, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", temp, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, iw, temp); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", temp, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", temp, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, iw, temp); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t i = 10 + 4 * j; i < 11 + 4 * j; i++) { - res = VkMulComplex(sc, temp, regID[i + 1], iw, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 1], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res 
!= VKFFT_SUCCESS) return res; - } - } - - for (uint64_t j = 0; j < 4; j++) { - if ((j == 1) || (j == 2)) { - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s + %s.y * %.17e%s;\n", iw, w, cos((7 - 2 * j) * double_PI / 16), LFending, w, sin((7 - 2 * j) * double_PI / 16), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s - %s.x * %.17e%s;\n\n", iw, w, cos((7 - 2 * j) * double_PI / 16), LFending, w, sin((7 - 2 * j) * double_PI / 16), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s - %s.y * %.17e%s;\n", iw, w, cos((7 - 2 * j) * double_PI / 16), LFending, w, sin((7 - 2 * j) * double_PI / 16), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s + %s.x * %.17e%s;\n\n", iw, w, cos((7 - 2 * j) * double_PI / 16), LFending, w, sin((7 - 2 * j) * double_PI / 16), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s + %s.y * %.17e%s;\n", iw, w, cos((2 * j + 1) * double_PI / 16), LFending, w, sin((2 * j + 1) * double_PI / 16), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * %.17e%s - %s.x * %.17e%s;\n\n", iw, w, cos((2 * j + 1) * double_PI / 16), LFending, w, sin((2 * j + 1) * double_PI / 16), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x * %.17e%s - %s.y * %.17e%s;\n", iw, w, cos((2 * j + 1) * double_PI / 16), LFending, w, sin((2 * j + 1) * double_PI / 16), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 
%s.y * %.17e%s + %s.x * %.17e%s;\n\n", iw, w, cos((2 * j + 1) * double_PI / 16), LFending, w, sin((2 * j + 1) * double_PI / 16), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - for (uint64_t i = 16 + 4 * j; i < 17 + 4 * j; i++) { - res = VkMulComplex(sc, temp, regID[i + 1], iw, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 1], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - if (stageAngle < 0) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.y;\n", temp, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.x;\n", temp, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, iw, temp); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = -%s.y;\n", temp, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x;\n", temp, iw); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, iw, temp); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t i = 18 + 4 * j; i < 19 + 4 * j; i++) { - res = VkMulComplex(sc, temp, regID[i + 1], iw, 0); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, regID[i + 1], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, regID[i], regID[i], temp); - if (res != VKFFT_SUCCESS) return res; - } - } - - uint64_t permute2[32] = { 0,16,8,24,4,20,12,28,2,18,10,26,6,22,14,30,1,17,9,25,5,21,13,29,3,19,11,27,7,23,15,31 }; - res = VkPermute(sc, permute2, 32, 1, regID, temp); - if (res != VKFFT_SUCCESS) return res; - - /*res = VkMovComplex(sc, temp, regID[1]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[1], regID[16]); - if (res != VKFFT_SUCCESS) 
return res; - res = VkMovComplex(sc, regID[16], temp); - if (res != VKFFT_SUCCESS) return res; - - res = VkMovComplex(sc, temp, regID[2]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[2], regID[8]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[8], temp); - if (res != VKFFT_SUCCESS) return res; - - res = VkMovComplex(sc, temp, regID[3]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[3], regID[24]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[24], temp); - if (res != VKFFT_SUCCESS) return res; - - res = VkMovComplex(sc, temp, regID[5]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[5], regID[20]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[20], temp); - if (res != VKFFT_SUCCESS) return res; - - res = VkMovComplex(sc, temp, regID[6]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[6], regID[12]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[12], temp); - if (res != VKFFT_SUCCESS) return res; - - res = VkMovComplex(sc, temp, regID[7]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[7], regID[28]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[28], temp); - if (res != VKFFT_SUCCESS) return res; - - res = VkMovComplex(sc, temp, regID[9]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[9], regID[18]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[18], temp); - if (res != VKFFT_SUCCESS) return res; - - res = VkMovComplex(sc, temp, regID[11]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[11], regID[26]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[26], temp); - if (res != VKFFT_SUCCESS) return res; - - res = VkMovComplex(sc, temp, regID[13]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[13], 
regID[22]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[22], temp); - if (res != VKFFT_SUCCESS) return res; - - res = VkMovComplex(sc, temp, regID[15]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[15], regID[30]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[30], temp); - if (res != VKFFT_SUCCESS) return res; - - res = VkMovComplex(sc, temp, regID[19]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[19], regID[25]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[25], temp); - if (res != VKFFT_SUCCESS) return res; - - res = VkMovComplex(sc, temp, regID[23]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[23], regID[29]); - if (res != VKFFT_SUCCESS) return res; - res = VkMovComplex(sc, regID[29], temp); - if (res != VKFFT_SUCCESS) return res;*/ - - break; - } - } - return res; -} -static inline VkFFTResult appendSharedMemoryVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t sharedType) { - VkFFTResult res = VKFFT_SUCCESS; - char vecType[30]; - char sharedDefinitions[20] = ""; - uint64_t vecSize = 1; - uint64_t maxSequenceSharedMemory = 0; - //uint64_t maxSequenceSharedMemoryPow2 = 0; - if (!strcmp(floatType, "float")) - { -#if(VKFFT_BACKEND==0) - sprintf(vecType, "vec2"); - sprintf(sharedDefinitions, "shared"); -#elif(VKFFT_BACKEND==1) - sprintf(vecType, "float2"); - sprintf(sharedDefinitions, "__shared__"); -#elif(VKFFT_BACKEND==2) - sprintf(vecType, "float2"); - sprintf(sharedDefinitions, "__shared__"); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sprintf(vecType, "float2"); - sprintf(sharedDefinitions, "__local"); -#endif - vecSize = 8; - } - if (!strcmp(floatType, "double")) { -#if(VKFFT_BACKEND==0) - sprintf(vecType, "dvec2"); - sprintf(sharedDefinitions, "shared"); -#elif(VKFFT_BACKEND==1) - sprintf(vecType, "double2"); - sprintf(sharedDefinitions, 
"__shared__"); -#elif(VKFFT_BACKEND==2) - sprintf(vecType, "double2"); - sprintf(sharedDefinitions, "__shared__"); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sprintf(vecType, "double2"); - sprintf(sharedDefinitions, "__local"); -#endif - vecSize = 16; - } - if (sc->useRaderMult) { - sc->sharedMemSize -= sc->additionalRaderSharedSize * vecSize; - sc->sharedMemSizePow2 -= sc->additionalRaderSharedSize * vecSize; - } - maxSequenceSharedMemory = sc->sharedMemSize / vecSize; - //maxSequenceSharedMemoryPow2 = sc->sharedMemSizePow2 / vecSize; - uint64_t mergeR2C = (sc->mergeSequencesR2C && (sc->axis_id == 0)) ? 2 : 0; - switch (sharedType) { - case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144://single_c2c + single_r2c - { - sc->resolveBankConflictFirstStages = 0; - sc->sharedStrideBankConflictFirstStages = ((sc->fftDim > sc->numSharedBanks / 2) && ((sc->fftDim & (sc->fftDim - 1)) == 0)) ? sc->fftDim / sc->registerBoost * (sc->numSharedBanks / 2 + 1) / (sc->numSharedBanks / 2) : sc->fftDim / sc->registerBoost; - sc->sharedStrideReadWriteConflict = ((sc->numSharedBanks / 2 <= sc->localSize[1])) ? 
sc->fftDim / sc->registerBoost + 1 : sc->fftDim / sc->registerBoost + (sc->numSharedBanks / 2) / sc->localSize[1]; - if (sc->sharedStrideReadWriteConflict < sc->fftDim / sc->registerBoost + mergeR2C) sc->sharedStrideReadWriteConflict = sc->fftDim / sc->registerBoost + mergeR2C; - if (sc->useRaderFFT) { - uint64_t max_stride = sc->fftDim; - uint64_t max_shift = 0; - for (uint64_t i = 0; i < sc->numRaderPrimes; i++) { - - for (uint64_t j = 0; j < sc->raderContainer[i].numStages; j++) { - if (sc->raderContainer[i].containerFFTNum < 8) { - uint64_t subLogicalGroupSize = (uint64_t)ceil(sc->raderContainer[i].containerFFTDim / (double)sc->raderContainer[i].registers_per_thread_per_radix[sc->raderContainer[i].stageRadix[j]]); // hopefully it is not <1, will fix - uint64_t shift = (subLogicalGroupSize > (sc->raderContainer[i].containerFFTDim % (sc->numSharedBanks / 2))) ? subLogicalGroupSize - sc->raderContainer[i].containerFFTDim % (sc->numSharedBanks / 2) : 0; - if (j == 0) shift = (sc->raderContainer[i].containerFFTDim % (sc->numSharedBanks / 2)) ? 0 : 1; - uint64_t loc_stride = sc->raderContainer[i].containerFFTDim + shift; - if (sc->raderContainer[i].containerFFTNum * (loc_stride + 1) > max_stride) { - max_stride = sc->raderContainer[i].containerFFTNum * (loc_stride + 1); - if (shift > max_shift) max_shift = shift; - } - } - } - } - sc->sharedShiftRaderFFT = max_shift; - sc->sharedStrideRaderFFT = max_stride; - } - - sc->maxSharedStride = (sc->sharedStrideBankConflictFirstStages < sc->sharedStrideReadWriteConflict) ? sc->sharedStrideReadWriteConflict : sc->sharedStrideBankConflictFirstStages; - - if (sc->useRaderFFT) - sc->maxSharedStride = (sc->maxSharedStride < sc->sharedStrideRaderFFT) ? sc->sharedStrideRaderFFT : sc->maxSharedStride; - - sc->usedSharedMemory = vecSize * sc->localSize[1] * sc->maxSharedStride; - sc->maxSharedStride = ((sc->sharedMemSize < sc->usedSharedMemory)) ? 
sc->fftDim / sc->registerBoost : sc->maxSharedStride; - - sc->sharedStrideBankConflictFirstStages = (sc->maxSharedStride == sc->fftDim / sc->registerBoost) ? sc->fftDim / sc->registerBoost : sc->sharedStrideBankConflictFirstStages; - sc->sharedStrideReadWriteConflict = (sc->maxSharedStride == sc->fftDim / sc->registerBoost) ? sc->fftDim / sc->registerBoost : sc->sharedStrideReadWriteConflict; - if (sc->useRaderFFT) { - sc->sharedStrideRaderFFT = (sc->maxSharedStride == sc->fftDim / sc->registerBoost) ? sc->fftDim / sc->registerBoost : sc->sharedStrideRaderFFT; - sc->sharedShiftRaderFFT = (sc->maxSharedStride == sc->fftDim / sc->registerBoost) ? 0 : sc->sharedShiftRaderFFT; - } - //sc->maxSharedStride += mergeR2C; - //printf("%" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", sc->maxSharedStride, sc->sharedStrideBankConflictFirstStages, sc->sharedStrideReadWriteConflict, sc->localSize[1], sc->fftDim); - sc->tempLen = sprintf(sc->tempStr, "%s sharedStride = %" PRIu64 ";\n", uintType, sc->sharedStrideReadWriteConflict); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->usedSharedMemory = vecSize * sc->localSize[1] * sc->maxSharedStride; - if (sc->useRaderMult) { - for (uint64_t i = 0; i < 20; i++) { - sc->RaderKernelOffsetShared[i] += sc->usedSharedMemory / vecSize; - } - sc->usedSharedMemory += sc->additionalRaderSharedSize * vecSize; - } -#if(VKFFT_BACKEND==0) - sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType, sc->usedSharedMemory / vecSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#elif(VKFFT_BACKEND==1) - //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType, sc->localSize[1] * sc->maxSharedStride); - sc->tempLen = sprintf(sc->tempStr, "%s* sdata = (%s*)shared;\n\n", 
vecType, vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType); -#elif(VKFFT_BACKEND==2) - //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType, sc->localSize[1] * sc->maxSharedStride); - sc->tempLen = sprintf(sc->tempStr, "%s* sdata = (%s*)shared;\n\n", vecType, vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];// sharedStride - fft size, gl_WorkGroupSize.y - grouped consecutive ffts\n\n", sharedDefinitions, vecType, sc->usedSharedMemory / vecSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#endif - break; - } - case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145://grouped_c2c + single_c2c_strided - { - uint64_t shift = (sc->fftDim < (sc->numSharedBanks / 2)) ? (sc->numSharedBanks / 2) / sc->fftDim : 1; - sc->sharedStrideReadWriteConflict = ((sc->axisSwapped) && ((sc->localSize[0] % 4) == 0)) ? sc->localSize[0] + shift : sc->localSize[0]; - sc->maxSharedStride = ((maxSequenceSharedMemory < sc->sharedStrideReadWriteConflict* sc->fftDim / sc->registerBoost)) ? sc->localSize[0] : sc->sharedStrideReadWriteConflict; - sc->sharedStrideReadWriteConflict = (sc->maxSharedStride == sc->localSize[0]) ? 
sc->localSize[0] : sc->sharedStrideReadWriteConflict; - sc->tempLen = sprintf(sc->tempStr, "%s sharedStride = %" PRIu64 ";\n", uintType, sc->maxSharedStride); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->usedSharedMemory = vecSize * sc->maxSharedStride * (sc->fftDim + mergeR2C) / sc->registerBoost; - if (sc->useRaderMult) { - for (uint64_t i = 0; i < 20; i++) { - sc->RaderKernelOffsetShared[i] += sc->usedSharedMemory / vecSize; - } - sc->usedSharedMemory += sc->additionalRaderSharedSize * vecSize; - } -#if(VKFFT_BACKEND==0) - sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];\n\n", sharedDefinitions, vecType, sc->usedSharedMemory / vecSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#elif(VKFFT_BACKEND==1) - //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];\n\n", sharedDefinitions, vecType, sc->maxSharedStride * (sc->fftDim + mergeR2C) / sc->registerBoost); - sc->tempLen = sprintf(sc->tempStr, "%s* sdata = (%s*)shared;\n\n", vecType, vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[];\n\n", sharedDefinitions, vecType); -#elif(VKFFT_BACKEND==2) - //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];\n\n", sharedDefinitions, vecType, sc->maxSharedStride * (sc->fftDim + mergeR2C) / sc->registerBoost); - sc->tempLen = sprintf(sc->tempStr, "%s* sdata = (%s*)shared;\n\n", vecType, vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[];\n\n", sharedDefinitions, vecType); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sc->tempLen = sprintf(sc->tempStr, "%s %s sdata[%" PRIu64 "];\n\n", sharedDefinitions, vecType, sc->usedSharedMemory / vecSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#endif - break; - } - } - if (sc->useRaderMult) { - sc->sharedMemSize += sc->additionalRaderSharedSize * vecSize; - 
sc->sharedMemSizePow2 += sc->additionalRaderSharedSize * vecSize; - } - return res; -} -static inline VkFFTResult appendInitialization(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t initType) { - VkFFTResult res = VKFFT_SUCCESS; - char vecType[30]; - char uintType_32[30]; -#if(VKFFT_BACKEND==0) - if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); - sprintf(uintType_32, "uint"); -#elif(VKFFT_BACKEND==1) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - sprintf(uintType_32, "unsigned int"); -#elif(VKFFT_BACKEND==2) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - sprintf(uintType_32, "unsigned int"); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - sprintf(uintType_32, "unsigned int"); -#endif - //sc->tempLen = sprintf(sc->tempStr, " uint dum=gl_LocalInvocationID.x;\n"); - uint64_t logicalStoragePerThread = sc->registers_per_thread * sc->registerBoost; - uint64_t logicalRegistersPerThread = sc->registers_per_thread; - if (sc->convolutionStep) { - for (uint64_t i = 0; i < sc->registers_per_thread; i++) { - sc->tempLen = sprintf(sc->tempStr, " %s temp_%" PRIu64 ";\n", vecType, i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " temp_%" PRIu64 ".x=0;\n", i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " temp_%" PRIu64 ".y=0;\n", i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t j = 1; j < sc->matrixConvolution; j++) { - for (uint64_t i = 0; i < sc->registers_per_thread; i++) { - sc->tempLen = sprintf(sc->tempStr, " %s 
temp_%" PRIu64 "_%" PRIu64 ";\n", vecType, i, j); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " temp_%" PRIu64 "_%" PRIu64 ".x=0;\n", i, j); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " temp_%" PRIu64 "_%" PRIu64 ".y=0;\n", i, j); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - else { - for (uint64_t i = 0; i < sc->registers_per_thread; i++) { - sc->tempLen = sprintf(sc->tempStr, " %s temp_%" PRIu64 ";\n", vecType, i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " temp_%" PRIu64 ".x=0;\n", i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " temp_%" PRIu64 ".y=0;\n", i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - //sc->tempLen = sprintf(sc->tempStr, " uint dum=gl_LocalInvocationID.y;//gl_LocalInvocationID.x/gl_WorkGroupSize.x;\n"); - //sc->tempLen = sprintf(sc->tempStr, " dum=dum/gl_LocalInvocationID.x-1;\n"); - //sc->tempLen = sprintf(sc->tempStr, " dummy=dummy/gl_LocalInvocationID.x-1;\n"); - sc->regIDs = (char**)malloc(sizeof(char*) * logicalStoragePerThread); - if (!sc->regIDs) return VKFFT_ERROR_MALLOC_FAILED; - for (uint64_t i = 0; i < logicalStoragePerThread; i++) { - sc->regIDs[i] = (char*)malloc(sizeof(char) * 50); - if (!sc->regIDs[i]) { - for (uint64_t j = 0; j < i; j++) { - free(sc->regIDs[j]); - sc->regIDs[j] = 0; - } - free(sc->regIDs); - sc->regIDs = 0; - return VKFFT_ERROR_MALLOC_FAILED; - } - if (i < logicalRegistersPerThread) - sprintf(sc->regIDs[i], "temp_%" PRIu64 "", i); - else - sprintf(sc->regIDs[i], "temp_%" PRIu64 "", i); - //sprintf(sc->regIDs[i], "%" PRIu64 "[%" PRIu64 "]", i / logicalRegistersPerThread, i % logicalRegistersPerThread); - //sprintf(sc->regIDs[i], "s[%" PRIu64 "]", i - logicalRegistersPerThread); - - } - if (sc->registerBoost > 1) 
{ - //sc->tempLen = sprintf(sc->tempStr, " %s sort0;\n", vecType); - //sc->tempLen = sprintf(sc->tempStr, " %s temps[%" PRIu64 "];\n", vecType, (sc->registerBoost -1)* logicalRegistersPerThread); - for (uint64_t i = 1; i < sc->registerBoost; i++) { - //sc->tempLen = sprintf(sc->tempStr, " %s temp%" PRIu64 "[%" PRIu64 "];\n", vecType, i, logicalRegistersPerThread); - for (uint64_t j = 0; j < sc->registers_per_thread; j++) { - sc->tempLen = sprintf(sc->tempStr, " %s temp_%" PRIu64 ";\n", vecType, j + i * sc->registers_per_thread); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " temp_%" PRIu64 ".x=0;\n", j + i * sc->registers_per_thread); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " temp_%" PRIu64 ".y=0;\n", j + i * sc->registers_per_thread); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - /*sc->tempLen = sprintf(sc->tempStr, "\ -for(uint i=0; i<%" PRIu64 "; i++)\n\ -temp%" PRIu64 "[i]=%s(dum, dum);\n", logicalRegistersPerThread, i, vecType);*/ - } - } - sc->tempLen = sprintf(sc->tempStr, " %s w;\n", vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " w.x=0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " w.y=0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(sc->w, "w"); - - uint64_t maxNonPow2Radix = sc->maxNonPow2Radix; - for (uint64_t i = 0; i < sc->usedLocRegs; i++) { - sprintf(sc->locID[i], "loc_%" PRIu64 "", i); - sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, sc->locID[i]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x=0;\n", sc->locID[i]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y=0;\n", sc->locID[i]); - res = 
VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sprintf(sc->temp, "%s", sc->locID[0]); - if (sc->useRaderFFT) { - for (uint64_t i = 0; i < 2; i++) { - sprintf(sc->x0[i], "x0_%" PRIu64 "", i); - sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, sc->x0[i]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x=0;\n", sc->x0[i]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y=0;\n", sc->x0[i]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->useRaderMult) { - int64_t rader_fft_regs = (sc->useRaderFFT) ? 2 : 0; - int64_t rader_mult_regs = sc->raderRegisters / 2 - rader_fft_regs; - if (rader_mult_regs <= (int64_t)sc->usedLocRegs - 1) { - for (int64_t i = 0; i < rader_mult_regs; i++) { - sprintf(sc->x0[i + rader_fft_regs], "%s", sc->locID[i + 1]); - } - } - else { - for (int64_t i = 0; i < (int64_t)sc->usedLocRegs - 1; i++) { - sprintf(sc->x0[i + rader_fft_regs], "%s", sc->locID[i + 1]); - } - for (int64_t i = sc->usedLocRegs - 1; i < rader_mult_regs; i++) { - sprintf(sc->x0[i + rader_fft_regs], "x0_%" PRIu64 "", i + rader_fft_regs); - sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, sc->x0[i + rader_fft_regs]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x=0;\n", sc->x0[i + rader_fft_regs]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y=0;\n", sc->x0[i + rader_fft_regs]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - //sc->tempLen = sprintf(sc->tempStr, " %s temp2;\n", vecType); - //res = VkAppendLine(sc); - //if (res != VKFFT_SUCCESS) return res; - uint64_t useRadix8plus = 0; - for (uint64_t i = 0; i < sc->numStages; i++) - if ((sc->stageRadix[i] == 8) || (sc->stageRadix[i] == 16) || (sc->stageRadix[i] == 32) || 
(sc->useRaderFFT)) useRadix8plus = 1; - if (useRadix8plus == 1) { - if (maxNonPow2Radix > 1) sprintf(sc->iw, "%s", sc->locID[1]); - else { - sc->tempLen = sprintf(sc->tempStr, " %s iw;\n", vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " iw.x=0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " iw.y=0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(sc->iw, "iw"); - } - } - //sc->tempLen = sprintf(sc->tempStr, " %s %s;\n", vecType, sc->tempReg); - sc->tempLen = sprintf(sc->tempStr, " %s %s=0;\n", uintType, sc->stageInvocationID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s %s=0;\n", uintType, sc->blockInvocationID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s %s=0;\n", uintType, sc->sdataID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s %s=0;\n", uintType, sc->combinedID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s %s=0;\n", uintType, sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if ((sc->fftDim < sc->fft_dim_full) || (initType==1) || (initType == 2)) { - sc->tempLen = sprintf(sc->tempStr, " %s disableThreads=1;\n", uintType_32); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - //initialize subgroups ids - if (sc->useRader) { - sc->tempLen = sprintf(sc->tempStr, " %s %s = 0;\n", uintType, sc->raderIDx); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s %s = 0;\n", uintType, sc->raderIDx2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - /*#if((VKFFT_BACKEND==1)||(VKFFT_BACKEND==2)) - sprintf(sc->gl_SubgroupInvocationID, 
"gl_SubgroupInvocationID"); - sprintf(sc->gl_SubgroupID, "gl_SubgroupID"); - if (sc->localSize[1] == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s %s=(threadIdx.x %% %" PRIu64 ");\n", uintType, sc->gl_SubgroupInvocationID, sc->warpSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s %s=(threadIdx.x / %" PRIu64 ");\n", uintType, sc->gl_SubgroupID, sc->warpSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s %s=((threadIdx.x+threadIdx.y*blockDim.x) %% %" PRIu64 ");\n", uintType, sc->gl_SubgroupInvocationID, sc->warpSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s %s=((threadIdx.x+threadIdx.y*blockDim.x) / %" PRIu64 ");\n", uintType, sc->gl_SubgroupID, sc->warpSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - #endif*/ - } - if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, " %s LUTId=0;\n", uintType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s angle=0;\n", floatType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (((sc->stageStartSize > 1) && (!((sc->stageStartSize > 1) && (!sc->reorderFourStep) && (sc->inverse)))) || (((sc->stageStartSize > 1) && (!sc->reorderFourStep) && (sc->inverse))) || (sc->performDCT)) { - sc->tempLen = sprintf(sc->tempStr, " %s mult;\n", vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " mult.x = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " mult.y = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->cacheShuffle) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s tshuffle= ((%s>>1))%%(%" PRIu64 ");\n\ - %s shuffle[%" PRIu64 "];\n", uintType, 
sc->gl_LocalInvocationID_x, sc->registers_per_thread, vecType, sc->registers_per_thread); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 0; i < sc->registers_per_thread; i++) { - /*sc->tempLen = sprintf(sc->tempStr, "\ -shuffle[%" PRIu64 "];\n", i, vecType);*/ - sc->tempLen = sprintf(sc->tempStr, " shuffle[%" PRIu64 "].x = 0;\n", i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " shuffle[%" PRIu64 "].y = 0;\n", i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - return res; -} -static inline VkFFTResult appendZeropadStart(VkFFTSpecializationConstantsLayout* sc) { - //return if sequence is full of zeros from the start - VkFFTResult res = VKFFT_SUCCESS; - if ((sc->frequencyZeropadding)) { - switch (sc->axis_id) { - case 0: { - break; - } - case 1: { - if (!sc->supportAxis) { - char idX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(idX, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x); - else - sprintf(idX, "%s", sc->gl_GlobalInvocationID_x); - if (sc->performZeropaddingFull[0]) { - if (sc->fft_zeropad_left_full[0] < sc->fft_zeropad_right_full[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idX, sc->fft_zeropad_left_full[0], idX, sc->fft_zeropad_right_full[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - - } - break; - } - case 2: { - if (!sc->supportAxis) { - char idY[500] = ""; - if (sc->performWorkGroupShift[1])//y axis is along z workgroup here - sprintf(idY, "(%s + consts.workGroupShiftZ * %s)", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z); - else - sprintf(idY, "%s", sc->gl_GlobalInvocationID_z); - - char idX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(idX, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x); - else - sprintf(idX, "%s", 
sc->gl_GlobalInvocationID_x); - if (sc->performZeropaddingFull[0]) { - if (sc->fft_zeropad_left_full[0] < sc->fft_zeropad_right_full[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idX, sc->fft_zeropad_left_full[0], idX, sc->fft_zeropad_right_full[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->performZeropaddingFull[1]) { - if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) { - sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idY, sc->fft_zeropad_left_full[1], idY, sc->fft_zeropad_right_full[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - else { - char idY[500] = ""; - if (sc->performWorkGroupShift[1])//for support axes y is along x workgroup - sprintf(idY, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x); - else - sprintf(idY, "%s", sc->gl_GlobalInvocationID_x); - if (sc->performZeropaddingFull[1]) { - if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) { - sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idY, sc->fft_zeropad_left_full[1], idY, sc->fft_zeropad_right_full[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - break; - } - } - } - else { - switch (sc->axis_id) { - case 0: { - char idY[500] = ""; - uint64_t mult = (sc->mergeSequencesR2C) ? 
2 : 1; - if (sc->axisSwapped) { - if (mult != 1) { - if (sc->performWorkGroupShift[1]) - sprintf(idY, "((%s + (%s + consts.workGroupShiftY) * %" PRIu64 ")* %" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->gl_WorkGroupID_y, sc->localSize[0], mult); - else - sprintf(idY, "((%s + %s * %" PRIu64 ")*%" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->gl_WorkGroupID_y, sc->localSize[0], mult); - } - else { - if (sc->performWorkGroupShift[1]) - sprintf(idY, "(%s + (%s + consts.workGroupShiftY) * %" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->gl_WorkGroupID_y, sc->localSize[0]); - else - sprintf(idY, "(%s + %s * %" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->gl_WorkGroupID_y, sc->localSize[0]); - } - char idZ[500] = ""; - if (sc->performWorkGroupShift[2]) - sprintf(idZ, "(%s + consts.workGroupShiftZ * %s) %% %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->size[2]); - else - sprintf(idZ, "%s %% %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->size[2]); - if (sc->performZeropaddingFull[1]) { - if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) { - sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idY, sc->fft_zeropad_left_full[1], idY, sc->fft_zeropad_right_full[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->performZeropaddingFull[2]) { - if (sc->fft_zeropad_left_full[2] < sc->fft_zeropad_right_full[2]) { - sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idZ, sc->fft_zeropad_left_full[2], idZ, sc->fft_zeropad_right_full[2]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - else { - if (mult != 1) { - if (sc->performWorkGroupShift[1]) - sprintf(idY, "((%s + consts.workGroupShiftY * %s)* %" PRIu64 ")", sc->gl_GlobalInvocationID_y, sc->gl_WorkGroupSize_y, mult); - else - sprintf(idY, "(%s* %" PRIu64 ")", sc->gl_GlobalInvocationID_y, mult); - } - else { - if (sc->performWorkGroupShift[1]) - 
sprintf(idY, "(%s + consts.workGroupShiftY * %s)", sc->gl_GlobalInvocationID_y, sc->gl_WorkGroupSize_y); - else - sprintf(idY, "%s", sc->gl_GlobalInvocationID_y); - } - char idZ[500] = ""; - if (sc->performWorkGroupShift[2]) - sprintf(idZ, "(%s + consts.workGroupShiftZ * %s) %% %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->size[2]); - else - sprintf(idZ, "%s %% %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->size[2]); - if (sc->performZeropaddingFull[1]) { - if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) { - sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idY, sc->fft_zeropad_left_full[1], idY, sc->fft_zeropad_right_full[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->performZeropaddingFull[2]) { - if (sc->fft_zeropad_left_full[2] < sc->fft_zeropad_right_full[2]) { - sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idZ, sc->fft_zeropad_left_full[2], idZ, sc->fft_zeropad_right_full[2]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - break; - } - case 1: { - char idZ[500] = ""; - if (sc->performWorkGroupShift[2]) - sprintf(idZ, "(%s + consts.workGroupShiftZ * %s) %% %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z, sc->size[2]); - else - sprintf(idZ, "%s %% %" PRIu64 "", sc->gl_GlobalInvocationID_z, sc->size[2]); - if (sc->performZeropaddingFull[2]) { - if (sc->fft_zeropad_left_full[2] < sc->fft_zeropad_right_full[2]) { - sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idZ, sc->fft_zeropad_left_full[2], idZ, sc->fft_zeropad_right_full[2]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - - break; - } - case 2: { - - break; - } - } - } - return res; -} -static inline VkFFTResult appendZeropadEnd(VkFFTSpecializationConstantsLayout* sc) { - //return if sequence is full of zeros from the 
start - VkFFTResult res = VKFFT_SUCCESS; - if ((sc->frequencyZeropadding)) { - switch (sc->axis_id) { - case 0: { - break; - } - case 1: { - if (!sc->supportAxis) { - char idX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(idX, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x); - else - sprintf(idX, "%s", sc->gl_GlobalInvocationID_x); - if (sc->performZeropaddingFull[0]) { - if (sc->fft_zeropad_left_full[0] < sc->fft_zeropad_right_full[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - - } - break; - } - case 2: { - if (!sc->supportAxis) { - char idY[500] = ""; - if (sc->performWorkGroupShift[1])//y axis is along z workgroup here - sprintf(idY, "(%s + consts.workGroupShiftZ * %s)", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z); - else - sprintf(idY, "%s", sc->gl_GlobalInvocationID_z); - - char idX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(idX, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x); - else - sprintf(idX, "%s", sc->gl_GlobalInvocationID_x); - if (sc->performZeropaddingFull[0]) { - if (sc->fft_zeropad_left_full[0] < sc->fft_zeropad_right_full[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->performZeropaddingFull[1]) { - if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - else { - char idY[500] = ""; - if (sc->performWorkGroupShift[1])//for support axes y is along x workgroup - sprintf(idY, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x); - else - sprintf(idY, "%s", sc->gl_GlobalInvocationID_x); - if (sc->performZeropaddingFull[1]) { - if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) { 
- sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - break; - } - } - } - else { - switch (sc->axis_id) { - case 0: { - //char idY[500] = ""; - if (sc->performZeropaddingFull[1]) { - if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->performZeropaddingFull[2]) { - if (sc->fft_zeropad_left_full[2] < sc->fft_zeropad_right_full[2]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - break; - } - case 1: { - char idZ[500] = ""; - if (sc->performWorkGroupShift[2]) - sprintf(idZ, "(%s + consts.workGroupShiftZ * %s)", sc->gl_GlobalInvocationID_z, sc->gl_WorkGroupSize_z); - else - sprintf(idZ, "%s", sc->gl_GlobalInvocationID_z); - if (sc->performZeropaddingFull[2]) { - if (sc->fft_zeropad_left_full[2] < sc->fft_zeropad_right_full[2]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - break; - } - case 2: { - - break; - } - } - } - return res; -} - -static inline VkFFTResult appendZeropadStartReadWriteStage(VkFFTSpecializationConstantsLayout* sc, uint64_t readStage) { - //return if sequence is full of zeros from the start - VkFFTResult res = VKFFT_SUCCESS; - if ((sc->frequencyZeropadding)) { - switch (sc->axis_id) { - case 0: { - break; - } - case 1: { - if (!sc->supportAxis) { - char idX[500] = ""; - if (readStage) { - sprintf(idX, "(%s %% %" PRIu64 ")", sc->inoutID, sc->inputStride[1]); - } - else { - sprintf(idX, "(%s %% %" PRIu64 ")", sc->inoutID, sc->outputStride[1]); - } - if (sc->performZeropaddingFull[0]) { - if (sc->fft_zeropad_left_full[0] < sc->fft_zeropad_right_full[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idX, sc->fft_zeropad_left_full[0], idX, 
sc->fft_zeropad_right_full[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - - } - break; - } - case 2: { - if (!sc->supportAxis) { - char idY[500] = ""; - char idX[500] = ""; - if (readStage) { - sprintf(idY, "(%s/%" PRIu64 ") %% %" PRIu64 "", sc->inoutID, sc->inputStride[1], sc->inputStride[2] / sc->inputStride[1]); - sprintf(idX, "(%s %% %" PRIu64 ")", sc->inoutID, sc->inputStride[1]); - } - else { - sprintf(idY, "(%s/%" PRIu64 ") %% %" PRIu64 "", sc->inoutID, sc->outputStride[1], sc->outputStride[2] / sc->outputStride[1]); - sprintf(idX, "(%s %% %" PRIu64 ")", sc->inoutID, sc->outputStride[1]); - - } - if (sc->performZeropaddingFull[0]) { - if (sc->fft_zeropad_left_full[0] < sc->fft_zeropad_right_full[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idX, sc->fft_zeropad_left_full[0], idX, sc->fft_zeropad_right_full[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->performZeropaddingFull[1]) { - if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) { - sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idY, sc->fft_zeropad_left_full[1], idY, sc->fft_zeropad_right_full[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - else { - char idY[500] = ""; - if (readStage) { - sprintf(idY, "(%s/%" PRIu64 ") %% %" PRIu64 "", sc->inoutID, sc->inputStride[1], sc->inputStride[2] / sc->inputStride[1]); - } - else { - sprintf(idY, "(%s/%" PRIu64 ") %% %" PRIu64 "", sc->inoutID, sc->outputStride[1], sc->outputStride[2] / sc->outputStride[1]); - } - if (sc->performZeropaddingFull[1]) { - if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) { - sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idY, sc->fft_zeropad_left_full[1], idY, sc->fft_zeropad_right_full[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; 
- } - } - } - break; - } - } - } - else { - switch (sc->axis_id) { - case 0: { - char idY[500] = ""; - char idZ[500] = ""; - //uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; - if (readStage) { - sprintf(idY, "(%s/%" PRIu64 ") %% %" PRIu64 "", sc->inoutID, sc->inputStride[1], sc->inputStride[2] / sc->inputStride[1]); - sprintf(idZ, "(%s/%" PRIu64 ") %% %" PRIu64 "", sc->inoutID, sc->inputStride[2], sc->inputStride[3] / sc->inputStride[2]); - } - else { - sprintf(idY, "(%s/%" PRIu64 ") %% %" PRIu64 "", sc->inoutID, sc->outputStride[1], sc->outputStride[2] / sc->outputStride[1]); - sprintf(idZ, "(%s/%" PRIu64 ") %% %" PRIu64 "", sc->inoutID, sc->outputStride[2], sc->outputStride[3] / sc->outputStride[2]); - - } - if (sc->performZeropaddingFull[1]) { - if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) { - sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idY, sc->fft_zeropad_left_full[1], idY, sc->fft_zeropad_right_full[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->performZeropaddingFull[2]) { - if (sc->fft_zeropad_left_full[2] < sc->fft_zeropad_right_full[2]) { - sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idZ, sc->fft_zeropad_left_full[2], idZ, sc->fft_zeropad_right_full[2]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - break; - } - case 1: { - char idZ[500] = ""; - if (readStage) { - sprintf(idZ, "(%s/%" PRIu64 ") %% %" PRIu64 "", sc->inoutID, sc->inputStride[2], sc->inputStride[3] / sc->inputStride[2]); - } - else { - sprintf(idZ, "(%s/%" PRIu64 ") %% %" PRIu64 "", sc->inoutID, sc->outputStride[2], sc->outputStride[3] / sc->outputStride[2]); - } - if (sc->performZeropaddingFull[2]) { - if (sc->fft_zeropad_left_full[2] < sc->fft_zeropad_right_full[2]) { - sc->tempLen = sprintf(sc->tempStr, " if(!((%s >= %" PRIu64 ")&&(%s < %" PRIu64 "))) {\n", idZ, sc->fft_zeropad_left_full[2], idZ, 
sc->fft_zeropad_right_full[2]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - - break; - } - case 2: { - - break; - } - } - } - return res; -} -static inline VkFFTResult appendZeropadEndReadWriteStage(VkFFTSpecializationConstantsLayout* sc) { - //return if sequence is full of zeros from the start - VkFFTResult res = VKFFT_SUCCESS; - if ((sc->frequencyZeropadding)) { - switch (sc->axis_id) { - case 0: { - break; - } - case 1: { - char idX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(idX, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x); - else - sprintf(idX, "%s", sc->gl_GlobalInvocationID_x); - if (sc->performZeropaddingFull[0]) { - if (sc->fft_zeropad_left_full[0] < sc->fft_zeropad_right_full[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - break; - } - case 2: { - if (sc->performZeropaddingFull[0]) { - if (sc->fft_zeropad_left_full[0] < sc->fft_zeropad_right_full[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->performZeropaddingFull[1]) { - if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - break; - } - } - } - else { - switch (sc->axis_id) { - case 0: { - if (sc->performZeropaddingFull[1]) { - if (sc->fft_zeropad_left_full[1] < sc->fft_zeropad_right_full[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->performZeropaddingFull[2]) { - if (sc->fft_zeropad_left_full[2] < sc->fft_zeropad_right_full[2]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - break; - } - case 1: { - if (sc->performZeropaddingFull[2]) { - if 
(sc->fft_zeropad_left_full[2] < sc->fft_zeropad_right_full[2]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - break; - } - case 2: { - - break; - } - } - } - return res; -} -static inline VkFFTResult appendSetSMToZero(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeMemory, const char* uintType, uint64_t readType) { - VkFFTResult res = VKFFT_SUCCESS; - uint64_t used_registers_read = 1; - switch (readType) { - case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: - used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - break; - case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: - used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); - break; - } - if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost; - //appendZeropadStart(sc); - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_read; i++) { - switch (readType) { - case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: - { - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->axisSwapped) { - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - 
sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - break; - } - case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145://single_c2c - { - if (sc->localSize[1] * (i + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, sc->fftDim - sc->localSize[1] * i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return 
res; - } - sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].x=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].y=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->localSize[1] * (i + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - break; - } - } - } - } - - - //res = appendZeropadEnd(sc); - //if (res != VKFFT_SUCCESS) return res; - return res; -} -static inline VkFFTResult setReadToRegisters(VkFFTSpecializationConstantsLayout* sc, uint64_t readType) { - VkFFTResult res = VKFFT_SUCCESS; - switch (readType) { - case 0: //single_c2c - { - if ((sc->localSize[1] > 1) || ((sc->performR2C) && (sc->actualInverse)) || (sc->localSize[0] * sc->stageRadix[0] * (sc->registers_per_thread_per_radix[sc->stageRadix[0]] / sc->stageRadix[0]) > sc->fftDim) || (sc->rader_generator[0] > 0)) - sc->readToRegisters = 0; - else - sc->readToRegisters = 1; - break; - } - case 1: //grouped_c2c - { - if ((sc->localSize[1] * sc->stageRadix[0] * (sc->registers_per_thread_per_radix[sc->stageRadix[0]] / sc->stageRadix[0]) > sc->fftDim) || (sc->rader_generator[0] > 0)) - sc->readToRegisters = 0; - else - sc->readToRegisters = 1; - break; - } - case 2: //single_c2c_strided - { - if ((sc->localSize[1] * sc->stageRadix[0] * (sc->registers_per_thread_per_radix[sc->stageRadix[0]] / sc->stageRadix[0]) > sc->fftDim) || (sc->rader_generator[0] > 0)) - sc->readToRegisters = 0; - else - sc->readToRegisters = 1; - break; - } - case 5://single_r2c - { - if ((sc->axisSwapped) || (sc->localSize[1] > 1) || (sc->localSize[0] * 
sc->stageRadix[0] * (sc->registers_per_thread_per_radix[sc->stageRadix[0]] / sc->stageRadix[0]) > sc->fftDim) || (sc->rader_generator[0] > 0)) - sc->readToRegisters = 0; - else - sc->readToRegisters = 1; - break; - } - case 6: //single_c2r - { - if ((sc->rader_generator[0] > 0) || ((sc->fftDim % sc->localSize[0]) && (!sc->axisSwapped)) || ((sc->fftDim % sc->localSize[1]) && (sc->axisSwapped))) - sc->readToRegisters = 0; - else - sc->readToRegisters = 1; - break; - } - case 110: case 111: case 120: case 121: case 130: case 131: case 140: case 141: case 142: case 143: - { - sc->readToRegisters = 0; - break; - } - case 144: - { - uint64_t registers_first_stage = (sc->stageRadix[0] < sc->fixMinRaderPrimeMult) ? sc->registers_per_thread_per_radix[sc->stageRadix[0]] : 1; - if ((sc->rader_generator[0] > 0) || (sc->fftDim % registers_first_stage)) - sc->readToRegisters = 0; - else - sc->readToRegisters = 1; - break; - } - case 145: - { - uint64_t registers_first_stage = (sc->stageRadix[0] < sc->fixMinRaderPrimeMult) ? 
sc->registers_per_thread_per_radix[sc->stageRadix[0]] : 1; - if ((sc->rader_generator[0] > 0) || (sc->fftDim % registers_first_stage)) - sc->readToRegisters = 0; - else - sc->readToRegisters = 1; - break; - } - } - return res; -} -static inline VkFFTResult appendReadDataVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeMemory, const char* uintType, uint64_t readType) { - VkFFTResult res = VKFFT_SUCCESS; - double double_PI = 3.1415926535897932384626433832795; - char vecType[30]; - char inputsStruct[20] = ""; - char LFending[4] = ""; - char uintType_32[30]; - if (!strcmp(floatType, "float")) sprintf(LFending, "f"); -#if(VKFFT_BACKEND==0) - if (sc->inputBufferBlockNum == 1) - sprintf(inputsStruct, "inputs"); - else - sprintf(inputsStruct, ".inputs"); - if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); - if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); - sprintf(uintType_32, "uint"); - char cosDef[20] = "cos"; - char sinDef[20] = "sin"; -#elif(VKFFT_BACKEND==1) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - if (!strcmp(floatType, "double")) sprintf(LFending, "l"); - sprintf(uintType_32, "unsigned int"); - sprintf(inputsStruct, "inputs"); - char cosDef[20] = "__cosf"; - char sinDef[20] = "__sinf"; -#elif(VKFFT_BACKEND==2) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - if (!strcmp(floatType, "double")) sprintf(LFending, "l"); - sprintf(uintType_32, "unsigned int"); - sprintf(inputsStruct, "inputs"); - char cosDef[20] = "__cosf"; - char sinDef[20] = "__sinf"; -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - sprintf(inputsStruct, "inputs"); - sprintf(uintType_32, 
"unsigned int"); - char cosDef[20] = "native_cos"; - char sinDef[20] = "native_sin"; -#endif - char convTypeLeft[20] = ""; - char convTypeRight[20] = ""; - if ((!strcmp(floatType, "float")) && (strcmp(floatTypeMemory, "float"))) { - if ((readType == 5) || (readType == 110) || (readType == 111) || (readType == 120) || (readType == 121) || (readType == 130) || (readType == 131) || (readType == 140) || (readType == 141) || (readType == 142) || (readType == 143) || (readType == 144) || (readType == 145)) { -#if(VKFFT_BACKEND==0) - sprintf(convTypeLeft, "float("); - sprintf(convTypeRight, ")"); -#elif(VKFFT_BACKEND==1) - sprintf(convTypeLeft, "(float)"); - //sprintf(convTypeRight, ""); -#elif(VKFFT_BACKEND==2) - sprintf(convTypeLeft, "(float)"); - //sprintf(convTypeRight, ""); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sprintf(convTypeLeft, "(float)"); - //sprintf(convTypeRight, ""); -#endif - } - else { -#if(VKFFT_BACKEND==0) - sprintf(convTypeLeft, "vec2("); - sprintf(convTypeRight, ")"); -#elif(VKFFT_BACKEND==1) - sprintf(convTypeLeft, "conv_float2("); - sprintf(convTypeRight, ")"); -#elif(VKFFT_BACKEND==2) - sprintf(convTypeLeft, "conv_float2("); - sprintf(convTypeRight, ")"); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sprintf(convTypeLeft, "conv_float2("); - sprintf(convTypeRight, ")"); -#endif - } - } - if ((!strcmp(floatType, "double")) && (strcmp(floatTypeMemory, "double"))) { - if ((readType == 5) || (readType == 110) || (readType == 111) || (readType == 120) || (readType == 121) || (readType == 130) || (readType == 131) || (readType == 140) || (readType == 141) || (readType == 142) || (readType == 143) || (readType == 144) || (readType == 145)) { -#if(VKFFT_BACKEND==0) - sprintf(convTypeLeft, "double("); - sprintf(convTypeRight, ")"); -#elif(VKFFT_BACKEND==1) - sprintf(convTypeLeft, "(double)"); - //sprintf(convTypeRight, ""); -#elif(VKFFT_BACKEND==2) - sprintf(convTypeLeft, "(double)"); - //sprintf(convTypeRight, ""); 
-#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sprintf(convTypeLeft, "(double)"); - //sprintf(convTypeRight, ""); -#endif - } - else { -#if(VKFFT_BACKEND==0) - sprintf(convTypeLeft, "dvec2("); - sprintf(convTypeRight, ")"); -#elif(VKFFT_BACKEND==1) - sprintf(convTypeLeft, "conv_double2("); - sprintf(convTypeRight, ")"); -#elif(VKFFT_BACKEND==2) - sprintf(convTypeLeft, "conv_double2("); - sprintf(convTypeRight, ")"); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sprintf(convTypeLeft, "conv_double2("); - sprintf(convTypeRight, ")"); -#endif - } - } - char index_x[2000] = ""; - char index_y[2000] = ""; - char requestCoordinate[100] = ""; - if (sc->convolutionStep) { - if (sc->matrixConvolution > 1) { - sprintf(requestCoordinate, "coordinate"); - } - } - char requestBatch[100] = ""; - if (sc->convolutionStep) { - if (sc->numKernels > 1) { - sprintf(requestBatch, "0");//if one buffer - multiple kernel convolution - } - } - //appendZeropadStart(sc); - switch (readType) { - case 0://single_c2c - { - //sc->tempLen = sprintf(sc->tempStr, " return;\n"); - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX "); - char shiftY[500] = ""; - if (sc->axisSwapped) { - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_x); - } - else { - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); - } - char shiftY2[100] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY "); - uint64_t used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost; - if (sc->fftDim < sc->fft_dim_full) { - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " %s numActiveThreads = ((%s/%" PRIu64 ")==%" PRIu64 ") ? 
%" PRIu64 " : %" PRIu64 ";\n", uintType, sc->gl_WorkGroupID_x, sc->firstStageStartSize / sc->fftDim, ((uint64_t)floor(sc->fft_dim_full / ((double)sc->localSize[0] * sc->fftDim))) / (sc->firstStageStartSize / sc->fftDim), (uint64_t)ceil(((sc->fft_dim_full - (sc->firstStageStartSize / sc->fftDim) * ((((uint64_t)floor(sc->fft_dim_full / ((double)sc->localSize[0] * sc->fftDim))) / (sc->firstStageStartSize / sc->fftDim)) * sc->localSize[0] * sc->fftDim)) / (sc->firstStageStartSize / sc->fftDim)) / (double)used_registers_read), sc->localSize[0] * sc->localSize[1]);// sc->fft_dim_full, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize, sc->fft_dim_full / (sc->localSize[0] * sc->fftDim)); - //sc->tempLen = sprintf(sc->tempStr, " if (numActiveThreads>%" PRIu64 ") numActiveThreads = %" PRIu64 ";\n", sc->localSize[0]* sc->localSize[1], sc->localSize[0]* sc->localSize[1]); - //sprintf(sc->disableThreadsStart, " if((%s+%" PRIu64 "*%s)< numActiveThreads) {\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " disableThreads = (%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ") < %" PRIu64 ") ? 
1 : 0;\n", sc->gl_LocalInvocationID_x, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize, sc->fft_dim_full); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(sc->disableThreadsStart, " if(disableThreads>0) {\n"); - sc->tempLen = sprintf(sc->tempStr, " if((%s+%" PRIu64 "*%s)< numActiveThreads) {\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(sc->disableThreadsEnd, "}"); - } - else { - sc->tempLen = sprintf(sc->tempStr, " disableThreads = (%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ") < %" PRIu64 ") ? 1 : 0;\n", sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize, sc->fft_dim_full); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(sc->disableThreadsStart, " if(disableThreads>0) {\n"); - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - sprintf(sc->disableThreadsEnd, "}"); - } - } - else { - sc->tempLen = sprintf(sc->tempStr, " { \n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->fftDim == sc->fft_dim_full) { - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_read; i++) { - - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, 
sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->inputStride[0] > 1) - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->inputStride[0], sc->fftDim, sc->inputStride[1]); - else - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, sc->inputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) { -#if (VKFFT_BACKEND!=2) //AMD compiler fix - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){\n", sc->fftDim, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[0], sc->size[sc->axis_id + 1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#else - sc->tempLen = sprintf(sc->tempStr, " if(!(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 ")) %s = 0; {\n", sc->fftDim, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[0], sc->size[sc->axis_id + 1], sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#endif - } - - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) { -#if (VKFFT_BACKEND!=2) //AMD compiler fix - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){\n", sc->fftDim, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[1], sc->size[sc->axis_id + 1]); - res = VkAppendLine(sc); - if (res != 
VKFFT_SUCCESS) return res; -#else - sc->tempLen = sprintf(sc->tempStr, " if(!(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 ")) %s = 0; {\n", sc->fftDim, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[1], sc->size[sc->axis_id + 1], sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#endif - } - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 1); - if (res != VKFFT_SUCCESS) return res; - if (sc->readToRegisters) { - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s = %s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " %s = %sinputBlocks[%s / %" 
PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - } - else { - if (sc->axisSwapped) { - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")] = %s%s[%s]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")] = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - } - else { - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride] = %s%s[%s]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride] = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - } - } - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->readToRegisters) { - sc->tempLen = sprintf(sc->tempStr, " %s.x =0;%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if 
(sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->readToRegisters) { - sc->tempLen = sprintf(sc->tempStr, " %s.x =0;%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") 
+ (combinedID / %" PRIu64 ") * sharedStride].x = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->axisSwapped) { - if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - - } - } - } - else { - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_read; i++) { - /* - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]); - - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / 
%" PRIu64 ") * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");\n", sc->fftDim, sc->fftDim, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize); - */ - if (sc->axisSwapped) { - if ((sc->fft_dim_full - (sc->firstStageStartSize / sc->fftDim) * ((((uint64_t)floor(sc->fft_dim_full / ((double)sc->localSize[0] * sc->fftDim))) / (sc->firstStageStartSize / sc->fftDim)) * sc->localSize[0] * sc->fftDim)) / used_registers_read / (sc->firstStageStartSize / sc->fftDim) > sc->localSize[0]) { - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 "*numActiveThreads;\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read)); - } - else { - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 "*numActiveThreads;\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read)); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 "*numActiveThreads;\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read)); - } - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");\n", sc->fftDim, sc->fftDim, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, 
sc->localSize[0] * sc->firstStageStartSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " inoutID = %s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 1); - if (res != VKFFT_SUCCESS) return res; - if (sc->readToRegisters) { - //not used - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s = %s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " %s = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, 
convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->axisSwapped) { - if (sc->fftDim % sc->localSize[1]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")] = %s%s[inoutID]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")] = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->fftDim % sc->localSize[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->fftDim - (i + k * used_registers_read) * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[sharedStride*%s + (%s + %" PRIu64 ")] = %s%s[inoutID]%s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0], convTypeLeft, inputsStruct, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[sharedStride*%s + (%s + %" PRIu64 ")] = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * 
used_registers_read) * sc->localSize[0], convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->readToRegisters) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdata[sharedStride*%s + (%s + %" PRIu64 ")].x = 0;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sharedStride*%s + (%s + %" PRIu64 ")].y = 0;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res 
= VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - break; - } - case 1://grouped_c2c - { - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); - sc->tempLen = sprintf(sc->tempStr, " disableThreads = (((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ") < %" PRIu64 ") ? 1 : 0;\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim * sc->stageStartSize, sc->size[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(sc->disableThreadsStart, " if(disableThreads>0) {\n"); - - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - sprintf(sc->disableThreadsEnd, "}"); - uint64_t used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); - if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost; - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_read; i++) { - sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 "));\n", sc->stageStartSize, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim * sc->stageStartSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]); - res = VkAppendLine(sc); 
- if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x); - res = indexInputVkFFT(sc, uintType, readType, index_x, sc->inoutID, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 1); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + k * used_registers_read) * sc->localSize[1] >= (sc->fftDim)) { - sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim - (i + k * used_registers_read) * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->readToRegisters) { - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s=%s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " %s=%sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s]=%s%s[%s]%s;\n", sc->sharedStride, 
sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s]=%sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_read) * sc->localSize[1] >= (sc->fftDim)) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->readToRegisters) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].x=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].y=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - 
sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->readToRegisters) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].x=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].y=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - break; - } - case 2://single_c2c_strided - { - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); - - //sc->tempLen = sprintf(sc->tempStr, " if(gl_GlobalInvolcationID.x%s >= %" PRIu64 ") return; \n", shiftX, sc->size[0] / axis->specializationConstants.fftDim); - sc->tempLen = sprintf(sc->tempStr, " disableThreads = (((%s%s) / %" PRIu64 ") * (%" PRIu64 ") < %" PRIu64 ") ? 
1 : 0;\n", sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize * sc->fftDim, sc->fft_dim_full); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(sc->disableThreadsStart, " if(disableThreads>0) {\n"); - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - sprintf(sc->disableThreadsEnd, "}"); - uint64_t used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); - if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost; - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_read; i++) { - sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s%s) %% (%" PRIu64 ") + %" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") * (%" PRIu64 ");\n", sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - 
sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 1); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + k * used_registers_read) * sc->localSize[1] >= (sc->fftDim)) { - sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim - (i + k * used_registers_read) * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->readToRegisters) { - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s=%s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " %s=%sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s]=%s%s[%s]%s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s]=%sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_read) * sc->localSize[1] >= (sc->fftDim)) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res 
!= VKFFT_SUCCESS) return res; - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->readToRegisters) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].x=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].y=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->readToRegisters) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].x=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].y=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * 
used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - break; - } - case 5://single_r2c - { - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX "); - char shiftY[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY "); - uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; - if (sc->fftDim == sc->fft_dim_full) { - uint64_t used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost; - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_read; i++) { - - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->inputStride[0] > 1) - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->inputStride[0], sc->fftDim, mult * sc->inputStride[1]); - else - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, mult * sc->inputStride[1]); - res = 
VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - 
sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 1); - if (res != VKFFT_SUCCESS) return res; - if (sc->readToRegisters) { - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->mergeSequencesR2C) { - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s%s[(%s + %" PRIu64 ")]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, sc->inputStride[1], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " %s.y = %sinputBlocks[(%s + %" PRIu64 ")/ %" PRIu64 "]%s[(%s + %" PRIu64 ") %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputStride[1], sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputStride[1], sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); - else - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = 
VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (sc->axisSwapped) { - - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = %s%s[%s]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->mergeSequencesR2C) { - sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].y = %s%s[inoutID]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride+ (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return 
res; - } - } - else { - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = %s%s[inoutID]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->mergeSequencesR2C) { - sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = %s%s[inoutID]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; 
- if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->readToRegisters) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - } - - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->readToRegisters) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" 
PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - } - - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->axisSwapped) { - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - } - else { - //Not implemented - } - break; - } - case 6: {//single_c2r - //sc->tempLen = sprintf(sc->tempStr, " return;\n"); - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - 
sprintf(shiftX, " + consts.workGroupShiftX "); - char shiftY[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); - char shiftY2[100] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY "); - if (sc->fftDim < sc->fft_dim_full) { - //not implemented - if (sc->axisSwapped) - sc->tempLen = sprintf(sc->tempStr, " disableThreads = (%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ") < %" PRIu64 ") ? 1 : 0;\n", sc->gl_LocalInvocationID_x, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize, sc->fft_dim_full); - else - sc->tempLen = sprintf(sc->tempStr, " disableThreads = (%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ") < %" PRIu64 ") ? 1 : 0;\n", sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize, sc->fft_dim_full); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(sc->disableThreadsStart, " if(disableThreads>0) {\n"); - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - sprintf(sc->disableThreadsEnd, "}"); - } - else { - sc->tempLen = sprintf(sc->tempStr, " { \n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - uint64_t used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost; - uint64_t mult = (sc->mergeSequencesR2C) ? 
2 : 1; - if (sc->fftDim == sc->fft_dim_full) { - if (sc->zeropadBluestein[0]) { - sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id]; - used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - } - for (uint64_t k = 0; k < sc->registerBoost; k++) { - uint64_t num_in = (sc->axisSwapped) ? (uint64_t)ceil(mult * (sc->fftDim / 2 + 1) / (double)sc->localSize[1]) : (uint64_t)ceil(mult * (sc->fftDim / 2 + 1) / (double)sc->localSize[0]); - //num_in =(uint64_t)ceil(num_in / (double)sc->min_registers_per_thread); - for (uint64_t i = 0; i < num_in; i++) { - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_in) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->inputStride[0] > 1) - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim / 2 + 1, sc->inputStride[0], sc->fftDim / 2 + 1, sc->inputStride[1]); - else - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, sc->inputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) { -#if (VKFFT_BACKEND!=2) //AMD compiler fix - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){\n", sc->fftDim / 2 + 1, sc->gl_WorkGroupID_y, shiftY2, mult * sc->localSize[0], sc->size[sc->axis_id + 1]); - res = 
VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#else - sc->tempLen = sprintf(sc->tempStr, " if(!(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 ")) %s = 0; {\n", sc->fftDim / 2 + 1, sc->gl_WorkGroupID_y, shiftY2, mult * sc->localSize[0], sc->size[sc->axis_id + 1], sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#endif - } - if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > mult * (sc->fftDim / 2 + 1) * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", mult * (sc->fftDim / 2 + 1) * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) { -#if (VKFFT_BACKEND!=2) //AMD compiler fix - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){\n", sc->fftDim / 2 + 1, sc->gl_WorkGroupID_y, shiftY2, mult * sc->localSize[1], sc->size[sc->axis_id + 1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#else - sc->tempLen = sprintf(sc->tempStr, " if(!(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 ")) %s = 0; {\n", sc->fftDim / 2 + 1, sc->gl_WorkGroupID_y, shiftY2, mult * sc->localSize[1], sc->size[sc->axis_id + 1], sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#endif - } - if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > mult * (sc->fftDim / 2 + 1) * sc->localSize[1]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", mult * (sc->fftDim / 2 + 1) * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]); - res = 
VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 1); - if (res != VKFFT_SUCCESS) return res; - if (0) { - //not enabled - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s = %s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " %s = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (!sc->axisSwapped) { - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride] = %s%s[%s]%s;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1), convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride] = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1), convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")] = 
%s%s[%s]%s;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1), convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")] = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1), convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (0) { - //not enabled - sc->tempLen = sprintf(sc->tempStr, " %s.x =0;%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (!sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x = 0;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = 0;\n", mult * (sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", mult * 
(sc->fftDim / 2 + 1), mult * (sc->fftDim / 2 + 1)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->axisSwapped) { - if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > mult * (sc->fftDim / 2 + 1) * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > mult * (sc->fftDim / 2 + 1) * sc->localSize[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->axisSwapped) { - if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - - } - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 0; i < used_registers_read; i++) { - if (sc->mergeSequencesR2C) { - if (sc->axisSwapped) { - if (i < ((sc->fftDim / 2 + 1) / sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s + (%s+%" PRIu64 ") * sharedStride].x - sdata[%s + (%s+%" PRIu64 ") * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[%s + (%s+%" PRIu64 ") * sharedStride].y + sdata[%s + (%s+%" PRIu64 ") * 
sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (i >= (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) { - if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim - (i + k * used_registers_read) * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1))) > (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]))) * sc->localSize[1]) && ((uint64_t)ceil(sc->fftDim / 2.0) - 1 > (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)))) { - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(%" PRIu64 " > %s){\n", ((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1))) - (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]))) * sc->localSize[1], sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s + (%" PRIu64 "-%s) * sharedStride].x + sdata[%s + (%" PRIu64 "-%s) * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim 
/ 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -sdata[%s + (%" PRIu64 "-%s) * sharedStride].y + sdata[%s + (%" PRIu64 "-%s) * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != 
VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (sc->localSize[1] > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){;\n", sc->gl_LocalInvocationID_y, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){;\n", sc->gl_LocalInvocationID_y, (sc->fftDim / 2 + 1) % sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s + (%s+%" PRIu64 ") * sharedStride].x - sdata[%s + (%s+%" PRIu64 ") * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[%s + (%s+%" PRIu64 ") * sharedStride].y + sdata[%s + (%s+%" PRIu64 ") * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s + (%" PRIu64 "-%s) * sharedStride].x + sdata[%s + (%" PRIu64 "-%s) * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - 
((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -sdata[%s + (%" PRIu64 "-%s) * sharedStride].y + sdata[%s + (%" PRIu64 "-%s) * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->localSize[1] > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " }else{;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - 
res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - else { - if (i < ((sc->fftDim / 2 + 1) / sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s * sharedStride + (%s+%" PRIu64 ")].x - sdata[%s * sharedStride + (%s+%" PRIu64 ")].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[%s * sharedStride + (%s+%" PRIu64 ")].y + sdata[%s * sharedStride + (%s+%" PRIu64 ")].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (i >= (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) { - if ((1 + i + k * used_registers_read) * sc->localSize[0] > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->fftDim - (i + k * used_registers_read) * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1))) > (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0]))) * sc->localSize[0]) && ((uint64_t)ceil(sc->fftDim / 2.0) - 1 > (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)))) { - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(%" PRIu64 " > %s){\n", ((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1))) - (i 
- ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0]))) * sc->localSize[0], sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s * sharedStride + (%" PRIu64 "-%s)].x + sdata[%s * sharedStride + (%" PRIu64 "-%s)].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -sdata[%s * sharedStride + (%" PRIu64 "-%s)].y + sdata[%s * sharedStride + (%" PRIu64 "-%s)].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = 
sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_read) * sc->localSize[0] > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (sc->localSize[0] > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){;\n", sc->gl_LocalInvocationID_x, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){;\n", sc->gl_LocalInvocationID_x, (sc->fftDim / 2 + 1) % sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s * sharedStride + (%s+%" PRIu64 ")].x - sdata[%s * sharedStride + (%s+%" PRIu64 ")].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[%s * sharedStride + (%s+%" PRIu64 ")].y + sdata[%s * sharedStride + 
(%s+%" PRIu64 ")].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0] + (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s * sharedStride + (%" PRIu64 "-%s)].x + sdata[%s * sharedStride + (%" PRIu64 "-%s)].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -sdata[%s * sharedStride + (%" PRIu64 "-%s)].y + sdata[%s * sharedStride + (%" PRIu64 "-%s)].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (int64_t)ceil(sc->fftDim / 2.0) + (1 - sc->fftDim % 2) + (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], 
sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->localSize[0] > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " }else{;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - } - else { - if (sc->axisSwapped) { - if (i < ((sc->fftDim / 2 + 1) / sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s + (%s+%" PRIu64 ") * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[%s + (%s+%" PRIu64 ") * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (i >= (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) { - if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim - (i + k * used_registers_read) * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1))) > 
(i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]))) * sc->localSize[1]) && ((uint64_t)ceil(sc->fftDim / 2.0) - 1 > (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)))) { - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(%" PRIu64 " > %s){\n", ((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1))) - (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]))) * sc->localSize[1], sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s + (%" PRIu64 "-%s) * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -sdata[%s + (%" PRIu64 "-%s) * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[1] - ((sc->fftDim / 2) % sc->localSize[1] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1])) * sc->localSize[1], sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - 
sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (sc->localSize[1] > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){;\n", sc->gl_LocalInvocationID_y, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){;\n", sc->gl_LocalInvocationID_y, (sc->fftDim / 2 + 1) % sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s + (%s+%" PRIu64 ") * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[%s + (%s+%" PRIu64 ") * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s + (%" PRIu64 "-%s) * sharedStride].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 + (sc->fftDim 
/ 2 + 1) % sc->localSize[1], sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -sdata[%s + (%" PRIu64 "-%s) * sharedStride].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, (uint64_t)ceil(sc->fftDim / 2.0) - 1 + (sc->fftDim / 2 + 1) % sc->localSize[1], sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->localSize[1] > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " }else{;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - else { - if (i < ((sc->fftDim / 2 + 1) / sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s * sharedStride + (%s+%" PRIu64 ")].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[%s * sharedStride + (%s+%" PRIu64 ")].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (i >= (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) { - if ((1 + i + k * used_registers_read) * sc->localSize[0] > 
sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->fftDim - (i + k * used_registers_read) * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1))) > (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0]))) * sc->localSize[0]) && ((uint64_t)ceil(sc->fftDim / 2.0) - 1 > (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)))) { - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(%" PRIu64 " > %s){\n", ((uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1))) - (i - ((int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0]))) * sc->localSize[0], sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s * sharedStride + (%" PRIu64 "-%s)].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -sdata[%s * sharedStride + (%" PRIu64 "-%s)].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 - (sc->localSize[0] - ((sc->fftDim / 2) % sc->localSize[0] + 1)) - (i - (int64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0])) * sc->localSize[0], sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != 
VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_read) * sc->localSize[0] > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (sc->localSize[0] > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){;\n", sc->gl_LocalInvocationID_x, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){;\n", sc->gl_LocalInvocationID_x, (sc->fftDim / 2 + 1) % sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s * sharedStride + (%s+%" PRIu64 ")].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[%s * sharedStride + (%s+%" PRIu64 ")].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != 
VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[%s * sharedStride + (%" PRIu64 "-%s)].x;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 + (sc->fftDim / 2 + 1) % sc->localSize[0], sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = -sdata[%s * sharedStride + (%" PRIu64 "-%s)].y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_y, (uint64_t)ceil(sc->fftDim / 2.0) - 1 + (sc->fftDim / 2 + 1) % sc->localSize[0], sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->localSize[0] > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " }else{;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - - } - } - } - //sc->readToRegisters = 1; - if (sc->zeropadBluestein[0]) { - sc->fftDim = sc->fft_dim_full; - used_registers_read = (sc->axisSwapped) ? 
(uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - } - if (!sc->readToRegisters) { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_read; i++) { - if (sc->axisSwapped) { - if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim - (i + k * used_registers_read) * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " sdata[(%s+%" PRIu64 ") * sharedStride + %s].x = %s.x;\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(%s+%" PRIu64 ") * sharedStride + %s].y = %s.y;\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + k * used_registers_read) * sc->localSize[0] > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->fftDim - (i + k * used_registers_read) * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " sdata[(%s) * sharedStride + (%s+%" PRIu64 ")].x = %s.x;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0], 
sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(%s) * sharedStride + (%s+%" PRIu64 ")].y = %s.y;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0], sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + k * used_registers_read) * sc->localSize[0] > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - } - } - else { - - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - break; - } - case 110://DCT-I nonstrided - { - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX "); - char shiftY[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY "); - uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; - if (sc->fftDim == sc->fft_dim_full) { - if (sc->zeropadBluestein[0]) { - res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id]; - } - sc->fftDim = (sc->fftDim + 2) / 2; - uint64_t num_in = (sc->axisSwapped) ? 
(uint64_t)ceil((sc->fftDim) / (double)sc->localSize[1]) : (uint64_t)ceil((sc->fftDim) / (double)sc->localSize[0]); - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < num_in; i++) { - - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_in) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->inputStride[0] > 1) - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->inputStride[0], sc->fftDim, mult * sc->inputStride[1]); - else - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, mult * sc->inputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * num_in) * sc->localSize[1] >= (sc->fftDim)) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim) * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - 
} - } - else { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * num_in) * sc->localSize[0] >= (sc->fftDim)) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim) * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 1); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s%s[%s]%s;\n", convTypeLeft, 
inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->mergeSequencesR2C) { - sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")>0)&&((combinedID %% %" PRIu64 ") < %" PRIu64 ")){\n", sc->fftDim, sc->fftDim, sc->fftDim - 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim - 2, sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID] = sdata[sdataID];\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - 
} - else { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->mergeSequencesR2C) { - sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")>0)&&((combinedID %% %" PRIu64 ") < %" PRIu64 ")){\n", sc->fftDim, sc->fftDim, sc->fftDim - 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim - 2, sc->fftDim, 
sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID] = sdata[sdataID];\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")>0)&&((combinedID %% %" PRIu64 ") < %" PRIu64 ")){\n", sc->fftDim, sc->fftDim, sc->fftDim - 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim - 2, sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID] = sdata[sdataID];\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " 
}\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")>0)&&((combinedID %% %" PRIu64 ") < %" PRIu64 ")){\n", sc->fftDim, sc->fftDim, sc->fftDim - 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim - 2, sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID] = sdata[sdataID];\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->axisSwapped) { - if ((1 + i + k * num_in) * sc->localSize[1] >= (sc->fftDim)) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + k * num_in) * sc->localSize[0] >= (sc->fftDim)) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - sc->fftDim = 2 * sc->fftDim - 2; - if (sc->zeropadBluestein[0]) sc->fftDim = sc->fft_dim_full; - } - else { - 
//Not implemented - } - break; - } - case 111://DCT-I strided - { - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX "); - char shiftX2[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX2, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); - char shiftY[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY "); - uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; - if (sc->fftDim == sc->fft_dim_full) { - if (sc->zeropadBluestein[0]) { - res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id]; - } - sc->fftDim = (sc->fftDim + 2) / 2; - uint64_t num_in = (uint64_t)ceil((sc->fftDim) / (double)sc->localSize[1]); - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < num_in; i++) { - - //sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * mult * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]); - //res = VkAppendLine(sc); - //if (res != VKFFT_SUCCESS) return res; - - if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0] / (double)mult)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->mergeSequencesR2C) - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 ") / %" PRIu64 ";\n", sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[1], mult); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[1]); - res 
= VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * num_in) * sc->localSize[1] >= (sc->fftDim)) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->mergeSequencesR2C) - sc->tempLen = sprintf(sc->tempStr, " //sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (%s + ((%s + %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 ") / %" PRIu64 ";\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], mult, sc->localSize[0], mult); - else - sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + %s;\n", sc->fftDim, sc->gl_LocalInvocationID_x); - - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->mergeSequencesR2C) { - sprintf(index_x, "(%s + %" PRIu64 " * ((%s %% %" PRIu64 ") + (%s%s) * %" PRIu64 ")) %% (%" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, mult, sc->gl_WorkGroupID_x, shiftX, mult, sc->fft_dim_x); - - sprintf(index_y, "(%s/%" PRIu64 " + %" PRIu64 ")", sc->gl_LocalInvocationID_y, mult, (i + k * num_in) * sc->localSize[1]); - } - else { - sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x); - sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * num_in) * 
sc->localSize[1]); - } - res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 1); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->mergeSequencesR2C) { - sc->tempLen = sprintf(sc->tempStr, " if ((%s %% 2) == 0) {\n", sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " } else {\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.x;\n", sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 
%s.x;\n", sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")>0)&&((combinedID %% %" PRIu64 ") < %" PRIu64 ")){\n", sc->fftDim, sc->fftDim, sc->fftDim - 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + %s;\n", 2 * sc->fftDim - 2, sc->fftDim, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID] = sdata[sdataID];\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->mergeSequencesR2C) { - sc->tempLen = sprintf(sc->tempStr, " if ((%s %% 2) == 0) {\n", sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " } else {\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n"); - res = VkAppendLine(sc); - if (res != 
VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")>0)&&((combinedID %% %" PRIu64 ") < %" PRIu64 ")){\n", sc->fftDim, sc->fftDim, sc->fftDim - 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + %s;\n", 2 * sc->fftDim - 2, sc->fftDim, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID] = sdata[sdataID];\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * num_in) * sc->localSize[1] >= (sc->fftDim)) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - sc->fftDim = 2 * sc->fftDim - 2; - if (sc->zeropadBluestein[0]) sc->fftDim = sc->fft_dim_full; - } - else { - //Not implemented - } - break; - } - case 120://DCT-II nonstrided - { - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX "); - char shiftY[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY "); - uint64_t mult = (sc->mergeSequencesR2C) ? 
2 : 1; - uint64_t used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost; - if (sc->fftDim == sc->fft_dim_full) { - if (sc->zeropadBluestein[0]) { - res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id]; - used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - } - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_read; i++) { - - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->inputStride[0] > 1) - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->inputStride[0], sc->fftDim, mult * sc->inputStride[1]); - else - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, mult * sc->inputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = 
sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); 
- res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 1); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s%s[%s]%s;\n", convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->mergeSequencesR2C) { - sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " 
sdata[sdataID].y = 0;\n"); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->mergeSequencesR2C) { - sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if 
(sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->axisSwapped) { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " 
}"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - if (sc->zeropadBluestein[0]) { - sc->fftDim = sc->fft_dim_full; - used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - } - } - else { - //Not implemented - } - break; - } - case 121://DCT-II strided - { - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX "); - char shiftX2[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX2, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); - char shiftY[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY "); - uint64_t mult = (sc->mergeSequencesR2C) ? 
2 : 1; - uint64_t used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); - if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost; - if (sc->fftDim == sc->fft_dim_full) { - if (sc->zeropadBluestein[0]) { - res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id]; - used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); - } - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < mult * used_registers_read; i++) { - - //sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * mult * used_registers_read) * sc->localSize[0] * sc->localSize[1]); - //res = VkAppendLine(sc); - //if (res != VKFFT_SUCCESS) return res; - - if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0] / (double)mult)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->mergeSequencesR2C) - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 ") / %" PRIu64 ";\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], mult); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + mult * k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (mult * sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim); - res = VkAppendLine(sc); - if 
(res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->mergeSequencesR2C) - sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (%s + ((%s + %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 ") / %" PRIu64 ";\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], mult, sc->localSize[0], mult); - else - sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + %s;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->gl_LocalInvocationID_x); - - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->mergeSequencesR2C) { - sprintf(index_x, "(%s + %" PRIu64 " * ((%s %% %" PRIu64 ") + (%s%s) * %" PRIu64 ")) %% (%" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, mult, sc->gl_WorkGroupID_x, shiftX, mult, sc->fft_dim_x); - - sprintf(index_y, "(%s/%" PRIu64 " + %" PRIu64 ")", sc->gl_LocalInvocationID_y, mult, (i + k * used_registers_read) * sc->localSize[1]); - } - else { - sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x); - sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1]); - } - res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, 
requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 1); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->mergeSequencesR2C) { - sc->tempLen = sprintf(sc->tempStr, " if ((%s %% 2) == 0) {\n", sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " } else {\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.x;\n", sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - 
sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->mergeSequencesR2C) { - sc->tempLen = sprintf(sc->tempStr, " if ((%s %% 2) == 0) {\n", sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " } else {\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + mult * k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (mult * sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != 
VKFFT_SUCCESS) return res; - } - } - } - if (sc->zeropadBluestein[0]) { - sc->fftDim = sc->fft_dim_full; - used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); - } - } - else { - //Not implemented - } - break; - } - case 130://DCT-III nonstrided - { - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX "); - char shiftY[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY "); - uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; - if (sc->fftDim == sc->fft_dim_full) { - if (sc->zeropadBluestein[0]) { - res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id]; - } - uint64_t num_in = (sc->axisSwapped) ? (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]) : (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0]); - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < num_in; i++) { - - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_in) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (!sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, mult * sc->inputStride[1]); - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim 
/ 2 + 1, sc->fftDim / 2 + 1, mult * sc->inputStride[1]); - } - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", (sc->fftDim / 2 + 1), sc->gl_WorkGroupID_y, sc->localSize[0], (uint64_t)ceil(sc->size[sc->axis_id + 1] / (double)mult)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", (sc->fftDim / 2 + 1), sc->gl_WorkGroupID_y, sc->localSize[1], (uint64_t)ceil(sc->size[sc->axis_id + 1] / (double)mult)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[1]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 1); - if (res != VKFFT_SUCCESS) return res; - if (sc->LUT) { - sc->tempLen = 
sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + combinedID %% %" PRIu64 "];\n", sc->startDCT3LUT, sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(%.17e%s * (combinedID %% %" PRIu64 ") );\n", cosDef, double_PI / 2 / sc->fftDim, LFending, sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(%.17e%s * (combinedID %% %" PRIu64 ") );\n", sinDef, double_PI / 2 / sc->fftDim, LFending, sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(%.17e%s * (combinedID %% %" PRIu64 ") );\n", double_PI / 2 / sc->fftDim, LFending, sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->mergeSequencesR2C) { - sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if 
(sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s%s[inoutID]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " %s.y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim / 2 + 1, sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride ;\n", sc->fftDim / 2 + 1, sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " if (combinedID %% %" PRIu64 " > 0){\n", sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " - combinedID %% %" PRIu64 ") + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim / 2 + 1, 
sc->fftDim / 2 + 1, mult * sc->inputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[1], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[1], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->mergeSequencesR2C) { - sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s%s[inoutID]%s;\n", sc->regIDs[1], convTypeLeft, inputsStruct, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " %s.y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->regIDs[1], convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = 
sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = ((%s.x+%s.y)*mult.x+(%s.x-%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[1], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = ((-%s.x+%s.y)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[0], sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 " - combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = ((%s.x-%s.y)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[1], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 
((%s.x+%s.y)*mult.x-(%s.x-%s.y)*mult.y);\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[0], sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " } else {\n"); - res = VkAppendLine(sc); - - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = (%s.x*mult.x-%s.y*mult.y);\n", sc->regIDs[0], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = (%s.y*mult.x+%s.x*mult.y);\n", sc->regIDs[0], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->axisSwapped) { - if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[1]) - { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->axisSwapped) { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - if (sc->zeropadBluestein[0]) sc->fftDim = sc->fft_dim_full; - } - else { - //Not implemented - } - break; - } - case 131://DCT-III strided - { - char shiftX[500] = ""; - if 
(sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX "); - char shiftX2[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX2, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); - char shiftY[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY "); - uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; - uint64_t num_in = (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]); - - if (sc->fftDim == sc->fft_dim_full) { - if (sc->zeropadBluestein[0]) { - res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id]; - } - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < num_in; i++) { - - //sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * mult * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]); - //res = VkAppendLine(sc); - //if (res != VKFFT_SUCCESS) return res; - - if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0] / (double)mult)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - if (sc->mergeSequencesR2C) - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 ") / %" PRIu64 ";\n", sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[1], mult); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + k * num_in) * sc->localSize[1] >= (sc->fftDim 
/ 2 + 1)) - { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->mergeSequencesR2C) { - sprintf(index_x, "(%s + %" PRIu64 " * ((%s %% %" PRIu64 ") + (%s%s) * %" PRIu64 ")) %% (%" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, mult, sc->gl_WorkGroupID_x, shiftX, mult, sc->fft_dim_x); - - sprintf(index_y, "(%s/%" PRIu64 " + %" PRIu64 ")", sc->gl_LocalInvocationID_y, mult, (i + k * num_in) * sc->localSize[1]); - } - else { - sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x); - sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[1]); - } - res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - res = appendZeropadStartReadWriteStage(sc, 1); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, 
sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->mergeSequencesR2C) { - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + combinedID];\n", sc->startDCT3LUT); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(%.17e%s * (combinedID) );\n", cosDef, double_PI / 2 / sc->fftDim, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(%.17e%s * (combinedID) );\n", sinDef, double_PI / 2 / sc->fftDim, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(%.17e%s * (combinedID) );\n", double_PI / 2 / sc->fftDim, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - //sc->tempLen = sprintf(sc->tempStr, " printf(\" %%f - %%f \\n\", mult.x, mult.y);\n"); - //res = VkAppendLine(sc); - //if (res != VKFFT_SUCCESS) return res; - if (sc->mergeSequencesR2C) - sc->tempLen = sprintf(sc->tempStr, " //sdataID = (combinedID) * sharedStride + (%s + ((%s + %" PRIu64 ") %% %" PRIu64 ") * %" PRIu64 ") / %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * 
sc->localSize[1], mult, sc->localSize[0], mult); - else - sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID) * sharedStride + %s;\n", sc->gl_LocalInvocationID_x); - - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " if (combinedID > 0){\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->mergeSequencesR2C) { - sprintf(index_x, "(%s + %" PRIu64 " * ((%s %% %" PRIu64 ") + (%s%s) * %" PRIu64 ")) %% (%" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, mult, sc->gl_WorkGroupID_x, shiftX, mult, sc->fft_dim_x); - - sprintf(index_y, "(%" PRIu64 " - (%s/%" PRIu64 " + %" PRIu64 "))", sc->fftDim, sc->gl_LocalInvocationID_y, mult, (i + k * num_in) * sc->localSize[1]); - } - else { - sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x); - sprintf(index_y, "(%" PRIu64 " - (%s + %" PRIu64 "))", sc->fftDim, sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[1]); - } - res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[1], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " %s.x = 
%sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[1], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->mergeSequencesR2C) { - - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = ((%s.x+%s.y)*mult.x-(%s.y-%s.x)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[0], sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = ((%s.y-%s.x)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[0], sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 " - combinedID) * sharedStride + %s;\n", sc->fftDim, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = ((%s.x+%s.y)*mult.x-(%s.y-%s.x)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[0], sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = -((%s.y-%s.x)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[0], sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = 
sprintf(sc->tempStr, " } else {\n"); - res = VkAppendLine(sc); - - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = ((%s.x)*mult.x-(%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = ((%s.y)*mult.x+(%s.x)*mult.y);\n", sc->regIDs[0], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - - if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * num_in) * sc->localSize[1] >= (sc->fftDim / 2 + 1)) - { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - if (sc->zeropadBluestein[0]) sc->fftDim = sc->fft_dim_full; - } - else { - //Not implemented - } - break; - } - case 140://DCT-IV nonstrided cast to 8x FFT - { - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX "); - char shiftY[500] = ""; - if (sc->axisSwapped) { - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_x); - } - else { - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); - } - char shiftY2[100] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY "); - if (sc->fftDim < sc->fft_dim_full) { - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " %s numActiveThreads = ((%s/%" PRIu64 ")==%" PRIu64 ") ? 
%" PRIu64 " : %" PRIu64 ";\n", uintType, sc->gl_WorkGroupID_x, sc->firstStageStartSize / sc->fftDim, ((uint64_t)floor(sc->fft_dim_full / ((double)sc->localSize[0] * sc->fftDim))) / (sc->firstStageStartSize / sc->fftDim), (sc->fft_dim_full - (sc->firstStageStartSize / sc->fftDim) * ((((uint64_t)floor(sc->fft_dim_full / ((double)sc->localSize[0] * sc->fftDim))) / (sc->firstStageStartSize / sc->fftDim)) * sc->localSize[0] * sc->fftDim)) / sc->min_registers_per_thread / (sc->firstStageStartSize / sc->fftDim), sc->localSize[0] * sc->localSize[1]);// sc->fft_dim_full, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize, sc->fft_dim_full / (sc->localSize[0] * sc->fftDim)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(sc->disableThreadsStart, " if(%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ") < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize, sc->fft_dim_full); - sc->tempLen = sprintf(sc->tempStr, " if((%s+%" PRIu64 "*%s)< numActiveThreads) {\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(sc->disableThreadsEnd, "}"); - } - else { - sprintf(sc->disableThreadsStart, " if(%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ") < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize, 
sc->fft_dim_full); - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - sprintf(sc->disableThreadsEnd, "}"); - } - } - else { - sc->tempLen = sprintf(sc->tempStr, " { \n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->fftDim == sc->fft_dim_full) { - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < (uint64_t)ceil(sc->min_registers_per_thread / 8.0); i++) { - - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->inputStride[0] > 1) - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim / 8, sc->inputStride[0], sc->fftDim / 8, sc->inputStride[1]); - else - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim / 8, sc->fftDim / 8, sc->inputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim / 8, sc->gl_WorkGroupID_y, shiftY2, 
sc->localSize[0], sc->size[sc->axis_id + 1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim / 8 * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim / 8, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[1], sc->size[sc->axis_id + 1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim / 8 * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 1); - if (res != VKFFT_SUCCESS) return res; - - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, 
sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " sdata[2*(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(2*(combinedID %% %" PRIu64 ")+1) * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim - 2, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim - 1, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = - %s.x;\n", sc->regIDs[0], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim / 2 - 2, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim / 2 - 1, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != 
VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " + 2*(combinedID %% %" PRIu64 ")) * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim / 2, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " + 2*(combinedID %% %" PRIu64 ")) * sharedStride + (combinedID / %" PRIu64 ")] = %s;\n", sc->fftDim / 2 + 1, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdata[2*(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(2*(combinedID %% %" PRIu64 ")+1) + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim - 2, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim - 1, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = - %s.x;\n", sc->regIDs[0], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim / 2 - 2, sc->fftDim / 8, sc->fftDim / 8, 
sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " - 2*(combinedID %% %" PRIu64 ")) + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim / 2 - 1, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " + 2*(combinedID %% %" PRIu64 ")) + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim / 2, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(%" PRIu64 " + 2*(combinedID %% %" PRIu64 ")) + (combinedID / %" PRIu64 ") * sharedStride] = %s;\n", sc->fftDim / 2 + 1, sc->fftDim / 8, sc->fftDim / 8, sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->readToRegisters) { - sc->tempLen = sprintf(sc->tempStr, " %s.x =0;%s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * 
sharedStride].x = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - - } - } - } - /*else { - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) { - if (sc->axisSwapped) { - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 "*numActiveThreads;\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");\n", sc->fftDim, sc->fftDim, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, 
sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " inoutID = %s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 1); - if (res != VKFFT_SUCCESS) return res; - if (sc->readToRegisters) { - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s = %s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " %s = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, 
sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->axisSwapped) { - - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")] = %s%s[inoutID]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, inputsStruct, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")] = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->fftDim, sc->fftDim, convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[sharedStride*%s + (%s + %" PRIu64 ")] = %s%s[inoutID]%s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], convTypeLeft, inputsStruct, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[sharedStride*%s + (%s + %" PRIu64 ")] = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->readToRegisters) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != 
VKFFT_SUCCESS) return res; - } - else { - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")].x = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(combinedID / %" PRIu64 ") + sharedStride*(combinedID %% %" PRIu64 ")].y = 0;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdata[sharedStride*%s + (%s + %" PRIu64 ")].x = 0;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sharedStride*%s + (%s + %" PRIu64 ")].y = 0;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - }*/ - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - break; - } - case 141://DCT-IV strided cast to 8x FFT - { - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); - if (sc->fftDim != sc->fft_dim_full) { - sprintf(sc->disableThreadsStart, " if (((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ") < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim * sc->stageStartSize, sc->size[sc->axis_id]); - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - 
- sprintf(sc->disableThreadsEnd, "}"); - } - else { - sprintf(sc->disableThreadsStart, "{\n"); - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - sprintf(sc->disableThreadsEnd, "}"); - } - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0;\n", sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < (uint64_t)ceil(sc->min_registers_per_thread / 8.0); i++) { - if (sc->fftDim == sc->fft_dim_full) - sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s + %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1]); - else - sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 "));\n", sc->stageStartSize, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim * sc->stageStartSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " if(inoutID < %" PRIu64 "){\n", sc->fftDim / 8); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; 
- sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x); - res = indexInputVkFFT(sc, uintType, readType, index_x, sc->inoutID, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0;\n", sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(2*(%s+%" PRIu64 "))+%s]=%s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(2*(%s+%" PRIu64 ")+1)+%s]=%s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%" PRIu64 " - 2*(%s+%" PRIu64 "))+%s]=%s;\n", sc->sharedStride, sc->fftDim - 2, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%" PRIu64 " - 2*(%s+%" PRIu64 "))+%s]=%s;\n", 
sc->sharedStride, sc->fftDim - 1, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = - %s.x;\n", sc->regIDs[0], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%" PRIu64 " - 2*(%s+%" PRIu64 "))+%s]=%s;\n", sc->sharedStride, sc->fftDim / 2 - 2, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%" PRIu64 " - 2*(%s+%" PRIu64 "))+%s]=%s;\n", sc->sharedStride, sc->fftDim / 2 - 1, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%" PRIu64 " + 2*(%s+%" PRIu64 "))+%s]=%s;\n", sc->sharedStride, sc->fftDim / 2, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%" PRIu64 " + 2*(%s+%" PRIu64 "))+%s]=%s;\n", sc->sharedStride, sc->fftDim / 2 + 1, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->readToRegisters) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0; %s.y = 0;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + 
k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].x=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[%s*(%s+%" PRIu64 ")+%s].y=0;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - break; - } - case 142://DCT-IV nonstrided as 2xN/2 DCT-II - { - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX "); - char shiftY[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY "); - uint64_t used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost; - if (sc->fftDim == sc->fft_dim_full) { - if (sc->zeropadBluestein[0]) { - res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id]; - used_registers_read = (sc->axisSwapped) ? 
(uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - } - uint64_t maxBluesteinCutOff = 1; - if (sc->zeropadBluestein[0]) { - if (sc->axisSwapped) - maxBluesteinCutOff = 2 * sc->fftDim * sc->localSize[0]; - else - maxBluesteinCutOff = 2 * sc->fftDim * sc->localSize[1]; - } - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < 2 * used_registers_read; i++) { - - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * 2 * used_registers_read) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->inputStride[0] > 1) - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", 2 * sc->fftDim, sc->inputStride[0], 2 * sc->fftDim, sc->inputStride[1]); - else - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", 2 * sc->fftDim, 2 * sc->fftDim, sc->inputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[1]) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", 2 * sc->fftDim, sc->gl_WorkGroupID_y, shiftY, 
sc->localSize[0], (uint64_t)ceil(sc->size[1])); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[1]) % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", 2 * sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1])); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 1); - if (res != VKFFT_SUCCESS) return res; -#if(!((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)))//OpenCL and Level Zero are not handling barrier with thread-conditional writes to local memory - so this is a work-around - if (sc->inputBufferBlockNum == 1) - sc->tempLen = 
sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#else - if (i < used_registers_read) { - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s%s[%s]%s;\n", sc->regIDs[i - used_registers_read + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " %s.y = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i - used_registers_read + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } -#endif -#if(!((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)))//OpenCL and Level Zero are not handling barrier with thread-conditional writes to local memory - so this is a work-around - if (sc->axisSwapped) { - //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" 
PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); - sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim, 2 * sc->fftDim); - } - else { - //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); - sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim, 2 * sc->fftDim); - } - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 0) {\n", 2 * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " else {\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.x;\n", sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#endif - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->axisSwapped) { - //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" 
PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); - sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim, 2 * sc->fftDim); - } - else { - //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); - sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim, 2 * sc->fftDim); - } - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 0) {\n", 2 * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 1) {\n", 2 * sc->fftDim);//another OpenCL bugfix - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != 
VKFFT_SUCCESS) return res; - } - if (sc->axisSwapped) { - if ((uint64_t)ceil(sc->size[1]) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((uint64_t)ceil(sc->size[1]) % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } -#if((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))//OpenCL is not handling barrier with thread-conditional writes to local memory - so this is a work-around - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < 2 * used_registers_read; i++) { - - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * 2 * used_registers_read) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->inputStride[0] > 1) - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", 2 * sc->fftDim, sc->inputStride[0], 2 * sc->fftDim, sc->inputStride[1]); - else - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" 
PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", 2 * sc->fftDim, 2 * sc->fftDim, sc->inputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - if ((uint64_t)ceil(sc->size[1]) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", 2 * sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], (uint64_t)ceil(sc->size[1])); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((uint64_t)ceil(sc->size[1]) % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", 2 * sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1])); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != 
VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 1); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); - sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim, 2 * sc->fftDim); - } - else { - //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); - sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim, 2 * sc->fftDim); - } - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (i < used_registers_read) { - sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 0) {\n", 2 * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) 
return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 0) {\n", 2 * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.y;\n", sc->regIDs[i - used_registers_read + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->axisSwapped) { - //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); - sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim, 2 * sc->fftDim); - } - else { - //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); - sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim, 2 * sc->fftDim); - } - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 0) {\n", 2 * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n"); 
- res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 1) {\n", 2 * sc->fftDim);//another OpenCL bugfix - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->axisSwapped) { - if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[1]) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[1]) % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < 2 * used_registers_read; i++) { - - if (sc->localSize[1] == 1) - sc->tempLen = 
sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * 2 * used_registers_read) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->inputStride[0] > 1) - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", 2 * sc->fftDim, sc->inputStride[0], 2 * sc->fftDim, sc->inputStride[1]); - else - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", 2 * sc->fftDim, 2 * sc->fftDim, sc->inputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - if ((uint64_t)ceil(sc->size[1]) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", 2 * sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], (uint64_t)ceil(sc->size[1])); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((uint64_t)ceil(sc->size[1]) % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", 2 * sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1])); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * 2 * used_registers_read) * 
sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 1); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); - sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim, 2 * sc->fftDim); - } - else { - //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, 
sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); - sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim, 2 * sc->fftDim); - } - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (i < used_registers_read) { - sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 1) {\n", 2 * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.x;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 1) {\n", 2 * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.y;\n", sc->regIDs[i - used_registers_read + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->axisSwapped) { - //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); - sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) * sharedStride + (combinedID / %" PRIu64 ");\n", 2 * sc->fftDim, 2 * sc->fftDim); - } - 
else { - //sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); - sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID %% %" PRIu64 ")/2) + (combinedID / %" PRIu64 ") * sharedStride;\n", 2 * sc->fftDim, 2 * sc->fftDim); - } - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 0) {\n", 2 * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if (((combinedID %% %" PRIu64 ")%%2) == 1) {\n", 2 * sc->fftDim);//another OpenCL bugfix - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->axisSwapped) { - if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[1]) % sc->localSize[0] != 0) { - sc->tempLen = 
sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[1]) % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - - } - } -#endif - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[0]) { - if (sc->axisSwapped) - maxBluesteinCutOff = sc->fftDim * sc->localSize[0]; - else - maxBluesteinCutOff = sc->fftDim * sc->localSize[1]; - } - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_read; i++) { - - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->axisSwapped) { - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen 
= sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim); - } - else { - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim); - } - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ")>0){\n", sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[sdataID-sharedStride].y;\n", sc->w); - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[sdataID-1].y;\n", sc->w); - } - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[sdataID].x;\n", sc->w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x+%s.y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->w, sc->w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x-%s.y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->w, sc->w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 2*sdata[sdataID].x;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 ") 
* sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim - 1, sc->fftDim); - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim - 1, sc->fftDim); - } - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 2*sdata[sdataID].y;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - /*sc->tempLen = sprintf(sc->tempStr, " printf(\" %%f %%f %%d\\n\", %s.x, %s.y, %s);\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res;*/ - } - } - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_read; i++) { - - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i 
+ k * used_registers_read) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->axisSwapped) { - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim); - } - else { - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim); - } - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ")>0){\n", sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#if(!((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)))//OpenCL and Level Zero are not handling barrier with 
thread-conditional writes to local memory - so this is a work-around - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim, sc->fftDim); - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 " - combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim, sc->fftDim); - } - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.y;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#endif - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID] = %s;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - /*sc->tempLen = sprintf(sc->tempStr, " printf(\" %%f %%f %%d\\n\", sdata[sdataID].x, sdata[sdataID].y, %s);\n", sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res;*/ - } - } - 
res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; -#if((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_read; i++) { - - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ")>0){\n", sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim, sc->fftDim); - } - else { - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) 
return res; - } - sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 " - combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim, sc->fftDim); - } - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.y;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - /*sc->tempLen = sprintf(sc->tempStr, " printf(\" %%f %%f %%d\\n\", sdata[sdataID].x, sdata[sdataID].y, %s);\n", sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res;*/ - } - } - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; -#endif - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - uint64_t num_in = (sc->axisSwapped) ? 
(uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]) : (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0]); - - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < num_in; i++) { - - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_in) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[1]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + combinedID %% %" PRIu64 "];\n", sc->startDCT3LUT, sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(%.17e%s * (combinedID %% %" PRIu64 ") );\n", cosDef, double_PI / 2 / sc->fftDim, LFending, sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(%.17e%s * (combinedID %% %" PRIu64 ") );\n", sinDef, double_PI / 2 / sc->fftDim, LFending, sc->fftDim / 2 + 1); - 
res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(%.17e%s * (combinedID %% %" PRIu64 ") );\n", double_PI / 2 / sc->fftDim, LFending, sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim / 2 + 1, sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride ;\n", sc->fftDim / 2 + 1, sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[sdataID];\n", sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " if (combinedID %% %" PRIu64 " > 0){\n", sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " - combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride ;\n", sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[inoutID];\n", sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = ((%s.x+%s.y)*mult.x+(%s.x-%s.y)*mult.y);\n", sc->regIDs[0], 
sc->regIDs[1], sc->regIDs[1], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = ((-%s.x+%s.y)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[0], sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID].x = ((%s.x-%s.y)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[1], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID].y = ((%s.x+%s.y)*mult.x-(%s.x-%s.y)*mult.y);\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[0], sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " } \n"); - res = VkAppendLine(sc); - - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if (combinedID %% %" PRIu64 " == 0){\n", sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = (%s.x*mult.x-%s.y*mult.y);\n", sc->regIDs[0], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = (%s.y*mult.x+%s.x*mult.y);\n", sc->regIDs[0], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res 
= VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[0]) { - sc->fftDim = sc->fft_dim_full; - used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - } - } - else { - //Not implemented - } - break; - } - case 143://DCT-IV strided as 2xN/2 DCT-II - { - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX "); - char shiftX2[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX2, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); - char shiftY[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY "); - uint64_t used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); - if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost; - if (sc->fftDim == sc->fft_dim_full) { - if (sc->zeropadBluestein[0]) { - res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id]; - used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); - } - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < 2 * used_registers_read; i++) { - - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * 2 * used_registers_read) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != 
VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0])); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x); - sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_read) * sc->localSize[1]); - res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - res = appendZeropadStartReadWriteStage(sc, 1); - if (res != VKFFT_SUCCESS) return res; -#if(!((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)))//OpenCL and Level Zero are not handling barrier with thread-conditional writes to local memory - so 
this is a work-around - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[0], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[0], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#else - if (i < used_registers_read) { - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s%s[%s]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " %s.x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s%s[%s]%s;\n", sc->regIDs[i - used_registers_read + k * sc->registers_per_thread], convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " %s.y = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", sc->regIDs[i - used_registers_read + k * sc->registers_per_thread], convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } -#endif -#if(!((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)))//OpenCL and Level Zero are not handling barrier with thread-conditional writes to local memory - so this is a work-around - sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID / %" PRIu64 ")/2) * sharedStride + (combinedID %% %" 
PRIu64 ");\n", sc->localSize[0], sc->localSize[0]); - - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((combinedID / %" PRIu64 ")%%2 == 0) {\n", sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " } else {\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.x;\n", sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#endif - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID / %" PRIu64 ")/2) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((combinedID / %" PRIu64 ")%%2 == 0) {\n", sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " } else {\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); 
- res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } -#if((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4))//OpenCL is not handling barrier with thread-conditional writes to local memory - so this is a work-around - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < 2 * used_registers_read; i++) { - - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * 2 * used_registers_read) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) { - sc->tempLen = 
sprintf(sc->tempStr, " if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0])); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x); - sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_read) * sc->localSize[1]); - res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - res = appendZeropadStartReadWriteStage(sc, 1); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID / %" PRIu64 ")/2) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]); - - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (i < used_registers_read) { - sc->tempLen = sprintf(sc->tempStr, " if ((combinedID / %" PRIu64 ")%%2 == 0) {\n", sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = 
sprintf(sc->tempStr, " if ((combinedID / %" PRIu64 ")%%2 == 0) {\n", sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.y;\n", sc->regIDs[i - used_registers_read + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID / %" PRIu64 ")/2) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((combinedID / %" PRIu64 ")%%2 == 0) {\n", sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " } else {\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * 
sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < 2 * used_registers_read; i++) { - - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * 2 * used_registers_read) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", 2 * sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0])); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x); - 
sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_read) * sc->localSize[1]); - res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - res = appendZeropadStartReadWriteStage(sc, 1); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID / %" PRIu64 ")/2) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]); - - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (i < used_registers_read) { - sc->tempLen = sprintf(sc->tempStr, " if ((combinedID / %" PRIu64 ")%%2 == 1) {\n", sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.x;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " if ((combinedID / %" PRIu64 ")%%2 == 1) {\n", sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.y;\n", sc->regIDs[i - used_registers_read + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = 
VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " sdataID = ((combinedID / %" PRIu64 ")/2) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((combinedID / %" PRIu64 ")%%2 == 0) {\n", sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " } else {\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (2 * sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } -#endif - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res 
!= VKFFT_SUCCESS) return res; - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_read; i++) { - - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID / %" PRIu64 ") * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]); - - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if((combinedID / %" PRIu64 ")>0){\n", sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = sdata[sdataID-sharedStride].y;\n", sc->w); - - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = sdata[sdataID].x;\n", sc->w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s.x+%s.y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->w, sc->w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) 
return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.x-%s.y;\n", sc->regIDs[i + k * sc->registers_per_thread], sc->w, sc->w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 2*sdata[sdataID].x;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 ") * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->fftDim - 1, sc->localSize[0]); - - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 2*sdata[sdataID].y;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - //sc->tempLen = sprintf(sc->tempStr, " printf(\" %%f %%f\\n\", %s.x, %s.y);\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); - //res = VkAppendLine(sc); - //if (res != VKFFT_SUCCESS) return res; - } - } - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_read; i++) { - - if (sc->localSize[1] == 1) 
- sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID / %" PRIu64 ") * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]); - - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if((combinedID / %" PRIu64 ")>0){\n", sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s.x;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#if(!((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)))//OpenCL and Level Zero are not handling barrier with thread-conditional writes to local memory - so this is a work-around - sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 " - combinedID / %" PRIu64 ") * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->fftDim, sc->localSize[0], sc->localSize[0]); - - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = 
sprintf(sc->tempStr, " sdata[sdataID].y = %s.y;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#endif - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID] = %s;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; -#if((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_read; i++) { - - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - 
if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " if((combinedID / %" PRIu64 ")>0){\n", sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdataID = (%" PRIu64 " - combinedID / %" PRIu64 ") * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->fftDim, sc->localSize[0], sc->localSize[0]); - - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s.y;\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; -#endif - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - uint64_t num_in = (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]); - - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < num_in; i++) { - - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_in) * 
sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_in) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + combinedID / %" PRIu64 "];\n", sc->startDCT3LUT, sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(%.17e%s * (combinedID / %" PRIu64 ") );\n", cosDef, double_PI / 2 / sc->fftDim, LFending, sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(%.17e%s * (combinedID / %" PRIu64 ") );\n", sinDef, double_PI / 2 / sc->fftDim, LFending, sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(%.17e%s * (combinedID / %" PRIu64 ") );\n", double_PI / 2 / sc->fftDim, LFending, sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - - sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID / %" PRIu64 ") * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[sdataID];\n", sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - 
sc->tempLen = sprintf(sc->tempStr, " if (combinedID / %" PRIu64 " > 0){\n", sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " inoutID = (%" PRIu64 " - combinedID / %" PRIu64 ") * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->fftDim, sc->localSize[0], sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[inoutID];\n", sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = ((%s.x+%s.y)*mult.x+(%s.x-%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[1], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = ((-%s.x+%s.y)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[0], sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID].x = ((%s.x-%s.y)*mult.x+(%s.x+%s.y)*mult.y);\n", sc->regIDs[0], sc->regIDs[1], sc->regIDs[1], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[inoutID].y = ((%s.x+%s.y)*mult.x-(%s.x-%s.y)*mult.y);\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[0], sc->regIDs[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " } else {\n"); - res = VkAppendLine(sc); - - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = (%s.x*mult.x-%s.y*mult.y);\n", sc->regIDs[0], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = (%s.y*mult.x+%s.x*mult.y);\n", sc->regIDs[0], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen 
= sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + k * num_in) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[0]) { - sc->fftDim = sc->fft_dim_full; - used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); - } - } - else { - //Not implemented - } - break; - } - case 144://odd DCT-IV nonstrided as N FFT - { - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX "); - char shiftY[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY "); - uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; - uint64_t used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost; - if (sc->fftDim == sc->fft_dim_full) { - if (sc->zeropadBluestein[0]) { - res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id]; - used_registers_read = (sc->axisSwapped) ? 
(uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - } - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_read; i++) { - - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->inputStride[0] > 1) - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->inputStride[0], sc->fftDim, mult * sc->inputStride[1]); - else - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, mult * sc->inputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * 
sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * 2 * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->inputStride[1], sc->fft_zeropad_left_read[sc->axis_id], sc->inputStride[1], sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - indexInputVkFFT(sc, uintType, readType, sc->inoutID, 0, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 1); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) 
return res; - - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s%s[%s]%s;\n", convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->mergeSequencesR2C) { - sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride ;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, 
sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->mergeSequencesR2C) { - sc->tempLen = sprintf(sc->tempStr, " inoutID += %" PRIu64 ";\n", sc->inputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %s%s[inoutID]%s;\n", convTypeLeft, inputsStruct, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride ;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = 
sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->axisSwapped) { - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_read; i++) { - if (!sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID 
< %" PRIu64 "){\n", sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " inoutID = %" PRIu64 " + 4 * (combinedID %% %" PRIu64 ");\n", sc->fftDim / 2, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " if (inoutID < %" PRIu64 ") sdataID = inoutID;\n", sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = %" PRIu64 " - inoutID;\n", 2 * sc->fftDim, sc->fftDim, 2 * sc->fftDim - 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = inoutID - %" PRIu64 ";\n", 3 * sc->fftDim, 2 * sc->fftDim, 2 * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = %" PRIu64 " - inoutID;\n", 4 * sc->fftDim, 3 * sc->fftDim, 4 * sc->fftDim - 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if (inoutID >= %" PRIu64 ") sdataID = inoutID - %" PRIu64 ";\n", 4 * sc->fftDim, 4 * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdataID = sdataID + %s * sharedStride;\n", sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[sdataID];\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")){ \n\ - %s.x = -%s.x;\n\ - %s.y = -%s.y;}\n", 2 * sc->fftDim, sc->fftDim, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * 
sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")){ \n\ - %s.x = -%s.x;\n\ - %s.y = -%s.y;}\n", 3 * sc->fftDim, 2 * sc->fftDim, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " inoutID = %" PRIu64 " + 4 * combinedID;\n", sc->fftDim / 2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " if (inoutID < %" PRIu64 ") sdataID = inoutID;\n", sc->fftDim); - res = VkAppendLine(sc); - if (res != 
VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = %" PRIu64 " - inoutID;\n", 2 * sc->fftDim, sc->fftDim, 2 * sc->fftDim - 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = inoutID - %" PRIu64 ";\n", 3 * sc->fftDim, 2 * sc->fftDim, 2 * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = %" PRIu64 " - inoutID;\n", 4 * sc->fftDim, 3 * sc->fftDim, 4 * sc->fftDim - 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if (inoutID >= %" PRIu64 ") sdataID = inoutID - %" PRIu64 ";\n", 4 * sc->fftDim, 4 * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdataID = sdataID * sharedStride + %s;\n", sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[sdataID];\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")){ \n\ - %s.x = -%s.x;\n\ - %s.y = -%s.y;}\n", 2 * sc->fftDim, sc->fftDim, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")){ \n\ - %s.x = -%s.x;\n\ - %s.y = -%s.y;}\n", 3 * sc->fftDim, 2 * sc->fftDim, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * 
sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[0]) { - sc->fftDim = sc->fft_dim_full; - used_registers_read = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - } - if (!sc->readToRegisters) { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_read; i++) { - if (sc->axisSwapped) { - if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim - (i + k * used_registers_read) * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " sdata[(%s+%" PRIu64 ") * sharedStride + %s].x = %s.x;\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(%s+%" PRIu64 ") * sharedStride + %s].y = %s.y;\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], 
sc->gl_LocalInvocationID_x, sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + k * used_registers_read) * sc->localSize[0] > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->fftDim - (i + k * used_registers_read) * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " sdata[(%s) * sharedStride + (%s+%" PRIu64 ")].x = %s.x;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0], sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(%s) * sharedStride + (%s+%" PRIu64 ")].y = %s.y;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * used_registers_read) * sc->localSize[0], sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + k * used_registers_read) * sc->localSize[0] > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - //Not implemented - } - break; - } - case 145://odd DCT-IV strided as N FFT - { - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX "); - char shiftX2[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX2, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); - char shiftY[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + 
consts.workGroupShiftY "); - uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; - uint64_t used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); - if (sc->registerBoost > 1) used_registers_read /= sc->registerBoost; - if (sc->fftDim == sc->fft_dim_full) { - if (sc->zeropadBluestein[0]) { - res = appendSetSMToZero(sc, floatType, floatTypeMemory, uintType, readType); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - sc->fftDim = sc->fft_zeropad_Bluestein_left_read[sc->axis_id]; - used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); - } - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < mult * used_registers_read; i++) { - - //sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * mult * used_registers_read) * sc->localSize[0] * sc->localSize[1]); - //res = VkAppendLine(sc); - //if (res != VKFFT_SUCCESS) return res; - - if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0] / (double)mult)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->mergeSequencesR2C) - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 ") / %" PRIu64 ";\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], mult); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, 
sc->fft_zeropad_Bluestein_left_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * mult * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (mult * sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + %s;\n", sc->fftDim, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->mergeSequencesR2C) { - sprintf(index_x, "(%s + %" PRIu64 " * ((%s %% %" PRIu64 ") + (%s%s) * %" PRIu64 ")) %% (%" PRIu64 ")", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, mult, sc->gl_WorkGroupID_x, shiftX, mult, sc->fft_dim_x); - - sprintf(index_y, "(%s/%" PRIu64 " + %" PRIu64 ")", sc->gl_LocalInvocationID_y, mult, (i + k * used_registers_read) * sc->localSize[1]); - } - else { - sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x); - sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1]); - } - res = indexInputVkFFT(sc, uintType, readType, index_x, index_y, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 1); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, 
sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %s%s[%s]%s;\n", convTypeLeft, inputsStruct, sc->inoutID, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = %sinputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "]%s;\n", convTypeLeft, sc->inoutID, sc->inputBufferBlockSize, inputsStruct, sc->inoutID, sc->inputBufferBlockSize, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[0]) { - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].x = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[sdataID].y = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * mult * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (mult * sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[0] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return 
res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_read; i++) { - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " inoutID = %" PRIu64 " + 4 * combinedID;\n", sc->fftDim / 2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " if (inoutID < %" PRIu64 ") sdataID = inoutID;\n", sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = %" PRIu64 " - inoutID;\n", 2 * sc->fftDim, sc->fftDim, 2 * sc->fftDim - 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = inoutID - %" PRIu64 ";\n", 3 * sc->fftDim, 2 * sc->fftDim, 2 * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")) sdataID = %" PRIu64 " - inoutID;\n", 4 * sc->fftDim, 3 * sc->fftDim, 4 * sc->fftDim - 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if (inoutID >= %" 
PRIu64 ") sdataID = inoutID - %" PRIu64 ";\n", 4 * sc->fftDim, 4 * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdataID = sdataID * sharedStride + %s;\n", sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[sdataID];\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")){ \n\ - %s.x = -%s.x;\n\ - %s.y = -%s.y;}\n", 2 * sc->fftDim, sc->fftDim, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((inoutID < %" PRIu64 ")&&(inoutID >= %" PRIu64 ")){ \n\ - %s.x = -%s.x;\n\ - %s.y = -%s.y;}\n", 3 * sc->fftDim, 2 * sc->fftDim, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + k * used_registers_read) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[0]) { - sc->fftDim = sc->fft_dim_full; - used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); - } - if (!sc->readToRegisters) { - res = 
appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_read; i++) { - if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim - (i + k * used_registers_read) * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " sdata[(%s+%" PRIu64 ") * sharedStride + %s].x = %s.x;\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdata[(%s+%" PRIu64 ") * sharedStride + %s].y = %s.y;\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_read) * sc->localSize[1], sc->gl_LocalInvocationID_x, sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + k * used_registers_read) * sc->localSize[1] > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - //Not implemented - } - break; - } - } - return res; -} - -static inline VkFFTResult appendReorder4StepRead(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t reorderType) { - VkFFTResult res = VKFFT_SUCCESS; - char vecType[30]; - char LFending[4] = ""; - if (!strcmp(floatType, "float")) sprintf(LFending, "f"); -#if(VKFFT_BACKEND==0) - if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); - char cosDef[20] = "cos"; 
- char sinDef[20] = "sin"; - if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); -#elif(VKFFT_BACKEND==1) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - char cosDef[20] = "__cosf"; - char sinDef[20] = "__sinf"; - if (!strcmp(floatType, "double")) sprintf(LFending, "l"); -#elif(VKFFT_BACKEND==2) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - char cosDef[20] = "__cosf"; - char sinDef[20] = "__sinf"; - if (!strcmp(floatType, "double")) sprintf(LFending, "l"); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - char cosDef[20] = "native_cos"; - char sinDef[20] = "native_sin"; - //if (!strcmp(floatType, "double")) sprintf(LFending, "l"); -#endif - - uint64_t logicalRegistersPerThread = (sc->rader_generator[0] > 0) ? sc->min_registers_per_thread : sc->registers_per_thread_per_radix[sc->stageRadix[0]];// (sc->registers_per_thread % sc->stageRadix[sc->numStages - 1] == 0) ? 
sc->registers_per_thread : sc->min_registers_per_thread; - switch (reorderType) { - case 1: {//grouped_c2c - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); - if ((sc->stageStartSize > 1) && (!sc->reorderFourStep) && (sc->inverse)) { - if (!sc->readToRegisters) { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - } - /*if (sc->localSize[1] * sc->stageRadix[0] * (sc->registers_per_thread_per_radix[sc->stageRadix[0]] / sc->stageRadix[0]) > sc->fftDim) { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - sc->readToRegisters = 0; - } - else - sc->readToRegisters = 1;*/ - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 0; i < (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); i++) { - if (((sc->fftDim % sc->localSize[1]) != 0) && (i == ((uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) - 1))) { - sc->tempLen = sprintf(sc->tempStr, " if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim % sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - uint64_t id = (i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread; - if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 "+(((%s%s)/%" PRIu64 ") %% (%" PRIu64 "))+%" PRIu64 "*(%s+%" PRIu64 ")];\n", sc->maxStageSumLUT, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->stageStartSize, sc->gl_LocalInvocationID_y, i * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (!sc->inverse) { - sc->tempLen = sprintf(sc->tempStr, " mult.y = -mult.y;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - sc->tempLen = sprintf(sc->tempStr, " angle = 2 
* loc_PI * ((((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")) * (%s + %" PRIu64 ")) / %f%s;\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_LocalInvocationID_y, i * sc->localSize[1], (double)(sc->stageStartSize * sc->fftDim), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(angle);\n", cosDef); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(angle);\n", sinDef); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " mult = %s(cos(angle), sin(angle));\n", vecType); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(angle);\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->readToRegisters) { - sc->tempLen = sprintf(sc->tempStr, "\ - w.x = %s.x * mult.x - %s.y * mult.y;\n", sc->regIDs[id], sc->regIDs[id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s.y = %s.y * mult.x + %s.x * mult.y;\n", sc->regIDs[id], sc->regIDs[id], sc->regIDs[id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = w.x;\n", sc->regIDs[id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s*(%" PRIu64 "+%s) + %s;\n", sc->inoutID, sc->sharedStride, i * sc->localSize[1], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, "\ - w.x = sdata[%s].x * mult.x - sdata[%s].y * mult.y;\n", sc->inoutID, sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s].y = sdata[%s].y * 
mult.x + sdata[%s].x * mult.y;\n", sc->inoutID, sc->inoutID, sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s].x = w.x;\n", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (((sc->fftDim % sc->localSize[1]) != 0) && (i == ((uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) - 1))) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - } - - break; - } - case 2: {//single_c2c_strided - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); - if ((!sc->reorderFourStep) && (sc->inverse)) { - if (!sc->readToRegisters) { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - } - /*if (sc->localSize[1] * sc->stageRadix[0] * (sc->registers_per_thread_per_radix[sc->stageRadix[0]] / sc->stageRadix[0]) > sc->fftDim) { - res = appendBarrierVkFFT(sc, 1); - sc->readToRegisters = 0; - } - else - sc->readToRegisters = 1;*/ - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 0; i < (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); i++) { - if (((sc->fftDim % sc->localSize[1]) != 0) && (i == ((uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) - 1))) { - sc->tempLen = sprintf(sc->tempStr, " if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim % sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - uint64_t id = (i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread; - if (sc->LUT) { - sc->tempLen = 
sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + ((%s%s) %% (%" PRIu64 ")) + (%s + %" PRIu64 ") * %" PRIu64 "];\n", sc->maxStageSumLUT, sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->stageStartSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (!sc->inverse) { - sc->tempLen = sprintf(sc->tempStr, " mult.y = -mult.y;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - sc->tempLen = sprintf(sc->tempStr, " angle = 2 * loc_PI * ((((%s%s) %% (%" PRIu64 ")) * (%s + %" PRIu64 ")) / %f%s);\n", sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->gl_LocalInvocationID_y, i * sc->localSize[1], (double)(sc->stageStartSize * sc->fftDim), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(angle);\n", cosDef); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(angle);\n", sinDef); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " mult = %s(cos(angle), sin(angle));\n", vecType); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(angle);\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->readToRegisters) { - sc->tempLen = sprintf(sc->tempStr, "\ - w.x = %s.x * mult.x - %s.y * mult.y;\n", sc->regIDs[id], sc->regIDs[id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s.y = %s.y * mult.x + %s.x * mult.y;\n", sc->regIDs[id], sc->regIDs[id], sc->regIDs[id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = w.x;\n", sc->regIDs[id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - 
else { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s*(%" PRIu64 "+%s) + %s;\n", sc->inoutID, sc->sharedStride, i * sc->localSize[1], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, "\ - w.x = sdata[%s].x * mult.x - sdata[%s].y * mult.y;\n", sc->inoutID, sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s].y = sdata[%s].y * mult.x + sdata[%s].x * mult.y;\n", sc->inoutID, sc->inoutID, sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s].x = w.x;\n", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (((sc->fftDim % sc->localSize[1]) != 0) && (i == ((uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) - 1))) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - } - //appendBarrierVkFFT(sc, 1); - break; - } - } - return res; -} -static inline VkFFTResult appendReorder4StepWrite(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t reorderType) { - VkFFTResult res = VKFFT_SUCCESS; - char vecType[30]; - char LFending[4] = ""; - if (!strcmp(floatType, "float")) sprintf(LFending, "f"); -#if(VKFFT_BACKEND==0) - if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); - char cosDef[20] = "cos"; - char sinDef[20] = "sin"; - if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); -#elif(VKFFT_BACKEND==1) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, 
"double2"); - char cosDef[20] = "__cosf"; - char sinDef[20] = "__sinf"; - if (!strcmp(floatType, "double")) sprintf(LFending, "l"); -#elif(VKFFT_BACKEND==2) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - char cosDef[20] = "__cosf"; - char sinDef[20] = "__sinf"; - if (!strcmp(floatType, "double")) sprintf(LFending, "l"); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - char cosDef[20] = "native_cos"; - char sinDef[20] = "native_sin"; - //if (!strcmp(floatType, "double")) sprintf(LFending, "l"); -#endif - - uint64_t logicalRegistersPerThread = (sc->rader_generator[sc->numStages - 1] > 0) ? sc->min_registers_per_thread : sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]];// (sc->registers_per_thread % sc->stageRadix[sc->numStages - 1] == 0) ? sc->registers_per_thread : sc->min_registers_per_thread; - switch (reorderType) { - case 1: {//grouped_c2c - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); - if ((sc->stageStartSize > 1) && (!((sc->stageStartSize > 1) && (!sc->reorderFourStep) && (sc->inverse)))) { - if (!sc->writeFromRegisters) { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - } - /*if (sc->localSize[1] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] / sc->stageRadix[sc->numStages - 1]) > sc->fftDim) { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - sc->writeFromRegisters = 0; - } - else - sc->writeFromRegisters = 1;*/ - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 0; i < 
(uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); i++) { - if (((sc->fftDim % sc->localSize[1]) != 0) && (i == ((uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) - 1))) { - sc->tempLen = sprintf(sc->tempStr, " if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim % sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - uint64_t id = (i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread; - if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 "+(((%s%s)/%" PRIu64 ") %% (%" PRIu64 "))+%" PRIu64 "*(%s+%" PRIu64 ")];\n", sc->maxStageSumLUT, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->stageStartSize, sc->gl_LocalInvocationID_y, i * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (!sc->inverse) { - sc->tempLen = sprintf(sc->tempStr, " mult.y = -mult.y;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - sc->tempLen = sprintf(sc->tempStr, " angle = 2 * loc_PI * ((((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")) * (%s + %" PRIu64 ")) / %f%s;\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_LocalInvocationID_y, i * sc->localSize[1], (double)(sc->stageStartSize * sc->fftDim), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->inverse) { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(angle);\n", cosDef); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(angle);\n", sinDef); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " mult = %s(cos(angle), sin(angle));\n", vecType); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(angle);\n"); - res = VkAppendLine(sc); - if (res != 
VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(angle);\n", cosDef); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " mult.y = -%s(angle);\n", sinDef); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " mult = %s(cos(angle), sin(angle));\n", vecType); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(-angle);\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - if (sc->writeFromRegisters) { - sc->tempLen = sprintf(sc->tempStr, "\ - w.x = %s.x * mult.x - %s.y * mult.y;\n", sc->regIDs[id], sc->regIDs[id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s.y = %s.y * mult.x + %s.x * mult.y;\n", sc->regIDs[id], sc->regIDs[id], sc->regIDs[id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = w.x;\n", sc->regIDs[id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s*(%" PRIu64 "+%s) + %s;\n", sc->inoutID, sc->sharedStride, i * sc->localSize[1], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, "\ - w.x = sdata[%s].x * mult.x - sdata[%s].y * mult.y;\n", sc->inoutID, sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s].y = sdata[%s].y * mult.x + sdata[%s].x * mult.y;\n", sc->inoutID, sc->inoutID, sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s].x = w.x;\n", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) 
return res; - } - if (((sc->fftDim % sc->localSize[1]) != 0) && (i == ((uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) - 1))) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - } - break; - } - case 2: {//single_c2c_strided - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); - if (!((!sc->reorderFourStep) && (sc->inverse))) { - if (!sc->writeFromRegisters) { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - } - /*if (sc->localSize[1] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] / sc->stageRadix[sc->numStages - 1]) > sc->fftDim) { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - sc->writeFromRegisters = 0; - } - else - sc->writeFromRegisters = 1;*/ - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 0; i < (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); i++) { - if (((sc->fftDim % sc->localSize[1]) != 0) && (i == ((uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) - 1))) { - sc->tempLen = sprintf(sc->tempStr, " if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim % sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - uint64_t id = (i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread; - if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + ((%s%s) %% (%" PRIu64 ")) + (%s + %" PRIu64 ") * %" PRIu64 "];\n", sc->maxStageSumLUT, sc->gl_GlobalInvocationID_x, 
shiftX, sc->stageStartSize, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->stageStartSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (!sc->inverse) { - sc->tempLen = sprintf(sc->tempStr, " mult.y = -mult.y;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - sc->tempLen = sprintf(sc->tempStr, " angle = 2 * loc_PI * ((((%s%s) %% (%" PRIu64 ")) * (%s + %" PRIu64 ")) / %f%s);\n", sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->gl_LocalInvocationID_y, i * sc->localSize[1], (double)(sc->stageStartSize * sc->fftDim), LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->inverse) { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(angle);\n", cosDef); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(angle);\n", sinDef); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " mult = %s(cos(angle), sin(angle));\n", vecType); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(angle);\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(angle);\n", cosDef); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " mult.y = -%s(angle);\n", sinDef); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " mult = %s(cos(angle), sin(angle));\n", vecType); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(-angle);\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - if (sc->writeFromRegisters) { - sc->tempLen = sprintf(sc->tempStr, "\ - w.x = %s.x * mult.x - %s.y * 
mult.y;\n", sc->regIDs[id], sc->regIDs[id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s.y = %s.y * mult.x + %s.x * mult.y;\n", sc->regIDs[id], sc->regIDs[id], sc->regIDs[id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = w.x;\n", sc->regIDs[id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s*(%" PRIu64 "+%s) + %s;\n", sc->inoutID, sc->sharedStride, i * sc->localSize[1], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, "\ - w.x = sdata[%s].x * mult.x - sdata[%s].y * mult.y;\n", sc->inoutID, sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s].y = sdata[%s].y * mult.x + sdata[%s].x * mult.y;\n", sc->inoutID, sc->inoutID, sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s].x = w.x;\n", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (((sc->fftDim % sc->localSize[1]) != 0) && (i == ((uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) - 1))) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - } - //appendBarrierVkFFT(sc, 1); - break; - } - } - return res; -} - -static inline VkFFTResult appendBluesteinMultiplication(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t strideType, uint64_t pre_or_post_multiplication) { - VkFFTResult res = VKFFT_SUCCESS; - char 
vecType[30]; - char LFending[4] = ""; - if (!strcmp(floatType, "float")) sprintf(LFending, "f"); -#if(VKFFT_BACKEND==0) - if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); - //char cosDef[20] = "cos"; - //char sinDef[20] = "sin"; - if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); -#elif(VKFFT_BACKEND==1) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - //char cosDef[20] = "__cosf"; - //char sinDef[20] = "__sinf"; - if (!strcmp(floatType, "double")) sprintf(LFending, "l"); -#elif(VKFFT_BACKEND==2) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - //char cosDef[20] = "__cosf"; - //char sinDef[20] = "__sinf"; - if (!strcmp(floatType, "double")) sprintf(LFending, "l"); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - //char cosDef[20] = "native_cos"; - //char sinDef[20] = "native_sin"; - //if (!strcmp(floatType, "double")) sprintf(LFending, "l"); -#endif - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); - char index_x[2000] = ""; - //char index_y[2000] = ""; - //char requestBatch[100] = ""; - //char separateRegisterStore[100] = ""; - char kernelName[100] = ""; - sprintf(kernelName, "BluesteinMultiplication"); - if (!((sc->readToRegisters && (pre_or_post_multiplication == 0)) || (sc->writeFromRegisters && (pre_or_post_multiplication == 1)))) { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - } - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - - uint64_t used_registers = 1; - switch (strideType) { - case 0: case 5: case 6: case 110: case 
120: case 130: case 140: case 142: case 144: - used_registers = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - break; - case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: - used_registers = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); - break; - } - for (uint64_t i = 0; i < used_registers; i++) { - switch (strideType) { - case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: - { - if (sc->localSize[0] * ((1 + i)) > sc->fftDim) { - uint64_t current_group_cut = sc->fftDim - i * sc->localSize[0]; - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, current_group_cut); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - break; - } - case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: - { - if (sc->localSize[1] * ((1 + i)) > sc->fftDim) { - uint64_t current_group_cut = sc->fftDim - i * sc->localSize[1]; - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, current_group_cut); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - break; - } - } - switch (strideType) { - case 0: case 2: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: - { - if (sc->fftDim == sc->fft_dim_full) { - sc->tempLen = sprintf(sc->tempStr, " %s = %s + %" PRIu64 ";\n", sc->inoutID, sc->gl_LocalInvocationID_x, i * sc->localSize[0]); - - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sprintf(index_x, " (%s%s) %% (%" PRIu64 ") + %" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") * (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize, sc->gl_LocalInvocationID_y, (i)*sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize * sc->fftDim); - sc->tempLen = sprintf(sc->tempStr, " %s = %s;\n", sc->inoutID, index_x); - res = VkAppendLine(sc); - if 
(res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexInput(%s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ")%s%s);\n", sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize, requestCoordinate, requestBatch); - } - break; - } - case 1: case 111: case 121: case 131: case 141: case 143: case 145: - { - if (sc->fftDim == sc->fft_dim_full) { - sc->tempLen = sprintf(sc->tempStr, " %s = %s + %" PRIu64 ";\n", sc->inoutID, sc->gl_LocalInvocationID_y, i * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 "));\n", sc->inoutID, sc->stageStartSize, sc->gl_LocalInvocationID_y, (i)*sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim * sc->stageStartSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - break; - } - } - if ((sc->zeropadBluestein[0]) && (pre_or_post_multiplication == 0)) { - sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 ") < %" PRIu64 "){\n", sc->inoutID, sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((sc->zeropadBluestein[1]) && (pre_or_post_multiplication == 1)) { - sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 ") < %" PRIu64 "){\n", sc->inoutID, sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) 
return res; - } - sc->tempLen = sprintf(sc->tempStr, " w = %s[%s];\n", kernelName, sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //uint64_t k = 0; - if (!((sc->readToRegisters && (pre_or_post_multiplication == 0)) || (sc->writeFromRegisters && (pre_or_post_multiplication == 1)))) { - if ((strideType == 0) || (strideType == 5) || (strideType == 6) || (strideType == 110) || (strideType == 120) || (strideType == 130) || (strideType == 140) || (strideType == 142) || (strideType == 144)) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[sharedStride * %s + %s + %" PRIu64 " * %s];\n", sc->regIDs[i], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i, sc->gl_WorkGroupSize_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[%s + (%s + %" PRIu64 " * %s)*sharedStride];\n", sc->regIDs[i], sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i, sc->gl_WorkGroupSize_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - - if (sc->inverseBluestein) - res = VkMulComplex(sc, sc->regIDs[i], sc->regIDs[i], "w", sc->temp); - else - res = VkMulComplexConj(sc, sc->regIDs[i], sc->regIDs[i], "w", sc->temp); - if (res != VKFFT_SUCCESS) return res; - - if (!((sc->readToRegisters && (pre_or_post_multiplication == 0)) || (sc->writeFromRegisters && (pre_or_post_multiplication == 1)))) { - if ((strideType == 0) || (strideType == 5) || (strideType == 6) || (strideType == 110) || (strideType == 120) || (strideType == 130) || (strideType == 140) || (strideType == 142) || (strideType == 144)) { - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[sharedStride * %s + %s + %" PRIu64 " * %s] = %s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i, sc->gl_WorkGroupSize_x, sc->regIDs[i]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s + (%s + %" PRIu64 " * 
%s)*sharedStride] = %s;\n", sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, i, sc->gl_WorkGroupSize_y, sc->regIDs[i]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if ((sc->zeropadBluestein[0]) && (pre_or_post_multiplication == 0)) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((sc->zeropadBluestein[1]) && (pre_or_post_multiplication == 1)) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - switch (strideType) { - case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: - { - if (sc->localSize[0] * ((1 + i)) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - break; - } - case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: - { - if (sc->localSize[1] * ((1 + i)) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - break; - } - } - } - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - return res; -} - -static inline VkFFTResult appendFFTRaderStage(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, double stageAngle, uint64_t stageRadix, uint64_t stageID, uint64_t strided) { - VkFFTResult res = VKFFT_SUCCESS; - double double_PI = 3.1415926535897932384626433832795; - char vecType[30]; - char LFending[4] = ""; - char tempNum[100] = ""; - if (!strcmp(floatType, "float")) sprintf(LFending, "f"); -#if(VKFFT_BACKEND==0) - if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); - char cosDef[20] = "cos"; - char sinDef[20] = "sin"; - if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); 
-#elif(VKFFT_BACKEND==1) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - char cosDef[20] = "__cosf"; - char sinDef[20] = "__sinf"; - if (!strcmp(floatType, "double")) sprintf(LFending, "l"); -#elif(VKFFT_BACKEND==2) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - char cosDef[20] = "__cosf"; - char sinDef[20] = "__sinf"; - if (!strcmp(floatType, "double")) sprintf(LFending, "l"); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - char cosDef[20] = "native_cos"; - char sinDef[20] = "native_sin"; - //if (!strcmp(floatType, "double")) sprintf(LFending, "l"); -#endif - char stageNormalization[50] = ""; - uint64_t normalizationValue = 1; - if ((((sc->actualInverse) && (sc->normalize)) || (sc->convolutionStep && (stageAngle > 0))) && (stageSize == 1) && (sc->axis_upload_id == 0) && (!(sc->useBluesteinFFT && (stageAngle < 0)))) { - if ((sc->performDCT) && (sc->actualInverse)) { - if (sc->performDCT == 1) - normalizationValue = (sc->sourceFFTSize - 1) * 2; - else - normalizationValue = sc->sourceFFTSize * 2; - } - else - normalizationValue = sc->sourceFFTSize; - } - if (sc->useBluesteinFFT && (stageAngle > 0) && (stageSize == 1) && (sc->axis_upload_id == 0)) { - normalizationValue *= sc->fft_dim_full; - } - if (normalizationValue != 1) { - sprintf(stageNormalization, "%.17e%s", 1.0 / (double)(normalizationValue), LFending); - } - char convolutionInverse[10] = ""; - sc->useCoalescedLUTUploadToSM = 0; - if (sc->convolutionStep) { - if (stageAngle < 0) - sprintf(convolutionInverse, ", 0"); - else - sprintf(convolutionInverse, ", 1"); - } - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - - - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res 
= VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - //rotate the stage - char* gl_LocalInvocationID = (strided) ? sc->gl_LocalInvocationID_y : sc->gl_LocalInvocationID_x; - if (stageSize > 1) { - uint64_t num_logical_subgroups = (strided) ? sc->localSize[1] : sc->localSize[0]; - uint64_t num_logical_groups = (uint64_t)ceil((sc->fftDim) / (double)(num_logical_subgroups)); - for (uint64_t t = 0; t < num_logical_groups; t++) { - if (((1 + t) * num_logical_subgroups) > sc->fftDim) { - uint64_t current_group_cut = sc->fftDim - t * num_logical_subgroups; - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", gl_LocalInvocationID, current_group_cut); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = (%s+%" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, gl_LocalInvocationID, t * num_logical_subgroups, stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->LUT) - sc->tempLen = sprintf(sc->tempStr, " LUTId = stageInvocationID*%" PRIu64 " + %" PRIu64 ";\n", stageRadix, stageSizeSum); - else - sc->tempLen = sprintf(sc->tempStr, " angle = stageInvocationID * %.17e%s;\n", stageAngle, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+(%s+ %" PRIu64 ") / %" PRIu64 "];\n\n", sc->w, gl_LocalInvocationID, t * num_logical_subgroups, sc->fftDim / stageRadix); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (!sc->inverse) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", sc->w, sc->w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s*((%s+ %" PRIu64 ") / %" PRIu64 "));\n", sc->w, cosDef, 2.0 / stageRadix, LFending, gl_LocalInvocationID, t * 
num_logical_subgroups, sc->fftDim / stageRadix); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s*((%s+ %" PRIu64 ") / %" PRIu64 "));\n", sc->w, sinDef, 2.0 / stageRadix, LFending, gl_LocalInvocationID, t * num_logical_subgroups, sc->fftDim / stageRadix); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s*((%s+ %" PRIu64 ") / %" PRIu64 "));\n", sc->w, 2.0 / stageRadix, LFending, gl_LocalInvocationID, t * num_logical_subgroups, sc->fftDim / stageRadix); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - //sc->tempLen = sprintf(sc->tempStr, " printf(\"%%d %%f %%f \\n \", %s, %s.x, %s.y);\n\n", sc->gl_LocalInvocationID_x, sc->w, sc->w); - //res = VkAppendLine(sc); - //if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = (%s+ %" PRIu64 ");\n", sc->sdataID, gl_LocalInvocationID, t * num_logical_subgroups); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->resolveBankConflictFirstStages == 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = (%s / %" PRIu64 ") * %" PRIu64 " + %s %% %" PRIu64 ";", sc->sdataID, sc->sdataID, sc->numSharedBanks / 2, sc->numSharedBanks / 2 + 1, sc->sdataID, sc->numSharedBanks / 2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (strided) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->localSize[1] > 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, 
sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[%s];\n", sc->regIDs[0], sc->sdataID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - res = VkMulComplex(sc, sc->temp, sc->regIDs[0], sc->w, 0); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s] = %s;\n", sc->sdataID, sc->temp); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (((1 + t) * num_logical_subgroups) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - } - uint64_t raderTranspose = ((sc->currentRaderContainer->containerFFTNum < 8) || (sc->currentRaderContainer->numStages == 1) || (strided)) ? 0 : 1; - - // read x0 - to be used in the end - { - uint64_t locStageRadix = sc->currentRaderContainer->stageRadix[0]; - uint64_t logicalStoragePerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; - //uint64_t logicalRegistersPerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix];// (sc->registers_per_thread % stageRadix == 0) ? 
sc->registers_per_thread : sc->min_registers_per_thread; - uint64_t locFFTDim = sc->currentRaderContainer->containerFFTDim; //different length due to all -1 cutoffs - //uint64_t locFFTsCombined = sc->currentRaderContainer->containerFFTNum * locFFTDim; - //uint64_t logicalGroupSize = (uint64_t)ceil(locFFTsCombined / (double)logicalStoragePerThread); - uint64_t subLogicalGroupSize = (uint64_t)ceil(locFFTDim / (double)logicalStoragePerThread); // hopefully it is not <1, will fix - - if (!raderTranspose) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s %% %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, subLogicalGroupSize); //local id - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s / %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, subLogicalGroupSize); //global prime id - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s / %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //local id - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s %% %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //global prime id - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (!raderTranspose) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->raderIDx2, sc->currentRaderContainer->containerFFTNum); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->raderIDx, subLogicalGroupSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s;\n", sc->sdataID, sc->raderIDx2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (strided) { - sc->tempLen = 
sprintf(sc->tempStr, "\ - %s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->localSize[1] > 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[%s];\n", sc->x0[0], sc->sdataID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - } - // read x0 for x0+x1 - 0-element - { - uint64_t locStageRadix = sc->currentRaderContainer->stageRadix[sc->currentRaderContainer->numStages - 1]; - uint64_t logicalStoragePerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; - //uint64_t logicalRegistersPerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix];// (sc->registers_per_thread % stageRadix == 0) ? 
sc->registers_per_thread : sc->min_registers_per_thread; - uint64_t locFFTDim = sc->currentRaderContainer->containerFFTDim; //different length due to all -1 cutoffs - //uint64_t locFFTsCombined = sc->currentRaderContainer->containerFFTNum * locFFTDim; - //uint64_t logicalGroupSize = (uint64_t)ceil(locFFTsCombined / (double)logicalStoragePerThread); - uint64_t subLogicalGroupSize = (uint64_t)ceil(locFFTDim / (double)logicalStoragePerThread); // hopefully it is not <1, will fix - if (!raderTranspose) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s %% %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, subLogicalGroupSize); //local id - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s / %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, subLogicalGroupSize); //global prime id - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s / %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //local id - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s %% %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //global prime id - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (!raderTranspose) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->raderIDx2, sc->currentRaderContainer->containerFFTNum); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->raderIDx, subLogicalGroupSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s == 0) {\n", sc->raderIDx); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, "\ 
- %s = %s;\n", sc->sdataID, sc->raderIDx2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (strided) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->localSize[1] > 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[%s];\n", sc->x0[1], sc->sdataID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->currentRaderContainer->numStages == 1) { - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - } - uint64_t locStageSize = 1; - uint64_t locStageSizeSum = 0; - double locStageAngle = -double_PI; - uint64_t shift = 0; - for (uint64_t rader_stage = 0; rader_stage < sc->currentRaderContainer->numStages; rader_stage++) { - uint64_t locStageRadix = sc->currentRaderContainer->stageRadix[rader_stage]; - uint64_t logicalStoragePerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? 
sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; - uint64_t logicalRegistersPerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread; - uint64_t locFFTDim = sc->currentRaderContainer->containerFFTDim; //different length due to all -1 cutoffs - uint64_t locFFTsCombined = sc->currentRaderContainer->containerFFTNum * locFFTDim; - //uint64_t logicalGroupSize = (uint64_t)ceil(locFFTsCombined / (double)logicalStoragePerThread); - uint64_t subLogicalGroupSize = (uint64_t)ceil(locFFTDim / (double)logicalStoragePerThread); // hopefully it is not <1, will fix - uint64_t locFFTDimStride = locFFTDim; - if (shift <= sc->sharedShiftRaderFFT) locFFTDimStride = locFFTDim + shift; - //local radix - if ((rader_stage == 0) || (!raderTranspose)) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s %% %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, subLogicalGroupSize); //local id - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s / %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, subLogicalGroupSize); //global prime id - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s / %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //local id - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s %% %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //global prime id - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t k = 0; k < sc->registerBoost; k++) { - if ((rader_stage == 0) || (!raderTranspose)) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", 
sc->raderIDx2, sc->currentRaderContainer->containerFFTNum); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->raderIDx, subLogicalGroupSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t j = 0; j < logicalRegistersPerThread / locStageRadix; j++) { - if (subLogicalGroupSize * ((j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) continue; - if (subLogicalGroupSize * ((1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { - uint64_t current_group_cut = locFFTDim / locStageRadix - (j + k * logicalRegistersPerThread / locStageRadix) * subLogicalGroupSize; - - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->raderIDx, current_group_cut); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - sc->tempLen = sprintf(sc->tempStr, "\ - %s = (%s+ %" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, sc->raderIDx, (j + k * logicalRegistersPerThread / locStageRadix) * subLogicalGroupSize, locStageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->LUT) - sc->tempLen = sprintf(sc->tempStr, " LUTId = stageInvocationID + %" PRIu64 ";\n", locStageSizeSum + sc->currentRaderContainer->RaderRadixOffsetLUT); - else - sc->tempLen = sprintf(sc->tempStr, " angle = stageInvocationID * %.17e%s;\n", locStageAngle, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 0; i < locStageRadix; i++) { - uint64_t g = sc->currentRaderContainer->generator; - if (rader_stage == 0) { - if (sc->inline_rader_g_pow == 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s= g_pow_%" PRIu64 "[%s + %" PRIu64 "];\n", sc->sdataID, stageRadix, sc->raderIDx, j * subLogicalGroupSize + i * locFFTDim / locStageRadix); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else if 
(sc->inline_rader_g_pow == 2) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s= g_pow[%s + %" PRIu64 "];\n", sc->sdataID, sc->raderIDx, j * subLogicalGroupSize + i * locFFTDim / locStageRadix + sc->currentRaderContainer->raderUintLUToffset); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - %s= (%s + %" PRIu64 ");\n\ - %s=1;\n\ - while (%s != 0)\n\ - {\n\ - %s = (%s * %" PRIu64 ") %% %" PRIu64 ";\n\ - %s--;\n\ - }\n", sc->inoutID, sc->raderIDx, j * subLogicalGroupSize + i * locFFTDim / locStageRadix, sc->sdataID, sc->inoutID, sc->sdataID, sc->sdataID, g, stageRadix, sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + %s*%" PRIu64 ";\n", sc->sdataID, sc->raderIDx2, sc->sdataID, sc->currentRaderContainer->containerFFTNum); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (!raderTranspose) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + %" PRIu64 " + %s*%" PRIu64 ";\n", sc->sdataID, sc->raderIDx, j * subLogicalGroupSize + i * locFFTDim / locStageRadix + sc->fftDim / stageRadix, sc->raderIDx2, locFFTDimStride); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = (%s + %" PRIu64 ")*%" PRIu64 " + %s + %" PRIu64 ";\n", sc->sdataID, sc->raderIDx, j * subLogicalGroupSize + i * locFFTDim / locStageRadix, sc->currentRaderContainer->containerFFTNum, sc->raderIDx2, sc->fftDim / stageRadix); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - - uint64_t id = j + i * logicalRegistersPerThread / locStageRadix; - id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; - if (!strided) { - if (sc->resolveBankConflictFirstStages == 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = (%s / %" PRIu64 ") * %" PRIu64 " + %s %% %" PRIu64 ";", sc->sdataID, 
sc->sdataID, sc->numSharedBanks / 2, sc->numSharedBanks / 2 + 1, sc->sdataID, sc->numSharedBanks / 2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (strided) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->localSize[1] > 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[%s];\n", sc->regIDs[id], sc->sdataID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - char** regID = (char**)malloc(sizeof(char*) * locStageRadix); - if (regID) { - for (uint64_t i = 0; i < locStageRadix; i++) { - regID[i] = (char*)malloc(sizeof(char) * 50); - if (!regID[i]) { - for (uint64_t p = 0; p < i; p++) { - free(regID[p]); - regID[p] = 0; - } - free(regID); - regID = 0; - return VKFFT_ERROR_MALLOC_FAILED; - } - uint64_t id = j + k * logicalRegistersPerThread / locStageRadix + i * logicalStoragePerThread / locStageRadix; - id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; - sprintf(regID[i], "%s", sc->regIDs[id]); - } - res = inlineRadixKernelVkFFT(sc, floatType, uintType, locStageRadix, locStageSize, locStageSizeSum, locStageAngle, regID); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 0; i < locStageRadix; i++) { - uint64_t id = j + k * logicalRegistersPerThread / locStageRadix + i * logicalStoragePerThread / locStageRadix; - id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; - sprintf(sc->regIDs[id], "%s", regID[i]); - } - for (uint64_t i = 0; i < locStageRadix; i++) { - free(regID[i]); - regID[i] = 0; - } - free(regID); - regID = 0; - } - else 
- return VKFFT_ERROR_MALLOC_FAILED; - - if (subLogicalGroupSize * ((1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (rader_stage != sc->currentRaderContainer->numStages - 1) { - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - } - //local shuffle - char** tempID; - tempID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * sc->registerBoost); - if (tempID) { - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { - tempID[i] = (char*)malloc(sizeof(char) * 50); - if (!tempID[i]) { - for (uint64_t j = 0; j < i; j++) { - free(tempID[j]); - tempID[j] = 0; - } - free(tempID); - tempID = 0; - return VKFFT_ERROR_MALLOC_FAILED; - } - } - for (uint64_t k = 0; k < sc->registerBoost; ++k) { - uint64_t t = 0; - - if ((rader_stage == 0) || (!raderTranspose)) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->raderIDx2, sc->currentRaderContainer->containerFFTNum); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->raderIDx, subLogicalGroupSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - //last stage - save x1 - if (rader_stage == sc->currentRaderContainer->numStages - 1) { - - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s == 0) {\n", sc->raderIDx); - res = 
VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->x0[1], sc->x0[1], sc->regIDs[0]); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (!strided) { - if (rader_stage != 0) { - shift = (subLogicalGroupSize > (locFFTDim % (sc->numSharedBanks / 2))) ? subLogicalGroupSize - locFFTDim % (sc->numSharedBanks / 2) : 0; - if (shift <= sc->sharedShiftRaderFFT) locFFTDimStride = locFFTDim + shift; - } - else { - if (sc->sharedShiftRaderFFT > 0) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - sharedStride = %" PRIu64 ";\n", sc->sharedStrideRaderFFT); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - if ((rader_stage == 0) || (!raderTranspose)) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->raderIDx2, sc->currentRaderContainer->containerFFTNum); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->raderIDx, subLogicalGroupSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - shift = ((locFFTDim % (sc->numSharedBanks / 2))) ? 
0 : 1; - if (shift <= sc->sharedShiftRaderFFT) locFFTDimStride = locFFTDim + shift; - } - } - for (uint64_t j = 0; j < logicalRegistersPerThread / locStageRadix; j++) { - if (subLogicalGroupSize * ((j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) { - if (subLogicalGroupSize * ((1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { - uint64_t current_group_cut = locFFTDim / locStageRadix - (j + k * logicalRegistersPerThread / locStageRadix) * subLogicalGroupSize; - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->raderIDx, current_group_cut); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sprintf(tempNum, "%" PRIu64 "", j * subLogicalGroupSize); - res = VkAddReal(sc, sc->stageInvocationID, sc->raderIDx, tempNum); - if (res != VKFFT_SUCCESS) return res; - res = VkMovReal(sc, sc->blockInvocationID, sc->stageInvocationID); - if (res != VKFFT_SUCCESS) return res; - sprintf(tempNum, "%" PRIu64 "", locStageSize); - res = VkModReal(sc, sc->stageInvocationID, sc->stageInvocationID, tempNum); - if (res != VKFFT_SUCCESS) return res; - res = VkSubReal(sc, sc->blockInvocationID, sc->blockInvocationID, sc->stageInvocationID); - if (res != VKFFT_SUCCESS) return res; - sprintf(tempNum, "%" PRIu64 "", locStageRadix); - res = VkMulReal(sc, sc->inoutID, sc->blockInvocationID, tempNum); - if (res != VKFFT_SUCCESS) return res; - res = VkAddReal(sc, sc->inoutID, sc->inoutID, sc->stageInvocationID); - if (res != VKFFT_SUCCESS) return res; - } - /*sc->tempLen = sprintf(sc->tempStr, "\ - stageInvocationID = (gl_LocalInvocationID.x + %" PRIu64 ") %% (%" PRIu64 ");\n\ - blockInvocationID = (gl_LocalInvocationID.x + %" PRIu64 ") - stageInvocationID;\n\ - inoutID = stageInvocationID + blockInvocationID * %" PRIu64 ";\n", j * logicalGroupSize, stageSize, j * logicalGroupSize, stageRadix);*/ - - for (uint64_t i = 0; i < locStageRadix; i++) { - uint64_t id = j + k * 
logicalRegistersPerThread / locStageRadix + i * logicalStoragePerThread / locStageRadix; - id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; - sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[id]); - t++; - if (subLogicalGroupSize * ((j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) { - sprintf(tempNum, "%" PRIu64 "", i * locStageSize); - res = VkAddReal(sc, sc->combinedID, sc->inoutID, tempNum); - if (res != VKFFT_SUCCESS) return res; - - //last stage - mult rader kernel - if (rader_stage == sc->currentRaderContainer->numStages - 1) { - if (sc->inline_rader_kernel) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = r_rader_kernel_%" PRIu64 "[%s];\n\ - %s.y = i_rader_kernel_%" PRIu64 "[%s];\n", sc->w, stageRadix, sc->combinedID, sc->w, stageRadix, sc->combinedID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = twiddleLUT[%s+%" PRIu64 "];\n", sc->w, sc->combinedID, sc->currentRaderContainer->RaderKernelOffsetLUT); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - /*sc->tempLen = sprintf(sc->tempStr, "\ - printf(\"%%f %%f - %%f %%f\\n\", %s.x, %s.y, %s.x, %s.y);\n", sc->regIDs[id], sc->regIDs[id], sc->w, sc->w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res;*/ - res = VkMulComplex(sc, sc->regIDs[id], sc->regIDs[id], sc->w, sc->temp); - if (res != VKFFT_SUCCESS) return res; - } - if (rader_stage != sc->currentRaderContainer->numStages - 1) { - if (!raderTranspose) { - sprintf(tempNum, "%" PRIu64 "", sc->fftDim / stageRadix); - res = VkAddReal(sc, sc->sdataID, sc->combinedID, tempNum); - if (res != VKFFT_SUCCESS) return res; - sprintf(tempNum, "%s * %" PRIu64 "", sc->raderIDx2, locFFTDimStride); - res = VkAddReal(sc, sc->sdataID, sc->sdataID, tempNum); - if (res != VKFFT_SUCCESS) return res; - } - else { - sprintf(tempNum, "%" PRIu64 "", 
sc->currentRaderContainer->containerFFTNum); - res = VkMulReal(sc, sc->sdataID, sc->combinedID, tempNum); - if (res != VKFFT_SUCCESS) return res; - sprintf(tempNum, "%" PRIu64 "", sc->fftDim / stageRadix); - res = VkAddReal(sc, sc->sdataID, sc->sdataID, tempNum); - if (res != VKFFT_SUCCESS) return res; - sprintf(tempNum, "%s", sc->raderIDx2); - res = VkAddReal(sc, sc->sdataID, sc->sdataID, tempNum); - if (res != VKFFT_SUCCESS) return res; - } - if (!strided) { - if (0 && (locStageSize <= sc->numSharedBanks / 2) && (locFFTsCombined > sc->numSharedBanks / 2) && (sc->sharedStrideBankConflictFirstStages != locFFTDim / sc->registerBoost) && ((locFFTDim & (locFFTDim - 1)) == 0) && (locStageSize * locStageRadix != locFFTDim)) { - if (sc->resolveBankConflictFirstStages == 0) { - sc->resolveBankConflictFirstStages = 1; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %" PRIu64 ";", sc->sharedStride, sc->sharedStrideBankConflictFirstStages); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = (%s / %" PRIu64 ") * %" PRIu64 " + %s %% %" PRIu64 ";", sc->sdataID, sc->sdataID, sc->numSharedBanks / 2, sc->numSharedBanks / 2 + 1, sc->sdataID, sc->numSharedBanks / 2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - } - else { - if (sc->resolveBankConflictFirstStages == 1) { - sc->resolveBankConflictFirstStages = 0; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %" PRIu64 ";", sc->sharedStride, sc->sharedStrideReadWriteConflict); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - if (strided) { - res = VkMulReal(sc, sc->sdataID, sc->sdataID, sc->sharedStride); - if (res != VKFFT_SUCCESS) return res; - res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->localSize[1] > 1) { - res = VkMulReal(sc, sc->combinedID, sc->gl_LocalInvocationID_y, sc->sharedStride); - if (res != VKFFT_SUCCESS) 
return res; - res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->combinedID); - if (res != VKFFT_SUCCESS) return res; - } - } - //sprintf(sc->sdataID, "sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "", i * stageSize); - res = VkSharedStore(sc, sc->sdataID, sc->regIDs[id]); - if (res != VKFFT_SUCCESS) return res; - } - } - /*sc->tempLen = sprintf(sc->tempStr, "\ -sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s;\n", i * stageSize, sc->regIDs[id], stageNormalization);*/ - } - if (subLogicalGroupSize * ((j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) { - if (subLogicalGroupSize * ((1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t j = logicalRegistersPerThread; j < sc->registers_per_thread; j++) { - sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[t + k * sc->registers_per_thread]); - t++; - } - t = 0; - } - if (rader_stage != sc->currentRaderContainer->numStages - 1) { - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { - //printf("0 - %s\n", resID[i]); - sprintf(sc->regIDs[i], "%s", tempID[i]); - //sprintf(resID[i], "%s", tempID[i]); - //printf("1 - %s\n", resID[i]); - } - } - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { - free(tempID[i]); - tempID[i] = 0; - } - free(tempID); - tempID = 0; - } - else - return VKFFT_ERROR_MALLOC_FAILED; - - if (rader_stage > 0) { - switch (locStageRadix) { - case 2: - locStageSizeSum += locStageSize; - break; - case 3: - locStageSizeSum += locStageSize * 2; - break; - case 4: - locStageSizeSum += locStageSize * 2; - break; - case 5: - locStageSizeSum += locStageSize * 4; - break; - case 6: - 
locStageSizeSum += locStageSize * 5; - break; - case 7: - locStageSizeSum += locStageSize * 6; - break; - case 8: - locStageSizeSum += locStageSize * 3; - break; - case 9: - locStageSizeSum += locStageSize * 8; - break; - case 10: - locStageSizeSum += locStageSize * 9; - break; - case 11: - locStageSizeSum += locStageSize * 10; - break; - case 12: - locStageSizeSum += locStageSize * 11; - break; - case 13: - locStageSizeSum += locStageSize * 12; - break; - case 14: - locStageSizeSum += locStageSize * 13; - break; - case 15: - locStageSizeSum += locStageSize * 14; - break; - case 16: - locStageSizeSum += locStageSize * 4; - break; - case 32: - locStageSizeSum += locStageSize * 5; - break; - default: - locStageSizeSum += locStageSize * (locStageRadix); - break; - } - } - locStageSize *= locStageRadix; - locStageAngle /= locStageRadix; - - if (rader_stage != sc->currentRaderContainer->numStages - 1) { - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - } - } - - //iFFT - locStageSize = 1; - locStageAngle = double_PI; - locStageSizeSum = 0; - for (int64_t rader_stage = sc->currentRaderContainer->numStages - 1; rader_stage >= 0; rader_stage--) { - uint64_t locStageRadix = sc->currentRaderContainer->stageRadix[rader_stage]; - uint64_t logicalStoragePerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? 
sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; - uint64_t logicalRegistersPerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread; - uint64_t locFFTDim = sc->currentRaderContainer->containerFFTDim; //different length due to all -1 cutoffs - uint64_t locFFTsCombined = sc->currentRaderContainer->containerFFTNum * locFFTDim; - //uint64_t logicalGroupSize = (uint64_t)ceil(locFFTsCombined / (double)logicalStoragePerThread); - uint64_t subLogicalGroupSize = (uint64_t)ceil(locFFTDim / (double)logicalStoragePerThread); // hopefully it is not <1, will fix - uint64_t locFFTDimStride = locFFTDim; //different length due to all -1 cutoffs - if (shift <= sc->sharedShiftRaderFFT) locFFTDimStride = locFFTDim + shift; - //local radix - if (!raderTranspose) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s %% %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, subLogicalGroupSize); //local id - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s / %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, subLogicalGroupSize); //global prime id - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s / %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //local id - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s %% %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //global prime id - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t k = 0; k < sc->registerBoost; k++) { - if (!raderTranspose) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->raderIDx2, 
sc->currentRaderContainer->containerFFTNum); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->raderIDx, subLogicalGroupSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t j = 0; j < logicalRegistersPerThread / locStageRadix; j++) { - if (subLogicalGroupSize * ((j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) continue; - if (subLogicalGroupSize * ((1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { - uint64_t current_group_cut = locFFTDim / locStageRadix - (j + k * logicalRegistersPerThread / locStageRadix) * subLogicalGroupSize; - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->raderIDx, current_group_cut); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - sc->tempLen = sprintf(sc->tempStr, "\ - %s = (%s+ %" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, sc->raderIDx, (j + k * logicalRegistersPerThread / locStageRadix) * subLogicalGroupSize, locStageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->LUT) - sc->tempLen = sprintf(sc->tempStr, " LUTId = stageInvocationID + %" PRIu64 ";\n", locStageSizeSum + sc->currentRaderContainer->RaderRadixOffsetLUTiFFT); - else - sc->tempLen = sprintf(sc->tempStr, " angle = stageInvocationID * %.17e%s;\n", locStageAngle, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (rader_stage != (int64_t)sc->currentRaderContainer->numStages - 1) { - for (uint64_t i = 0; i < locStageRadix; i++) { - uint64_t id = j + i * logicalRegistersPerThread / locStageRadix; - id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; - if (!raderTranspose) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = (%s + %" PRIu64 ") + %s*%" PRIu64 ";\n", sc->sdataID, sc->raderIDx, j * 
subLogicalGroupSize + i * locFFTDim / locStageRadix + sc->fftDim / stageRadix, sc->raderIDx2, locFFTDimStride); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = (%s + %" PRIu64 ")*%" PRIu64 " + %s + %" PRIu64 ";\n", sc->sdataID, sc->raderIDx, j * subLogicalGroupSize + i * locFFTDim / locStageRadix, sc->currentRaderContainer->containerFFTNum, sc->raderIDx2, sc->fftDim / stageRadix); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (!strided) { - if (sc->resolveBankConflictFirstStages == 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = (%s / %" PRIu64 ") * %" PRIu64 " + %s %% %" PRIu64 ";", sc->sdataID, sc->sdataID, sc->numSharedBanks / 2, sc->numSharedBanks / 2 + 1, sc->sdataID, sc->numSharedBanks / 2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (strided) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->localSize[1] > 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[%s];\n", sc->regIDs[id], sc->sdataID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - char** regID = (char**)malloc(sizeof(char*) * locStageRadix); - if (regID) { - for (uint64_t i = 0; i < locStageRadix; i++) { - regID[i] = (char*)malloc(sizeof(char) * 50); - if (!regID[i]) { - for (uint64_t p = 0; p < i; p++) { - free(regID[p]); - regID[p] = 0; - } - free(regID); - regID = 0; - return VKFFT_ERROR_MALLOC_FAILED; - } - uint64_t id = j + k * logicalRegistersPerThread / locStageRadix + i * logicalStoragePerThread / locStageRadix; - id = (id / 
logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; - sprintf(regID[i], "%s", sc->regIDs[id]); - } - res = inlineRadixKernelVkFFT(sc, floatType, uintType, locStageRadix, locStageSize, locStageSizeSum, locStageAngle, regID); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 0; i < locStageRadix; i++) { - uint64_t id = j + k * logicalRegistersPerThread / locStageRadix + i * logicalStoragePerThread / locStageRadix; - id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; - sprintf(sc->regIDs[id], "%s", regID[i]); - } - for (uint64_t i = 0; i < locStageRadix; i++) { - free(regID[i]); - regID[i] = 0; - } - free(regID); - regID = 0; - } - else - return VKFFT_ERROR_MALLOC_FAILED; - if (subLogicalGroupSize * ((1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - if (!strided) { - if (rader_stage == 0) { - if (sc->sharedStrideRaderFFT > 0) { - sc->tempLen = sprintf(sc->tempStr, "\ - sharedStride = %" PRIu64 ";\n", sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - //local shuffle - char** tempID; - tempID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * sc->registerBoost); - if (tempID) { - for (uint64_t i = 0; i < sc->registers_per_thread * 
sc->registerBoost; i++) { - tempID[i] = (char*)malloc(sizeof(char) * 50); - if (!tempID[i]) { - for (uint64_t j = 0; j < i; j++) { - free(tempID[j]); - tempID[j] = 0; - } - free(tempID); - tempID = 0; - return VKFFT_ERROR_MALLOC_FAILED; - } - } - for (uint64_t k = 0; k < sc->registerBoost; ++k) { - uint64_t t = 0; - if (!raderTranspose) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->raderIDx2, sc->currentRaderContainer->containerFFTNum); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->raderIDx, subLogicalGroupSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (rader_stage == 0) { - res = VkMovReal(sc, sc->stageInvocationID, sc->raderIDx2); - if (res != VKFFT_SUCCESS) return res; - res = VkMovReal(sc, sc->blockInvocationID, sc->stageInvocationID); - if (res != VKFFT_SUCCESS) return res; - sprintf(tempNum, "%" PRIu64 "", stageSize); - res = VkModReal(sc, sc->stageInvocationID, sc->stageInvocationID, tempNum); - if (res != VKFFT_SUCCESS) return res; - res = VkSubReal(sc, sc->blockInvocationID, sc->blockInvocationID, sc->stageInvocationID); - if (res != VKFFT_SUCCESS) return res; - sprintf(tempNum, "%" PRIu64 "", stageRadix); - res = VkMulReal(sc, sc->raderIDx2, sc->blockInvocationID, tempNum); - if (res != VKFFT_SUCCESS) return res; - res = VkAddReal(sc, sc->raderIDx2, sc->raderIDx2, sc->stageInvocationID); - if (res != VKFFT_SUCCESS) return res; - } - if (!strided) { - if (rader_stage != (int64_t)sc->currentRaderContainer->numStages - 1) { - shift = (subLogicalGroupSize > (locFFTDim % (sc->numSharedBanks / 2))) ? subLogicalGroupSize - locFFTDim % (sc->numSharedBanks / 2) : 0; - if (shift <= sc->sharedShiftRaderFFT) locFFTDimStride = locFFTDim + shift; - } - else { - shift = ((locFFTDim % (sc->numSharedBanks / 2))) ? 
0 : 1; - if (shift <= sc->sharedShiftRaderFFT) locFFTDimStride = locFFTDim + shift; - } - } - for (uint64_t j = 0; j < logicalRegistersPerThread / locStageRadix; j++) { - if (subLogicalGroupSize * ((j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) { - if (subLogicalGroupSize * ((1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { - uint64_t current_group_cut = locFFTDim / locStageRadix - (j + k * logicalRegistersPerThread / locStageRadix) * subLogicalGroupSize; - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->raderIDx, current_group_cut); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sprintf(tempNum, "%" PRIu64 "", j * subLogicalGroupSize); - res = VkAddReal(sc, sc->stageInvocationID, sc->raderIDx, tempNum); - if (res != VKFFT_SUCCESS) return res; - res = VkMovReal(sc, sc->blockInvocationID, sc->stageInvocationID); - if (res != VKFFT_SUCCESS) return res; - sprintf(tempNum, "%" PRIu64 "", locStageSize); - res = VkModReal(sc, sc->stageInvocationID, sc->stageInvocationID, tempNum); - if (res != VKFFT_SUCCESS) return res; - res = VkSubReal(sc, sc->blockInvocationID, sc->blockInvocationID, sc->stageInvocationID); - if (res != VKFFT_SUCCESS) return res; - sprintf(tempNum, "%" PRIu64 "", locStageRadix); - res = VkMulReal(sc, sc->inoutID, sc->blockInvocationID, tempNum); - if (res != VKFFT_SUCCESS) return res; - res = VkAddReal(sc, sc->inoutID, sc->inoutID, sc->stageInvocationID); - if (res != VKFFT_SUCCESS) return res; - - } - /*sc->tempLen = sprintf(sc->tempStr, "\ - stageInvocationID = (gl_LocalInvocationID.x + %" PRIu64 ") %% (%" PRIu64 ");\n\ - blockInvocationID = (gl_LocalInvocationID.x + %" PRIu64 ") - stageInvocationID;\n\ - inoutID = stageInvocationID + blockInvocationID * %" PRIu64 ";\n", j * logicalGroupSize, stageSize, j * logicalGroupSize, stageRadix);*/ - - for (uint64_t i = 0; i < locStageRadix; i++) { - uint64_t id = j + k * 
logicalRegistersPerThread / locStageRadix + i * logicalStoragePerThread / locStageRadix; - id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; - sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[id]); - t++; - if (subLogicalGroupSize * ((j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) { - sprintf(tempNum, "%" PRIu64 "", i * locStageSize); - res = VkAddReal(sc, sc->combinedID, sc->inoutID, tempNum); - if (res != VKFFT_SUCCESS) return res; - - if (rader_stage == 0) { - locFFTDimStride = locFFTDim; - //last stage - add x0 - - uint64_t g = sc->currentRaderContainer->generator; - if (sc->inline_rader_g_pow == 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s= g_pow_%" PRIu64 "[%" PRIu64 "-%s];\n", sc->combinedID, stageRadix, stageRadix - 1, sc->combinedID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else if (sc->inline_rader_g_pow == 2) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s= g_pow[%" PRIu64 "-%s];\n", sc->combinedID, stageRadix - 1 + sc->currentRaderContainer->raderUintLUToffset, sc->combinedID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - %s= (%" PRIu64 "-%s);\n\ - %s=1;\n\ - while (%s != 0)\n\ - {\n\ - %s = (%s * %" PRIu64 ") %% %" PRIu64 ";\n\ - %s--;\n\ - }\n", sc->inoutID, stageRadix - 1, sc->combinedID, sc->sdataID, sc->inoutID, sc->combinedID, sc->combinedID, g, stageRadix, sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->inverse) { - sprintf(tempNum, "(%" PRIu64 "-%s)*%" PRIu64 "", (stageRadix), sc->combinedID, stageSize); - } - else { - sprintf(tempNum, "%s*%" PRIu64 "", sc->combinedID, stageSize); - } - res = VkAddReal(sc, sc->sdataID, sc->raderIDx2, tempNum); - if (res != VKFFT_SUCCESS) return res; - //normalization is in kernel - /*sprintf(tempNum, "%.17e%s", 1.0 / locFFTDim, LFending); - res = 
VkMulComplexNumber(sc, sc->regIDs[id], sc->regIDs[id], tempNum); - if (res != VKFFT_SUCCESS) return res;*/ - res = VkAddComplex(sc, sc->regIDs[id], sc->regIDs[id], sc->x0[0]); - if (res != VKFFT_SUCCESS) return res; - - } - else { - if (!raderTranspose) { - sprintf(tempNum, "%" PRIu64 "", sc->fftDim / stageRadix); - res = VkAddReal(sc, sc->sdataID, sc->combinedID, tempNum); - if (res != VKFFT_SUCCESS) return res; - sprintf(tempNum, "%s * %" PRIu64 "", sc->raderIDx2, locFFTDimStride); - res = VkAddReal(sc, sc->sdataID, sc->sdataID, tempNum); - if (res != VKFFT_SUCCESS) return res; - } - else { - sprintf(tempNum, "%" PRIu64 "", sc->currentRaderContainer->containerFFTNum); - res = VkMulReal(sc, sc->sdataID, sc->combinedID, tempNum); - if (res != VKFFT_SUCCESS) return res; - sprintf(tempNum, "%" PRIu64 "", sc->fftDim / stageRadix); - res = VkAddReal(sc, sc->sdataID, sc->sdataID, tempNum); - if (res != VKFFT_SUCCESS) return res; - sprintf(tempNum, "%s", sc->raderIDx2); - res = VkAddReal(sc, sc->sdataID, sc->sdataID, tempNum); - if (res != VKFFT_SUCCESS) return res; - } - } - if (!strided) { - if (0 && (locStageSize <= sc->numSharedBanks / 2) && (locFFTsCombined > sc->numSharedBanks / 2) && (sc->sharedStrideBankConflictFirstStages != locFFTDim / sc->registerBoost) && ((locFFTDim & (locFFTDim - 1)) == 0) && (locStageSize * locStageRadix != locFFTDim)) { - if (sc->resolveBankConflictFirstStages == 0) { - sc->resolveBankConflictFirstStages = 1; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %" PRIu64 ";", sc->sharedStride, sc->sharedStrideBankConflictFirstStages); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = (%s / %" PRIu64 ") * %" PRIu64 " + %s %% %" PRIu64 ";", sc->sdataID, sc->sdataID, sc->numSharedBanks / 2, sc->numSharedBanks / 2 + 1, sc->sdataID, sc->numSharedBanks / 2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - } - else { - if (sc->resolveBankConflictFirstStages == 1) 
{ - sc->resolveBankConflictFirstStages = 0; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %" PRIu64 ";", sc->sharedStride, sc->sharedStrideReadWriteConflict); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - if (strided) { - res = VkMulReal(sc, sc->sdataID, sc->sdataID, sc->sharedStride); - if (res != VKFFT_SUCCESS) return res; - res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->localSize[1] > 1) { - res = VkMulReal(sc, sc->combinedID, sc->gl_LocalInvocationID_y, sc->sharedStride); - if (res != VKFFT_SUCCESS) return res; - res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->combinedID); - if (res != VKFFT_SUCCESS) return res; - } - } - //sprintf(sc->sdataID, "sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "", i * stageSize); - if ((((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle > 0))) && (rader_stage == 0)) { - if (strcmp(stageNormalization, "")) { - res = VkMulComplexNumber(sc, sc->regIDs[id], sc->regIDs[id], stageNormalization); - } - if (res != VKFFT_SUCCESS) return res; - } - res = VkSharedStore(sc, sc->sdataID, sc->regIDs[id]); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " printf(\"%%d %%f %%f \\n \", %s, %s.x, %s.y);\n\n", sc->sdataID, sc->regIDs[id], sc->regIDs[id]); - //res = VkAppendLine(sc); - //if (res != VKFFT_SUCCESS) return res; - } - /*sc->tempLen = sprintf(sc->tempStr, "\ -sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s;\n", i * stageSize, sc->regIDs[id], stageNormalization);*/ - } - if (subLogicalGroupSize * ((j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) <= locFFTDim) { - if (subLogicalGroupSize * ((1 + j + k * logicalRegistersPerThread / locStageRadix) * locStageRadix) > locFFTDim) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != 
VKFFT_SUCCESS) return res; - } - } - } - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t j = logicalRegistersPerThread; j < sc->registers_per_thread; j++) { - sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[t + k * sc->registers_per_thread]); - t++; - } - t = 0; - } - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { - //printf("0 - %s\n", resID[i]); - sprintf(sc->regIDs[i], "%s", tempID[i]); - //sprintf(resID[i], "%s", tempID[i]); - //printf("1 - %s\n", resID[i]); - } - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { - free(tempID[i]); - tempID[i] = 0; - } - free(tempID); - tempID = 0; - } - else - return VKFFT_ERROR_MALLOC_FAILED; - - if (rader_stage < (int64_t)sc->currentRaderContainer->numStages - 1) { - switch (locStageRadix) { - case 2: - locStageSizeSum += locStageSize; - break; - case 3: - locStageSizeSum += locStageSize * 2; - break; - case 4: - locStageSizeSum += locStageSize * 2; - break; - case 5: - locStageSizeSum += locStageSize * 4; - break; - case 6: - locStageSizeSum += locStageSize * 5; - break; - case 7: - locStageSizeSum += locStageSize * 6; - break; - case 8: - locStageSizeSum += locStageSize * 3; - break; - case 9: - locStageSizeSum += locStageSize * 8; - break; - case 10: - locStageSizeSum += locStageSize * 9; - break; - case 11: - locStageSizeSum += locStageSize * 10; - break; - case 12: - locStageSizeSum += locStageSize * 11; - break; - case 13: - locStageSizeSum += locStageSize * 12; - break; - case 14: - locStageSizeSum += locStageSize * 13; - break; - case 15: - locStageSizeSum += locStageSize * 14; - break; - case 16: - locStageSizeSum += locStageSize * 4; - break; - case 32: - locStageSizeSum += locStageSize * 5; - break; - default: - locStageSizeSum += locStageSize * (locStageRadix); - break; - } - } - locStageSize *= locStageRadix; - locStageAngle /= locStageRadix; - res = 
VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - } - - { - uint64_t locStageRadix = sc->currentRaderContainer->stageRadix[sc->currentRaderContainer->numStages - 1]; - uint64_t logicalStoragePerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; - //uint64_t logicalRegistersPerThread = sc->currentRaderContainer->registers_per_thread_per_radix[locStageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread; - uint64_t locFFTDim = sc->currentRaderContainer->containerFFTDim; //different length due to all -1 cutoffs - //uint64_t locFFTsCombined = sc->currentRaderContainer->containerFFTNum * locFFTDim; - //uint64_t logicalGroupSize = (uint64_t)ceil(locFFTsCombined / (double)logicalStoragePerThread); - uint64_t subLogicalGroupSize = (uint64_t)ceil(locFFTDim / (double)logicalStoragePerThread); // hopefully it is not <1, will fix - if (!raderTranspose) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s %% %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, subLogicalGroupSize); //local id - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s / %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, subLogicalGroupSize); //global prime id - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s / %" PRIu64 ";\n", sc->raderIDx, 
gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //local id - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s %% %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, sc->currentRaderContainer->containerFFTNum); //global prime id - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (!raderTranspose) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->raderIDx2, sc->currentRaderContainer->containerFFTNum); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->raderIDx, subLogicalGroupSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s == 0) {\n", sc->raderIDx); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - res = VkMovReal(sc, sc->stageInvocationID, sc->raderIDx2); - if (res != VKFFT_SUCCESS) return res; - res = VkMovReal(sc, sc->blockInvocationID, sc->stageInvocationID); - if (res != VKFFT_SUCCESS) return res; - sprintf(tempNum, "%" PRIu64 "", stageSize); - res = VkModReal(sc, sc->stageInvocationID, sc->stageInvocationID, tempNum); - if (res != VKFFT_SUCCESS) return res; - res = VkSubReal(sc, sc->blockInvocationID, sc->blockInvocationID, sc->stageInvocationID); - if (res != VKFFT_SUCCESS) return res; - sprintf(tempNum, "%" PRIu64 "", stageRadix); - res = VkMulReal(sc, sc->raderIDx2, sc->blockInvocationID, tempNum); - if (res != VKFFT_SUCCESS) return res; - res = VkAddReal(sc, sc->raderIDx2, sc->raderIDx2, sc->stageInvocationID); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s;\n", sc->sdataID, sc->raderIDx2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (strided) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, 
sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->localSize[1] > 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle > 0))) { - if (strcmp(stageNormalization, "")) { - res = VkMulComplexNumber(sc, sc->x0[1], sc->x0[1], stageNormalization); - } - if (res != VKFFT_SUCCESS) return res; - } - - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s] = %s;\n", sc->sdataID, sc->x0[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - } - return res; -} -static inline VkFFTResult appendMultRaderStage(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, double stageAngle, uint64_t stageRadix, uint64_t stageID, uint64_t strided) { - VkFFTResult res = VKFFT_SUCCESS; - double double_PI = 3.1415926535897932384626433832795; - char vecType[30]; - char LFending[4] = ""; - char tempNum[50] = ""; - if (!strcmp(floatType, "float")) sprintf(LFending, "f"); -#if(VKFFT_BACKEND==0) - if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); - char cosDef[20] = "cos"; - char sinDef[20] = "sin"; - if (!strcmp(floatType, "double")) 
sprintf(LFending, "LF"); -#elif(VKFFT_BACKEND==1) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - char cosDef[20] = "__cosf"; - char sinDef[20] = "__sinf"; - if (!strcmp(floatType, "double")) sprintf(LFending, "l"); -#elif(VKFFT_BACKEND==2) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - char cosDef[20] = "__cosf"; - char sinDef[20] = "__sinf"; - if (!strcmp(floatType, "double")) sprintf(LFending, "l"); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - char cosDef[20] = "native_cos"; - char sinDef[20] = "native_sin"; - //if (!strcmp(floatType, "double")) sprintf(LFending, "l"); -#endif - char stageNormalization[50] = ""; - uint64_t normalizationValue = 1; - if ((((sc->actualInverse) && (sc->normalize)) || (sc->convolutionStep && (stageAngle > 0))) && (stageSize == 1) && (sc->axis_upload_id == 0) && (!(sc->useBluesteinFFT && (stageAngle < 0)))) { - if ((sc->performDCT) && (sc->actualInverse)) { - if (sc->performDCT == 1) - normalizationValue = (sc->sourceFFTSize - 1) * 2; - else - normalizationValue = sc->sourceFFTSize * 2; - } - else - normalizationValue = sc->sourceFFTSize; - } - if (sc->useBluesteinFFT && (stageAngle > 0) && (stageSize == 1) && (sc->axis_upload_id == 0)) { - normalizationValue *= sc->fft_dim_full; - } - if (normalizationValue != 1) { - sprintf(stageNormalization, "%.17e%s", 1.0 / (double)(normalizationValue), LFending); - } - char convolutionInverse[10] = ""; - if (sc->convolutionStep) { - if (stageAngle < 0) - sprintf(convolutionInverse, ", 0"); - else - sprintf(convolutionInverse, ", 1"); - } - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = 
VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - - uint64_t num_logical_subgroups = (strided) ? sc->localSize[1] / ((stageRadix + 1) / 2) : sc->localSize[0] / ((stageRadix + 1) / 2); - uint64_t num_logical_groups = (uint64_t)ceil((sc->fftDim / stageRadix) / (double)(num_logical_subgroups)); - uint64_t require_cutoff_check = ((sc->fftDim == (num_logical_subgroups * num_logical_groups * stageRadix))) ? 0 : 1; - uint64_t require_cutoff_check2; - char* gl_LocalInvocationID = (strided) ? sc->gl_LocalInvocationID_y : sc->gl_LocalInvocationID_x; - - if (strided) { - require_cutoff_check2 = ((sc->localSize[1] % ((stageRadix + 1) / 2)) == 0) ? 0 : 1; - } - else { - require_cutoff_check2 = ((sc->localSize[0] % ((stageRadix + 1) / 2)) == 0) ? 0 : 1; - } - sc->tempLen = sprintf(sc->tempStr, " %s= %s %% %" PRIu64 ";\n", sc->raderIDx, gl_LocalInvocationID, (stageRadix + 1) / 2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s= %s / %" PRIu64 ";\n", sc->raderIDx2, gl_LocalInvocationID, (stageRadix + 1) / 2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t j = 0; j < 1; j++) { - if (stageSize > 1) { - if (require_cutoff_check2) { - if (strided) { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->localSize[1] - sc->localSize[1] % ((stageRadix + 1) / 2)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->localSize[0] - sc->localSize[0] % ((stageRadix + 1) / 2)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - for (uint64_t t = 0; t < num_logical_groups; t++) { - if ((require_cutoff_check) && (t == num_logical_groups - 1)) { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", 
sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = (%s+%" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, sc->raderIDx2, t * num_logical_subgroups, stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->LUT) - sc->tempLen = sprintf(sc->tempStr, " LUTId = stageInvocationID*%" PRIu64 " + %" PRIu64 ";\n", stageRadix, stageSizeSum); - else - sc->tempLen = sprintf(sc->tempStr, " angle = stageInvocationID * %.17e%s;\n", stageAngle, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%s];\n\n", sc->w, sc->raderIDx); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (!sc->inverse) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", sc->w, sc->w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s*(%s));\n", sc->w, cosDef, 2.0 / stageRadix, LFending, sc->raderIDx); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s*(%s));\n", sc->w, sinDef, 2.0 / stageRadix, LFending, sc->raderIDx); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s*(%s));\n", sc->w, 2.0 / stageRadix, LFending, sc->raderIDx); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - //sc->tempLen = sprintf(sc->tempStr, " printf(\"%%d %%f %%f \\n \", %s, %s.x, %s.y);\n\n", sc->gl_LocalInvocationID_x, sc->w, sc->w); - //res = 
VkAppendLine(sc); - //if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = (%s) * %" PRIu64 " + %s + %" PRIu64 ";\n", sc->sdataID, sc->raderIDx, sc->fftDim / stageRadix, sc->raderIDx2, t * num_logical_subgroups); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (strided) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->localSize[1] > 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[%s];\n", sc->regIDs[0], sc->sdataID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - res = VkMulComplex(sc, sc->temp, sc->regIDs[0], sc->w, 0); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s] = %s;\n", sc->sdataID, sc->temp); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", sc->raderIDx, (stageRadix - 1) / 2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = (%s+%" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, sc->raderIDx2, t * num_logical_subgroups, stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->LUT) - sc->tempLen = sprintf(sc->tempStr, " LUTId = stageInvocationID*%" PRIu64 " + %" PRIu64 ";\n", stageRadix, stageSizeSum); - else - sc->tempLen = sprintf(sc->tempStr, " angle = stageInvocationID * %.17e%s;\n", stageAngle, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, " %s = twiddleLUT[LUTId+%s+%" PRIu64 
"];\n\n", sc->w, sc->raderIDx, (stageRadix + 1) / 2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (!sc->inverse) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", sc->w, sc->w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(angle*%.17e%s*(%" PRIu64 " + %s));\n", sc->w, cosDef, 2.0 / stageRadix, LFending, (stageRadix + 1) / 2, sc->raderIDx); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(angle*%.17e%s*(%" PRIu64 " + %s));\n", sc->w, sinDef, 2.0 / stageRadix, LFending, (stageRadix + 1) / 2, sc->raderIDx); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(angle*%.17e%s*(%" PRIu64 " + %s));\n", sc->w, 2.0 / stageRadix, LFending, (stageRadix + 1) / 2, sc->raderIDx); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - //sc->tempLen = sprintf(sc->tempStr, " printf(\"%%d %%f %%f \\n \", %s, %s.x, %s.y);\n\n", sc->gl_LocalInvocationID_x, sc->w, sc->w); - //res = VkAppendLine(sc); - //if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = (%" PRIu64 " + %s) * %" PRIu64 " + %s + %" PRIu64 ";\n", sc->sdataID, (stageRadix + 1) / 2, sc->raderIDx, sc->fftDim / stageRadix, sc->raderIDx2, t * num_logical_subgroups); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (strided) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->localSize[1] > 1) { - sc->tempLen = 
sprintf(sc->tempStr, "\ - %s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[%s];\n", sc->regIDs[0], sc->sdataID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - res = VkMulComplex(sc, sc->temp, sc->regIDs[0], sc->w, 0); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s] = %s;\n", sc->sdataID, sc->temp); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if ((require_cutoff_check) && (t == num_logical_groups - 1)) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (require_cutoff_check2) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - } - if (require_cutoff_check2) { - if (strided) { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->localSize[1] - sc->localSize[1] % ((stageRadix + 1) / 2)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->localSize[0] - sc->localSize[0] % ((stageRadix + 1) / 2)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - //save 
x0 - for (uint64_t t = 0; t < num_logical_groups; t++) { - if ((require_cutoff_check) && (t == num_logical_groups - 1)) { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (strided) { - if (sc->localSize[0] > 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = (%s + %" PRIu64 ") * sharedStride + %s;\n", sc->sdataID, sc->raderIDx2, t * num_logical_subgroups, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + %" PRIu64 ";\n", sc->sdataID, sc->raderIDx2, t * num_logical_subgroups); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (sc->localSize[1] > 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + %" PRIu64 " + sharedStride * %s;\n", sc->sdataID, sc->raderIDx2, t * num_logical_subgroups, sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + %" PRIu64 ";\n", sc->sdataID, sc->raderIDx2, t * num_logical_subgroups); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[%s];\n", sc->x0[t], sc->sdataID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if ((require_cutoff_check) && (t == num_logical_groups - 1)) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - //generator index + shuffle - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s>0){\n", sc->raderIDx); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - uint64_t g = sc->currentRaderContainer->generator; - if (sc->inline_rader_g_pow == 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s= g_pow_%" PRIu64 "[%s-1];\n", 
sc->sdataID, stageRadix, sc->raderIDx); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else if (sc->inline_rader_g_pow == 2) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s= g_pow[%s-1+%" PRIu64 "];\n", sc->sdataID, sc->raderIDx, sc->currentRaderContainer->raderUintLUToffset); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - %s= (%s-1);\n\ - %s=1;\n\ - while (%s != 0)\n\ - {\n\ - %s = (%s * %" PRIu64 ") %% %" PRIu64 ";\n\ - %s--;\n\ - }\n", sc->inoutID, sc->raderIDx, sc->sdataID, sc->inoutID, sc->sdataID, sc->sdataID, g, stageRadix, sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t t = 0; t < num_logical_groups; t++) { - if ((require_cutoff_check) && (t == num_logical_groups - 1)) { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s * %" PRIu64 " + %s + %" PRIu64 ";\n", sc->combinedID, sc->sdataID, sc->fftDim / stageRadix, sc->raderIDx2, t * num_logical_subgroups); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (strided) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s * sharedStride + %s;\n", sc->combinedID, sc->combinedID, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->localSize[1] > 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + sharedStride * %s;\n", sc->combinedID, sc->combinedID, sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[%s];\n", sc->regIDs[t * 2], sc->combinedID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if ((require_cutoff_check) && (t == num_logical_groups - 1)) { 
- sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->inline_rader_g_pow == 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s= g_pow_%" PRIu64 "[%s+ %" PRIu64 "];\n", sc->sdataID, stageRadix, sc->raderIDx, (stageRadix - 1) / 2 - 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else if (sc->inline_rader_g_pow == 2) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s= g_pow[%s+ %" PRIu64 "];\n", sc->sdataID, sc->raderIDx, (stageRadix - 1) / 2 - 1 + sc->currentRaderContainer->raderUintLUToffset); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - %s= (%s+ %" PRIu64 ");\n\ - %s=1;\n\ - while (%s != 0)\n\ - {\n\ - %s = (%s * %" PRIu64 ") %% %" PRIu64 ";\n\ - %s--;\n\ - }\n", sc->inoutID, sc->raderIDx, (stageRadix - 1) / 2 - 1, sc->sdataID, sc->inoutID, sc->sdataID, sc->sdataID, g, stageRadix, sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - for (uint64_t t = 0; t < num_logical_groups; t++) { - if ((require_cutoff_check) && (t == num_logical_groups - 1)) { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s * %" PRIu64 " + %s + %" PRIu64 ";\n", sc->combinedID, sc->sdataID, sc->fftDim / stageRadix, sc->raderIDx2, t * num_logical_subgroups); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (strided) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s * sharedStride + %s;\n", sc->combinedID, sc->combinedID, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->localSize[1] > 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + sharedStride * %s;\n", sc->combinedID, 
sc->combinedID, sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[%s];\n", sc->regIDs[2 * t + 1], sc->combinedID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if ((require_cutoff_check) && (t == num_logical_groups - 1)) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (require_cutoff_check2) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - //load deconv kernel - if (!sc->inline_rader_kernel) { - for (uint64_t t = 0; t < (uint64_t)ceil((stageRadix - 1) / ((double)(sc->localSize[0] * sc->localSize[1]))); t++) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + %s * %" PRIu64 " + %" PRIu64 ";\n", sc->combinedID, sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, sc->localSize[0], t * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (t == ((uint64_t)ceil((stageRadix - 1) / ((double)(sc->localSize[0] * sc->localSize[1]))) - 1)) { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s < %" PRIu64 "){\n", sc->combinedID, (stageRadix - 1)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = twiddleLUT[%s+%" PRIu64 "];\n", sc->w, sc->combinedID, sc->currentRaderContainer->RaderKernelOffsetLUT); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->inverse) { - sc->tempLen = 
sprintf(sc->tempStr, " %s.y = -%s.y;\n", sc->w, sc->w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s+%" PRIu64 "] = %s;\n", sc->combinedID, sc->RaderKernelOffsetShared[stageID], sc->w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->inline_rader_g_pow == 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s= g_pow_%" PRIu64 "[%" PRIu64 " - %s];\n", sc->sdataID, stageRadix, stageRadix - 1, sc->combinedID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else if (sc->inline_rader_g_pow == 2) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s= g_pow[%" PRIu64 " - %s];\n", sc->sdataID, stageRadix - 1 + sc->currentRaderContainer->raderUintLUToffset, sc->combinedID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - %s= (%" PRIu64 " - %s);\n\ - %s=1;\n\ - while (%s != 0)\n\ - {\n\ - %s = (%s * %" PRIu64 ") %% %" PRIu64 ";\n\ - %s--;\n\ - }\n", sc->inoutID, stageRadix - 1, sc->combinedID, sc->sdataID, sc->inoutID, sc->sdataID, sc->sdataID, g, stageRadix, sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = %s(%.17e%s*%s);\n", sc->w, cosDef, 2.0 * double_PI / stageRadix, LFending, sc->sdataID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->inverse) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s(%.17e%s*%s);\n", sc->w, sinDef, 2.0 * double_PI / stageRadix, LFending, sc->sdataID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s(%.17e%s*%s);\n", sc->w, sinDef, 2.0 * double_PI / stageRadix, LFending, sc->sdataID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - //sc->tempLen = sprintf(sc->tempStr, " w = %s(cos(angle*%.17e), 
sin(angle*%.17e));\n\n", vecType, 2.0 * i / radix, 2.0 * i / radix); - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " %s = sincos_20(%.17e%s*%s);\n", sc->w, 2.0 * double_PI / stageRadix, LFending, sc->sdataID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (!sc->inverse) { - sc->tempLen = sprintf(sc->tempStr, " %s.y = -%s.y;\n", sc->w, sc->w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s+%" PRIu64 "] = %s;\n", sc->combinedID, sc->RaderKernelOffsetShared[stageID], sc->w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (t == ((uint64_t)ceil((stageRadix - 1) / ((double)(sc->localSize[0] * sc->localSize[1]))) - 1)) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - - if (require_cutoff_check2) { - if (strided) { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->localSize[1] - sc->localSize[1] % ((stageRadix + 1) / 2)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->localSize[0] - sc->localSize[0] % ((stageRadix + 1) / 2)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - //x0 is ready - - //no subgroups - /* { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s==0){\n", sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = 0;\n\ - %s.y = 0;\n", sc->regIDs[0], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = 
sprintf(sc->tempStr, "\ - %s = 0;\n", sc->combinedID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->localSize[1] > 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + sharedStride * (%s);\n", sc->sdataID, sc->combinedID, sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - while(%s<%" PRIu64 "){\n\ - %s.x += sdata[%s].x;\n\ - %s.y += sdata[%s].y;\n\ - %s++; %s++;}\n", sc->combinedID, stageRadix, sc->regIDs[0], sc->sdataID, sc->regIDs[0], sc->sdataID, sc->combinedID, sc->sdataID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - while(%s<%" PRIu64 "){\n\ - %s.x += sdata[%s].x;\n\ - %s.y += sdata[%s].y;\n\ - %s++;}\n", sc->combinedID, stageRadix, sc->regIDs[0], sc->combinedID, sc->regIDs[0], sc->combinedID, sc->combinedID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = 0;\n", sc->sdataID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->localSize[1] > 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + sharedStride * (%s);\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s] = %s;\n", sc->sdataID, sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - }*/ - //subgroups - /* { - uint64_t numGroupsQuant = ((((sc->localSize[0] * sc->localSize[1] * sc->localSize[2]) % sc->warpSize) == 0) || (sc->numSubgroups == 1)) ? 
sc->numSubgroups : sc->numSubgroups - 1; - if (numGroupsQuant != sc->numSubgroups) { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", sc->gl_SubgroupID, numGroupsQuant); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t t = 0; t < (uint64_t)ceil(sc->localSize[1] / (double)numGroupsQuant); t++) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = 0;\n", sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s.y = 0;\n", sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - uint64_t quant = (sc->warpSize < (sc->localSize[0] * sc->localSize[1] * sc->localSize[2])) ? sc->warpSize : (sc->localSize[0] * sc->localSize[1] * sc->localSize[2]); - for (uint64_t t2 = 0; t2 < (uint64_t)ceil(stageRadix / (double)quant); t2++) { - if ((t == (uint64_t)ceil(sc->localSize[1] / (double)numGroupsQuant) - 1) && (sc->localSize[1] > 1) && ((sc->localSize[1] % numGroupsQuant) != 0)) { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", sc->gl_SubgroupID, sc->localSize[1] % numGroupsQuant); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (t2 == (uint64_t)ceil(stageRadix / (double)quant) - 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", sc->gl_SubgroupInvocationID, stageRadix % quant); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = (%s+%" PRIu64 ") * %" PRIu64 ";\n", sc->sdataID, sc->gl_SubgroupInvocationID, t2 * quant, sc->fftDim / stageRadix); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->localSize[1] > 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + sharedStride * (%s+%" PRIu64 ");\n", sc->sdataID, sc->sdataID, sc->gl_SubgroupID, t * numGroupsQuant); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = 
sdata[%s];\n", sc->regIDs[1], sc->sdataID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, sc->regIDs[0], sc->regIDs[0], sc->regIDs[1]); - if (res != VKFFT_SUCCESS) return res; - if (t2 == (uint64_t)ceil(stageRadix / (double)quant) - 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((t == (uint64_t)ceil(sc->localSize[1] / (double)numGroupsQuant) - 1) && (sc->localSize[1] > 1) && ((sc->localSize[1] % numGroupsQuant) != 0)) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - - res = VkSubgroupAdd(sc, sc->regIDs[0], sc->regIDs[0], 1); - if (res != VKFFT_SUCCESS) return res; - - if ((t == (uint64_t)ceil(sc->localSize[1] / (double)numGroupsQuant) - 1) && (sc->localSize[1] > 1) && ((sc->localSize[1] % numGroupsQuant) != 0)) { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", sc->gl_SubgroupID, sc->localSize[1] % numGroupsQuant); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s==0){\n", sc->gl_SubgroupInvocationID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = 0;\n", sc->sdataID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->localSize[1] > 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + sharedStride * (%s+%" PRIu64 ");\n", sc->sdataID, sc->sdataID, sc->gl_SubgroupID, t * numGroupsQuant); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s] = %s;\n", sc->sdataID, sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if ((t == (uint64_t)ceil(sc->localSize[1] / 
(double)numGroupsQuant) - 1) && (sc->localSize[1] > 1) && ((sc->localSize[1] % numGroupsQuant) != 0)) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (numGroupsQuant != sc->numSubgroups) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - }*/ - - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s > 0){\n", sc->raderIDx); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t t = 0; t < num_logical_groups; t++) { - if ((require_cutoff_check) && (t == num_logical_groups - 1)) { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = (%s) * %" PRIu64 " + %s + %" PRIu64 ";\n", sc->sdataID, sc->raderIDx, sc->fftDim / stageRadix, sc->raderIDx2, t * num_logical_subgroups); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (strided) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + %" PRIu64 " * sharedStride;\n", sc->combinedID, sc->sdataID, (stageRadix - 1) / 2 * sc->fftDim / stageRadix); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->localSize[1] > 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + %" PRIu64 ";\n", sc->combinedID, sc->sdataID, (stageRadix - 1) / 2 * sc->fftDim / stageRadix); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) 
return res; - } - - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = %s.x - %s.x;\n", sc->temp, sc->regIDs[2 * t], sc->regIDs[2 * t + 1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x += %s.x;\n", sc->regIDs[2 * t], sc->regIDs[2 * t + 1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s.y = %s.y + %s.y;\n", sc->temp, sc->regIDs[2 * t], sc->regIDs[2 * t + 1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s.y -= %s.y;\n", sc->regIDs[2 * t], sc->regIDs[2 * t + 1]); - res = VkAppendLine(sc); - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s] = %s;\n", sc->sdataID, sc->regIDs[2 * t]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s] = %s;\n", sc->combinedID, sc->temp); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if ((require_cutoff_check) && (t == num_logical_groups - 1)) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - //sc->tempLen = sprintf(sc->tempStr, " printf(\"%%d %%f %%f %%f %%f \\n \", %s, %s.x, %s.y, %s.x, %s.y);\n\n", sc->gl_LocalInvocationID_x, sc->regIDs[0], sc->regIDs[0], sc->temp, sc->temp); - //res = VkAppendLine(sc); - //if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (require_cutoff_check2) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - 
if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - if (require_cutoff_check2) { - if (strided) { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->localSize[1] - sc->localSize[1] % ((stageRadix + 1) / 2)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->localSize[0] - sc->localSize[0] % ((stageRadix + 1) / 2)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s < %" PRIu64 "){\n", sc->raderIDx, (stageRadix + 1) / 2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t t = 0; t < num_logical_groups; t++) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = 0;\n", sc->regIDs[2 * t + 1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s.y = 0;\n", sc->regIDs[2 * t + 1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s == %" PRIu64 "){\n", sc->raderIDx, (stageRadix - 1) / 2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = 1; %s.y = 0;\n", sc->w, sc->w); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 0; i < (stageRadix - 1) / 2; i++) { - - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s < %" PRIu64 "){\n", sc->raderIDx, (stageRadix - 1) / 2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, "\ - %s = ((%" PRIu64 "+%s) %% %" PRIu64 ");\n", sc->sdataID, stageRadix - 1 - i, sc->raderIDx, (stageRadix - 1)); - res 
= VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->inline_rader_kernel) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = r_rader_kernel_%" PRIu64 "[%s];\n\ - %s.y = i_rader_kernel_%" PRIu64 "[%s];\n", sc->w, stageRadix, sc->sdataID, sc->w, stageRadix, sc->sdataID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[%s+%" PRIu64 "];\n", sc->w, sc->sdataID, sc->RaderKernelOffsetShared[stageID]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - for (uint64_t t = 0; t < num_logical_groups; t++) { -#if(VKFFT_BACKEND != 2) //AMD compiler fix - if ((require_cutoff_check) && (t == num_logical_groups - 1)) { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } -#endif - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s+ %" PRIu64 ";\n", sc->sdataID, sc->raderIDx2, t * num_logical_subgroups + (1 + i) * sc->fftDim / stageRadix); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (strided) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s * sharedStride + %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->localSize[1] > 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[%s];\n", sc->regIDs[0], sc->sdataID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (strided) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s += %" PRIu64 "*sharedStride;\n", 
sc->sdataID, (stageRadix - 1) / 2 * sc->fftDim / stageRadix); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - %s += %" PRIu64 " ;\n", sc->sdataID, (stageRadix - 1) / 2 * sc->fftDim / stageRadix); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[%s];\n", sc->temp, sc->sdataID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#if(VKFFT_BACKEND == 2) //AMD compiler fix - if ((require_cutoff_check) && (t == num_logical_groups - 1)) { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s>=%" PRIu64 "){%s.x =0;%s.y=0;%s.x=0;%s.y=0;}\n", sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups, sc->temp, sc->temp, sc->regIDs[0], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } -#endif - sprintf(tempNum, "%s", sc->x0[t]); - res = VkFMA3Complex(sc, tempNum, sc->regIDs[2 * t + 1], sc->regIDs[0], sc->w, sc->temp); - if (res != VKFFT_SUCCESS) return res; -#if(VKFFT_BACKEND != 2) //AMD compiler fix - if ((require_cutoff_check) && (t == num_logical_groups - 1)) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } -#endif -#if(VKFFT_BACKEND == 2) //AMD compiler fix - if ((uint64_t)ceil((sc->localSize[0] * sc->localSize[1]) / ((double)sc->warpSize)) * sc->warpSize * (1 + sc->registers_per_thread + sc->usedLocRegs) > 2048) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (require_cutoff_check2) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != 
VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - - if (require_cutoff_check2) { - if (strided) { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->localSize[1] - sc->localSize[1] % ((stageRadix + 1) / 2)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->localSize[0] - sc->localSize[0] % ((stageRadix + 1) / 2)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s < %" PRIu64 "){\n", sc->raderIDx, (stageRadix + 1) / 2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } -#endif - } -#if(VKFFT_BACKEND == 2) //AMD compiler fix - if ((uint64_t)ceil((sc->localSize[0] * sc->localSize[1]) / ((double)sc->warpSize)) * sc->warpSize * (1 + sc->registers_per_thread + sc->usedLocRegs) <= 2048) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (require_cutoff_check2) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - - if (require_cutoff_check2) { - if (strided) { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->localSize[1] - sc->localSize[1] % ((stageRadix + 1) / 2)); 
- res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->localSize[0] - sc->localSize[0] % ((stageRadix + 1) / 2)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s < %" PRIu64 "){\n", sc->raderIDx, (stageRadix + 1) / 2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } -#endif - } - for (uint64_t t = 0; t < num_logical_groups; t++) { - if ((require_cutoff_check) && (t == num_logical_groups - 1)) { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sprintf(tempNum, "%s", sc->x0[t]); - - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = %s.x-%s.x;\n\ - %s.y = %s.y+%s.y;\n", sc->regIDs[2 * t], tempNum, sc->regIDs[2 * t + 1], sc->regIDs[2 * t], tempNum, sc->regIDs[2 * t + 1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s.x = %s.x+%s.x;\n\ - %s.y = %s.y-%s.y;\n", sc->regIDs[2 * t + 1], tempNum, sc->regIDs[2 * t + 1], sc->regIDs[2 * t + 1], tempNum, sc->regIDs[2 * t + 1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if ((require_cutoff_check) && (t == num_logical_groups - 1)) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (require_cutoff_check2) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) 
return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - - if (require_cutoff_check2) { - if (strided) { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->localSize[1] - sc->localSize[1] % ((stageRadix + 1) / 2)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->localSize[0] - sc->localSize[0] % ((stageRadix + 1) / 2)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s < %" PRIu64 "){\n", sc->raderIDx, (stageRadix - 1) / 2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " printf(\"%%d %%f %%f \\n \", %s, %s.x, %s.y);\n\n", sc->gl_LocalInvocationID_x, sc->regIDs[1], sc->regIDs[1]); - //res = VkAppendLine(sc); - //if (res != VKFFT_SUCCESS) return res; - if (sc->inline_rader_g_pow == 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s= g_pow_%" PRIu64 "[%" PRIu64 "-%s];\n", sc->sdataID, stageRadix, stageRadix - 1, sc->raderIDx); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else if (sc->inline_rader_g_pow == 2) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s= g_pow[%" PRIu64 "-%s];\n", sc->sdataID, stageRadix - 1 + sc->currentRaderContainer->raderUintLUToffset, sc->raderIDx); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - %s= (%" PRIu64 "-%s);\n\ - %s=1;\n\ - while (%s != 0)\n\ - {\n\ - %s = (%s * %" PRIu64 ") %% %" PRIu64 ";\n\ - %s--;\n\ - }\n", sc->inoutID, stageRadix - 1, sc->raderIDx, sc->sdataID, sc->inoutID, sc->sdataID, sc->sdataID, g, stageRadix, sc->inoutID); 
- res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = 0;\n", sc->sdataID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t t = 0; t < num_logical_groups; t++) { - if ((require_cutoff_check) && (t == num_logical_groups - 1)) { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - sprintf(tempNum, "%" PRIu64 "", t * num_logical_subgroups); - res = VkAddReal(sc, sc->combinedID, sc->raderIDx2, tempNum); - if (res != VKFFT_SUCCESS) return res; - sprintf(tempNum, "%" PRIu64 "", stageSize); - res = VkModReal(sc, sc->stageInvocationID, sc->combinedID, tempNum); - if (res != VKFFT_SUCCESS) return res; - res = VkSubReal(sc, sc->blockInvocationID, sc->combinedID, sc->stageInvocationID); - if (res != VKFFT_SUCCESS) return res; - sprintf(tempNum, "%" PRIu64 "", stageRadix); - res = VkMulReal(sc, sc->inoutID, sc->blockInvocationID, tempNum); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + %s * %" PRIu64 " + %s;\n", sc->combinedID, sc->inoutID, sc->sdataID, stageSize, sc->stageInvocationID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (strided) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s * sharedStride + %s;\n", sc->combinedID, sc->combinedID, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->localSize[1] > 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + sharedStride * %s;\n", sc->combinedID, sc->combinedID, sc->gl_LocalInvocationID_y); - res = 
VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle > 0))) { - if (strcmp(stageNormalization, "")) { - res = VkMulComplexNumber(sc, sc->regIDs[2 * t], sc->regIDs[2 * t], stageNormalization); - } - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s]=%s;\n", sc->combinedID, sc->regIDs[2 * t]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if ((require_cutoff_check) && (t == num_logical_groups - 1)) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s < %" PRIu64 "){\n", sc->raderIDx, (stageRadix - 1) / 2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->inline_rader_g_pow == 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s= g_pow_%" PRIu64 "[%" PRIu64 "-%s];\n", sc->sdataID, stageRadix, (stageRadix - 1) / 2, sc->raderIDx); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else if (sc->inline_rader_g_pow == 2) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s= g_pow[%" PRIu64 "-%s];\n", sc->sdataID, (stageRadix - 1) / 2 + sc->currentRaderContainer->raderUintLUToffset, sc->raderIDx); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - %s= (%" PRIu64 "-%s);\n\ - %s=1;\n\ - while (%s != 0)\n\ - {\n\ - %s = (%s * %" PRIu64 ") %% %" PRIu64 ";\n\ - %s--;\n\ - }\n", sc->inoutID, (stageRadix - 1) / 2, sc->raderIDx, sc->sdataID, sc->inoutID, sc->sdataID, sc->sdataID, g, stageRadix, sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t t = 0; t < num_logical_groups; t++) { - if ((require_cutoff_check) && (t == num_logical_groups - 1)) { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", 
sc->raderIDx2, sc->fftDim / stageRadix - t * num_logical_subgroups); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - sprintf(tempNum, "%" PRIu64 "", t * num_logical_subgroups); - res = VkAddReal(sc, sc->combinedID, sc->raderIDx2, tempNum); - if (res != VKFFT_SUCCESS) return res; - sprintf(tempNum, "%" PRIu64 "", stageSize); - res = VkModReal(sc, sc->stageInvocationID, sc->combinedID, tempNum); - if (res != VKFFT_SUCCESS) return res; - res = VkSubReal(sc, sc->blockInvocationID, sc->combinedID, sc->stageInvocationID); - if (res != VKFFT_SUCCESS) return res; - sprintf(tempNum, "%" PRIu64 "", stageRadix); - res = VkMulReal(sc, sc->inoutID, sc->blockInvocationID, tempNum); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + %s * %" PRIu64 " + %s;\n", sc->combinedID, sc->inoutID, sc->sdataID, stageSize, sc->stageInvocationID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (strided) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s * sharedStride + %s;\n", sc->combinedID, sc->combinedID, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->localSize[1] > 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + sharedStride * %s;\n", sc->combinedID, sc->combinedID, sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle > 0))) { - if (strcmp(stageNormalization, "")) { - res = VkMulComplexNumber(sc, sc->regIDs[2 * t + 1], sc->regIDs[2 * t + 1], stageNormalization); - } - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s]=%s;\n", sc->combinedID, sc->regIDs[2 * t + 1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if ((require_cutoff_check) && (t == num_logical_groups - 1)) { - sc->tempLen = 
sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (require_cutoff_check2) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - } - } - - return res; -} - -static inline VkFFTResult appendRadixStageNonStrided(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, double stageAngle, uint64_t stageRadix) { - VkFFTResult res = VKFFT_SUCCESS; - char vecType[30]; - char LFending[4] = ""; - if (!strcmp(floatType, "float")) sprintf(LFending, "f"); -#if(VKFFT_BACKEND==0) - if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); - if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); -#elif(VKFFT_BACKEND==1) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - if (!strcmp(floatType, "double")) sprintf(LFending, "l"); -#elif(VKFFT_BACKEND==2) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - if (!strcmp(floatType, "double")) sprintf(LFending, "l"); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - //if (!strcmp(floatType, "double")) sprintf(LFending, "l"); -#endif - - char convolutionInverse[10] = ""; - if (sc->convolutionStep) { - if (stageAngle < 0) - 
sprintf(convolutionInverse, ", 0"); - else - sprintf(convolutionInverse, ", 1"); - } - uint64_t logicalStoragePerThread = sc->registers_per_thread_per_radix[stageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; - uint64_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[stageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread; - uint64_t logicalGroupSize = (uint64_t)ceil(sc->fftDim / (double)logicalStoragePerThread); - if ((!((sc->readToRegisters == 1) && (stageSize == 1) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle > 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && ((sc->localSize[0] * logicalStoragePerThread > sc->fftDim) || (stageSize > 1) || ((sc->localSize[1] > 1) && (!(sc->performR2C && (sc->actualInverse)))) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle > 0)) || (sc->performDCT))) - { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - } - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - - //upload second stage of LUT to sm - uint64_t numLUTelementsStage = 0; - switch (stageRadix) { - case 2: - numLUTelementsStage = 1; - break; - case 4: - numLUTelementsStage = 2; - break; - case 8: - numLUTelementsStage = 3; - break; - case 16: - numLUTelementsStage = 4; - break; - case 32: - numLUTelementsStage = 5; - break; - default: - if (stageRadix < sc->fixMinRaderPrimeMult) - numLUTelementsStage = stageRadix - 1; - else - numLUTelementsStage = stageRadix; - break; - } - if ((sc->LUT) && (stageSize > 1) && ((((numLUTelementsStage >= 4) && (sc->fftDim >= 1024)) || (((numLUTelementsStage >= 3) && 
(sc->fftDim < 1024)))) || (logicalRegistersPerThread / stageRadix > 1)) && (sc->registerBoost == 1) && (stageSize < sc->warpSize)) - sc->useCoalescedLUTUploadToSM = 1; - else - sc->useCoalescedLUTUploadToSM = 0; - - for (uint64_t k = 0; k < sc->registerBoost; k++) { - if (logicalGroupSize != sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, logicalGroupSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) { - if (logicalGroupSize * ((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) continue; - if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) { - uint64_t current_group_cut = sc->fftDim / stageRadix - (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize; - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, current_group_cut); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - sc->tempLen = sprintf(sc->tempStr, "\ - %s = (%s+ %" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, sc->gl_LocalInvocationID_x, (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize, stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->LUT) - sc->tempLen = sprintf(sc->tempStr, " LUTId = stageInvocationID + %" PRIu64 ";\n", stageSizeSum); - else - sc->tempLen = sprintf(sc->tempStr, " angle = stageInvocationID * %.17e%s;\n", stageAngle, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if ((!((sc->readToRegisters == 1) && (stageSize == 1) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle > 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && ((sc->registerBoost == 1) && ((sc->localSize[0] * logicalStoragePerThread > 
sc->fftDim) || (stageSize > 1) || ((sc->localSize[1] > 1) && (!(sc->performR2C && (sc->actualInverse)))) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle > 0)) || (sc->performDCT)))) { - //if(sc->readToRegisters==0){ - for (uint64_t i = 0; i < stageRadix; i++) { - uint64_t id = j + i * logicalRegistersPerThread / stageRadix; - id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; - - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + %" PRIu64 ";\n", sc->sdataID, sc->gl_LocalInvocationID_x, j * logicalGroupSize + i * sc->fftDim / stageRadix); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->resolveBankConflictFirstStages == 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = (%s / %" PRIu64 ") * %" PRIu64 " + %s %% %" PRIu64 ";", sc->sdataID, sc->sdataID, sc->numSharedBanks / 2, sc->numSharedBanks / 2 + 1, sc->sdataID, sc->numSharedBanks / 2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - if (sc->localSize[1] > 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + sharedStride * %s;\n", sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[%s];\n", sc->regIDs[id], sc->sdataID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (!sc->useCoalescedLUTUploadToSM) { - char** regID = (char**)malloc(sizeof(char*) * stageRadix); - if (regID) { - for (uint64_t i = 0; i < stageRadix; i++) { - regID[i] = (char*)malloc(sizeof(char) * 50); - if (!regID[i]) { - for (uint64_t p = 0; p < i; p++) { - free(regID[p]); - regID[p] = 0; - } - free(regID); - regID = 0; - return VKFFT_ERROR_MALLOC_FAILED; - } - uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix; - id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % 
logicalRegistersPerThread; - sprintf(regID[i], "%s", sc->regIDs[id]); - /*if(j + i * logicalStoragePerThread / stageRadix < logicalRegistersPerThread) - sprintf(regID[i], "%s", sc->regIDs[j + i * logicalStoragePerThread / stageRadix]); - else - sprintf(regID[i], "%" PRIu64 "[%" PRIu64 "]", (j + i * logicalStoragePerThread / stageRadix)/ logicalRegistersPerThread, (j + i * logicalStoragePerThread / stageRadix) % logicalRegistersPerThread);*/ - - } - res = inlineRadixKernelVkFFT(sc, floatType, uintType, stageRadix, stageSize, stageSizeSum, stageAngle, regID); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 0; i < stageRadix; i++) { - uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix; - id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; - sprintf(sc->regIDs[id], "%s", regID[i]); - } - for (uint64_t i = 0; i < stageRadix; i++) { - free(regID[i]); - regID[i] = 0; - } - free(regID); - regID = 0; - } - else - return VKFFT_ERROR_MALLOC_FAILED; - } - - if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (logicalGroupSize != sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->useCoalescedLUTUploadToSM) { - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - - sc->useCoalescedLUTUploadToSM = 1; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s;\n", sc->sdataID, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->localSize[1] > 1) { - sc->tempLen = 
sprintf(sc->tempStr, "\ - %s = %s + %" PRIu64 "*%s;\n", sc->sdataID, sc->sdataID, sc->localSize[0], sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - for (uint64_t i = 0; i < (uint64_t)ceil(numLUTelementsStage * stageSize / ((double)sc->localSize[0] * sc->localSize[1])); i++) { - if (i > 0) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + %" PRIu64 ";\n", sc->sdataID, sc->sdataID, sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (i == (uint64_t)ceil(numLUTelementsStage * stageSize / ((double)sc->localSize[0] * sc->localSize[1])) - 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", sc->sdataID, numLUTelementsStage * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s] = twiddleLUT[%s+%" PRIu64 "];\n", sc->sdataID, sc->sdataID, (stageSizeSum)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (i == (uint64_t)ceil(numLUTelementsStage * stageSize / ((double)sc->localSize[0] * sc->localSize[1])) - 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - if (logicalGroupSize != sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, logicalGroupSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) { - if (logicalGroupSize * ((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) continue; - if (logicalGroupSize * ((1 + j + k * 
logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) { - uint64_t current_group_cut = sc->fftDim / stageRadix - (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize; - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, current_group_cut); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - char** regID = (char**)malloc(sizeof(char*) * stageRadix); - if (regID) { - for (uint64_t i = 0; i < stageRadix; i++) { - regID[i] = (char*)malloc(sizeof(char) * 50); - if (!regID[i]) { - for (uint64_t p = 0; p < i; p++) { - free(regID[p]); - regID[p] = 0; - } - free(regID); - regID = 0; - return VKFFT_ERROR_MALLOC_FAILED; - } - uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix; - id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; - sprintf(regID[i], "%s", sc->regIDs[id]); - /*if(j + i * logicalStoragePerThread / stageRadix < logicalRegistersPerThread) - sprintf(regID[i], "%s", sc->regIDs[j + i * logicalStoragePerThread / stageRadix]); - else - sprintf(regID[i], "%" PRIu64 "[%" PRIu64 "]", (j + i * logicalStoragePerThread / stageRadix)/ logicalRegistersPerThread, (j + i * logicalStoragePerThread / stageRadix) % logicalRegistersPerThread);*/ - - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = (%s+ %" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, sc->gl_LocalInvocationID_x, (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize, stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (!sc->useCoalescedLUTUploadToSM) { - if (sc->LUT) - sc->tempLen = sprintf(sc->tempStr, " LUTId = stageInvocationID + %" PRIu64 ";\n", stageSizeSum); - else - sc->tempLen = sprintf(sc->tempStr, " angle = stageInvocationID * %.17e%s;\n", stageAngle, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = 
inlineRadixKernelVkFFT(sc, floatType, uintType, stageRadix, stageSize, stageSizeSum, stageAngle, regID); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 0; i < stageRadix; i++) { - uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix; - id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; - sprintf(sc->regIDs[id], "%s", regID[i]); - } - for (uint64_t i = 0; i < stageRadix; i++) { - free(regID[i]); - regID[i] = 0; - } - free(regID); - regID = 0; - } - else - return VKFFT_ERROR_MALLOC_FAILED; - if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (logicalGroupSize != sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((stageSize == 1) && (sc->cacheShuffle)) { - for (uint64_t i = 0; i < logicalRegistersPerThread; i++) { - uint64_t id = i + k * logicalRegistersPerThread; - id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; - sc->tempLen = sprintf(sc->tempStr, "\ - shuffle[%" PRIu64 "]=%s;\n", i, sc->regIDs[id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t i = 0; i < logicalRegistersPerThread; i++) { - uint64_t id = i + k * logicalRegistersPerThread; - id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; - sc->tempLen = sprintf(sc->tempStr, "\ - %s=shuffle[(%" PRIu64 "+tshuffle)%%(%" PRIu64 ")];\n", sc->regIDs[id], i, logicalRegistersPerThread); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res 
!= VKFFT_SUCCESS) return res; - return res; -} -static inline VkFFTResult appendRadixStageStrided(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, double stageAngle, uint64_t stageRadix) { - VkFFTResult res = VKFFT_SUCCESS; - char vecType[30]; - char LFending[4] = ""; - if (!strcmp(floatType, "float")) sprintf(LFending, "f"); -#if(VKFFT_BACKEND==0) - if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); - if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); -#elif(VKFFT_BACKEND==1) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - if (!strcmp(floatType, "double")) sprintf(LFending, "l"); -#elif(VKFFT_BACKEND==2) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - if (!strcmp(floatType, "double")) sprintf(LFending, "l"); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - //if (!strcmp(floatType, "double")) sprintf(LFending, "l"); -#endif - - char convolutionInverse[10] = ""; - if (sc->convolutionStep) { - if (stageAngle < 0) - sprintf(convolutionInverse, ", 0"); - else - sprintf(convolutionInverse, ", 1"); - } - uint64_t logicalStoragePerThread = sc->registers_per_thread_per_radix[stageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; - uint64_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[stageRadix];// (sc->registers_per_thread % stageRadix == 0) ? 
sc->registers_per_thread : sc->min_registers_per_thread; - uint64_t logicalGroupSize = (uint64_t)ceil(sc->fftDim / (double)logicalStoragePerThread); - if ((!((sc->readToRegisters == 1) && (stageSize == 1) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle > 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && (((sc->axis_id == 0) && (sc->axis_upload_id == 0) && (!(sc->performR2C && (sc->actualInverse)))) || (sc->localSize[1] * logicalStoragePerThread > sc->fftDim) || (stageSize > 1) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle > 0)) || (sc->performDCT))) - { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - } - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - - - //upload second stage of LUT to sm - uint64_t numLUTelementsStage = 0; - switch (stageRadix) { - case 2: - numLUTelementsStage = 1; - break; - case 4: - numLUTelementsStage = 2; - break; - case 8: - numLUTelementsStage = 3; - break; - case 16: - numLUTelementsStage = 4; - break; - case 32: - numLUTelementsStage = 5; - break; - default: - if (stageRadix < sc->fixMinRaderPrimeMult) - numLUTelementsStage = stageRadix - 1; - else - numLUTelementsStage = stageRadix; - break; - } - if ((sc->LUT) && (stageSize > 1) && ((((numLUTelementsStage >= 4) && (sc->fftDim >= 1024)) || (((numLUTelementsStage >= 3) && (sc->fftDim < 1024)))) || (logicalRegistersPerThread / stageRadix > 1)) && (sc->registerBoost == 1) && (stageSize < sc->warpSize)) - sc->useCoalescedLUTUploadToSM = 1; - else - sc->useCoalescedLUTUploadToSM = 0; - - - for (uint64_t k = 0; k < sc->registerBoost; k++) { - if (logicalGroupSize != sc->localSize[1]) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, logicalGroupSize); - res = 
VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) { - if (logicalGroupSize * ((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) continue; - if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) { - uint64_t current_group_cut = sc->fftDim / stageRadix - (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize; - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, current_group_cut); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = (%s+ %" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, sc->gl_LocalInvocationID_y, (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize, stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->LUT) - sc->tempLen = sprintf(sc->tempStr, " LUTId = stageInvocationID + %" PRIu64 ";\n", stageSizeSum); - else - sc->tempLen = sprintf(sc->tempStr, " angle = stageInvocationID * %.17e%s;\n", stageAngle, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if ((!((sc->readToRegisters == 1) && (stageSize == 1) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle > 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && ((sc->registerBoost == 1) && (((sc->axis_id == 0) && (sc->axis_upload_id == 0) && (!(sc->performR2C && (sc->actualInverse)))) || (sc->localSize[1] * logicalStoragePerThread > sc->fftDim) || (stageSize > 1) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle > 0)) || (sc->performDCT)))) { - for (uint64_t i = 0; i < stageRadix; i++) { - uint64_t id = j + i * logicalRegistersPerThread / stageRadix; - id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % 
logicalRegistersPerThread; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[%s*(%s+%" PRIu64 ")+%s];\n", sc->regIDs[id], sc->sharedStride, sc->gl_LocalInvocationID_y, j * logicalGroupSize + i * sc->fftDim / stageRadix, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (!sc->useCoalescedLUTUploadToSM) { - char** regID = (char**)malloc(sizeof(char*) * stageRadix); - if (regID) { - for (uint64_t i = 0; i < stageRadix; i++) { - regID[i] = (char*)malloc(sizeof(char) * 50); - if (!regID[i]) { - for (uint64_t p = 0; p < i; p++) { - free(regID[p]); - regID[p] = 0; - } - free(regID); - regID = 0; - return VKFFT_ERROR_MALLOC_FAILED; - } - uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix; - id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; - sprintf(regID[i], "%s", sc->regIDs[id]); - /*if (j + i * logicalStoragePerThread / stageRadix < logicalRegistersPerThread) - sprintf(regID[i], "_%" PRIu64 "", j + i * logicalStoragePerThread / stageRadix); - else - sprintf(regID[i], "%" PRIu64 "[%" PRIu64 "]", (j + i * logicalStoragePerThread / stageRadix) / logicalRegistersPerThread, (j + i * logicalStoragePerThread / stageRadix) % logicalRegistersPerThread);*/ - - } - res = inlineRadixKernelVkFFT(sc, floatType, uintType, stageRadix, stageSize, stageSizeSum, stageAngle, regID); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 0; i < stageRadix; i++) { - uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix; - id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; - sprintf(sc->regIDs[id], "%s", regID[i]); - } - for (uint64_t i = 0; i < stageRadix; i++) { - free(regID[i]); - regID[i] = 0; - } - free(regID); - regID = 0; - } - else - return VKFFT_ERROR_MALLOC_FAILED; - } - if (logicalGroupSize * ((1 + j + k * 
logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (logicalGroupSize != sc->localSize[1]) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - //upload second stage of LUT to sm - if (sc->useCoalescedLUTUploadToSM) { - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - - sc->useCoalescedLUTUploadToSM = 1; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s;\n", sc->sdataID, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + %" PRIu64 "*%s;\n", sc->sdataID, sc->sdataID, sc->localSize[0], sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - for (uint64_t i = 0; i < (uint64_t)ceil(numLUTelementsStage * stageSize / ((double)sc->localSize[0] * sc->localSize[1])); i++) { - if (i > 0) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %s + %" PRIu64 ";\n", sc->sdataID, sc->sdataID, sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (i == (uint64_t)ceil(numLUTelementsStage * stageSize / ((double)sc->localSize[0] * sc->localSize[1])) - 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - if(%s<%" PRIu64 "){\n", sc->sdataID, numLUTelementsStage * stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s] = twiddleLUT[%s+%" PRIu64 "];\n", sc->sdataID, sc->sdataID, (stageSizeSum)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (i == (uint64_t)ceil(numLUTelementsStage * stageSize / 
((double)sc->localSize[0] * sc->localSize[1])) - 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - if (logicalGroupSize != sc->localSize[1]) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, logicalGroupSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) { - if (logicalGroupSize * ((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) continue; - if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) { - uint64_t current_group_cut = sc->fftDim / stageRadix - (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize; - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, current_group_cut); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - char** regID = (char**)malloc(sizeof(char*) * stageRadix); - if (regID) { - for (uint64_t i = 0; i < stageRadix; i++) { - regID[i] = (char*)malloc(sizeof(char) * 50); - if (!regID[i]) { - for (uint64_t p = 0; p < i; p++) { - free(regID[p]); - regID[p] = 0; - } - free(regID); - regID = 0; - return VKFFT_ERROR_MALLOC_FAILED; - } - uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix; - id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; - sprintf(regID[i], "%s", sc->regIDs[id]); - /*if (j + i * logicalStoragePerThread / stageRadix < logicalRegistersPerThread) - sprintf(regID[i], "_%" PRIu64 "", j + i * logicalStoragePerThread 
/ stageRadix); - else - sprintf(regID[i], "%" PRIu64 "[%" PRIu64 "]", (j + i * logicalStoragePerThread / stageRadix) / logicalRegistersPerThread, (j + i * logicalStoragePerThread / stageRadix) % logicalRegistersPerThread);*/ - - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = (%s+ %" PRIu64 ") %% (%" PRIu64 ");\n", sc->stageInvocationID, sc->gl_LocalInvocationID_y, (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize, stageSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->LUT) - sc->tempLen = sprintf(sc->tempStr, " LUTId = stageInvocationID + %" PRIu64 ";\n", stageSizeSum); - else - sc->tempLen = sprintf(sc->tempStr, " angle = stageInvocationID * %.17e%s;\n", stageAngle, LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = inlineRadixKernelVkFFT(sc, floatType, uintType, stageRadix, stageSize, stageSizeSum, stageAngle, regID); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 0; i < stageRadix; i++) { - uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix; - id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; - sprintf(sc->regIDs[id], "%s", regID[i]); - } - for (uint64_t i = 0; i < stageRadix; i++) { - free(regID[i]); - regID[i] = 0; - } - free(regID); - regID = 0; - } - else - return VKFFT_ERROR_MALLOC_FAILED; - if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (logicalGroupSize != sc->localSize[1]) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - if 
(stageSize == 1) { - sc->tempLen = sprintf(sc->tempStr, " %s = %" PRIu64 ";\n", sc->sharedStride, sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - return res; -} -static inline VkFFTResult appendRadixStage(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, double stageAngle, uint64_t stageRadix, uint64_t stageID, uint64_t shuffleType) { - VkFFTResult res = VKFFT_SUCCESS; - if (sc->rader_generator[stageID]) { - for (uint64_t i = 0; i < sc->numRaderPrimes; i++) { - if (sc->raderContainer[i].prime == stageRadix) { - sc->currentRaderContainer = &sc->raderContainer[i]; - } - } - if (sc->currentRaderContainer->type) { - switch (shuffleType) { - case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: { - res = appendMultRaderStage(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix, stageID, 0); - if (res != VKFFT_SUCCESS) return res; - //appendBarrierVkFFT(sc, 1); - break; - } - case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: { - res = appendMultRaderStage(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix, stageID, 1); - if (res != VKFFT_SUCCESS) return res; - //appendBarrierVkFFT(sc, 1); - break; - } - } - } - else { - switch (shuffleType) { - case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: { - res = appendFFTRaderStage(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix, stageID, 0); - if (res != VKFFT_SUCCESS) return res; - //appendBarrierVkFFT(sc, 1); - break; - } - case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: { - res = appendFFTRaderStage(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix, stageID, 1); - if (res != VKFFT_SUCCESS) return res; - //appendBarrierVkFFT(sc, 1); - break; - } - } - } - } - else { - switch (shuffleType) { - case 0: 
case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: { - res = appendRadixStageNonStrided(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix); - if (res != VKFFT_SUCCESS) return res; - //appendBarrierVkFFT(sc, 1); - break; - } - case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: { - res = appendRadixStageStrided(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix); - if (res != VKFFT_SUCCESS) return res; - //appendBarrierVkFFT(sc, 1); - break; - } - } - } - return res; -} - -static inline VkFFTResult appendRegisterBoostShuffle(VkFFTSpecializationConstantsLayout* sc, const char* floatType, uint64_t stageSize, uint64_t stageRadixPrev, uint64_t stageRadix, double stageAngle) { - VkFFTResult res = VKFFT_SUCCESS; - /*if (((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle > 0))) { - uint64_t bluesteinInverseNormalize = 1; - if ((sc->useBluesteinFFT) && (stageAngle > 0) && (stageSize == 1) && (sc->normalize) && (sc->axis_upload_id == 0)) bluesteinInverseNormalize = sc->bluesteinNormalizeSize; - char stageNormalization[50] = ""; - if ((stageSize == 1) && (sc->performDCT) && (sc->actualInverse)) { - if (sc->performDCT == 4) - sprintf(stageNormalization, "%" PRIu64 "", stageRadixPrev * stageRadix * 4 * bluesteinInverseNormalize); - else - sprintf(stageNormalization, "%" PRIu64 "", stageRadixPrev * stageRadix * 2 * bluesteinInverseNormalize); - } - else - sprintf(stageNormalization, "%" PRIu64 "", stageRadixPrev * stageRadix * bluesteinInverseNormalize); - uint64_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[stageRadix];// (sc->registers_per_thread % stageRadix == 0) ? 
sc->registers_per_thread : sc->min_registers_per_thread; - for (uint64_t k = 0; k < sc->registerBoost; ++k) { - for (uint64_t i = 0; i < logicalRegistersPerThread; i++) { - res = VkDivComplexNumber(sc, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], stageNormalization); - if (res != VKFFT_SUCCESS) return res; - } - } - }*/ - return res; -} - -static inline VkFFTResult appendRadixShuffleNonStrided(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, double stageAngle, uint64_t stageRadix, uint64_t stageRadixNext) { - VkFFTResult res = VKFFT_SUCCESS; - char vecType[30]; - char LFending[4] = ""; - if (!strcmp(floatType, "float")) sprintf(LFending, "f"); -#if(VKFFT_BACKEND==0) - if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); - if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); -#elif(VKFFT_BACKEND==1) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - if (!strcmp(floatType, "double")) sprintf(LFending, "l"); -#elif(VKFFT_BACKEND==2) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - if (!strcmp(floatType, "double")) sprintf(LFending, "l"); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); -#endif - char stageNormalization[50] = ""; - uint64_t normalizationValue = 1; - if ((((sc->actualInverse) && (sc->normalize)) || (sc->convolutionStep && (stageAngle > 0))) && (stageSize == 1) && (sc->axis_upload_id == 0) && (!(sc->useBluesteinFFT && (stageAngle < 0)))) { - if ((sc->performDCT) && (sc->actualInverse)) { - if (sc->performDCT == 1) - normalizationValue = (sc->sourceFFTSize - 1) * 2; - else - 
normalizationValue = sc->sourceFFTSize * 2; - } - else - normalizationValue = sc->sourceFFTSize; - } - if (sc->useBluesteinFFT && (stageAngle > 0) && (stageSize == 1) && (sc->axis_upload_id == 0)) { - normalizationValue *= sc->fft_dim_full; - } - if (normalizationValue != 1) { - sprintf(stageNormalization, "%.17e%s", 1.0 / (double)(normalizationValue), LFending); - } - char tempNum[50] = ""; - - uint64_t logicalStoragePerThread = sc->registers_per_thread_per_radix[stageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; - uint64_t logicalStoragePerThreadNext = sc->registers_per_thread_per_radix[stageRadixNext] * sc->registerBoost;// (sc->registers_per_thread % stageRadixNext == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; - uint64_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[stageRadix];// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread; - uint64_t logicalRegistersPerThreadNext = sc->registers_per_thread_per_radix[stageRadixNext];// (sc->registers_per_thread % stageRadixNext == 0) ? 
sc->registers_per_thread : sc->min_registers_per_thread; - - uint64_t logicalGroupSize = (uint64_t)ceil(sc->fftDim / (double)logicalStoragePerThread); - uint64_t logicalGroupSizeNext = (uint64_t)ceil(sc->fftDim / (double)logicalStoragePerThreadNext); - if ((!((sc->writeFromRegisters == 1) && (stageSize == sc->fftDim / stageRadix) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle < 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && (((sc->registerBoost == 1) && ((sc->localSize[0] * logicalStoragePerThread > sc->fftDim) || (stageSize < sc->fftDim / stageRadix) || ((sc->reorderFourStep) && (sc->fftDim < sc->fft_dim_full) && (sc->localSize[1] > 1)) || (sc->localSize[1] > 1) || ((sc->performR2C) && (!sc->actualInverse) && (sc->axis_id == 0)) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle < 0)))) || (sc->performDCT))) - { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - } - //if ((sc->localSize[0] * logicalStoragePerThread > sc->fftDim) || (stageSize < sc->fftDim / stageRadix) || ((sc->reorderFourStep) && (sc->fftDim < sc->fft_dim_full) && (sc->localSize[1] > 1)) || (sc->localSize[1] > 1) || ((sc->performR2C) && (!sc->actualInverse) && (sc->axis_id == 0)) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle < 0)) || (sc->registerBoost > 1) || (sc->performDCT)) { - if ((!((sc->writeFromRegisters == 1) && (stageSize == sc->fftDim / stageRadix) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle < 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && ((sc->localSize[0] * logicalStoragePerThread > sc->fftDim) || (stageSize < sc->fftDim / stageRadix) || ((sc->reorderFourStep) && (sc->fftDim < sc->fft_dim_full) && (sc->localSize[1] > 1)) || (sc->localSize[1] > 1) || ((sc->performR2C) && (!sc->actualInverse) && (sc->axis_id == 
0)) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle < 0)) || (sc->registerBoost > 1) || (sc->performDCT))) { - if (!((sc->registerBoost > 1) && (stageSize * stageRadix == sc->fftDim / sc->stageRadix[sc->numStages - 1]) && (sc->stageRadix[sc->numStages - 1] == sc->registerBoost))) { - char** tempID; - tempID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * sc->registerBoost); - if (tempID) { - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { - tempID[i] = (char*)malloc(sizeof(char) * 50); - if (!tempID[i]) { - for (uint64_t j = 0; j < i; j++) { - free(tempID[j]); - tempID[j] = 0; - } - free(tempID); - tempID = 0; - return VKFFT_ERROR_MALLOC_FAILED; - } - } - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - - for (uint64_t k = 0; k < sc->registerBoost; ++k) { - uint64_t t = 0; - if (k > 0) { - res = appendBarrierVkFFT(sc, 2); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - if (logicalGroupSize * logicalStoragePerThread > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, logicalStoragePerThread, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (logicalGroupSize != sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, logicalGroupSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) { - if (logicalGroupSize * ((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) <= sc->fftDim) { - if (logicalGroupSize * ((1 + j + k 
* logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) { - uint64_t current_group_cut = sc->fftDim / stageRadix - (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize; - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, current_group_cut); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sprintf(tempNum, "%" PRIu64 "", j * logicalGroupSize); - res = VkAddReal(sc, sc->stageInvocationID, sc->gl_LocalInvocationID_x, tempNum); - if (res != VKFFT_SUCCESS) return res; - res = VkMovReal(sc, sc->blockInvocationID, sc->stageInvocationID); - if (res != VKFFT_SUCCESS) return res; - sprintf(tempNum, "%" PRIu64 "", stageSize); - res = VkModReal(sc, sc->stageInvocationID, sc->stageInvocationID, tempNum); - if (res != VKFFT_SUCCESS) return res; - res = VkSubReal(sc, sc->blockInvocationID, sc->blockInvocationID, sc->stageInvocationID); - if (res != VKFFT_SUCCESS) return res; - sprintf(tempNum, "%" PRIu64 "", stageRadix); - res = VkMulReal(sc, sc->inoutID, sc->blockInvocationID, tempNum); - if (res != VKFFT_SUCCESS) return res; - res = VkAddReal(sc, sc->inoutID, sc->inoutID, sc->stageInvocationID); - if (res != VKFFT_SUCCESS) return res; - } - /*sc->tempLen = sprintf(sc->tempStr, "\ - stageInvocationID = (gl_LocalInvocationID.x + %" PRIu64 ") %% (%" PRIu64 ");\n\ - blockInvocationID = (gl_LocalInvocationID.x + %" PRIu64 ") - stageInvocationID;\n\ - inoutID = stageInvocationID + blockInvocationID * %" PRIu64 ";\n", j * logicalGroupSize, stageSize, j * logicalGroupSize, stageRadix);*/ - if ((stageSize == 1) && (sc->cacheShuffle)) { - for (uint64_t i = 0; i < stageRadix; i++) { - uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix; - id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; - sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[id]); - t++; - if (logicalGroupSize * 
((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) <= sc->fftDim) { - sprintf(tempNum, "%" PRIu64 "", i); - res = VkAddReal(sc, sc->sdataID, tempNum, sc->tshuffle); - if (res != VKFFT_SUCCESS) return res; - sprintf(tempNum, "%" PRIu64 "", logicalRegistersPerThread); - res = VkModReal(sc, sc->sdataID, sc->sdataID, tempNum); - if (res != VKFFT_SUCCESS) return res; - sprintf(tempNum, "%" PRIu64 "", stageSize); - res = VkMulReal(sc, sc->sdataID, sc->sdataID, tempNum); - if (res != VKFFT_SUCCESS) return res; - if (sc->localSize[1] > 1) { - res = VkMulReal(sc, sc->combinedID, sc->gl_LocalInvocationID_y, sc->sharedStride); - if (res != VKFFT_SUCCESS) return res; - res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->combinedID); - if (res != VKFFT_SUCCESS) return res; - } - res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->inoutID); - if (res != VKFFT_SUCCESS) return res; - - //sprintf(sc->sdataID, "sharedStride * gl_LocalInvocationID.y + inoutID + ((%" PRIu64 "+tshuffle) %% (%" PRIu64 "))*%" PRIu64 "", i, logicalRegistersPerThread, stageSize); - if (strcmp(stageNormalization, "")) { - res = VkMulComplexNumber(sc, sc->regIDs[id], sc->regIDs[id], stageNormalization); - if (res != VKFFT_SUCCESS) return res; - } - res = VkSharedStore(sc, sc->sdataID, sc->regIDs[id]); - if (res != VKFFT_SUCCESS) return res; - } - /*sc->tempLen = sprintf(sc->tempStr, "\ - sdata[sharedStride * gl_LocalInvocationID.y + inoutID + ((%" PRIu64 "+tshuffle) %% (%" PRIu64 "))*%" PRIu64 "] = temp%s%s;\n", i, logicalRegistersPerThread, stageSize, sc->regIDs[id], stageNormalization);*/ - } - } - else { - for (uint64_t i = 0; i < stageRadix; i++) { - uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix; - id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; - sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[id]); - t++; - if (logicalGroupSize * ((j + k * logicalRegistersPerThread / 
stageRadix) * stageRadix) <= sc->fftDim) { - sprintf(tempNum, "%" PRIu64 "", i * stageSize); - res = VkAddReal(sc, sc->sdataID, sc->inoutID, tempNum); - if (res != VKFFT_SUCCESS) return res; - if ((stageSize <= sc->numSharedBanks / 2) && (sc->fftDim > sc->numSharedBanks / 2) && (sc->sharedStrideBankConflictFirstStages != sc->fftDim / sc->registerBoost) && ((sc->fftDim & (sc->fftDim - 1)) == 0) && (stageSize * stageRadix != sc->fftDim)) { - if (sc->resolveBankConflictFirstStages == 0) { - sc->resolveBankConflictFirstStages = 1; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %" PRIu64 ";", sc->sharedStride, sc->sharedStrideBankConflictFirstStages); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = (%s / %" PRIu64 ") * %" PRIu64 " + %s %% %" PRIu64 ";", sc->sdataID, sc->sdataID, sc->numSharedBanks / 2, sc->numSharedBanks / 2 + 1, sc->sdataID, sc->numSharedBanks / 2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - } - else { - if (sc->resolveBankConflictFirstStages == 1) { - sc->resolveBankConflictFirstStages = 0; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = %" PRIu64 ";", sc->sharedStride, sc->sharedStrideReadWriteConflict); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->localSize[1] > 1) { - res = VkMulReal(sc, sc->combinedID, sc->gl_LocalInvocationID_y, sc->sharedStride); - if (res != VKFFT_SUCCESS) return res; - res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->combinedID); - if (res != VKFFT_SUCCESS) return res; - } - //sprintf(sc->sdataID, "sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "", i * stageSize); - if (strcmp(stageNormalization, "")) { - res = VkMulComplexNumber(sc, sc->regIDs[id], sc->regIDs[id], stageNormalization); - if (res != VKFFT_SUCCESS) return res; - } - res = VkSharedStore(sc, sc->sdataID, sc->regIDs[id]); - if (res != VKFFT_SUCCESS) return res; - } - /*sc->tempLen = sprintf(sc->tempStr, "\ - 
sdata[sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "] = temp%s%s;\n", i * stageSize, sc->regIDs[id], stageNormalization);*/ - } - } - if (logicalGroupSize * ((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) <= sc->fftDim) { - if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - if (logicalGroupSize != sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t j = logicalRegistersPerThread; j < sc->registers_per_thread; j++) { - sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[t + k * sc->registers_per_thread]); - t++; - } - t = 0; - if (sc->registerBoost > 1) { - if (logicalGroupSize * logicalStoragePerThread > sc->fftDim) - { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 2); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - if (logicalGroupSize * logicalStoragePerThreadNext > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, logicalStoragePerThreadNext, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t j = 0; j < logicalRegistersPerThreadNext / stageRadixNext; j++) { - for (uint64_t i = 0; i < stageRadixNext; i++) { - uint64_t id = j + k * logicalRegistersPerThreadNext / stageRadixNext + i * 
logicalStoragePerThreadNext / stageRadixNext; - id = (id / logicalRegistersPerThreadNext) * sc->registers_per_thread + id % logicalRegistersPerThreadNext; - //resID[t + k * sc->registers_per_thread] = sc->regIDs[id]; - sprintf(tempNum, "%" PRIu64 "", t * logicalGroupSizeNext); - res = VkAddReal(sc, sc->sdataID, sc->gl_LocalInvocationID_x, tempNum); - if (res != VKFFT_SUCCESS) return res; - if (sc->localSize[1] > 1) { - res = VkMulReal(sc, sc->combinedID, sc->gl_LocalInvocationID_y, sc->sharedStride); - if (res != VKFFT_SUCCESS) return res; - res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->combinedID); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->resolveBankConflictFirstStages == 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = (%s / %" PRIu64 ") * %" PRIu64 " + %s %% %" PRIu64 ";", sc->sdataID, sc->sdataID, sc->numSharedBanks / 2, sc->numSharedBanks / 2 + 1, sc->sdataID, sc->numSharedBanks / 2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - //sprintf(sc->sdataID, "sharedStride * gl_LocalInvocationID.y + gl_LocalInvocationID.x + %" PRIu64 "", t * logicalGroupSizeNext); - res = VkSharedLoad(sc, tempID[t + k * sc->registers_per_thread], sc->sdataID); - if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ - temp%s = sdata[sharedStride * gl_LocalInvocationID.y + gl_LocalInvocationID.x + %" PRIu64 "];\n", tempID[t + k * sc->registers_per_thread], t * logicalGroupSizeNext);*/ - t++; - } - - } - if (logicalGroupSize * logicalStoragePerThreadNext > sc->fftDim) - { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) 
return res; - } - } - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { - //printf("0 - %s\n", resID[i]); - sprintf(sc->regIDs[i], "%s", tempID[i]); - //sprintf(resID[i], "%s", tempID[i]); - //printf("1 - %s\n", resID[i]); - } - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { - free(tempID[i]); - tempID[i] = 0; - } - free(tempID); - tempID = 0; - } - else - return VKFFT_ERROR_MALLOC_FAILED; - } - else { - char** tempID; - tempID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * sc->registerBoost); - if (tempID) { - //resID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * sc->registerBoost); - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { - tempID[i] = (char*)malloc(sizeof(char) * 50); - if (!tempID[i]) { - for (uint64_t j = 0; j < i; j++) { - free(tempID[j]); - tempID[j] = 0; - } - free(tempID); - tempID = 0; - return VKFFT_ERROR_MALLOC_FAILED; - } - } - for (uint64_t k = 0; k < sc->registerBoost; ++k) { - for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) { - for (uint64_t i = 0; i < stageRadix; i++) { - uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix; - id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; - sprintf(tempID[j + i * logicalRegistersPerThread / stageRadix + k * sc->registers_per_thread], "%s", sc->regIDs[id]); - } - } - for (uint64_t j = logicalRegistersPerThread; j < sc->registers_per_thread; j++) { - sprintf(tempID[j + k * sc->registers_per_thread], "%s", sc->regIDs[j + k * sc->registers_per_thread]); - } - } - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { - sprintf(sc->regIDs[i], "%s", tempID[i]); - } - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { - free(tempID[i]); - tempID[i] = 0; - } - free(tempID); - tempID = 0; - } - else - return 
VKFFT_ERROR_MALLOC_FAILED; - } - } - else { - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - - if (((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle > 0))) { - for (uint64_t i = 0; i < logicalStoragePerThread; i++) { - if (strcmp(stageNormalization, "")) { - res = VkMulComplexNumber(sc, sc->regIDs[(i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread], sc->regIDs[(i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread], stageNormalization); - } - if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ - temp%s = temp%s%s;\n", sc->regIDs[(i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread], sc->regIDs[(i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread], stageNormalization);*/ - } - } - - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - } - return res; -} -static inline VkFFTResult appendRadixShuffleStrided(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, double stageAngle, uint64_t stageRadix, uint64_t stageRadixNext) { - VkFFTResult res = VKFFT_SUCCESS; - char vecType[30]; - char LFending[4] = ""; - if (!strcmp(floatType, "float")) sprintf(LFending, "f"); -#if(VKFFT_BACKEND==0) - if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); - if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); -#elif(VKFFT_BACKEND==1) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - if 
(!strcmp(floatType, "double")) sprintf(LFending, "l"); -#elif(VKFFT_BACKEND==2) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - if (!strcmp(floatType, "double")) sprintf(LFending, "l"); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); -#endif - - char tempNum[50] = ""; - - uint64_t logicalStoragePerThread = sc->registers_per_thread_per_radix[stageRadix] * sc->registerBoost;// (sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; - uint64_t logicalStoragePerThreadNext = sc->registers_per_thread_per_radix[stageRadixNext] * sc->registerBoost;//(sc->registers_per_thread % stageRadixNext == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; - uint64_t logicalRegistersPerThread = sc->registers_per_thread_per_radix[stageRadix];//(sc->registers_per_thread % stageRadix == 0) ? sc->registers_per_thread : sc->min_registers_per_thread; - uint64_t logicalRegistersPerThreadNext = sc->registers_per_thread_per_radix[stageRadixNext];//(sc->registers_per_thread % stageRadixNext == 0) ? 
sc->registers_per_thread : sc->min_registers_per_thread; - - uint64_t logicalGroupSize = (uint64_t)ceil(sc->fftDim / (double)logicalStoragePerThread); - uint64_t logicalGroupSizeNext = (uint64_t)ceil(sc->fftDim / (double)logicalStoragePerThreadNext); - char stageNormalization[50] = ""; - uint64_t normalizationValue = 1; - if ((((sc->actualInverse) && (sc->normalize)) || (sc->convolutionStep && (stageAngle > 0))) && (stageSize == 1) && (sc->axis_upload_id == 0) && (!(sc->useBluesteinFFT && (stageAngle < 0)))) { - if ((sc->performDCT) && (sc->actualInverse)) { - if (sc->performDCT == 1) - normalizationValue = (sc->sourceFFTSize - 1) * 2; - else - normalizationValue = sc->sourceFFTSize * 2; - } - else - normalizationValue = sc->sourceFFTSize; - } - if (sc->useBluesteinFFT && (stageAngle > 0) && (stageSize == 1) && (sc->axis_upload_id == 0)) { - normalizationValue *= sc->fft_dim_full; - } - if (normalizationValue != 1) { - sprintf(stageNormalization, "%.17e%s", 1.0 / (double)(normalizationValue), LFending); - } - if ((!((sc->writeFromRegisters == 1) && (stageSize == sc->fftDim / stageRadix) && (!(((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle < 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && (((sc->axis_id == 0) && (sc->axis_upload_id == 0)) || (sc->localSize[1] * logicalStoragePerThread > sc->fftDim) || (stageSize < sc->fftDim / stageRadix) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle < 0)) || (sc->performDCT))) - { - res = appendBarrierVkFFT(sc, 2); - if (res != VKFFT_SUCCESS) return res; - } - if (stageSize == sc->fftDim / stageRadix) { - sc->tempLen = sprintf(sc->tempStr, " %s = %" PRIu64 ";\n", sc->sharedStride, sc->sharedStrideReadWriteConflict); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((!((sc->writeFromRegisters == 1) && (stageSize == sc->fftDim / stageRadix) && (!(((sc->convolutionStep) || 
(sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) && (stageAngle < 0) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))))) && (((sc->axis_id == 0) && (sc->axis_upload_id == 0)) || (sc->localSize[1] * logicalStoragePerThread > sc->fftDim) || (stageSize < sc->fftDim / stageRadix) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)) && (stageAngle < 0)) || (sc->performDCT))) { - //if (sc->writeFromRegisters == 0) { - //appendBarrierVkFFT(sc, 2); - if (!((sc->registerBoost > 1) && (stageSize * stageRadix == sc->fftDim / sc->stageRadix[sc->numStages - 1]) && (sc->stageRadix[sc->numStages - 1] == sc->registerBoost))) { - char** tempID; - tempID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * sc->registerBoost); - if (tempID) { - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { - tempID[i] = (char*)malloc(sizeof(char) * 50); - if (!tempID[i]) { - for (uint64_t j = 0; j < i; j++) { - free(tempID[j]); - tempID[j] = 0; - } - free(tempID); - tempID = 0; - return VKFFT_ERROR_MALLOC_FAILED; - } - } - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - - for (uint64_t k = 0; k < sc->registerBoost; ++k) { - uint64_t t = 0; - if (k > 0) { - res = appendBarrierVkFFT(sc, 2); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - if (logicalGroupSize * logicalStoragePerThread > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, logicalStoragePerThread, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (logicalGroupSize != sc->localSize[1]) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", 
sc->gl_LocalInvocationID_y, logicalGroupSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) { - if (logicalGroupSize * ((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) <= sc->fftDim) { - if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) { - uint64_t current_group_cut = sc->fftDim / stageRadix - (j + k * logicalRegistersPerThread / stageRadix) * logicalGroupSize; - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, current_group_cut); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sprintf(tempNum, "%" PRIu64 "", j * logicalGroupSize); - res = VkAddReal(sc, sc->stageInvocationID, sc->gl_LocalInvocationID_y, tempNum); - if (res != VKFFT_SUCCESS) return res; - res = VkMovReal(sc, sc->blockInvocationID, sc->stageInvocationID); - if (res != VKFFT_SUCCESS) return res; - sprintf(tempNum, "%" PRIu64 "", stageSize); - res = VkModReal(sc, sc->stageInvocationID, sc->stageInvocationID, tempNum); - if (res != VKFFT_SUCCESS) return res; - res = VkSubReal(sc, sc->blockInvocationID, sc->blockInvocationID, sc->stageInvocationID); - if (res != VKFFT_SUCCESS) return res; - sprintf(tempNum, "%" PRIu64 "", stageRadix); - res = VkMulReal(sc, sc->inoutID, sc->blockInvocationID, tempNum); - if (res != VKFFT_SUCCESS) return res; - res = VkAddReal(sc, sc->inoutID, sc->inoutID, sc->stageInvocationID); - if (res != VKFFT_SUCCESS) return res; - } - /*sc->tempLen = sprintf(sc->tempStr, "\ - stageInvocationID = (gl_LocalInvocationID.y + %" PRIu64 ") %% (%" PRIu64 ");\n\ - blockInvocationID = (gl_LocalInvocationID.y + %" PRIu64 ") - stageInvocationID;\n\ - inoutID = stageInvocationID + blockInvocationID * %" PRIu64 ";\n", j * logicalGroupSize, stageSize, j * logicalGroupSize, stageRadix);*/ - for (uint64_t i = 0; i < stageRadix; i++) { - uint64_t id = j + k * 
logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix; - id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; - sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[id]); - t++; - if (logicalGroupSize * ((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) <= sc->fftDim) { - sprintf(tempNum, "%" PRIu64 "", i * stageSize); - res = VkAddReal(sc, sc->sdataID, sc->inoutID, tempNum); - if (res != VKFFT_SUCCESS) return res; - res = VkMulReal(sc, sc->sdataID, sc->sharedStride, sc->sdataID); - if (res != VKFFT_SUCCESS) return res; - res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x); - if (res != VKFFT_SUCCESS) return res; - //sprintf(sc->sdataID, "sharedStride * gl_LocalInvocationID.y + inoutID + %" PRIu64 "", i * stageSize); - if (strcmp(stageNormalization, "")) { - res = VkMulComplexNumber(sc, sc->regIDs[id], sc->regIDs[id], stageNormalization); - if (res != VKFFT_SUCCESS) return res; - } - res = VkSharedStore(sc, sc->sdataID, sc->regIDs[id]); - if (res != VKFFT_SUCCESS) return res; - } - /*sc->tempLen = sprintf(sc->tempStr, "\ - sdata[gl_WorkGroupSize.x*(inoutID+%" PRIu64 ")+gl_LocalInvocationID.x] = temp%s%s;\n", i * stageSize, sc->regIDs[id], stageNormalization);*/ - } - if (logicalGroupSize * ((j + k * logicalRegistersPerThread / stageRadix) * stageRadix) <= sc->fftDim) { - if (logicalGroupSize * ((1 + j + k * logicalRegistersPerThread / stageRadix) * stageRadix) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - if (logicalGroupSize != sc->localSize[1]) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t j = logicalRegistersPerThread; j < sc->registers_per_thread; j++) { - sprintf(tempID[t + k * sc->registers_per_thread], "%s", sc->regIDs[t + k * sc->registers_per_thread]); 
- t++; - } - t = 0; - if (sc->registerBoost > 1) { - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 2); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - if (logicalGroupSize * logicalStoragePerThreadNext > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, logicalStoragePerThreadNext, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t j = 0; j < logicalRegistersPerThreadNext / stageRadixNext; j++) { - for (uint64_t i = 0; i < stageRadixNext; i++) { - uint64_t id = j + k * logicalRegistersPerThreadNext / stageRadixNext + i * logicalRegistersPerThreadNext / stageRadixNext; - id = (id / logicalRegistersPerThreadNext) * sc->registers_per_thread + id % logicalRegistersPerThreadNext; - sprintf(tempNum, "%" PRIu64 "", t * logicalGroupSizeNext); - res = VkAddReal(sc, sc->sdataID, sc->gl_LocalInvocationID_y, tempNum); - if (res != VKFFT_SUCCESS) return res; - res = VkMulReal(sc, sc->sdataID, sc->sharedStride, sc->sdataID); - if (res != VKFFT_SUCCESS) return res; - res = VkAddReal(sc, sc->sdataID, sc->sdataID, sc->gl_LocalInvocationID_x); - if (res != VKFFT_SUCCESS) return res; - //sprintf(sc->sdataID, "sharedStride * gl_LocalInvocationID.y + gl_LocalInvocationID.x + %" PRIu64 "", t * logicalGroupSizeNext); - res = VkSharedLoad(sc, tempID[t + k * sc->registers_per_thread], sc->sdataID); - if (res != VKFFT_SUCCESS) return res; - /*sc->tempLen = sprintf(sc->tempStr, "\ - temp%s = sdata[gl_WorkGroupSize.x*(gl_LocalInvocationID.y+%" PRIu64 ")+gl_LocalInvocationID.x];\n", tempID[t + k * sc->registers_per_thread], t * logicalGroupSizeNext);*/ - t++; - 
} - } - if (logicalGroupSize * logicalStoragePerThreadNext > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { - sprintf(sc->regIDs[i], "%s", tempID[i]); - } - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { - free(tempID[i]); - tempID[i] = 0; - } - free(tempID); - tempID = 0; - } - else - return VKFFT_ERROR_MALLOC_FAILED; - } - else { - char** tempID; - tempID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * sc->registerBoost); - if (tempID) { - //resID = (char**)malloc(sizeof(char*) * sc->registers_per_thread * sc->registerBoost); - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { - tempID[i] = (char*)malloc(sizeof(char) * 50); - if (!tempID[i]) { - for (uint64_t j = 0; j < i; j++) { - free(tempID[j]); - tempID[j] = 0; - } - free(tempID); - tempID = 0; - return VKFFT_ERROR_MALLOC_FAILED; - } - } - for (uint64_t k = 0; k < sc->registerBoost; ++k) { - for (uint64_t j = 0; j < logicalRegistersPerThread / stageRadix; j++) { - for (uint64_t i = 0; i < stageRadix; i++) { - uint64_t id = j + k * logicalRegistersPerThread / stageRadix + i * logicalStoragePerThread / stageRadix; - id = (id / logicalRegistersPerThread) * sc->registers_per_thread + id % logicalRegistersPerThread; - sprintf(tempID[j + i * logicalRegistersPerThread / stageRadix + k * sc->registers_per_thread], "%s", sc->regIDs[id]); - } - } - for (uint64_t j = logicalRegistersPerThread; j < sc->registers_per_thread; j++) { - 
sprintf(tempID[j + k * sc->registers_per_thread], "%s", sc->regIDs[j + k * sc->registers_per_thread]); - } - } - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { - sprintf(sc->regIDs[i], "%s", tempID[i]); - } - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { - free(tempID[i]); - tempID[i] = 0; - } - free(tempID); - tempID = 0; - } - else - return VKFFT_ERROR_MALLOC_FAILED; - } - } - else { - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - if (sc->localSize[1] * logicalStoragePerThread > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, logicalStoragePerThread, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (((sc->actualInverse) && (sc->normalize)) || ((sc->convolutionStep || sc->useBluesteinFFT) && (stageAngle > 0))) { - for (uint64_t i = 0; i < logicalRegistersPerThread; i++) { - if (strcmp(stageNormalization, "")) { - res = VkMulComplexNumber(sc, sc->regIDs[(i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread], sc->regIDs[(i / logicalRegistersPerThread) * sc->registers_per_thread + i % logicalRegistersPerThread], stageNormalization); - } - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->localSize[1] * logicalRegistersPerThread > sc->fftDim) - { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - } - return res; -} -static inline VkFFTResult appendRadixShuffle(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t stageSize, uint64_t stageSizeSum, double 
stageAngle, uint64_t stageRadix, uint64_t stageRadixNext, uint64_t stageID, uint64_t shuffleType) { - VkFFTResult res = VKFFT_SUCCESS; - if (sc->rader_generator[stageID] == 0) { - switch (shuffleType) { - case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: { - res = appendRadixShuffleNonStrided(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix, stageRadixNext); - if (res != VKFFT_SUCCESS) return res; - //appendBarrierVkFFT(sc, 1); - break; - } - case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: { - res = appendRadixShuffleStrided(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, stageRadix, stageRadixNext); - if (res != VKFFT_SUCCESS) return res; - //appendBarrierVkFFT(sc, 1); - break; - } - } - } - return res; -} - -static inline VkFFTResult appendBoostThreadDataReorder(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t shuffleType, uint64_t start) { - VkFFTResult res = VKFFT_SUCCESS; - switch (shuffleType) { - case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: { - uint64_t logicalStoragePerThread; - if (start == 1) { - logicalStoragePerThread = sc->registers_per_thread_per_radix[sc->stageRadix[0]] * sc->registerBoost;// (sc->registers_per_thread % sc->stageRadix[0] == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; - } - else { - logicalStoragePerThread = sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] * sc->registerBoost;// (sc->registers_per_thread % sc->stageRadix[sc->numStages - 1] == 0) ? 
sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; - } - uint64_t logicalGroupSize = sc->fftDim / logicalStoragePerThread; - if ((sc->registerBoost > 1) && (logicalStoragePerThread != sc->min_registers_per_thread * sc->registerBoost)) { - for (uint64_t k = 0; k < sc->registerBoost; k++) { - if (k > 0) { - res = appendBarrierVkFFT(sc, 2); - if (res != VKFFT_SUCCESS) return res; - } - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - if (start == 0) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, logicalStoragePerThread, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 0; i < logicalStoragePerThread / sc->registerBoost; i++) { - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s + %" PRIu64 "] = %s;\n", sc->gl_LocalInvocationID_x, i * logicalGroupSize, sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else - { - for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) { - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s + %" PRIu64 "] = %s;\n", sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 2); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != 
VKFFT_SUCCESS) return res; - if (start == 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, logicalStoragePerThread, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 0; i < logicalStoragePerThread / sc->registerBoost; i++) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[%s + %" PRIu64 "];\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, i * logicalGroupSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[%s + %" PRIu64 "];\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, i * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - - break; - } - case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: { - uint64_t logicalStoragePerThread; - if (start == 1) { - logicalStoragePerThread = sc->registers_per_thread_per_radix[sc->stageRadix[0]] * sc->registerBoost;// (sc->registers_per_thread % sc->stageRadix[0] == 0) ? sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; - } - else { - logicalStoragePerThread = sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] * sc->registerBoost;// (sc->registers_per_thread % sc->stageRadix[sc->numStages - 1] == 0) ? 
sc->registers_per_thread * sc->registerBoost : sc->min_registers_per_thread * sc->registerBoost; - } - uint64_t logicalGroupSize = sc->fftDim / logicalStoragePerThread; - if ((sc->registerBoost > 1) && (logicalStoragePerThread != sc->min_registers_per_thread * sc->registerBoost)) { - for (uint64_t k = 0; k < sc->registerBoost; k++) { - if (k > 0) { - res = appendBarrierVkFFT(sc, 2); - if (res != VKFFT_SUCCESS) return res; - } - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - if (start == 0) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, logicalStoragePerThread, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 0; i < logicalStoragePerThread / sc->registerBoost; i++) { - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s + %s * (%s + %" PRIu64 ")] = %s;\n", sc->gl_LocalInvocationID_x, sc->sharedStride, sc->gl_LocalInvocationID_y, i * logicalGroupSize, sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else - { - for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) { - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s + %s * (%s + %" PRIu64 ")] = %s;\n", sc->gl_LocalInvocationID_x, sc->sharedStride, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 2); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res 
!= VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - if (start == 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s * %" PRIu64 " < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, logicalStoragePerThread, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 0; i < logicalStoragePerThread / sc->registerBoost; i++) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[%s + %s * (%s + %" PRIu64 ")];\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->sharedStride, sc->gl_LocalInvocationID_y, i * logicalGroupSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[%s + %s * (%s + %" PRIu64 ")];\n", sc->regIDs[i + k * sc->registers_per_thread], sc->gl_LocalInvocationID_x, sc->sharedStride, sc->gl_LocalInvocationID_y, i * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - - break; - } - } - return res; -} - -static inline VkFFTResult appendCoordinateRegisterStore(VkFFTSpecializationConstantsLayout* sc, uint64_t readType) { - VkFFTResult res = VKFFT_SUCCESS; - if ((!sc->writeFromRegisters) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))) { - switch (readType) { - case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144://single_c2c - { - uint64_t used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - res = appendBarrierVkFFT(sc, 1); - if (res != 
VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - if (sc->matrixConvolution == 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[sharedStride * %s + %s];\n", sc->regIDs[0], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 1; i < used_registers_read; i++) { - if (sc->localSize[0] * (i + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, sc->fftDim - sc->localSize[0] * i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[sharedStride * %s + %s + %" PRIu64 " * %s];\n", sc->regIDs[i], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i, sc->gl_WorkGroupSize_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->localSize[0] * (i + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - //appendBarrierVkFFT(sc, 3); - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - switch (coordinate) {\n\ - case 0:\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[sharedStride * %s + %s];\n", sc->regIDs[0], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 1; i < used_registers_read; i++) { - if (sc->localSize[0] * (i + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, sc->fftDim - sc->localSize[0] * i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[sharedStride * %s + %s + 
%" PRIu64 " * %s];\n", sc->regIDs[i], sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i, sc->gl_WorkGroupSize_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->localSize[0] * (i + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - //appendBarrierVkFFT(sc, 3); - sc->tempLen = sprintf(sc->tempStr, " break;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 1; i < sc->matrixConvolution; i++) { - sc->tempLen = sprintf(sc->tempStr, "\ - case %" PRIu64 ":\n", i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s_%" PRIu64 " = sdata[sharedStride * %s + %s];\n", sc->regIDs[0], i, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t j = 1; j < used_registers_read; j++) { - if (sc->localSize[0] * (j + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, sc->fftDim - sc->localSize[0] * j); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s_%" PRIu64 " = sdata[sharedStride * %s + %s + %" PRIu64 " * %s];\n", sc->regIDs[j], i, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, j, sc->gl_WorkGroupSize_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->localSize[0] * (j + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - //appendBarrierVkFFT(sc, 3); - sc->tempLen = sprintf(sc->tempStr, " break;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = VkAppendLineFromInput(sc, 
sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - break; - } - case 1: case 111: case 121: case 131: case 141: case 143: case 145://grouped_c2c - { - uint64_t used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - if (sc->matrixConvolution == 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[%s*(%s)+%s];\n", sc->regIDs[0], sc->sharedStride, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 1; i < used_registers_read; i++) { - if (sc->localSize[1] * (i + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, sc->fftDim - sc->localSize[1] * i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[%s*(%s+%" PRIu64 "*%s)+%s];\n", sc->regIDs[i], sc->sharedStride, sc->gl_LocalInvocationID_y, i, sc->gl_WorkGroupSize_y, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->localSize[1] * (i + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - //appendBarrierVkFFT(sc, 3); - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - switch (coordinate) {\n\ - case 0:\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[%s*(%s)+%s];\n", sc->regIDs[0], sc->sharedStride, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - 
for (uint64_t i = 1; i < used_registers_read; i++) { - if (sc->localSize[1] * (i + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, sc->fftDim - sc->localSize[1] * i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s = sdata[%s*(%s+%" PRIu64 "*%s)+%s];\n", sc->regIDs[i], sc->sharedStride, sc->gl_LocalInvocationID_y, i, sc->gl_WorkGroupSize_y, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->localSize[1] * (i + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - //appendBarrierVkFFT(sc, 3); - sc->tempLen = sprintf(sc->tempStr, " break;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 1; i < sc->matrixConvolution; i++) { - sc->tempLen = sprintf(sc->tempStr, "\ - case %" PRIu64 ":\n", i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - %s_%" PRIu64 " = sdata[%s*(%s)+%s];\n", sc->regIDs[0], i, sc->sharedStride, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t j = 1; j < used_registers_read; j++) { - if (sc->localSize[1] * (j + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, sc->fftDim - sc->localSize[1] * j); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - %s_%" PRIu64 " = sdata[%s*(%s+%" PRIu64 "*%s)+%s];\n", sc->regIDs[j], i, sc->sharedStride, sc->gl_LocalInvocationID_y, j, sc->gl_WorkGroupSize_y, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->localSize[1] * (j + 1) > sc->fftDim) { - sc->tempLen = 
sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - //appendBarrierVkFFT(sc, 3); - sc->tempLen = sprintf(sc->tempStr, " break;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - break; - } - } - } - return res; -} -static inline VkFFTResult appendCoordinateRegisterPull(VkFFTSpecializationConstantsLayout* sc, uint64_t readType) { - VkFFTResult res = VKFFT_SUCCESS; - if ((!sc->readToRegisters) || ((sc->convolutionStep) && ((sc->matrixConvolution > 1) || (sc->numKernels > 1)))) { - switch (readType) { - case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144://single_c2c - { - uint64_t used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - if (sc->matrixConvolution == 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[sharedStride * %s + %s] = %s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 1; i < used_registers_read; i++) { - if (sc->localSize[0] * (i + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, sc->fftDim - sc->localSize[0] * i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[sharedStride * %s + %s + %" PRIu64 " * %s] = %s;\n", 
sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i, sc->gl_WorkGroupSize_x, sc->regIDs[i]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->localSize[0] * (i + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - //appendBarrierVkFFT(sc, 3); - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - switch (coordinate) {\n\ - case 0:\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[sharedStride * %s + %s] = %s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 1; i < used_registers_read; i++) { - if (sc->localSize[0] * (i + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, sc->fftDim - sc->localSize[0] * i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[sharedStride * %s + %s + %" PRIu64 " * %s] = %s;\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, i, sc->gl_WorkGroupSize_x, sc->regIDs[i]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->localSize[0] * (i + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - //appendBarrierVkFFT(sc, 3); - sc->tempLen = sprintf(sc->tempStr, " break;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 1; i < sc->matrixConvolution; i++) { - sc->tempLen = sprintf(sc->tempStr, "\ - case %" PRIu64 ":\n", i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[sharedStride * %s + %s] = %s_%" PRIu64 ";\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->regIDs[0], 
i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t j = 1; j < used_registers_read; j++) { - if (sc->localSize[0] * (j + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, sc->fftDim - sc->localSize[0] * j); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[sharedStride * %s + %s + %" PRIu64 " * %s] = %s_%" PRIu64 ";\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, j, sc->gl_WorkGroupSize_x, sc->regIDs[j], i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->localSize[0] * (i + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - //appendBarrierVkFFT(sc, 3); - sc->tempLen = sprintf(sc->tempStr, " break;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - break; - } - case 1: case 111: case 121: case 131: case 141: case 143: case 145://grouped_c2c - { - uint64_t used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - if (sc->matrixConvolution == 1) { - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s*(%s)+%s] = %s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 1; i < 
used_registers_read; i++) { - if (sc->localSize[1] * (i + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, sc->fftDim - sc->localSize[1] * i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s*(%s+%" PRIu64 "*%s)+%s] = %s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, i, sc->gl_WorkGroupSize_y, sc->gl_LocalInvocationID_x, sc->regIDs[i]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->localSize[1] * (i + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - //appendBarrierVkFFT(sc, 3); - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - switch (coordinate) {\n\ - case 0:\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s*(%s)+%s] = %s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 1; i < used_registers_read; i++) { - if (sc->localSize[1] * (i + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, sc->fftDim - sc->localSize[1] * i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->localSize[1] * (i + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s*(%s+%" PRIu64 "*%s)+%s] = %s;\n", sc->sharedStride, sc->gl_LocalInvocationID_y, i, sc->gl_WorkGroupSize_y, sc->gl_LocalInvocationID_x, sc->regIDs[i]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - //appendBarrierVkFFT(sc, 3); - sc->tempLen = sprintf(sc->tempStr, " break;\n"); - res = VkAppendLine(sc); - 
if (res != VKFFT_SUCCESS) return res; - for (uint64_t i = 1; i < sc->matrixConvolution; i++) { - sc->tempLen = sprintf(sc->tempStr, "\ - case %" PRIu64 ":\n", i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s*(%s)+%s] = %s_%" PRIu64 ";\n", sc->sharedStride, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->regIDs[0], i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t j = 1; j < used_registers_read; j++) { - if (sc->localSize[1] * (j + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, sc->fftDim - sc->localSize[1] * j); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, "\ - sdata[%s*(%s+%" PRIu64 "*%s)+%s] = %s_%" PRIu64 ";\n", sc->sharedStride, sc->gl_LocalInvocationID_y, j, sc->gl_WorkGroupSize_y, sc->gl_LocalInvocationID_x, sc->regIDs[j], i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->localSize[1] * (j + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - //appendBarrierVkFFT(sc, 3); - sc->tempLen = sprintf(sc->tempStr, " break;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - break; - } - } - } - return res; -} -static inline VkFFTResult appendPreparationBatchedKernelConvolution(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeMemory, const char* uintType, uint64_t dataType) { - VkFFTResult res = VKFFT_SUCCESS; - char vecType[30]; -#if(VKFFT_BACKEND==0) - if 
(!strcmp(floatType, "float")) sprintf(vecType, "vec2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); -#elif(VKFFT_BACKEND==1) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); -#elif(VKFFT_BACKEND==2) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); -#endif - char separateRegisterStore[100] = "_store"; - - for (uint64_t i = 0; i < sc->registers_per_thread; i++) { - sc->tempLen = sprintf(sc->tempStr, " %s %s%s;\n", vecType, sc->regIDs[i], separateRegisterStore); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t j = 1; j < sc->matrixConvolution; j++) { - sc->tempLen = sprintf(sc->tempStr, " %s %s_%" PRIu64 "%s;\n", vecType, sc->regIDs[i], j, separateRegisterStore); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - for (uint64_t i = 0; i < sc->registers_per_thread; i++) { - //sc->tempLen = sprintf(sc->tempStr, " temp%s[i]=temp[i];\n", separateRegisterStore); - sc->tempLen = sprintf(sc->tempStr, " %s%s=%s;\n", sc->regIDs[i], separateRegisterStore, sc->regIDs[i]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t j = 1; j < sc->matrixConvolution; j++) { - sc->tempLen = sprintf(sc->tempStr, " %s_%" PRIu64 "%s=%s_%" PRIu64 ";\n", sc->regIDs[i], j, separateRegisterStore, sc->regIDs[i], j); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, " for (%s batchID=0; batchID < %" PRIu64 "; batchID++){\n", uintType, sc->numKernels); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - return res; -} -static inline VkFFTResult 
appendBluesteinConvolution(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* uintType, uint64_t dataType) { - VkFFTResult res = VKFFT_SUCCESS; - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); - char requestCoordinate[100] = ""; - if (sc->convolutionStep) { - if (sc->matrixConvolution > 1) { - sprintf(requestCoordinate, "0"); - } - } - char requestBatch[100] = ""; - char separateRegisterStore[100] = ""; - if (sc->convolutionStep) { - if (sc->numKernels > 1) { - sprintf(requestBatch, "batchID"); - sprintf(separateRegisterStore, "_store"); - } - } - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t j = 0; j < sc->matrixConvolution; j++) { - sc->tempLen = sprintf(sc->tempStr, " %s temp_real%" PRIu64 " = 0;\n", floatType, j); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s temp_imag%" PRIu64 " = 0;\n", floatType, j); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - uint64_t used_registers_read = 1; - switch (dataType) { - case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: - used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - break; - case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: - used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); - break; - } - for (uint64_t i = 0; i < used_registers_read; i++) { - switch (dataType) { - case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: - { - if (sc->localSize[0] * (i + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, sc->fftDim - sc->localSize[0] * i); - res = VkAppendLine(sc); - if (res != 
VKFFT_SUCCESS) return res; - } - if (sc->fftDim == sc->fft_dim_full) { - sc->tempLen = sprintf(sc->tempStr, " %s = %s + %" PRIu64 ";\n", sc->inoutID, sc->gl_LocalInvocationID_x, i * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = %s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");", sc->inoutID, sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexInput(%s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ")%s%s);\n", sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize, requestCoordinate, requestBatch); - } - break; - } - case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: - { - if (sc->localSize[1] * (i + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, sc->fftDim - sc->localSize[1] * i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->fftDim == sc->fft_dim_full) { - sc->tempLen = sprintf(sc->tempStr, " %s = %s + %" PRIu64 ";\n", sc->inoutID, sc->gl_LocalInvocationID_y, i * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " * (%s + %" 
PRIu64 ") + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 "));\n", sc->inoutID, sc->stageStartSize, sc->gl_LocalInvocationID_y, (i)*sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim * sc->stageStartSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - break; - } - } - char kernelName[100] = ""; - sprintf(kernelName, "BluesteinConvolutionKernel"); - if ((sc->inverseBluestein) && (sc->fftDim == sc->fft_dim_full)) - sc->tempLen = sprintf(sc->tempStr, " temp_real0 = %s[inoutID].x * %s%s.x + %s[inoutID].y * %s%s.y;\n", kernelName, sc->regIDs[i], separateRegisterStore, kernelName, sc->regIDs[i], separateRegisterStore); - else - sc->tempLen = sprintf(sc->tempStr, " temp_real0 = %s[inoutID].x * %s%s.x - %s[inoutID].y * %s%s.y;\n", kernelName, sc->regIDs[i], separateRegisterStore, kernelName, sc->regIDs[i], separateRegisterStore); - - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if ((sc->inverseBluestein) && (sc->fftDim == sc->fft_dim_full)) - sc->tempLen = sprintf(sc->tempStr, " temp_imag0 = %s[inoutID].x * %s%s.y - %s[inoutID].y * %s%s.x;\n", kernelName, sc->regIDs[i], separateRegisterStore, kernelName, sc->regIDs[i], separateRegisterStore); - else - sc->tempLen = sprintf(sc->tempStr, " temp_imag0 = %s[inoutID].x * %s%s.y + %s[inoutID].y * %s%s.x;\n", kernelName, sc->regIDs[i], separateRegisterStore, kernelName, sc->regIDs[i], separateRegisterStore); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = temp_real0;\n", sc->regIDs[i]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = temp_imag0;\n", sc->regIDs[i]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - switch (dataType) { - case 0: case 5: case 6: case 110: case 120: case 130: 
case 140: case 142: case 144: - { - if (sc->localSize[0] * (i + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - break; - } - case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: - { - if (sc->localSize[1] * (i + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - break; - } - } - } - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - return res; -} - -static inline VkFFTResult appendKernelConvolution(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeMemory, const char* uintType, uint64_t dataType) { - VkFFTResult res = VKFFT_SUCCESS; - char convTypeLeft[20] = ""; - char convTypeRight[20] = ""; - if ((!strcmp(floatType, "float")) && (strcmp(floatTypeMemory, "float"))) { -#if(VKFFT_BACKEND==0) - sprintf(convTypeLeft, "float("); - sprintf(convTypeRight, ")"); -#elif(VKFFT_BACKEND==1) - sprintf(convTypeLeft, "(float)"); - //sprintf(convTypeRight, ""); -#elif(VKFFT_BACKEND==2) - sprintf(convTypeLeft, "(float)"); - //sprintf(convTypeRight, ""); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sprintf(convTypeLeft, "(float)"); - //sprintf(convTypeRight, ""); -#endif - } - if ((!strcmp(floatType, "double")) && (strcmp(floatTypeMemory, "double"))) { -#if(VKFFT_BACKEND==0) - sprintf(convTypeLeft, "double("); - sprintf(convTypeRight, ")"); -#elif(VKFFT_BACKEND==1) - sprintf(convTypeLeft, "(double)"); - //sprintf(convTypeRight, ""); -#elif(VKFFT_BACKEND==2) - sprintf(convTypeLeft, "(double)"); - //sprintf(convTypeRight, ""); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sprintf(convTypeLeft, "(double)"); - //sprintf(convTypeRight, ""); -#endif - } - - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, 
" + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); - char requestCoordinate[100] = ""; - if (sc->convolutionStep) { - if (sc->matrixConvolution > 1) { - sprintf(requestCoordinate, "0"); - } - } - char index_x[2000] = ""; - char index_y[2000] = ""; - char requestBatch[100] = ""; - char separateRegisterStore[100] = ""; - if (sc->convolutionStep) { - if (sc->numKernels > 1) { - sprintf(requestBatch, "batchID"); - sprintf(separateRegisterStore, "_store"); - } - } - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t j = 0; j < sc->matrixConvolution; j++) { - sc->tempLen = sprintf(sc->tempStr, " %s temp_real%" PRIu64 " = 0;\n", floatType, j); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s temp_imag%" PRIu64 " = 0;\n", floatType, j); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - uint64_t used_registers_read = 1; - switch (dataType) { - case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: - used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - break; - case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: - used_registers_read = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); - break; - } - for (uint64_t i = 0; i < used_registers_read; i++) { - if (i > 0) { - for (uint64_t j = 0; j < sc->matrixConvolution; j++) { - sc->tempLen = sprintf(sc->tempStr, " temp_real%" PRIu64 " = 0;\n", j); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " temp_imag%" PRIu64 " = 0;\n", j); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - switch (dataType) { - case 0: - { - if (sc->fftDim == sc->fft_dim_full) { - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 
";\n", sc->gl_LocalInvocationID_x, i * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, i * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->inputStride[0] > 1) { - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(index_x, "(combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 "", sc->fftDim, sc->inputStride[0], sc->fftDim, sc->inputStride[1]); - uint64_t tempSaveInputOffset = sc->inputOffset; - uint64_t tempSaveInputNumberByteSize = sc->inputNumberByteSize; - sc->inputOffset = sc->kernelOffset; - sc->inputNumberByteSize = sc->kernelNumberByteSize; - res = indexInputVkFFT(sc, uintType, dataType + 1000, index_x, 0, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->inputOffset = tempSaveInputOffset; - sc->inputNumberByteSize = tempSaveInputNumberByteSize; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexInput((combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 "%s%s);\n", sc->fftDim, sc->inputStride[0], sc->fftDim, sc->inputStride[1], requestCoordinate, requestBatch); - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(index_x, "(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 "", sc->fftDim, 
sc->fftDim, sc->inputStride[1]); - uint64_t tempSaveInputOffset = sc->inputOffset; - uint64_t tempSaveInputNumberByteSize = sc->inputNumberByteSize; - sc->inputOffset = sc->kernelOffset; - sc->inputNumberByteSize = sc->kernelNumberByteSize; - res = indexInputVkFFT(sc, uintType, dataType + 1000, index_x, 0, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->inputOffset = tempSaveInputOffset; - sc->inputNumberByteSize = tempSaveInputNumberByteSize; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexInput((combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 "%s%s);\n", sc->fftDim, sc->fftDim, sc->inputStride[1], requestCoordinate, requestBatch); - } - } - else { - if (sc->localSize[0] * (i + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_x, sc->fftDim - sc->localSize[0] * i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(index_x, "%s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ")", sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize); - uint64_t tempSaveInputOffset = sc->inputOffset; - uint64_t tempSaveInputNumberByteSize = sc->inputNumberByteSize; - sc->inputOffset = sc->kernelOffset; - sc->inputNumberByteSize = sc->kernelNumberByteSize; - res = indexInputVkFFT(sc, uintType, dataType + 1000, index_x, 0, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - 
sc->inputOffset = tempSaveInputOffset; - sc->inputNumberByteSize = tempSaveInputNumberByteSize; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexInput(%s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ")%s%s);\n", sc->gl_LocalInvocationID_x, i * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize, requestCoordinate, requestBatch); - } - break; - } - case 1: - { - if (sc->localSize[1] * (i + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s < %" PRIu64 ") {\n", sc->gl_LocalInvocationID_y, sc->fftDim - sc->localSize[1] * i); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x); - sprintf(index_y, "(%s+%" PRIu64 ")+((%s%s)/%" PRIu64 ")%%(%" PRIu64 ")+((%s%s)/%" PRIu64 ")*(%" PRIu64 ")", sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim); - uint64_t tempSaveInputOffset = sc->inputOffset; - uint64_t tempSaveInputNumberByteSize = sc->inputNumberByteSize; - sc->inputOffset = sc->kernelOffset; - sc->inputNumberByteSize = sc->kernelNumberByteSize; - res = indexInputVkFFT(sc, uintType, dataType + 1000, index_x, index_y, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->inputOffset = tempSaveInputOffset; - sc->inputNumberByteSize = 
tempSaveInputNumberByteSize; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexInput((%s%s) %% (%" PRIu64 "), (%s+%" PRIu64 ")+((%s%s)/%" PRIu64 ")%%(%" PRIu64 ")+((%s%s)/%" PRIu64 ")*(%" PRIu64 ")%s%s);\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->gl_LocalInvocationID_y, i * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim, requestCoordinate, requestBatch); - break; - } - } - char kernelName[100] = ""; - sprintf(kernelName, "kernel_obj"); - if ((sc->kernelBlockNum == 1) || (sc->useBluesteinFFT)) { - for (uint64_t j = 0; j < sc->matrixConvolution; j++) { - for (uint64_t l = 0; l < sc->matrixConvolution; l++) { - uint64_t k = 0; - if (sc->symmetricKernel) { - k = (l < j) ? (l * sc->matrixConvolution - l * l + j) : (j * sc->matrixConvolution - j * j + l); - } - else { - k = (j * sc->matrixConvolution + l); - } - if (sc->conjugateConvolution == 0) { - if (l == 0) - sc->tempLen = sprintf(sc->tempStr, " temp_real%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].x%s * %s%s.x - %s%s[inoutID+%" PRIu64 "].y%s * %s%s.y;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore); - else - sc->tempLen = sprintf(sc->tempStr, " temp_real%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].x%s * %s_%" PRIu64 "%s.x - %s%s[inoutID+%" PRIu64 "].y%s * %s_%" PRIu64 "%s.y;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore); - } - else { - if (l == 0) - sc->tempLen = sprintf(sc->tempStr, " temp_real%" PRIu64 " += %s%s[inoutID+%" PRIu64 
"].x%s * %s%s.x + %s%s[inoutID+%" PRIu64 "].y%s * %s%s.y;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore); - else - sc->tempLen = sprintf(sc->tempStr, " temp_real%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].x%s * %s_%" PRIu64 "%s.x + %s%s[inoutID+%" PRIu64 "].y%s * %s_%" PRIu64 "%s.y;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore); - } - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t l = 0; l < sc->matrixConvolution; l++) { - uint64_t k = 0; - if (sc->symmetricKernel) { - k = (l < j) ? (l * sc->matrixConvolution - l * l + j) : (j * sc->matrixConvolution - j * j + l); - } - else { - k = (j * sc->matrixConvolution + l); - } - if (sc->conjugateConvolution == 0) { - if (l == 0) - sc->tempLen = sprintf(sc->tempStr, " temp_imag%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].x%s * %s%s.y + %s%s[inoutID+%" PRIu64 "].y%s * %s%s.x;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore); - else - sc->tempLen = sprintf(sc->tempStr, " temp_imag%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].x%s * %s_%" PRIu64 "%s.y + %s%s[inoutID+%" PRIu64 "].y%s * %s_%" PRIu64 "%s.x;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore); - } - else { - if (sc->conjugateConvolution == 1) { - if (l == 0) - sc->tempLen = sprintf(sc->tempStr, " temp_imag%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].y%s * %s%s.x - %s%s[inoutID+%" PRIu64 
"].x%s * %s%s.y ;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore); - else - sc->tempLen = sprintf(sc->tempStr, " temp_imag%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].y%s * %s_%" PRIu64 "%s.x - %s%s[inoutID+%" PRIu64 "].x%s * %s_%" PRIu64 "%s.y;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore); - } - else { - if (l == 0) - sc->tempLen = sprintf(sc->tempStr, " temp_imag%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].x%s * %s%s.y - %s%s[inoutID+%" PRIu64 "].y%s * %s%s.x;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], separateRegisterStore); - else - sc->tempLen = sprintf(sc->tempStr, " temp_imag%" PRIu64 " += %s%s[inoutID+%" PRIu64 "].x%s * %s_%" PRIu64 "%s.y - %s%s[inoutID+%" PRIu64 "].y%s * %s_%" PRIu64 "%s.x;\n", j, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore, convTypeLeft, kernelName, k * sc->inputStride[3], convTypeRight, sc->regIDs[i], l, separateRegisterStore); - } - } - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - } - } - if (sc->crossPowerSpectrumNormalization) { -#if(VKFFT_BACKEND==0) - sc->tempLen = sprintf(sc->tempStr, " w.x = inversesqrt(temp_real0*temp_real0+temp_imag0*temp_imag0);\n"); -#elif(VKFFT_BACKEND==1) - sc->tempLen = sprintf(sc->tempStr, " w.x = rsqrt(temp_real0*temp_real0+temp_imag0*temp_imag0);\n"); -#elif(VKFFT_BACKEND==2) - sc->tempLen = sprintf(sc->tempStr, " w.x = rsqrt(temp_real0*temp_real0+temp_imag0*temp_imag0);\n"); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sc->tempLen = 
sprintf(sc->tempStr, " w.x = rsqrt(temp_real0*temp_real0+temp_imag0*temp_imag0);\n"); -#endif - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = temp_real0 * w.x;\n", sc->regIDs[i]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = temp_imag0 * w.x;\n", sc->regIDs[i]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = temp_real0;\n", sc->regIDs[i]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = temp_imag0;\n", sc->regIDs[i]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t l = 1; l < sc->matrixConvolution; l++) { - if (sc->crossPowerSpectrumNormalization) { -#if(VKFFT_BACKEND==0) - sc->tempLen = sprintf(sc->tempStr, " w.x = inversesqrt(temp_real%" PRIu64 "*temp_real%" PRIu64 "+temp_imag%" PRIu64 "*temp_imag%" PRIu64 ");\n", l, l, l, l); -#elif(VKFFT_BACKEND==1) - sc->tempLen = sprintf(sc->tempStr, " w.x = rsqrt(temp_real%" PRIu64 "*temp_real%" PRIu64 "+temp_imag%" PRIu64 "*temp_imag%" PRIu64 ");\n", l, l, l, l); -#elif(VKFFT_BACKEND==2) - sc->tempLen = sprintf(sc->tempStr, " w.x = rsqrt(temp_real%" PRIu64 "*temp_real%" PRIu64 "+temp_imag%" PRIu64 "*temp_imag%" PRIu64 ");\n", l, l, l, l); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sc->tempLen = sprintf(sc->tempStr, " w.x = rsqrt(temp_real%" PRIu64 "*temp_real%" PRIu64 "+temp_imag%" PRIu64 "*temp_imag%" PRIu64 ");\n", l, l, l, l); -#endif - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s_%" PRIu64 ".x = temp_real%" PRIu64 " * w.x;\n", sc->regIDs[i], l, l); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s_%" PRIu64 ".y = temp_imag%" PRIu64 " * w.x;\n", sc->regIDs[i], l, l); - res = VkAppendLine(sc); 
- if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s_%" PRIu64 ".x = temp_real%" PRIu64 ";\n", sc->regIDs[i], l, l); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s_%" PRIu64 ".y = temp_imag%" PRIu64 ";\n", sc->regIDs[i], l, l); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - else { - for (uint64_t j = 0; j < sc->matrixConvolution; j++) { - - sc->tempLen = sprintf(sc->tempStr, " %s temp_real%" PRIu64 " = 0;\n", floatType, j); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t l = 0; l < sc->matrixConvolution; l++) { - uint64_t k = 0; - if (sc->symmetricKernel) { - k = (l < j) ? (l * sc->matrixConvolution - l * l + j) : (j * sc->matrixConvolution - j * j + l); - } - else { - k = (j * sc->matrixConvolution + l); - } - if (l == 0) - sc->tempLen = sprintf(sc->tempStr, " temp_real%" PRIu64 " += %skernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].x%s * %s%s.x - %skernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].y%s * %s%s.y;\n", j, convTypeLeft, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, convTypeRight, sc->regIDs[i], separateRegisterStore, convTypeLeft, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, convTypeRight, sc->regIDs[i], separateRegisterStore); - else - sc->tempLen = sprintf(sc->tempStr, " temp_real%" PRIu64 " += %skernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].x%s * %s_%" PRIu64 "%s.x - %skernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].y%s * %s_%" PRIu64 "%s.y;\n", j, convTypeLeft, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, convTypeRight, sc->regIDs[i], l, 
separateRegisterStore, convTypeLeft, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, convTypeRight, sc->regIDs[i], l, separateRegisterStore); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - } - - sc->tempLen = sprintf(sc->tempStr, " %s temp_imag%" PRIu64 " = 0;\n", floatType, j); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t l = 0; l < sc->matrixConvolution; l++) { - uint64_t k = 0; - if (sc->symmetricKernel) { - k = (l < j) ? (l * sc->matrixConvolution - l * l + j) : (j * sc->matrixConvolution - j * j + l); - } - else { - k = (j * sc->matrixConvolution + l); - } - if (l == 0) - sc->tempLen = sprintf(sc->tempStr, " temp_imag%" PRIu64 " += %skernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].x%s * %s%s.y + %skernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].y%s * %s%s.x;\n", j, convTypeLeft, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, convTypeRight, sc->regIDs[i], separateRegisterStore, convTypeLeft, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, convTypeRight, sc->regIDs[i], separateRegisterStore); - else - sc->tempLen = sprintf(sc->tempStr, " temp_imag%" PRIu64 " += %skernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].x%s * %s_%" PRIu64 "%s.y + %skernelBlocks[(inoutID+%" PRIu64 ")/%" PRIu64 "].%s[(inoutID+%" PRIu64 ") %% %" PRIu64 "].y%s * %s_%" PRIu64 "%s.x;\n", j, convTypeLeft, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, convTypeRight, sc->regIDs[i], l, separateRegisterStore, convTypeLeft, k * sc->inputStride[3], sc->kernelBlockSize, kernelName, k * sc->inputStride[3], sc->kernelBlockSize, convTypeRight, sc->regIDs[i], l, separateRegisterStore); - res = VkAppendLine(sc); 
- if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, " %s.x = temp_real0;\n", sc->regIDs[i]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = temp_imag0;\n", sc->regIDs[i]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t l = 1; l < sc->matrixConvolution; l++) { - sc->tempLen = sprintf(sc->tempStr, " %s_%" PRIu64 ".x = temp_real%" PRIu64 ";\n", sc->regIDs[i], l, l); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s_%" PRIu64 ".y = temp_imag%" PRIu64 ";\n", sc->regIDs[i], l, l); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - switch (dataType) { - case 0: case 5: case 6: case 110: case 120: case 130: case 140: case 142: case 144: - { - if (sc->localSize[0] * (i + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - break; - } - case 1: case 2: case 111: case 121: case 131: case 141: case 143: case 145: - { - if (sc->localSize[1] * (i + 1) > sc->fftDim) { - sc->tempLen = sprintf(sc->tempStr, "\ - }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - break; - } - } - } - res = VkAppendLineFromInput(sc, sc->disableThreadsEnd); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEnd(sc); - if (res != VKFFT_SUCCESS) return res; - return res; -} -static inline VkFFTResult setWriteFromRegisters(VkFFTSpecializationConstantsLayout* sc, uint64_t writeType) { - VkFFTResult res = VKFFT_SUCCESS; - switch (writeType) { - case 0: //single_c2c - { - if ((sc->localSize[1] > 1) || (sc->localSize[0] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] / sc->stageRadix[sc->numStages - 1]) > sc->fftDim) || (sc->rader_generator[sc->numStages - 1] > 0)) { - sc->writeFromRegisters = 0; - } - else - 
sc->writeFromRegisters = 1; - break; - } - case 1: //grouped_c2c - { - if ((sc->localSize[1] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] / sc->stageRadix[sc->numStages - 1]) > sc->fftDim) || (sc->rader_generator[sc->numStages - 1] > 0)) { - sc->writeFromRegisters = 0; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - } - else - sc->writeFromRegisters = 1; - break; - } - case 2: //single_c2c_strided - { - if ((sc->localSize[1] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] / sc->stageRadix[sc->numStages - 1]) > sc->fftDim) || (sc->rader_generator[sc->numStages - 1] > 0)) { - sc->writeFromRegisters = 0; - } - else - sc->writeFromRegisters = 1; - break; - } - case 5://single_r2c - { - sc->writeFromRegisters = 0; - break; - } - case 6: //single_c2r - { - if ((sc->axisSwapped) || (sc->localSize[1] > 1) || (sc->localSize[0] * sc->stageRadix[sc->numStages - 1] * (sc->registers_per_thread_per_radix[sc->stageRadix[sc->numStages - 1]] / sc->stageRadix[sc->numStages - 1]) > sc->fftDim) || (sc->rader_generator[sc->numStages - 1] > 0)) { - sc->writeFromRegisters = 0; - } - else - sc->writeFromRegisters = 1; - break; - } - case 110: case 111: case 120: case 121: case 130: case 131: case 140: case 141: case 142: case 143: case 144: case 145: - { - sc->writeFromRegisters = 0; - break; - } - } - return res; -} -static inline VkFFTResult appendWriteDataVkFFT(VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeMemory, const char* uintType, uint64_t writeType) { - VkFFTResult res = VKFFT_SUCCESS; - double double_PI = 3.1415926535897932384626433832795; - char vecType[30]; - char outputsStruct[20] = ""; - char LFending[4] = ""; - if (!strcmp(floatType, "float")) sprintf(LFending, "f"); -#if(VKFFT_BACKEND==0) - if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); - if (!strcmp(floatType, "double")) 
sprintf(vecType, "dvec2"); - if (sc->outputBufferBlockNum == 1) - sprintf(outputsStruct, "outputs"); - else - sprintf(outputsStruct, ".outputs"); - if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); - char cosDef[20] = "cos"; - char sinDef[20] = "sin"; -#elif(VKFFT_BACKEND==1) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - sprintf(outputsStruct, "outputs"); - if (!strcmp(floatType, "double")) sprintf(LFending, "l"); - char cosDef[20] = "__cosf"; - char sinDef[20] = "__sinf"; -#elif(VKFFT_BACKEND==2) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - sprintf(outputsStruct, "outputs"); - if (!strcmp(floatType, "double")) sprintf(LFending, "l"); - char cosDef[20] = "__cosf"; - char sinDef[20] = "__sinf"; -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - sprintf(outputsStruct, "outputs"); - //if (!strcmp(floatType, "double")) sprintf(LFending, "l"); - char cosDef[20] = "native_cos"; - char sinDef[20] = "native_sin"; -#endif - char convTypeLeft[20] = ""; - char convTypeRight[20] = ""; - if ((!strcmp(floatTypeMemory, "half")) && (strcmp(floatType, "half"))) { - if ((writeType == 6) || (writeType == 110) || (writeType == 111) || (writeType == 120) || (writeType == 121) || (writeType == 130) || (writeType == 131) || (writeType == 140) || (writeType == 141) || (writeType == 142) || (writeType == 143) || (writeType == 144) || (writeType == 145)) { - sprintf(convTypeLeft, "float16_t("); - sprintf(convTypeRight, ")"); - } - else { - sprintf(convTypeLeft, "f16vec2("); - sprintf(convTypeRight, ")"); - } - } - if ((!strcmp(floatTypeMemory, "float")) && (strcmp(floatType, "float"))) { - if ((writeType == 6) || (writeType == 110) || (writeType == 111) || (writeType == 120) || (writeType == 
121) || (writeType == 130) || (writeType == 131) || (writeType == 140) || (writeType == 141) || (writeType == 142) || (writeType == 143) || (writeType == 144) || (writeType == 145)) { -#if(VKFFT_BACKEND==0) - sprintf(convTypeLeft, "float("); - sprintf(convTypeRight, ")"); -#elif(VKFFT_BACKEND==1) - sprintf(convTypeLeft, "(float)"); - //sprintf(convTypeRight, ""); -#elif(VKFFT_BACKEND==2) - sprintf(convTypeLeft, "(float)"); - //sprintf(convTypeRight, ""); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sprintf(convTypeLeft, "(float)"); - //sprintf(convTypeRight, ""); -#endif - } - else { -#if(VKFFT_BACKEND==0) - sprintf(convTypeLeft, "vec2("); - sprintf(convTypeRight, ")"); -#elif(VKFFT_BACKEND==1) - sprintf(convTypeLeft, "conv_float2("); - sprintf(convTypeRight, ")"); -#elif(VKFFT_BACKEND==2) - sprintf(convTypeLeft, "conv_float2("); - sprintf(convTypeRight, ")"); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sprintf(convTypeLeft, "conv_float2("); - sprintf(convTypeRight, ")"); -#endif - } - } - if ((!strcmp(floatTypeMemory, "double")) && (strcmp(floatType, "double"))) { - if ((writeType == 6) || (writeType == 110) || (writeType == 111) || (writeType == 120) || (writeType == 121) || (writeType == 130) || (writeType == 131) || (writeType == 140) || (writeType == 141) || (writeType == 142) || (writeType == 143) || (writeType == 144) || (writeType == 145)) { -#if(VKFFT_BACKEND==0) - sprintf(convTypeLeft, "double("); - sprintf(convTypeRight, ")"); -#elif(VKFFT_BACKEND==1) - sprintf(convTypeLeft, "(double)"); - //sprintf(convTypeRight, ""); -#elif(VKFFT_BACKEND==2) - sprintf(convTypeLeft, "(double)"); - //sprintf(convTypeRight, ""); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sprintf(convTypeLeft, "(double)"); - //sprintf(convTypeRight, ""); -#endif - } - else { -#if(VKFFT_BACKEND==0) - sprintf(convTypeLeft, "dvec2("); - sprintf(convTypeRight, ")"); -#elif(VKFFT_BACKEND==1) - sprintf(convTypeLeft, "conv_double2("); - sprintf(convTypeRight, ")"); 
-#elif(VKFFT_BACKEND==2) - sprintf(convTypeLeft, "conv_double2("); - sprintf(convTypeRight, ")"); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sprintf(convTypeLeft, "conv_double2("); - sprintf(convTypeRight, ")"); -#endif - } - } - - char index_x[2000] = ""; - char index_y[2000] = ""; - char requestCoordinate[100] = ""; - if (sc->convolutionStep) { - if (sc->matrixConvolution > 1) { - sprintf(requestCoordinate, "coordinate"); - } - } - char requestBatch[100] = ""; - if (sc->convolutionStep) { - if (sc->numKernels > 1) { - sprintf(requestBatch, "batchID");//if one buffer - multiple kernel convolution - } - } - switch (writeType) { - case 0: //single_c2c - { - if (!sc->writeFromRegisters) { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - } - //res = appendZeropadStart(sc); - //if (res != VKFFT_SUCCESS) return res; - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX "); - char shiftY[500] = ""; - if (sc->axisSwapped) { - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_x); - } - else { - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); - } - - char shiftY2[100] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY "); - if (sc->fftDim < sc->fft_dim_full) { - if (sc->axisSwapped) { - if (!sc->reorderFourStep) { - sc->tempLen = sprintf(sc->tempStr, " if((%s+%" PRIu64 "*%s)< numActiveThreads) {\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " if (((%s + %" PRIu64 " * %s) %% %" PRIu64 " + ((%s%s) / %" PRIu64 ")*%" PRIu64 " < %" PRIu64 ")){\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, sc->localSize[0], sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / 
sc->fftDim, sc->localSize[0], sc->fft_dim_full / sc->firstStageStartSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!sc->reorderFourStep) { - res = VkAppendLineFromInput(sc, sc->disableThreadsStart); - } - else { - sc->tempLen = sprintf(sc->tempStr, " if (((%s + %" PRIu64 " * %s) %% %" PRIu64 " + ((%s%s) / %" PRIu64 ")*%" PRIu64 " < %" PRIu64 ")){\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, sc->localSize[1], sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1], sc->fft_dim_full / sc->firstStageStartSize); - res = VkAppendLine(sc); - } - if (res != VKFFT_SUCCESS) return res; - } - - } - else { - sc->tempLen = sprintf(sc->tempStr, " { \n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - uint64_t used_registers_write = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - if (sc->registerBoost > 1) used_registers_write /= sc->registerBoost; - if (sc->reorderFourStep) { - if (sc->fftDim == sc->fft_dim_full) { - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_write; i++) { - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_write) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->outputStride[0] > 1) - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->outputStride[0], sc->fftDim, sc->outputStride[1]); - else - 
sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){\n", sc->fftDim, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[0], sc->size[sc->axis_id + 1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){\n", sc->fftDim, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[1], sc->size[sc->axis_id + 1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->zeropadBluestein[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], 
sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 0); - if (res != VKFFT_SUCCESS) return res; - if (sc->writeFromRegisters) { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->axisSwapped) { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% 
%" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->axisSwapped) { - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - } - else { - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_write; i++) { - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + 
%" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_write) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " inoutID = combinedID %% %" PRIu64 " + ((%s%s) / %" PRIu64 ")*%" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ")+ ((%s%s) %% %" PRIu64 ") * %" PRIu64 ";\n", sc->localSize[0], sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0], sc->localSize[0], sc->fft_dim_full / sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->firstStageStartSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s%s)/%" PRIu64 "+ (combinedID * %" PRIu64 ")+ ((%s%s) %% %" PRIu64 ") * %" PRIu64 ";\n", sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->firstStageStartSize); - else - sc->tempLen = sprintf(sc->tempStr, " inoutID = combinedID %% %" PRIu64 " + ((%s%s) / %" PRIu64 ")*%" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ")+ ((%s%s) %% %" PRIu64 ") * %" PRIu64 ";\n", sc->localSize[1], sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1], sc->localSize[1], sc->fft_dim_full / sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->firstStageStartSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" 
PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_write[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 0); - if (res != VKFFT_SUCCESS) return res; - if (sc->writeFromRegisters) { - //not used - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->axisSwapped) { - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %s)+(combinedID/%s)*sharedStride]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->gl_WorkGroupSize_x, sc->gl_WorkGroupSize_x, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = 
%ssdata[(combinedID %% %s)+(combinedID/%s)*sharedStride]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->gl_WorkGroupSize_x, sc->gl_WorkGroupSize_x, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %s)*sharedStride+combinedID/%s]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->gl_WorkGroupSize_y, sc->gl_WorkGroupSize_y, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %s)*sharedStride+combinedID/%s]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->gl_WorkGroupSize_y, sc->gl_WorkGroupSize_y, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - /* - if (sc->outputBufferBlockNum == 1) - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[indexOutput(inoutID%s%s)] = %stemp_%" PRIu64 "%s;\n", 
requestCoordinate, requestBatch, convTypeLeft, i, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " %s[indexOutput(inoutID%s%s)] = %stemp_%" PRIu64 "%s;\n", requestCoordinate, requestBatch, convTypeLeft, i, convTypeRight); - else - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[indexOutput(inoutID%s%s) / %" PRIu64 "]%s[indexOutput(inoutID%s%s) %% %" PRIu64 "] = %stemp_%" PRIu64 "%s;\n", requestCoordinate, requestBatch, sc->outputBufferBlockSize, outputsStruct, requestCoordinate, requestBatch, sc->outputBufferBlockSize, convTypeLeft, i, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[indexOutput(inoutID%s%s) / %" PRIu64 "]%s[indexOutput(inoutID%s%s) %% %" PRIu64 "] = %stemp_%" PRIu64 "%s;\n", requestCoordinate, requestBatch, sc->outputBufferBlockSize, outputsStruct, requestCoordinate, requestBatch, sc->outputBufferBlockSize, convTypeLeft, i, convTypeRight); - */ - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - } - else { - if (sc->fftDim == sc->fft_dim_full) { - if (sc->registerBoost > 1) used_registers_write /= sc->registerBoost; - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_write; i++) { - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_write) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->outputStride[0] > 1) - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", 
sc->fftDim, sc->outputStride[0], sc->fftDim, sc->outputStride[1]); - else - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", sc->fftDim, sc->gl_WorkGroupID_y, sc->localSize[0], sc->size[sc->axis_id + 1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", sc->fftDim, sc->gl_WorkGroupID_y, sc->localSize[1], sc->size[sc->axis_id + 1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->zeropadBluestein[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], 
sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 0); - if (res != VKFFT_SUCCESS) return res; - if (sc->writeFromRegisters) { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->axisSwapped) { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = 
sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->axisSwapped) { - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - } - else { - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_write; i++) { - /*if (sc->localSize[1] == 1) 
- sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_write) * sc->localSize[0]); - else { - if (!sc->axisSwapped) - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 " * numActiveThreads;\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_write)); - } - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res;*/ - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ")+(combinedID / %" PRIu64 ") * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");", sc->fftDim, sc->fftDim, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " inoutID = %s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");", sc->gl_LocalInvocationID_x, (i + k * used_registers_write) * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", 
sc->fft_dim_full, sc->fft_zeropad_left_write[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexOutput(%s+i*%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ")%s%s);\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize, requestCoordinate, requestBatch); - res = appendZeropadStartReadWriteStage(sc, 0); - if (res != VKFFT_SUCCESS) return res; - if (sc->writeFromRegisters) { - //not used - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID]=%s%s%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s%s%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->axisSwapped) { - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim - 
(i + k * used_registers_write) * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID]=%ssdata[%s + sharedStride*(%s + %" PRIu64 ")]%s;\n", outputsStruct, convTypeLeft, sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[1], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %ssdata[%s + sharedStride*(%s + %" PRIu64 ")]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[1], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_x, sc->fftDim - (i + k * used_registers_write) * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID]=%ssdata[sharedStride*%s + (%s + %" PRIu64 ")]%s;\n", outputsStruct, convTypeLeft, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * used_registers_write) * sc->localSize[0], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %ssdata[sharedStride*%s + (%s + %" PRIu64 ")]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, 
sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * used_registers_write) * sc->localSize[0], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - break; - } - case 1: //grouped_c2c - { - if (!sc->writeFromRegisters) { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - } - //res = appendZeropadStart(sc); - //if (res != VKFFT_SUCCESS) return res; - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); - sc->tempLen = sprintf(sc->tempStr, " if (((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ") < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim * sc->stageStartSize, sc->size[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - uint64_t used_registers_write = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); - if (sc->registerBoost > 1) used_registers_write /= sc->registerBoost; - if ((sc->reorderFourStep) && (sc->stageStartSize == 1)) { - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_write; i++) { - sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s + %" PRIu64 ") * (%" PRIu64 ") + (((%s%s) / %" PRIu64 ") %% 
(%" PRIu64 ")) * (%" PRIu64 ") + ((%s%s) / %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[1], sc->fft_dim_full / sc->fftDim, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->firstStageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * (sc->firstStageStartSize / sc->fftDim)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_write[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x); - res = indexOutputVkFFT(sc, uintType, writeType, index_x, sc->inoutID, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 0); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + k * used_registers_write) * sc->localSize[1] >= (sc->fftDim)) { - sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim - (i + k * used_registers_write) * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->writeFromRegisters) { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 
"]%s[%s %% %" PRIu64 "] = %s%s%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[%s*(%s+%" PRIu64 ") + %s]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[%s*(%s+%" PRIu64 ") + %s]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - } - if ((1 + i + k * used_registers_write) * sc->localSize[1] >= (sc->fftDim)) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - } - } - } - else { - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_write; i++) { - if (sc->zeropadBluestein[1]) { - sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s + %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[1], sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, 
sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->stageStartSize * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropad[1]) { - if (!sc->zeropadBluestein[1]) { - sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s + %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[1], sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->stageStartSize * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_write[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x); - sprintf(index_y, "%" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ")", sc->stageStartSize, sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->stageStartSize * sc->fftDim); - res = indexOutputVkFFT(sc, uintType, writeType, index_x, index_y, 
requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 0); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexOutput((%s%s) %% (%" PRIu64 "), %" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ")%s%s);\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->stageStartSize * sc->fftDim, requestCoordinate, requestBatch); - if ((1 + i + k * used_registers_write) * sc->localSize[1] >= (sc->fftDim)) { - sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim - (i + k * used_registers_write) * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->writeFromRegisters) { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s%s%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s%s%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %ssdata[%s*(%s+%" PRIu64 ") + %s]%s;\n", outputsStruct, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[1], 
sc->gl_LocalInvocationID_x, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %ssdata[%s*(%s+%" PRIu64 ") + %s]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_write) * sc->localSize[1] >= (sc->fftDim)) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - break; - - } - case 2: //single_c2c_strided - { - if (!sc->writeFromRegisters) { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - } - //res = appendZeropadStart(sc); - //if (res != VKFFT_SUCCESS) return res; - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); - sc->tempLen = sprintf(sc->tempStr, " if (((%s%s) / %" PRIu64 ") * (%" PRIu64 ") < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize * sc->fftDim, sc->fft_dim_full); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - uint64_t used_registers_write = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); - if (sc->registerBoost > 1) used_registers_write /= sc->registerBoost; - 
for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_write; i++) { - sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s%s) %% (%" PRIu64 ") + %" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") * (%" PRIu64 ");\n", sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize, sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->stageStartSize, sc->stageStartSize * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_write[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 0); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + k * used_registers_write) * sc->localSize[1] >= (sc->fftDim)) { - sc->tempLen = sprintf(sc->tempStr, " if(%s < %" PRIu64 "){\n", sc->gl_LocalInvocationID_y, sc->fftDim - (i + k * used_registers_write) * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->writeFromRegisters) { - if 
(sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s%s%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s%s%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %ssdata[%s*(%s+%" PRIu64 ") + %s]%s;\n", outputsStruct, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %ssdata[%s*(%s+%" PRIu64 ") + %s]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_write) * sc->localSize[1] >= (sc->fftDim)) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - break; - - } - case 
5://single_r2c - { - if (!sc->writeFromRegisters) { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - } - //res = appendZeropadStart(sc); - //if (res != VKFFT_SUCCESS) return res; - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX "); - char shiftY[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY "); - uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; - if (sc->reorderFourStep) { - //Not implemented - } - else { - //appendBarrierVkFFT(sc, 1); - //appendZeropadStart(sc); - if (sc->fftDim == sc->fft_dim_full) { - if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_zeropad_Bluestein_left_write[sc->axis_id]; - for (uint64_t k = 0; k < sc->registerBoost; k++) { - if (sc->mergeSequencesR2C) { - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s==0)\n\ - {\n\ - sdata[%s + %" PRIu64 "* sharedStride] = sdata[%s];\n\ - }\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->fftDim, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //res = appendZeropadEnd(sc); - //if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - //res = appendZeropadStart(sc); - //if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s==0)\n\ - {\n\ - sdata[%s * sharedStride + %" PRIu64 "] = sdata[%s * sharedStride];\n\ - }\n", sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, sc->fftDim, sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //res = appendZeropadEnd(sc); - //if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - //res = appendZeropadStart(sc); - //if (res != 
VKFFT_SUCCESS) return res; - } - } - uint64_t num_out = (sc->axisSwapped) ? (uint64_t)ceil(mult * (sc->fftDim / 2 + 1) / (double)sc->localSize[1]) : (uint64_t)ceil(mult * (sc->fftDim / 2 + 1) / (double)sc->localSize[0]); - //num_out = (uint64_t)ceil(num_out / (double)used_registers_write); - for (uint64_t i = 0; i < num_out; i++) { - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_out) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_out) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (!sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");", sc->inoutID, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");", sc->inoutID, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - if (sc->axisSwapped) { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){", mult * (sc->fftDim / 2 + 1), sc->gl_WorkGroupID_y, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > mult * (sc->fftDim / 2 + 1) * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){", mult * (sc->fftDim / 2 + 1) * sc->localSize[0]); - res = 
VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){", mult * (sc->fftDim / 2 + 1), sc->gl_WorkGroupID_y, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > mult * (sc->fftDim / 2 + 1) * sc->localSize[1]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){", mult * (sc->fftDim / 2 + 1) * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 0); - if (res != VKFFT_SUCCESS) return res; - if (sc->writeFromRegisters) { - //not working yet - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s%s%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s%s%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * 
sc->registers_per_thread], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->mergeSequencesR2C) { - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, "{ %s a = sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")]; %s b = sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")];\n", vecType, sc->fftDim / 2 + 1, 2 * (sc->fftDim / 2 + 1), vecType, sc->fftDim, sc->fftDim / 2 + 1, 2 * (sc->fftDim / 2 + 1)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*((combinedID / %" PRIu64 ") %% 2 == 0 ? a.x+b.x : a.y+b.y);\n", sc->regIDs[0], LFending, sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*((combinedID / %" PRIu64 ") %% 2 == 0 ? a.y-b.y : -a.x+b.x);\n", sc->regIDs[0], LFending, sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "}\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "{ %s a = sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride]; %s b = sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride];\n", vecType, sc->fftDim / 2 + 1, 2 * (sc->fftDim / 2 + 1), vecType, sc->fftDim, sc->fftDim / 2 + 1, 2 * 
(sc->fftDim / 2 + 1)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*((combinedID / %" PRIu64 ") %% 2 == 0 ? a.x+b.x : a.y+b.y);\n", sc->regIDs[0], LFending, sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*((combinedID / %" PRIu64 ") %% 2 == 0 ? a.y-b.y : -a.x+b.x);\n", sc->regIDs[0], LFending, sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "}\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!sc->axisSwapped) { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride]%s;\n", outputsStruct, convTypeLeft, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %ssdata[(combinedID %% %" PRIu64 ") * 
sharedStride + (combinedID / %" PRIu64 ")]%s;\n", outputsStruct, convTypeLeft, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->axisSwapped) { - if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > mult * (sc->fftDim / 2 + 1) * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > mult * (sc->fftDim / 2 + 1) * sc->localSize[1]) - { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->axisSwapped) { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_dim_full; - } - else { - - } - /*sc->tempLen = sprintf(sc->tempStr, "\ -if (%s==%" PRIu64 ") \n\ -{\n", sc->gl_LocalInvocationID_x, sc->localSize[0] - 1); - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - 
sprintf(index_x, "%" PRIu64 "", sc->fftDim / 2); - sprintf(index_y, "%s%s", sc->gl_GlobalInvocationID_y, shiftY); - indexInputVkFFT(sc, uintType, writeType, index_x, index_y, requestCoordinate, requestBatch); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexInput(2 * (%s%s), %" PRIu64 ");\n", sc->gl_GlobalInvocationID_y, shiftY, sc->inputStride[2] / (sc->inputStride[1] + 2)); - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID]=%ssdata[(%" PRIu64 " + %s * sharedStride)]%s;\n", outputsStruct, convTypeLeft,sc->fftDim / 2, sc->gl_LocalInvocationID_y, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]=%ssdata[(%" PRIu64 " + %s * sharedStride)]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim / 2, sc->gl_LocalInvocationID_y, convTypeRight); - - VkAppendLine(sc, " }\n");*/ - } - break; - } - case 6: //single_c2r - { - char shiftY[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY * %" PRIu64 "", sc->localSize[1]); - - if (!sc->writeFromRegisters) { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - } - uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; - uint64_t used_registers_write = (sc->axisSwapped) ? 
(uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - if (sc->registerBoost > 1) used_registers_write /= sc->registerBoost; - //res = appendZeropadStart(sc); - //if (res != VKFFT_SUCCESS) return res; - if (sc->reorderFourStep) { - //Not implemented - } - else { - if (sc->fftDim == sc->fft_dim_full) { - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_write; i++) { - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_write) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->outputStride[0] > 1) - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->outputStride[0], sc->fftDim, mult * sc->outputStride[1]); - else - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, mult * sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->axisSwapped) { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, 
" if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, shiftY, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->zeropadBluestein[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((combinedID %% %" PRIu64 ") < %" PRIu64 "){\n", sc->fft_dim_full, sc->fft_zeropad_Bluestein_left_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 0); - if (res != VKFFT_SUCCESS) return res; - if (sc->writeFromRegisters) { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = 
%s%s.x%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[i], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s.x%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->mergeSequencesR2C) { - sc->tempLen = sprintf(sc->tempStr, " %s = %s + %" PRIu64 ";", sc->inoutID, sc->inoutID, sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s.y%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[i], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s.y%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (sc->axisSwapped) { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->mergeSequencesR2C) { - sc->tempLen = sprintf(sc->tempStr, " %s = %s + %" PRIu64 ";", sc->inoutID, sc->inoutID, sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != 
VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].y%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride+ (combinedID / %" PRIu64 ")].y%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->mergeSequencesR2C) { - sc->tempLen = sprintf(sc->tempStr, " %s = %s + %" PRIu64 ";", sc->inoutID, sc->inoutID, sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * 
sharedStride].y%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->axisSwapped) { - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - } - else { - - } - } - - break; - } - case 110://DCT-I nonstrided - { - if (!sc->writeFromRegisters) { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - } - //res = appendZeropadStart(sc); - //if (res != VKFFT_SUCCESS) return res; - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX "); - char shiftY[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY*%s ", 
sc->gl_WorkGroupSize_y); - char shiftY2[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY2, " + consts.workGroupShiftY "); - uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; - if (sc->reorderFourStep) { - //Not implemented - } - else { - //appendBarrierVkFFT(sc, 1); - //appendZeropadStart(sc); - if (sc->fftDim == sc->fft_dim_full) { - if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_zeropad_Bluestein_left_write[sc->axis_id]; - sc->fftDim = (sc->fftDim + 2) / 2; - for (uint64_t k = 0; k < sc->registerBoost; k++) { - if (sc->mergeSequencesR2C) { - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s==0)\n\ - {\n\ - sdata[%s + %" PRIu64 "* sharedStride] = sdata[%s];\n\ - }\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->fftDim, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //res = appendZeropadEnd(sc); - //if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - //res = appendZeropadStart(sc); - //if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s==0)\n\ - {\n\ - sdata[%s * sharedStride + %" PRIu64 "] = sdata[%s * sharedStride];\n\ - }\n", sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, sc->fftDim, sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //res = appendZeropadEnd(sc); - //if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - //res = appendZeropadStart(sc); - //if (res != VKFFT_SUCCESS) return res; - } - } - uint64_t num_out = (sc->axisSwapped) ? 
(uint64_t)ceil((sc->fftDim) / (double)sc->localSize[1]) : (uint64_t)ceil((sc->fftDim) / (double)sc->localSize[0]); - //num_out = (uint64_t)ceil(num_out / (double)used_registers_write); - for (uint64_t i = 0; i < num_out; i++) { - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_out) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_out) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (!sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim, mult * sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim, mult * sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->axisSwapped) { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", (sc->fftDim), sc->gl_WorkGroupID_y, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim) * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim) * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { - sc->tempLen 
= sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", (sc->fftDim), sc->gl_WorkGroupID_y, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim) * sc->localSize[1]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim) * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 0); - if (res != VKFFT_SUCCESS) return res; - if (sc->mergeSequencesR2C) { - if (sc->axisSwapped) { - - sc->tempLen = sprintf(sc->tempStr, " %s = (sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")]);\n", sc->regIDs[0], sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s(%s.x)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s(%s.x)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, 
convTypeLeft, sc->regIDs[0], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = %s(%s.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[0], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = %s(%s.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = (sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride]);\n", sc->regIDs[0], sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s(%s.x)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s(%s.x)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = %s(%s.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[0], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = %s(%s.y)%s;\n", sc->inoutID, sc->outputStride[1], 
sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - else { - if (!sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(sdata[sdataID].x) %s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(sdata[sdataID].x)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = 
VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->axisSwapped) { - if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim) * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim) * sc->localSize[1]) - { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->axisSwapped) { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - sc->fftDim = 2 * sc->fftDim - 2; - if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_dim_full; - } - else { - - } - } - break; - } - case 111://DCT-II strided - { - if (!sc->writeFromRegisters) { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - } - //res = appendZeropadStart(sc); - //if (res != VKFFT_SUCCESS) return res; - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX*%s ", sc->gl_WorkGroupSize_x); - char shiftY[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); - char shiftY2[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY2, " + consts.workGroupShiftY "); - uint64_t mult = (sc->mergeSequencesR2C) ? 
2 : 1; - if (sc->reorderFourStep) { - //Not implemented - } - else { - //appendBarrierVkFFT(sc, 1); - //appendZeropadStart(sc); - if (sc->fftDim == sc->fft_dim_full) { - if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_zeropad_Bluestein_left_write[sc->axis_id]; - sc->fftDim = (sc->fftDim + 2) / 2; - for (uint64_t k = 0; k < sc->registerBoost; k++) { - if (sc->mergeSequencesR2C) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s==0)\n\ - {\n\ - sdata[%s + %" PRIu64 "* sharedStride] = sdata[%s];\n\ - }\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->fftDim, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //res = appendZeropadEnd(sc); - //if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - //res = appendZeropadStart(sc); - //if (res != VKFFT_SUCCESS) return res; - } - uint64_t num_out = (uint64_t)ceil(mult * (sc->fftDim) / (double)sc->localSize[1]); - //num_out = (uint64_t)ceil(num_out / (double)used_registers_write); - for (uint64_t i = 0; i < num_out; i++) { - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_out) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " %s = %s%s + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->gl_GlobalInvocationID_x, shiftX, sc->localSize[0], sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->size[0] % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID %% %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", sc->localSize[0], sc->gl_WorkGroupID_x, sc->localSize[0], sc->size[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * num_out) * sc->localSize[0] * 
sc->localSize[1] > (sc->fftDim) * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim) * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if(((combinedID/%" PRIu64 ") %% %" PRIu64 " < %" PRIu64 ")||((combinedID/%" PRIu64 ") %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->localSize[0], sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->localSize[0], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 0); - if (res != VKFFT_SUCCESS) return res; - - if (sc->mergeSequencesR2C) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x+sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y-sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = 
%s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y+sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y);\n", sc->regIDs[1], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x+sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x);\n", sc->regIDs[1], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = 
sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " > 0){\n", sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " - combinedID / %" PRIu64 ") + %s%s * %" PRIu64 ";\n", sc->inoutID, sc->fftDim, sc->localSize[0], sc->gl_GlobalInvocationID_x, shiftX, 2 * sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->localSize[0], sc->localSize[0]); - res = VkAppendLine(sc); - if (res != 
VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(sdata[sdataID].x)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim) * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->size[0] % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - sc->fftDim = 2 * sc->fftDim - 2; - if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_dim_full; - } - else { - - } - } - break; - } - case 120://DCT-II nonstrided - { - if (!sc->writeFromRegisters) { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - } - //res = appendZeropadStart(sc); - //if (res != VKFFT_SUCCESS) return res; - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX "); - char shiftY[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); - char shiftY2[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY2, " + consts.workGroupShiftY "); - uint64_t mult = (sc->mergeSequencesR2C) ? 
2 : 1; - if (sc->reorderFourStep) { - //Not implemented - } - else { - //appendBarrierVkFFT(sc, 1); - //appendZeropadStart(sc); - if (sc->fftDim == sc->fft_dim_full) { - if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_zeropad_Bluestein_left_write[sc->axis_id]; - for (uint64_t k = 0; k < sc->registerBoost; k++) { - if (sc->mergeSequencesR2C) { - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s==0)\n\ - {\n\ - sdata[%s + %" PRIu64 "* sharedStride] = sdata[%s];\n\ - }\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->fftDim, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //res = appendZeropadEnd(sc); - //if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - //res = appendZeropadStart(sc); - //if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s==0)\n\ - {\n\ - sdata[%s * sharedStride + %" PRIu64 "] = sdata[%s * sharedStride];\n\ - }\n", sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, sc->fftDim, sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //res = appendZeropadEnd(sc); - //if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - //res = appendZeropadStart(sc); - //if (res != VKFFT_SUCCESS) return res; - } - } - uint64_t num_out = (sc->axisSwapped) ? 
(uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[1]) : (uint64_t)ceil((sc->fftDim / 2 + 1) / (double)sc->localSize[0]); - //num_out = (uint64_t)ceil(num_out / (double)used_registers_write); - for (uint64_t i = 0; i < num_out; i++) { - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * num_out) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_out) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (!sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, mult * sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, mult * sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->axisSwapped) { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", (sc->fftDim / 2 + 1), sc->gl_WorkGroupID_y, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if 
((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", (sc->fftDim / 2 + 1), sc->gl_WorkGroupID_y, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[1]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 0); - if (res != VKFFT_SUCCESS) return res; - if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + combinedID %% %" PRIu64 "];\n", sc->startDCT3LUT, sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " mult.x = 2*mult.x;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " mult.y = -2*mult.y;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " 
mult.x = 2*%s(%.17e%s * (combinedID %% %" PRIu64 ") );\n", cosDef, -double_PI / 2 / sc->fftDim, LFending, sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " mult.y = 2*%s(%.17e%s * (combinedID %% %" PRIu64 ") );\n", sinDef, -double_PI / 2 / sc->fftDim, LFending, sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(%.17e%s * (combinedID %% %" PRIu64 ") );\n", -double_PI / 2 / sc->fftDim, LFending, sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " mult.x = 2*mult.x;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " mult.y = 2*mult.y;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->mergeSequencesR2C) { - if (sc->axisSwapped) { - - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].y-sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, 
sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].y+sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[1], LFending, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ")* sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[1], LFending, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = 
sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " if(combinedID %% %" PRIu64 " > 0){\n", sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " - combinedID %% %" PRIu64 ") + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, mult * sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], 
convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y-sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[0], LFending, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, 
sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y+sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[1], LFending, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(%" PRIu64 "-combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[1], LFending, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1), sc->fftDim, sc->fftDim / 2 + 1, (sc->fftDim / 2 + 1)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " if(combinedID %% %" PRIu64 " > 0){\n", sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = 
sprintf(sc->tempStr, " %s = (%" PRIu64 " - combinedID %% %" PRIu64 ") + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, 2 * sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, 
outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim / 2 + 1, sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(sdata[sdataID].x*mult.x - sdata[sdataID].y*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(sdata[sdataID].x*mult.x - sdata[sdataID].y*mult.y) %s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " if(combinedID %% %" PRIu64 " > 0){\n", sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " - combinedID %% %" PRIu64 ") + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = 
indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = -%s(sdata[sdataID].y*mult.x + sdata[sdataID].x*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = -%s(sdata[sdataID].y*mult.x + sdata[sdataID].x*mult.y) %s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim / 2 + 1, sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(sdata[sdataID].x*mult.x -sdata[sdataID].y*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(sdata[sdataID].x*mult.x - sdata[sdataID].y*mult.y)%s;\n", sc->outputBufferBlockSize, outputsStruct, 
sc->outputBufferBlockSize, convTypeLeft, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " if(combinedID %% %" PRIu64 " > 0){\n", sc->fftDim / 2 + 1); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " - combinedID %% %" PRIu64 ") + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim / 2 + 1, sc->fftDim / 2 + 1, sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = -%s(sdata[sdataID].y*mult.x +sdata[sdataID].x*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = -%s(sdata[sdataID].y*mult.x + sdata[sdataID].x*mult.y)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " 
}\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->axisSwapped) { - if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[1]) - { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->axisSwapped) { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_dim_full; - } - else { - - } - } - break; - } - case 121://DCT-II strided - { - if (!sc->writeFromRegisters) { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - } - //res = appendZeropadStart(sc); - //if (res != VKFFT_SUCCESS) return res; - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX*%s ", sc->gl_WorkGroupSize_x); - char shiftY[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); - char shiftY2[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY2, " + consts.workGroupShiftY "); - uint64_t mult = (sc->mergeSequencesR2C) ? 
2 : 1; - if (sc->reorderFourStep) { - //Not implemented - } - else { - //appendBarrierVkFFT(sc, 1); - //appendZeropadStart(sc); - if (sc->fftDim == sc->fft_dim_full) { - if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_zeropad_Bluestein_left_write[sc->axis_id]; - for (uint64_t k = 0; k < sc->registerBoost; k++) { - if (sc->mergeSequencesR2C) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s==0)\n\ - {\n\ - sdata[%s + %" PRIu64 "* sharedStride] = sdata[%s];\n\ - }\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->fftDim, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //res = appendZeropadEnd(sc); - //if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - //res = appendZeropadStart(sc); - //if (res != VKFFT_SUCCESS) return res; - } - uint64_t num_out = (uint64_t)ceil(mult * (sc->fftDim / 2 + 1) / (double)sc->localSize[1]); - //num_out = (uint64_t)ceil(num_out / (double)sc->min_registers_per_thread); - for (uint64_t i = 0; i < num_out; i++) { - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * num_out) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " %s = %s%s + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->gl_GlobalInvocationID_x, shiftX, sc->localSize[0], sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->size[0] % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID %% %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", sc->localSize[0], sc->gl_WorkGroupID_x, sc->localSize[0], sc->size[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim 
/ 2 + 1) * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim / 2 + 1) * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if(((combinedID/%" PRIu64 ") %% %" PRIu64 " < %" PRIu64 ")||((combinedID/%" PRIu64 ") %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->localSize[0], sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->localSize[0], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 0); - if (res != VKFFT_SUCCESS) return res; - if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + combinedID / %" PRIu64 "];\n", sc->startDCT3LUT, sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " mult.x = 2*mult.x;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " mult.y = -2*mult.y;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " mult.x = 2*%s(%.17e%s * (combinedID / %" PRIu64 ") );\n", cosDef, -double_PI / 2 / sc->fftDim, LFending, sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " mult.y = 2*%s(%.17e%s * (combinedID / %" PRIu64 ") );\n", sinDef, -double_PI / 2 / sc->fftDim, LFending, sc->localSize[0]); - res = 
VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(%.17e%s * (combinedID / %" PRIu64 ") );\n", -double_PI / 2 / sc->fftDim, LFending, sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " mult.x = 2*mult.x;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " mult.y = 2*mult.y;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - - if (sc->mergeSequencesR2C) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x+sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y-sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return 
res; - - - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y+sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y);\n", sc->regIDs[1], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x+sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x);\n", sc->regIDs[1], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " > 0){\n", sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " - combinedID / %" PRIu64 ") + %s%s * %" PRIu64 ";\n", sc->inoutID, sc->fftDim, sc->localSize[0], sc->gl_GlobalInvocationID_x, shiftX, 2 * sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->outputBufferBlockNum == 1) - sc->tempLen = 
sprintf(sc->tempStr, " %s[%s] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->localSize[0], sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(sdata[sdataID].x*mult.x -sdata[sdataID].y*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(sdata[sdataID].x*mult.x - sdata[sdataID].y*mult.y)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight); 
- res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " if((combinedID/ %" PRIu64 ")> 0){\n", sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " - combinedID / %" PRIu64 ") * %" PRIu64 " + %s%s;\n", sc->inoutID, sc->fftDim, sc->localSize[0], sc->outputStride[1], sc->gl_GlobalInvocationID_x, shiftX); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if(( (%" PRIu64 " - combinedID / %" PRIu64 ") %% %" PRIu64 " < %" PRIu64 ")||( (%" PRIu64 " - combinedID / %" PRIu64 ") %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fftDim, sc->localSize[0], sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->fftDim, sc->localSize[0], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = -%s(sdata[sdataID].y*mult.x +sdata[sdataID].x*mult.y)%s;\n", outputsStruct, convTypeLeft, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = -%s(sdata[sdataID].y*mult.x + sdata[sdataID].x*mult.y)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if 
(sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim / 2 + 1) * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->size[0] % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - if (sc->zeropadBluestein[1]) sc->fftDim = sc->fft_dim_full; - } - else { - - } - } - break; - } - case 130://DCT-III nonstrided - { - if (!sc->writeFromRegisters) { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - } - //res = appendZeropadStart(sc); - //if (res != VKFFT_SUCCESS) return res; - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX "); - char shiftY[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); - char shiftY2[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY2, " + consts.workGroupShiftY "); - uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; - uint64_t used_registers_write = (sc->axisSwapped) ? 
(uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - if (sc->registerBoost > 1) used_registers_write /= sc->registerBoost; - if (sc->reorderFourStep) { - //Not implemented - } - else { - //appendBarrierVkFFT(sc, 1); - //appendZeropadStart(sc); - if (sc->fftDim == sc->fft_dim_full) { - if (sc->zeropadBluestein[1]) { - sc->fftDim = sc->fft_zeropad_Bluestein_left_write[sc->axis_id]; - used_registers_write = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - } - uint64_t maxBluesteinCutOff = 1; - if (sc->zeropadBluestein[1]) { - if (sc->axisSwapped) - maxBluesteinCutOff = sc->fftDim * sc->localSize[0]; - else - maxBluesteinCutOff = sc->fftDim * sc->localSize[1]; - } - for (uint64_t k = 0; k < sc->registerBoost; k++) { - //num_out = (uint64_t)ceil(num_out / (double)used_registers_write); - for (uint64_t i = 0; i < used_registers_write; i++) { - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_write) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[1]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (!sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim, mult * sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = 
sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim, mult * sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->axisSwapped) { - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", sc->fftDim, sc->gl_WorkGroupID_y, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - } - else { - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", sc->fftDim, sc->gl_WorkGroupID_y, sc->localSize[1], (uint64_t)ceil(sc->size[1] / (double)mult)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - } - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if 
(res != VKFFT_SUCCESS) return res; - res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 0); - if (res != VKFFT_SUCCESS) return res; - if (sc->mergeSequencesR2C) { - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ")* sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s(sdata[sdataID].x)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s(sdata[sdataID].x)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s = %s + %" PRIu64 ";\n", sc->inoutID, sc->inoutID, sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s(sdata[sdataID].y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, convTypeRight); - else - sc->tempLen = 
sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s(sdata[sdataID].y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (!sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ") * sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(sdata[sdataID].x)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(sdata[sdataID].x)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - 
} - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->axisSwapped) { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->zeropadBluestein[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - if (sc->zeropadBluestein[1]) { - sc->fftDim = sc->fft_dim_full; - used_registers_write = (sc->axisSwapped) ? 
(uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - } - - } - else { - - } - } - break; - } - case 131://DCT-III strided - { - if (!sc->writeFromRegisters) { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - } - //res = appendZeropadStart(sc); - //if (res != VKFFT_SUCCESS) return res; - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX*%s ", sc->gl_WorkGroupSize_x); - char shiftY[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); - char shiftY2[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY2, " + consts.workGroupShiftY "); - //uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; - uint64_t used_registers_write = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); - if (sc->registerBoost > 1) used_registers_write /= sc->registerBoost; - if (sc->reorderFourStep) { - //Not implemented - } - else { - //appendBarrierVkFFT(sc, 1); - //appendZeropadStart(sc); - if (sc->fftDim == sc->fft_dim_full) { - if (sc->zeropadBluestein[1]) { - sc->fftDim = sc->fft_zeropad_Bluestein_left_write[sc->axis_id]; - used_registers_write = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); - } - for (uint64_t k = 0; k < sc->registerBoost; k++) { - if (sc->mergeSequencesR2C) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s==0)\n\ - {\n\ - sdata[%s + %" PRIu64 "* sharedStride] = sdata[%s];\n\ - }\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->fftDim, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //res = appendZeropadEnd(sc); - //if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - //res = appendZeropadStart(sc); - //if (res != VKFFT_SUCCESS) return res; - } - //num_out = (uint64_t)ceil(num_out / (double)used_registers_write); - for 
(uint64_t i = 0; i < used_registers_write; i++) { - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[1]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = %s%s + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->gl_GlobalInvocationID_x, shiftX, sc->localSize[0], sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->size[0] % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID %% %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", sc->localSize[0], sc->gl_WorkGroupID_x, sc->localSize[0], sc->size[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - /*if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > mult * (sc->fftDim / 2 + 1) * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", mult * (sc->fftDim / 2 + 1) * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - }*/ - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if(((combinedID/%" PRIu64 ") %% %" PRIu64 " < %" PRIu64 ")||((combinedID/%" PRIu64 ") %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->localSize[0], sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->localSize[0], 
sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 0); - if (res != VKFFT_SUCCESS) return res; - if (sc->mergeSequencesR2C) { - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x+sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y-sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - - sc->tempLen = 
sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y+sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].y);\n", sc->regIDs[1], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x+sdata[(%" PRIu64 "-combinedID / %" PRIu64 ")* sharedStride + (combinedID %% %" PRIu64 ")].x);\n", sc->regIDs[1], LFending, sc->localSize[0], sc->localSize[0], sc->fftDim, sc->localSize[0], sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = %s(%s.x*mult.x-%s.y*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " > 0){\n", sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s = (%" PRIu64 " - combinedID / %" PRIu64 ") + %s%s * %" PRIu64 ";\n", sc->inoutID, sc->fftDim, sc->localSize[0], sc->gl_GlobalInvocationID_x, shiftX, 2 * sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = 
-%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[0], sc->regIDs[0], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s+%" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", outputsStruct, sc->inoutID, sc->outputStride[1], convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[(%s %" PRIu64 ")/ %" PRIu64 "]%s[(%s+%" PRIu64 ") %% %" PRIu64 "] = -%s(%s.y*mult.x+%s.x*mult.y)%s;\n", sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputStride[1], sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], sc->regIDs[1], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID / %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID / %" PRIu64 ") %% 2)) * ((combinedID / %" PRIu64 ")/2)) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->fftDim - 1, sc->localSize[0], sc->localSize[0], sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(sdata[sdataID].x)%s;\n", outputsStruct, convTypeLeft, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(sdata[sdataID].x)%s;\n", sc->outputBufferBlockSize, outputsStruct, 
sc->outputBufferBlockSize, convTypeLeft, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - /*if ((1 + i + k * num_out) * sc->localSize[0] * sc->localSize[1] > mult * (sc->fftDim / 2 + 1) * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - }*/ - if (sc->size[0] % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - if (sc->zeropadBluestein[1]) { - sc->fftDim = sc->fft_dim_full; - used_registers_write = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); - } - } - else { - - } - } - break; - } - case 140: //DCT-IV nonstrided as 8N DFT - { - if (!sc->writeFromRegisters) { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - } - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX "); - char shiftY[500] = ""; - if (sc->axisSwapped) { - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_x); - } - else { - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); - } - - char shiftY2[100] = ""; - if 
(sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY "); - if (sc->fftDim < sc->fft_dim_full) { - if (sc->axisSwapped) { - if (!sc->reorderFourStep) { - sc->tempLen = sprintf(sc->tempStr, " if((%s+%" PRIu64 "*%s)< numActiveThreads) {\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " if (((%s + %" PRIu64 " * %s) %% %" PRIu64 " + ((%s%s) / %" PRIu64 ")*%" PRIu64 " < %" PRIu64 ")){\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, sc->localSize[0], sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0], sc->fft_dim_full / sc->firstStageStartSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - sc->tempLen = sprintf(sc->tempStr, " if (((%s + %" PRIu64 " * %s) %% %" PRIu64 " + ((%s%s) / %" PRIu64 ")*%" PRIu64 " < %" PRIu64 ")){\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, sc->localSize[1], sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1], sc->fft_dim_full / sc->firstStageStartSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - sc->tempLen = sprintf(sc->tempStr, " { \n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - //if (sc->reorderFourStep) { - if (sc->fftDim == sc->fft_dim_full) { - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < (uint64_t)ceil(sc->min_registers_per_thread / 8.0); i++) { - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], 
sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->outputStride[0] > 1) - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim / 8, sc->outputStride[0], sc->fftDim / 8, sc->outputStride[1]); - else - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim / 8, sc->fftDim / 8, sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim / 8, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[0], sc->size[sc->axis_id + 1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim / 8 * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + (%s%s)*%" PRIu64 "< %" PRIu64 "){", sc->fftDim / 8, sc->gl_WorkGroupID_y, shiftY2, sc->localSize[1], sc->size[sc->axis_id + 1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim / 8 * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); - 
res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 0); - if (res != VKFFT_SUCCESS) return res; - if (sc->writeFromRegisters) { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->axisSwapped) { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(2*(combinedID %% %" PRIu64 ")+1) * sharedStride + (combinedID / %" PRIu64 ")].x/2%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim / 8, sc->fftDim / 8, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(2*(combinedID %% %" PRIu64 ")+1) * sharedStride + (combinedID / %" PRIu64 ")].x/2%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim / 8, sc->fftDim / 8, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[2*(combinedID %% %" PRIu64 ")+1 + 
(combinedID / %" PRIu64 ") * sharedStride].x/2%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim / 8, sc->fftDim / 8, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[2*(combinedID %% %" PRIu64 ")+1 + (combinedID / %" PRIu64 ") * sharedStride].x/2%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim / 8, sc->fftDim / 8, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - } - /*else { - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) { - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - sc->tempLen = 
sprintf(sc->tempStr, " inoutID = combinedID %% %" PRIu64 " + ((%s%s) / %" PRIu64 ")*%" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ")+ ((%s%s) %% %" PRIu64 ") * %" PRIu64 ";\n", sc->localSize[0], sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0], sc->localSize[0], sc->fft_dim_full / sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->firstStageStartSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s%s)/%" PRIu64 "+ (combinedID * %" PRIu64 ")+ ((%s%s) %% %" PRIu64 ") * %" PRIu64 ";\n", sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->firstStageStartSize); - else - sc->tempLen = sprintf(sc->tempStr, " inoutID = combinedID %% %" PRIu64 " + ((%s%s) / %" PRIu64 ")*%" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ")+ ((%s%s) %% %" PRIu64 ") * %" PRIu64 ";\n", sc->localSize[1], sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1], sc->localSize[1], sc->fft_dim_full / sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->firstStageStartSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_write[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, 
requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 0); - if (res != VKFFT_SUCCESS) return res; - if (sc->writeFromRegisters) { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->axisSwapped) { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %s)+(combinedID/%s)*sharedStride]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->gl_WorkGroupSize_x, sc->gl_WorkGroupSize_x, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %s)+(combinedID/%s)*sharedStride]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->gl_WorkGroupSize_x, sc->gl_WorkGroupSize_x, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %s)*sharedStride+combinedID/%s]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->gl_WorkGroupSize_y, sc->gl_WorkGroupSize_y, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %s)*sharedStride+combinedID/%s]%s;\n", sc->inoutID, 
sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->gl_WorkGroupSize_y, sc->gl_WorkGroupSize_y, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - }*/ - /*} - else { - if (sc->fftDim == sc->fft_dim_full) { - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) { - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->outputStride[0] > 1) - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") * %" PRIu64 " + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->outputStride[0], sc->fftDim, sc->outputStride[1]); - else - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * %" PRIu64 ";\n", sc->fftDim, sc->fftDim, sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, sc->localSize[0], sc->size[sc->axis_id + 1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if 
(sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){", sc->fftDim, sc->gl_WorkGroupID_y, sc->localSize[1], sc->size[sc->axis_id + 1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 0); - if (res != VKFFT_SUCCESS) return res; - if (sc->writeFromRegisters) { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %s%s%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %s%s%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->axisSwapped) { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); 
- else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride]%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[(combinedID %% %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride]%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->fftDim, sc->fftDim, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->axisSwapped) { - if (sc->size[sc->axis_id + 1] % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (sc->size[sc->axis_id + 1] % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - } - else { - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) { - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * 
sc->min_registers_per_thread) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 " * numActiveThreads;\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " inoutID = (combinedID %% %" PRIu64 ")+(combinedID / %" PRIu64 ") * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");", sc->fftDim, sc->fftDim, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[0] * sc->firstStageStartSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " inoutID = %s+%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ");", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_write[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexOutputVkFFT(sc, uintType, writeType, 
sc->inoutID, 0, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexOutput(%s+i*%" PRIu64 "+%s * %" PRIu64 " + (((%s%s) %% %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") * %" PRIu64 ")%s%s);\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, sc->firstStageStartSize, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->fftDim, sc->gl_WorkGroupID_x, shiftX, sc->firstStageStartSize / sc->fftDim, sc->localSize[1] * sc->firstStageStartSize, requestCoordinate, requestBatch); - res = appendZeropadStartReadWriteStage(sc, 0); - if (res != VKFFT_SUCCESS) return res; - if (sc->writeFromRegisters) { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID]=%s%s%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s%s%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->axisSwapped) { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID]=%ssdata[%s + sharedStride*(%s + %" PRIu64 ")]%s;\n", outputsStruct, convTypeLeft, sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %ssdata[%s + sharedStride*(%s + %" PRIu64 ")]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, 
(i + k * sc->min_registers_per_thread) * sc->localSize[1], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID]=%ssdata[sharedStride*%s + (%s + %" PRIu64 ")]%s;\n", outputsStruct, convTypeLeft, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %ssdata[sharedStride*%s + (%s + %" PRIu64 ")]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - }*/ - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - break; - } - case 141: //DCT-IV strided as 8N DFT - { - if (!sc->writeFromRegisters) { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - } - res = appendZeropadStart(sc); - if (res != VKFFT_SUCCESS) return res; - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); - if (sc->fftDim != sc->fft_dim_full) - sc->tempLen = sprintf(sc->tempStr, " if (((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ") < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->fftDim * sc->stageStartSize, 
sc->size[sc->axis_id]); - else - sc->tempLen = sprintf(sc->tempStr, " {\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //if ((sc->reorderFourStep) && (sc->stageStartSize == 1)) { - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < (uint64_t)ceil(sc->min_registers_per_thread / 8.0); i++) { - if (sc->fftDim == sc->fft_dim_full) - sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s + %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1]); - else - sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s + %" PRIu64 ") * (%" PRIu64 ") + (((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")) * (%" PRIu64 ") + ((%s%s) / %" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->fft_dim_full / sc->fftDim, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->firstStageStartSize / sc->fftDim, sc->fft_dim_full / sc->firstStageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * (sc->firstStageStartSize / sc->fftDim)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if(inoutID < %" PRIu64 "){\n", sc->fftDim / 8); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->fft_dim_full, sc->fft_zeropad_left_write[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x); - res = indexOutputVkFFT(sc, uintType, writeType, index_x, sc->inoutID, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return 
res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[%s] = %ssdata[%s*(2*(%s+%" PRIu64 ")+1) + %s].x/2%s;\n", outputsStruct, sc->inoutID, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[%s / %" PRIu64 "]%s[%s %% %" PRIu64 "] = %ssdata[%s*(2*(%s+%" PRIu64 ")+1) + %s].x/2%s;\n", sc->inoutID, sc->outputBufferBlockSize, outputsStruct, sc->inoutID, sc->outputBufferBlockSize, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - /*} - else { - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) { - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " inoutID = (%s + %" PRIu64 ") * %" PRIu64 " + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ");\n", sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->stageStartSize * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", 
sc->fft_dim_full, sc->fft_zeropad_left_write[sc->axis_id], sc->fft_dim_full, sc->fft_zeropad_right_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x); - sprintf(index_y, "%" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ")", sc->stageStartSize, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->stageStartSize * sc->fftDim); - res = indexOutputVkFFT(sc, uintType, writeType, index_x, index_y, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, " inoutID = indexOutput((%s%s) %% (%" PRIu64 "), %" PRIu64 " * (%s + %" PRIu64 ") + ((%s%s) / %" PRIu64 ") %% (%" PRIu64 ")+((%s%s) / %" PRIu64 ") * (%" PRIu64 ")%s%s);\n", sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x, sc->stageStartSize, sc->gl_GlobalInvocationID_x, shiftX, sc->fft_dim_x * sc->stageStartSize, sc->stageStartSize * sc->fftDim, requestCoordinate, requestBatch); - if (sc->writeFromRegisters) { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s%s%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" 
PRIu64 "] = %s%s%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %ssdata[%s*(%s+%" PRIu64 ") + %s]%s;\n", outputsStruct, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %ssdata[%s*(%s+%" PRIu64 ") + %s]%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->sharedStride, sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[1], sc->gl_LocalInvocationID_x, convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - }*/ - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - break; - - } - case 142://DCT-IV nonstrided as 2xN/2 DCT-II - { - if (!sc->writeFromRegisters) { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - } - //res = appendZeropadStart(sc); - //if (res != VKFFT_SUCCESS) return res; - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX "); - char shiftY[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); - char shiftY2[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY2, " + consts.workGroupShiftY "); - uint64_t used_registers_write = (sc->axisSwapped) ? 
(uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - if (sc->registerBoost > 1) used_registers_write /= sc->registerBoost; - if (sc->reorderFourStep) { - //Not implemented - } - else { - //appendBarrierVkFFT(sc, 1); - //appendZeropadStart(sc); - if (sc->fftDim == sc->fft_dim_full) { - if (sc->zeropadBluestein[1]) { - sc->fftDim = sc->fft_zeropad_Bluestein_left_write[sc->axis_id]; - used_registers_write = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - } - uint64_t maxBluesteinCutOff = 1; - if (sc->zeropadBluestein[1]) { - if (sc->axisSwapped) - maxBluesteinCutOff = sc->fftDim * sc->localSize[0]; - else - maxBluesteinCutOff = sc->fftDim * sc->localSize[1]; - } - for (uint64_t k = 0; k < sc->registerBoost; k++) { - //num_out = (uint64_t)ceil(num_out / (double)used_registers_write); - for (uint64_t i = 0; i < used_registers_write; i++) { - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_write) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[1]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", maxBluesteinCutOff); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->axisSwapped) { - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - 
} - sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) * sharedStride + (combinedID / %" PRIu64 ");\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID %% %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID %% %" PRIu64 ") %% 2)) * ((combinedID %% %" PRIu64 ")/2)) + (combinedID / %" PRIu64 ")* sharedStride;\n", sc->fftDim, sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[sdataID];\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * (1.0%s - 2 * ((combinedID %% %" PRIu64 ")%%2));\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], LFending, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[0])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim * sc->localSize[1])) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if 
(sc->zeropadBluestein[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_write; i++) { - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_write) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - if (sc->size[1] % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", (sc->fftDim), sc->gl_WorkGroupID_y, sc->localSize[0], sc->size[sc->axis_id + 1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim) * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim) * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (sc->size[1] % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", (sc->fftDim), sc->gl_WorkGroupID_y, sc->localSize[1], sc->size[sc->axis_id + 1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim) * sc->localSize[1]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim) * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen 
= sprintf(index_x, "combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ")", sc->fftDim, sc->fftDim, sc->outputStride[1]); - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexOutputVkFFT(sc, uintType, writeType, index_x, 0, requestCoordinate, requestBatch); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 0); - if (res != VKFFT_SUCCESS) return res; - if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + combinedID %% %" PRIu64 "];\n", sc->startDCT4LUT, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(%.17e%s * (2*(combinedID %% %" PRIu64 ")+1) );\n", cosDef, -double_PI / 8 / sc->fftDim, LFending, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(%.17e%s * (2*(combinedID %% %" PRIu64 ")+1) );\n", sinDef, -double_PI / 8 / sc->fftDim, LFending, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(%.17e%s * (2*(combinedID %% %" PRIu64 ")+1) );\n", -double_PI / 8 / sc->fftDim, LFending, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = 
%s(%s.x*mult.x - %s.y*mult.y)%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(%s.x*mult.x - %s.y*mult.y)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(index_x, "%" PRIu64 " - combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ")", 2 * sc->fftDim - 1, sc->fftDim, sc->fftDim, sc->outputStride[1]); - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexOutputVkFFT(sc, uintType, writeType, index_x, 0, requestCoordinate, requestBatch); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(-%s.x*mult.y - %s.y*mult.x)%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(-%s.x*mult.y - %s.y*mult.x)%s;\n", sc->outputBufferBlockSize, outputsStruct, 
sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->axisSwapped) { - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim) * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim) * sc->localSize[1]) - { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->axisSwapped) { - if (sc->size[1] % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (sc->size[1] % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - if (sc->zeropadBluestein[1]) { - sc->fftDim = sc->fft_dim_full; - used_registers_write = (sc->axisSwapped) ? 
(uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - } - } - else { - - } - } - break; - } - case 143://DCT-IV strided as 2xN/2 DCT-II - { - if (!sc->writeFromRegisters) { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - } - //res = appendZeropadStart(sc); - //if (res != VKFFT_SUCCESS) return res; - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX "); - char shiftX2[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX2, " + consts.workGroupShiftX * %s ", sc->gl_WorkGroupSize_x); - char shiftY[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); - char shiftY2[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY2, " + consts.workGroupShiftY "); - uint64_t used_registers_write = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); - if (sc->registerBoost > 1) used_registers_write /= sc->registerBoost; - if (sc->reorderFourStep) { - //Not implemented - } - else { - //appendBarrierVkFFT(sc, 1); - //appendZeropadStart(sc); - if (sc->fftDim == sc->fft_dim_full) { - if (sc->zeropadBluestein[1]) { - sc->fftDim = sc->fft_zeropad_Bluestein_left_write[sc->axis_id]; - used_registers_write = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); - } - for (uint64_t k = 0; k < sc->registerBoost; k++) { - //num_out = (uint64_t)ceil(num_out / (double)used_registers_write); - for (uint64_t i = 0; i < used_registers_write; i++) { - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_write) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[0] * 
sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[1]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim) * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim) * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " sdataID = (((combinedID / %" PRIu64 ") %% 2) * %" PRIu64 " + (1-2*((combinedID / %" PRIu64 ") %% 2)) * ((combinedID / %" PRIu64 ")/2)) * sharedStride + (combinedID %% %" PRIu64 ");\n", sc->localSize[0], sc->fftDim - 1, sc->localSize[0], sc->localSize[0], sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[sdataID];\n", sc->regIDs[i + k * sc->registers_per_thread]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = %s.y * (1.0%s - 2 * ((combinedID / %" PRIu64 ")%%2));\n", sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], LFending, sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim) * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_write; i++) { - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, 
" combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_write) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropadBluestein[1]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if ((%s%s) < %" PRIu64 ") {\n", sc->gl_GlobalInvocationID_x, shiftX2, (uint64_t)ceil(sc->size[0])); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > (sc->fftDim) * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", (sc->fftDim) * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x); - sprintf(index_y, "(%s + %" PRIu64 ")", sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_write) * sc->localSize[1]); - res = indexOutputVkFFT(sc, uintType, writeType, index_x, index_y, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 0); - if (res != VKFFT_SUCCESS) return res; - if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, " mult = twiddleLUT[%" PRIu64 " + combinedID / %" 
PRIu64 "];\n", sc->startDCT4LUT, sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " mult.x = %s(%.17e%s * (2*(combinedID / %" PRIu64 ")+1) );\n", cosDef, -double_PI / 8 / sc->fftDim, LFending, sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " mult.y = %s(%.17e%s * (2*(combinedID / %" PRIu64 ")+1) );\n", sinDef, -double_PI / 8 / sc->fftDim, LFending, sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " mult = sincos_20(%.17e%s * (2*(combinedID / %" PRIu64 ")+1) );\n", -double_PI / 8 / sc->fftDim, LFending, sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(%s.x*mult.x - %s.y*mult.y)%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(%s.x*mult.x - %s.y*mult.y)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); 
- res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(index_x, "(%s%s) %% (%" PRIu64 ")", sc->gl_GlobalInvocationID_x, shiftX2, sc->fft_dim_x); - sprintf(index_y, "(%" PRIu64 " - (%s + %" PRIu64 "))", 2 * sc->fftDim - 1, sc->gl_LocalInvocationID_y, (i + k * 2 * used_registers_write) * sc->localSize[1]); - res = indexOutputVkFFT(sc, uintType, writeType, index_x, index_y, requestCoordinate, requestBatch); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((%s %% %" PRIu64 " < %" PRIu64 ")||(%s %% %" PRIu64 " >= %" PRIu64 ")){\n", index_y, sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], index_y, sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s(-%s.x*mult.y - %s.y*mult.x)%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s(-%s.x*mult.y - %s.y*mult.x)%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_write) * 
sc->localSize[0] * sc->localSize[1] > (sc->fftDim) * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((uint64_t)ceil(sc->size[0]) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropadBluestein[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - if (sc->zeropadBluestein[1]) { - sc->fftDim = sc->fft_dim_full; - used_registers_write = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); - } - } - else { - - } - } - break; - } - case 144://odd DCT-IV nonstrided as N FFT - { - if (!sc->writeFromRegisters) { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - } - //res = appendZeropadStart(sc); - //if (res != VKFFT_SUCCESS) return res; - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX "); - char shiftY[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); - char shiftY2[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY2, " + consts.workGroupShiftY "); - uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; - uint64_t used_registers_write = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - if (sc->registerBoost > 1) used_registers_write /= sc->registerBoost; - if (sc->reorderFourStep) { - //Not implemented - } - else { - //appendBarrierVkFFT(sc, 1); - //appendZeropadStart(sc); - if (sc->fftDim == sc->fft_dim_full) { - if (sc->zeropadBluestein[1]) { - sc->fftDim = sc->fft_zeropad_Bluestein_left_write[sc->axis_id]; - used_registers_write = (sc->axisSwapped) ? 
(uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - } - for (uint64_t k = 0; k < sc->registerBoost; k++) { - if (sc->mergeSequencesR2C) { - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s==0)\n\ - {\n\ - sdata[%s + %" PRIu64 "* sharedStride] = sdata[%s];\n\ - }\n", sc->gl_LocalInvocationID_y, sc->gl_LocalInvocationID_x, sc->fftDim, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //res = appendZeropadEnd(sc); - //if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - //res = appendZeropadStart(sc); - //if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "\ - if (%s==0)\n\ - {\n\ - sdata[%s * sharedStride + %" PRIu64 "] = sdata[%s * sharedStride];\n\ - }\n", sc->gl_LocalInvocationID_x, sc->gl_LocalInvocationID_y, sc->fftDim, sc->gl_LocalInvocationID_y); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //res = appendZeropadEnd(sc); - //if (res != VKFFT_SUCCESS) return res; - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - //res = appendZeropadStart(sc); - //if (res != VKFFT_SUCCESS) return res; - } - } - //uint64_t num_out = (sc->axisSwapped) ? 
(uint64_t)ceil(mult * (sc->fftDim / 2 + 1) / (double)sc->localSize[1]) : (uint64_t)ceil(mult * (sc->fftDim / 2 + 1) / (double)sc->localSize[0]); - //num_out = (uint64_t)ceil(num_out / (double)used_registers_write); - for (uint64_t i = 0; i < mult * used_registers_write; i++) { - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * used_registers_write) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " %s = combinedID %% %" PRIu64 " + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->fftDim, sc->fftDim, sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->axisSwapped) { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", mult * sc->fftDim, sc->gl_WorkGroupID_y, sc->localSize[0], (uint64_t)ceil(sc->size[1] / (double)mult)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + mult * k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > mult * sc->fftDim * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", mult * sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID / %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", mult * sc->fftDim, sc->gl_WorkGroupID_y, sc->localSize[1], (uint64_t)ceil(sc->size[1] / 
(double)mult)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + mult * k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > mult * sc->fftDim * sc->localSize[1]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", mult * sc->fftDim * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if((inoutID %% %" PRIu64 " < %" PRIu64 ")||(inoutID %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->outputStride[1], sc->fft_zeropad_left_write[sc->axis_id], sc->outputStride[1], sc->fft_zeropad_right_write[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 0); - if (res != VKFFT_SUCCESS) return res; - if (sc->writeFromRegisters) { - //not working yet - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s%s%s;\n", outputsStruct, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s%s%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[i + k * sc->registers_per_thread], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " sdataID = combinedID %% %" PRIu64 ";\n", sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if(sdataID < %" PRIu64 "){\n", 
sc->fftDim / 4); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->mergeSequencesR2C) { - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, "if ( (combinedID / %" PRIu64 ") %% 2 == 0){\n", sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(%" PRIu64 "- (2*sdataID+1)) * sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, mult * sc->fftDim, sc->fftDim, mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64 ")].y-sdata[(%" PRIu64 "- (2*sdataID+1)) * sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, mult * sc->fftDim, sc->fftDim, mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "}else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64 ")].y+sdata[(%" PRIu64 "- (2*sdataID+1)) * sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, mult * sc->fftDim, sc->fftDim, mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(2*sdataID+1) * sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(%" PRIu64 "- (2*sdataID+1)) * sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, mult * sc->fftDim, sc->fftDim, mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "}\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "if ( (combinedID 
/ %" PRIu64 ") %% 2 == 0){\n", sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(2*sdataID+1) + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(%" PRIu64 "- (2*sdataID+1)) + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, mult * sc->fftDim, sc->fftDim, mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(2*sdataID+1) + (combinedID / %" PRIu64 ") * sharedStride].y-sdata[(%" PRIu64 "- (2*sdataID+1)) + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[0], LFending, mult * sc->fftDim, sc->fftDim, mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "}else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(2*sdataID+1) + (combinedID / %" PRIu64 ") * sharedStride].y+sdata[(%" PRIu64 "- (2*sdataID+1)) + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[0], LFending, mult * sc->fftDim, sc->fftDim, mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(2*sdataID+1) + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(%" PRIu64 "- (2*sdataID+1)) + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, mult * sc->fftDim, sc->fftDim, mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "}\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!sc->axisSwapped) - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*sdataID+1) + (combinedID / %" PRIu64 ") * sharedStride];\n", sc->regIDs[0], sc->fftDim); - else - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*sdataID+1) * 
sharedStride + (combinedID / %" PRIu64 ")];\n", sc->regIDs[0], sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID + 1)/2) %% 2) != 0) \n\ - %s.x = -%s.x;\n\ - else\n\ - %s.x = %s.x;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID)/2) %% 2) != 0) \n\ - %s.x += %s.y;\n\ - else\n\ - %s.x -= %s.y;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n\n"); - - - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if((sdataID < %" PRIu64 ")&&(sdataID >= %" PRIu64 ")){\n", sc->fftDim / 2, sc->fftDim / 4); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->mergeSequencesR2C) { - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, "if ( (combinedID / %" PRIu64 ") %% 2 == 0){\n", sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(%" PRIu64 " + 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim - 2 * (sc->fftDim / 2), mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].y-sdata[(%" PRIu64 " + 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim - 2 * (sc->fftDim / 2), mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - 
sc->tempLen = sprintf(sc->tempStr, "}else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].y+sdata[(%" PRIu64 " + 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim - 2 * (sc->fftDim / 2), mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(%" PRIu64 " + 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim - 2 * (sc->fftDim / 2), mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "}\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "if ( (combinedID / %" PRIu64 ") %% 2 == 0){\n", sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(%" PRIu64 " + 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim - 2 * (sc->fftDim / 2), mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].y-sdata[(%" PRIu64 " + 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim - 2 * (sc->fftDim / 2), mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != 
VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "}else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].y+sdata[(%" PRIu64 " + 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim - 2 * (sc->fftDim / 2), mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(%" PRIu64 " + 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim - 2 * (sc->fftDim / 2), mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "}\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!sc->axisSwapped) - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride];\n", sc->regIDs[0], 2 * (sc->fftDim / 2), sc->fftDim); - else - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")];\n", sc->regIDs[0], 2 * (sc->fftDim / 2), sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID+1)/2) %% 2) != 0) \n\ - %s.x = -%s.x;\n\ - else\n\ - %s.x = %s.x;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID)/2) %% 2) != 0) \n\ - %s.x -= %s.y;\n\ - else\n\ - %s.x += %s.y;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); - res = VkAppendLine(sc); - if 
(res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - - sc->tempLen = sprintf(sc->tempStr, " if((sdataID < %" PRIu64 ")&&(sdataID >= %" PRIu64 ")){\n", 3 * sc->fftDim / 4, sc->fftDim / 2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->mergeSequencesR2C) { - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, "if ( (combinedID / %" PRIu64 ") %% 2 == 0){\n", sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(2*sdataID - %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim + 2 * (sc->fftDim / 2), mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(2*sdataID - %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y-sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim + 2 * (sc->fftDim / 2), mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "}else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(2*sdataID - %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y+sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim + 2 * (sc->fftDim / 2), mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(2*sdataID - %" PRIu64 ") * 
sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim + 2 * (sc->fftDim / 2), mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "}\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "if ( (combinedID / %" PRIu64 ") %% 2 == 0){\n", sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(2*sdataID - %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim + 2 * (sc->fftDim / 2), mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(2*sdataID - %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y-sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim + 2 * (sc->fftDim / 2), mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "}else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(2*sdataID - %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y+sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim + 2 * (sc->fftDim / 2), mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 
0.5%s*(-sdata[(2*sdataID - %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, 2 * (sc->fftDim / 2), mult * sc->fftDim, sc->fftDim + 2 * (sc->fftDim / 2), mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "}\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!sc->axisSwapped) - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*sdataID - %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride];\n", sc->regIDs[0], 2 * (sc->fftDim / 2), sc->fftDim); - else - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*sdataID - %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")];\n", sc->regIDs[0], 2 * (sc->fftDim / 2), sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID+1)/2) %% 2) != 0) \n\ - %s.x = -%s.x;\n\ - else\n\ - %s.x = %s.x;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID)/2) %% 2) != 0) \n\ - %s.x += %s.y;\n\ - else\n\ - %s.x -= %s.y;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - - sc->tempLen = sprintf(sc->tempStr, " if((sdataID >= %" PRIu64 ")){\n", 3 * sc->fftDim / 4); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->mergeSequencesR2C) { - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, "if ( (combinedID / %" PRIu64 ") %% 2 == 0){\n", sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 
0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(2*sdataID - %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, 2 * sc->fftDim - 1, mult * sc->fftDim, sc->fftDim - 1, mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].y-sdata[(2*sdataID - %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, 2 * sc->fftDim - 1, mult * sc->fftDim, sc->fftDim - 1, mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "}else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].y+sdata[(2*sdataID - %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].y);\n", sc->regIDs[0], LFending, 2 * sc->fftDim - 1, mult * sc->fftDim, sc->fftDim - 1, mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 ")].x+sdata[(2*sdataID - %" PRIu64 ") * sharedStride + (combinedID / %" PRIu64 ")].x);\n", sc->regIDs[0], LFending, 2 * sc->fftDim - 1, mult * sc->fftDim, sc->fftDim - 1, mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "}\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "if ( (combinedID / %" PRIu64 ") %% 2 == 0){\n", sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" 
PRIu64 ") * sharedStride].x+sdata[(2*sdataID - %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, 2 * sc->fftDim - 1, mult * sc->fftDim, sc->fftDim - 1, mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].y-sdata[(2*sdataID - %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[0], LFending, 2 * sc->fftDim - 1, mult * sc->fftDim, sc->fftDim - 1, mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "}else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x = 0.5%s*(sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].y+sdata[(2*sdataID - %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].y);\n", sc->regIDs[0], LFending, 2 * sc->fftDim - 1, mult * sc->fftDim, sc->fftDim - 1, mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.y = 0.5%s*(-sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride].x+sdata[(2*sdataID - %" PRIu64 ") + (combinedID / %" PRIu64 ") * sharedStride].x);\n", sc->regIDs[0], LFending, 2 * sc->fftDim - 1, mult * sc->fftDim, sc->fftDim - 1, mult * sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "}\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if (!sc->axisSwapped) - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(%" PRIu64 " - 2*sdataID) + (combinedID / %" PRIu64 ") * sharedStride];\n", sc->regIDs[0], 2 * sc->fftDim - 1, sc->fftDim); - else - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + (combinedID / %" PRIu64 
")];\n", sc->regIDs[0], 2 * sc->fftDim - 1, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID+1)/2) %% 2) != 0) \n\ - %s.x = -%s.x;\n\ - else\n\ - %s.x = %s.x;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID)/2) %% 2) != 0) \n\ - %s.x -= %s.y;\n\ - else\n\ - %s.x += %s.y;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x *= 1.41421356237309504880%s;\n", sc->regIDs[1], LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s%s.x%s;\n", outputsStruct, convTypeLeft, sc->regIDs[1], convTypeRight); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s%s.x%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->axisSwapped) { - if ((1 + i + mult * k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > mult * sc->fftDim * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((1 + i + mult * k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > mult * sc->fftDim * 
sc->localSize[1]) - { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (sc->axisSwapped) { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - else { - if ((uint64_t)ceil(sc->size[1] / (double)mult) % sc->localSize[1] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - if (sc->zeropadBluestein[1]) { - sc->fftDim = sc->fft_dim_full; - used_registers_write = (sc->axisSwapped) ? (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]) : (uint64_t)ceil(sc->fftDim / (double)sc->localSize[0]); - } - /*for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < sc->min_registers_per_thread; i++) { - if (sc->localSize[1] == 1) - sc->tempLen = sprintf(sc->tempStr, " combinedID = %s + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0]); - else - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * sc->min_registers_per_thread) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if(%s + %" PRIu64 " < %" PRIu64 "){\n", sc->gl_LocalInvocationID_x, (i + k * sc->min_registers_per_thread) * sc->localSize[0], (sc->fftDim-1)/2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " w = sdata[(2*(combinedID %% %" PRIu64 ")+1)* sharedStride + (combinedID / %" PRIu64 ")];\n",sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*(combinedID %% 
%" PRIu64 ")+2)* sharedStride + (combinedID / %" PRIu64 ")];\n", sc->regIDs[i + k * sc->min_registers_per_thread], sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " w = sdata[(2*(combinedID %% %" PRIu64 ")+1) + (combinedID / %" PRIu64 ") * sharedStride];\n", sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*(combinedID %% %" PRIu64 ")+2) + (combinedID / %" PRIu64 ") * sharedStride];\n", sc->regIDs[i + k * sc->min_registers_per_thread], sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " }else{\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->axisSwapped) { - sc->tempLen = sprintf(sc->tempStr, " w = sdata[(2*(%" PRIu64 " - combinedID %% %" PRIu64 ")-1)* sharedStride + (combinedID / %" PRIu64 ")];\n", sc->fftDim, sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*(%" PRIu64 " - combinedID %% %" PRIu64 "))* sharedStride + (combinedID / %" PRIu64 ")];\n", sc->regIDs[i + k * sc->min_registers_per_thread], sc->fftDim, sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " w = sdata[(2*(%" PRIu64 " - combinedID %% %" PRIu64 ")-1) + (combinedID / %" PRIu64 ")* sharedStride];\n", sc->fftDim, sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*(%" PRIu64 " - combinedID %% %" PRIu64 ")) + (combinedID / %" PRIu64 ")* sharedStride];\n", sc->regIDs[i + k * sc->min_registers_per_thread], sc->fftDim, sc->fftDim, sc->fftDim); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = 
sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - } - }*/ - - } - else { - - } - } - break; - } - case 145://odd DCT-IV strided as N FFT - { - if (!sc->writeFromRegisters) { - res = appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) return res; - } - //res = appendZeropadStart(sc); - //if (res != VKFFT_SUCCESS) return res; - char shiftX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(shiftX, " + consts.workGroupShiftX*%s ", sc->gl_WorkGroupSize_x); - char shiftY[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY, " + consts.workGroupShiftY*%s ", sc->gl_WorkGroupSize_y); - char shiftY2[500] = ""; - if (sc->performWorkGroupShift[1]) - sprintf(shiftY2, " + consts.workGroupShiftY "); - //uint64_t mult = (sc->mergeSequencesR2C) ? 2 : 1; - uint64_t used_registers_write = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); - if (sc->registerBoost > 1) used_registers_write /= sc->registerBoost; - if (sc->reorderFourStep) { - //Not implemented - } - else { - //appendBarrierVkFFT(sc, 1); - //appendZeropadStart(sc); - if (sc->fftDim == sc->fft_dim_full) { - if (sc->zeropadBluestein[1]) { - sc->fftDim = sc->fft_zeropad_Bluestein_left_write[sc->axis_id]; - used_registers_write = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); - } - for (uint64_t k = 0; k < sc->registerBoost; k++) { - for (uint64_t i = 0; i < used_registers_write; i++) { - sc->tempLen = sprintf(sc->tempStr, " combinedID = (%s + %" PRIu64 " * %s) + %" PRIu64 ";\n", sc->gl_LocalInvocationID_x, sc->localSize[0], sc->gl_LocalInvocationID_y, (i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " %s = %s%s + ((combinedID/%" PRIu64 ") * %" PRIu64 ");\n", sc->inoutID, sc->gl_GlobalInvocationID_x, shiftX, sc->localSize[0], sc->outputStride[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if 
(sc->size[0] % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID %% %" PRIu64 " + %s*%" PRIu64 "< %" PRIu64 "){\n", sc->localSize[0], sc->gl_WorkGroupID_x, sc->localSize[0], sc->size[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > sc->fftDim * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " if(combinedID < %" PRIu64 "){\n", sc->fftDim * sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " if(((combinedID/%" PRIu64 ") %% %" PRIu64 " < %" PRIu64 ")||((combinedID/%" PRIu64 ") %% %" PRIu64 " >= %" PRIu64 ")){\n", sc->localSize[0], sc->fft_dim_full, sc->fft_zeropad_left_read[sc->axis_id], sc->localSize[0], sc->fft_dim_full, sc->fft_zeropad_right_read[sc->axis_id]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " %s = ", sc->inoutID); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = indexOutputVkFFT(sc, uintType, writeType, sc->inoutID, 0, requestCoordinate, requestBatch); - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendZeropadStartReadWriteStage(sc, 0); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " sdataID = combinedID / %" PRIu64 ";\n", sc->localSize[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if(sdataID < %" PRIu64 "){\n", sc->fftDim / 4); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*sdataID+1) * sharedStride + %s];\n", sc->regIDs[0], sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID + 1)/2) %% 2) != 
0) \n\ - %s.x = -%s.x;\n\ - else\n\ - %s.x = %s.x;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID)/2) %% 2) != 0) \n\ - %s.x += %s.y;\n\ - else\n\ - %s.x -= %s.y;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n\n"); - - - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if((sdataID < %" PRIu64 ")&&(sdataID >= %" PRIu64 ")){\n", sc->fftDim / 2, sc->fftDim / 4); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + %s];\n", sc->regIDs[0], 2 * (sc->fftDim / 2), sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID+1)/2) %% 2) != 0) \n\ - %s.x = -%s.x;\n\ - else\n\ - %s.x = %s.x;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID)/2) %% 2) != 0) \n\ - %s.x -= %s.y;\n\ - else\n\ - %s.x += %s.y;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - - sc->tempLen = sprintf(sc->tempStr, " if((sdataID < %" PRIu64 ")&&(sdataID >= %" PRIu64 ")){\n", 3 * sc->fftDim / 4, sc->fftDim / 2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(2*sdataID - %" PRIu64 ") * sharedStride + %s];\n", sc->regIDs[0], 2 * (sc->fftDim / 2), sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if 
(res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID+1)/2) %% 2) != 0) \n\ - %s.x = -%s.x;\n\ - else\n\ - %s.x = %s.x;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID)/2) %% 2) != 0) \n\ - %s.x += %s.y;\n\ - else\n\ - %s.x -= %s.y;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - - sc->tempLen = sprintf(sc->tempStr, " if((sdataID >= %" PRIu64 ")){\n", 3 * sc->fftDim / 4); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s = sdata[(%" PRIu64 " - 2*sdataID) * sharedStride + %s];\n", sc->regIDs[0], 2 * sc->fftDim - 1, sc->gl_LocalInvocationID_x); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID+1)/2) %% 2) != 0) \n\ - %s.x = -%s.x;\n\ - else\n\ - %s.x = %s.x;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " if ((((sdataID)/2) %% 2) != 0) \n\ - %s.x -= %s.y;\n\ - else\n\ - %s.x += %s.y;\n", sc->regIDs[1], sc->regIDs[0], sc->regIDs[1], sc->regIDs[0]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " }\n\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s.x *= 1.41421356237309504880%s;\n", sc->regIDs[1], LFending); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %s%s.x%s;\n", outputsStruct, convTypeLeft, sc->regIDs[1], convTypeRight); - 
else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %s%s.x%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeft, sc->regIDs[1], convTypeRight); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->zeropad[1]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - - res = appendZeropadEndReadWriteStage(sc); - if (res != VKFFT_SUCCESS) return res; - if ((1 + i + k * used_registers_write) * sc->localSize[0] * sc->localSize[1] > sc->fftDim * sc->localSize[0]) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->size[0] % sc->localSize[0] != 0) { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - } - if (sc->zeropadBluestein[1]) { - sc->fftDim = sc->fft_dim_full; - used_registers_write = (uint64_t)ceil(sc->fftDim / (double)sc->localSize[1]); - } - } - else { - - } - } - break; - } - } - //res = appendZeropadEnd(sc); - //if (res != VKFFT_SUCCESS) return res; - return res; -} -static inline VkFFTResult shaderGenVkFFT_R2C_decomposition(char* output, VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeInputMemory, const char* floatTypeOutputMemory, const char* floatTypeKernelMemory, const char* uintType, uint64_t type) { - VkFFTResult res = VKFFT_SUCCESS; - //appendLicense(output); - if (!sc->disableSetLocale) { - const char* loc_oldLocale = setlocale(LC_ALL, NULL); - strcpy(sc->oldLocale, loc_oldLocale); - setlocale(LC_ALL, "C"); - } - sc->output = output; - sc->tempStr = (char*)malloc(sizeof(char) * sc->maxTempLength); - if (!sc->tempStr) return VKFFT_ERROR_MALLOC_FAILED; - sc->tempLen = 0; - sc->currentLen = 0; - char vecType[30]; - char vecTypeInput[30]; - char vecTypeOutput[30]; - char inputsStruct[20] = ""; - char 
outputsStruct[20] = ""; - char LFending[4] = ""; - if (!strcmp(floatType, "float")) sprintf(LFending, "f"); -#if(VKFFT_BACKEND==0) - if (sc->inputBufferBlockNum == 1) - sprintf(inputsStruct, "inputs"); - else - sprintf(inputsStruct, ".inputs"); - if (sc->outputBufferBlockNum == 1) - sprintf(outputsStruct, "outputs"); - else - sprintf(outputsStruct, ".outputs"); - if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2"); - if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); - if (!strcmp(floatTypeInputMemory, "half")) sprintf(vecTypeInput, "f16vec2"); - if (!strcmp(floatTypeInputMemory, "float")) sprintf(vecTypeInput, "vec2"); - if (!strcmp(floatTypeInputMemory, "double")) sprintf(vecTypeInput, "dvec2"); - if (!strcmp(floatTypeOutputMemory, "half")) sprintf(vecTypeOutput, "f16vec2"); - if (!strcmp(floatTypeOutputMemory, "float")) sprintf(vecTypeOutput, "vec2"); - if (!strcmp(floatTypeOutputMemory, "double")) sprintf(vecTypeOutput, "dvec2"); - sprintf(sc->gl_LocalInvocationID_x, "gl_LocalInvocationID.x"); - sprintf(sc->gl_LocalInvocationID_y, "gl_LocalInvocationID.y"); - sprintf(sc->gl_LocalInvocationID_z, "gl_LocalInvocationID.z"); - switch (sc->swapComputeWorkGroupID) { - case 0: - sprintf(sc->gl_GlobalInvocationID_x, "gl_GlobalInvocationID.x"); - sprintf(sc->gl_GlobalInvocationID_y, "gl_GlobalInvocationID.y"); - sprintf(sc->gl_GlobalInvocationID_z, "gl_GlobalInvocationID.z"); - sprintf(sc->gl_WorkGroupID_x, "gl_WorkGroupID.x"); - sprintf(sc->gl_WorkGroupID_y, "gl_WorkGroupID.y"); - sprintf(sc->gl_WorkGroupID_z, "gl_WorkGroupID.z"); - break; - case 1: - sprintf(sc->gl_GlobalInvocationID_x, "(gl_LocalInvocationID.x + gl_WorkGroupID.y * gl_WorkGroupSize.x)"); - sprintf(sc->gl_GlobalInvocationID_y, "(gl_LocalInvocationID.y + gl_WorkGroupID.x * gl_WorkGroupSize.y)"); - sprintf(sc->gl_GlobalInvocationID_z, "gl_GlobalInvocationID.z"); - sprintf(sc->gl_WorkGroupID_x, "gl_WorkGroupID.y"); - 
sprintf(sc->gl_WorkGroupID_y, "gl_WorkGroupID.x"); - sprintf(sc->gl_WorkGroupID_z, "gl_WorkGroupID.z"); - break; - case 2: - sprintf(sc->gl_GlobalInvocationID_x, "(gl_LocalInvocationID.x + gl_WorkGroupID.z * gl_WorkGroupSize.x)"); - sprintf(sc->gl_GlobalInvocationID_y, "gl_GlobalInvocationID.y"); - sprintf(sc->gl_GlobalInvocationID_z, "(gl_LocalInvocationID.z + gl_WorkGroupID.x * gl_WorkGroupSize.z)"); - sprintf(sc->gl_WorkGroupID_x, "gl_WorkGroupID.z"); - sprintf(sc->gl_WorkGroupID_y, "gl_WorkGroupID.y"); - sprintf(sc->gl_WorkGroupID_z, "gl_WorkGroupID.x"); - break; - } - sprintf(sc->gl_WorkGroupSize_x, "gl_WorkGroupSize.x"); - sprintf(sc->gl_WorkGroupSize_y, "gl_WorkGroupSize.y"); - sprintf(sc->gl_WorkGroupSize_z, "gl_WorkGroupSize.z"); - sprintf(sc->gl_SubgroupInvocationID, "gl_SubgroupInvocationID"); - sprintf(sc->gl_SubgroupID, "gl_SubgroupID"); - if (!strcmp(floatType, "double")) sprintf(LFending, "LF"); - char cosDef[20] = "cos"; - char sinDef[20] = "sin"; -#elif(VKFFT_BACKEND==1) - sprintf(inputsStruct, "inputs"); - sprintf(outputsStruct, "outputs"); - if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2"); - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - if (!strcmp(floatTypeInputMemory, "half")) sprintf(vecTypeInput, "f16vec2"); - if (!strcmp(floatTypeInputMemory, "float")) sprintf(vecTypeInput, "float2"); - if (!strcmp(floatTypeInputMemory, "double")) sprintf(vecTypeInput, "double2"); - if (!strcmp(floatTypeOutputMemory, "half")) sprintf(vecTypeOutput, "f16vec2"); - if (!strcmp(floatTypeOutputMemory, "float")) sprintf(vecTypeOutput, "float2"); - if (!strcmp(floatTypeOutputMemory, "double")) sprintf(vecTypeOutput, "double2"); - sprintf(sc->gl_LocalInvocationID_x, "threadIdx.x"); - sprintf(sc->gl_LocalInvocationID_y, "threadIdx.y"); - sprintf(sc->gl_LocalInvocationID_z, "threadIdx.z"); - switch (sc->swapComputeWorkGroupID) { - case 0: - 
sprintf(sc->gl_GlobalInvocationID_x, "(threadIdx.x + blockIdx.x * blockDim.x)"); - sprintf(sc->gl_GlobalInvocationID_y, "(threadIdx.y + blockIdx.y * blockDim.y)"); - sprintf(sc->gl_GlobalInvocationID_z, "(threadIdx.z + blockIdx.z * blockDim.z)"); - sprintf(sc->gl_WorkGroupID_x, "blockIdx.x"); - sprintf(sc->gl_WorkGroupID_y, "blockIdx.y"); - sprintf(sc->gl_WorkGroupID_z, "blockIdx.z"); - break; - case 1: - sprintf(sc->gl_GlobalInvocationID_x, "(threadIdx.x + blockIdx.y * blockDim.x)"); - sprintf(sc->gl_GlobalInvocationID_y, "(threadIdx.y + blockIdx.x * blockDim.y)"); - sprintf(sc->gl_GlobalInvocationID_z, "(threadIdx.z + blockIdx.z * blockDim.z)"); - sprintf(sc->gl_WorkGroupID_x, "blockIdx.y"); - sprintf(sc->gl_WorkGroupID_y, "blockIdx.x"); - sprintf(sc->gl_WorkGroupID_z, "blockIdx.z"); - break; - case 2: - sprintf(sc->gl_GlobalInvocationID_x, "(threadIdx.x + blockIdx.z * blockDim.x)"); - sprintf(sc->gl_GlobalInvocationID_y, "(threadIdx.y + blockIdx.y * blockDim.y)"); - sprintf(sc->gl_GlobalInvocationID_z, "(threadIdx.z + blockIdx.x * blockDim.z)"); - sprintf(sc->gl_WorkGroupID_x, "blockIdx.z"); - sprintf(sc->gl_WorkGroupID_y, "blockIdx.y"); - sprintf(sc->gl_WorkGroupID_z, "blockIdx.x"); - break; - } - sprintf(sc->gl_WorkGroupSize_x, "blockDim.x"); - sprintf(sc->gl_WorkGroupSize_y, "blockDim.y"); - sprintf(sc->gl_WorkGroupSize_z, "blockDim.z"); - sprintf(sc->gl_SubgroupInvocationID, "(threadIdx.x %% %" PRIu64 ")", sc->warpSize); - sprintf(sc->gl_SubgroupID, "(threadIdx.x / %" PRIu64 ")", sc->warpSize); - if (!strcmp(floatType, "double")) sprintf(LFending, "l"); - char cosDef[20] = "__cosf"; - char sinDef[20] = "__sinf"; -#elif(VKFFT_BACKEND==2) - sprintf(inputsStruct, "inputs"); - sprintf(outputsStruct, "outputs"); - if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2"); - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - if (!strcmp(floatTypeInputMemory, "half")) 
sprintf(vecTypeInput, "f16vec2"); - if (!strcmp(floatTypeInputMemory, "float")) sprintf(vecTypeInput, "float2"); - if (!strcmp(floatTypeInputMemory, "double")) sprintf(vecTypeInput, "double2"); - if (!strcmp(floatTypeOutputMemory, "half")) sprintf(vecTypeOutput, "f16vec2"); - if (!strcmp(floatTypeOutputMemory, "float")) sprintf(vecTypeOutput, "float2"); - if (!strcmp(floatTypeOutputMemory, "double")) sprintf(vecTypeOutput, "double2"); - sprintf(sc->gl_LocalInvocationID_x, sc->localSize[0] > 1 ? "threadIdx.x" : "0"); - sprintf(sc->gl_LocalInvocationID_y, sc->localSize[1] > 1 ? "threadIdx.y" : "0"); - sprintf(sc->gl_LocalInvocationID_z, sc->localSize[2] > 1 ? "threadIdx.z" : "0"); - switch (sc->swapComputeWorkGroupID) { - case 0: - sprintf(sc->gl_WorkGroupID_x, "blockIdx.x"); - sprintf(sc->gl_WorkGroupID_y, "blockIdx.y"); - sprintf(sc->gl_WorkGroupID_z, "blockIdx.z"); - break; - case 1: - sprintf(sc->gl_WorkGroupID_x, "blockIdx.y"); - sprintf(sc->gl_WorkGroupID_y, "blockIdx.x"); - sprintf(sc->gl_WorkGroupID_z, "blockIdx.z"); - break; - case 2: - sprintf(sc->gl_WorkGroupID_x, "blockIdx.z"); - sprintf(sc->gl_WorkGroupID_y, "blockIdx.y"); - sprintf(sc->gl_WorkGroupID_z, "blockIdx.x"); - break; - } - sprintf(sc->gl_WorkGroupSize_x, "%" PRIu64, sc->localSize[0]); - sprintf(sc->gl_WorkGroupSize_y, "%" PRIu64, sc->localSize[1]); - sprintf(sc->gl_WorkGroupSize_z, "%" PRIu64, sc->localSize[2]); - sprintf(sc->gl_GlobalInvocationID_x, "(%s + %s * %s)", sc->gl_LocalInvocationID_x, sc->gl_WorkGroupID_x, sc->gl_WorkGroupSize_x); - sprintf(sc->gl_GlobalInvocationID_y, "(%s + %s * %s)", sc->gl_LocalInvocationID_y, sc->gl_WorkGroupID_y, sc->gl_WorkGroupSize_y); - sprintf(sc->gl_GlobalInvocationID_z, "(%s + %s * %s)", sc->gl_LocalInvocationID_z, sc->gl_WorkGroupID_z, sc->gl_WorkGroupSize_z); - sprintf(sc->gl_SubgroupInvocationID, "(threadIdx.x %% %" PRIu64 ")", sc->warpSize); - sprintf(sc->gl_SubgroupID, "(threadIdx.x / %" PRIu64 ")", sc->warpSize); - if (!strcmp(floatType, "double")) 
sprintf(LFending, "l"); - char cosDef[20] = "__cosf"; - char sinDef[20] = "__sinf"; -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sprintf(inputsStruct, "inputs"); - sprintf(outputsStruct, "outputs"); - if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2"); - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - if (!strcmp(floatTypeInputMemory, "half")) sprintf(vecTypeInput, "f16vec2"); - if (!strcmp(floatTypeInputMemory, "float")) sprintf(vecTypeInput, "float2"); - if (!strcmp(floatTypeInputMemory, "double")) sprintf(vecTypeInput, "double2"); - if (!strcmp(floatTypeOutputMemory, "half")) sprintf(vecTypeOutput, "f16vec2"); - if (!strcmp(floatTypeOutputMemory, "float")) sprintf(vecTypeOutput, "float2"); - if (!strcmp(floatTypeOutputMemory, "double")) sprintf(vecTypeOutput, "double2"); - sprintf(sc->gl_LocalInvocationID_x, "get_local_id(0)"); - sprintf(sc->gl_LocalInvocationID_y, "get_local_id(1)"); - sprintf(sc->gl_LocalInvocationID_z, "get_local_id(2)"); - switch (sc->swapComputeWorkGroupID) { - case 0: - sprintf(sc->gl_GlobalInvocationID_x, "get_global_id(0)"); - sprintf(sc->gl_GlobalInvocationID_y, "get_global_id(1)"); - sprintf(sc->gl_GlobalInvocationID_z, "get_global_id(2)"); - sprintf(sc->gl_WorkGroupID_x, "get_group_id(0)"); - sprintf(sc->gl_WorkGroupID_y, "get_group_id(1)"); - sprintf(sc->gl_WorkGroupID_z, "get_group_id(2)"); - break; - case 1: - sprintf(sc->gl_GlobalInvocationID_x, "(get_local_id(0) + get_group_id(1) * get_local_size(0))"); - sprintf(sc->gl_GlobalInvocationID_y, "(get_local_id(1) + get_group_id(0) * get_local_size(1))"); - sprintf(sc->gl_GlobalInvocationID_z, "get_global_id(2)"); - sprintf(sc->gl_WorkGroupID_x, "get_group_id(1)"); - sprintf(sc->gl_WorkGroupID_y, "get_group_id(0)"); - sprintf(sc->gl_WorkGroupID_z, "get_group_id(2)"); - break; - case 2: - sprintf(sc->gl_GlobalInvocationID_x, "(get_local_id(0) + get_group_id(2) * get_local_size(0))"); - 
sprintf(sc->gl_GlobalInvocationID_y, "get_global_id(1)"); - sprintf(sc->gl_GlobalInvocationID_z, "(get_local_id(2) + get_group_id(0) * get_local_size(2))"); - sprintf(sc->gl_WorkGroupID_x, "get_group_id(2)"); - sprintf(sc->gl_WorkGroupID_y, "get_group_id(1)"); - sprintf(sc->gl_WorkGroupID_z, "get_group_id(0)"); - break; - } - sprintf(sc->gl_WorkGroupSize_x, "get_local_size(0)"); - sprintf(sc->gl_WorkGroupSize_y, "get_local_size(1)"); - sprintf(sc->gl_WorkGroupSize_z, "get_local_size(2)"); - //if (!strcmp(floatType, "double")) sprintf(LFending, "l"); - char cosDef[20] = "native_cos"; - char sinDef[20] = "native_sin"; -#endif - sprintf(sc->vecType, "%s", vecType); - sprintf(sc->stageInvocationID, "stageInvocationID"); - sprintf(sc->blockInvocationID, "blockInvocationID"); - sprintf(sc->tshuffle, "tshuffle"); - sprintf(sc->sharedStride, "sharedStride"); - sprintf(sc->combinedID, "combinedID"); - sprintf(sc->inoutID, "inoutID"); - sprintf(sc->sdataID, "sdataID"); - - char convTypeLeftInput[20] = ""; - char convTypeRightInput[20] = ""; - if ((!strcmp(floatType, "float")) && (strcmp(floatTypeInputMemory, "float"))) { -#if(VKFFT_BACKEND==0) - sprintf(convTypeLeftInput, "vec2("); - sprintf(convTypeRightInput, ")"); -#elif(VKFFT_BACKEND==1) - sprintf(convTypeLeftInput, "conv_float2("); - sprintf(convTypeRightInput, ")"); -#elif(VKFFT_BACKEND==2) - sprintf(convTypeLeftInput, "conv_float2("); - sprintf(convTypeRightInput, ")"); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sprintf(convTypeLeftInput, "conv_float2("); - sprintf(convTypeRightInput, ")"); -#endif - } - if ((!strcmp(floatType, "double")) && (strcmp(floatTypeInputMemory, "double"))) { -#if(VKFFT_BACKEND==0) - sprintf(convTypeLeftInput, "dvec2("); - sprintf(convTypeRightInput, ")"); -#elif(VKFFT_BACKEND==1) - sprintf(convTypeLeftInput, "conv_double2("); - sprintf(convTypeRightInput, ")"); -#elif(VKFFT_BACKEND==2) - sprintf(convTypeLeftInput, "conv_double2("); - sprintf(convTypeRightInput, ")"); 
-#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sprintf(convTypeLeftInput, "conv_double2("); - sprintf(convTypeRightInput, ")"); -#endif - } - - char convTypeLeftOutput[20] = ""; - char convTypeRightOutput[20] = ""; - if ((!strcmp(floatTypeOutputMemory, "half")) && (strcmp(floatType, "half"))) { - sprintf(convTypeLeftOutput, "f16vec2("); - sprintf(convTypeRightOutput, ")"); - } - if ((!strcmp(floatTypeOutputMemory, "float")) && (strcmp(floatType, "float"))) { -#if(VKFFT_BACKEND==0) - sprintf(convTypeLeftOutput, "vec2("); - sprintf(convTypeRightOutput, ")"); -#elif(VKFFT_BACKEND==1) - sprintf(convTypeLeftOutput, "(float2)"); -#elif(VKFFT_BACKEND==2) - sprintf(convTypeLeftOutput, "(float2)"); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sprintf(convTypeLeftOutput, "conv_float2("); - sprintf(convTypeRightOutput, ")"); -#endif - } - if ((!strcmp(floatTypeOutputMemory, "double")) && (strcmp(floatType, "double"))) { -#if(VKFFT_BACKEND==0) - sprintf(convTypeLeftOutput, "dvec2("); - sprintf(convTypeRightOutput, ")"); -#elif(VKFFT_BACKEND==1) - sprintf(convTypeLeftOutput, "(double2)"); -#elif(VKFFT_BACKEND==2) - sprintf(convTypeLeftOutput, "(double2)"); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sprintf(convTypeLeftOutput, "conv_double2("); - sprintf(convTypeRightOutput, ")"); -#endif - } - //sprintf(sc->tempReg, "temp"); - res = appendVersion(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendExtensions(sc, floatType, floatTypeInputMemory, floatTypeOutputMemory, floatTypeKernelMemory); - if (res != VKFFT_SUCCESS) return res; - res = appendLayoutVkFFT(sc); - if (res != VKFFT_SUCCESS) return res; - res = appendConstantsVkFFT(sc, floatType, uintType); - if (res != VKFFT_SUCCESS) return res; - if ((!sc->LUT) && (!strcmp(floatType, "double"))) { - res = appendSinCos20(sc, floatType, uintType); - if (res != VKFFT_SUCCESS) return res; - } - if (strcmp(floatType, floatTypeInputMemory)) { - res = appendConversion(sc, floatType, floatTypeInputMemory); - if (res != 
VKFFT_SUCCESS) return res; - } - if (strcmp(floatType, floatTypeOutputMemory) && strcmp(floatTypeInputMemory, floatTypeOutputMemory)) { - res = appendConversion(sc, floatType, floatTypeOutputMemory); - if (res != VKFFT_SUCCESS) return res; - } - res = appendPushConstantsVkFFT(sc, floatType, uintType); - if (res != VKFFT_SUCCESS) return res; - uint64_t id = 0; - res = appendInputLayoutVkFFT(sc, id, floatTypeInputMemory, 0); - if (res != VKFFT_SUCCESS) return res; - id++; - res = appendOutputLayoutVkFFT(sc, id, floatTypeOutputMemory, 0); - if (res != VKFFT_SUCCESS) return res; - id++; - if (sc->convolutionStep) { - res = appendKernelLayoutVkFFT(sc, id, floatTypeKernelMemory); - if (res != VKFFT_SUCCESS) return res; - id++; - } - if (sc->LUT) { - res = appendLUTLayoutVkFFT(sc, id, floatType); - if (res != VKFFT_SUCCESS) return res; - id++; - } - //appendIndexInputVkFFT(sc, uintType, type); - //appendIndexOutputVkFFT(sc, uintType, type); - /*uint64_t appendedRadix[10] = { 0,0,0,0,0,0,0,0,0,0 }; - for (uint64_t i = 0; i < sc->numStages; i++) { - if (appendedRadix[sc->stageRadix[i]] == 0) { - appendedRadix[sc->stageRadix[i]] = 1; - appendRadixKernelVkFFT(sc, floatType, uintType, sc->stageRadix[i]); - } - }*/ -#if(VKFFT_BACKEND==0) - sc->tempLen = sprintf(sc->tempStr, "void main() {\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; -#elif(VKFFT_BACKEND==1) - sc->tempLen = sprintf(sc->tempStr, "extern \"C\" __global__ __launch_bounds__(%" PRIu64 ") void VkFFT_main_R2C ", sc->localSize[0] * sc->localSize[1] * sc->localSize[2]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", vecTypeInput, vecTypeOutput); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->convolutionStep) { - sc->tempLen = sprintf(sc->tempStr, ", %s* kernel_obj", vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->LUT) { - sc->tempLen = 
sprintf(sc->tempStr, ", %s* twiddleLUT", vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, ") {\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, ", const PushConsts consts) {\n"); -#elif(VKFFT_BACKEND==2) - sc->tempLen = sprintf(sc->tempStr, "extern \"C\" __launch_bounds__(%" PRIu64 ") __global__ void VkFFT_main_R2C ", sc->localSize[0] * sc->localSize[1] * sc->localSize[2]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", vecTypeInput, vecTypeOutput); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->convolutionStep) { - sc->tempLen = sprintf(sc->tempStr, ", %s* kernel_obj", vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, ", %s* twiddleLUT", vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, ") {\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, ", const PushConsts consts) {\n"); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sc->tempLen = sprintf(sc->tempStr, "__kernel __attribute__((reqd_work_group_size(%" PRIu64 ", %" PRIu64 ", %" PRIu64 "))) void VkFFT_main_R2C ", sc->localSize[0], sc->localSize[1], sc->localSize[2]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", vecTypeInput, vecTypeOutput); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->convolutionStep) { - sc->tempLen = sprintf(sc->tempStr, ", __global %s* kernel_obj", vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, ", __global %s* twiddleLUT", 
vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->pushConstantsStructSize > 0) { - sc->tempLen = sprintf(sc->tempStr, ", PushConsts consts"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, ") {\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //sc->tempLen = sprintf(sc->tempStr, ", const PushConsts consts) {\n"); -#endif - char index_x[2000] = ""; - char idX[500] = ""; - if (sc->performWorkGroupShift[0]) - sprintf(idX, "(%s + consts.workGroupShiftX * %s)", sc->gl_GlobalInvocationID_x, sc->gl_WorkGroupSize_x); - else - sprintf(idX, "%s", sc->gl_GlobalInvocationID_x); - //res = appendZeropadStart(sc); - //if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "%s id_x = %s %% %" PRIu64 ";\n", uintType, idX, (uint64_t)ceil(sc->size[0] / 4.0)); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "%s id_y = (%s / %" PRIu64 ") %% %" PRIu64 ";\n", uintType, idX, (uint64_t)ceil(sc->size[0] / 4.0), sc->size[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "%s id_z = (%s / %" PRIu64 ") / %" PRIu64 ";\n", uintType, idX, (uint64_t)ceil(sc->size[0] / 4.0), sc->size[1]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "if (%s < %" PRIu64 "){\n", idX, (uint64_t)ceil(sc->size[0] / 4.0) * sc->size[1] * sc->size[2]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, "%s inoutID = ", uintType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(index_x, "id_x + id_y*%" PRIu64 " +id_z*%" PRIu64 "", sc->inputStride[1], sc->inputStride[2]); - res = indexInputVkFFT(sc, uintType, 0, index_x, 0, 0, 0); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = 
VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, "%s inoutID2;\n", uintType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "%s inoutID3;\n", uintType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s t0 = %s%s[inoutID]%s;\n", vecType, convTypeLeftInput, inputsStruct, convTypeRightInput); - else - sc->tempLen = sprintf(sc->tempStr, " %s t0 = %sinputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "]%s;\n", vecType, convTypeLeftInput, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRightInput); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s tf;\n", vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->size[0] % 4 == 0) { - sc->tempLen = sprintf(sc->tempStr, "if (id_x == 0) {\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " inoutID2 = "); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(index_x, "%" PRIu64 " + id_y*%" PRIu64 " +id_z*%" PRIu64 "", (sc->size[0] / 2), sc->inputStride[1], sc->inputStride[2]); - res = indexInputVkFFT(sc, uintType, 0, index_x, 0, 0, 0); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " inoutID3 = "); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(index_x, "%" PRIu64 " + id_y*%" PRIu64 " +id_z*%" PRIu64 "", (uint64_t)ceil(sc->size[0] / 4.0), sc->inputStride[1], sc->inputStride[2]); - res = indexInputVkFFT(sc, uintType, 0, index_x, 0, 0, 0); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != 
VKFFT_SUCCESS) return res; - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " tf = %s%s[inoutID3]%s;\n", convTypeLeftInput, inputsStruct, convTypeRightInput); - else - sc->tempLen = sprintf(sc->tempStr, " tf = %sinputBlocks[inoutID3 / %" PRIu64 "]%s[inoutID3 %% %" PRIu64 "]%s;\n", convTypeLeftInput, sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRightInput); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, "} else {\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " inoutID2 = "); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(index_x, "(%" PRIu64 "-id_x) + id_y*%" PRIu64 " +id_z*%" PRIu64 "", (sc->size[0] / 2), sc->inputStride[1], sc->inputStride[2]); - res = indexInputVkFFT(sc, uintType, 0, index_x, 0, 0, 0); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "}"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, "inoutID2 = "); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sprintf(index_x, "(%" PRIu64 "-id_x) + id_y*%" PRIu64 " +id_z*%" PRIu64 "", (sc->size[0] / 2), sc->inputStride[1], sc->inputStride[2]); - res = indexInputVkFFT(sc, uintType, 0, index_x, 0, 0, 0); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, ";\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->inputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s t1 = %s%s[inoutID2]%s;\n", vecType, convTypeLeftInput, inputsStruct, convTypeRightInput); - else - sc->tempLen = sprintf(sc->tempStr, " %s t1 = %sinputBlocks[inoutID2 / %" PRIu64 "]%s[inoutID2 %% %" PRIu64 "]%s;\n", vecType, convTypeLeftInput, 
sc->inputBufferBlockSize, inputsStruct, sc->inputBufferBlockSize, convTypeRightInput); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, " %s t2;\n", vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " %s t3;\n", vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "if (id_x == 0) {\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->size[0] % 4 == 0) { - if (!sc->inverse) { - sc->tempLen = sprintf(sc->tempStr, " t2.x = t0.x+t0.y;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " t2.y = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " t3.x = t0.x-t0.y;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " t3.y = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " t2.x = (t0.x+t1.x);\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " t2.y = (t0.x-t1.x);\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - sc->tempLen = sprintf(sc->tempStr, " tf.y = -tf.y;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (sc->inverse) { - res = VkMulComplexNumber(sc, "tf", "tf", "2"); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %st2%s;\n", outputsStruct, convTypeLeftOutput, convTypeRightOutput); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %st2%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeftOutput, convTypeRightOutput); - res = 
VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (!sc->inverse) { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID2] = %st3%s;\n", outputsStruct, convTypeLeftOutput, convTypeRightOutput); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID2 / %" PRIu64 "]%s[inoutID2 %% %" PRIu64 "] = %st3%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeftOutput, convTypeRightOutput); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID3] = %stf%s;\n", outputsStruct, convTypeLeftOutput, convTypeRightOutput); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID3 / %" PRIu64 "]%s[inoutID3 %% %" PRIu64 "] = %stf%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeftOutput, convTypeRightOutput); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - } - else { - if (!sc->inverse) { - sc->tempLen = sprintf(sc->tempStr, " t2.x = t0.x+t0.y;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " t2.y = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " t3.x = t0.x-t0.y;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " t3.y = 0;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " t2.x = (t0.x+t1.x);\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " t2.y = (t0.x-t1.x);\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %st2%s;\n", outputsStruct, convTypeLeftOutput, convTypeRightOutput); - else - sc->tempLen = 
sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %st2%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeftOutput, convTypeRightOutput); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (!sc->inverse) { - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID2] = %st3%s;\n", outputsStruct, convTypeLeftOutput, convTypeRightOutput); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID2 / %" PRIu64 "]%s[inoutID2 %% %" PRIu64 "] = %st3%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeftOutput, convTypeRightOutput); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - sc->tempLen = sprintf(sc->tempStr, "} else {\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - res = VkAddComplex(sc, "t2", "t0", "t1"); - if (res != VKFFT_SUCCESS) return res; - res = VkSubComplex(sc, "t3", "t0", "t1"); - if (res != VKFFT_SUCCESS) return res; - if (!sc->inverse) { - res = VkMulComplexNumber(sc, "t2", "t2", "0.5"); - if (res != VKFFT_SUCCESS) return res; - res = VkMulComplexNumber(sc, "t3", "t3", "0.5"); - if (res != VKFFT_SUCCESS) return res; - } - if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, " tf = twiddleLUT[id_x];\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " %s angle = (loc_PI*id_x)/%" PRIu64 ";\n", floatType, sc->size[0] / 2); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - if (!strcmp(floatType, "float")) { - sc->tempLen = sprintf(sc->tempStr, " tf.x = %s(angle);\n", cosDef); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " tf.y = %s(angle);\n", sinDef); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - if (!strcmp(floatType, "double")) { - sc->tempLen = sprintf(sc->tempStr, " tf = 
sincos_20(angle);\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - } - if (!sc->inverse) { - sc->tempLen = sprintf(sc->tempStr, " t0.x = tf.x*t2.y-tf.y*t3.x;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " t0.y = -tf.y*t2.y-tf.x*t3.x;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " t1.x = t2.x-t0.x;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " t1.y = -t3.y+t0.y;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " t0.x = t2.x+t0.x;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " t0.y = t3.y+t0.y;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - else { - sc->tempLen = sprintf(sc->tempStr, " t0.x = tf.x*t2.y+tf.y*t3.x;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " t0.y = -tf.y*t2.y+tf.x*t3.x;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " t1.x = t2.x+t0.x;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " t1.y = -t3.y+t0.y;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " t0.x = t2.x-t0.x;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, " t0.y = t3.y+t0.y;\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - } - //sc->tempLen = sprintf(sc->tempStr, " t0.x = t2.x+tf.x*t2.y-tf.y*t3.x;\n"); - //sc->tempLen = sprintf(sc->tempStr, " t0.y = t3.y-tf.y*t2.y-tf.x*t3.x;\n"); - //sc->tempLen = sprintf(sc->tempStr, " t1.x = t2.x-tf.x*t2.y+tf.y*t3.x;\n"); - //sc->tempLen = sprintf(sc->tempStr, " t1.y 
= -t3.y-tf.y*t2.y-tf.x*t3.x;\n"); - - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID] = %st0%s;\n", outputsStruct, convTypeLeftOutput, convTypeRightOutput); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID / %" PRIu64 "]%s[inoutID %% %" PRIu64 "] = %st0%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeftOutput, convTypeRightOutput); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - if (sc->outputBufferBlockNum == 1) - sc->tempLen = sprintf(sc->tempStr, " %s[inoutID2] = %st1%s;\n", outputsStruct, convTypeLeftOutput, convTypeRightOutput); - else - sc->tempLen = sprintf(sc->tempStr, " outputBlocks[inoutID2 / %" PRIu64 "]%s[inoutID2 %% %" PRIu64 "] = %st1%s;\n", sc->outputBufferBlockSize, outputsStruct, sc->outputBufferBlockSize, convTypeLeftOutput, convTypeRightOutput); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - sc->tempLen = sprintf(sc->tempStr, "}\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "}\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - //res = appendZeropadEnd(sc); - //if (res != VKFFT_SUCCESS) return res; - sc->tempLen = sprintf(sc->tempStr, "}\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) return res; - - //printf("%s", output); - return res; -} -static inline void freeShaderGenVkFFT(VkFFTSpecializationConstantsLayout* sc) { - if (sc->tempStr) { - free(sc->tempStr); - sc->tempStr = 0; - } - if (sc->disableThreadsStart) { - free(sc->disableThreadsStart); - sc->disableThreadsStart = 0; - } - if (sc->disableThreadsEnd) { - free(sc->disableThreadsEnd); - sc->disableThreadsEnd = 0; - } - if (sc->regIDs) { - for (uint64_t i = 0; i < sc->registers_per_thread * sc->registerBoost; i++) { - if (sc->regIDs[i]) { - free(sc->regIDs[i]); - sc->regIDs[i] = 0; - } - } - free(sc->regIDs); - sc->regIDs = 0; - } - if (!sc->disableSetLocale) 
{ - if (sc->oldLocale) - { - setlocale(LC_ALL, sc->oldLocale); - memset(sc->oldLocale, 0, 100 * sizeof(char)); - } - } - if (sc->numRaderPrimes) { - free(sc->raderContainer); - sc->raderContainer = 0; - sc->currentRaderContainer = 0; - } -} -static inline VkFFTResult shaderGenVkFFT(char* output, VkFFTSpecializationConstantsLayout* sc, const char* floatType, const char* floatTypeInputMemory, const char* floatTypeOutputMemory, const char* floatTypeKernelMemory, const char* uintType, uint64_t type) { - VkFFTResult res = VKFFT_SUCCESS; - //appendLicense(output); - if (!sc->disableSetLocale) { - const char* loc_oldLocale = setlocale(LC_ALL, NULL); - strcpy(sc->oldLocale, loc_oldLocale); - setlocale(LC_ALL, "C"); - } - sc->output = output; - sc->tempStr = (char*)malloc(sizeof(char) * sc->maxTempLength); - if (!sc->tempStr) return VKFFT_ERROR_MALLOC_FAILED; - sc->tempLen = 0; - sc->currentLen = 0; - char vecType[30]; - char vecTypeInput[30]; - char vecTypeOutput[30]; - char uintType_32[30]; -#if(VKFFT_BACKEND==0) - sprintf(uintType_32, "uint"); - if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2"); - if (!strcmp(floatType, "float")) sprintf(vecType, "vec2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "dvec2"); - if (!strcmp(floatTypeInputMemory, "half")) sprintf(vecTypeInput, "f16vec2"); - if (!strcmp(floatTypeInputMemory, "float")) sprintf(vecTypeInput, "vec2"); - if (!strcmp(floatTypeInputMemory, "double")) sprintf(vecTypeInput, "dvec2"); - if (!strcmp(floatTypeOutputMemory, "half")) sprintf(vecTypeOutput, "f16vec2"); - if (!strcmp(floatTypeOutputMemory, "float")) sprintf(vecTypeOutput, "vec2"); - if (!strcmp(floatTypeOutputMemory, "double")) sprintf(vecTypeOutput, "dvec2"); - sprintf(sc->gl_LocalInvocationID_x, "gl_LocalInvocationID.x"); - sprintf(sc->gl_LocalInvocationID_y, "gl_LocalInvocationID.y"); - sprintf(sc->gl_LocalInvocationID_z, "gl_LocalInvocationID.z"); - switch (sc->swapComputeWorkGroupID) { - case 0: - 
sprintf(sc->gl_GlobalInvocationID_x, "gl_GlobalInvocationID.x"); - sprintf(sc->gl_GlobalInvocationID_y, "gl_GlobalInvocationID.y"); - sprintf(sc->gl_GlobalInvocationID_z, "gl_GlobalInvocationID.z"); - sprintf(sc->gl_WorkGroupID_x, "gl_WorkGroupID.x"); - sprintf(sc->gl_WorkGroupID_y, "gl_WorkGroupID.y"); - sprintf(sc->gl_WorkGroupID_z, "gl_WorkGroupID.z"); - break; - case 1: - sprintf(sc->gl_GlobalInvocationID_x, "(gl_LocalInvocationID.x + gl_WorkGroupID.y * gl_WorkGroupSize.x)"); - sprintf(sc->gl_GlobalInvocationID_y, "(gl_LocalInvocationID.y + gl_WorkGroupID.x * gl_WorkGroupSize.y)"); - sprintf(sc->gl_GlobalInvocationID_z, "gl_GlobalInvocationID.z"); - sprintf(sc->gl_WorkGroupID_x, "gl_WorkGroupID.y"); - sprintf(sc->gl_WorkGroupID_y, "gl_WorkGroupID.x"); - sprintf(sc->gl_WorkGroupID_z, "gl_WorkGroupID.z"); - break; - case 2: - sprintf(sc->gl_GlobalInvocationID_x, "(gl_LocalInvocationID.x + gl_WorkGroupID.z * gl_WorkGroupSize.x)"); - sprintf(sc->gl_GlobalInvocationID_y, "gl_GlobalInvocationID.y"); - sprintf(sc->gl_GlobalInvocationID_z, "(gl_LocalInvocationID.z + gl_WorkGroupID.x * gl_WorkGroupSize.z)"); - sprintf(sc->gl_WorkGroupID_x, "gl_WorkGroupID.z"); - sprintf(sc->gl_WorkGroupID_y, "gl_WorkGroupID.y"); - sprintf(sc->gl_WorkGroupID_z, "gl_WorkGroupID.x"); - break; - } - sprintf(sc->gl_WorkGroupSize_x, "gl_WorkGroupSize.x"); - sprintf(sc->gl_WorkGroupSize_y, "gl_WorkGroupSize.y"); - sprintf(sc->gl_WorkGroupSize_z, "gl_WorkGroupSize.z"); - sprintf(sc->gl_SubgroupInvocationID, "gl_SubgroupInvocationID"); - sprintf(sc->gl_SubgroupID, "gl_SubgroupID"); -#elif(VKFFT_BACKEND==1) - sprintf(uintType_32, "unsigned int"); - if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2"); - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - if (!strcmp(floatTypeInputMemory, "half")) sprintf(vecTypeInput, "f16vec2"); - if (!strcmp(floatTypeInputMemory, "float")) sprintf(vecTypeInput, "float2"); - if 
(!strcmp(floatTypeInputMemory, "double")) sprintf(vecTypeInput, "double2"); - if (!strcmp(floatTypeOutputMemory, "half")) sprintf(vecTypeOutput, "f16vec2"); - if (!strcmp(floatTypeOutputMemory, "float")) sprintf(vecTypeOutput, "float2"); - if (!strcmp(floatTypeOutputMemory, "double")) sprintf(vecTypeOutput, "double2"); - sprintf(sc->gl_LocalInvocationID_x, "threadIdx.x"); - sprintf(sc->gl_LocalInvocationID_y, "threadIdx.y"); - sprintf(sc->gl_LocalInvocationID_z, "threadIdx.z"); - switch (sc->swapComputeWorkGroupID) { - case 0: - sprintf(sc->gl_GlobalInvocationID_x, "(threadIdx.x + blockIdx.x * blockDim.x)"); - sprintf(sc->gl_GlobalInvocationID_y, "(threadIdx.y + blockIdx.y * blockDim.y)"); - sprintf(sc->gl_GlobalInvocationID_z, "(threadIdx.z + blockIdx.z * blockDim.z)"); - sprintf(sc->gl_WorkGroupID_x, "blockIdx.x"); - sprintf(sc->gl_WorkGroupID_y, "blockIdx.y"); - sprintf(sc->gl_WorkGroupID_z, "blockIdx.z"); - break; - case 1: - sprintf(sc->gl_GlobalInvocationID_x, "(threadIdx.x + blockIdx.y * blockDim.x)"); - sprintf(sc->gl_GlobalInvocationID_y, "(threadIdx.y + blockIdx.x * blockDim.y)"); - sprintf(sc->gl_GlobalInvocationID_z, "(threadIdx.z + blockIdx.z * blockDim.z)"); - sprintf(sc->gl_WorkGroupID_x, "blockIdx.y"); - sprintf(sc->gl_WorkGroupID_y, "blockIdx.x"); - sprintf(sc->gl_WorkGroupID_z, "blockIdx.z"); - break; - case 2: - sprintf(sc->gl_GlobalInvocationID_x, "(threadIdx.x + blockIdx.z * blockDim.x)"); - sprintf(sc->gl_GlobalInvocationID_y, "(threadIdx.y + blockIdx.y * blockDim.y)"); - sprintf(sc->gl_GlobalInvocationID_z, "(threadIdx.z + blockIdx.x * blockDim.z)"); - sprintf(sc->gl_WorkGroupID_x, "blockIdx.z"); - sprintf(sc->gl_WorkGroupID_y, "blockIdx.y"); - sprintf(sc->gl_WorkGroupID_z, "blockIdx.x"); - break; - } - sprintf(sc->gl_WorkGroupSize_x, "blockDim.x"); - sprintf(sc->gl_WorkGroupSize_y, "blockDim.y"); - sprintf(sc->gl_WorkGroupSize_z, "blockDim.z"); - sprintf(sc->gl_SubgroupInvocationID, "(threadIdx.x %% %" PRIu64 ")", sc->warpSize); - 
sprintf(sc->gl_SubgroupID, "(threadIdx.x / %" PRIu64 ")", sc->warpSize); -#elif(VKFFT_BACKEND==2) - sprintf(uintType_32, "unsigned int"); - if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2"); - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - if (!strcmp(floatTypeInputMemory, "half")) sprintf(vecTypeInput, "f16vec2"); - if (!strcmp(floatTypeInputMemory, "float")) sprintf(vecTypeInput, "float2"); - if (!strcmp(floatTypeInputMemory, "double")) sprintf(vecTypeInput, "double2"); - if (!strcmp(floatTypeOutputMemory, "half")) sprintf(vecTypeOutput, "f16vec2"); - if (!strcmp(floatTypeOutputMemory, "float")) sprintf(vecTypeOutput, "float2"); - if (!strcmp(floatTypeOutputMemory, "double")) sprintf(vecTypeOutput, "double2"); - sprintf(sc->gl_LocalInvocationID_x, sc->localSize[0] > 1 ? "threadIdx.x" : "0"); - sprintf(sc->gl_LocalInvocationID_y, sc->localSize[1] > 1 ? "threadIdx.y" : "0"); - sprintf(sc->gl_LocalInvocationID_z, sc->localSize[2] > 1 ? 
"threadIdx.z" : "0"); - switch (sc->swapComputeWorkGroupID) { - case 0: - sprintf(sc->gl_WorkGroupID_x, "blockIdx.x"); - sprintf(sc->gl_WorkGroupID_y, "blockIdx.y"); - sprintf(sc->gl_WorkGroupID_z, "blockIdx.z"); - break; - case 1: - sprintf(sc->gl_WorkGroupID_x, "blockIdx.y"); - sprintf(sc->gl_WorkGroupID_y, "blockIdx.x"); - sprintf(sc->gl_WorkGroupID_z, "blockIdx.z"); - break; - case 2: - sprintf(sc->gl_WorkGroupID_x, "blockIdx.z"); - sprintf(sc->gl_WorkGroupID_y, "blockIdx.y"); - sprintf(sc->gl_WorkGroupID_z, "blockIdx.x"); - break; - } - sprintf(sc->gl_WorkGroupSize_x, "%" PRIu64, sc->localSize[0]); - sprintf(sc->gl_WorkGroupSize_y, "%" PRIu64, sc->localSize[1]); - sprintf(sc->gl_WorkGroupSize_z, "%" PRIu64, sc->localSize[2]); - sprintf(sc->gl_GlobalInvocationID_x, "(%s + %s * %s)", sc->gl_LocalInvocationID_x, sc->gl_WorkGroupID_x, sc->gl_WorkGroupSize_x); - sprintf(sc->gl_GlobalInvocationID_y, "(%s + %s * %s)", sc->gl_LocalInvocationID_y, sc->gl_WorkGroupID_y, sc->gl_WorkGroupSize_y); - sprintf(sc->gl_GlobalInvocationID_z, "(%s + %s * %s)", sc->gl_LocalInvocationID_z, sc->gl_WorkGroupID_z, sc->gl_WorkGroupSize_z); - sprintf(sc->gl_SubgroupInvocationID, "(threadIdx.x %% %" PRIu64 ")", sc->warpSize); - sprintf(sc->gl_SubgroupID, "(threadIdx.x / %" PRIu64 ")", sc->warpSize); -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sprintf(uintType_32, "unsigned int"); - if (!strcmp(floatType, "half")) sprintf(vecType, "f16vec2"); - if (!strcmp(floatType, "float")) sprintf(vecType, "float2"); - if (!strcmp(floatType, "double")) sprintf(vecType, "double2"); - if (!strcmp(floatTypeInputMemory, "half")) sprintf(vecTypeInput, "f16vec2"); - if (!strcmp(floatTypeInputMemory, "float")) sprintf(vecTypeInput, "float2"); - if (!strcmp(floatTypeInputMemory, "double")) sprintf(vecTypeInput, "double2"); - if (!strcmp(floatTypeOutputMemory, "half")) sprintf(vecTypeOutput, "f16vec2"); - if (!strcmp(floatTypeOutputMemory, "float")) sprintf(vecTypeOutput, "float2"); - if 
(!strcmp(floatTypeOutputMemory, "double")) sprintf(vecTypeOutput, "double2"); - sprintf(sc->gl_LocalInvocationID_x, "get_local_id(0)"); - sprintf(sc->gl_LocalInvocationID_y, "get_local_id(1)"); - sprintf(sc->gl_LocalInvocationID_z, "get_local_id(2)"); - switch (sc->swapComputeWorkGroupID) { - case 0: - sprintf(sc->gl_GlobalInvocationID_x, "get_global_id(0)"); - sprintf(sc->gl_GlobalInvocationID_y, "get_global_id(1)"); - sprintf(sc->gl_GlobalInvocationID_z, "get_global_id(2)"); - sprintf(sc->gl_WorkGroupID_x, "get_group_id(0)"); - sprintf(sc->gl_WorkGroupID_y, "get_group_id(1)"); - sprintf(sc->gl_WorkGroupID_z, "get_group_id(2)"); - break; - case 1: - sprintf(sc->gl_GlobalInvocationID_x, "(get_local_id(0) + get_group_id(1) * get_local_size(0))"); - sprintf(sc->gl_GlobalInvocationID_y, "(get_local_id(1) + get_group_id(0) * get_local_size(1))"); - sprintf(sc->gl_GlobalInvocationID_z, "get_global_id(2)"); - sprintf(sc->gl_WorkGroupID_x, "get_group_id(1)"); - sprintf(sc->gl_WorkGroupID_y, "get_group_id(0)"); - sprintf(sc->gl_WorkGroupID_z, "get_group_id(2)"); - break; - case 2: - sprintf(sc->gl_GlobalInvocationID_x, "(get_local_id(0) + get_group_id(2) * get_local_size(0))"); - sprintf(sc->gl_GlobalInvocationID_y, "get_global_id(1)"); - sprintf(sc->gl_GlobalInvocationID_z, "(get_local_id(2) + get_group_id(0) * get_local_size(2))"); - sprintf(sc->gl_WorkGroupID_x, "get_group_id(2)"); - sprintf(sc->gl_WorkGroupID_y, "get_group_id(1)"); - sprintf(sc->gl_WorkGroupID_z, "get_group_id(0)"); - break; - } - sprintf(sc->gl_WorkGroupSize_x, "get_local_size(0)"); - sprintf(sc->gl_WorkGroupSize_y, "get_local_size(1)"); - sprintf(sc->gl_WorkGroupSize_z, "get_local_size(2)"); -#endif - sprintf(sc->vecType, "%s", vecType); - sprintf(sc->stageInvocationID, "stageInvocationID"); - sprintf(sc->blockInvocationID, "blockInvocationID"); - sprintf(sc->tshuffle, "tshuffle"); - sprintf(sc->sharedStride, "sharedStride"); - sprintf(sc->combinedID, "combinedID"); - sprintf(sc->inoutID, "inoutID"); 
- sprintf(sc->sdataID, "sdataID"); - sprintf(sc->raderIDx, "raderIDx"); - sprintf(sc->raderIDx2, "raderIDx2"); - //sprintf(sc->tempReg, "temp"); - sc->disableThreadsStart = (char*)malloc(sizeof(char) * 500); - if (!sc->disableThreadsStart) { - freeShaderGenVkFFT(sc); - return VKFFT_ERROR_MALLOC_FAILED; - } - sc->disableThreadsEnd = (char*)malloc(sizeof(char) * 2); - if (!sc->disableThreadsEnd) { - freeShaderGenVkFFT(sc); - return VKFFT_ERROR_MALLOC_FAILED; - } - sc->disableThreadsStart[0] = 0; - sc->disableThreadsEnd[0] = 0; - res = appendVersion(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - res = appendExtensions(sc, floatType, floatTypeInputMemory, floatTypeOutputMemory, floatTypeKernelMemory); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - res = appendLayoutVkFFT(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - res = appendConstantsVkFFT(sc, floatType, uintType); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - if ((!sc->LUT) && (!strcmp(floatType, "double"))) { - res = appendSinCos20(sc, floatType, uintType); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - - if (strcmp(floatType, floatTypeInputMemory)) { - res = appendConversion(sc, floatType, floatTypeInputMemory); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - if (strcmp(floatType, floatTypeOutputMemory) && strcmp(floatTypeInputMemory, floatTypeOutputMemory)) { - res = appendConversion(sc, floatType, floatTypeOutputMemory); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - res = appendPushConstantsVkFFT(sc, floatType, uintType); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - uint64_t id = 0; - res = appendInputLayoutVkFFT(sc, id, floatTypeInputMemory, type); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - id++; - res = 
appendOutputLayoutVkFFT(sc, id, floatTypeOutputMemory, type); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - id++; - if (sc->convolutionStep) { - res = appendKernelLayoutVkFFT(sc, id, floatTypeKernelMemory); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - id++; - } - if (sc->LUT) { - res = appendLUTLayoutVkFFT(sc, id, floatType); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - id++; - } - if (sc->raderUintLUT) { - res = appendRaderUintLUTLayoutVkFFT(sc, id); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - id++; - } - if (sc->useBluesteinFFT) { - res = appendBluesteinLayoutVkFFT(sc, id, floatType); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - if (sc->BluesteinConvolutionStep) - id++; - if (sc->BluesteinPreMultiplication || sc->BluesteinPostMultiplication) - id++; - } - //appendIndexInputVkFFT(sc, uintType, type); - //appendIndexOutputVkFFT(sc, uintType, type); - /*uint64_t appendedRadix[10] = { 0,0,0,0,0,0,0,0,0,0 }; - for (uint64_t i = 0; i < sc->numStages; i++) { - if (appendedRadix[sc->stageRadix[i]] == 0) { - appendedRadix[sc->stageRadix[i]] = 1; - appendRadixKernelVkFFT(sc, floatType, uintType, sc->stageRadix[i]); - } - }*/ - uint64_t locType = (((type == 0) || (type == 5) || (type == 6) || (type == 110) || (type == 120) || (type == 130) || (type == 140) || (type == 142) || (type == 144)) && (sc->axisSwapped)) ? 
1 : type; -#if(VKFFT_BACKEND==0) - res = appendSharedMemoryVkFFT(sc, floatType, uintType, locType); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - sc->tempLen = sprintf(sc->tempStr, "void main() {\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } -#elif(VKFFT_BACKEND==1) - sc->tempLen = sprintf(sc->tempStr, "extern __shared__ float shared[];\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - sc->tempLen = sprintf(sc->tempStr, "extern \"C\" __global__ void __launch_bounds__(%" PRIu64 ") VkFFT_main ", sc->localSize[0] * sc->localSize[1] * sc->localSize[2]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - switch (type) { - case 5: - { - sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, vecTypeOutput); - break; - } - case 6: - { - sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", vecTypeInput, floatTypeOutputMemory); - break; - } - case 110: - { - sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 111: - { - sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 120: - { - sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 121: - { - sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 130: - { - sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 131: - { - sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 140: - { - sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* 
outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 141: - { - sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 142: - { - sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 143: - { - sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 144: - { - sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 145: - { - sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - default: - { - sc->tempLen = sprintf(sc->tempStr, "(%s* inputs, %s* outputs", vecTypeInput, vecTypeOutput); - break; - } - } - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - - if (sc->convolutionStep) { - sc->tempLen = sprintf(sc->tempStr, ", %s* kernel_obj", vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, ", %s* twiddleLUT", vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - if (sc->raderUintLUT) { - sc->tempLen = sprintf(sc->tempStr, ", %s* g_pow", uintType_32); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - if (sc->BluesteinConvolutionStep) { - sc->tempLen = sprintf(sc->tempStr, ", %s* BluesteinConvolutionKernel", vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - if (sc->BluesteinPreMultiplication || sc->BluesteinPostMultiplication) { - sc->tempLen = sprintf(sc->tempStr, ", %s* BluesteinMultiplication", vecType); - res = VkAppendLine(sc); - if (res 
!= VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - sc->tempLen = sprintf(sc->tempStr, ") {\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - //sc->tempLen = sprintf(sc->tempStr, ", const PushConsts consts) {\n"); - res = appendSharedMemoryVkFFT(sc, floatType, uintType, locType); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } -#elif(VKFFT_BACKEND==2) - sc->tempLen = sprintf(sc->tempStr, "extern __shared__ float shared[];\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - // These wrappers help hipcc to generate faster code for load and store operations where - // 64-bit scalar + 32-bit vector registers are used instead of 64-bit vector saving a few - // instructions for computing 64-bit vector addresses. - // TODO: Check if it works correctly when buffer sizes are almost 2^32 but useUint64 is 0 - sc->tempLen = sprintf(sc->tempStr, - "template\n" - "struct Inputs\n" - "{\n" - " const T* buffer;\n" - " inline __device__ Inputs(const T* buffer) : buffer(buffer) {}\n" - " inline __device__ T operator[](unsigned long long idx) const { return buffer[idx]; }\n" - " inline __device__ T operator[](unsigned int idx) const { return *reinterpret_cast(reinterpret_cast(buffer) + idx * static_cast(sizeof(T))); }\n" - "};\n" - "template\n" - "struct Outputs\n" - "{\n" - " T* buffer;\n" - " inline __device__ Outputs(T* buffer) : buffer(buffer) {}\n" - " inline __device__ T& operator[](unsigned long long idx) { return buffer[idx]; }\n" - " inline __device__ T& operator[](unsigned int idx) { return *reinterpret_cast(reinterpret_cast(buffer) + idx * static_cast(sizeof(T))); }\n" - "};\n" - ); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - sc->tempLen = sprintf(sc->tempStr, "extern \"C\" __launch_bounds__(%" PRIu64 ") __global__ void VkFFT_main ", sc->localSize[0] * 
sc->localSize[1] * sc->localSize[2]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - switch (type) { - case 5: - { - sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory, vecTypeOutput); - break; - } - case 6: - { - sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", vecTypeInput, floatTypeOutputMemory); - break; - } - case 110: - { - sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 111: - { - sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 120: - { - sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 121: - { - sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 130: - { - sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 131: - { - sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 140: - { - sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 141: - { - sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 142: - { - sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 143: - { - sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> 
outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 144: - { - sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 145: - { - sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - default: - { - sc->tempLen = sprintf(sc->tempStr, "(const Inputs<%s> inputs, Outputs<%s> outputs", vecTypeInput, vecTypeOutput); - break; - } - } - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - if (sc->convolutionStep) { - sc->tempLen = sprintf(sc->tempStr, ", const Inputs<%s> kernel_obj", vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, ", const Inputs<%s> twiddleLUT", vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - if (sc->raderUintLUT) { - sc->tempLen = sprintf(sc->tempStr, ", %s* g_pow", uintType_32); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - if (sc->BluesteinConvolutionStep) { - sc->tempLen = sprintf(sc->tempStr, ", const Inputs<%s> BluesteinConvolutionKernel", vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - if (sc->BluesteinPreMultiplication || sc->BluesteinPostMultiplication) { - sc->tempLen = sprintf(sc->tempStr, ", const Inputs<%s> BluesteinMultiplication", vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - sc->tempLen = sprintf(sc->tempStr, ") {\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - //sc->tempLen = sprintf(sc->tempStr, ", const PushConsts consts) {\n"); - res = 
appendSharedMemoryVkFFT(sc, floatType, uintType, locType); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } -#elif((VKFFT_BACKEND==3)||(VKFFT_BACKEND==4)) - sc->tempLen = sprintf(sc->tempStr, "__kernel __attribute__((reqd_work_group_size(%" PRIu64 ", %" PRIu64 ", %" PRIu64 "))) void VkFFT_main ", sc->localSize[0], sc->localSize[1], sc->localSize[2]); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - switch (type) { - case 5: - { - sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, vecTypeOutput); - break; - } - case 6: - { - sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", vecTypeInput, floatTypeOutputMemory); - break; - } - case 110: - { - sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 111: - { - sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 120: - { - sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 121: - { - sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 130: - { - sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 131: - { - sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 140: - { - sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 141: - { - sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", 
floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 142: - { - sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 143: - { - sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 144: - { - sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - case 145: - { - sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", floatTypeInputMemory, floatTypeOutputMemory); - break; - } - default: - { - sc->tempLen = sprintf(sc->tempStr, "(__global %s* inputs, __global %s* outputs", vecTypeInput, vecTypeOutput); - break; - } - } - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - if (sc->convolutionStep) { - sc->tempLen = sprintf(sc->tempStr, ", __global %s* kernel_obj", vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - if (sc->LUT) { - sc->tempLen = sprintf(sc->tempStr, ", __global %s* twiddleLUT", vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - if (sc->raderUintLUT) { - sc->tempLen = sprintf(sc->tempStr, ", __global %s* g_pow", uintType_32); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - if (sc->BluesteinConvolutionStep) { - sc->tempLen = sprintf(sc->tempStr, ", __global %s* BluesteinConvolutionKernel", vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - if (sc->BluesteinPreMultiplication || sc->BluesteinPostMultiplication) { - sc->tempLen = sprintf(sc->tempStr, ", __global %s* BluesteinMultiplication", vecType); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) 
{ - freeShaderGenVkFFT(sc); - return res; - } - } - if (sc->pushConstantsStructSize > 0) { - sc->tempLen = sprintf(sc->tempStr, ", PushConsts consts"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - sc->tempLen = sprintf(sc->tempStr, ") {\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - //sc->tempLen = sprintf(sc->tempStr, ", const PushConsts consts) {\n"); - res = appendSharedMemoryVkFFT(sc, floatType, uintType, locType); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } -#endif - //if (type==0) sc->tempLen = sprintf(sc->tempStr, "return;\n"); - res = appendInitialization(sc, floatType, uintType, type); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - res = setReadToRegisters(sc, type); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - res = setWriteFromRegisters(sc, type); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - if ((sc->convolutionStep) && (sc->matrixConvolution > 1)) { - sc->tempLen = sprintf(sc->tempStr, " for (%s coordinate=%" PRIu64 "; coordinate > 0; coordinate--){\n\ - coordinate--;\n", uintType, sc->matrixConvolution); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - res = appendReadDataVkFFT(sc, floatType, floatTypeInputMemory, uintType, type); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - if (sc->useBluesteinFFT && sc->BluesteinPreMultiplication) { - res = appendBluesteinMultiplication(sc, floatType, uintType, locType, 0); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - //appendBarrierVkFFT(sc, 1); - res = appendReorder4StepRead(sc, floatType, uintType, locType); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - if (!sc->useRader) { - res = appendBoostThreadDataReorder(sc, floatType, 
uintType, locType, 1); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - - uint64_t stageSize = 1; - uint64_t stageSizeSum = 0; - double PI_const = 3.1415926535897932384626433832795; - double stageAngle = (sc->inverse) ? PI_const : -PI_const; - for (uint64_t i = 0; i < sc->numStages; i++) { - if ((i == sc->numStages - 1) && (sc->registerBoost > 1)) { - res = appendRadixStage(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, sc->stageRadix[i], i, locType); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - res = appendRegisterBoostShuffle(sc, floatType, stageSize, sc->stageRadix[i - 1], sc->stageRadix[i], stageAngle); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - else { - - res = appendRadixStage(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, sc->stageRadix[i], i, locType); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - if (i > 0) { - switch (sc->stageRadix[i]) { - case 2: - stageSizeSum += stageSize; - break; - case 3: - stageSizeSum += stageSize * 2; - break; - case 4: - stageSizeSum += stageSize * 2; - break; - case 5: - stageSizeSum += stageSize * 4; - break; - case 6: - stageSizeSum += stageSize * 5; - break; - case 7: - stageSizeSum += stageSize * 6; - break; - case 8: - stageSizeSum += stageSize * 3; - break; - case 9: - stageSizeSum += stageSize * 8; - break; - case 10: - stageSizeSum += stageSize * 9; - break; - case 11: - stageSizeSum += stageSize * 10; - break; - case 12: - stageSizeSum += stageSize * 11; - break; - case 13: - stageSizeSum += stageSize * 12; - break; - case 14: - stageSizeSum += stageSize * 13; - break; - case 15: - stageSizeSum += stageSize * 14; - break; - case 16: - stageSizeSum += stageSize * 4; - break; - case 32: - stageSizeSum += stageSize * 5; - break; - default: - stageSizeSum += stageSize * (sc->stageRadix[i]); - break; - } - } - if ((i == sc->numStages - 1) || (sc->registerBoost == 1)) 
{ - res = appendRadixShuffle(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, sc->stageRadix[i], sc->stageRadix[i], i, locType); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - else { - res = appendRadixShuffle(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, sc->stageRadix[i], sc->stageRadix[i + 1], i, locType); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - stageSize *= sc->stageRadix[i]; - stageAngle /= sc->stageRadix[i]; - } - } - if ((sc->convolutionStep) || (sc->useBluesteinFFT && sc->BluesteinConvolutionStep)) { - res = appendCoordinateRegisterStore(sc, locType); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - - if (sc->matrixConvolution > 1) { - sc->tempLen = sprintf(sc->tempStr, " coordinate++;}\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - if (sc->numKernels > 1) { - res = appendPreparationBatchedKernelConvolution(sc, floatType, floatTypeKernelMemory, uintType, locType); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - if (sc->useBluesteinFFT && sc->BluesteinConvolutionStep) - { - res = appendBluesteinConvolution(sc, floatType, uintType, locType); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - else { - res = appendKernelConvolution(sc, floatType, floatTypeKernelMemory, uintType, locType); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - appendBarrierVkFFT(sc, 1); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - if (sc->matrixConvolution > 1) { - sc->tempLen = sprintf(sc->tempStr, " for (%s coordinate=0; coordinate < %" PRIu64 "; coordinate++){\n", uintType, sc->matrixConvolution); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - res = appendCoordinateRegisterPull(sc, locType); - if (res != 
VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - - stageSize = 1; - stageSizeSum = 0; - stageAngle = PI_const; - sc->inverse = 1; - for (uint64_t i = 0; i < sc->numStages; i++) { - res = appendRadixStage(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, sc->stageRadix[i], i, locType); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - if (i > 0) { - switch (sc->stageRadix[i]) { - case 2: - stageSizeSum += stageSize; - break; - case 3: - stageSizeSum += stageSize * 2; - break; - case 4: - stageSizeSum += stageSize * 2; - break; - case 5: - stageSizeSum += stageSize * 4; - break; - case 6: - stageSizeSum += stageSize * 5; - break; - case 7: - stageSizeSum += stageSize * 6; - break; - case 8: - stageSizeSum += stageSize * 3; - break; - case 9: - stageSizeSum += stageSize * 8; - break; - case 10: - stageSizeSum += stageSize * 9; - break; - case 11: - stageSizeSum += stageSize * 10; - break; - case 12: - stageSizeSum += stageSize * 11; - break; - case 13: - stageSizeSum += stageSize * 12; - break; - case 14: - stageSizeSum += stageSize * 13; - break; - case 15: - stageSizeSum += stageSize * 14; - break; - case 16: - stageSizeSum += stageSize * 4; - break; - case 32: - stageSizeSum += stageSize * 5; - break; - default: - stageSizeSum += stageSize * (sc->stageRadix[i]); - break; - } - } - if (i == sc->numStages - 1) { - res = appendRadixShuffle(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, sc->stageRadix[i], sc->stageRadix[i], i, locType); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - else { - res = appendRadixShuffle(sc, floatType, uintType, stageSize, stageSizeSum, stageAngle, sc->stageRadix[i], sc->stageRadix[i + 1], i, locType); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - stageSize *= sc->stageRadix[i]; - stageAngle /= sc->stageRadix[i]; - } - } - if (!sc->useRader) { - //if (((sc->stageRadix[sc->numStages - 1] < 
sc->fixMinRaderPrimeMult) || (sc->rader_generator[sc->numStages - 1] == 0))) { - res = appendBoostThreadDataReorder(sc, floatType, uintType, locType, 0); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - res = appendReorder4StepWrite(sc, floatType, uintType, locType); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - if (sc->useBluesteinFFT && sc->BluesteinPostMultiplication) { - res = appendBluesteinMultiplication(sc, floatType, uintType, locType, 1); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - res = appendWriteDataVkFFT(sc, floatType, floatTypeOutputMemory, uintType, type); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - if ((sc->convolutionStep) && (sc->matrixConvolution > 1)) - { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - if ((sc->convolutionStep) && (sc->numKernels > 1)) - { - sc->tempLen = sprintf(sc->tempStr, " }\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - } - sc->tempLen = sprintf(sc->tempStr, "}\n"); - res = VkAppendLine(sc); - if (res != VKFFT_SUCCESS) { - freeShaderGenVkFFT(sc); - return res; - } - freeShaderGenVkFFT(sc); - //if (sc->useBluesteinFFT) - //printf("%s", output); - return res; -}
#if(VKFFT_BACKEND==0)
// Picks the first memory type of physicalDevice[0] that
//   - is permitted by memoryTypeBits (bit i set <=> type i allowed),
//   - carries every flag in `properties`, and
//   - belongs to a heap whose reported size is at least memorySize.
// On success the index is stored in memoryTypeIndex[0] and VKFFT_SUCCESS is
// returned; otherwise VKFFT_ERROR_FAILED_TO_FIND_MEMORY is returned and
// memoryTypeIndex is left untouched.
static inline VkFFTResult findMemoryType(VkFFTApplication* app, uint64_t memoryTypeBits, uint64_t memorySize, VkMemoryPropertyFlags properties, uint32_t* memoryTypeIndex) {
	VkPhysicalDeviceMemoryProperties memoryProperties = { 0 };

	vkGetPhysicalDeviceMemoryProperties(app->configuration.physicalDevice[0], &memoryProperties);

	for (uint64_t i = 0; i < memoryProperties.memoryTypeCount; ++i) {
		if (!(memoryTypeBits & ((uint64_t)1 << i))) continue;
		if ((memoryProperties.memoryTypes[i].propertyFlags & properties) != properties) continue;
		if (memoryProperties.memoryHeaps[memoryProperties.memoryTypes[i].heapIndex].size < memorySize) continue;
		memoryTypeIndex[0] = (uint32_t)i;
		return VKFFT_SUCCESS;
	}
	return VKFFT_ERROR_FAILED_TO_FIND_MEMORY;
}

// Creates an exclusive-mode VkBuffer of `size` bytes with the given usage,
// allocates device memory with `propertyFlags` for it and binds the two.
//
// Returns VKFFT_SUCCESS, or the VKFFT_ERROR_* code of the step that failed.
// If a step after vkCreateBuffer fails, everything created so far is released
// and buffer[0] / deviceMemory[0] are reset to VK_NULL_HANDLE, so a caller
// that destroys non-null handles during its own cleanup cannot double-free.
static inline VkFFTResult allocateFFTBuffer(VkFFTApplication* app, VkBuffer* buffer, VkDeviceMemory* deviceMemory, VkBufferUsageFlags usageFlags, VkMemoryPropertyFlags propertyFlags, VkDeviceSize size) {
	VkFFTResult resFFT = VKFFT_SUCCESS;
	VkResult res = VK_SUCCESS;
	// Initialised so the create-info never points at indeterminate data.
	uint32_t queueFamilyIndices = 0;
	VkBufferCreateInfo bufferCreateInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
	VkMemoryRequirements memoryRequirements = { 0 };
	VkMemoryAllocateInfo memoryAllocateInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO };
	int memoryAllocated = 0;

	bufferCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
	bufferCreateInfo.queueFamilyIndexCount = 1;
	bufferCreateInfo.pQueueFamilyIndices = &queueFamilyIndices;
	bufferCreateInfo.size = size;
	bufferCreateInfo.usage = usageFlags;
	res = vkCreateBuffer(app->configuration.device[0], &bufferCreateInfo, 0, buffer);
	if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_BUFFER;

	vkGetBufferMemoryRequirements(app->configuration.device[0], buffer[0], &memoryRequirements);
	memoryAllocateInfo.allocationSize = memoryRequirements.size;
	resFFT = findMemoryType(app, memoryRequirements.memoryTypeBits, memoryRequirements.size, propertyFlags, &memoryAllocateInfo.memoryTypeIndex);
	if (resFFT == VKFFT_SUCCESS) {
		res = vkAllocateMemory(app->configuration.device[0], &memoryAllocateInfo, 0, deviceMemory);
		if (res != VK_SUCCESS) resFFT = VKFFT_ERROR_FAILED_TO_ALLOCATE_MEMORY;
		else memoryAllocated = 1;
	}
	if (resFFT == VKFFT_SUCCESS) {
		res = vkBindBufferMemory(app->configuration.device[0], buffer[0], deviceMemory[0], 0);
		if (res != VK_SUCCESS) resFFT = VKFFT_ERROR_FAILED_TO_BIND_BUFFER_MEMORY;
	}
	if (resFFT != VKFFT_SUCCESS) {
		// Roll back so a failed call leaves nothing behind.
		if (memoryAllocated) {
			vkFreeMemory(app->configuration.device[0], deviceMemory[0], 0);
			deviceMemory[0] = VK_NULL_HANDLE;
		}
		vkDestroyBuffer(app->configuration.device[0], buffer[0], 0);
		buffer[0] = VK_NULL_HANDLE;
	}
	return resFFT;
}

// Records a one-shot command buffer that copies `size` bytes from `src` to
// `dst`, submits it to queue[0] with fence[0], waits for the fence and resets
// it. Shared by transferDataFromCPU / transferDataToCPU.
//
// copyMayBePending[0] is set to 1 only when waiting for the fence failed, i.e.
// the submitted copy may still be executing. In that case the command buffer
// is deliberately not freed here and the caller must not destroy `src`/`dst`.
// In every other case the command buffer is freed before returning.
static inline VkFFTResult copyBufferAndWaitVkFFT(VkFFTApplication* app, VkBuffer src, VkBuffer dst, VkDeviceSize size, int* copyMayBePending) {
	VkResult res = VK_SUCCESS;
	VkFFTResult resFFT = VKFFT_SUCCESS;
	VkCommandBuffer commandBuffer = { 0 };
	VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO };
	VkCommandBufferBeginInfo commandBufferBeginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO };
	VkBufferCopy copyRegion = { 0 };
	VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO };

	copyMayBePending[0] = 0;

	commandBufferAllocateInfo.commandPool = app->configuration.commandPool[0];
	commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
	commandBufferAllocateInfo.commandBufferCount = 1;
	res = vkAllocateCommandBuffers(app->configuration.device[0], &commandBufferAllocateInfo, &commandBuffer);
	if (res != VK_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE_COMMAND_BUFFERS;

	commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
	res = vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo);
	if (res != VK_SUCCESS) resFFT = VKFFT_ERROR_FAILED_TO_BEGIN_COMMAND_BUFFER;

	if (resFFT == VKFFT_SUCCESS) {
		copyRegion.srcOffset = 0;
		copyRegion.dstOffset = 0;
		copyRegion.size = size;
		vkCmdCopyBuffer(commandBuffer, src, dst, 1, &copyRegion);
		res = vkEndCommandBuffer(commandBuffer);
		if (res != VK_SUCCESS) resFFT = VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER;
	}
	if (resFFT == VKFFT_SUCCESS) {
		submitInfo.commandBufferCount = 1;
		submitInfo.pCommandBuffers = &commandBuffer;
		res = vkQueueSubmit(app->configuration.queue[0], 1, &submitInfo, app->configuration.fence[0]);
		if (res != VK_SUCCESS) resFFT = VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE;
	}
	if (resFFT == VKFFT_SUCCESS) {
		res = vkWaitForFences(app->configuration.device[0], 1, app->configuration.fence, VK_TRUE, 100000000000);
		if (res != VK_SUCCESS) {
			// The copy may still be in flight: freeing the command buffer or
			// the buffers it references now would be invalid, so leave them.
			copyMayBePending[0] = 1;
			return VKFFT_ERROR_FAILED_TO_WAIT_FOR_FENCES;
		}
		res = vkResetFences(app->configuration.device[0], 1, app->configuration.fence);
		if (res != VK_SUCCESS) resFFT = VKFFT_ERROR_FAILED_TO_RESET_FENCES;
	}
	vkFreeCommandBuffers(app->configuration.device[0], app->configuration.commandPool[0], 1, &commandBuffer);
	return resFFT;
}

// Uploads bufferSize bytes from host memory `arr` into the device buffer
// buffer[0] through a temporary host-visible, host-coherent staging buffer.
// Blocks until the copy has finished. Returns VKFFT_SUCCESS or the
// VKFFT_ERROR_* code of the step that failed; the staging resources are
// released on every path except a failed fence wait (see
// copyBufferAndWaitVkFFT).
static inline VkFFTResult transferDataFromCPU(VkFFTApplication* app, void* arr, VkBuffer* buffer, VkDeviceSize bufferSize) {
	VkResult res = VK_SUCCESS;
	VkFFTResult resFFT = VKFFT_SUCCESS;
	VkDeviceSize stagingBufferSize = bufferSize;
	VkBuffer stagingBuffer = { 0 };
	VkDeviceMemory stagingBufferMemory = { 0 };
	void* data = 0;
	int copyMayBePending = 0;

	resFFT = allocateFFTBuffer(app, &stagingBuffer, &stagingBufferMemory, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, stagingBufferSize);
	if (resFFT != VKFFT_SUCCESS) return resFFT;

	res = vkMapMemory(app->configuration.device[0], stagingBufferMemory, 0, stagingBufferSize, 0, &data);
	if (res != VK_SUCCESS) {
		resFFT = VKFFT_ERROR_FAILED_TO_MAP_MEMORY;
	}
	else {
		memcpy(data, arr, stagingBufferSize);
		vkUnmapMemory(app->configuration.device[0], stagingBufferMemory);
		resFFT = copyBufferAndWaitVkFFT(app, stagingBuffer, buffer[0], stagingBufferSize, &copyMayBePending);
	}

	if (!copyMayBePending) {
		vkDestroyBuffer(app->configuration.device[0], stagingBuffer, 0);
		vkFreeMemory(app->configuration.device[0], stagingBufferMemory, 0);
	}
	return resFFT;
}

// Downloads bufferSize bytes from the device buffer buffer[0] into host memory
// `arr` through a temporary host-visible, host-coherent staging buffer.
// Blocks until the copy has finished. Returns VKFFT_SUCCESS or the
// VKFFT_ERROR_* code of the step that failed (a failed vkMapMemory is reported
// as VKFFT_ERROR_FAILED_TO_MAP_MEMORY and `arr` is then left untouched).
static inline VkFFTResult transferDataToCPU(VkFFTApplication* app, void* arr, VkBuffer* buffer, VkDeviceSize bufferSize) {
	VkResult res = VK_SUCCESS;
	VkFFTResult resFFT = VKFFT_SUCCESS;
	uint64_t stagingBufferSize = bufferSize;
	VkBuffer stagingBuffer = { 0 };
	VkDeviceMemory stagingBufferMemory = { 0 };
	void* data = 0;
	int copyMayBePending = 0;

	resFFT = allocateFFTBuffer(app, &stagingBuffer, &stagingBufferMemory, VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, stagingBufferSize);
	if (resFFT != VKFFT_SUCCESS) return resFFT;

	resFFT = copyBufferAndWaitVkFFT(app, buffer[0], stagingBuffer, stagingBufferSize, &copyMayBePending);
	if (resFFT == VKFFT_SUCCESS) {
		res = vkMapMemory(app->configuration.device[0], stagingBufferMemory, 0, stagingBufferSize, 0, &data);
		if (res != VK_SUCCESS) {
			// Must test the VkResult of the map itself; `data` is not valid here.
			resFFT = VKFFT_ERROR_FAILED_TO_MAP_MEMORY;
		}
		else {
			memcpy(arr, data, stagingBufferSize);
			vkUnmapMemory(app->configuration.device[0], stagingBufferMemory);
		}
	}

	if (!copyMayBePending) {
		vkDestroyBuffer(app->configuration.device[0], stagingBuffer, 0);
		vkFreeMemory(app->configuration.device[0], stagingBufferMemory, 0);
	}
	return resFFT;
}
#endif

// Releases the backend objects owned by one axis and zeroes the corresponding
// fields, so calling it again on the same axis is harmless.
//  - The LUT buffer is released only when useLUT is set and the axis does not
//    merely reference another axis' LUT (referenceLUT).
//  - For the non-Vulkan backends a field is zeroed only if the release call
//    reported success.
//  - axis->binary is freed only when saveApplicationToString is set.
static inline void deleteAxis(VkFFTApplication* app, VkFFTAxis* axis) {
#if(VKFFT_BACKEND==0)
	if ((app->configuration.useLUT) && (!axis->referenceLUT)) {
		if (axis->bufferLUT != 0) {
			vkDestroyBuffer(app->configuration.device[0], axis->bufferLUT, 0);
			axis->bufferLUT = 0;
		}
		if (axis->bufferLUTDeviceMemory != 0) {
			vkFreeMemory(app->configuration.device[0], axis->bufferLUTDeviceMemory, 0);
			axis->bufferLUTDeviceMemory = 0;
		}
	}
	if (axis->descriptorPool != 0) {
		vkDestroyDescriptorPool(app->configuration.device[0], axis->descriptorPool, 0);
		axis->descriptorPool = 0;
	}
	if (axis->descriptorSetLayout != 0) {
		vkDestroyDescriptorSetLayout(app->configuration.device[0], axis->descriptorSetLayout, 0);
		axis->descriptorSetLayout = 0;
	}
	if (axis->pipelineLayout != 0) {
		vkDestroyPipelineLayout(app->configuration.device[0], axis->pipelineLayout, 0);
		axis->pipelineLayout = 0;
	}
	if (axis->pipeline != 0) {
		vkDestroyPipeline(app->configuration.device[0], axis->pipeline, 0);
		axis->pipeline = 0;
	}
#elif(VKFFT_BACKEND==1)
	CUresult res = CUDA_SUCCESS;
	cudaError_t res_t = cudaSuccess;
	if ((app->configuration.useLUT) && (!axis->referenceLUT) && (axis->bufferLUT != 0)) {
		res_t = cudaFree(axis->bufferLUT);
		if (res_t == cudaSuccess) axis->bufferLUT = 0;
	}
	if (axis->VkFFTModule != 0) {
		res = cuModuleUnload(axis->VkFFTModule);
		if (res == CUDA_SUCCESS) axis->VkFFTModule = 0;
	}
#elif(VKFFT_BACKEND==2)
	hipError_t res = hipSuccess;
	if ((app->configuration.useLUT) && (!axis->referenceLUT) && (axis->bufferLUT != 0)) {
		res = hipFree(axis->bufferLUT);
		if (res == hipSuccess) axis->bufferLUT = 0;
	}
	if (axis->VkFFTModule != 0) {
		res = hipModuleUnload(axis->VkFFTModule);
		if (res == hipSuccess) axis->VkFFTModule = 0;
	}
#elif(VKFFT_BACKEND==3)
	cl_int res = 0;
	if ((app->configuration.useLUT) && (!axis->referenceLUT) && (axis->bufferLUT != 0)) {
		res = clReleaseMemObject(axis->bufferLUT);
		if (res == 0) axis->bufferLUT = 0;
	}
	if (axis->program != 0) {
		res = clReleaseProgram(axis->program);
		if (res == 0) axis->program = 0;
	}
	if (axis->kernel != 0) {
		res = clReleaseKernel(axis->kernel);
		if (res == 0) axis->kernel = 0;
	}
#elif(VKFFT_BACKEND==4)
	ze_result_t res = ZE_RESULT_SUCCESS;
	if ((app->configuration.useLUT) && (!axis->referenceLUT) && (axis->bufferLUT != 0)) {
		res = zeMemFree(app->configuration.context[0], axis->bufferLUT);
		if (res == ZE_RESULT_SUCCESS) axis->bufferLUT = 0;
	}
	// Kernel first: Level Zero requires kernels created from a module to be
	// destroyed before the module itself.
	if (axis->VkFFTKernel != 0) {
		res = zeKernelDestroy(axis->VkFFTKernel);
		if (res == ZE_RESULT_SUCCESS) axis->VkFFTKernel = 0;
	}
	if (axis->VkFFTModule != 0) {
		res = zeModuleDestroy(axis->VkFFTModule);
		if (res == ZE_RESULT_SUCCESS) axis->VkFFTModule = 0;
	}
#endif
	if (app->configuration.saveApplicationToString) {
		if (axis->binary != 0) {
			free(axis->binary);
			axis->binary = 0;
		}
	}
}

// Tears down everything the application object owns: compiler state / stream
// events, Rader kernels, the internally allocated temp buffer, Rader and
// Bluestein LUT buffers, both FFT plans (via deleteAxis), saved application
// strings and the auto-generated Bluestein padding tables.
//
// Every pointer array is checked for NULL before it is indexed, so the
// function does not dereference arrays that were never allocated. Released
// fields are zeroed, which makes a second call a no-op for them.
static inline void deleteVkFFT(VkFFTApplication* app) {
#if(VKFFT_BACKEND==0)
	if (app->configuration.isCompilerInitialized) {
		glslang_finalize_process();
		app->configuration.isCompilerInitialized = 0;
	}
#elif(VKFFT_BACKEND==1)
	// stream_event is tested before the loop: it is indexed below.
	if ((app->configuration.num_streams > 1) && (app->configuration.stream_event != 0)) {
		cudaError_t res_t = cudaSuccess;
		for (uint64_t i = 0; i < app->configuration.num_streams; i++) {
			if (app->configuration.stream_event[i] != 0) {
				res_t = cudaEventDestroy(app->configuration.stream_event[i]);
				if (res_t == cudaSuccess) app->configuration.stream_event[i] = 0;
			}
		}
		free(app->configuration.stream_event);
		app->configuration.stream_event = 0;
	}
#elif(VKFFT_BACKEND==2)
	// stream_event is tested before the loop: it is indexed below.
	if ((app->configuration.num_streams > 1) && (app->configuration.stream_event != 0)) {
		hipError_t res_t = hipSuccess;
		for (uint64_t i = 0; i < app->configuration.num_streams; i++) {
			if (app->configuration.stream_event[i] != 0) {
				res_t = hipEventDestroy(app->configuration.stream_event[i]);
				if (res_t == hipSuccess) app->configuration.stream_event[i] = 0;
			}
		}
		free(app->configuration.stream_event);
		app->configuration.stream_event = 0;
	}
#endif
	if (app->numRaderFFTPrimes) {
		for (uint64_t i = 0; i < app->numRaderFFTPrimes; i++) {
			free(app->raderFFTkernel[i]);
			app->raderFFTkernel[i] = 0;
		}
	}

	// Temp buffer: only released when VkFFT allocated it itself.
	if (!app->configuration.userTempBuffer) {
		if (app->configuration.allocateTempBuffer) {
			app->configuration.allocateTempBuffer = 0;
			// tempBuffer is a heap array holding the handle; guard it before
			// touching element 0.
			if (app->configuration.tempBuffer != 0) {
#if(VKFFT_BACKEND==0)
				if (app->configuration.tempBuffer[0] != 0) {
					vkDestroyBuffer(app->configuration.device[0], app->configuration.tempBuffer[0], 0);
					app->configuration.tempBuffer[0] = 0;
				}
#elif(VKFFT_BACKEND==1)
				cudaError_t res_t = cudaSuccess;
				if (app->configuration.tempBuffer[0] != 0) {
					res_t = cudaFree(app->configuration.tempBuffer[0]);
					if (res_t == cudaSuccess) app->configuration.tempBuffer[0] = 0;
				}
#elif(VKFFT_BACKEND==2)
				hipError_t res_t = hipSuccess;
				if (app->configuration.tempBuffer[0] != 0) {
					res_t = hipFree(app->configuration.tempBuffer[0]);
					if (res_t == hipSuccess) app->configuration.tempBuffer[0] = 0;
				}
#elif(VKFFT_BACKEND==3)
				cl_int res = 0;
				if (app->configuration.tempBuffer[0] != 0) {
					res = clReleaseMemObject(app->configuration.tempBuffer[0]);
					if (res == 0) app->configuration.tempBuffer[0] = 0;
				}
#elif(VKFFT_BACKEND==4)
				ze_result_t res = ZE_RESULT_SUCCESS;
				if (app->configuration.tempBuffer[0] != 0) {
					res = zeMemFree(app->configuration.context[0], app->configuration.tempBuffer[0]);
					if (res == ZE_RESULT_SUCCESS) app->configuration.tempBuffer[0] = 0;
				}
#endif
			}
#if(VKFFT_BACKEND==0)
			if (app->configuration.tempBufferDeviceMemory != 0) {
				vkFreeMemory(app->configuration.device[0], app->configuration.tempBufferDeviceMemory, 0);
				app->configuration.tempBufferDeviceMemory = 0;
			}
#endif
			if (app->configuration.tempBuffer != 0) {
				free(app->configuration.tempBuffer);
				app->configuration.tempBuffer = 0;
			}
		}
		if (app->configuration.tempBufferSize != 0) {
			free(app->configuration.tempBufferSize);
			app->configuration.tempBufferSize = 0;
		}
	}

	// Per-dimension Rader uint LUTs and Bluestein buffers.
	for (uint64_t i = 0; i < app->configuration.FFTdim; i++) {
		if (app->configuration.useRaderUintLUT) {
			for (uint64_t j = 0; j < 4; j++) {
				if (app->bufferRaderUintLUT[i][j]) {
#if(VKFFT_BACKEND==0)
					vkDestroyBuffer(app->configuration.device[0], app->bufferRaderUintLUT[i][j], 0);
					app->bufferRaderUintLUT[i][j] = 0;
					vkFreeMemory(app->configuration.device[0], app->bufferRaderUintLUTDeviceMemory[i][j], 0);
					app->bufferRaderUintLUTDeviceMemory[i][j] = 0;
#elif(VKFFT_BACKEND==1)
					cudaError_t res_t = cudaSuccess;
					res_t = cudaFree(app->bufferRaderUintLUT[i][j]);
					if (res_t == cudaSuccess) app->bufferRaderUintLUT[i][j] = 0;
#elif(VKFFT_BACKEND==2)
					hipError_t res_t = hipSuccess;
					res_t = hipFree(app->bufferRaderUintLUT[i][j]);
					if (res_t == hipSuccess) app->bufferRaderUintLUT[i][j] = 0;
#elif(VKFFT_BACKEND==3)
					cl_int res = 0;
					res = clReleaseMemObject(app->bufferRaderUintLUT[i][j]);
					if (res == 0) app->bufferRaderUintLUT[i][j] = 0;
#elif(VKFFT_BACKEND==4)
					ze_result_t res = ZE_RESULT_SUCCESS;
					res = zeMemFree(app->configuration.context[0], app->bufferRaderUintLUT[i][j]);
					if (res == ZE_RESULT_SUCCESS) app->bufferRaderUintLUT[i][j] = 0;
#endif
				}
			}
		}
		if (app->useBluesteinFFT[i]) {
#if(VKFFT_BACKEND==0)
			if (app->bufferBluestein[i] != 0) {
				vkDestroyBuffer(app->configuration.device[0], app->bufferBluestein[i], 0);
				app->bufferBluestein[i] = 0;
			}
			if (app->bufferBluesteinDeviceMemory[i] != 0) {
				vkFreeMemory(app->configuration.device[0], app->bufferBluesteinDeviceMemory[i], 0);
				app->bufferBluesteinDeviceMemory[i] = 0;
			}
			if (app->bufferBluesteinFFT[i] != 0) {
				vkDestroyBuffer(app->configuration.device[0], app->bufferBluesteinFFT[i], 0);
				app->bufferBluesteinFFT[i] = 0;
			}
			if (app->bufferBluesteinFFTDeviceMemory[i] != 0) {
				vkFreeMemory(app->configuration.device[0], app->bufferBluesteinFFTDeviceMemory[i], 0);
				app->bufferBluesteinFFTDeviceMemory[i] = 0;
			}
			if (app->bufferBluesteinIFFT[i] != 0) {
				vkDestroyBuffer(app->configuration.device[0], app->bufferBluesteinIFFT[i], 0);
				app->bufferBluesteinIFFT[i] = 0;
			}
			if (app->bufferBluesteinIFFTDeviceMemory[i] != 0) {
				vkFreeMemory(app->configuration.device[0], app->bufferBluesteinIFFTDeviceMemory[i], 0);
				app->bufferBluesteinIFFTDeviceMemory[i] = 0;
			}
#elif(VKFFT_BACKEND==1)
			cudaError_t res_t = cudaSuccess;
			if (app->bufferBluestein[i] != 0) {
				res_t = cudaFree(app->bufferBluestein[i]);
				if (res_t == cudaSuccess) app->bufferBluestein[i] = 0;
			}
			if (app->bufferBluesteinFFT[i] != 0) {
				res_t = cudaFree(app->bufferBluesteinFFT[i]);
				if (res_t == cudaSuccess) app->bufferBluesteinFFT[i] = 0;
			}
			if (app->bufferBluesteinIFFT[i] != 0) {
				res_t = cudaFree(app->bufferBluesteinIFFT[i]);
				if (res_t == cudaSuccess) app->bufferBluesteinIFFT[i] = 0;
			}
#elif(VKFFT_BACKEND==2)
			hipError_t res_t = hipSuccess;
			if (app->bufferBluestein[i] != 0) {
				res_t = hipFree(app->bufferBluestein[i]);
				if (res_t == hipSuccess) app->bufferBluestein[i] = 0;
			}
			if (app->bufferBluesteinFFT[i] != 0) {
				res_t = hipFree(app->bufferBluesteinFFT[i]);
				if (res_t == hipSuccess) app->bufferBluesteinFFT[i] = 0;
			}
			if (app->bufferBluesteinIFFT[i] != 0) {
				res_t = hipFree(app->bufferBluesteinIFFT[i]);
				if (res_t == hipSuccess) app->bufferBluesteinIFFT[i] = 0;
			}
#elif(VKFFT_BACKEND==3)
			cl_int res = 0;
			if (app->bufferBluestein[i] != 0) {
				res = clReleaseMemObject(app->bufferBluestein[i]);
				if (res == 0) app->bufferBluestein[i] = 0;
			}
			if (app->bufferBluesteinFFT[i] != 0) {
				res = clReleaseMemObject(app->bufferBluesteinFFT[i]);
				if (res == 0) app->bufferBluesteinFFT[i] = 0;
			}
			if (app->bufferBluesteinIFFT[i] != 0) {
				res = clReleaseMemObject(app->bufferBluesteinIFFT[i]);
				if (res == 0) app->bufferBluesteinIFFT[i] = 0;
			}
#elif(VKFFT_BACKEND==4)
			ze_result_t res = ZE_RESULT_SUCCESS;
			if (app->bufferBluestein[i] != 0) {
				res = zeMemFree(app->configuration.context[0], app->bufferBluestein[i]);
				if (res == ZE_RESULT_SUCCESS) app->bufferBluestein[i] = 0;
			}
			if (app->bufferBluesteinFFT[i] != 0) {
				res = zeMemFree(app->configuration.context[0], app->bufferBluesteinFFT[i]);
				if (res == ZE_RESULT_SUCCESS) app->bufferBluesteinFFT[i] = 0;
			}
			if (app->bufferBluesteinIFFT[i] != 0) {
				res = zeMemFree(app->configuration.context[0], app->bufferBluesteinIFFT[i]);
				if (res == ZE_RESULT_SUCCESS) app->bufferBluesteinIFFT[i] = 0;
			}
#endif
		}
	}

	// Forward plan (skipped when only the inverse plan was built).
	if ((!app->configuration.makeInversePlanOnly) && (app->localFFTPlan != 0)) {
		for (uint64_t i = 0; i < app->configuration.FFTdim; i++) {
			for (uint64_t j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++)
				deleteAxis(app, &app->localFFTPlan->axes[i][j]);
		}
		if (app->localFFTPlan->multiUploadR2C) {
			deleteAxis(app, &app->localFFTPlan->R2Cdecomposition);
		}
		free(app->localFFTPlan);
		app->localFFTPlan = 0;
	}
	// Inverse plan (skipped when only the forward plan was built).
	if ((!app->configuration.makeForwardPlanOnly) && (app->localFFTPlan_inverse != 0)) {
		for (uint64_t i = 0; i < app->configuration.FFTdim; i++) {
			for (uint64_t j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++)
				deleteAxis(app, &app->localFFTPlan_inverse->axes[i][j]);
		}
		if (app->localFFTPlan_inverse->multiUploadR2C) {
			deleteAxis(app, &app->localFFTPlan_inverse->R2Cdecomposition);
		}
		free(app->localFFTPlan_inverse);
		app->localFFTPlan_inverse = 0;
	}

	if (app->configuration.saveApplicationToString) {
		if (app->saveApplicationString != 0) {
			free(app->saveApplicationString);
			app->saveApplicationString = 0;
		}
		for (uint64_t i = 0; i < app->configuration.FFTdim; i++) {
			if (app->applicationBluesteinString[i] != 0) {
				free(app->applicationBluesteinString[i]);
				app->applicationBluesteinString[i] = 0;
			}
		}
	}
	if (app->configuration.autoCustomBluesteinPaddingPattern) {
		if (app->configuration.primeSizes != 0) {
			free(app->configuration.primeSizes);
			app->configuration.primeSizes = 0;
		}
		if (app->configuration.paddedSizes != 0) {
			free(app->configuration.paddedSizes);
			app->configuration.paddedSizes = 0;
		}
	}
}
-static inline VkFFTResult VkFFTGetRegistersPerThread(uint64_t fft_length, uint64_t extraSharedMemoryForPow2, uint64_t max_rhs, uint64_t useRader, uint64_t* loc_multipliers, uint64_t* registers_per_thread_per_radix, uint64_t* registers_per_thread, uint64_t* min_registers_per_thread, uint64_t* isGoodSequence) { - for (uint64_t i = 0; i < 33; i++) { - registers_per_thread_per_radix[i] = 0; - } - registers_per_thread[0] = 0; - min_registers_per_thread[0] = -1; - - if (loc_multipliers[2] > 0) { - if (loc_multipliers[3] > 0) { - if (loc_multipliers[5] > 0) { - if (loc_multipliers[7] > 0) { - if (loc_multipliers[11] > 0) { - if (loc_multipliers[13] > 0) { - switch (loc_multipliers[2]) { - case 1: - registers_per_thread_per_radix[2] = 14; - registers_per_thread_per_radix[3] = 15; - break; - case 2: - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 12; - break; - case 3: - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 12; - break; - default: - registers_per_thread_per_radix[2] = 16; - registers_per_thread_per_radix[3] = 12; - break; - } - registers_per_thread_per_radix[5] = 15; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 13; - } - else { - switch (loc_multipliers[2]) { - case 1: - registers_per_thread_per_radix[2] = 14; - registers_per_thread_per_radix[3] = 15; - break; - case 2: - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 12; - break; - case 3: - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 12; - break; - default: - registers_per_thread_per_radix[2] = 16; - registers_per_thread_per_radix[3] = 12; - break; - } - registers_per_thread_per_radix[5] = 15; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - }
- } - else { - if (loc_multipliers[13] > 0) { - switch (loc_multipliers[2]) { - case 1: - registers_per_thread_per_radix[2] = 14; - registers_per_thread_per_radix[3] = 15; - break; - case 2: - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 12; - break; - case 3: - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 12; - break; - default: - registers_per_thread_per_radix[2] = 16; - registers_per_thread_per_radix[3] = 12; - break; - } - registers_per_thread_per_radix[5] = 15; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - - } - else { - switch (loc_multipliers[2]) { - case 1: - registers_per_thread_per_radix[2] = 14; - registers_per_thread_per_radix[3] = 15; - - break; - case 2: - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 12; - break; - case 3: - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 12; - break; - default: - registers_per_thread_per_radix[2] = 16; - registers_per_thread_per_radix[3] = 12; - break; - } - registers_per_thread_per_radix[5] = 15; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 0; - } - } - } - else { - if (loc_multipliers[11] > 0) { - if (loc_multipliers[13] > 0) { - switch (loc_multipliers[2]) { - case 1: - registers_per_thread_per_radix[2] = 10; - registers_per_thread_per_radix[3] = 15; - break; - case 2: - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 12; - break; - default: - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 12; - break; - } - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 13; - } - else { - switch (loc_multipliers[2]) { - case 1: - registers_per_thread_per_radix[2] = 10; - 
registers_per_thread_per_radix[3] = 15; - break; - case 2: - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 12; - break; - default: - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 12; - break; - } - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - } - } - else { - if (loc_multipliers[13] > 0) { - switch (loc_multipliers[2]) { - case 1: - registers_per_thread_per_radix[2] = 10; - registers_per_thread_per_radix[3] = 15; - break; - case 2: - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 12; - break; - default: - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 12; - break; - } - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - } - else { - switch (loc_multipliers[2]) { - case 1: - registers_per_thread_per_radix[2] = 6; - registers_per_thread_per_radix[3] = 6; - registers_per_thread_per_radix[5] = 5; - break; - case 2: - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 12; - registers_per_thread_per_radix[5] = 10; - break; - default: - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 12; - registers_per_thread_per_radix[5] = 10; - break; - } - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 0; - - } - } - } - } - else - { - if (loc_multipliers[7] > 0) { - if (loc_multipliers[11] > 0) { - if (loc_multipliers[13] > 0) { - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 12; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 13; - } - else { - 
registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 12; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - } - } - else { - if (loc_multipliers[13] > 0) { - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 12; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - } - else { - switch (loc_multipliers[2]) { - case 1: - registers_per_thread_per_radix[2] = 6; - registers_per_thread_per_radix[3] = 6; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 7; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 0; - break; - case 2: - registers_per_thread_per_radix[2] = 6; - registers_per_thread_per_radix[3] = 6; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 7; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 0; - break; - default: - registers_per_thread_per_radix[2] = 8; - registers_per_thread_per_radix[3] = 6; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 7; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 0; - break; - } - } - } - } - else { - if (loc_multipliers[11] > 0) { - if (loc_multipliers[13] > 0) { - switch (loc_multipliers[2]) { - case 1: - registers_per_thread_per_radix[2] = 6; - registers_per_thread_per_radix[3] = 6; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 13; - break; - case 2: - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 12; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - 
registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 13; - break; - default: - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 12; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 13; - break; - } - } - else { - switch (loc_multipliers[2]) { - case 1: - registers_per_thread_per_radix[2] = 6; - registers_per_thread_per_radix[3] = 6; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - break; - case 2: - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 12; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - break; - default: - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 12; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - break; - } - } - } - else { - if (loc_multipliers[13] > 0) { - switch (loc_multipliers[2]) { - case 1: - registers_per_thread_per_radix[2] = 6; - registers_per_thread_per_radix[3] = 6; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - break; - case 2: - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 12; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - break; - default: - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 12; - registers_per_thread_per_radix[5] = 0; - 
registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - break; - } - } - else { - if (loc_multipliers[2] == loc_multipliers[3]) { - registers_per_thread_per_radix[2] = 6; - registers_per_thread_per_radix[3] = 6; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 0; - } - else { - switch (loc_multipliers[2]) { - case 1: - registers_per_thread_per_radix[2] = 6; - registers_per_thread_per_radix[3] = 6; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 0; - break; - case 2: - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 12; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 0; - break; - default: - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 12; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 0; - break; - } - } - } - } - } - } - } - else { - if (loc_multipliers[5] > 0) { - if (loc_multipliers[7] > 0) { - if (loc_multipliers[11] > 0) { - if (loc_multipliers[13] > 0) { - switch (loc_multipliers[2]) { - case 1: - registers_per_thread_per_radix[2] = 10; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 13; - break; - case 2: - registers_per_thread_per_radix[2] = 10; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 11; - 
registers_per_thread_per_radix[13] = 13; - break; - case 3: - registers_per_thread_per_radix[2] = 8; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 13; - break; - default: - registers_per_thread_per_radix[2] = 16; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 13; - break; - } - } - else { - switch (loc_multipliers[2]) { - case 1: - registers_per_thread_per_radix[2] = 10; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - break; - case 2: - registers_per_thread_per_radix[2] = 10; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - break; - case 3: - registers_per_thread_per_radix[2] = 8; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - break; - default: - registers_per_thread_per_radix[2] = 16; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - break; - } - } - } - else { - if (loc_multipliers[13] > 0) { - switch (loc_multipliers[2]) { - case 1: - registers_per_thread_per_radix[2] = 10; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 14; - 
registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - break; - case 2: - registers_per_thread_per_radix[2] = 10; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - break; - case 3: - registers_per_thread_per_radix[2] = 8; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - break; - default: - registers_per_thread_per_radix[2] = 16; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - break; - } - } - else { - switch (loc_multipliers[2]) { - case 1: - registers_per_thread_per_radix[2] = 10; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 7; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 0; - break; - case 2: - registers_per_thread_per_radix[2] = 10; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 7; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 0; - break; - default: - registers_per_thread_per_radix[2] = 8; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 7; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 0; - break; - } - } - } - } - else { - if (loc_multipliers[11] > 0) { - if (loc_multipliers[13] > 0) { - switch (loc_multipliers[2]) { - case 1: - registers_per_thread_per_radix[2] = 10; - registers_per_thread_per_radix[3] = 0; - 
registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 13; - break; - case 2: - registers_per_thread_per_radix[2] = 10; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 13; - break; - default: - registers_per_thread_per_radix[2] = 8; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 13; - break; - } - } - else { - switch (loc_multipliers[2]) { - case 1: - registers_per_thread_per_radix[2] = 10; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - break; - case 2: - registers_per_thread_per_radix[2] = 10; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - break; - default: - registers_per_thread_per_radix[2] = 8; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - break; - } - } - } - else { - if (loc_multipliers[13] > 0) { - switch (loc_multipliers[2]) { - case 1: - registers_per_thread_per_radix[2] = 10; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - break; - case 2: - registers_per_thread_per_radix[2] = 10; - 
registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - break; - default: - registers_per_thread_per_radix[2] = 8; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - break; - } - } - else { - switch (loc_multipliers[2]) { - case 1: - registers_per_thread_per_radix[2] = 10; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 0; - break; - case 2: - registers_per_thread_per_radix[2] = 10; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 0; - break; - default: - registers_per_thread_per_radix[2] = 8; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 10; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 0; - break; - } - } - } - } - } - else - { - if (loc_multipliers[7] > 0) { - if (loc_multipliers[11] > 0) { - if (loc_multipliers[13] > 0) { - switch (loc_multipliers[2]) { - case 1: - registers_per_thread_per_radix[2] = 14; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 13; - break; - case 2: - registers_per_thread_per_radix[2] = 14; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 11; - 
registers_per_thread_per_radix[13] = 13; - break; - case 3: - registers_per_thread_per_radix[2] = 8; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 13; - break; - default: - registers_per_thread_per_radix[2] = 16; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 13; - break; - } - } - else { - switch (loc_multipliers[2]) { - case 1: - registers_per_thread_per_radix[2] = 14; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - break; - case 2: - registers_per_thread_per_radix[2] = 14; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - break; - case 3: - registers_per_thread_per_radix[2] = 8; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - break; - default: - registers_per_thread_per_radix[2] = 16; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - break; - } - } - } - else { - if (loc_multipliers[13] > 0) { - switch (loc_multipliers[2]) { - case 1: - registers_per_thread_per_radix[2] = 14; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 14; - 
registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - break; - case 2: - registers_per_thread_per_radix[2] = 14; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - break; - case 3: - registers_per_thread_per_radix[2] = 8; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - break; - default: - registers_per_thread_per_radix[2] = 16; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - break; - } - } - else { - switch (loc_multipliers[2]) { - case 1: - registers_per_thread_per_radix[2] = 14; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 0; - break; - case 2: - registers_per_thread_per_radix[2] = 14; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 0; - break; - default: - registers_per_thread_per_radix[2] = 8; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 7; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 0; - break; - } - } - } - } - else { - if (loc_multipliers[11] > 0) { - if (loc_multipliers[13] > 0) { - switch (loc_multipliers[2]) { - case 1: - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 0; - 
registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 13; - break; - case 2: - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 13; - break; - default: - registers_per_thread_per_radix[2] = 8; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 13; - break; - } - } - else { - switch (loc_multipliers[2]) { - case 1: - registers_per_thread_per_radix[2] = 10; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - break; - default: - registers_per_thread_per_radix[2] = 8; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - break; - } - } - } - else { - if (loc_multipliers[13] > 0) { - switch (loc_multipliers[2]) { - case 1: - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - break; - case 2: - registers_per_thread_per_radix[2] = 12; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - break; - default: - registers_per_thread_per_radix[2] = 8; - 
registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - break; - } - } - else { - uint64_t max_loc_multipliers_pow2 = 0; - uint64_t active_threads_y = max_rhs / 64; //estimate workbalance across CU (assume we have 64 CU) - if (active_threads_y == 0) active_threads_y = 1; - uint64_t testMinStages = -1; - uint64_t maxRadixMinStages = 1; - uint64_t fixMaxCheckRadix2 = 3; -#if(VKFFT_BACKEND==1) - fixMaxCheckRadix2 = ((fft_length >= 2048) && (extraSharedMemoryForPow2) && (!useRader)) ? 5 : 3; -#endif - for (uint64_t i = 1; i <= fixMaxCheckRadix2; i++) { - uint64_t numStages = (uint64_t)ceil(log2(fft_length) / ((double)i)); - if (numStages < testMinStages) { - testMinStages = numStages; - maxRadixMinStages = i; - } - } - for (uint64_t i = maxRadixMinStages; i >= 1; i--) { - uint64_t active_threads_x = (active_threads_y * fft_length) / ((uint64_t)pow(2, i)); - if (active_threads_x >= 128) { - max_loc_multipliers_pow2 = i; - i = 1; - } - - } - if (max_loc_multipliers_pow2 < 3) max_loc_multipliers_pow2 = 3; - - uint64_t final_loc_multipliers_pow2 = 1; - uint64_t num_stages_min = (uint64_t)log2(fft_length); - for (uint64_t i = 2; i <= max_loc_multipliers_pow2; i++) { - uint64_t num_stages = (uint64_t)ceil(((uint64_t)log2(fft_length)) / (double)i); - if (num_stages < num_stages_min) { - final_loc_multipliers_pow2 = i; - num_stages_min = num_stages; - } - - } - registers_per_thread_per_radix[2] = (loc_multipliers[2] > final_loc_multipliers_pow2) ? (uint64_t)pow(2, final_loc_multipliers_pow2) : (uint64_t)pow(2, loc_multipliers[2]); - registers_per_thread_per_radix[2] = (loc_multipliers[2] < 3) ? 
(uint64_t)pow(2, loc_multipliers[2]) : registers_per_thread_per_radix[2]; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 0; - } - } - } - } - } - } - else { - if (loc_multipliers[3] > 0) { - if (loc_multipliers[5] > 0) { - if (loc_multipliers[7] > 0) { - if (loc_multipliers[11] > 0) { - if (loc_multipliers[13] > 0) { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 15; - registers_per_thread_per_radix[5] = 15; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 13; - } - else { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 15; - registers_per_thread_per_radix[5] = 15; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - } - } - else { - if (loc_multipliers[13] > 0) { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 15; - registers_per_thread_per_radix[5] = 15; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - } - else { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 15; - registers_per_thread_per_radix[5] = 15; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 0; - } - } - } - else { - if (loc_multipliers[11] > 0) { - if (loc_multipliers[13] > 0) { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 15; - registers_per_thread_per_radix[5] = 15; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 13; - } - else { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 15; - 
registers_per_thread_per_radix[5] = 15; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - } - } - else { - if (loc_multipliers[13] > 0) { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 15; - registers_per_thread_per_radix[5] = 15; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - } - else { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 15; - registers_per_thread_per_radix[5] = 15; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 0; - } - } - } - } - else - { - if (loc_multipliers[7] > 0) { - if (loc_multipliers[3] == 1) { - if (loc_multipliers[11] > 0) { - if (loc_multipliers[13] > 0) { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 12; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 13; - } - else { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 12; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - } - } - else { - if (loc_multipliers[13] > 0) { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 12; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 14; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - } - else { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 6; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 7; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 0; - } - } - } - else { - if 
(loc_multipliers[11] > 0) { - if (loc_multipliers[13] > 0) { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 9; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 7; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 13; - } - else { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 9; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 7; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - } - } - else { - if (loc_multipliers[13] > 0) { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 9; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 7; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - } - else { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 9; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 7; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 0; - } - } - } - } - else { - if (loc_multipliers[3] == 1) { - if (loc_multipliers[11] > 0) { - if (loc_multipliers[13] > 0) { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 12; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 13; - } - else { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 9; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - } - } - else { - if (loc_multipliers[13] > 0) { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 12; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - 
registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - } - else { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 3; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 0; - } - } - } - else { - if (loc_multipliers[11] > 0) { - if (loc_multipliers[13] > 0) { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 9; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 13; - } - else { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 9; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - } - } - else { - if (loc_multipliers[13] > 0) { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 9; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - } - else { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 9; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 0; - } - } - } - } - } - } - else { - if (loc_multipliers[5] > 0) { - if (loc_multipliers[7] > 0) { - if (loc_multipliers[11] > 0) { - if (loc_multipliers[13] > 0) { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 5; - registers_per_thread_per_radix[7] = 7; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 13; - } - else { - registers_per_thread_per_radix[2] = 0; - 
registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 5; - registers_per_thread_per_radix[7] = 7; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - } - } - else { - if (loc_multipliers[13] > 0) { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 5; - registers_per_thread_per_radix[7] = 7; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - } - else { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 5; - registers_per_thread_per_radix[7] = 7; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 0; - } - } - } - else { - if (loc_multipliers[11] > 0) { - if (loc_multipliers[13] > 0) { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 5; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 13; - } - else { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 5; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - } - } - else { - if (loc_multipliers[13] > 0) { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 5; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - } - else { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 5; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 0; - } - } - } - } - else - { - if (loc_multipliers[7] > 0) { - if 
(loc_multipliers[11] > 0) { - if (loc_multipliers[13] > 0) { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 7; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 13; - } - else { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 7; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - } - } - else { - if (loc_multipliers[13] > 0) { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 7; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 13; - } - else { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 7; - registers_per_thread_per_radix[11] = 0; - registers_per_thread_per_radix[13] = 0; - } - } - } - else { - if (loc_multipliers[11] > 0) { - if (loc_multipliers[13] > 0) { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 13; - } - else { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 11; - registers_per_thread_per_radix[13] = 0; - } - } - else { - if (loc_multipliers[13] > 0) { - registers_per_thread_per_radix[2] = 0; - registers_per_thread_per_radix[3] = 0; - registers_per_thread_per_radix[5] = 0; - registers_per_thread_per_radix[7] = 0; - registers_per_thread_per_radix[11] = 0; - 
registers_per_thread_per_radix[13] = 13; - } - else { - min_registers_per_thread[0] = 2; - registers_per_thread[0] = 2; - //Rader-only sequence - //return VKFFT_ERROR_UNSUPPORTED_RADIX; - } - } - } - } - } - - } - - registers_per_thread_per_radix[32] = ((registers_per_thread_per_radix[2] % 32) == 0) ? registers_per_thread_per_radix[2] : 0; - registers_per_thread_per_radix[16] = ((registers_per_thread_per_radix[2] % 16) == 0) ? registers_per_thread_per_radix[2] : 0; - registers_per_thread_per_radix[8] = ((registers_per_thread_per_radix[2] % 8) == 0) ? registers_per_thread_per_radix[2] : 0; - registers_per_thread_per_radix[4] = ((registers_per_thread_per_radix[2] % 4) == 0) ? registers_per_thread_per_radix[2] : 0; - if ((registers_per_thread_per_radix[2] >= 12) && (registers_per_thread_per_radix[3] >= 12)) { - registers_per_thread_per_radix[12] = (registers_per_thread_per_radix[2] > registers_per_thread_per_radix[3]) ? registers_per_thread_per_radix[3] : registers_per_thread_per_radix[2]; - if ((registers_per_thread_per_radix[12] % 12) != 0) registers_per_thread_per_radix[12] = 0; - } - registers_per_thread_per_radix[6] = (registers_per_thread_per_radix[2] > registers_per_thread_per_radix[3]) ? registers_per_thread_per_radix[3] : registers_per_thread_per_radix[2]; - registers_per_thread_per_radix[9] = ((registers_per_thread_per_radix[3] % 9) == 0) ? registers_per_thread_per_radix[3] : 0; - registers_per_thread_per_radix[10] = (registers_per_thread_per_radix[2] > registers_per_thread_per_radix[5]) ? registers_per_thread_per_radix[5] : registers_per_thread_per_radix[2]; - registers_per_thread_per_radix[14] = (registers_per_thread_per_radix[2] > registers_per_thread_per_radix[7]) ? registers_per_thread_per_radix[7] : registers_per_thread_per_radix[2]; - registers_per_thread_per_radix[15] = (registers_per_thread_per_radix[3] > registers_per_thread_per_radix[5]) ? 
registers_per_thread_per_radix[5] : registers_per_thread_per_radix[3]; - - for (uint64_t i = 0; i < 33; i++) { - if ((registers_per_thread_per_radix[i] != 0) && (registers_per_thread_per_radix[i] < min_registers_per_thread[0])) min_registers_per_thread[0] = registers_per_thread_per_radix[i]; - if ((registers_per_thread_per_radix[i] != 0) && (registers_per_thread_per_radix[i] > registers_per_thread[0])) registers_per_thread[0] = registers_per_thread_per_radix[i]; - } - if ((registers_per_thread[0] > 16) || (registers_per_thread[0] >= 2 * min_registers_per_thread[0])) isGoodSequence[0] = 0; - else isGoodSequence[0] = 1; - return VKFFT_SUCCESS; -} -static inline VkFFTResult VkFFTGetRegistersPerThreadOptimizeShared(uint64_t fft_length, uint64_t* registers_per_thread_per_radix, uint64_t* registers_per_thread, uint64_t* min_registers_per_thread) { - //try to split sequence in supported radix to optimize sm usage - uint64_t numStages = 20; - uint64_t fft_length_copy; - uint64_t stages[20]; - uint64_t k = 0; - for (uint64_t i = 0; i < 33; i++) { - registers_per_thread_per_radix[i] = 0; - } - registers_per_thread[0] = 0; - min_registers_per_thread[0] = -1; - - for (uint64_t i = 1; i < numStages; i++) { - fft_length_copy = fft_length; - uint64_t min_comb_radix = (uint64_t)floor(pow(fft_length_copy, 1.0 / i)); - if (min_comb_radix <= 16) { - for (uint64_t j = 0; j < 20; j++) { - stages[j] = 0; - } - k = 0; - for (uint64_t j = min_comb_radix; j <= 16; j++) { - if (k < i) { - if ((fft_length_copy % j) == 0) { - fft_length_copy /= j; - min_comb_radix = (uint64_t)floor(pow(fft_length_copy, 1.0 / (i - k - 1))); - - stages[k] = j; - j = min_comb_radix - 1; - k++; - } - } - } - if ((fft_length_copy == 1) && (k == i)) break; - } - } - for (uint64_t i = 0; i < k; i++) { - for (uint64_t j = 2; j <= stages[i]; j++) { - if ((stages[i] % j) == 0) { - if (registers_per_thread_per_radix[j] < stages[i]) - registers_per_thread_per_radix[j] = stages[i]; - } - } - } - for (uint64_t i = 0; i < 
33; i++) { - if ((registers_per_thread_per_radix[i] != 0) && (registers_per_thread_per_radix[i] > registers_per_thread[0])) registers_per_thread[0] = registers_per_thread_per_radix[i]; - } - - for (uint64_t i = 0; i < 33; i++) { - if (registers_per_thread_per_radix[i] != 0) { - double ratio = (registers_per_thread[0] / (double)registers_per_thread_per_radix[i]); - uint64_t ratio_ceil = (uint64_t)ceil(ratio); - uint64_t ratio_floor = (uint64_t)floor(ratio); - double ratio2 = ((registers_per_thread_per_radix[i] * ratio_ceil) / (double)registers_per_thread[0]); - double ratio3 = (registers_per_thread[0] / (double)(registers_per_thread_per_radix[i] * ratio_floor)); - if (ratio2 > ratio3) registers_per_thread_per_radix[i] *= ratio_floor; - else { - registers_per_thread_per_radix[i] *= ratio_ceil; - } - } - } - registers_per_thread[0] = 0; - for (uint64_t i = 0; i < 33; i++) { - if ((registers_per_thread_per_radix[i] != 0) && (registers_per_thread_per_radix[i] < min_registers_per_thread[0])) min_registers_per_thread[0] = registers_per_thread_per_radix[i]; - if ((registers_per_thread_per_radix[i] != 0) && (registers_per_thread_per_radix[i] > registers_per_thread[0])) registers_per_thread[0] = registers_per_thread_per_radix[i]; - } - return VKFFT_SUCCESS; -} -static inline VkFFTResult VkFFTConstructRaderTree(VkFFTApplication* app, VkFFTRaderContainer** raderContainer_input, uint64_t* tempSequence, uint64_t* numRaderPrimes, uint64_t fft_radix_part) { - VkFFTResult res = VKFFT_SUCCESS; - uint64_t locTempSequence = tempSequence[0]; - uint64_t tempSequence_copy = tempSequence[0]; - uint64_t limit = ((tempSequence[0] + 1) > app->configuration.fixMaxRaderPrimeFFT) ? 
app->configuration.fixMaxRaderPrimeFFT : (tempSequence[0] + 1); - for (uint64_t i = app->configuration.fixMinRaderPrimeMult; i < limit; i++) { - if (locTempSequence % i == 0) { - numRaderPrimes[0]++; - while (locTempSequence % i == 0) locTempSequence /= i; - } - } - for (uint64_t i = app->configuration.fixMinRaderPrimeMult; i < app->configuration.fixMaxRaderPrimeMult; i++) { - if (locTempSequence % i == 0) { - numRaderPrimes[0]++; - while (locTempSequence % i == 0) locTempSequence /= i; - } - } - - raderContainer_input[0] = (VkFFTRaderContainer*)calloc(sizeof(VkFFTRaderContainer), numRaderPrimes[0]); - if (raderContainer_input[0] == 0) return VKFFT_ERROR_MALLOC_FAILED; - VkFFTRaderContainer* raderContainer = raderContainer_input[0]; - uint64_t tempSequence_temp = 1; - limit = ((tempSequence[0] + 1) > app->configuration.fixMaxRaderPrimeFFT) ? app->configuration.fixMaxRaderPrimeFFT : (tempSequence[0] + 1); - for (uint64_t i = app->configuration.fixMinRaderPrimeMult; i < limit; i++) { - if (tempSequence[0] % i == 0) { - if (i < app->configuration.fixMinRaderPrimeFFT) { - tempSequence_temp *= i; - tempSequence[0] /= i; - i--; - continue; - } - //Sophie Germain safe prime check - uint64_t tempSequence2 = i - 1; - for (uint64_t j = 2; j < app->configuration.fixMinRaderPrimeMult; j++) { - if (tempSequence2 % j == 0) { - tempSequence2 /= j; - j--; - } - } - if (tempSequence2 != 1) { - tempSequence_temp *= i; - tempSequence[0] /= i; - i--; - continue; - } - tempSequence[0] /= i; - for (uint64_t j = 0; j < numRaderPrimes[0]; j++) { - if (raderContainer[j].prime == i) - { - raderContainer[j].multiplier++; - j = numRaderPrimes[0]; - } - else if (raderContainer[j].prime == 0) { - raderContainer[j].type = 0; - raderContainer[j].prime = i; - raderContainer[j].multiplier = 1; - j = numRaderPrimes[0]; - } - } - i--; - } - } - tempSequence[0] *= tempSequence_temp; - for (uint64_t i = app->configuration.fixMinRaderPrimeMult; i < app->configuration.fixMaxRaderPrimeMult; i++) { - if 
(tempSequence[0] % i == 0) { - tempSequence[0] /= i; - for (uint64_t j = 0; j < numRaderPrimes[0]; j++) { - if (raderContainer[j].prime == i) - { - raderContainer[j].multiplier++; - j = numRaderPrimes[0]; - } - else if (raderContainer[j].prime == 0) { - raderContainer[j].type = 1; - raderContainer[j].prime = i; - raderContainer[j].multiplier = 1; - j = numRaderPrimes[0]; - } - } - i--; - } - } - //main loop for all primes - for (uint64_t i = 0; i < numRaderPrimes[0]; i++) { - //generator loop - for (uint64_t r = 2; r < raderContainer[i].prime; r++) { - uint64_t test = r; - for (uint64_t iter = 0; iter < raderContainer[i].prime - 2; iter++) { - if (test == 1) { - test = 0; - iter = raderContainer[i].prime; - } - test = ((test * r) % raderContainer[i].prime); - } - if (test == 1) { - raderContainer[i].generator = r; - r = raderContainer[i].prime; - } - } - - //subsplit and information initialization - if (raderContainer[i].type) {//Multiplication - raderContainer[i].registers_per_thread = 2; - raderContainer[i].min_registers_per_thread = 2; - } - else {//FFT - locTempSequence = raderContainer[i].prime - 1; - raderContainer[i].containerFFTDim = raderContainer[i].prime - 1; - raderContainer[i].containerFFTNum = fft_radix_part * tempSequence_copy / raderContainer[i].prime; - uint64_t stageID = 0; - for (uint64_t j = 2; j < app->configuration.fixMinRaderPrimeMult; j++) { - if (locTempSequence % j == 0) { - locTempSequence /= j; - raderContainer[i].loc_multipliers[j]++; - //raderContainer[i].stageRadix[stageID] = j; - //raderContainer[i].numThreadLaunches[stageID] = fft_radix_part * (tempSequence_copy / raderContainer[i].prime) * ((raderContainer[i].prime-1) / j); - //stageID++; - j--; - } - } - //uint64_t isGoodSequence; - //if (raderContainer[i].containerFFTNum<8) - res = VkFFTGetRegistersPerThreadOptimizeShared(raderContainer[i].prime - 1, raderContainer[i].registers_per_thread_per_radix, &raderContainer[i].registers_per_thread, 
&raderContainer[i].min_registers_per_thread); - //else - //res = VkFFTGetRegistersPerThread(raderContainer[i].prime - 1, 0, 0, 1, raderContainer[i].loc_multipliers, raderContainer[i].registers_per_thread_per_radix, &raderContainer[i].registers_per_thread, &raderContainer[i].min_registers_per_thread, &isGoodSequence); - if (res != VKFFT_SUCCESS) return res; - if (locTempSequence != 1) { - res = VkFFTConstructRaderTree(app, &raderContainer[i].container, &locTempSequence, &raderContainer[i].numSubPrimes, fft_radix_part * tempSequence_copy / raderContainer[i].prime); - if (res != VKFFT_SUCCESS) return res; - for (uint64_t j = 0; j < raderContainer[i].numSubPrimes; j++) { - for (uint64_t t = 0; t < raderContainer[i].container[j].multiplier; t++) { - raderContainer[i].stageRadix[stageID] = raderContainer[i].container[j].prime; - stageID++; - } - } - } - raderContainer[i].numStages = stageID; - } - } - return res; -} -static inline VkFFTResult VkFFTOptimizeRaderFFTRegisters(VkFFTRaderContainer* raderContainer, uint64_t numRaderPrimes, uint64_t fftDim, uint64_t* min_registers_per_thread, uint64_t* registers_per_thread, uint64_t* registers_per_thread_per_radix) { - VkFFTResult res = VKFFT_SUCCESS; - for (int64_t i = 0; i < (int64_t)numRaderPrimes; i++) { - if (raderContainer[i].type == 0) { - if (raderContainer[i].min_registers_per_thread / min_registers_per_thread[0] >= 2) { - min_registers_per_thread[0] *= (raderContainer[i].min_registers_per_thread / min_registers_per_thread[0]); - for (uint64_t j = 0; j < 33; j++) { - if ((registers_per_thread_per_radix[j] > 0) && (registers_per_thread_per_radix[j] < min_registers_per_thread[0])) registers_per_thread_per_radix[j] *= (uint64_t)ceil(min_registers_per_thread[0] / (double)registers_per_thread_per_radix[j]); - } - for (uint64_t j = 0; j < 33; j++) { - if (registers_per_thread_per_radix[j] > registers_per_thread[0]) registers_per_thread[0] = registers_per_thread_per_radix[j]; - } - } - else if (min_registers_per_thread[0] / 
raderContainer[i].min_registers_per_thread >= 2) { - raderContainer[i].min_registers_per_thread *= (min_registers_per_thread[0] / raderContainer[i].min_registers_per_thread); - for (uint64_t j = 0; j < 33; j++) { - if ((raderContainer[i].registers_per_thread_per_radix[j] > 0) && (raderContainer[i].registers_per_thread_per_radix[j] < raderContainer[i].min_registers_per_thread)) raderContainer[i].registers_per_thread_per_radix[j] *= (uint64_t)ceil(raderContainer[i].min_registers_per_thread / (double)raderContainer[i].registers_per_thread_per_radix[j]); - } - for (uint64_t j = 0; j < 33; j++) { - if (raderContainer[i].registers_per_thread_per_radix[j] > raderContainer[i].registers_per_thread) raderContainer[i].registers_per_thread = raderContainer[i].registers_per_thread_per_radix[j]; - } - } - - if (raderContainer[i].min_registers_per_thread < min_registers_per_thread[0]) { - for (uint64_t j = 0; j < 33; j++) { - if (raderContainer[i].registers_per_thread_per_radix[j] > 0) { - while (raderContainer[i].registers_per_thread_per_radix[j] < min_registers_per_thread[0]) - raderContainer[i].registers_per_thread_per_radix[j] += j; - if (raderContainer[i].registers_per_thread_per_radix[j] > raderContainer[i].registers_per_thread) - raderContainer[i].registers_per_thread = raderContainer[i].registers_per_thread_per_radix[j]; - } - } - } - - for (int64_t j = 2; j < 33; j++) { - if (raderContainer[i].registers_per_thread_per_radix[j] != 0) { - while (((uint64_t)ceil(fftDim / (double)min_registers_per_thread[0])) < (raderContainer[i].containerFFTNum * (uint64_t)ceil(raderContainer[i].containerFFTDim / (double)raderContainer[i].registers_per_thread_per_radix[j]))) { - raderContainer[i].registers_per_thread_per_radix[j] += j; - } - if (raderContainer[i].registers_per_thread_per_radix[j] > raderContainer[i].registers_per_thread) raderContainer[i].registers_per_thread = raderContainer[i].registers_per_thread_per_radix[j]; - } - } - if (raderContainer[i].registers_per_thread > 
registers_per_thread[0]) registers_per_thread[0] = raderContainer[i].registers_per_thread; - } - } - //try to increase registers usage closer to registers_per_thread across all primes - for (int64_t i = 0; i < (int64_t)numRaderPrimes; i++) { - if (raderContainer[i].type == 0) { - for (int64_t j = 2; j < 33; j++) { - if (raderContainer[i].registers_per_thread_per_radix[j] > 0) { - while ((raderContainer[i].registers_per_thread_per_radix[j] + j) <= registers_per_thread[0] + 1) {// fix - raderContainer[i].registers_per_thread_per_radix[j] += j; - } - } - } - raderContainer[i].registers_per_thread = 0; - raderContainer[i].min_registers_per_thread = -1; - for (int64_t j = 2; j < 33; j++) { - if (raderContainer[i].registers_per_thread_per_radix[j] > 0) { - if (raderContainer[i].registers_per_thread_per_radix[j] < raderContainer[i].min_registers_per_thread) { - raderContainer[i].min_registers_per_thread = raderContainer[i].registers_per_thread_per_radix[j]; - } - if (raderContainer[i].registers_per_thread_per_radix[j] > raderContainer[i].registers_per_thread) { - raderContainer[i].registers_per_thread = raderContainer[i].registers_per_thread_per_radix[j]; - } - } - } - } - } - //subprimes optimization - for (int64_t i = 0; i < (int64_t)numRaderPrimes; i++) { - if (raderContainer[i].numSubPrimes) { - res = VkFFTOptimizeRaderFFTRegisters(raderContainer[i].container, raderContainer[i].numSubPrimes, fftDim, min_registers_per_thread, registers_per_thread, registers_per_thread_per_radix); - if (res != VKFFT_SUCCESS) return res; - } - } - return res; -} -static inline VkFFTResult VkFFTOptimizeRadixKernels(uint64_t* registers_per_thread_per_radix, uint64_t* loc_multipliers, uint64_t registerBoost, uint64_t* maxNonPow2Radix, uint64_t* reqLocRegs, VkFFTRaderContainer* raderContainer, uint64_t numRaderPrimes) { - VkFFTResult res = VKFFT_SUCCESS; - if (numRaderPrimes) { - for (uint64_t i = 0; i < numRaderPrimes; i++) { - res = 
VkFFTOptimizeRadixKernels(raderContainer[i].registers_per_thread_per_radix, raderContainer[i].loc_multipliers, 1, maxNonPow2Radix, reqLocRegs, raderContainer[i].container, raderContainer[i].numSubPrimes); - if (res != VKFFT_SUCCESS) return res; - } - } - //optimize used radix kernels - if (((registers_per_thread_per_radix[32] > 0) || ((registers_per_thread_per_radix[2] % 32) == 0)) && ((registers_per_thread_per_radix[32]) % 32 == 0) && (loc_multipliers[2] >= 5)) { - loc_multipliers[32] = loc_multipliers[2] / 5; - loc_multipliers[2] = loc_multipliers[2] - loc_multipliers[32] * 5; - if ((registers_per_thread_per_radix[2] % 32) == 0) registers_per_thread_per_radix[32] = registers_per_thread_per_radix[2]; - } - if (((registers_per_thread_per_radix[16] > 0) || ((registers_per_thread_per_radix[2] % 16) == 0)) && ((registers_per_thread_per_radix[16]) % 16 == 0) && (loc_multipliers[2] >= 4)) { - loc_multipliers[16] = loc_multipliers[2] / 4; - loc_multipliers[2] = loc_multipliers[2] - loc_multipliers[16] * 4; - if ((registers_per_thread_per_radix[2] % 16) == 0) registers_per_thread_per_radix[16] = registers_per_thread_per_radix[2];//if we got 16 regs, why not use r16 kernel - } - if ((registers_per_thread_per_radix[15] > 0) && ((registers_per_thread_per_radix[15]) % 15 == 0) && (loc_multipliers[3] >= 1) && (loc_multipliers[5] >= 1)) { - loc_multipliers[15] = (loc_multipliers[3] > loc_multipliers[5]) ? loc_multipliers[5] : loc_multipliers[3]; - loc_multipliers[3] = loc_multipliers[3] - loc_multipliers[15]; - loc_multipliers[5] = loc_multipliers[5] - loc_multipliers[15]; - } - if ((registers_per_thread_per_radix[14] > 0) && ((registers_per_thread_per_radix[14]) % 14 == 0) && (loc_multipliers[2] >= 1) && (loc_multipliers[7] >= 1)) { - loc_multipliers[14] = (loc_multipliers[2] > loc_multipliers[7]) ? 
loc_multipliers[7] : loc_multipliers[2]; - loc_multipliers[2] = loc_multipliers[2] - loc_multipliers[14]; - loc_multipliers[7] = loc_multipliers[7] - loc_multipliers[14]; - } - if ((registers_per_thread_per_radix[12] > 0) && ((registers_per_thread_per_radix[12]) % 12 == 0) && (loc_multipliers[2] >= 2) && (loc_multipliers[3] >= 1)) { - loc_multipliers[12] = (loc_multipliers[2] > 2 * loc_multipliers[3]) ? loc_multipliers[3] : loc_multipliers[2] / 2; - loc_multipliers[2] = loc_multipliers[2] - 2 * loc_multipliers[12]; - loc_multipliers[3] = loc_multipliers[3] - loc_multipliers[12]; - } - if ((registers_per_thread_per_radix[10] > 0) && ((registers_per_thread_per_radix[10]) % 10 == 0) && (loc_multipliers[2] >= 1) && (loc_multipliers[5] >= 1)) { - loc_multipliers[10] = (loc_multipliers[2] > loc_multipliers[5]) ? loc_multipliers[5] : loc_multipliers[2]; - loc_multipliers[2] = loc_multipliers[2] - loc_multipliers[10]; - loc_multipliers[5] = loc_multipliers[5] - loc_multipliers[10]; - } - if ((registers_per_thread_per_radix[9] > 0) && ((registers_per_thread_per_radix[9]) % 9 == 0) && (loc_multipliers[3] >= 2)) { - loc_multipliers[9] = loc_multipliers[3] / 2; - loc_multipliers[3] = loc_multipliers[3] - loc_multipliers[9] * 2; - } - if (((registers_per_thread_per_radix[8] > 0) || ((registers_per_thread_per_radix[2] % 8) == 0)) && ((registers_per_thread_per_radix[8]) % 8 == 0) && (loc_multipliers[2] >= 3)) { - loc_multipliers[8] = loc_multipliers[2] / 3; - loc_multipliers[2] = loc_multipliers[2] - loc_multipliers[8] * 3; - if ((registers_per_thread_per_radix[2] % 8) == 0) registers_per_thread_per_radix[8] = registers_per_thread_per_radix[2]; - } - if ((registers_per_thread_per_radix[6] > 0) && ((registers_per_thread_per_radix[6]) % 6 == 0) && (loc_multipliers[2] >= 1) && (loc_multipliers[3] >= 1)) { - loc_multipliers[6] = (loc_multipliers[2] > loc_multipliers[3]) ? 
loc_multipliers[3] : loc_multipliers[2]; - loc_multipliers[2] = loc_multipliers[2] - loc_multipliers[6]; - loc_multipliers[3] = loc_multipliers[3] - loc_multipliers[6]; - } - if (((registers_per_thread_per_radix[4] > 0) || ((registers_per_thread_per_radix[2] % 4) == 0)) && ((registers_per_thread_per_radix[4]) % 4 == 0) && (loc_multipliers[2] >= 2)) { - loc_multipliers[4] = loc_multipliers[2] / 2; - loc_multipliers[2] = loc_multipliers[2] - loc_multipliers[4] * 2; - if ((registers_per_thread_per_radix[2] % 4) == 0) registers_per_thread_per_radix[4] = registers_per_thread_per_radix[2]; - } - if ((registerBoost == 2) && (loc_multipliers[2] == 0)) { - if (loc_multipliers[4] > 0) { - loc_multipliers[4]--; - loc_multipliers[2] = 2; - } - else if (loc_multipliers[8] > 0) { - loc_multipliers[8]--; - loc_multipliers[4]++; - loc_multipliers[2]++; - } - else if (loc_multipliers[16] > 0) { - loc_multipliers[16]--; - loc_multipliers[8]++; - loc_multipliers[2]++; - } - else if (loc_multipliers[32] > 0) { - loc_multipliers[32]--; - loc_multipliers[16]++; - loc_multipliers[2]++; - } - } - if ((registerBoost == 4) && (loc_multipliers[4] == 0)) { - if (loc_multipliers[8] > 0) { - loc_multipliers[8]--; - loc_multipliers[4]++; - loc_multipliers[2]++; - } - else if (loc_multipliers[16] > 0) { - if (loc_multipliers[2] == 0) { - loc_multipliers[16]--; - loc_multipliers[4] = 2; - } - else { - loc_multipliers[16]--; - loc_multipliers[4]++; - loc_multipliers[2]--; - loc_multipliers[8]++; - } - } - else if (loc_multipliers[32] > 0) { - if (loc_multipliers[2] == 0) { - loc_multipliers[32]--; - loc_multipliers[8]++; - loc_multipliers[4]++; - } - else { - loc_multipliers[32]--; - loc_multipliers[16]++; - loc_multipliers[4]++; - loc_multipliers[2]--; - } - } - } - for (uint64_t i = 2; i < 33; i++) { - uint64_t usedLocRegs = 0; - if (loc_multipliers[i] > 0) { - switch (i) { - case 6: - usedLocRegs = 3; - break; - case 9: - usedLocRegs = 3; - break; - case 10: - usedLocRegs = 5; - break; - case 
12: - usedLocRegs = 3; - break; - case 14: - usedLocRegs = 7; - break; - case 15: - usedLocRegs = 5; - break; - default: - usedLocRegs = i; - break; - } - } - if ((loc_multipliers[i] > 0) && ((i & (i - 1)) != 0) && (i > maxNonPow2Radix[0])) { - maxNonPow2Radix[0] = i; - } - if ((usedLocRegs > reqLocRegs[0]) && ((i & (i - 1)) != 0)) { - reqLocRegs[0] = usedLocRegs; - } - } - return res; -} -static inline VkFFTResult VkFFTGetRaderFFTStages(VkFFTRaderContainer* raderContainer, uint64_t numRaderPrimes, uint64_t* stageid, uint64_t* stageRadix, uint64_t* stage_rader_generator) { - VkFFTResult res = VKFFT_SUCCESS; - for (int64_t i = 0; i < (int64_t)numRaderPrimes; i++) { - if (raderContainer[i].multiplier > 0) { - stageRadix[stageid[0]] = raderContainer[i].prime; - stage_rader_generator[stageid[0]] = raderContainer[i].generator; - raderContainer[i].multiplier--; - i--; - stageid[0]++; - //axes[k].specializationConstants.numStages++; - //find primitive root - } - } - for (int64_t i = 0; i < (int64_t)numRaderPrimes; i++) { - if (raderContainer[i].type == 0) { - if (raderContainer[i].numSubPrimes > 0) { - res = VkFFTGetRaderFFTStages(raderContainer[i].container, raderContainer[i].numSubPrimes, &raderContainer[i].numStages, raderContainer[i].stageRadix, raderContainer[i].stage_rader_generator); - if (res != VKFFT_SUCCESS) return res; - } - for (uint64_t j = 32; j > 1; j--) { - if (raderContainer[i].loc_multipliers[j] > 0) { - raderContainer[i].stageRadix[raderContainer[i].numStages] = j; - raderContainer[i].loc_multipliers[j]--; - j++; - raderContainer[i].numStages++; - } - } - /*//make that convolution step uses min_regs radix - max working threads - uint64_t stage_id_swap = axes[k].specializationConstants.raderContainer[i].numStages - 1; - uint64_t temp_radix = axes[k].specializationConstants.raderContainer[i].stageRadix[axes[k].specializationConstants.raderContainer[i].numStages - 1]; - uint64_t temp_regs = 
axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[axes[k].specializationConstants.raderContainer[i].stageRadix[axes[k].specializationConstants.raderContainer[i].numStages - 1]]; - - for (uint64_t j = 0; j < axes[k].specializationConstants.raderContainer[i].numStages-1; j++) { - if (axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[axes[k].specializationConstants.raderContainer[i].stageRadix[j]] < axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[axes[k].specializationConstants.raderContainer[i].stageRadix[stage_id_swap]]) - stage_id_swap = j; - } - axes[k].specializationConstants.raderContainer[i].stageRadix[axes[k].specializationConstants.raderContainer[i].numStages - 1] = axes[k].specializationConstants.raderContainer[i].stageRadix[stage_id_swap]; - axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[axes[k].specializationConstants.raderContainer[i].stageRadix[axes[k].specializationConstants.raderContainer[i].numStages - 1]] = axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[axes[k].specializationConstants.raderContainer[i].stageRadix[stage_id_swap]]; - axes[k].specializationConstants.raderContainer[i].stageRadix[stage_id_swap] = temp_radix; - axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[axes[k].specializationConstants.raderContainer[i].stageRadix[stage_id_swap]] = temp_regs; - - //make that first step uses second to min_regs radix - stage_id_swap = 0; - temp_radix = axes[k].specializationConstants.raderContainer[i].stageRadix[0]; - temp_regs = axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[axes[k].specializationConstants.raderContainer[i].stageRadix[0]]; - - for (uint64_t j = 1; j < axes[k].specializationConstants.raderContainer[i].numStages - 1; j++) { - if 
(axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[axes[k].specializationConstants.raderContainer[i].stageRadix[j]] < axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[axes[k].specializationConstants.raderContainer[i].stageRadix[stage_id_swap]]) - stage_id_swap = j; - } - axes[k].specializationConstants.raderContainer[i].stageRadix[0] = axes[k].specializationConstants.raderContainer[i].stageRadix[stage_id_swap]; - axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[axes[k].specializationConstants.raderContainer[i].stageRadix[0]] = axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[axes[k].specializationConstants.raderContainer[i].stageRadix[stage_id_swap]]; - axes[k].specializationConstants.raderContainer[i].stageRadix[stage_id_swap] = temp_radix; - axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[axes[k].specializationConstants.raderContainer[i].stageRadix[stage_id_swap]] = temp_regs; - */ - } - } - return res; -} -static inline VkFFTResult VkFFTMinMaxRegisterCheck(uint64_t numStages, uint64_t* stageRadix, uint64_t* min_registers_per_thread, uint64_t* registers_per_thread, uint64_t* registers_per_thread_per_radix, VkFFTRaderContainer* raderContainer, uint64_t numRaderPrimes, uint64_t* stage_rader_generator) { - VkFFTResult res = VKFFT_SUCCESS; - for (int64_t j = 0; j < (int64_t)numStages; j++) { - if (stage_rader_generator[j] == 0) { - if (registers_per_thread_per_radix[stageRadix[j]] > 0) { - if (registers_per_thread_per_radix[stageRadix[j]] < min_registers_per_thread[0]) { - min_registers_per_thread[0] = registers_per_thread_per_radix[stageRadix[j]]; - } - if (registers_per_thread_per_radix[stageRadix[j]] > registers_per_thread[0]) { - registers_per_thread[0] = registers_per_thread_per_radix[stageRadix[j]]; - } - } - } - else { - for (int64_t i = 0; i < (int64_t)numRaderPrimes; i++) { - if 
(raderContainer[i].prime == stageRadix[j]) { - if (raderContainer[i].type == 0) { - for (int64_t j2 = 0; j2 < (int64_t)raderContainer[i].numStages; j2++) { - if (raderContainer[i].stage_rader_generator[j] == 0) { - if (raderContainer[i].registers_per_thread_per_radix[raderContainer[i].stageRadix[j2]] > 0) { - if (raderContainer[i].registers_per_thread_per_radix[raderContainer[i].stageRadix[j2]] < min_registers_per_thread[0]) { - min_registers_per_thread[0] = raderContainer[i].registers_per_thread_per_radix[raderContainer[i].stageRadix[j2]]; - } - if (raderContainer[i].registers_per_thread_per_radix[raderContainer[i].stageRadix[j2]] > registers_per_thread[0]) { - registers_per_thread[0] = raderContainer[i].registers_per_thread_per_radix[raderContainer[i].stageRadix[j2]]; - } - } - } - else { - res = VkFFTMinMaxRegisterCheck(raderContainer[i].numStages, raderContainer[i].stageRadix, min_registers_per_thread, registers_per_thread, raderContainer[i].registers_per_thread_per_radix, raderContainer[i].container, raderContainer[i].numSubPrimes, raderContainer[i].stage_rader_generator); - if (res != VKFFT_SUCCESS) return res; - } - } - } - } - } - } - } - return res; -} -static inline VkFFTResult VkFFTGetRaderFFTThreadsNum(VkFFTRaderContainer* raderContainer, uint64_t numRaderPrimes, uint64_t* numThreads) { - VkFFTResult res = VKFFT_SUCCESS; - - for (int64_t i = 0; i < (int64_t)numRaderPrimes; i++) { - if (raderContainer[i].type == 0) { - if (raderContainer[i].numSubPrimes > 0) { - res = VkFFTGetRaderFFTThreadsNum(raderContainer[i].container, raderContainer[i].numSubPrimes, numThreads); - if (res != VKFFT_SUCCESS) return res; - } - for (int64_t j = 0; j < (int64_t)raderContainer[i].numStages; j++) { - if (raderContainer[i].stage_rader_generator[j] == 0) { - if (raderContainer[i].containerFFTNum * (uint64_t)ceil(raderContainer[i].containerFFTDim / (double)raderContainer[i].registers_per_thread_per_radix[raderContainer[i].stageRadix[j]]) > numThreads[0]) numThreads[0] = 
raderContainer[i].containerFFTNum * (uint64_t)ceil(raderContainer[i].containerFFTDim / (double)raderContainer[i].registers_per_thread_per_radix[raderContainer[i].stageRadix[j]]); - } - } - } - } - return res; -} - -static inline VkFFTResult VkFFTScheduler(VkFFTApplication* app, VkFFTPlan* FFTPlan, uint64_t axis_id) { - VkFFTResult res = VKFFT_SUCCESS; - VkFFTAxis* axes = FFTPlan->axes[axis_id]; - - uint64_t complexSize; - if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) - complexSize = (2 * sizeof(double)); - else - if (app->configuration.halfPrecision) - complexSize = (2 * sizeof(float)); - else - complexSize = (2 * sizeof(float)); - - uint64_t usedSharedMemory = ((app->configuration.size[axis_id] & (app->configuration.size[axis_id] - 1)) == 0) ? app->configuration.sharedMemorySizePow2 : app->configuration.sharedMemorySize; - uint64_t maxSequenceLengthSharedMemory = usedSharedMemory / complexSize; - uint64_t maxSingleSizeNonStrided = maxSequenceLengthSharedMemory; - - uint64_t nonStridedAxisId = (app->configuration.considerAllAxesStrided) ? 
-1 : 0; - uint64_t max_rhs = 1; - for (uint64_t i = 0; i < 3; i++) { - FFTPlan->actualFFTSizePerAxis[axis_id][i] = app->configuration.size[i]; - if ((FFTPlan->actualFFTSizePerAxis[axis_id][i] > 0)) max_rhs *= FFTPlan->actualFFTSizePerAxis[axis_id][i]; - } - if (app->configuration.numberBatches > app->actualNumBatches) - max_rhs *= app->configuration.numberBatches; - else - max_rhs *= app->actualNumBatches; - if (app->configuration.coordinateFeatures > 0) max_rhs *= app->configuration.coordinateFeatures; - if (app->configuration.numberKernels > 0) max_rhs *= app->configuration.numberKernels; - - FFTPlan->actualPerformR2CPerAxis[axis_id] = app->configuration.performR2C; - if ((axis_id == 0) && (app->configuration.performR2C) && (app->configuration.size[axis_id] > maxSingleSizeNonStrided)) { - FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = app->configuration.size[axis_id] / 2; // now in actualFFTSize - modified dimension size for R2C/DCT - FFTPlan->actualPerformR2CPerAxis[axis_id] = 0; - FFTPlan->multiUploadR2C = 1; - } - if (app->configuration.performDCT == 1) { - FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = 2 * app->configuration.size[axis_id] - 2; // now in actualFFTSize - modified dimension size for R2C/DCT - } - if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) { - FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = app->configuration.size[axis_id] / 2; // now in actualFFTSize - modified dimension size for R2C/DCT - //FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = app->configuration.size[axis_id] * 8; // now in actualFFTSize - modified dimension size for R2C/DCT - } - if ((axis_id > 0) && (app->configuration.performR2C)) { - FFTPlan->actualFFTSizePerAxis[axis_id][0] = FFTPlan->actualFFTSizePerAxis[axis_id][0] / 2 + 1; - } - if (axis_id != nonStridedAxisId) { - if (app->configuration.performBandwidthBoost > 0) - axes->specializationConstants.performBandwidthBoost = app->configuration.performBandwidthBoost; - } - 
//initial Stockham + Rader check - uint64_t multipliers[33]; - for (uint64_t i = 0; i < 33; i++) { - multipliers[i] = 0; - } - - uint64_t tempSequence = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; - for (uint64_t i = 2; i < app->configuration.fixMinRaderPrimeMult; i++) { - if (tempSequence % i == 0) { - tempSequence /= i; - multipliers[i]++; - i--; - } - } - // verify that we haven't checked for 3 steps being not enought for Rader before - if (!app->useBluesteinFFT[axis_id]) { - uint64_t useRaderMult = 0; - uint64_t rader_primes[20]; - uint64_t rader_multipliers[20]; - for (uint64_t i = 0; i < 20; i++) { - rader_multipliers[i] = 0; - rader_primes[i] = 0; - } - uint64_t tempSequence_temp = 1; - uint64_t maxSequenceLengthSharedMemoryStrided_temp = (app->configuration.coalescedMemory > complexSize) ? usedSharedMemory / (app->configuration.coalescedMemory) : usedSharedMemory / complexSize; - uint64_t limit_max_rader_prime = ((axis_id == nonStridedAxisId) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] <= maxSequenceLengthSharedMemory)) ? maxSequenceLengthSharedMemory : maxSequenceLengthSharedMemoryStrided_temp; - if (limit_max_rader_prime > app->configuration.fixMaxRaderPrimeFFT) limit_max_rader_prime = app->configuration.fixMaxRaderPrimeFFT; - for (uint64_t i = app->configuration.fixMinRaderPrimeMult; i < limit_max_rader_prime; i++) { - if (tempSequence % i == 0) { - if (i < app->configuration.fixMinRaderPrimeFFT) { - tempSequence_temp *= i; - tempSequence /= i; - i--; - continue; - } - //Sophie Germain safe prime check - uint64_t tempSequence2 = i - 1; - for (uint64_t j = 2; j < app->configuration.fixMinRaderPrimeMult; j++) { - if (tempSequence2 % j == 0) { - tempSequence2 /= j; - j--; - } - } - if (tempSequence2 != 1) { - maxSequenceLengthSharedMemory = (usedSharedMemory - (i - 1) * complexSize) / complexSize; - maxSequenceLengthSharedMemoryStrided_temp = (app->configuration.coalescedMemory > complexSize) ? 
(usedSharedMemory - (i - 1) * complexSize) / (app->configuration.coalescedMemory) : (usedSharedMemory - (i - 1) * complexSize) / complexSize; - limit_max_rader_prime = ((axis_id == nonStridedAxisId) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] <= maxSequenceLengthSharedMemory)) ? maxSequenceLengthSharedMemory : maxSequenceLengthSharedMemoryStrided_temp; - tempSequence_temp *= i; - tempSequence /= i; - i--; - continue; - } - tempSequence /= i; - for (uint64_t j = 0; j < 20; j++) { - if (rader_primes[j] == i) - { - rader_multipliers[j]++; - j = 20; - } - else if (rader_primes[j] == 0) { - rader_primes[j] = i; - rader_multipliers[j]++; - j = 20; - } - - } - i--; - } - } - tempSequence *= tempSequence_temp; - for (uint64_t i = app->configuration.fixMinRaderPrimeMult; i < app->configuration.fixMaxRaderPrimeMult; i++) { - if (tempSequence % i == 0) { - tempSequence /= i; - for (uint64_t j = 0; j < 20; j++) { - if (rader_primes[j] == i) - { - rader_multipliers[j]++; - j = 20; - } - else if (rader_primes[j] == 0) { - rader_primes[j] = i; - rader_multipliers[j]++; - j = 20; - } - - } - useRaderMult = i; - i--; - } - } - if (tempSequence != 1) { - useRaderMult = 0; - } - if (useRaderMult) { - if (tempSequence == 1) usedSharedMemory -= (useRaderMult - 1) * complexSize; //reserve memory for Rader - //check once again - if ((axis_id == 0) && (app->configuration.performR2C) && (app->configuration.size[axis_id] > maxSingleSizeNonStrided)) { - FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = app->configuration.size[axis_id] / 2; // now in actualFFTSize - modified dimension size for R2C/DCT - FFTPlan->actualPerformR2CPerAxis[axis_id] = 0; - FFTPlan->multiUploadR2C = 1; - } - } - maxSequenceLengthSharedMemory = usedSharedMemory / complexSize; - maxSingleSizeNonStrided = maxSequenceLengthSharedMemory; - } - //initial Bluestein check - if (tempSequence != 1) { - app->useBluesteinFFT[axis_id] = 1; - if (axis_id != nonStridedAxisId) { - if 
(app->configuration.performBandwidthBoost == 0) - axes->specializationConstants.performBandwidthBoost = 1; - } - app->configuration.registerBoost = 1; - tempSequence = 2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1; - uint64_t FFTSizeSelected = 0; - if (((app->configuration.useCustomBluesteinPaddingPattern > 0) || (app->configuration.autoCustomBluesteinPaddingPattern)) && (!app->configuration.fixMaxRadixBluestein)) { - uint64_t arr_limit = (app->configuration.useCustomBluesteinPaddingPattern) ? app->configuration.useCustomBluesteinPaddingPattern : app->configuration.autoCustomBluesteinPaddingPattern; - for (uint64_t i = 0; i < arr_limit; i++) { - if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] >= app->configuration.primeSizes[i]) { - if (i != (arr_limit - 1)) { - if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] < app->configuration.primeSizes[i + 1]) { - tempSequence = app->configuration.paddedSizes[i]; - FFTSizeSelected = 1; - i = arr_limit; - } - } - else { - if ((2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1) <= app->configuration.paddedSizes[i]) { - tempSequence = app->configuration.paddedSizes[i]; - FFTSizeSelected = 1; - i = arr_limit; - } - } - } - } - } - if (app->configuration.fixMaxRadixBluestein > 0) { - while (!FFTSizeSelected) { - uint64_t testSequence = tempSequence; - for (uint64_t i = 0; i < 33; i++) { - multipliers[i] = 0; - } - for (uint64_t i = 2; i < app->configuration.fixMaxRadixBluestein + 1; i++) { - if (testSequence % i == 0) { - testSequence /= i; - multipliers[i]++; - i--; - } - } - if (testSequence == 1) FFTSizeSelected = 1; - else tempSequence++; - } - } - else { - while (!FFTSizeSelected) { - if (axis_id == nonStridedAxisId) { - if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) <= maxSequenceLengthSharedMemory) || ((2 * 
FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemory)))) tempSequence = (uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))); - } - else { - uint64_t maxSequenceLengthSharedMemoryStrided_temp = (app->configuration.coalescedMemory > complexSize) ? usedSharedMemory / (app->configuration.coalescedMemory) : usedSharedMemory / complexSize; - if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) <= maxSequenceLengthSharedMemoryStrided_temp) || ((2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemoryStrided_temp)))) tempSequence = (uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))); - } - uint64_t testSequence = tempSequence; - for (uint64_t i = 0; i < 33; i++) { - multipliers[i] = 0; - } - for (uint64_t i = 2; i < 8; i++) { - if (testSequence % i == 0) { - testSequence /= i; - multipliers[i]++; - i--; - } - } - if (testSequence != 1) tempSequence++; - else { - uint64_t registers_per_thread_per_radix[33]; - uint64_t registers_per_thread = 0; - uint64_t min_registers_per_thread = -1; - uint64_t isGoodSequence = 0; - res = VkFFTGetRegistersPerThread(tempSequence, 0, max_rhs / tempSequence, axes->specializationConstants.useRader, multipliers, registers_per_thread_per_radix, ®isters_per_thread, &min_registers_per_thread, &isGoodSequence); - if (res != VKFFT_SUCCESS) return res; - if (isGoodSequence) FFTSizeSelected = 1; - else tempSequence++; - } - } - } - FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = tempSequence; - //check if padded system still single upload for r2c - else redo the optimization - if ((axis_id == 0) && (app->configuration.performR2C) && (!FFTPlan->multiUploadR2C) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] > maxSingleSizeNonStrided)) { - FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = app->configuration.size[axis_id] / 2; // now in 
actualFFTSize - modified dimension size for R2C/DCT - FFTPlan->actualPerformR2CPerAxis[axis_id] = 0; - FFTPlan->multiUploadR2C = 1; - tempSequence = 2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1; - FFTSizeSelected = 0; - if (((app->configuration.useCustomBluesteinPaddingPattern > 0) || (app->configuration.autoCustomBluesteinPaddingPattern)) && (!app->configuration.fixMaxRadixBluestein)) { - uint64_t arr_limit = (app->configuration.useCustomBluesteinPaddingPattern) ? app->configuration.useCustomBluesteinPaddingPattern : app->configuration.autoCustomBluesteinPaddingPattern; - for (uint64_t i = 0; i < arr_limit; i++) { - if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] >= app->configuration.primeSizes[i]) { - if (i != (arr_limit - 1)) { - if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] < app->configuration.primeSizes[i + 1]) { - tempSequence = app->configuration.paddedSizes[i]; - FFTSizeSelected = 1; - } - } - else { - if ((2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1) <= app->configuration.paddedSizes[i]) { - tempSequence = app->configuration.paddedSizes[i]; - FFTSizeSelected = 1; - } - } - } - } - } - if (app->configuration.fixMaxRadixBluestein > 0) { - while (!FFTSizeSelected) { - uint64_t testSequence = tempSequence; - for (uint64_t i = 0; i < 33; i++) { - multipliers[i] = 0; - } - for (uint64_t i = 2; i < app->configuration.fixMaxRadixBluestein + 1; i++) { - if (testSequence % i == 0) { - testSequence /= i; - multipliers[i]++; - i--; - } - } - if (testSequence == 1) FFTSizeSelected = 1; - else tempSequence++; - } - } - else { - while (!FFTSizeSelected) { - if (axis_id == nonStridedAxisId) { - if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) <= maxSequenceLengthSharedMemory) || ((2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemory)))) tempSequence = 
(uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))); - } - else { - uint64_t maxSequenceLengthSharedMemoryStrided_temp = (app->configuration.coalescedMemory > complexSize) ? usedSharedMemory / (app->configuration.coalescedMemory) : usedSharedMemory / complexSize; - if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] < 128) || ((((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) * 0.75) <= tempSequence) && (((uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))) <= maxSequenceLengthSharedMemoryStrided_temp) || ((2 * FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1) > maxSequenceLengthSharedMemoryStrided_temp)))) tempSequence = (uint64_t)pow(2, (uint64_t)ceil(log2(tempSequence))); - } - uint64_t testSequence = tempSequence; - for (uint64_t i = 0; i < 33; i++) { - multipliers[i] = 0; - } - for (uint64_t i = 2; i < 8; i++) { - if (testSequence % i == 0) { - testSequence /= i; - multipliers[i]++; - i--; - } - } - if (testSequence != 1) tempSequence++; - else { - uint64_t registers_per_thread_per_radix[33]; - uint64_t registers_per_thread = 0; - uint64_t min_registers_per_thread = -1; - uint64_t isGoodSequence = 0; - res = VkFFTGetRegistersPerThread(tempSequence, 0, max_rhs / tempSequence, axes->specializationConstants.useRader, multipliers, registers_per_thread_per_radix, ®isters_per_thread, &min_registers_per_thread, &isGoodSequence); - if (res != VKFFT_SUCCESS) return res; - if (isGoodSequence) FFTSizeSelected = 1; - else tempSequence++; - } - } - } - FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = tempSequence; - } - - if (app->configuration.forceBluesteinSequenceSize) FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] = app->configuration.forceBluesteinSequenceSize; - - if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] & (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1)) == 0) { - usedSharedMemory = app->configuration.sharedMemorySizePow2; - maxSequenceLengthSharedMemory = usedSharedMemory / complexSize; - maxSingleSizeNonStrided = 
maxSequenceLengthSharedMemory; - } - } - uint64_t isPowOf2 = (pow(2, (uint64_t)log2(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id])) == FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]) ? 1 : 0; - uint64_t locNumBatches = (app->configuration.numberBatches > app->actualNumBatches) ? app->configuration.numberBatches : app->actualNumBatches; - //return VKFFT_ERROR_UNSUPPORTED_RADIX; - uint64_t registerBoost = 1; - for (uint64_t i = 1; i <= app->configuration.registerBoost; i++) { - if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (i * i) == 0) - registerBoost = i; - } - if ((axis_id == nonStridedAxisId) && (!app->configuration.performConvolution)) maxSingleSizeNonStrided *= registerBoost; - uint64_t maxSequenceLengthSharedMemoryStrided = (app->configuration.coalescedMemory > complexSize) ? usedSharedMemory / (app->configuration.coalescedMemory) : usedSharedMemory / complexSize; - uint64_t maxSingleSizeStrided = (!app->configuration.performConvolution) ? maxSequenceLengthSharedMemoryStrided * registerBoost : maxSequenceLengthSharedMemoryStrided; - uint64_t numPasses = 1; - uint64_t numPassesHalfBandwidth = 1; - uint64_t temp; - temp = (axis_id == nonStridedAxisId) ? (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeNonStrided) : (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeStrided); - if (temp > 1) {//more passes than one - for (uint64_t i = 1; i <= app->configuration.registerBoost4Step; i++) { - if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (i * i) == 0) { - registerBoost = i; - } - } - if ((!app->configuration.performConvolution)) maxSingleSizeNonStrided = maxSequenceLengthSharedMemory * registerBoost; - if ((!app->configuration.performConvolution)) maxSingleSizeStrided = maxSequenceLengthSharedMemoryStrided * registerBoost; - temp = ((axis_id == nonStridedAxisId) && ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id]))) ? 
FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeNonStrided : FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeStrided; - if (app->configuration.reorderFourStep && (!app->useBluesteinFFT[axis_id])) - numPasses = (uint64_t)ceil(log2(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]) / log2(maxSingleSizeStrided)); - else - numPasses += (uint64_t)ceil(log2(temp) / log2(maxSingleSizeStrided)); - } - registerBoost = ((axis_id == nonStridedAxisId) && ((app->useBluesteinFFT[axis_id]) || (!app->configuration.reorderFourStep) || (numPasses == 1))) ? (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)(pow(maxSequenceLengthSharedMemoryStrided, numPasses - 1) * maxSequenceLengthSharedMemory)) : (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)pow(maxSequenceLengthSharedMemoryStrided, numPasses)); - uint64_t canBoost = 0; - for (uint64_t i = registerBoost; i <= app->configuration.registerBoost; i++) { - if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (i * i) == 0) { - registerBoost = i; - i = app->configuration.registerBoost + 1; - canBoost = 1; - } - } - if (((canBoost == 0) || (((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] & (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - 1)) != 0) && (!app->configuration.registerBoostNonPow2))) && (registerBoost > 1)) { - registerBoost = 1; - numPasses++; - } - maxSingleSizeNonStrided = maxSequenceLengthSharedMemory * registerBoost; - maxSingleSizeStrided = maxSequenceLengthSharedMemoryStrided * registerBoost; - uint64_t maxSingleSizeStridedHalfBandwidth = maxSingleSizeStrided; - if ((axes->specializationConstants.performBandwidthBoost)) { - maxSingleSizeStridedHalfBandwidth = (app->configuration.coalescedMemory / axes->specializationConstants.performBandwidthBoost > complexSize) ? 
usedSharedMemory / (app->configuration.coalescedMemory / axes->specializationConstants.performBandwidthBoost) : usedSharedMemory / complexSize; - temp = (axis_id == nonStridedAxisId) ? (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeNonStrided) : (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeStridedHalfBandwidth); - //temp = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeNonStrided; - if (temp > 1) {//more passes than two - temp = ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id])) ? (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeNonStrided) : (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (double)maxSingleSizeStridedHalfBandwidth); - for (uint64_t i = 0; i < 5; i++) { - temp = (uint64_t)ceil(temp / (double)maxSingleSizeStrided); - numPassesHalfBandwidth++; - if (temp == 1) i = 5; - } - /* - temp = ((axis_id == 0) && (!app->configuration.reorderFourStep)) ? 
FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeNonStrided : FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeStridedHalfBandwidth; - - if (app->configuration.reorderFourStep) - numPassesHalfBandwidth = (uint64_t)ceil(log2(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]) / log2(maxSingleSizeStridedHalfBandwidth)); - else - numPassesHalfBandwidth = 1 + (uint64_t)ceil(log2(temp) / log2(maxSingleSizeStridedHalfBandwidth)); - if ((numPassesHalfBandwidth == 2)&& (!app->configuration.reorderFourStep)&&(registerBoost>1)) //switch back for two step and don't do half bandwidth on strided accesses if register boost and no 4-step reordering - */ - } - if (numPassesHalfBandwidth < numPasses) numPasses = numPassesHalfBandwidth; - else maxSingleSizeStridedHalfBandwidth = maxSingleSizeStrided; - } - if (((uint64_t)log2(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]) >= app->configuration.swapTo3Stage4Step) && (app->configuration.swapTo3Stage4Step >= 17)) numPasses = 3;//Force set to 3 stage 4 step algorithm - uint64_t* locAxisSplit = FFTPlan->axisSplit[axis_id]; - if (numPasses == 1) { - locAxisSplit[0] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; - } - if (numPasses == 2) { - if (isPowOf2) { - if ((axis_id == nonStridedAxisId) && ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id]))) { - uint64_t maxPow8SharedMemory = (uint64_t)pow(8, ((uint64_t)log2(maxSequenceLengthSharedMemory)) / 3); - //unit stride - if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxPow8SharedMemory <= maxSingleSizeStrided) { - locAxisSplit[0] = maxPow8SharedMemory; - } - else { - if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSequenceLengthSharedMemory <= maxSingleSizeStrided) { - locAxisSplit[0] = maxSequenceLengthSharedMemory; - } - else { - if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory * registerBoost) < maxSingleSizeStridedHalfBandwidth) { - for (uint64_t i = 1; i <= 
(uint64_t)log2(registerBoost); i++) { - if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory * (uint64_t)pow(2, i)) <= maxSingleSizeStrided) { - locAxisSplit[0] = (maxSequenceLengthSharedMemory * (uint64_t)pow(2, i)); - i = (uint64_t)log2(registerBoost) + 1; - } - } - } - else { - locAxisSplit[0] = (maxSequenceLengthSharedMemory * registerBoost); - } - } - } - } - else { - uint64_t maxPow8Strided = (uint64_t)pow(8, ((uint64_t)log2(maxSingleSizeStrided)) / 3); - //all FFTs are considered as non-unit stride - if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxPow8Strided <= maxSingleSizeStrided) { - locAxisSplit[0] = maxPow8Strided; - } - else { - if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeStrided < maxSingleSizeStridedHalfBandwidth) { - locAxisSplit[0] = maxSingleSizeStrided; - } - else { - locAxisSplit[0] = maxSingleSizeStridedHalfBandwidth; - } - } - } - locAxisSplit[1] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[0]; - if (locAxisSplit[1] < 64) { - locAxisSplit[0] = (locAxisSplit[1] == 0) ? 
locAxisSplit[0] / (64) : locAxisSplit[0] / (64 / locAxisSplit[1]); - locAxisSplit[1] = 64; - } - if (locAxisSplit[1] > locAxisSplit[0]) { - uint64_t swap = locAxisSplit[0]; - locAxisSplit[0] = locAxisSplit[1]; - locAxisSplit[1] = swap; - } - } - else { - uint64_t successSplit = 0; - if ((axis_id == nonStridedAxisId) && ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id]))) { - /*for (uint64_t i = 0; i < maxSequenceLengthSharedMemory; i++) { - if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (maxSequenceLengthSharedMemory - i) == 0) { - if (((maxSequenceLengthSharedMemory - i) <= maxSequenceLengthSharedMemory) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i) <= maxSingleSizeStrided)) { - locAxisSplit[0] = (maxSequenceLengthSharedMemory - i); - locAxisSplit[1] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i); - i = maxSequenceLengthSharedMemory; - successSplit = 1; - } - } - }*/ - uint64_t sqrtSequence = (uint64_t)ceil(sqrt(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id])); - for (uint64_t i = 0; i < sqrtSequence; i++) { - if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (sqrtSequence - i) == 0) { - if ((sqrtSequence - i <= maxSingleSizeStrided) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrtSequence - i) <= maxSequenceLengthSharedMemory)) { - locAxisSplit[0] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrtSequence - i); - locAxisSplit[1] = sqrtSequence - i; - i = sqrtSequence; - successSplit = 1; - } - } - } - } - else { - uint64_t sqrtSequence = (uint64_t)ceil(sqrt(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id])); - for (uint64_t i = 0; i < sqrtSequence; i++) { - if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (sqrtSequence - i) == 0) { - if ((sqrtSequence - i <= maxSingleSizeStrided) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrtSequence - i) <= maxSingleSizeStridedHalfBandwidth)) { - locAxisSplit[0] = 
FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrtSequence - i); - locAxisSplit[1] = sqrtSequence - i; - i = sqrtSequence; - successSplit = 1; - } - } - } - } - if (successSplit == 0) - numPasses = 3; - } - } - if (numPasses == 3) { - if (isPowOf2) { - uint64_t maxPow8Strided = (uint64_t)pow(8, ((uint64_t)log2(maxSingleSizeStrided)) / 3); - if ((axis_id == nonStridedAxisId) && ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id]))) { - //unit stride - uint64_t maxPow8SharedMemory = (uint64_t)pow(8, ((uint64_t)log2(maxSequenceLengthSharedMemory)) / 3); - if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxPow8SharedMemory <= maxPow8Strided * maxPow8Strided) - locAxisSplit[0] = maxPow8SharedMemory; - else { - if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSequenceLengthSharedMemory <= maxSingleSizeStrided * maxSingleSizeStrided) - locAxisSplit[0] = maxSequenceLengthSharedMemory; - else { - if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory * registerBoost) <= maxSingleSizeStrided * maxSingleSizeStrided) { - for (uint64_t i = 0; i <= (uint64_t)log2(registerBoost); i++) { - if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory * (uint64_t)pow(2, i)) <= maxSingleSizeStrided * maxSingleSizeStrided) { - locAxisSplit[0] = (maxSequenceLengthSharedMemory * (uint64_t)pow(2, i)); - i = (uint64_t)log2(registerBoost) + 1; - } - } - } - else { - locAxisSplit[0] = (maxSequenceLengthSharedMemory * registerBoost); - } - } - } - } - else { - //to account for TLB misses, it is best to coalesce the unit-strided stage to 128 bytes - /*uint64_t log2axis = (uint64_t)log2(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]); - locAxisSplit[0] = (uint64_t)pow(2, (uint64_t)log2axis / 3); - if (log2axis % 3 > 0) locAxisSplit[0] *= 2; - locAxisSplit[1] = (uint64_t)pow(2, (uint64_t)log2axis / 3); - if (log2axis % 3 > 1) locAxisSplit[1] *= 2; - locAxisSplit[2] = 
FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[0] / locAxisSplit[1];*/ - uint64_t maxSingleSizeStrided128 = usedSharedMemory / (128); - uint64_t maxPow8_128 = (uint64_t)pow(8, ((uint64_t)log2(maxSingleSizeStrided128)) / 3); - //unit stride - if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxPow8_128 <= maxPow8Strided * maxSingleSizeStrided) - locAxisSplit[0] = maxPow8_128; - //non-unit stride - else { - - if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxPow8_128 * 2) <= maxPow8Strided * maxSingleSizeStrided) && (maxPow8_128 * 2 <= maxSingleSizeStrided128)) { - locAxisSplit[0] = maxPow8_128 * 2; - } - else { - if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxPow8_128 * 4) <= maxPow8Strided * maxSingleSizeStrided) && (maxPow8_128 * 4 <= maxSingleSizeStrided128)) { - locAxisSplit[0] = maxPow8_128 * 4; - } - else { - if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / maxSingleSizeStrided <= maxSingleSizeStrided * maxSingleSizeStrided) { - for (uint64_t i = 0; i <= (uint64_t)log2(maxSingleSizeStrided / maxSingleSizeStrided128); i++) { - if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSingleSizeStrided128 * (uint64_t)pow(2, i)) <= maxSingleSizeStrided * maxSingleSizeStrided) { - locAxisSplit[0] = (maxSingleSizeStrided128 * (uint64_t)pow(2, i)); - i = (uint64_t)log2(maxSingleSizeStrided / maxSingleSizeStrided128) + 1; - } - } - } - else - locAxisSplit[0] = maxSingleSizeStridedHalfBandwidth; - } - } - } - } - if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[0] / maxPow8Strided <= maxSingleSizeStrided) { - locAxisSplit[1] = maxPow8Strided; - locAxisSplit[2] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[1] / locAxisSplit[0]; - } - else { - if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[0] / maxSingleSizeStrided <= maxSingleSizeStrided) { - locAxisSplit[1] = maxSingleSizeStrided; - locAxisSplit[2] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[1] 
/ locAxisSplit[0]; - } - else { - locAxisSplit[1] = maxSingleSizeStridedHalfBandwidth; - locAxisSplit[2] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / locAxisSplit[1] / locAxisSplit[0]; - } - } - if (locAxisSplit[2] < 64) { - locAxisSplit[1] = (locAxisSplit[2] == 0) ? locAxisSplit[1] / (64) : locAxisSplit[1] / (64 / locAxisSplit[2]); - locAxisSplit[2] = 64; - } - if (locAxisSplit[2] > locAxisSplit[1]) { - uint64_t swap = locAxisSplit[1]; - locAxisSplit[1] = locAxisSplit[2]; - locAxisSplit[2] = swap; - } - } - else { - uint64_t successSplit = 0; - if ((axis_id == nonStridedAxisId) && ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id]))) { - for (uint64_t i = 0; i < maxSequenceLengthSharedMemory; i++) { - if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (maxSequenceLengthSharedMemory - i) == 0) { - uint64_t sqrt3Sequence = (uint64_t)ceil(sqrt(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i))); - for (uint64_t j = 0; j < sqrt3Sequence; j++) { - if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i)) % (sqrt3Sequence - j) == 0) { - if (((maxSequenceLengthSharedMemory - i) <= maxSequenceLengthSharedMemory) && (sqrt3Sequence - j <= maxSingleSizeStrided) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i) / (sqrt3Sequence - j) <= maxSingleSizeStrided)) { - locAxisSplit[0] = (maxSequenceLengthSharedMemory - i); - locAxisSplit[1] = sqrt3Sequence - j; - locAxisSplit[2] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (maxSequenceLengthSharedMemory - i) / (sqrt3Sequence - j); - i = maxSequenceLengthSharedMemory; - j = sqrt3Sequence; - successSplit = 1; - } - } - } - } - } - } - else { - uint64_t sqrt3Sequence = (uint64_t)ceil(pow(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id], 1.0 / 3.0)); - for (uint64_t i = 0; i < sqrt3Sequence; i++) { - if (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] % (sqrt3Sequence - i) == 0) { - uint64_t 
sqrt2Sequence = (uint64_t)ceil(sqrt(FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrt3Sequence - i))); - for (uint64_t j = 0; j < sqrt2Sequence; j++) { - if ((FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrt3Sequence - i)) % (sqrt2Sequence - j) == 0) { - if ((sqrt3Sequence - i <= maxSingleSizeStrided) && (sqrt2Sequence - j <= maxSingleSizeStrided) && (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrt3Sequence - i) / (sqrt2Sequence - j) <= maxSingleSizeStridedHalfBandwidth)) { - locAxisSplit[0] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / (sqrt3Sequence - i) / (sqrt2Sequence - j); - locAxisSplit[1] = sqrt3Sequence - i; - locAxisSplit[2] = sqrt2Sequence - j; - i = sqrt3Sequence; - j = sqrt2Sequence; - successSplit = 1; - } - } - } - } - } - } - if (successSplit == 0) - numPasses = 4; - } - } - if (numPasses > 3) { - //printf("sequence length exceeds boundaries\n"); - return VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH; - } - if ((numPasses > 1) && (app->configuration.performDCT > 0)) { - //printf("sequence length exceeds boundaries\n"); - return VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_DCT; - } - if ((numPasses > 1) && (app->configuration.performR2C > 0) && (axis_id == 0) && (app->configuration.size[axis_id] % 2 != 0)) { - //printf("sequence length exceeds boundaries\n"); - return VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2C; - } - if (app->configuration.tempBufferSize[0] == 0) { - if ((app->configuration.performR2C) && (axis_id == 0)) { - if (FFTPlan->multiUploadR2C) - app->configuration.tempBufferSize[0] = (FFTPlan->actualFFTSizePerAxis[axis_id][0] + 1) * FFTPlan->actualFFTSizePerAxis[axis_id][1] * FFTPlan->actualFFTSizePerAxis[axis_id][2] * app->configuration.coordinateFeatures * locNumBatches * app->configuration.numberKernels * complexSize; - } - else { - app->configuration.tempBufferSize[0] = FFTPlan->actualFFTSizePerAxis[axis_id][0] * FFTPlan->actualFFTSizePerAxis[axis_id][1] * FFTPlan->actualFFTSizePerAxis[axis_id][2] * 
app->configuration.coordinateFeatures * locNumBatches * app->configuration.numberKernels * complexSize; - } - } - if (app->useBluesteinFFT[axis_id]) { - if ((app->configuration.performR2C) && (axis_id == 0)) { - if (FFTPlan->multiUploadR2C) { - if ((FFTPlan->actualFFTSizePerAxis[axis_id][0] + 1) * FFTPlan->actualFFTSizePerAxis[axis_id][1] * FFTPlan->actualFFTSizePerAxis[axis_id][2] * app->configuration.coordinateFeatures * locNumBatches * app->configuration.numberKernels * complexSize > app->configuration.tempBufferSize[0]) app->configuration.tempBufferSize[0] = (FFTPlan->actualFFTSizePerAxis[axis_id][0] + 1) * FFTPlan->actualFFTSizePerAxis[axis_id][1] * FFTPlan->actualFFTSizePerAxis[axis_id][2] * app->configuration.coordinateFeatures * locNumBatches * app->configuration.numberKernels * complexSize; - } - } - else { - if (FFTPlan->actualFFTSizePerAxis[axis_id][0] * FFTPlan->actualFFTSizePerAxis[axis_id][1] * FFTPlan->actualFFTSizePerAxis[axis_id][2] * app->configuration.coordinateFeatures * locNumBatches * app->configuration.numberKernels * complexSize > app->configuration.tempBufferSize[0]) app->configuration.tempBufferSize[0] = FFTPlan->actualFFTSizePerAxis[axis_id][0] * FFTPlan->actualFFTSizePerAxis[axis_id][1] * FFTPlan->actualFFTSizePerAxis[axis_id][2] * app->configuration.coordinateFeatures * locNumBatches * app->configuration.numberKernels * complexSize; - } - } - if (((app->configuration.reorderFourStep) && (!app->useBluesteinFFT[axis_id]))) { - for (uint64_t i = 0; i < numPasses; i++) { - if ((locAxisSplit[0] % 2 != 0) && (locAxisSplit[i] % 2 == 0)) { - uint64_t swap = locAxisSplit[0]; - locAxisSplit[0] = locAxisSplit[i]; - locAxisSplit[i] = swap; - } - } - for (uint64_t i = 0; i < numPasses; i++) { - if ((locAxisSplit[0] % 4 != 0) && (locAxisSplit[i] % 4 == 0)) { - uint64_t swap = locAxisSplit[0]; - locAxisSplit[0] = locAxisSplit[i]; - locAxisSplit[i] = swap; - } - } - for (uint64_t i = 0; i < numPasses; i++) { - if ((locAxisSplit[0] % 8 != 0) && 
(locAxisSplit[i] % 8 == 0)) { - uint64_t swap = locAxisSplit[0]; - locAxisSplit[0] = locAxisSplit[i]; - locAxisSplit[i] = swap; - } - } - } - FFTPlan->numAxisUploads[axis_id] = numPasses; - for (uint64_t k = 0; k < numPasses; k++) { - tempSequence = locAxisSplit[k]; - uint64_t loc_multipliers[33]; //split the smaller sequence - //split the smaller sequence - //uint64_t rader_multipliers[20]; //split the smaller sequence - //uint64_t* rader_generator = axes[k].specializationConstants.rader_generator_sorted; //split the smaller sequence - //uint64_t* rader_primes = axes[k].specializationConstants.rader_primes; - - for (uint64_t i = 0; i < 33; i++) { - loc_multipliers[i] = 0; - } - - for (uint64_t i = 2; i < app->configuration.fixMinRaderPrimeMult; i++) { - if (tempSequence % i == 0) { - tempSequence /= i; - loc_multipliers[i]++; - i--; - } - } - axes[k].specializationConstants.useRader = 0; - axes[k].specializationConstants.useRaderMult = 0; - axes[k].specializationConstants.useRaderFFT = 0; - if (tempSequence != 1) { - res = VkFFTConstructRaderTree(app, &axes[k].specializationConstants.raderContainer, &tempSequence, &axes[k].specializationConstants.numRaderPrimes, locAxisSplit[k] / tempSequence); - if (res != VKFFT_SUCCESS) return res; - } - - for (int64_t i = 0; i < (int64_t)axes[k].specializationConstants.numRaderPrimes; i++) { - if (axes[k].specializationConstants.raderContainer[i].type == 0) { - if (axes[k].specializationConstants.useRaderFFT < axes[k].specializationConstants.raderContainer[i].prime) axes[k].specializationConstants.useRaderFFT = axes[k].specializationConstants.raderContainer[i].prime; - } - else { - if (axes[k].specializationConstants.useRaderMult < axes[k].specializationConstants.raderContainer[i].prime) axes[k].specializationConstants.useRaderMult = axes[k].specializationConstants.raderContainer[i].prime; - } - } - if (axes[k].specializationConstants.useRaderMult) { - app->configuration.useLUT = 1; // workaround, Mult Rader is better with LUT 
- } - - axes[k].specializationConstants.useRader = axes[k].specializationConstants.numRaderPrimes; - - if ((axes[k].specializationConstants.useRader) && (app->configuration.useRaderUintLUT)) { - app->configuration.useLUT = 1; // useRaderUintLUT forces LUT - } - - uint64_t registers_per_thread_per_radix[33]; - uint64_t registers_per_thread = 0; - uint64_t min_registers_per_thread = -1; - uint64_t isGoodSequence = 0; - uint64_t extraSharedMemoryForPow2 = ((app->configuration.sharedMemorySizePow2 < app->configuration.sharedMemorySize) || ((locAxisSplit[k] < maxSingleSizeNonStrided) && ((axis_id == nonStridedAxisId))) || ((locAxisSplit[k] < maxSingleSizeStrided) && ((axis_id != nonStridedAxisId)))) ? 1 : 0; - - res = VkFFTGetRegistersPerThread(locAxisSplit[k], extraSharedMemoryForPow2, max_rhs / locAxisSplit[k], axes[k].specializationConstants.numRaderPrimes, loc_multipliers, registers_per_thread_per_radix, ®isters_per_thread, &min_registers_per_thread, &isGoodSequence); - if (res != VKFFT_SUCCESS) return res; - //first optimizer pass - if (axes[k].specializationConstants.numRaderPrimes) { - res = VkFFTOptimizeRaderFFTRegisters(axes[k].specializationConstants.raderContainer, axes[k].specializationConstants.numRaderPrimes, locAxisSplit[k], &min_registers_per_thread, ®isters_per_thread, registers_per_thread_per_radix); - if (res != VKFFT_SUCCESS) return res; - /*for (int64_t i = 0; i < axes[k].specializationConstants.numRaderPrimes; i++) { - if (axes[k].specializationConstants.raderContainer[i].type == 0) { - if (axes[k].specializationConstants.raderContainer[i].min_registers_per_thread / min_registers_per_thread >= 2) { - min_registers_per_thread *= (axes[k].specializationConstants.raderContainer[i].min_registers_per_thread / min_registers_per_thread); - for (uint64_t j = 0; j < 33; j++) { - if ((registers_per_thread_per_radix[j] > 0) && (registers_per_thread_per_radix[j] < min_registers_per_thread)) registers_per_thread_per_radix[j] *= 
(uint64_t)ceil(min_registers_per_thread / (double)registers_per_thread_per_radix[j]); - } - for (uint64_t j = 0; j < 33; j++) { - if (registers_per_thread_per_radix[j] > registers_per_thread) registers_per_thread = registers_per_thread_per_radix[j]; - } - } - else if (min_registers_per_thread / axes[k].specializationConstants.raderContainer[i].min_registers_per_thread >= 2) { - axes[k].specializationConstants.raderContainer[i].min_registers_per_thread *= (min_registers_per_thread / axes[k].specializationConstants.raderContainer[i].min_registers_per_thread); - for (uint64_t j = 0; j < 33; j++) { - if ((axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] > 0) && (axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] < axes[k].specializationConstants.raderContainer[i].min_registers_per_thread)) axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] *= (uint64_t)ceil(axes[k].specializationConstants.raderContainer[i].min_registers_per_thread / (double)axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j]); - } - for (uint64_t j = 0; j < 33; j++) { - if (axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] > axes[k].specializationConstants.raderContainer[i].registers_per_thread) axes[k].specializationConstants.raderContainer[i].registers_per_thread = axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j]; - } - } - if (axes[k].specializationConstants.raderContainer[i].registers_per_thread > registers_per_thread) registers_per_thread = axes[k].specializationConstants.raderContainer[i].registers_per_thread; - if (axes[k].specializationConstants.raderContainer[i].min_registers_per_thread < min_registers_per_thread) min_registers_per_thread = axes[k].specializationConstants.raderContainer[i].min_registers_per_thread; - } - }*/ - } - - if ((registerBoost == 4) && (registers_per_thread % 4 != 0)) { 
- registers_per_thread *= 2; - for (uint64_t i = 2; i < 33; i++) { - registers_per_thread_per_radix[i] *= 2; - } - min_registers_per_thread *= 2; - } - uint64_t maxBatchCoalesced = ((axis_id == 0) && (((k == 0) && ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id]))) || (numPasses == 1))) ? 1 : app->configuration.coalescedMemory / complexSize; - uint64_t estimate_rader_threadnum = 0; - uint64_t scale_registers_rader = 0; - uint64_t rader_min_registers = min_registers_per_thread; - - if (axes[k].specializationConstants.useRaderMult) { - for (int64_t i = 0; i < (int64_t)axes[k].specializationConstants.numRaderPrimes; i++) { - if (axes[k].specializationConstants.raderContainer[i].type == 1) { - uint64_t temp_rader = (uint64_t)ceil((locAxisSplit[k] / (double)((rader_min_registers / 2 + scale_registers_rader) * 2)) / (double)((axes[k].specializationConstants.raderContainer[i].prime + 1) / 2)); - uint64_t active_rader = (uint64_t)ceil((locAxisSplit[k] / axes[k].specializationConstants.raderContainer[i].prime) / (double)temp_rader); - if (active_rader > 1) { - if ((((double)active_rader - (locAxisSplit[k] / axes[k].specializationConstants.raderContainer[i].prime) / (double)temp_rader) >= 0.5) && ((((uint64_t)ceil((locAxisSplit[k] / axes[k].specializationConstants.raderContainer[i].prime) / (double)(active_rader - 1)) * ((axes[k].specializationConstants.raderContainer[i].prime + 1) / 2)) * maxBatchCoalesced) <= app->configuration.maxThreadsNum)) active_rader--; - } - - uint64_t local_estimate_rader_threadnum = (uint64_t)ceil((locAxisSplit[k] / axes[k].specializationConstants.raderContainer[i].prime) / (double)active_rader) * ((axes[k].specializationConstants.raderContainer[i].prime + 1) / 2) * maxBatchCoalesced; - if ((maxBatchCoalesced * locAxisSplit[k] / ((rader_min_registers / 2 + scale_registers_rader) * 2 * registerBoost)) > local_estimate_rader_threadnum) local_estimate_rader_threadnum = (maxBatchCoalesced * locAxisSplit[k] / 
((rader_min_registers / 2 + scale_registers_rader) * 2 * registerBoost)); - if ((local_estimate_rader_threadnum > app->configuration.maxThreadsNum) || ((((locAxisSplit[k] / min_registers_per_thread) > 256) || (local_estimate_rader_threadnum > 256)) && (((rader_min_registers / 2 + scale_registers_rader) * 2) <= 4))) { - scale_registers_rader++; - i = -1; - } - else { - estimate_rader_threadnum = (estimate_rader_threadnum < local_estimate_rader_threadnum) ? local_estimate_rader_threadnum : estimate_rader_threadnum; - } - } - } - rader_min_registers = (rader_min_registers / 2 + scale_registers_rader) * 2;//min number of registers for Rader (can be more than min_registers_per_thread, but min_registers_per_thread should be at least 4 for Nvidiaif you have >256 threads) - if (registers_per_thread < rader_min_registers) registers_per_thread = rader_min_registers; - for (uint64_t i = 2; i < 33; i++) { - if (registers_per_thread_per_radix[i] != 0) { - if (registers_per_thread / registers_per_thread_per_radix[i] >= 2) { - registers_per_thread_per_radix[i] *= (registers_per_thread / registers_per_thread_per_radix[i]); - } - } - } - for (int64_t i = 0; i < (int64_t)axes[k].specializationConstants.numRaderPrimes; i++) { - if (axes[k].specializationConstants.raderContainer[i].type == 0) { - for (uint64_t j = 2; j < 33; j++) { - if (axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] != 0) { - if (registers_per_thread / axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] >= 2) { - axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] *= (registers_per_thread / axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j]); - } - } - } - } - } - uint64_t new_min_registers = -1; - for (uint64_t i = 2; i < 33; i++) { - if ((registers_per_thread_per_radix[i] > 0) && (registers_per_thread_per_radix[i] < new_min_registers)) new_min_registers = 
registers_per_thread_per_radix[i]; - if (registers_per_thread_per_radix[i] > registers_per_thread) { - registers_per_thread = registers_per_thread_per_radix[i]; - } - } - for (int64_t i = 0; i < (int64_t)axes[k].specializationConstants.numRaderPrimes; i++) { - if (axes[k].specializationConstants.raderContainer[i].type == 0) { - for (uint64_t j = 2; j < 33; j++) { - if ((axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] > 0) && (axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] < new_min_registers)) new_min_registers = axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j]; - if (axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] > registers_per_thread) { - registers_per_thread = axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j]; - } - } - } - } - min_registers_per_thread = (new_min_registers == -1) ? registers_per_thread : new_min_registers; - } - if ((maxBatchCoalesced * locAxisSplit[k] / (min_registers_per_thread * registerBoost) > app->configuration.maxThreadsNum) || (axes[k].specializationConstants.useRader && (estimate_rader_threadnum > app->configuration.maxThreadsNum))) - { - uint64_t scaleRegistersNum = 1; - if ((axis_id == 0) && (k == 0) && (maxBatchCoalesced > 1)) { - maxBatchCoalesced = app->configuration.maxThreadsNum * (min_registers_per_thread * registerBoost) / locAxisSplit[k]; - if (maxBatchCoalesced < 1) maxBatchCoalesced = 1; - } - if ((maxBatchCoalesced * locAxisSplit[k] / (min_registers_per_thread * registerBoost * scaleRegistersNum)) > app->configuration.maxThreadsNum) { - for (uint64_t i = 2; i < locAxisSplit[k]; i++) { - if (((maxBatchCoalesced * locAxisSplit[k] / (min_registers_per_thread * registerBoost * i)) <= app->configuration.maxThreadsNum)) { - scaleRegistersNum = i; - i = locAxisSplit[k]; - } - } - } - min_registers_per_thread *= scaleRegistersNum; - registers_per_thread 
*= scaleRegistersNum; - for (uint64_t i = 2; i < 33; i++) { - if (registers_per_thread_per_radix[i] != 0) { - registers_per_thread_per_radix[i] *= scaleRegistersNum; - } - } - for (int64_t i = 0; i < (int64_t)axes[k].specializationConstants.numRaderPrimes; i++) { - if (axes[k].specializationConstants.raderContainer[i].type == 0) { - for (uint64_t j = 2; j < 33; j++) { - if (axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] != 0) { - axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] *= scaleRegistersNum; - } - } - } - } - if (min_registers_per_thread > registers_per_thread) { - temp = min_registers_per_thread; - min_registers_per_thread = registers_per_thread; - registers_per_thread = temp; - } - for (uint64_t i = 2; i < 33; i++) { - if (registers_per_thread_per_radix[i] > registers_per_thread) { - registers_per_thread = registers_per_thread_per_radix[i]; - } - if ((registers_per_thread_per_radix[i] > 0) && (registers_per_thread_per_radix[i] < min_registers_per_thread)) { - min_registers_per_thread = registers_per_thread_per_radix[i]; - } - } - for (int64_t i = 0; i < (int64_t)axes[k].specializationConstants.numRaderPrimes; i++) { - if (axes[k].specializationConstants.raderContainer[i].type == 0) { - for (uint64_t j = 2; j < 33; j++) { - if (axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] > registers_per_thread) { - registers_per_thread = axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j]; - } - if ((axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] > 0) && (axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j] < min_registers_per_thread)) { - min_registers_per_thread = axes[k].specializationConstants.raderContainer[i].registers_per_thread_per_radix[j]; - } - } - } - } - if ((loc_multipliers[3] >= 2) && (((registers_per_thread / min_registers_per_thread) % 3) == 0) && 
(axes[k].specializationConstants.numRaderPrimes == 0)) { - registers_per_thread /= 3; - for (uint64_t i = 2; i < 33; i++) { - if (registers_per_thread_per_radix[i] % 9 == 0) { - registers_per_thread_per_radix[i] /= 3; - } - } - for (uint64_t i = 2; i < 33; i++) { - if (registers_per_thread_per_radix[i] > registers_per_thread) { - registers_per_thread = registers_per_thread_per_radix[i]; - } - if ((registers_per_thread_per_radix[i] > 0) && (registers_per_thread_per_radix[i] < min_registers_per_thread)) { - min_registers_per_thread = registers_per_thread_per_radix[i]; - } - } - } - } - //second optimizer pass - if (axes[k].specializationConstants.numRaderPrimes) { - res = VkFFTOptimizeRaderFFTRegisters(axes[k].specializationConstants.raderContainer, axes[k].specializationConstants.numRaderPrimes, locAxisSplit[k], &min_registers_per_thread, ®isters_per_thread, registers_per_thread_per_radix); - if (res != VKFFT_SUCCESS) return res; - } - - axes[k].specializationConstants.maxNonPow2Radix = 1; - axes[k].specializationConstants.usedLocRegs = 1; - - res = VkFFTOptimizeRadixKernels(registers_per_thread_per_radix, loc_multipliers, registerBoost, &axes[k].specializationConstants.maxNonPow2Radix, &axes[k].specializationConstants.usedLocRegs, axes[k].specializationConstants.raderContainer, axes[k].specializationConstants.numRaderPrimes); - if (res != VKFFT_SUCCESS) return res; - - for (uint64_t i = 2; i < 33; i++) { - axes[k].specializationConstants.registers_per_thread_per_radix[i] = registers_per_thread_per_radix[i]; - } - axes[k].specializationConstants.numStages = 0; - axes[k].specializationConstants.fftDim = locAxisSplit[k]; - uint64_t tempRegisterBoost = registerBoost;// ((axis_id == nonStridedAxisId) && ((!app->configuration.reorderFourStep)||(app->useBluesteinFFT[axis_id]))) ? 
(uint64_t)ceil(axes[k].specializationConstants.fftDim / (double)maxSingleSizeNonStrided) : (uint64_t)ceil(axes[k].specializationConstants.fftDim / (double)maxSingleSizeStrided); - uint64_t switchRegisterBoost = 0; - if (tempRegisterBoost > 1) { - if (loc_multipliers[tempRegisterBoost] > 0) { - loc_multipliers[tempRegisterBoost]--; - switchRegisterBoost = tempRegisterBoost; - } - else { - for (uint64_t i = 32; i > 1; i--) { - if (loc_multipliers[i] > 0) { - loc_multipliers[i]--; - switchRegisterBoost = i; - i = 1; - } - } - } - } - - res = VkFFTGetRaderFFTStages(axes[k].specializationConstants.raderContainer, axes[k].specializationConstants.numRaderPrimes, &axes[k].specializationConstants.numStages, axes[k].specializationConstants.stageRadix, axes[k].specializationConstants.rader_generator); - if (res != VKFFT_SUCCESS) return res; - - for (uint64_t i = 32; i > 1; i--) { - if (loc_multipliers[i] > 0) { - axes[k].specializationConstants.stageRadix[axes[k].specializationConstants.numStages] = i; - loc_multipliers[i]--; - i++; - axes[k].specializationConstants.numStages++; - } - } - - //add more registers for Rader FFT if needed - if (axes[k].specializationConstants.useRaderMult) { - axes[k].specializationConstants.rader_min_registers = rader_min_registers; - for (uint64_t i = 0; i < axes[k].specializationConstants.numRaderPrimes; i++) { - if (axes[k].specializationConstants.raderContainer[i].type == 1) { - uint64_t temp_rader = (uint64_t)ceil((locAxisSplit[k] / (double)axes[k].specializationConstants.rader_min_registers) / (double)((axes[k].specializationConstants.raderContainer[i].prime + 1) / 2)); - uint64_t active_rader = (uint64_t)ceil((locAxisSplit[k] / axes[k].specializationConstants.raderContainer[i].prime) / (double)temp_rader); - if (active_rader > 1) { - if ((((double)active_rader - (locAxisSplit[k] / axes[k].specializationConstants.raderContainer[i].prime) / (double)temp_rader) >= 0.5) && ((((uint64_t)ceil((locAxisSplit[k] / 
axes[k].specializationConstants.raderContainer[i].prime) / (double)(active_rader - 1)) * ((axes[k].specializationConstants.raderContainer[i].prime + 1) / 2)) * maxBatchCoalesced) <= app->configuration.maxThreadsNum)) active_rader--; - } - axes[k].specializationConstants.raderRegisters = (active_rader * 2 > axes[k].specializationConstants.raderRegisters) ? active_rader * 2 : axes[k].specializationConstants.raderRegisters; - if (active_rader * 2 > registers_per_thread) registers_per_thread = active_rader * 2; - } - } - if (axes[k].specializationConstants.raderRegisters < axes[k].specializationConstants.rader_min_registers) axes[k].specializationConstants.raderRegisters = axes[k].specializationConstants.rader_min_registers; - } - - //final check up on all registers, increase if bigger - registers_per_thread = 0; - min_registers_per_thread = -1; - if (axes[k].specializationConstants.useRaderMult) { - registers_per_thread = axes[k].specializationConstants.raderRegisters; - min_registers_per_thread = axes[k].specializationConstants.rader_min_registers; - } - res = VkFFTMinMaxRegisterCheck(axes[k].specializationConstants.numStages, axes[k].specializationConstants.stageRadix, &min_registers_per_thread, ®isters_per_thread, axes[k].specializationConstants.registers_per_thread_per_radix, axes[k].specializationConstants.raderContainer, axes[k].specializationConstants.numRaderPrimes, axes[k].specializationConstants.rader_generator);; - if (res != VKFFT_SUCCESS) return res; - axes[k].specializationConstants.minRaderFFTThreadNum = 0; - res = VkFFTGetRaderFFTThreadsNum(axes[k].specializationConstants.raderContainer, axes[k].specializationConstants.numRaderPrimes, &axes[k].specializationConstants.minRaderFFTThreadNum); - if (res != VKFFT_SUCCESS) return res; - axes[k].specializationConstants.registerBoost = registerBoost; - axes[k].specializationConstants.registers_per_thread = registers_per_thread; - axes[k].specializationConstants.min_registers_per_thread = 
min_registers_per_thread; - - if (switchRegisterBoost > 0) { - axes[k].specializationConstants.stageRadix[axes[k].specializationConstants.numStages] = switchRegisterBoost; - axes[k].specializationConstants.numStages++; - } - else { - //try to read directly to registers - if (min_registers_per_thread != registers_per_thread) { - for (uint64_t i = 0; i < axes[k].specializationConstants.numStages; i++) { - if (axes[k].specializationConstants.registers_per_thread_per_radix[axes[k].specializationConstants.stageRadix[i]] == min_registers_per_thread) { - uint64_t stageid = axes[k].specializationConstants.stageRadix[i]; - axes[k].specializationConstants.stageRadix[i] = axes[k].specializationConstants.stageRadix[0]; - axes[k].specializationConstants.stageRadix[0] = stageid; - if (axes[k].specializationConstants.useRader) { - stageid = axes[k].specializationConstants.rader_generator[i]; - axes[k].specializationConstants.rader_generator[i] = axes[k].specializationConstants.rader_generator[0]; - axes[k].specializationConstants.rader_generator[0] = stageid; - } - i = axes[k].specializationConstants.numStages; - } - } - } - } - } - return VKFFT_SUCCESS; -} -static inline VkFFTResult VkFFTGeneratePhaseVectors(VkFFTApplication* app, VkFFTPlan* FFTPlan, uint64_t axis_id) { - //generate two arrays used for Blueestein convolution and post-convolution multiplication - double double_PI = 3.1415926535897932384626433832795; - VkFFTResult resFFT = VKFFT_SUCCESS; - VkFFTApplication kernelPreparationApplication = {}; - VkFFTConfiguration kernelPreparationConfiguration = {}; - - kernelPreparationConfiguration.FFTdim = 1; - kernelPreparationConfiguration.size[0] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; - kernelPreparationConfiguration.size[1] = 1; - kernelPreparationConfiguration.size[2] = 1; - kernelPreparationConfiguration.doublePrecision = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory); - kernelPreparationConfiguration.useLUT = 1; - 
kernelPreparationConfiguration.registerBoost = 1; - kernelPreparationConfiguration.disableReorderFourStep = 1; - kernelPreparationConfiguration.fixMinRaderPrimeFFT = 17; - kernelPreparationConfiguration.fixMinRaderPrimeMult = 17; - kernelPreparationConfiguration.fixMaxRaderPrimeFFT = 17; - kernelPreparationConfiguration.fixMaxRaderPrimeMult = 17; - - kernelPreparationConfiguration.saveApplicationToString = app->configuration.saveApplicationToString; - kernelPreparationConfiguration.loadApplicationFromString = app->configuration.loadApplicationFromString; - if (kernelPreparationConfiguration.loadApplicationFromString) { - kernelPreparationConfiguration.loadApplicationString = (void*)((char*)app->configuration.loadApplicationString + app->currentApplicationStringPos); - } - kernelPreparationConfiguration.performBandwidthBoost = (app->configuration.performBandwidthBoost > 0) ? app->configuration.performBandwidthBoost : 1; - if (axis_id == 0) kernelPreparationConfiguration.performBandwidthBoost = 0; - if (axis_id > 0) kernelPreparationConfiguration.considerAllAxesStrided = 1; - if (app->configuration.tempBuffer) { - kernelPreparationConfiguration.userTempBuffer = 1; - kernelPreparationConfiguration.tempBuffer = app->configuration.tempBuffer; - kernelPreparationConfiguration.tempBufferSize = app->configuration.tempBufferSize; - kernelPreparationConfiguration.tempBufferNum = app->configuration.tempBufferNum; - } - kernelPreparationConfiguration.device = app->configuration.device; -#if(VKFFT_BACKEND==0) - kernelPreparationConfiguration.queue = app->configuration.queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers - kernelPreparationConfiguration.fence = app->configuration.fence; - kernelPreparationConfiguration.commandPool = app->configuration.commandPool; - kernelPreparationConfiguration.physicalDevice = app->configuration.physicalDevice; - kernelPreparationConfiguration.isCompilerInitialized = 1;//compiler 
can be initialized before VkFFT plan creation. if not, VkFFT will create and destroy one after initialization - kernelPreparationConfiguration.tempBufferDeviceMemory = app->configuration.tempBufferDeviceMemory; -#elif(VKFFT_BACKEND==3) - kernelPreparationConfiguration.context = app->configuration.context; -#elif(VKFFT_BACKEND==4) - kernelPreparationConfiguration.context = app->configuration.context; - kernelPreparationConfiguration.commandQueue = app->configuration.commandQueue; - kernelPreparationConfiguration.commandQueueID = app->configuration.commandQueueID; -#endif - - uint64_t bufferSize = (uint64_t)sizeof(float) * 2 * kernelPreparationConfiguration.size[0] * kernelPreparationConfiguration.size[1] * kernelPreparationConfiguration.size[2]; - if (kernelPreparationConfiguration.doublePrecision) bufferSize *= sizeof(double) / sizeof(float); - app->bufferBluesteinSize[axis_id] = bufferSize; - kernelPreparationConfiguration.inputBufferSize = &app->bufferBluesteinSize[axis_id]; - kernelPreparationConfiguration.bufferSize = &app->bufferBluesteinSize[axis_id]; - kernelPreparationConfiguration.isInputFormatted = 1; - resFFT = initializeVkFFT(&kernelPreparationApplication, kernelPreparationConfiguration); - if (resFFT != VKFFT_SUCCESS) return resFFT; - if (kernelPreparationConfiguration.loadApplicationFromString) { - app->currentApplicationStringPos += kernelPreparationApplication.currentApplicationStringPos; - } -#if(VKFFT_BACKEND==0) - VkResult res = VK_SUCCESS; - resFFT = allocateFFTBuffer(app, &app->bufferBluestein[axis_id], &app->bufferBluesteinDeviceMemory[axis_id], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, bufferSize); - if (resFFT != VKFFT_SUCCESS) return resFFT; - if (!app->configuration.makeInversePlanOnly) { - resFFT = allocateFFTBuffer(app, &app->bufferBluesteinFFT[axis_id], &app->bufferBluesteinFFTDeviceMemory[axis_id], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | 
VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, bufferSize); - if (resFFT != VKFFT_SUCCESS) return resFFT; - } - if (!app->configuration.makeForwardPlanOnly) { - resFFT = allocateFFTBuffer(app, &app->bufferBluesteinIFFT[axis_id], &app->bufferBluesteinIFFTDeviceMemory[axis_id], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, bufferSize); - if (resFFT != VKFFT_SUCCESS) return resFFT; - } -#elif(VKFFT_BACKEND==1) - cudaError_t res = cudaSuccess; - res = cudaMalloc((void**)&app->bufferBluestein[axis_id], bufferSize); - if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE; - if (!app->configuration.makeInversePlanOnly) { - res = cudaMalloc((void**)&app->bufferBluesteinFFT[axis_id], bufferSize); - if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } - if (!app->configuration.makeForwardPlanOnly) { - res = cudaMalloc((void**)&app->bufferBluesteinIFFT[axis_id], bufferSize); - if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } -#elif(VKFFT_BACKEND==2) - hipError_t res = hipSuccess; - res = hipMalloc((void**)&app->bufferBluestein[axis_id], bufferSize); - if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE; - if (!app->configuration.makeInversePlanOnly) { - res = hipMalloc((void**)&app->bufferBluesteinFFT[axis_id], bufferSize); - if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } - if (!app->configuration.makeForwardPlanOnly) { - res = hipMalloc((void**)&app->bufferBluesteinIFFT[axis_id], bufferSize); - if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } -#elif(VKFFT_BACKEND==3) - cl_int res = CL_SUCCESS; - app->bufferBluestein[axis_id] = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_WRITE, bufferSize, 0, &res); - if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE; - if (!app->configuration.makeInversePlanOnly) { - 
app->bufferBluesteinFFT[axis_id] = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_WRITE, bufferSize, 0, &res); - if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } - if (!app->configuration.makeForwardPlanOnly) { - app->bufferBluesteinIFFT[axis_id] = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_WRITE, bufferSize, 0, &res); - if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } - cl_command_queue commandQueue = clCreateCommandQueue(app->configuration.context[0], app->configuration.device[0], 0, &res); - if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_QUEUE; -#elif(VKFFT_BACKEND==4) - ze_result_t res = ZE_RESULT_SUCCESS; - - ze_device_mem_alloc_desc_t device_desc = {}; - device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; - res = zeMemAllocDevice(app->configuration.context[0], &device_desc, bufferSize, sizeof(float), app->configuration.device[0], &app->bufferBluestein[axis_id]); - if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE; - - if (!app->configuration.makeInversePlanOnly) { - res = zeMemAllocDevice(app->configuration.context[0], &device_desc, bufferSize, sizeof(float), app->configuration.device[0], &app->bufferBluesteinFFT[axis_id]); - if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } - if (!app->configuration.makeForwardPlanOnly) { - res = zeMemAllocDevice(app->configuration.context[0], &device_desc, bufferSize, sizeof(float), app->configuration.device[0], &app->bufferBluesteinIFFT[axis_id]); - if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } -#endif - void* phaseVectors = malloc(bufferSize); - if (!phaseVectors) { - deleteVkFFT(&kernelPreparationApplication); - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - uint64_t phaseVectorsNonZeroSize = (((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) || ((FFTPlan->multiUploadR2C) && (axis_id == 0))) ? 
app->configuration.size[axis_id] / 2 : app->configuration.size[axis_id]; - if (app->configuration.performDCT == 1) phaseVectorsNonZeroSize = 2 * app->configuration.size[axis_id] - 2; - if ((FFTPlan->numAxisUploads[axis_id] > 1) && (!app->configuration.makeForwardPlanOnly)) { - if (kernelPreparationConfiguration.doublePrecision) { - double* phaseVectors_cast = (double*)phaseVectors; - for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { - uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize); - double angle = double_PI * rm / phaseVectorsNonZeroSize; - phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (double)cos(angle) : 0; - phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? (double)-sin(angle) : 0; - } - for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) { - phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[2 * i]; - phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[2 * i + 1]; - } - } - else { - float* phaseVectors_cast = (float*)phaseVectors; - for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { - uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize); - double angle = double_PI * rm / phaseVectorsNonZeroSize; - phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (float)cos(angle) : 0; - phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? 
(float)-sin(angle) : 0; - } - for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) { - phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[2 * i]; - phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[2 * i + 1]; - } - } -#if(VKFFT_BACKEND==0) - resFFT = transferDataFromCPU(&kernelPreparationApplication, phaseVectors, &app->bufferBluestein[axis_id], bufferSize); - if (resFFT != VKFFT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return resFFT; - } -#elif(VKFFT_BACKEND==1) - res = cudaMemcpy(app->bufferBluestein[axis_id], phaseVectors, bufferSize, cudaMemcpyHostToDevice); - if (res != cudaSuccess) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_COPY; - } -#elif(VKFFT_BACKEND==2) - res = hipMemcpy(app->bufferBluestein[axis_id], phaseVectors, bufferSize, hipMemcpyHostToDevice); - if (res != hipSuccess) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_COPY; - } -#elif(VKFFT_BACKEND==3) - res = clEnqueueWriteBuffer(commandQueue, app->bufferBluestein[axis_id], CL_TRUE, 0, bufferSize, phaseVectors, 0, NULL, NULL); - if (res != CL_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_COPY; - } -#elif(VKFFT_BACKEND==4) - ze_command_queue_desc_t commandQueueCopyDesc = { - ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, - 0, - app->configuration.commandQueueID, - 0, // index - 0, // flags - ZE_COMMAND_QUEUE_MODE_DEFAULT, - ZE_COMMAND_QUEUE_PRIORITY_NORMAL - }; - ze_command_list_handle_t copyCommandList; - res = zeCommandListCreateImmediate(app->configuration.context[0], app->configuration.device[0], &commandQueueCopyDesc, ©CommandList); - if (res != ZE_RESULT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; - } - res = 
zeCommandListAppendMemoryCopy(copyCommandList, app->bufferBluestein[axis_id], phaseVectors, bufferSize, 0, 0, 0); - if (res != ZE_RESULT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_COPY; - } - res = zeCommandQueueSynchronize(app->configuration.commandQueue[0], UINT32_MAX); - if (res != ZE_RESULT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; - } -#endif -#if(VKFFT_BACKEND==0) - { - VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO }; - commandBufferAllocateInfo.commandPool = kernelPreparationApplication.configuration.commandPool[0]; - commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; - commandBufferAllocateInfo.commandBufferCount = 1; - VkCommandBuffer commandBuffer = {}; - res = vkAllocateCommandBuffers(kernelPreparationApplication.configuration.device[0], &commandBufferAllocateInfo, &commandBuffer); - if (res != 0) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_ALLOCATE_COMMAND_BUFFERS; - } - VkCommandBufferBeginInfo commandBufferBeginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; - commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - res = vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo); - if (res != 0) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_BEGIN_COMMAND_BUFFER; - } - VkFFTLaunchParams launchParams = {}; - launchParams.commandBuffer = &commandBuffer; - launchParams.inputBuffer = &app->bufferBluestein[axis_id]; - launchParams.buffer = &app->bufferBluesteinIFFT[axis_id]; - //Record commands - resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); - if (resFFT != VKFFT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return resFFT; - } - res = 
vkEndCommandBuffer(commandBuffer); - if (res != 0) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER; - } - VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO }; - submitInfo.commandBufferCount = 1; - submitInfo.pCommandBuffers = &commandBuffer; - res = vkQueueSubmit(kernelPreparationApplication.configuration.queue[0], 1, &submitInfo, kernelPreparationApplication.configuration.fence[0]); - if (res != 0) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE; - } - res = vkWaitForFences(kernelPreparationApplication.configuration.device[0], 1, kernelPreparationApplication.configuration.fence, VK_TRUE, 100000000000); - if (res != 0) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_WAIT_FOR_FENCES; - } - res = vkResetFences(kernelPreparationApplication.configuration.device[0], 1, kernelPreparationApplication.configuration.fence); - if (res != 0) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_RESET_FENCES; - } - vkFreeCommandBuffers(kernelPreparationApplication.configuration.device[0], kernelPreparationApplication.configuration.commandPool[0], 1, &commandBuffer); - } -#elif(VKFFT_BACKEND==1) - VkFFTLaunchParams launchParams = {}; - launchParams.inputBuffer = &app->bufferBluestein[axis_id]; - launchParams.buffer = &app->bufferBluesteinIFFT[axis_id]; - resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); - if (resFFT != VKFFT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return resFFT; - } - res = cudaDeviceSynchronize(); - if (res != cudaSuccess) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; - } -#elif(VKFFT_BACKEND==2) - VkFFTLaunchParams launchParams = {}; - launchParams.inputBuffer = 
&app->bufferBluestein[axis_id]; - launchParams.buffer = &app->bufferBluesteinIFFT[axis_id]; - resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); - if (resFFT != VKFFT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return resFFT; - } - res = hipDeviceSynchronize(); - if (res != hipSuccess) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; - } -#elif(VKFFT_BACKEND==3) - VkFFTLaunchParams launchParams = {}; - launchParams.commandQueue = &commandQueue; - launchParams.inputBuffer = &app->bufferBluestein[axis_id]; - launchParams.buffer = &app->bufferBluesteinIFFT[axis_id]; - resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); - if (resFFT != VKFFT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return resFFT; - } - res = clFinish(commandQueue); - if (res != CL_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; - } -#elif(VKFFT_BACKEND==4) - ze_command_list_desc_t commandListDescription = {}; - commandListDescription.stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC; - ze_command_list_handle_t commandList = {}; - res = zeCommandListCreate(app->configuration.context[0], app->configuration.device[0], &commandListDescription, &commandList); - if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; - VkFFTLaunchParams launchParams = {}; - launchParams.commandList = &commandList; - launchParams.inputBuffer = &app->bufferBluestein[axis_id]; - launchParams.buffer = &app->bufferBluesteinIFFT[axis_id]; - resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); - if (resFFT != VKFFT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return resFFT; - } - res = zeCommandListClose(commandList); - if (res != ZE_RESULT_SUCCESS) { - free(phaseVectors); - 
deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER; - } - res = zeCommandQueueExecuteCommandLists(app->configuration.commandQueue[0], 1, &commandList, 0); - if (res != ZE_RESULT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE; - } - res = zeCommandQueueSynchronize(app->configuration.commandQueue[0], UINT32_MAX); - if (res != ZE_RESULT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; - } - res = zeCommandListDestroy(commandList); - if (res != ZE_RESULT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_DESTROY_COMMAND_LIST; - } -#endif - } - if (kernelPreparationConfiguration.doublePrecision) { - double* phaseVectors_cast = (double*)phaseVectors; - for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { - uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize); - double angle = double_PI * rm / phaseVectorsNonZeroSize; - phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (double)cos(angle) : 0; - phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? (double)sin(angle) : 0; - } - for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) { - phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[2 * i]; - phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[2 * i + 1]; - } - } - else { - float* phaseVectors_cast = (float*)phaseVectors; - for (uint64_t i = 0; i < FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; i++) { - uint64_t rm = (i * i) % (2 * phaseVectorsNonZeroSize); - double angle = double_PI * rm / phaseVectorsNonZeroSize; - phaseVectors_cast[2 * i] = (i < phaseVectorsNonZeroSize) ? (float)cos(angle) : 0; - phaseVectors_cast[2 * i + 1] = (i < phaseVectorsNonZeroSize) ? 
(float)sin(angle) : 0; - } - for (uint64_t i = 1; i < phaseVectorsNonZeroSize; i++) { - phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i)] = phaseVectors_cast[2 * i]; - phaseVectors_cast[2 * (FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] - i) + 1] = phaseVectors_cast[2 * i + 1]; - } - } -#if(VKFFT_BACKEND==0) - resFFT = transferDataFromCPU(&kernelPreparationApplication, phaseVectors, &app->bufferBluestein[axis_id], bufferSize); - if (resFFT != VKFFT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return resFFT; - } -#elif(VKFFT_BACKEND==1) - res = cudaMemcpy(app->bufferBluestein[axis_id], phaseVectors, bufferSize, cudaMemcpyHostToDevice); - if (res != cudaSuccess) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_COPY; - } -#elif(VKFFT_BACKEND==2) - res = hipMemcpy(app->bufferBluestein[axis_id], phaseVectors, bufferSize, hipMemcpyHostToDevice); - if (res != hipSuccess) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_COPY; - } -#elif(VKFFT_BACKEND==3) - res = clEnqueueWriteBuffer(commandQueue, app->bufferBluestein[axis_id], CL_TRUE, 0, bufferSize, phaseVectors, 0, NULL, NULL); - if (res != CL_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_COPY; - } -#elif(VKFFT_BACKEND==4) - ze_command_queue_desc_t commandQueueCopyDesc = { - ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, - 0, - app->configuration.commandQueueID, - 0, // index - 0, // flags - ZE_COMMAND_QUEUE_MODE_DEFAULT, - ZE_COMMAND_QUEUE_PRIORITY_NORMAL - }; - ze_command_list_handle_t copyCommandList; - res = zeCommandListCreateImmediate(app->configuration.context[0], app->configuration.device[0], &commandQueueCopyDesc, ©CommandList); - if (res != ZE_RESULT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; - } - res = 
zeCommandListAppendMemoryCopy(copyCommandList, app->bufferBluestein[axis_id], phaseVectors, bufferSize, 0, 0, 0); - if (res != ZE_RESULT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_COPY; - } - res = zeCommandQueueSynchronize(app->configuration.commandQueue[0], UINT32_MAX); - if (res != ZE_RESULT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; - } -#endif -#if(VKFFT_BACKEND==0) - if (!app->configuration.makeInversePlanOnly) { - VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO }; - commandBufferAllocateInfo.commandPool = kernelPreparationApplication.configuration.commandPool[0]; - commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; - commandBufferAllocateInfo.commandBufferCount = 1; - VkCommandBuffer commandBuffer = {}; - res = vkAllocateCommandBuffers(kernelPreparationApplication.configuration.device[0], &commandBufferAllocateInfo, &commandBuffer); - if (res != 0) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_ALLOCATE_COMMAND_BUFFERS; - } - VkCommandBufferBeginInfo commandBufferBeginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; - commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - res = vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo); - if (res != 0) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_BEGIN_COMMAND_BUFFER; - } - VkFFTLaunchParams launchParams = {}; - launchParams.commandBuffer = &commandBuffer; - launchParams.inputBuffer = &app->bufferBluestein[axis_id]; - launchParams.buffer = &app->bufferBluesteinFFT[axis_id]; - //Record commands - resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); - if (resFFT != VKFFT_SUCCESS) { - free(phaseVectors); - 
deleteVkFFT(&kernelPreparationApplication); - return resFFT; - } - res = vkEndCommandBuffer(commandBuffer); - if (res != 0) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER; - } - VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO }; - submitInfo.commandBufferCount = 1; - submitInfo.pCommandBuffers = &commandBuffer; - res = vkQueueSubmit(kernelPreparationApplication.configuration.queue[0], 1, &submitInfo, kernelPreparationApplication.configuration.fence[0]); - if (res != 0) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE; - } - res = vkWaitForFences(kernelPreparationApplication.configuration.device[0], 1, kernelPreparationApplication.configuration.fence, VK_TRUE, 100000000000); - if (res != 0) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_WAIT_FOR_FENCES; - } - res = vkResetFences(kernelPreparationApplication.configuration.device[0], 1, kernelPreparationApplication.configuration.fence); - if (res != 0) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_RESET_FENCES; - } - vkFreeCommandBuffers(kernelPreparationApplication.configuration.device[0], kernelPreparationApplication.configuration.commandPool[0], 1, &commandBuffer); - } - if ((FFTPlan->numAxisUploads[axis_id] == 1) && (!app->configuration.makeForwardPlanOnly)) { - VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO }; - commandBufferAllocateInfo.commandPool = kernelPreparationApplication.configuration.commandPool[0]; - commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; - commandBufferAllocateInfo.commandBufferCount = 1; - VkCommandBuffer commandBuffer = {}; - res = vkAllocateCommandBuffers(kernelPreparationApplication.configuration.device[0], &commandBufferAllocateInfo, &commandBuffer); - if (res 
!= 0) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_ALLOCATE_COMMAND_BUFFERS; - } - VkCommandBufferBeginInfo commandBufferBeginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; - commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - res = vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo); - if (res != 0) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_BEGIN_COMMAND_BUFFER; - } - VkFFTLaunchParams launchParams = {}; - launchParams.commandBuffer = &commandBuffer; - launchParams.inputBuffer = &app->bufferBluestein[axis_id]; - launchParams.buffer = &app->bufferBluesteinIFFT[axis_id]; - //Record commands - resFFT = VkFFTAppend(&kernelPreparationApplication, 1, &launchParams); - if (resFFT != VKFFT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return resFFT; - } - res = vkEndCommandBuffer(commandBuffer); - if (res != 0) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER; - } - VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO }; - submitInfo.commandBufferCount = 1; - submitInfo.pCommandBuffers = &commandBuffer; - res = vkQueueSubmit(kernelPreparationApplication.configuration.queue[0], 1, &submitInfo, kernelPreparationApplication.configuration.fence[0]); - if (res != 0) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE; - } - res = vkWaitForFences(kernelPreparationApplication.configuration.device[0], 1, kernelPreparationApplication.configuration.fence, VK_TRUE, 100000000000); - if (res != 0) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_WAIT_FOR_FENCES; - } - res = vkResetFences(kernelPreparationApplication.configuration.device[0], 1, kernelPreparationApplication.configuration.fence); - if (res != 
0) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_RESET_FENCES; - } - vkFreeCommandBuffers(kernelPreparationApplication.configuration.device[0], kernelPreparationApplication.configuration.commandPool[0], 1, &commandBuffer); - } -#elif(VKFFT_BACKEND==1) - VkFFTLaunchParams launchParams = {}; - launchParams.inputBuffer = &app->bufferBluestein[axis_id]; - if (!app->configuration.makeInversePlanOnly) { - launchParams.buffer = &app->bufferBluesteinFFT[axis_id]; - resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); - if (resFFT != VKFFT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return resFFT; - } - res = cudaDeviceSynchronize(); - if (res != cudaSuccess) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; - } - } - if ((FFTPlan->numAxisUploads[axis_id] == 1) && (!app->configuration.makeForwardPlanOnly)) { - launchParams.buffer = &app->bufferBluesteinIFFT[axis_id]; - resFFT = VkFFTAppend(&kernelPreparationApplication, 1, &launchParams); - if (resFFT != VKFFT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return resFFT; - } - res = cudaDeviceSynchronize(); - if (res != cudaSuccess) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; - } - } -#elif(VKFFT_BACKEND==2) - VkFFTLaunchParams launchParams = {}; - launchParams.inputBuffer = &app->bufferBluestein[axis_id]; - if (!app->configuration.makeInversePlanOnly) { - launchParams.buffer = &app->bufferBluesteinFFT[axis_id]; - resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); - if (resFFT != VKFFT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return resFFT; - } - res = hipDeviceSynchronize(); - if (res != hipSuccess) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return 
VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; - } - } - if ((FFTPlan->numAxisUploads[axis_id] == 1) && (!app->configuration.makeForwardPlanOnly)) { - launchParams.buffer = &app->bufferBluesteinIFFT[axis_id]; - resFFT = VkFFTAppend(&kernelPreparationApplication, 1, &launchParams); - if (resFFT != VKFFT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return resFFT; - } - res = hipDeviceSynchronize(); - if (res != hipSuccess) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; - } - } -#elif(VKFFT_BACKEND==3) - VkFFTLaunchParams launchParams = {}; - launchParams.commandQueue = &commandQueue; - launchParams.inputBuffer = &app->bufferBluestein[axis_id]; - if (!app->configuration.makeInversePlanOnly) { - launchParams.buffer = &app->bufferBluesteinFFT[axis_id]; - resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); - if (resFFT != VKFFT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return resFFT; - } - res = clFinish(commandQueue); - if (res != CL_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; - } - } - if ((FFTPlan->numAxisUploads[axis_id] == 1) && (!app->configuration.makeForwardPlanOnly)) { - launchParams.buffer = &app->bufferBluesteinIFFT[axis_id]; - resFFT = VkFFTAppend(&kernelPreparationApplication, 1, &launchParams); - if (resFFT != VKFFT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return resFFT; - } - res = clFinish(commandQueue); - if (res != CL_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; - } - } -#elif(VKFFT_BACKEND==4) - ze_command_list_desc_t commandListDescription = {}; - commandListDescription.stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC; - ze_command_list_handle_t commandList = {}; - res = 
zeCommandListCreate(app->configuration.context[0], app->configuration.device[0], &commandListDescription, &commandList); - if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; - VkFFTLaunchParams launchParams = {}; - launchParams.commandList = &commandList; - launchParams.inputBuffer = &app->bufferBluestein[axis_id]; - - if (!app->configuration.makeInversePlanOnly) { - launchParams.buffer = &app->bufferBluesteinFFT[axis_id]; - resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); - if (resFFT != VKFFT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return resFFT; - } - - res = zeCommandListClose(commandList); - if (res != ZE_RESULT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER; - } - res = zeCommandQueueExecuteCommandLists(app->configuration.commandQueue[0], 1, &commandList, 0); - if (res != ZE_RESULT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE; - } - res = zeCommandQueueSynchronize(app->configuration.commandQueue[0], UINT32_MAX); - if (res != ZE_RESULT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; - } - res = zeCommandListReset(commandList); - if (res != ZE_RESULT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_DESTROY_COMMAND_LIST; - } - } - if ((FFTPlan->numAxisUploads[axis_id] == 1) && (!app->configuration.makeForwardPlanOnly)) { - launchParams.buffer = &app->bufferBluesteinIFFT[axis_id]; - resFFT = VkFFTAppend(&kernelPreparationApplication, 1, &launchParams); - if (resFFT != VKFFT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return resFFT; - } - res = zeCommandListClose(commandList); - if (res != ZE_RESULT_SUCCESS) { - free(phaseVectors); - 
deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER; - } - res = zeCommandQueueExecuteCommandLists(app->configuration.commandQueue[0], 1, &commandList, 0); - if (res != ZE_RESULT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE; - } - res = zeCommandQueueSynchronize(app->configuration.commandQueue[0], UINT32_MAX); - if (res != ZE_RESULT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; - } - } - res = zeCommandListDestroy(commandList); - if (res != ZE_RESULT_SUCCESS) { - free(phaseVectors); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_DESTROY_COMMAND_LIST; - } -#endif - free(phaseVectors); -#if(VKFFT_BACKEND==0) - kernelPreparationApplication.configuration.isCompilerInitialized = 0; -#elif(VKFFT_BACKEND==3) - res = clReleaseCommandQueue(commandQueue); - if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_RELEASE_COMMAND_QUEUE; -#endif - if (kernelPreparationConfiguration.saveApplicationToString) { - app->applicationBluesteinStringSize[axis_id] = kernelPreparationApplication.applicationStringSize; - app->applicationBluesteinString[axis_id] = calloc(app->applicationBluesteinStringSize[axis_id], 1); - if (!app->applicationBluesteinString[axis_id]) { - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_MALLOC_FAILED; - } - memcpy(app->applicationBluesteinString[axis_id], kernelPreparationApplication.saveApplicationString, app->applicationBluesteinStringSize[axis_id]); - } - deleteVkFFT(&kernelPreparationApplication); - return resFFT; -} -static inline VkFFTResult VkFFTGenerateRaderFFTKernel(VkFFTApplication* app, VkFFTAxis* axis) { - //generate Rader FFTKernel - VkFFTResult resFFT = VKFFT_SUCCESS; - double double_PI = 3.1415926535897932384626433832795; - if (axis->specializationConstants.useRader) { - for (uint64_t i = 0; i < 
axis->specializationConstants.numRaderPrimes; i++) { - if (axis->specializationConstants.raderContainer[i].type == 0) { - for (uint64_t j = 0; j < 30; j++) { - if (app->rader_primes[j] == axis->specializationConstants.raderContainer[i].prime) { - axis->specializationConstants.raderContainer[i].raderFFTkernel = app->raderFFTkernel[j]; - } - } - if (axis->specializationConstants.raderContainer[i].raderFFTkernel) continue; - - uint64_t write_id = app->numRaderFFTPrimes; - app->rader_primes[write_id] = axis->specializationConstants.raderContainer[i].prime; - app->numRaderFFTPrimes++; - - if (app->configuration.loadApplicationFromString) continue; - VkFFTApplication kernelPreparationApplication = {}; - VkFFTConfiguration kernelPreparationConfiguration = {}; - - kernelPreparationConfiguration.FFTdim = 1; - kernelPreparationConfiguration.size[0] = axis->specializationConstants.raderContainer[i].prime - 1; - kernelPreparationConfiguration.size[1] = 1; - kernelPreparationConfiguration.size[2] = 1; - kernelPreparationConfiguration.doublePrecision = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory); - kernelPreparationConfiguration.useLUT = 1; - kernelPreparationConfiguration.fixMinRaderPrimeFFT = 17; - kernelPreparationConfiguration.fixMinRaderPrimeMult = 17; - kernelPreparationConfiguration.fixMaxRaderPrimeFFT = 17; - kernelPreparationConfiguration.fixMaxRaderPrimeMult = 17; - - kernelPreparationConfiguration.device = app->configuration.device; -#if(VKFFT_BACKEND==0) - kernelPreparationConfiguration.queue = app->configuration.queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers - kernelPreparationConfiguration.fence = app->configuration.fence; - kernelPreparationConfiguration.commandPool = app->configuration.commandPool; - kernelPreparationConfiguration.physicalDevice = app->configuration.physicalDevice; - kernelPreparationConfiguration.isCompilerInitialized = 1;//compiler can 
be initialized before VkFFT plan creation. if not, VkFFT will create and destroy one after initialization - kernelPreparationConfiguration.tempBufferDeviceMemory = app->configuration.tempBufferDeviceMemory; -#elif(VKFFT_BACKEND==3) - kernelPreparationConfiguration.context = app->configuration.context; -#elif(VKFFT_BACKEND==4) - kernelPreparationConfiguration.context = app->configuration.context; - kernelPreparationConfiguration.commandQueue = app->configuration.commandQueue; - kernelPreparationConfiguration.commandQueueID = app->configuration.commandQueueID; -#endif - - uint64_t bufferSize = (uint64_t)sizeof(float) * 2 * kernelPreparationConfiguration.size[0] * kernelPreparationConfiguration.size[1] * kernelPreparationConfiguration.size[2]; - if (kernelPreparationConfiguration.doublePrecision) bufferSize *= sizeof(double) / sizeof(float); - - kernelPreparationConfiguration.bufferSize = &bufferSize; - resFFT = initializeVkFFT(&kernelPreparationApplication, kernelPreparationConfiguration); - if (resFFT != VKFFT_SUCCESS) return resFFT; - - if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { - double* raderFFTkernel = (double*)malloc((axis->specializationConstants.raderContainer[i].prime - 1) * sizeof(double) * 2); - if (!raderFFTkernel) return VKFFT_ERROR_MALLOC_FAILED; - axis->specializationConstants.raderContainer[i].raderFFTkernel = (void*)raderFFTkernel; - app->raderFFTkernel[write_id] = (void*)raderFFTkernel; - app->rader_buffer_size[write_id] = (axis->specializationConstants.raderContainer[i].prime - 1) * sizeof(double) * 2; - for (uint64_t j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later - uint64_t g_pow = 1; - for (uint64_t t = 0; t < axis->specializationConstants.raderContainer[i].prime - 1 - j; t++) { - g_pow = (g_pow * axis->specializationConstants.raderContainer[i].generator) % axis->specializationConstants.raderContainer[i].prime; - } - raderFFTkernel[2 * j] = cos(2.0 * g_pow 
* double_PI / axis->specializationConstants.raderContainer[i].prime); - raderFFTkernel[2 * j + 1] = -sin(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime); - } - } - else { - float* raderFFTkernel = (float*)malloc((axis->specializationConstants.raderContainer[i].prime - 1) * sizeof(float) * 2); - if (!raderFFTkernel) return VKFFT_ERROR_MALLOC_FAILED; - axis->specializationConstants.raderContainer[i].raderFFTkernel = (void*)raderFFTkernel; - app->raderFFTkernel[write_id] = (void*)raderFFTkernel; - app->rader_buffer_size[write_id] = (axis->specializationConstants.raderContainer[i].prime - 1) * sizeof(float) * 2; - for (uint64_t j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later - uint64_t g_pow = 1; - for (uint64_t t = 0; t < axis->specializationConstants.raderContainer[i].prime - 1 - j; t++) { - g_pow = (g_pow * axis->specializationConstants.raderContainer[i].generator) % axis->specializationConstants.raderContainer[i].prime; - } - raderFFTkernel[2 * j] = (float)cos(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime); - raderFFTkernel[2 * j + 1] = (float)(-sin(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime)); - } - } -#if(VKFFT_BACKEND==0) - VkDeviceMemory bufferRaderFFTDeviceMemory; - VkBuffer bufferRaderFFT; -#elif(VKFFT_BACKEND==1) - void* bufferRaderFFT; -#elif(VKFFT_BACKEND==2) - void* bufferRaderFFT; -#elif(VKFFT_BACKEND==3) - cl_mem bufferRaderFFT; -#elif(VKFFT_BACKEND==4) - void* bufferRaderFFT; -#endif -#if(VKFFT_BACKEND==0) - VkResult res = VK_SUCCESS; - resFFT = allocateFFTBuffer(app, &bufferRaderFFT, &bufferRaderFFTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, bufferSize); - if (resFFT != VKFFT_SUCCESS) return resFFT; -#elif(VKFFT_BACKEND==1) - cudaError_t res = cudaSuccess; - res = cudaMalloc(&bufferRaderFFT, 
bufferSize); - if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE; -#elif(VKFFT_BACKEND==2) - hipError_t res = hipSuccess; - res = hipMalloc(&bufferRaderFFT, bufferSize); - if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE; -#elif(VKFFT_BACKEND==3) - cl_int res = CL_SUCCESS; - bufferRaderFFT = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_WRITE, bufferSize, 0, &res); - if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE; - cl_command_queue commandQueue = clCreateCommandQueue(app->configuration.context[0], app->configuration.device[0], 0, &res); - if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_QUEUE; -#elif(VKFFT_BACKEND==4) - ze_result_t res = ZE_RESULT_SUCCESS; - ze_device_mem_alloc_desc_t device_desc = {}; - device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; - res = zeMemAllocDevice(app->configuration.context[0], &device_desc, bufferSize, sizeof(float), app->configuration.device[0], &bufferRaderFFT); - if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE; -#endif - -#if(VKFFT_BACKEND==0) - resFFT = transferDataFromCPU(&kernelPreparationApplication, axis->specializationConstants.raderContainer[i].raderFFTkernel, &bufferRaderFFT, bufferSize); - if (resFFT != VKFFT_SUCCESS) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return resFFT; - } -#elif(VKFFT_BACKEND==1) - res = cudaMemcpy(bufferRaderFFT, axis->specializationConstants.raderContainer[i].raderFFTkernel, bufferSize, cudaMemcpyHostToDevice); - if (res != cudaSuccess) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_COPY; - } -#elif(VKFFT_BACKEND==2) - res = hipMemcpy(bufferRaderFFT, axis->specializationConstants.raderContainer[i].raderFFTkernel, bufferSize, hipMemcpyHostToDevice); - if (res != hipSuccess) { - 
free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_COPY; - } -#elif(VKFFT_BACKEND==3) - res = clEnqueueWriteBuffer(commandQueue, bufferRaderFFT, CL_TRUE, 0, bufferSize, axis->specializationConstants.raderContainer[i].raderFFTkernel, 0, NULL, NULL); - if (res != CL_SUCCESS) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_COPY; - } -#elif(VKFFT_BACKEND==4) - ze_command_queue_desc_t commandQueueCopyDesc = { - ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, - 0, - app->configuration.commandQueueID, - 0, // index - 0, // flags - ZE_COMMAND_QUEUE_MODE_DEFAULT, - ZE_COMMAND_QUEUE_PRIORITY_NORMAL - }; - ze_command_list_handle_t copyCommandList; - res = zeCommandListCreateImmediate(app->configuration.context[0], app->configuration.device[0], &commandQueueCopyDesc, ©CommandList); - if (res != ZE_RESULT_SUCCESS) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; - } - res = zeCommandListAppendMemoryCopy(copyCommandList, bufferRaderFFT, axis->specializationConstants.raderContainer[i].raderFFTkernel, bufferSize, 0, 0, 0); - if (res != ZE_RESULT_SUCCESS) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_COPY; - } - res = zeCommandQueueSynchronize(app->configuration.commandQueue[0], UINT32_MAX); - if (res != ZE_RESULT_SUCCESS) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; - } -#endif -#if(VKFFT_BACKEND==0) - { - VkCommandBufferAllocateInfo commandBufferAllocateInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO }; - commandBufferAllocateInfo.commandPool = 
kernelPreparationApplication.configuration.commandPool[0]; - commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; - commandBufferAllocateInfo.commandBufferCount = 1; - VkCommandBuffer commandBuffer = {}; - res = vkAllocateCommandBuffers(kernelPreparationApplication.configuration.device[0], &commandBufferAllocateInfo, &commandBuffer); - if (res != 0) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_ALLOCATE_COMMAND_BUFFERS; - } - VkCommandBufferBeginInfo commandBufferBeginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO }; - commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - res = vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo); - if (res != 0) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_BEGIN_COMMAND_BUFFER; - } - VkFFTLaunchParams launchParams = {}; - launchParams.commandBuffer = &commandBuffer; - launchParams.buffer = &bufferRaderFFT; - //Record commands - resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); - if (resFFT != VKFFT_SUCCESS) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return resFFT; - } - res = vkEndCommandBuffer(commandBuffer); - if (res != 0) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER; - } - VkSubmitInfo submitInfo = { VK_STRUCTURE_TYPE_SUBMIT_INFO }; - submitInfo.commandBufferCount = 1; - submitInfo.pCommandBuffers = &commandBuffer; - res = vkQueueSubmit(kernelPreparationApplication.configuration.queue[0], 1, &submitInfo, kernelPreparationApplication.configuration.fence[0]); - if (res != 0) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - 
deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE; - } - res = vkWaitForFences(kernelPreparationApplication.configuration.device[0], 1, kernelPreparationApplication.configuration.fence, VK_TRUE, 100000000000); - if (res != 0) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_WAIT_FOR_FENCES; - } - res = vkResetFences(kernelPreparationApplication.configuration.device[0], 1, kernelPreparationApplication.configuration.fence); - if (res != 0) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_RESET_FENCES; - } - vkFreeCommandBuffers(kernelPreparationApplication.configuration.device[0], kernelPreparationApplication.configuration.commandPool[0], 1, &commandBuffer); - } -#elif(VKFFT_BACKEND==1) - VkFFTLaunchParams launchParams = {}; - launchParams.buffer = &bufferRaderFFT; - resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); - if (resFFT != VKFFT_SUCCESS) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return resFFT; - } - res = cudaDeviceSynchronize(); - if (res != cudaSuccess) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; - } -#elif(VKFFT_BACKEND==2) - VkFFTLaunchParams launchParams = {}; - launchParams.buffer = &bufferRaderFFT; - resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); - if (resFFT != VKFFT_SUCCESS) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return resFFT; - } - res = hipDeviceSynchronize(); - if (res != hipSuccess) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - 
deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; - } -#elif(VKFFT_BACKEND==3) - VkFFTLaunchParams launchParams = {}; - launchParams.commandQueue = &commandQueue; - launchParams.buffer = &bufferRaderFFT; - resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); - if (resFFT != VKFFT_SUCCESS) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return resFFT; - } - res = clFinish(commandQueue); - if (res != CL_SUCCESS) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; - } -#elif(VKFFT_BACKEND==4) - ze_command_list_desc_t commandListDescription = {}; - commandListDescription.stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC; - ze_command_list_handle_t commandList = {}; - res = zeCommandListCreate(app->configuration.context[0], app->configuration.device[0], &commandListDescription, &commandList); - if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; - VkFFTLaunchParams launchParams = {}; - launchParams.commandList = &commandList; - launchParams.buffer = &bufferRaderFFT; - resFFT = VkFFTAppend(&kernelPreparationApplication, -1, &launchParams); - if (resFFT != VKFFT_SUCCESS) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return resFFT; - } - res = zeCommandListClose(commandList); - if (res != ZE_RESULT_SUCCESS) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_END_COMMAND_BUFFER; - } - res = zeCommandQueueExecuteCommandLists(app->configuration.commandQueue[0], 1, &commandList, 0); - if (res != ZE_RESULT_SUCCESS) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return 
VKFFT_ERROR_FAILED_TO_SUBMIT_QUEUE; - } - res = zeCommandQueueSynchronize(app->configuration.commandQueue[0], UINT32_MAX); - if (res != ZE_RESULT_SUCCESS) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; - } - res = zeCommandListDestroy(commandList); - if (res != ZE_RESULT_SUCCESS) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_DESTROY_COMMAND_LIST; - } -#endif -#if(VKFFT_BACKEND==0) - resFFT = transferDataToCPU(&kernelPreparationApplication, axis->specializationConstants.raderContainer[i].raderFFTkernel, &bufferRaderFFT, bufferSize); - if (resFFT != VKFFT_SUCCESS) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return resFFT; - } -#elif(VKFFT_BACKEND==1) - res = cudaMemcpy(axis->specializationConstants.raderContainer[i].raderFFTkernel, bufferRaderFFT, bufferSize, cudaMemcpyDeviceToHost); - if (res != cudaSuccess) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_COPY; - } -#elif(VKFFT_BACKEND==2) - res = hipMemcpy(axis->specializationConstants.raderContainer[i].raderFFTkernel, bufferRaderFFT, bufferSize, hipMemcpyDeviceToHost); - if (res != hipSuccess) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_COPY; - } -#elif(VKFFT_BACKEND==3) - res = clEnqueueReadBuffer(commandQueue, bufferRaderFFT, CL_TRUE, 0, bufferSize, axis->specializationConstants.raderContainer[i].raderFFTkernel, 0, NULL, NULL); - if (res != CL_SUCCESS) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_COPY; - } 
-#elif(VKFFT_BACKEND==4) - ze_command_queue_desc_t commandQueueCopyDesc = { - ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, - 0, - app->configuration.commandQueueID, - 0, // index - 0, // flags - ZE_COMMAND_QUEUE_MODE_DEFAULT, - ZE_COMMAND_QUEUE_PRIORITY_NORMAL - }; - ze_command_list_handle_t copyCommandList; - res = zeCommandListCreateImmediate(app->configuration.context[0], app->configuration.device[0], &commandQueueCopyDesc, ©CommandList); - if (res != ZE_RESULT_SUCCESS) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; - } - res = zeCommandListAppendMemoryCopy(copyCommandList, axis->specializationConstants.raderContainer[i].raderFFTkernel, bufferRaderFFT, bufferSize, 0, 0, 0); - if (res != ZE_RESULT_SUCCESS) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_COPY; - } - res = zeCommandQueueSynchronize(app->configuration.commandQueue[0], UINT32_MAX); - if (res != ZE_RESULT_SUCCESS) { - free(axis->specializationConstants.raderContainer[i].raderFFTkernel); - deleteVkFFT(&kernelPreparationApplication); - return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; - } -#endif - -#if(VKFFT_BACKEND==0) - kernelPreparationApplication.configuration.isCompilerInitialized = 0; -#elif(VKFFT_BACKEND==3) - res = clReleaseCommandQueue(commandQueue); - if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_RELEASE_COMMAND_QUEUE; -#endif - deleteVkFFT(&kernelPreparationApplication); - } - } - if (app->configuration.loadApplicationFromString) { - uint64_t offset = 0; - for (uint64_t i = 0; i < app->numRaderFFTPrimes; i++) { - if (!app->raderFFTkernel[i]) { - uint64_t current_size = 0; - if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { - current_size = (axis->specializationConstants.raderContainer[i].prime - 1) * sizeof(double) * 2; - } - else { - 
current_size = (axis->specializationConstants.raderContainer[i].prime - 1) * sizeof(float) * 2; - } - app->raderFFTkernel[i] = (void*)malloc(current_size); - if (!app->raderFFTkernel[i]) return VKFFT_ERROR_MALLOC_FAILED; - axis->specializationConstants.raderContainer[i].raderFFTkernel = app->raderFFTkernel[i]; - memcpy(app->raderFFTkernel[i], (char*)app->configuration.loadApplicationString + app->applicationStringOffsetRader + offset, current_size); - offset += current_size; - } - } - } - } - return resFFT; -} -static inline VkFFTResult VkFFTCheckUpdateBufferSet(VkFFTApplication* app, VkFFTAxis* axis, uint64_t planStage, VkFFTLaunchParams* launchParams) { - uint64_t performBufferSetUpdate = planStage; - uint64_t performOffsetUpdate = planStage; - if (!planStage) { - if (launchParams != 0) { - if ((launchParams->buffer != 0) && (app->configuration.buffer != launchParams->buffer)) { - app->configuration.buffer = launchParams->buffer; - performBufferSetUpdate = 1; - } - if ((launchParams->inputBuffer != 0) && (app->configuration.inputBuffer != launchParams->inputBuffer)) { - app->configuration.inputBuffer = launchParams->inputBuffer; - performBufferSetUpdate = 1; - } - if ((launchParams->outputBuffer != 0) && (app->configuration.outputBuffer != launchParams->outputBuffer)) { - app->configuration.outputBuffer = launchParams->outputBuffer; - performBufferSetUpdate = 1; - } - if ((launchParams->tempBuffer != 0) && (app->configuration.tempBuffer != launchParams->tempBuffer)) { - app->configuration.tempBuffer = launchParams->tempBuffer; - performBufferSetUpdate = 1; - } - if ((launchParams->kernel != 0) && (app->configuration.kernel != launchParams->kernel)) { - app->configuration.kernel = launchParams->kernel; - performBufferSetUpdate = 1; - } - if (app->configuration.inputBuffer == 0) app->configuration.inputBuffer = app->configuration.buffer; - if (app->configuration.outputBuffer == 0) app->configuration.outputBuffer = app->configuration.buffer; - - if 
(app->configuration.bufferOffset != launchParams->bufferOffset) { - app->configuration.bufferOffset = launchParams->bufferOffset; - performOffsetUpdate = 1; - } - if (app->configuration.inputBufferOffset != launchParams->inputBufferOffset) { - app->configuration.inputBufferOffset = launchParams->inputBufferOffset; - performOffsetUpdate = 1; - } - if (app->configuration.outputBufferOffset != launchParams->outputBufferOffset) { - app->configuration.outputBufferOffset = launchParams->outputBufferOffset; - performOffsetUpdate = 1; - } - if (app->configuration.tempBufferOffset != launchParams->tempBufferOffset) { - app->configuration.tempBufferOffset = launchParams->tempBufferOffset; - performOffsetUpdate = 1; - } - if (app->configuration.kernelOffset != launchParams->kernelOffset) { - app->configuration.kernelOffset = launchParams->kernelOffset; - performOffsetUpdate = 1; - } - } - } - if (planStage) { - if (app->configuration.buffer == 0) { - performBufferSetUpdate = 0; - } - if ((app->configuration.isInputFormatted) && (app->configuration.inputBuffer == 0)) { - performBufferSetUpdate = 0; - } - if ((app->configuration.isOutputFormatted) && (app->configuration.outputBuffer == 0)) { - performBufferSetUpdate = 0; - } - if ((app->configuration.userTempBuffer) && (app->configuration.tempBuffer == 0)) { - performBufferSetUpdate = 0; - } - if ((app->configuration.performConvolution) && (app->configuration.kernel == 0)) { - performBufferSetUpdate = 0; - } - } - else { - if (app->configuration.buffer == 0) { - return VKFFT_ERROR_EMPTY_buffer; - } - if ((app->configuration.isInputFormatted) && (app->configuration.inputBuffer == 0)) { - return VKFFT_ERROR_EMPTY_inputBuffer; - } - if ((app->configuration.isOutputFormatted) && (app->configuration.outputBuffer == 0)) { - return VKFFT_ERROR_EMPTY_outputBuffer; - } - if ((app->configuration.userTempBuffer) && (app->configuration.tempBuffer == 0)) { - return VKFFT_ERROR_EMPTY_tempBuffer; - } - if 
((app->configuration.performConvolution) && (app->configuration.kernel == 0)) { - return VKFFT_ERROR_EMPTY_kernel; - } - } - if (performBufferSetUpdate) { - if (planStage) axis->specializationConstants.performBufferSetUpdate = 1; - else { - if (!app->configuration.makeInversePlanOnly) { - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { - for (uint64_t j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) - app->localFFTPlan->axes[i][j].specializationConstants.performBufferSetUpdate = 1; - if (app->useBluesteinFFT[i] && (app->localFFTPlan->numAxisUploads[i] > 1)) { - for (uint64_t j = 1; j < app->localFFTPlan->numAxisUploads[i]; j++) - app->localFFTPlan->inverseBluesteinAxes[i][j].specializationConstants.performBufferSetUpdate = 1; - } - } - if (app->localFFTPlan->multiUploadR2C) { - app->localFFTPlan->R2Cdecomposition.specializationConstants.performBufferSetUpdate = 1; - } - } - if (!app->configuration.makeForwardPlanOnly) { - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { - for (uint64_t j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) - app->localFFTPlan_inverse->axes[i][j].specializationConstants.performBufferSetUpdate = 1; - if (app->useBluesteinFFT[i] && (app->localFFTPlan_inverse->numAxisUploads[i] > 1)) { - for (uint64_t j = 1; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) - app->localFFTPlan_inverse->inverseBluesteinAxes[i][j].specializationConstants.performBufferSetUpdate = 1; - } - } - if (app->localFFTPlan_inverse->multiUploadR2C) { - app->localFFTPlan_inverse->R2Cdecomposition.specializationConstants.performBufferSetUpdate = 1; - } - } - } - } - if (performOffsetUpdate) { - if (planStage) axis->specializationConstants.performOffsetUpdate = 1; - else { - if (!app->configuration.makeInversePlanOnly) { - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { - for (uint64_t j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) - app->localFFTPlan->axes[i][j].specializationConstants.performOffsetUpdate = 1; - 
if (app->useBluesteinFFT[i] && (app->localFFTPlan->numAxisUploads[i] > 1)) { - for (uint64_t j = 1; j < app->localFFTPlan->numAxisUploads[i]; j++) - app->localFFTPlan->inverseBluesteinAxes[i][j].specializationConstants.performOffsetUpdate = 1; - } - } - if (app->localFFTPlan->multiUploadR2C) { - app->localFFTPlan->R2Cdecomposition.specializationConstants.performOffsetUpdate = 1; - } - } - if (!app->configuration.makeForwardPlanOnly) { - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { - for (uint64_t j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) - app->localFFTPlan_inverse->axes[i][j].specializationConstants.performOffsetUpdate = 1; - if (app->useBluesteinFFT[i] && (app->localFFTPlan_inverse->numAxisUploads[i] > 1)) { - for (uint64_t j = 1; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) - app->localFFTPlan_inverse->inverseBluesteinAxes[i][j].specializationConstants.performOffsetUpdate = 1; - } - } - if (app->localFFTPlan_inverse->multiUploadR2C) { - app->localFFTPlan_inverse->R2Cdecomposition.specializationConstants.performOffsetUpdate = 1; - } - } - } - } - return VKFFT_SUCCESS; -} -static inline VkFFTResult VkFFTUpdateBufferSet(VkFFTApplication* app, VkFFTPlan* FFTPlan, VkFFTAxis* axis, uint64_t axis_id, uint64_t axis_upload_id, uint64_t inverse) { - if (axis->specializationConstants.performOffsetUpdate || axis->specializationConstants.performBufferSetUpdate) { -#if(VKFFT_BACKEND==0) - const VkDescriptorType descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; -#endif - uint64_t storageComplexSize; - if (app->configuration.doublePrecision) - storageComplexSize = (2 * sizeof(double)); - else - if (app->configuration.halfPrecision) - storageComplexSize = (2 * 2); - else - storageComplexSize = (2 * sizeof(float)); - for (uint64_t i = 0; i < axis->numBindings; ++i) { - for (uint64_t j = 0; j < axis->specializationConstants.numBuffersBound[i]; ++j) { -#if(VKFFT_BACKEND==0) - VkDescriptorBufferInfo descriptorBufferInfo = { 0 }; -#endif 
- if (i == 0) { - if ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->configuration.isInputFormatted) && (!axis->specializationConstants.reverseBluesteinMultiUpload) && ( - ((axis_id == app->firstAxis) && (!inverse)) - || ((axis_id == app->lastAxis) && (inverse) && (!((axis_id == 0) && (axis->specializationConstants.performR2CmultiUpload))) && (!app->configuration.performConvolution) && (!app->configuration.inverseReturnToInputBuffer))) - ) { - if (axis->specializationConstants.performBufferSetUpdate) { - uint64_t bufferId = 0; - uint64_t offset = j; - if (app->configuration.inputBufferSize) - { - for (uint64_t l = 0; l < app->configuration.inputBufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) { - bufferId++; - offset -= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); - } - else { - l = app->configuration.inputBufferNum; - } - - } - } - axis->inputBuffer = app->configuration.inputBuffer; -#if(VKFFT_BACKEND==0) - descriptorBufferInfo.buffer = app->configuration.inputBuffer[bufferId]; - descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize * storageComplexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize * storageComplexSize); -#endif - } - if (axis->specializationConstants.performOffsetUpdate) { - axis->specializationConstants.inputOffset = app->configuration.inputBufferOffset; - } - } - else { - if ((axis_upload_id == 0) && (app->configuration.numberKernels > 1) && (inverse) && (!app->configuration.performConvolution)) { - if (axis->specializationConstants.performBufferSetUpdate) { - uint64_t bufferId = 0; - uint64_t offset = j; - if (app->configuration.outputBufferSize) - { - for (uint64_t l = 0; l < app->configuration.outputBufferNum; ++l) { - if (offset >= 
(uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) { - bufferId++; - offset -= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); - } - else { - l = app->configuration.outputBufferNum; - } - - } - } - axis->inputBuffer = app->configuration.outputBuffer; -#if(VKFFT_BACKEND==0) - descriptorBufferInfo.buffer = app->configuration.outputBuffer[bufferId]; - descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize * storageComplexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize * storageComplexSize); -#endif - } - if (axis->specializationConstants.performOffsetUpdate) { - axis->specializationConstants.inputOffset = app->configuration.outputBufferOffset; - } - } - else { - uint64_t bufferId = 0; - uint64_t offset = j; - if (((axis->specializationConstants.reorderFourStep == 1) || (app->useBluesteinFFT[axis_id])) && (FFTPlan->numAxisUploads[axis_id] > 1)) { - if ((((axis->specializationConstants.reorderFourStep == 1) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1)) || (app->useBluesteinFFT[axis_id] && (axis->specializationConstants.reverseBluesteinMultiUpload == 0) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1))) && (!((axis_id == 0) && (axis->specializationConstants.performR2CmultiUpload) && (axis->specializationConstants.reorderFourStep == 1) && (inverse == 1)))) { - if (axis->specializationConstants.performBufferSetUpdate) { - if (app->configuration.bufferSize) - { - for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) { - bufferId++; - offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / 
(double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); - } - else { - l = app->configuration.bufferNum; - } - - } - } - axis->inputBuffer = app->configuration.buffer; -#if(VKFFT_BACKEND==0) - descriptorBufferInfo.buffer = app->configuration.buffer[bufferId]; -#endif - } - if (axis->specializationConstants.performOffsetUpdate) { - axis->specializationConstants.inputOffset = app->configuration.bufferOffset; - } - } - else { - if (axis->specializationConstants.performBufferSetUpdate) { - if (app->configuration.tempBufferSize) { - for (uint64_t l = 0; l < app->configuration.tempBufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.tempBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) { - bufferId++; - offset -= (uint64_t)ceil(app->configuration.tempBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); - } - else { - l = app->configuration.tempBufferNum; - } - - } - } - axis->inputBuffer = app->configuration.tempBuffer; -#if(VKFFT_BACKEND==0) - descriptorBufferInfo.buffer = app->configuration.tempBuffer[bufferId]; -#endif - } - if (axis->specializationConstants.performOffsetUpdate) { - axis->specializationConstants.inputOffset = app->configuration.tempBufferOffset; - } - } - } - else { - if (axis->specializationConstants.performBufferSetUpdate) { - if (app->configuration.bufferSize) { - for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) { - bufferId++; - offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); - } - else { - l = app->configuration.bufferNum; - } - - } - } - axis->inputBuffer = app->configuration.buffer; -#if(VKFFT_BACKEND==0) - descriptorBufferInfo.buffer = 
app->configuration.buffer[bufferId]; -#endif - } - if (axis->specializationConstants.performOffsetUpdate) { - axis->specializationConstants.inputOffset = app->configuration.bufferOffset; - } - } -#if(VKFFT_BACKEND==0) - if (axis->specializationConstants.performBufferSetUpdate) { - descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize * storageComplexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize * storageComplexSize); - } -#endif - } - } - //descriptorBufferInfo.offset = 0; - } - if (i == 1) { - if (((axis_upload_id == 0) && (!app->useBluesteinFFT[axis_id]) && (app->configuration.isOutputFormatted && ( - ((axis_id == app->firstAxis) && (inverse)) - || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution)) - || ((axis_id == app->firstAxis) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1))) - )) || - ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->useBluesteinFFT[axis_id]) && (axis->specializationConstants.reverseBluesteinMultiUpload || (FFTPlan->numAxisUploads[axis_id] == 1)) && (app->configuration.isOutputFormatted && ( - ((axis_id == app->firstAxis) && (inverse)) - || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution))) - )) || - ((app->configuration.numberKernels > 1) && ( - (inverse) - || (axis_id == app->lastAxis))) - ) { - if (axis->specializationConstants.performBufferSetUpdate) { - uint64_t bufferId = 0; - uint64_t offset = j; - if (app->configuration.outputBufferSize) { - for (uint64_t l = 0; l < app->configuration.outputBufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) { - bufferId++; - offset -= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); - } - 
else { - l = app->configuration.outputBufferNum; - } - - } - } - axis->outputBuffer = app->configuration.outputBuffer; -#if(VKFFT_BACKEND==0) - descriptorBufferInfo.buffer = app->configuration.outputBuffer[bufferId]; - descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); -#endif - } - if (axis->specializationConstants.performOffsetUpdate) { - axis->specializationConstants.outputOffset = app->configuration.outputBufferOffset; - } - } - else { - uint64_t bufferId = 0; - uint64_t offset = j; - - if (((axis->specializationConstants.reorderFourStep == 1) || (app->useBluesteinFFT[axis_id])) && (FFTPlan->numAxisUploads[axis_id] > 1)) { - if ((inverse) && (axis_id == app->firstAxis) && ( - ((axis_upload_id == 0) && (app->configuration.isInputFormatted) && (app->configuration.inverseReturnToInputBuffer) && (!app->useBluesteinFFT[axis_id])) - || ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->configuration.isInputFormatted) && (axis->specializationConstants.actualInverse) && (app->configuration.inverseReturnToInputBuffer) && (app->useBluesteinFFT[axis_id]) && (axis->specializationConstants.reverseBluesteinMultiUpload || (FFTPlan->numAxisUploads[axis_id] == 1)))) - ) { - if (axis->specializationConstants.performBufferSetUpdate) { - if (app->configuration.inputBufferSize) { - for (uint64_t l = 0; l < app->configuration.inputBufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) { - bufferId++; - offset -= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); - } - else { - l = app->configuration.inputBufferNum; - } - - } - } - axis->outputBuffer = app->configuration.inputBuffer; 
-#if(VKFFT_BACKEND==0) - descriptorBufferInfo.buffer = app->configuration.inputBuffer[bufferId]; -#endif - } - if (axis->specializationConstants.performOffsetUpdate) { - axis->specializationConstants.outputOffset = app->configuration.inputBufferOffset; - } - } - else { - if (((axis->specializationConstants.reorderFourStep == 1) && (axis_upload_id > 0)) || (app->useBluesteinFFT[axis_id] && (!((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (axis->specializationConstants.reverseBluesteinMultiUpload == 1))))) { - if (axis->specializationConstants.performBufferSetUpdate) { - if (app->configuration.tempBufferSize) { - for (uint64_t l = 0; l < app->configuration.tempBufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.tempBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) { - bufferId++; - offset -= (uint64_t)ceil(app->configuration.tempBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); - } - else { - l = app->configuration.tempBufferNum; - } - - } - } - axis->outputBuffer = app->configuration.tempBuffer; -#if(VKFFT_BACKEND==0) - descriptorBufferInfo.buffer = app->configuration.tempBuffer[bufferId]; -#endif - } - if (axis->specializationConstants.performOffsetUpdate) { - axis->specializationConstants.outputOffset = app->configuration.tempBufferOffset; - } - } - else { - if (axis->specializationConstants.performBufferSetUpdate) { - if (app->configuration.bufferSize) { - for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) { - bufferId++; - offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); - } - else { - l = app->configuration.bufferNum; - } - - } - } - axis->outputBuffer = 
app->configuration.buffer; -#if(VKFFT_BACKEND==0) - descriptorBufferInfo.buffer = app->configuration.buffer[bufferId]; -#endif - } - if (axis->specializationConstants.performOffsetUpdate) { - axis->specializationConstants.outputOffset = app->configuration.bufferOffset; - } - } - } - } - else { - if ((inverse) && (axis_id == app->firstAxis) && (axis_upload_id == 0) && (app->configuration.isInputFormatted) && (app->configuration.inverseReturnToInputBuffer)) { - if (axis->specializationConstants.performBufferSetUpdate) { - if (app->configuration.inputBufferSize) { - for (uint64_t l = 0; l < app->configuration.inputBufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) { - bufferId++; - offset -= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); - } - else { - l = app->configuration.inputBufferNum; - } - - } - } - axis->outputBuffer = app->configuration.inputBuffer; -#if(VKFFT_BACKEND==0) - descriptorBufferInfo.buffer = app->configuration.inputBuffer[bufferId]; -#endif - } - if (axis->specializationConstants.performOffsetUpdate) { - axis->specializationConstants.outputOffset = app->configuration.inputBufferOffset; - } - } - else { - if (axis->specializationConstants.performBufferSetUpdate) { - if (app->configuration.bufferSize) { - for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) { - bufferId++; - offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); - } - else { - l = app->configuration.bufferNum; - } - - } - } - axis->outputBuffer = app->configuration.buffer; -#if(VKFFT_BACKEND==0) - descriptorBufferInfo.buffer = 
app->configuration.buffer[bufferId]; -#endif - } - if (axis->specializationConstants.performOffsetUpdate) { - axis->specializationConstants.outputOffset = app->configuration.bufferOffset; - } - } - } -#if(VKFFT_BACKEND==0) - if (axis->specializationConstants.performBufferSetUpdate) { - descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); - } -#endif - } - //descriptorBufferInfo.offset = 0; - } - if ((i == axis->specializationConstants.convolutionBindingID) && (app->configuration.performConvolution)) { - if (axis->specializationConstants.performBufferSetUpdate) { - uint64_t bufferId = 0; - uint64_t offset = j; - if (app->configuration.kernelSize) { - for (uint64_t l = 0; l < app->configuration.kernelNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.kernelSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) { - bufferId++; - offset -= (uint64_t)ceil(app->configuration.kernelSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); - } - else { - l = app->configuration.kernelNum; - } - - } - } -#if(VKFFT_BACKEND==0) - descriptorBufferInfo.buffer = app->configuration.kernel[bufferId]; - descriptorBufferInfo.range = (axis->specializationConstants.kernelBlockSize * storageComplexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.kernelBlockSize * storageComplexSize); -#endif - } - if (axis->specializationConstants.performOffsetUpdate) { - axis->specializationConstants.kernelOffset = app->configuration.kernelOffset; - } - } - if ((i == axis->specializationConstants.LUTBindingID) && (app->configuration.useLUT)) { -#if(VKFFT_BACKEND==0) - if (axis->specializationConstants.performBufferSetUpdate) { - descriptorBufferInfo.buffer = axis->bufferLUT; - descriptorBufferInfo.offset = 0; - 
descriptorBufferInfo.range = axis->bufferLUTSize; - } -#endif - } - if ((i == axis->specializationConstants.RaderUintLUTBindingID) && (axis->specializationConstants.raderUintLUT)) { -#if(VKFFT_BACKEND==0) - if (axis->specializationConstants.performBufferSetUpdate) { - descriptorBufferInfo.buffer = axis->bufferRaderUintLUT; - descriptorBufferInfo.offset = 0; - descriptorBufferInfo.range = axis->bufferRaderUintLUTSize; - } -#endif - } - if ((i == axis->specializationConstants.BluesteinConvolutionBindingID) && (app->useBluesteinFFT[axis_id]) && (axis_upload_id == 0)) { -#if(VKFFT_BACKEND==0) - if (axis->specializationConstants.performBufferSetUpdate) { - if (axis->specializationConstants.inverseBluestein) - descriptorBufferInfo.buffer = app->bufferBluesteinIFFT[axis_id]; - else - descriptorBufferInfo.buffer = app->bufferBluesteinFFT[axis_id]; - descriptorBufferInfo.offset = 0; - descriptorBufferInfo.range = app->bufferBluesteinSize[axis_id]; - } -#endif - } - if ((i == axis->specializationConstants.BluesteinMultiplicationBindingID) && (app->useBluesteinFFT[axis_id]) && (axis_upload_id == (FFTPlan->numAxisUploads[axis_id] - 1))) { -#if(VKFFT_BACKEND==0) - if (axis->specializationConstants.performBufferSetUpdate) { - descriptorBufferInfo.buffer = app->bufferBluestein[axis_id]; - descriptorBufferInfo.offset = 0; - descriptorBufferInfo.range = app->bufferBluesteinSize[axis_id]; - } -#endif - } -#if(VKFFT_BACKEND==0) - if (axis->specializationConstants.performBufferSetUpdate) { - VkWriteDescriptorSet writeDescriptorSet = { VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET }; - writeDescriptorSet.dstSet = axis->descriptorSet; - writeDescriptorSet.dstBinding = (uint32_t)i; - writeDescriptorSet.dstArrayElement = (uint32_t)j; - writeDescriptorSet.descriptorType = descriptorType; - writeDescriptorSet.descriptorCount = 1; - writeDescriptorSet.pBufferInfo = &descriptorBufferInfo; - vkUpdateDescriptorSets(app->configuration.device[0], 1, &writeDescriptorSet, 0, 0); - } -#endif - } - } - } - 
if (axis->specializationConstants.performBufferSetUpdate) { - axis->specializationConstants.performBufferSetUpdate = 0; - } - if (axis->specializationConstants.performOffsetUpdate) { - axis->specializationConstants.performOffsetUpdate = 0; - } - return VKFFT_SUCCESS; -} -static inline VkFFTResult VkFFTUpdateBufferSetR2CMultiUploadDecomposition(VkFFTApplication* app, VkFFTPlan* FFTPlan, VkFFTAxis* axis, uint64_t axis_id, uint64_t axis_upload_id, uint64_t inverse) { - if (axis->specializationConstants.performOffsetUpdate || axis->specializationConstants.performBufferSetUpdate) { -#if(VKFFT_BACKEND==0) - const VkDescriptorType descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; -#endif - uint64_t storageComplexSize; - if (app->configuration.doublePrecision) - storageComplexSize = (2 * sizeof(double)); - else - if (app->configuration.halfPrecision) - storageComplexSize = (2 * 2); - else - storageComplexSize = (2 * sizeof(float)); - for (uint64_t i = 0; i < axis->numBindings; ++i) { - for (uint64_t j = 0; j < axis->specializationConstants.numBuffersBound[i]; ++j) { -#if(VKFFT_BACKEND==0) - VkDescriptorBufferInfo descriptorBufferInfo = { 0 }; -#endif - if (i == 0) { - if (inverse) { - if ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->configuration.isInputFormatted) && (!axis->specializationConstants.reverseBluesteinMultiUpload) && ( - ((axis_id == app->firstAxis) && (!inverse)) - || ((axis_id == app->lastAxis) && (inverse) && (!app->configuration.performConvolution) && (!app->configuration.inverseReturnToInputBuffer))) - ) { - if (axis->specializationConstants.performBufferSetUpdate) { - uint64_t bufferId = 0; - uint64_t offset = j; - if (app->configuration.inputBufferSize) { - for (uint64_t l = 0; l < app->configuration.inputBufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) { - bufferId++; - offset -= 
(uint64_t)ceil(app->configuration.inputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); - } - else { - l = app->configuration.inputBufferNum; - } - - } - } - axis->inputBuffer = app->configuration.inputBuffer; -#if(VKFFT_BACKEND==0) - descriptorBufferInfo.buffer = app->configuration.inputBuffer[bufferId]; - descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize * storageComplexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize * storageComplexSize); -#endif - } - if (axis->specializationConstants.performOffsetUpdate) { - axis->specializationConstants.inputOffset = app->configuration.inputBufferOffset; - } - } - else { - if ((axis_upload_id == 0) && (app->configuration.numberKernels > 1) && (inverse) && (!app->configuration.performConvolution)) { - if (axis->specializationConstants.performBufferSetUpdate) { - uint64_t bufferId = 0; - uint64_t offset = j; - if (app->configuration.outputBufferSize) { - for (uint64_t l = 0; l < app->configuration.outputBufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) { - bufferId++; - offset -= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); - } - else { - l = app->configuration.outputBufferNum; - } - - } - } - axis->inputBuffer = app->configuration.outputBuffer; -#if(VKFFT_BACKEND==0) - descriptorBufferInfo.buffer = app->configuration.outputBuffer[bufferId]; - descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize * storageComplexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize * storageComplexSize); -#endif - } - if (axis->specializationConstants.performOffsetUpdate) { - axis->specializationConstants.inputOffset = 
app->configuration.outputBufferOffset; - } - } - else { - if (axis->specializationConstants.performBufferSetUpdate) { - uint64_t bufferId = 0; - uint64_t offset = j; - if (app->configuration.bufferSize) { - for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize))) { - bufferId++; - offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); - } - else { - l = app->configuration.bufferNum; - } - - } - } - axis->inputBuffer = app->configuration.buffer; -#if(VKFFT_BACKEND==0) - descriptorBufferInfo.buffer = app->configuration.buffer[bufferId]; - descriptorBufferInfo.range = (axis->specializationConstants.inputBufferBlockSize * storageComplexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.inputBufferBlockSize * storageComplexSize); -#endif - } - if (axis->specializationConstants.performOffsetUpdate) { - axis->specializationConstants.inputOffset = app->configuration.bufferOffset; - } - } - } - } - else { - if (((axis_upload_id == 0) && (!app->useBluesteinFFT[axis_id]) && (app->configuration.isOutputFormatted && ( - ((axis_id == app->firstAxis) && (inverse)) - || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution)) - || ((axis_id == app->firstAxis) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1))) - )) || - ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->useBluesteinFFT[axis_id]) && (axis->specializationConstants.reverseBluesteinMultiUpload || (FFTPlan->numAxisUploads[axis_id] == 1)) && (app->configuration.isOutputFormatted && ( - ((axis_id == app->firstAxis) && (inverse)) - || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution))) - )) || - ((app->configuration.numberKernels > 1) && ( - 
(inverse) - || (axis_id == app->lastAxis))) - ) { - if (axis->specializationConstants.performBufferSetUpdate) { - uint64_t bufferId = 0; - uint64_t offset = j; - if (app->configuration.outputBufferSize) { - for (uint64_t l = 0; l < app->configuration.outputBufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) { - bufferId++; - offset -= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); - } - else { - l = app->configuration.outputBufferNum; - } - - } - } - axis->inputBuffer = app->configuration.outputBuffer; -#if(VKFFT_BACKEND==0) - descriptorBufferInfo.buffer = app->configuration.outputBuffer[bufferId]; - descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); -#endif - } - if (axis->specializationConstants.performOffsetUpdate) { - axis->specializationConstants.inputOffset = app->configuration.outputBufferOffset; - } - } - else { - if (axis->specializationConstants.performBufferSetUpdate) { - uint64_t bufferId = 0; - uint64_t offset = j; - if (app->configuration.bufferSize) { - for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) { - bufferId++; - offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); - } - else { - l = app->configuration.bufferNum; - } - - } - } - axis->inputBuffer = app->configuration.buffer; -#if(VKFFT_BACKEND==0) - descriptorBufferInfo.buffer = app->configuration.buffer[bufferId]; - descriptorBufferInfo.range = 
(axis->specializationConstants.outputBufferBlockSize * storageComplexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); -#endif - } - if (axis->specializationConstants.performOffsetUpdate) { - axis->specializationConstants.inputOffset = app->configuration.bufferOffset; - } - } - } - } - if (i == 1) { - if (inverse) { - if ((axis_upload_id == 0) && (app->configuration.numberKernels > 1) && (inverse) && (!app->configuration.performConvolution)) { - if (axis->specializationConstants.performBufferSetUpdate) { - uint64_t bufferId = 0; - uint64_t offset = j; - if (app->configuration.outputBufferSize) { - for (uint64_t l = 0; l < app->configuration.outputBufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) { - bufferId++; - offset -= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); - } - else { - l = app->configuration.outputBufferNum; - } - - } - } - axis->outputBuffer = app->configuration.outputBuffer; -#if(VKFFT_BACKEND==0) - descriptorBufferInfo.buffer = app->configuration.outputBuffer[bufferId]; - descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); -#endif - } - if (axis->specializationConstants.performOffsetUpdate) { - axis->specializationConstants.outputOffset = app->configuration.outputBufferOffset; - } - } - else { - uint64_t bufferId = 0; - uint64_t offset = j; - if (axis->specializationConstants.reorderFourStep == 1) { - if (axis->specializationConstants.performBufferSetUpdate) { - if (app->configuration.tempBufferSize) { - for (uint64_t l = 0; l < app->configuration.tempBufferNum; ++l) { - if (offset >= 
(uint64_t)ceil(app->configuration.tempBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) { - bufferId++; - offset -= (uint64_t)ceil(app->configuration.tempBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); - } - else { - l = app->configuration.tempBufferNum; - } - - } - } - axis->outputBuffer = app->configuration.tempBuffer; -#if(VKFFT_BACKEND==0) - descriptorBufferInfo.buffer = app->configuration.tempBuffer[bufferId]; - descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); -#endif - } - if (axis->specializationConstants.performOffsetUpdate) { - axis->specializationConstants.outputOffset = app->configuration.tempBufferOffset; - } - } - else { - if (axis->specializationConstants.performBufferSetUpdate) { - if (app->configuration.bufferSize) { - for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) { - bufferId++; - offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); - } - else { - l = app->configuration.bufferNum; - } - - } - } - axis->outputBuffer = app->configuration.buffer; -#if(VKFFT_BACKEND==0) - descriptorBufferInfo.buffer = app->configuration.buffer[bufferId]; - descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); -#endif - } - if (axis->specializationConstants.performOffsetUpdate) { - axis->specializationConstants.outputOffset = app->configuration.bufferOffset; - } - } - } - } - 
else { - if (((axis_upload_id == 0) && (!app->useBluesteinFFT[axis_id]) && (app->configuration.isOutputFormatted && ( - ((axis_id == app->firstAxis) && (inverse)) - || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution)) - || ((axis_id == app->firstAxis) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1))) - )) || - ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->useBluesteinFFT[axis_id]) && (axis->specializationConstants.reverseBluesteinMultiUpload || (FFTPlan->numAxisUploads[axis_id] == 1)) && (app->configuration.isOutputFormatted && ( - ((axis_id == app->firstAxis) && (inverse)) - || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution))) - )) || - ((app->configuration.numberKernels > 1) && ( - (inverse) - || (axis_id == app->lastAxis))) - ) { - if (axis->specializationConstants.performBufferSetUpdate) { - uint64_t bufferId = 0; - uint64_t offset = j; - if (app->configuration.outputBufferSize) { - for (uint64_t l = 0; l < app->configuration.outputBufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) { - bufferId++; - offset -= (uint64_t)ceil(app->configuration.outputBufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); - } - else { - l = app->configuration.outputBufferNum; - } - - } - } - axis->outputBuffer = app->configuration.outputBuffer; -#if(VKFFT_BACKEND==0) - descriptorBufferInfo.buffer = app->configuration.outputBuffer[bufferId]; - descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); -#endif - } - if (axis->specializationConstants.performOffsetUpdate) { - axis->specializationConstants.outputOffset = 
app->configuration.outputBufferOffset; - } - } - else { - if (axis->specializationConstants.performBufferSetUpdate) { - uint64_t bufferId = 0; - uint64_t offset = j; - if (app->configuration.bufferSize) { - for (uint64_t l = 0; l < app->configuration.bufferNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) { - bufferId++; - offset -= (uint64_t)ceil(app->configuration.bufferSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); - } - else { - l = app->configuration.bufferNum; - } - - } - } - axis->outputBuffer = app->configuration.buffer; -#if(VKFFT_BACKEND==0) - descriptorBufferInfo.buffer = app->configuration.buffer[bufferId]; - descriptorBufferInfo.range = (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.outputBufferBlockSize * storageComplexSize); -#endif - } - if (axis->specializationConstants.performOffsetUpdate) { - axis->specializationConstants.outputOffset = app->configuration.bufferOffset; - } - } - } - } - if ((i == 2) && (app->configuration.performConvolution)) { - if (axis->specializationConstants.performBufferSetUpdate) { - uint64_t bufferId = 0; - uint64_t offset = j; - if (app->configuration.kernelSize) { - for (uint64_t l = 0; l < app->configuration.kernelNum; ++l) { - if (offset >= (uint64_t)ceil(app->configuration.kernelSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize))) { - bufferId++; - offset -= (uint64_t)ceil(app->configuration.kernelSize[l] / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); - } - else { - l = app->configuration.kernelNum; - } - - } - } -#if(VKFFT_BACKEND==0) - descriptorBufferInfo.buffer = app->configuration.kernel[bufferId]; - descriptorBufferInfo.range = (axis->specializationConstants.kernelBlockSize 
* storageComplexSize); - descriptorBufferInfo.offset = offset * (axis->specializationConstants.kernelBlockSize * storageComplexSize); -#endif - } - if (axis->specializationConstants.performOffsetUpdate) { - axis->specializationConstants.kernelOffset = app->configuration.kernelOffset; - } - } - if ((i == axis->numBindings - 1) && (app->configuration.useLUT)) { -#if(VKFFT_BACKEND==0) - if (axis->specializationConstants.performBufferSetUpdate) { - descriptorBufferInfo.buffer = axis->bufferLUT; - descriptorBufferInfo.offset = 0; - descriptorBufferInfo.range = axis->bufferLUTSize; - } -#endif - } -#if(VKFFT_BACKEND==0) - if (axis->specializationConstants.performBufferSetUpdate) { - VkWriteDescriptorSet writeDescriptorSet = { VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET }; - writeDescriptorSet.dstSet = axis->descriptorSet; - writeDescriptorSet.dstBinding = (uint32_t)i; - writeDescriptorSet.dstArrayElement = (uint32_t)j; - writeDescriptorSet.descriptorType = descriptorType; - writeDescriptorSet.descriptorCount = 1; - writeDescriptorSet.pBufferInfo = &descriptorBufferInfo; - vkUpdateDescriptorSets(app->configuration.device[0], 1, &writeDescriptorSet, 0, 0); - } -#endif - } - } - } - if (axis->specializationConstants.performBufferSetUpdate) { - axis->specializationConstants.performBufferSetUpdate = 0; - } - if (axis->specializationConstants.performOffsetUpdate) { - axis->specializationConstants.performOffsetUpdate = 0; - } - return VKFFT_SUCCESS; -} -static inline VkFFTResult VkFFTPlanR2CMultiUploadDecomposition(VkFFTApplication* app, VkFFTPlan* FFTPlan, uint64_t inverse) { - //get radix stages - VkFFTResult resFFT = VKFFT_SUCCESS; -#if(VKFFT_BACKEND==0) - VkResult res = VK_SUCCESS; -#elif(VKFFT_BACKEND==1) - cudaError_t res = cudaSuccess; -#elif(VKFFT_BACKEND==2) - hipError_t res = hipSuccess; -#elif(VKFFT_BACKEND==3) - cl_int res = CL_SUCCESS; -#elif(VKFFT_BACKEND==4) - ze_result_t res = ZE_RESULT_SUCCESS; -#endif - VkFFTAxis* axis = &FFTPlan->R2Cdecomposition; - 
axis->specializationConstants.warpSize = app->configuration.warpSize; - axis->specializationConstants.numSharedBanks = app->configuration.numSharedBanks; - axis->specializationConstants.useUint64 = app->configuration.useUint64; - axis->specializationConstants.disableSetLocale = app->configuration.disableSetLocale; - - axis->specializationConstants.numAxisUploads = FFTPlan->numAxisUploads[0]; - axis->specializationConstants.reorderFourStep = ((FFTPlan->numAxisUploads[0] > 1) && (!app->useBluesteinFFT[0])) ? app->configuration.reorderFourStep : 0; - uint64_t complexSize; - if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) - complexSize = (2 * sizeof(double)); - else - if (app->configuration.halfPrecision) - complexSize = (2 * sizeof(float)); - else - complexSize = (2 * sizeof(float)); - axis->specializationConstants.complexSize = complexSize; - axis->specializationConstants.supportAxis = 0; - axis->specializationConstants.symmetricKernel = app->configuration.symmetricKernel; - axis->specializationConstants.conjugateConvolution = app->configuration.conjugateConvolution; - axis->specializationConstants.crossPowerSpectrumNormalization = app->configuration.crossPowerSpectrumNormalization; - axis->specializationConstants.fft_dim_full = app->configuration.size[0]; - axis->specializationConstants.dispatchZactualFFTSize = 1; - //allocate LUT - if (app->configuration.useLUT) { - double double_PI = 3.1415926535897932384626433832795; - if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { - axis->bufferLUTSize = (app->configuration.size[0] / 2) * 2 * sizeof(double); - double* tempLUT = (double*)malloc(axis->bufferLUTSize); - if (!tempLUT) { - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - for (uint64_t i = 0; i < app->configuration.size[0] / 2; i++) { - double angle = double_PI * i / (app->configuration.size[0] / 2); - tempLUT[2 * i] = (double)cos(angle); - tempLUT[2 * i + 1] = 
(double)sin(angle); - } - axis->referenceLUT = 0; - if ((!inverse) && (!app->configuration.makeForwardPlanOnly)) { - axis->bufferLUT = app->localFFTPlan_inverse->R2Cdecomposition.bufferLUT; -#if(VKFFT_BACKEND==0) - axis->bufferLUTDeviceMemory = app->localFFTPlan_inverse->R2Cdecomposition.bufferLUTDeviceMemory; -#endif - axis->bufferLUTSize = app->localFFTPlan_inverse->R2Cdecomposition.bufferLUTSize; - axis->referenceLUT = 1; - } - else { -#if(VKFFT_BACKEND==0) - resFFT = allocateFFTBuffer(app, &axis->bufferLUT, &axis->bufferLUTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, axis->bufferLUTSize); - if (resFFT != VKFFT_SUCCESS) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return resFFT; - } - resFFT = transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); - if (resFFT != VKFFT_SUCCESS) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return resFFT; - } -#elif(VKFFT_BACKEND==1) - res = cudaMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize); - if (res != cudaSuccess) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } - res = cudaMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, cudaMemcpyHostToDevice); - if (res != cudaSuccess) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } -#elif(VKFFT_BACKEND==2) - res = hipMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize); - if (res != hipSuccess) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } - res = hipMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, hipMemcpyHostToDevice); - if (res != hipSuccess) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } -#elif(VKFFT_BACKEND==3) - axis->bufferLUT = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
axis->bufferLUTSize, tempLUT, &res); - if (res != CL_SUCCESS) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } -#elif(VKFFT_BACKEND==4) - ze_device_mem_alloc_desc_t device_desc = {}; - device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; - res = zeMemAllocDevice(app->configuration.context[0], &device_desc, axis->bufferLUTSize, sizeof(float), app->configuration.device[0], &axis->bufferLUT); - if (res != ZE_RESULT_SUCCESS) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } - ze_command_queue_desc_t commandQueueCopyDesc = { - ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, - 0, - app->configuration.commandQueueID, - 0, // index - 0, // flags - ZE_COMMAND_QUEUE_MODE_DEFAULT, - ZE_COMMAND_QUEUE_PRIORITY_NORMAL - }; - ze_command_list_handle_t copyCommandList; - res = zeCommandListCreateImmediate(app->configuration.context[0], app->configuration.device[0], &commandQueueCopyDesc, ©CommandList); - if (res != ZE_RESULT_SUCCESS) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; - } - res = zeCommandListAppendMemoryCopy(copyCommandList, axis->bufferLUT, tempLUT, axis->bufferLUTSize, 0, 0, 0); - if (res != ZE_RESULT_SUCCESS) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_COPY; - } - res = zeCommandQueueSynchronize(app->configuration.commandQueue[0], UINT32_MAX); - if (res != ZE_RESULT_SUCCESS) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; - } -#endif - free(tempLUT); - tempLUT = 0; - } - } - else { - axis->bufferLUTSize = (app->configuration.size[0] / 2) * 2 * sizeof(float); - float* tempLUT = (float*)malloc(axis->bufferLUTSize); - if (!tempLUT) { - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - for (uint64_t i = 0; i < app->configuration.size[0] / 2; i++) { - double angle = double_PI * i / (app->configuration.size[0] / 2); - 
tempLUT[2 * i] = (float)cos(angle); - tempLUT[2 * i + 1] = (float)sin(angle); - } - axis->referenceLUT = 0; - if ((!inverse) && (!app->configuration.makeForwardPlanOnly)) { - axis->bufferLUT = app->localFFTPlan_inverse->R2Cdecomposition.bufferLUT; -#if(VKFFT_BACKEND==0) - axis->bufferLUTDeviceMemory = app->localFFTPlan_inverse->R2Cdecomposition.bufferLUTDeviceMemory; -#endif - axis->bufferLUTSize = app->localFFTPlan_inverse->R2Cdecomposition.bufferLUTSize; - axis->referenceLUT = 1; - } - else { -#if(VKFFT_BACKEND==0) - resFFT = allocateFFTBuffer(app, &axis->bufferLUT, &axis->bufferLUTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, axis->bufferLUTSize); - if (resFFT != VKFFT_SUCCESS) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return resFFT; - } - resFFT = transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); - if (resFFT != VKFFT_SUCCESS) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return resFFT; - } -#elif(VKFFT_BACKEND==1) - res = cudaMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize); - if (res != cudaSuccess) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } - res = cudaMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, cudaMemcpyHostToDevice); - if (res != cudaSuccess) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } -#elif(VKFFT_BACKEND==2) - res = hipMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize); - if (res != hipSuccess) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } - res = hipMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, hipMemcpyHostToDevice); - if (res != hipSuccess) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } -#elif(VKFFT_BACKEND==3) - axis->bufferLUT = 
clCreateBuffer(app->configuration.context[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, axis->bufferLUTSize, tempLUT, &res); - if (res != CL_SUCCESS) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } -#elif(VKFFT_BACKEND==4) - ze_device_mem_alloc_desc_t device_desc = {}; - device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; - res = zeMemAllocDevice(app->configuration.context[0], &device_desc, axis->bufferLUTSize, sizeof(float), app->configuration.device[0], &axis->bufferLUT); - if (res != ZE_RESULT_SUCCESS) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } - ze_command_queue_desc_t commandQueueCopyDesc = { - ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, - 0, - app->configuration.commandQueueID, - 0, // index - 0, // flags - ZE_COMMAND_QUEUE_MODE_DEFAULT, - ZE_COMMAND_QUEUE_PRIORITY_NORMAL - }; - ze_command_list_handle_t copyCommandList; - res = zeCommandListCreateImmediate(app->configuration.context[0], app->configuration.device[0], &commandQueueCopyDesc, ©CommandList); - if (res != ZE_RESULT_SUCCESS) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; - } - res = zeCommandListAppendMemoryCopy(copyCommandList, axis->bufferLUT, tempLUT, axis->bufferLUTSize, 0, 0, 0); - if (res != ZE_RESULT_SUCCESS) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_COPY; - } - res = zeCommandQueueSynchronize(app->configuration.commandQueue[0], UINT32_MAX); - if (res != ZE_RESULT_SUCCESS) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; - } -#endif - free(tempLUT); - tempLUT = 0; - } - } - } - //configure strides - uint64_t* axisStride = axis->specializationConstants.inputStride; - uint64_t* usedStride = 0; - if (app->useBluesteinFFT[0] && (FFTPlan->numAxisUploads[0] > 1)) { - if (inverse) - usedStride = FFTPlan->axes[0][FFTPlan->numAxisUploads[0] - 
1].specializationConstants.inputStride; - else - usedStride = FFTPlan->inverseBluesteinAxes[0][FFTPlan->numAxisUploads[0] - 1].specializationConstants.outputStride; - } - else { - if (inverse) - usedStride = FFTPlan->axes[0][FFTPlan->numAxisUploads[0] - 1].specializationConstants.inputStride; - else - usedStride = FFTPlan->axes[0][0].specializationConstants.outputStride; - } - axisStride[0] = usedStride[0]; - axisStride[1] = usedStride[1]; - axisStride[2] = usedStride[2]; - axisStride[3] = usedStride[3]; - axisStride[4] = usedStride[4]; - - axisStride = axis->specializationConstants.outputStride; - usedStride = axis->specializationConstants.inputStride; - - axisStride[0] = usedStride[0]; - axisStride[1] = usedStride[1]; - axisStride[2] = usedStride[2]; - axisStride[3] = usedStride[3]; - axisStride[4] = usedStride[4]; - - axis->specializationConstants.inverse = inverse; - - uint64_t storageComplexSize; - if (app->configuration.doublePrecision) - storageComplexSize = (2 * sizeof(double)); - else - if (app->configuration.halfPrecision) - storageComplexSize = (2 * 2); - else - storageComplexSize = (2 * sizeof(float)); - - uint64_t initPageSize = -1; - uint64_t locBufferNum = 1; - uint64_t locBufferSize = 0; - /*for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { - initPageSize += app->configuration.bufferSize[i]; - } - if (app->configuration.performConvolution) { - uint64_t initPageSizeKernel = 0; - for (uint64_t i = 0; i < app->configuration.kernelNum; i++) { - initPageSizeKernel += app->configuration.kernelSize[i]; - } - if (initPageSizeKernel > initPageSize) initPageSize = initPageSizeKernel; - } - if ((!((!app->configuration.reorderFourStep))) && (axis->specializationConstants.inputStride[1] * storageComplexSize > app->configuration.devicePageSize * 1024) && (app->configuration.devicePageSize > 0)) { - initPageSize = app->configuration.localPageSize * 1024; - }*/ - uint64_t axis_id = 0; - uint64_t axis_upload_id = 0; - - { - if (inverse) { - if 
((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->configuration.isInputFormatted) && (!axis->specializationConstants.reverseBluesteinMultiUpload) && ( - ((axis_id == app->firstAxis) && (!inverse)) - || ((axis_id == app->lastAxis) && (inverse) && (!app->configuration.performConvolution) && (!app->configuration.inverseReturnToInputBuffer))) - ) { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; - locBufferNum = app->configuration.inputBufferNum; - if (app->configuration.inputBufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.inputBufferSize[0] / (double)storageComplexSize); - for (uint64_t i = 0; i < app->configuration.inputBufferNum; i++) { - totalSize += app->configuration.inputBufferSize[i]; - if (app->configuration.inputBufferSize[i] < locPageSize) locPageSize = app->configuration.inputBufferSize[i]; - } - } - axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); - axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 
1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); - //if (axis->specializationConstants.inputBufferBlockNum == 1) axis->specializationConstants.inputBufferBlockSize = totalSize / storageComplexSize; - - } - else { - if ((axis_upload_id == 0) && (app->configuration.numberKernels > 1) && (inverse) && (!app->configuration.performConvolution)) { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; - locBufferNum = app->configuration.outputBufferNum; - if (app->configuration.outputBufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.outputBufferSize[0] / (double)storageComplexSize); - for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) { - totalSize += app->configuration.outputBufferSize[i]; - if (app->configuration.outputBufferSize[i] < locPageSize) locPageSize = app->configuration.outputBufferSize[i]; - } - } - axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); - axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); - //if (axis->specializationConstants.inputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize; - - } - else { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; - locBufferNum = app->configuration.bufferNum; - if (app->configuration.bufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)storageComplexSize); - for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { - totalSize += app->configuration.bufferSize[i]; - if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i]; - - } - } - axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? 
locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); - axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); - //if (axis->specializationConstants.inputBufferBlockNum == 1) axis->specializationConstants.inputBufferBlockSize = totalSize / storageComplexSize; - - } - } - } - else { - if (((axis_upload_id == 0) && (!app->useBluesteinFFT[axis_id]) && (app->configuration.isOutputFormatted && ( - ((axis_id == app->firstAxis) && (inverse)) - || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution)) - || ((axis_id == app->firstAxis) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1))) - )) || - ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->useBluesteinFFT[axis_id]) && (axis->specializationConstants.reverseBluesteinMultiUpload || (FFTPlan->numAxisUploads[axis_id] == 1)) && (app->configuration.isOutputFormatted && ( - ((axis_id == app->firstAxis) && (inverse)) - || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution))) - )) || - ((app->configuration.numberKernels > 1) && ( - (inverse) - || (axis_id == app->lastAxis))) - ) { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; - locBufferNum = app->configuration.outputBufferNum; - if (app->configuration.outputBufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.outputBufferSize[0] / (double)storageComplexSize); - for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) { - totalSize += app->configuration.outputBufferSize[i]; - if (app->configuration.outputBufferSize[i] < locPageSize) locPageSize = app->configuration.outputBufferSize[i]; - } - } - axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? 
locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); - axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); - //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize; - - } - else { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; - - locBufferNum = app->configuration.bufferNum; - if (app->configuration.bufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)storageComplexSize); - for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { - totalSize += app->configuration.bufferSize[i]; - if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i]; - } - } - axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); - axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 
1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); - //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize; - - } - } - } - initPageSize = -1; - locBufferNum = 1; - locBufferSize = -1; - { - if (inverse) { - if ((axis_upload_id == 0) && (app->configuration.numberKernels > 1) && (inverse) && (!app->configuration.performConvolution)) { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; - locBufferNum = app->configuration.outputBufferNum; - if (app->configuration.outputBufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.outputBufferSize[0] / (double)storageComplexSize); - for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) { - totalSize += app->configuration.outputBufferSize[i]; - if (app->configuration.outputBufferSize[i] < locPageSize) locPageSize = app->configuration.outputBufferSize[i]; - } - } - axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); - axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 
1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); - //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize; - - } - else { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; - locBufferNum = app->configuration.bufferNum; - if (app->configuration.bufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)storageComplexSize); - for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { - totalSize += app->configuration.bufferSize[i]; - if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i]; - - } - } - axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); - axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); - //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize; - - } - } - else { - if (((axis_upload_id == 0) && (!app->useBluesteinFFT[axis_id]) && (app->configuration.isOutputFormatted && ( - ((axis_id == app->firstAxis) && (inverse)) - || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution)) - || ((axis_id == app->firstAxis) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1))) - )) || - ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->useBluesteinFFT[axis_id]) && (axis->specializationConstants.reverseBluesteinMultiUpload || (FFTPlan->numAxisUploads[axis_id] == 1)) && (app->configuration.isOutputFormatted && ( - ((axis_id == app->firstAxis) && (inverse)) - || ((axis_id == app->lastAxis) && (!inverse) && 
(!app->configuration.performConvolution))) - )) || - ((app->configuration.numberKernels > 1) && ( - (inverse) - || (axis_id == app->lastAxis))) - ) { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; - locBufferNum = app->configuration.outputBufferNum; - if (app->configuration.outputBufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.outputBufferSize[0] / (double)storageComplexSize); - for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) { - totalSize += app->configuration.outputBufferSize[i]; - if (app->configuration.outputBufferSize[i] < locPageSize) locPageSize = app->configuration.outputBufferSize[i]; - } - } - axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); - axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); - //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize; - - } - else { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; - - locBufferNum = app->configuration.bufferNum; - if (app->configuration.bufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)storageComplexSize); - for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { - totalSize += app->configuration.bufferSize[i]; - if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i]; - } - } - axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); - axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 
1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); - //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize; - - } - } - } - - if (axis->specializationConstants.inputBufferBlockNum == 0) axis->specializationConstants.inputBufferBlockNum = 1; - if (axis->specializationConstants.outputBufferBlockNum == 0) axis->specializationConstants.outputBufferBlockNum = 1; - if (app->configuration.performConvolution) { - //need fixing (not used now) - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; - if (app->configuration.kernelSize) { - for (uint64_t i = 0; i < app->configuration.kernelNum; i++) { - totalSize += app->configuration.kernelSize[i]; - if (app->configuration.kernelSize[i] < locPageSize) locPageSize = app->configuration.kernelSize[i]; - } - } - axis->specializationConstants.kernelBlockSize = (uint64_t)ceil(locPageSize / (double)storageComplexSize); - axis->specializationConstants.kernelBlockNum = (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.kernelBlockSize * storageComplexSize)); - //if (axis->specializationConstants.kernelBlockNum == 1) axis->specializationConstants.inputBufferBlockSize = totalSize / storageComplexSize; - if (axis->specializationConstants.kernelBlockNum == 0) axis->specializationConstants.kernelBlockNum = 1; - } - else { - axis->specializationConstants.kernelBlockSize = 0; - axis->specializationConstants.kernelBlockNum = 0; - } - axis->numBindings = 2; - axis->specializationConstants.numBuffersBound[0] = axis->specializationConstants.inputBufferBlockNum; - axis->specializationConstants.numBuffersBound[1] = axis->specializationConstants.outputBufferBlockNum; - axis->specializationConstants.numBuffersBound[2] = 0; - axis->specializationConstants.numBuffersBound[3] = 0; - -#if(VKFFT_BACKEND==0) - VkDescriptorPoolSize descriptorPoolSize = { 
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER }; - descriptorPoolSize.descriptorCount = (uint32_t)(axis->specializationConstants.numBuffersBound[0] + axis->specializationConstants.numBuffersBound[1]); -#endif - if ((axis_id == 0) && (axis_upload_id == 0) && (app->configuration.FFTdim == 1) && (app->configuration.performConvolution)) { - axis->specializationConstants.numBuffersBound[axis->numBindings] = axis->specializationConstants.kernelBlockNum; -#if(VKFFT_BACKEND==0) - descriptorPoolSize.descriptorCount += (uint32_t)axis->specializationConstants.kernelBlockNum; -#endif - axis->numBindings++; - } - - if (app->configuration.useLUT) { - axis->specializationConstants.numBuffersBound[axis->numBindings] = 1; -#if(VKFFT_BACKEND==0) - descriptorPoolSize.descriptorCount++; -#endif - axis->numBindings++; - } -#if(VKFFT_BACKEND==0) - VkDescriptorPoolCreateInfo descriptorPoolCreateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO }; - descriptorPoolCreateInfo.poolSizeCount = 1; - descriptorPoolCreateInfo.pPoolSizes = &descriptorPoolSize; - descriptorPoolCreateInfo.maxSets = 1; - res = vkCreateDescriptorPool(app->configuration.device[0], &descriptorPoolCreateInfo, 0, &axis->descriptorPool); - if (res != VK_SUCCESS) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_CREATE_DESCRIPTOR_POOL; - } - const VkDescriptorType descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - VkDescriptorSetLayoutBinding* descriptorSetLayoutBindings; - descriptorSetLayoutBindings = (VkDescriptorSetLayoutBinding*)malloc(axis->numBindings * sizeof(VkDescriptorSetLayoutBinding)); - if (!descriptorSetLayoutBindings) { - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - for (uint64_t i = 0; i < axis->numBindings; ++i) { - descriptorSetLayoutBindings[i].binding = (uint32_t)i; - descriptorSetLayoutBindings[i].descriptorType = descriptorType; - descriptorSetLayoutBindings[i].descriptorCount = (uint32_t)axis->specializationConstants.numBuffersBound[i]; - descriptorSetLayoutBindings[i].stageFlags = 
VK_SHADER_STAGE_COMPUTE_BIT; - } - - VkDescriptorSetLayoutCreateInfo descriptorSetLayoutCreateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO }; - descriptorSetLayoutCreateInfo.bindingCount = (uint32_t)axis->numBindings; - descriptorSetLayoutCreateInfo.pBindings = descriptorSetLayoutBindings; - - res = vkCreateDescriptorSetLayout(app->configuration.device[0], &descriptorSetLayoutCreateInfo, 0, &axis->descriptorSetLayout); - if (res != VK_SUCCESS) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_CREATE_DESCRIPTOR_SET_LAYOUT; - } - free(descriptorSetLayoutBindings); - descriptorSetLayoutBindings = 0; - VkDescriptorSetAllocateInfo descriptorSetAllocateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO }; - descriptorSetAllocateInfo.descriptorPool = axis->descriptorPool; - descriptorSetAllocateInfo.descriptorSetCount = 1; - descriptorSetAllocateInfo.pSetLayouts = &axis->descriptorSetLayout; - res = vkAllocateDescriptorSets(app->configuration.device[0], &descriptorSetAllocateInfo, &axis->descriptorSet); - if (res != VK_SUCCESS) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_ALLOCATE_DESCRIPTOR_SETS; - } -#endif - if (app->configuration.specifyOffsetsAtLaunch) { - axis->specializationConstants.performPostCompilationInputOffset = 1; - axis->specializationConstants.performPostCompilationOutputOffset = 1; - if (app->configuration.performConvolution) - axis->specializationConstants.performPostCompilationKernelOffset = 1; - } - resFFT = VkFFTCheckUpdateBufferSet(app, axis, 1, 0); - if (resFFT != VKFFT_SUCCESS) { - deleteVkFFT(app); - return resFFT; - } - resFFT = VkFFTUpdateBufferSetR2CMultiUploadDecomposition(app, FFTPlan, axis, axis_id, axis_upload_id, inverse); - if (resFFT != VKFFT_SUCCESS) { - deleteVkFFT(app); - return resFFT; - } - { - axis->axisBlock[0] = 128; - if (axis->axisBlock[0] > app->configuration.maxThreadsNum) axis->axisBlock[0] = app->configuration.maxThreadsNum; - axis->axisBlock[1] = 1; - axis->axisBlock[2] = 1; - - uint64_t 
tempSize[3] = { (uint64_t)ceil((app->configuration.size[0] * app->configuration.size[1] * app->configuration.size[2]) / (double)(2 * axis->axisBlock[0])), 1, 1 }; - tempSize[2] *= app->configuration.numberKernels * app->configuration.numberBatches * app->configuration.coordinateFeatures; - if ((app->configuration.maxComputeWorkGroupCount[0] > app->configuration.maxComputeWorkGroupCount[1]) && (tempSize[1] > app->configuration.maxComputeWorkGroupCount[1]) && (tempSize[1] > tempSize[0]) && (tempSize[1] >= tempSize[2])) { - uint64_t temp_tempSize = tempSize[0]; - tempSize[0] = tempSize[1]; - tempSize[1] = temp_tempSize; - axis->specializationConstants.swapComputeWorkGroupID = 1; - } - else { - if ((app->configuration.maxComputeWorkGroupCount[0] > app->configuration.maxComputeWorkGroupCount[2]) && (tempSize[2] > app->configuration.maxComputeWorkGroupCount[2]) && (tempSize[2] > tempSize[0]) && (tempSize[2] >= tempSize[1])) { - uint64_t temp_tempSize = tempSize[0]; - tempSize[0] = tempSize[2]; - tempSize[2] = temp_tempSize; - axis->specializationConstants.swapComputeWorkGroupID = 2; - } - } - if (tempSize[0] > app->configuration.maxComputeWorkGroupCount[0]) axis->specializationConstants.performWorkGroupShift[0] = 1; - else axis->specializationConstants.performWorkGroupShift[0] = 0; - if (tempSize[1] > app->configuration.maxComputeWorkGroupCount[1]) axis->specializationConstants.performWorkGroupShift[1] = 1; - else axis->specializationConstants.performWorkGroupShift[1] = 0; - if (tempSize[2] > app->configuration.maxComputeWorkGroupCount[2]) axis->specializationConstants.performWorkGroupShift[2] = 1; - else axis->specializationConstants.performWorkGroupShift[2] = 0; - - axis->specializationConstants.localSize[0] = axis->axisBlock[0]; - axis->specializationConstants.localSize[1] = axis->axisBlock[1]; - axis->specializationConstants.localSize[2] = axis->axisBlock[2]; - - axis->specializationConstants.numCoordinates = (app->configuration.matrixConvolution > 1) ? 
1 : app->configuration.coordinateFeatures; - axis->specializationConstants.matrixConvolution = app->configuration.matrixConvolution; - axis->specializationConstants.size[0] = app->configuration.size[0]; - axis->specializationConstants.size[1] = app->configuration.size[1]; - axis->specializationConstants.size[2] = app->configuration.size[2]; - - axis->specializationConstants.numBatches = app->configuration.numberBatches; - if ((app->configuration.FFTdim == 1) && (app->configuration.size[1] == 1) && ((app->configuration.numberBatches == 1) && (app->actualNumBatches > 1)) && (!app->configuration.performConvolution) && (app->configuration.coordinateFeatures == 1)) { - axis->specializationConstants.numBatches = app->actualNumBatches; - } - - axis->specializationConstants.numKernels = app->configuration.numberKernels; - axis->specializationConstants.sharedMemSize = app->configuration.sharedMemorySize; - axis->specializationConstants.sharedMemSizePow2 = app->configuration.sharedMemorySizePow2; - axis->specializationConstants.normalize = app->configuration.normalize; - axis->specializationConstants.axis_id = 0; - axis->specializationConstants.axis_upload_id = 0; - - for (uint64_t i = 0; i < 3; i++) { - axis->specializationConstants.frequencyZeropadding = app->configuration.frequencyZeroPadding; - axis->specializationConstants.performZeropaddingFull[i] = app->configuration.performZeropadding[i]; // don't read if input is zeropadded (0 - off, 1 - on) - axis->specializationConstants.fft_zeropad_left_full[i] = app->configuration.fft_zeropad_left[i]; - axis->specializationConstants.fft_zeropad_right_full[i] = app->configuration.fft_zeropad_right[i]; - } - /*if ((inverse)) { - if ((app->configuration.frequencyZeroPadding) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1)) { - axis->specializationConstants.zeropad[0] = app->configuration.performZeropadding[axis_id]; - axis->specializationConstants.fft_zeropad_left_read[axis_id] = 
app->configuration.fft_zeropad_left[axis_id]; - axis->specializationConstants.fft_zeropad_right_read[axis_id] = app->configuration.fft_zeropad_right[axis_id]; - } - else - axis->specializationConstants.zeropad[0] = 0; - if ((!app->configuration.frequencyZeroPadding) && (axis_upload_id == 0)) { - axis->specializationConstants.zeropad[1] = app->configuration.performZeropadding[axis_id]; - axis->specializationConstants.fft_zeropad_left_write[axis_id] = app->configuration.fft_zeropad_left[axis_id]; - axis->specializationConstants.fft_zeropad_right_write[axis_id] = app->configuration.fft_zeropad_right[axis_id]; - } - else - axis->specializationConstants.zeropad[1] = 0; - } - else { - if ((!app->configuration.frequencyZeroPadding) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1)) { - axis->specializationConstants.zeropad[0] = app->configuration.performZeropadding[axis_id]; - axis->specializationConstants.fft_zeropad_left_read[axis_id] = app->configuration.fft_zeropad_left[axis_id]; - axis->specializationConstants.fft_zeropad_right_read[axis_id] = app->configuration.fft_zeropad_right[axis_id]; - } - else - axis->specializationConstants.zeropad[0] = 0; - if (((app->configuration.frequencyZeroPadding) && (axis_upload_id == 0)) || (((app->configuration.FFTdim - 1 == axis_id) && (axis_upload_id == 0) && (app->configuration.performConvolution)))) { - axis->specializationConstants.zeropad[1] = app->configuration.performZeropadding[axis_id]; - axis->specializationConstants.fft_zeropad_left_write[axis_id] = app->configuration.fft_zeropad_left[axis_id]; - axis->specializationConstants.fft_zeropad_right_write[axis_id] = app->configuration.fft_zeropad_right[axis_id]; - } - else - axis->specializationConstants.zeropad[1] = 0; - }*/ - if ((app->configuration.FFTdim - 1 == axis_id) && (axis_upload_id == 0) && (app->configuration.performConvolution)) { - axis->specializationConstants.convolutionStep = 1; - } - else - axis->specializationConstants.convolutionStep = 0; - char 
floatTypeInputMemory[10]; - char floatTypeOutputMemory[10]; - char floatTypeKernelMemory[10]; - char floatType[10]; - axis->specializationConstants.unroll = 1; - axis->specializationConstants.LUT = app->configuration.useLUT; - if (app->configuration.doublePrecision) { - sprintf(floatType, "double"); - sprintf(floatTypeInputMemory, "double"); - sprintf(floatTypeOutputMemory, "double"); - sprintf(floatTypeKernelMemory, "double"); - //axis->specializationConstants.unroll = 1; - } - else { - //axis->specializationConstants.unroll = 0; - if (app->configuration.halfPrecision) { - sprintf(floatType, "float"); - if (app->configuration.halfPrecisionMemoryOnly) { - //only out of place mode, input/output buffer must be different - sprintf(floatTypeInputMemory, "float"); - sprintf(floatTypeOutputMemory, "float"); - sprintf(floatTypeKernelMemory, "float"); - } - else { - sprintf(floatTypeInputMemory, "half"); - sprintf(floatTypeOutputMemory, "half"); - sprintf(floatTypeKernelMemory, "half"); - } - - } - else { - if (app->configuration.doublePrecisionFloatMemory) { - sprintf(floatType, "double"); - sprintf(floatTypeInputMemory, "float"); - sprintf(floatTypeOutputMemory, "float"); - sprintf(floatTypeKernelMemory, "float"); - } - else { - sprintf(floatType, "float"); - sprintf(floatTypeInputMemory, "float"); - sprintf(floatTypeOutputMemory, "float"); - sprintf(floatTypeKernelMemory, "float"); - } - } - } - char uintType[20] = ""; - if (!app->configuration.useUint64) { -#if(VKFFT_BACKEND==0) - sprintf(uintType, "uint"); -#elif(VKFFT_BACKEND==1) - sprintf(uintType, "unsigned int"); -#elif(VKFFT_BACKEND==2) - sprintf(uintType, "unsigned int"); -#elif(VKFFT_BACKEND==3) - sprintf(uintType, "unsigned int"); -#elif(VKFFT_BACKEND==4) - sprintf(uintType, "unsigned int"); -#endif - } - else { -#if(VKFFT_BACKEND==0) - sprintf(uintType, "uint64_t"); -#elif(VKFFT_BACKEND==1) - sprintf(uintType, "unsigned long long"); -#elif(VKFFT_BACKEND==2) - sprintf(uintType, "unsigned long long"); 
-#elif(VKFFT_BACKEND==3) - sprintf(uintType, "unsigned long"); -#elif(VKFFT_BACKEND==4) - sprintf(uintType, "unsigned long"); -#endif - } - { - axis->pushConstants.structSize = 0; - if (axis->specializationConstants.performWorkGroupShift[0]) { - axis->pushConstants.performWorkGroupShift[0] = 1; - axis->pushConstants.structSize += 1; - } - if (axis->specializationConstants.performWorkGroupShift[1]) { - axis->pushConstants.performWorkGroupShift[1] = 1; - axis->pushConstants.structSize += 1; - } - if (axis->specializationConstants.performWorkGroupShift[2]) { - axis->pushConstants.performWorkGroupShift[2] = 1; - axis->pushConstants.structSize += 1; - } - if (axis->specializationConstants.performPostCompilationInputOffset) { - axis->pushConstants.performPostCompilationInputOffset = 1; - axis->pushConstants.structSize += 1; - } - if (axis->specializationConstants.performPostCompilationOutputOffset) { - axis->pushConstants.performPostCompilationOutputOffset = 1; - axis->pushConstants.structSize += 1; - } - if (axis->specializationConstants.performPostCompilationKernelOffset) { - axis->pushConstants.performPostCompilationKernelOffset = 1; - axis->pushConstants.structSize += 1; - } - if (app->configuration.useUint64) - axis->pushConstants.structSize *= sizeof(uint64_t); - else - axis->pushConstants.structSize *= sizeof(uint32_t); - axis->specializationConstants.pushConstantsStructSize = axis->pushConstants.structSize; - } - //uint64_t LUT = app->configuration.useLUT; - uint64_t type = 0; - - axis->specializationConstants.maxCodeLength = app->configuration.maxCodeLength; - axis->specializationConstants.maxTempLength = app->configuration.maxTempLength; - axis->specializationConstants.code0 = (char*)malloc(sizeof(char) * app->configuration.maxCodeLength); - char* code0 = axis->specializationConstants.code0; - if (!code0) { - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - resFFT = shaderGenVkFFT_R2C_decomposition(code0, &axis->specializationConstants, floatType, 
floatTypeInputMemory, floatTypeOutputMemory, floatTypeKernelMemory, uintType, type); - freeShaderGenVkFFT(&axis->specializationConstants); - if (resFFT != VKFFT_SUCCESS) { - deleteVkFFT(app); - return resFFT; - } -#if(VKFFT_BACKEND==0) - uint32_t* code; - uint64_t codeSize; - if (app->configuration.loadApplicationFromString) { - char* localStrPointer = (char*)app->configuration.loadApplicationString + app->currentApplicationStringPos; - memcpy(&codeSize, localStrPointer, sizeof(uint64_t)); - code = (uint32_t*)malloc(codeSize); - if (!code) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - memcpy(code, localStrPointer + sizeof(uint64_t), codeSize); - app->currentApplicationStringPos += codeSize + sizeof(uint64_t); - } - else - { - glslang_resource_t default_resource = {}; - default_resource.max_lights = 32; - default_resource.max_clip_planes = 6; - default_resource.max_texture_units = 32; - default_resource.max_texture_coords = 32; - default_resource.max_vertex_attribs = 64; - default_resource.max_vertex_uniform_components = 4096; - default_resource.max_varying_floats = 64; - default_resource.max_vertex_texture_image_units = 32; - default_resource.max_combined_texture_image_units = 80; - default_resource.max_texture_image_units = 32; - default_resource.max_fragment_uniform_components = 4096; - default_resource.max_draw_buffers = 32; - default_resource.max_vertex_uniform_vectors = 128; - default_resource.max_varying_vectors = 8; - default_resource.max_fragment_uniform_vectors = 16; - default_resource.max_vertex_output_vectors = 16; - default_resource.max_fragment_input_vectors = 15; - default_resource.min_program_texel_offset = -8; - default_resource.max_program_texel_offset = 7; - default_resource.max_clip_distances = 8; - default_resource.max_compute_work_group_count_x = (int)app->configuration.maxComputeWorkGroupCount[0]; - default_resource.max_compute_work_group_count_y = 
(int)app->configuration.maxComputeWorkGroupCount[1]; - default_resource.max_compute_work_group_count_z = (int)app->configuration.maxComputeWorkGroupCount[2]; - default_resource.max_compute_work_group_size_x = (int)app->configuration.maxComputeWorkGroupSize[0]; - default_resource.max_compute_work_group_size_y = (int)app->configuration.maxComputeWorkGroupSize[1]; - default_resource.max_compute_work_group_size_z = (int)app->configuration.maxComputeWorkGroupSize[2]; - default_resource.max_compute_uniform_components = 1024; - default_resource.max_compute_texture_image_units = 16; - default_resource.max_compute_image_uniforms = 8; - default_resource.max_compute_atomic_counters = 8; - default_resource.max_compute_atomic_counter_buffers = 1; - default_resource.max_varying_components = 60; - default_resource.max_vertex_output_components = 64; - default_resource.max_geometry_input_components = 64; - default_resource.max_geometry_output_components = 128; - default_resource.max_fragment_input_components = 128; - default_resource.max_image_units = 8; - default_resource.max_combined_image_units_and_fragment_outputs = 8; - default_resource.max_combined_shader_output_resources = 8; - default_resource.max_image_samples = 0; - default_resource.max_vertex_image_uniforms = 0; - default_resource.max_tess_control_image_uniforms = 0; - default_resource.max_tess_evaluation_image_uniforms = 0; - default_resource.max_geometry_image_uniforms = 0; - default_resource.max_fragment_image_uniforms = 8; - default_resource.max_combined_image_uniforms = 8; - default_resource.max_geometry_texture_image_units = 16; - default_resource.max_geometry_output_vertices = 256; - default_resource.max_geometry_total_output_components = 1024; - default_resource.max_geometry_uniform_components = 1024; - default_resource.max_geometry_varying_components = 64; - default_resource.max_tess_control_input_components = 128; - default_resource.max_tess_control_output_components = 128; - 
default_resource.max_tess_control_texture_image_units = 16; - default_resource.max_tess_control_uniform_components = 1024; - default_resource.max_tess_control_total_output_components = 4096; - default_resource.max_tess_evaluation_input_components = 128; - default_resource.max_tess_evaluation_output_components = 128; - default_resource.max_tess_evaluation_texture_image_units = 16; - default_resource.max_tess_evaluation_uniform_components = 1024; - default_resource.max_tess_patch_components = 120; - default_resource.max_patch_vertices = 32; - default_resource.max_tess_gen_level = 64; - default_resource.max_viewports = 16; - default_resource.max_vertex_atomic_counters = 0; - default_resource.max_tess_control_atomic_counters = 0; - default_resource.max_tess_evaluation_atomic_counters = 0; - default_resource.max_geometry_atomic_counters = 0; - default_resource.max_fragment_atomic_counters = 8; - default_resource.max_combined_atomic_counters = 8; - default_resource.max_atomic_counter_bindings = 1; - default_resource.max_vertex_atomic_counter_buffers = 0; - default_resource.max_tess_control_atomic_counter_buffers = 0; - default_resource.max_tess_evaluation_atomic_counter_buffers = 0; - default_resource.max_geometry_atomic_counter_buffers = 0; - default_resource.max_fragment_atomic_counter_buffers = 1; - default_resource.max_combined_atomic_counter_buffers = 1; - default_resource.max_atomic_counter_buffer_size = 16384; - default_resource.max_transform_feedback_buffers = 4; - default_resource.max_transform_feedback_interleaved_components = 64; - default_resource.max_cull_distances = 8; - default_resource.max_combined_clip_and_cull_distances = 8; - default_resource.max_samples = 4; - default_resource.max_mesh_output_vertices_nv = 256; - default_resource.max_mesh_output_primitives_nv = 512; - default_resource.max_mesh_work_group_size_x_nv = 32; - default_resource.max_mesh_work_group_size_y_nv = 1; - default_resource.max_mesh_work_group_size_z_nv = 1; - 
default_resource.max_task_work_group_size_x_nv = 32; - default_resource.max_task_work_group_size_y_nv = 1; - default_resource.max_task_work_group_size_z_nv = 1; - default_resource.max_mesh_view_count_nv = 4; - - default_resource.limits.non_inductive_for_loops = 1; - default_resource.limits.while_loops = 1; - default_resource.limits.do_while_loops = 1; - default_resource.limits.general_uniform_indexing = 1; - default_resource.limits.general_attribute_matrix_vector_indexing = 1; - default_resource.limits.general_varying_indexing = 1; - default_resource.limits.general_sampler_indexing = 1; - default_resource.limits.general_variable_indexing = 1; - default_resource.limits.general_constant_matrix_vector_indexing = 1; - - glslang_target_client_version_t client_version = (app->configuration.halfPrecision) ? GLSLANG_TARGET_VULKAN_1_1 : GLSLANG_TARGET_VULKAN_1_0; - glslang_target_language_version_t target_language_version = (app->configuration.halfPrecision) ? GLSLANG_TARGET_SPV_1_3 : GLSLANG_TARGET_SPV_1_0; - glslang_input_t input = - { - GLSLANG_SOURCE_GLSL, - GLSLANG_STAGE_COMPUTE, - GLSLANG_CLIENT_VULKAN, - client_version, - GLSLANG_TARGET_SPV, - target_language_version, - code0, - 450, - GLSLANG_NO_PROFILE, - 1, - 0, - GLSLANG_MSG_DEFAULT_BIT, - (const glslang_resource_t*)&default_resource, - }; - //printf("%s\n", code0); - glslang_shader_t* shader = glslang_shader_create((const glslang_input_t *)&input); - const char* err; - if (!glslang_shader_preprocess(shader, &input)) - { - err = glslang_shader_get_info_log(shader); - printf("%s\n", code0); - printf("%s\nVkFFT shader type: %" PRIu64 "\n", err, type); - glslang_shader_delete(shader); - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_SHADER_PREPROCESS; - - } - - if (!glslang_shader_parse(shader, &input)) - { - err = glslang_shader_get_info_log(shader); - printf("%s\n", code0); - printf("%s\nVkFFT shader type: %" PRIu64 "\n", err, type); - glslang_shader_delete(shader); - free(code0); - 
code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_SHADER_PARSE; - - } - glslang_program_t* program = glslang_program_create(); - glslang_program_add_shader(program, shader); - if (!glslang_program_link(program, GLSLANG_MSG_SPV_RULES_BIT | GLSLANG_MSG_VULKAN_RULES_BIT)) - { - err = glslang_program_get_info_log(program); - printf("%s\n", code0); - printf("%s\nVkFFT shader type: %" PRIu64 "\n", err, type); - glslang_shader_delete(shader); - glslang_program_delete(program); - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_SHADER_LINK; - - } - - glslang_program_SPIRV_generate(program, input.stage); - - if (glslang_program_SPIRV_get_messages(program)) - { - printf("%s", glslang_program_SPIRV_get_messages(program)); - glslang_shader_delete(shader); - glslang_program_delete(program); - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_SPIRV_GENERATE; - } - - glslang_shader_delete(shader); - uint32_t* tempCode = glslang_program_SPIRV_get_ptr(program); - codeSize = glslang_program_SPIRV_get_size(program) * sizeof(uint32_t); - axis->binarySize = codeSize; - code = (uint32_t*)malloc(codeSize); - if (!code) { - free(code0); - code0 = 0; - glslang_program_delete(program); - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - axis->binary = code; - memcpy(code, tempCode, codeSize); - glslang_program_delete(program); - } - VkPipelineShaderStageCreateInfo pipelineShaderStageCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO }; - VkComputePipelineCreateInfo computePipelineCreateInfo = { VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO }; - pipelineShaderStageCreateInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT; - VkShaderModuleCreateInfo createInfo = { VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO }; - createInfo.pCode = code; - createInfo.codeSize = codeSize; - res = vkCreateShaderModule(app->configuration.device[0], &createInfo, 0, &pipelineShaderStageCreateInfo.module); - if (res != VK_SUCCESS) { - 
free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_CREATE_SHADER_MODULE; - } - VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO }; - pipelineLayoutCreateInfo.setLayoutCount = 1; - pipelineLayoutCreateInfo.pSetLayouts = &axis->descriptorSetLayout; - VkPushConstantRange pushConstantRange = { VK_SHADER_STAGE_COMPUTE_BIT }; - pushConstantRange.offset = 0; - pushConstantRange.size = (uint32_t)axis->pushConstants.structSize; - // Push constant ranges are part of the pipeline layout - if (axis->pushConstants.structSize) { - pipelineLayoutCreateInfo.pushConstantRangeCount = 1; - pipelineLayoutCreateInfo.pPushConstantRanges = &pushConstantRange; - } - res = vkCreatePipelineLayout(app->configuration.device[0], &pipelineLayoutCreateInfo, 0, &axis->pipelineLayout); - if (res != VK_SUCCESS) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_CREATE_PIPELINE_LAYOUT; - } - pipelineShaderStageCreateInfo.pName = "main"; - pipelineShaderStageCreateInfo.pSpecializationInfo = 0;// &specializationInfo; - computePipelineCreateInfo.stage = pipelineShaderStageCreateInfo; - computePipelineCreateInfo.layout = axis->pipelineLayout; - res = vkCreateComputePipelines(app->configuration.device[0], VK_NULL_HANDLE, 1, &computePipelineCreateInfo, 0, &axis->pipeline); - if (res != VK_SUCCESS) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_CREATE_PIPELINE; - } - vkDestroyShaderModule(app->configuration.device[0], pipelineShaderStageCreateInfo.module, 0); - if (!app->configuration.saveApplicationToString) { - free(code); - code = 0; - } -#elif(VKFFT_BACKEND==1) - char* code; - uint64_t codeSize; - if (app->configuration.loadApplicationFromString) { - char* localStrPointer = (char*)app->configuration.loadApplicationString + app->currentApplicationStringPos; - memcpy(&codeSize, localStrPointer, sizeof(uint64_t)); - code = (char*)malloc(codeSize); - if (!code) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return 
VKFFT_ERROR_MALLOC_FAILED; - } - memcpy(code, localStrPointer + sizeof(uint64_t), codeSize); - app->currentApplicationStringPos += codeSize + sizeof(uint64_t); - } - else { - nvrtcProgram prog; - nvrtcResult result = nvrtcCreateProgram(&prog, // prog - code0, // buffer - "VkFFT.cu", // name - 0, // numHeaders - 0, // headers - 0); // includeNames - //free(includeNames); - //free(headers); - if (result != NVRTC_SUCCESS) { - printf("nvrtcCreateProgram error: %s\n", nvrtcGetErrorString(result)); - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM; - } - char* opts[5]; - opts[0] = (char*)malloc(sizeof(char) * 50); - if (!opts[0]) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - sprintf(opts[0], "--gpu-architecture=sm_%" PRIu64 "%" PRIu64 "", app->configuration.computeCapabilityMajor, app->configuration.computeCapabilityMinor); - //result = nvrtcAddNameExpression(prog, "&consts"); - //if (result != NVRTC_SUCCESS) printf("1.5 error: %s\n", nvrtcGetErrorString(result)); - result = nvrtcCompileProgram(prog, // prog - 1, // numOptions - (const char* const*)opts); // options - free(opts[0]); - if (result != NVRTC_SUCCESS) { - printf("nvrtcCompileProgram error: %s\n", nvrtcGetErrorString(result)); - char* log = (char*)malloc(sizeof(char) * 4000000); - if (!log) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; - } - else { - nvrtcGetProgramLog(prog, log); - printf("%s\n", log); - free(log); - log = 0; - printf("%s\n", code0); - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; - } - } -#if (CUDA_VERSION >= 11030) - result = nvrtcGetCUBINSize(prog, &codeSize); -#else - result = nvrtcGetPTXSize(prog, &codeSize); -#endif - if (result != NVRTC_SUCCESS) { -#if (CUDA_VERSION >= 11030) - printf("nvrtcGetCUBINSize error: %s\n", nvrtcGetErrorString(result)); -#else - printf("nvrtcGetPTXSize error: %s\n", 
nvrtcGetErrorString(result)); -#endif - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_CODE_SIZE; - } - axis->binarySize = codeSize; - code = (char*)malloc(codeSize); - if (!code) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - axis->binary = code; -#if (CUDA_VERSION >= 11030) - result = nvrtcGetCUBIN(prog, code); -#else - result = nvrtcGetPTX(prog, code); -#endif - if (result != NVRTC_SUCCESS) { -#if (CUDA_VERSION >= 11030) - printf("nvrtcGetCUBIN error: %s\n", nvrtcGetErrorString(result)); -#else - printf("nvrtcGetPTX error: %s\n", nvrtcGetErrorString(result)); -#endif - free(code); - code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_CODE; - } - result = nvrtcDestroyProgram(&prog); - if (result != NVRTC_SUCCESS) { - printf("nvrtcDestroyProgram error: %s\n", nvrtcGetErrorString(result)); - free(code); - code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_DESTROY_PROGRAM; - } - } - CUresult result2 = cuModuleLoadDataEx(&axis->VkFFTModule, code, 0, 0, 0); - - if (result2 != CUDA_SUCCESS) { - printf("cuModuleLoadDataEx error: %d\n", result2); - free(code); - code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_LOAD_MODULE; - } - result2 = cuModuleGetFunction(&axis->VkFFTKernel, axis->VkFFTModule, "VkFFT_main_R2C"); - if (result2 != CUDA_SUCCESS) { - printf("cuModuleGetFunction error: %d\n", result2); - free(code); - code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_FUNCTION; - } - if (axis->specializationConstants.usedSharedMemory > app->configuration.sharedMemorySizeStatic) { - result2 = cuFuncSetAttribute(axis->VkFFTKernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, (int)axis->specializationConstants.usedSharedMemory); - if (result2 != CUDA_SUCCESS) { - printf("cuFuncSetAttribute error: %d\n", result2); - free(code); - code = 
0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_SET_DYNAMIC_SHARED_MEMORY; - } - } - if (axis->pushConstants.structSize) { - size_t size = axis->pushConstants.structSize; - result2 = cuModuleGetGlobal(&axis->consts_addr, &size, axis->VkFFTModule, "consts"); - if (result2 != CUDA_SUCCESS) { - printf("cuModuleGetGlobal error: %d\n", result2); - free(code); - code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_MODULE_GET_GLOBAL; - } - } - if (!app->configuration.saveApplicationToString) { - free(code); - code = 0; - } -#elif(VKFFT_BACKEND==2) - uint32_t* code; - uint64_t codeSize; - if (app->configuration.loadApplicationFromString) { - char* localStrPointer = (char*)app->configuration.loadApplicationString + app->currentApplicationStringPos; - memcpy(&codeSize, localStrPointer, sizeof(uint64_t)); - code = (uint32_t*)malloc(codeSize); - if (!code) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - memcpy(code, localStrPointer + sizeof(uint64_t), codeSize); - app->currentApplicationStringPos += codeSize + sizeof(uint64_t); - } - else - { - hiprtcProgram prog; - enum hiprtcResult result = hiprtcCreateProgram(&prog, // prog - code0, // buffer - "VkFFT.hip", // name - 0, // numHeaders - 0, // headers - 0); // includeNames - if (result != HIPRTC_SUCCESS) { - printf("hiprtcCreateProgram error: %s\n", hiprtcGetErrorString(result)); - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM; - } - if (axis->pushConstants.structSize) { - result = hiprtcAddNameExpression(prog, "&consts"); - if (result != HIPRTC_SUCCESS) { - printf("hiprtcAddNameExpression error: %s\n", hiprtcGetErrorString(result)); - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_ADD_NAME_EXPRESSION; - } - } - - result = hiprtcCompileProgram(prog, // prog - 0, // numOptions - 0); // options - if (result != HIPRTC_SUCCESS) { - 
printf("hiprtcCompileProgram error: %s\n", hiprtcGetErrorString(result)); - char* log = (char*)malloc(sizeof(char) * 100000); - if (!log) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; - } - else { - hiprtcGetProgramLog(prog, log); - printf("%s\n", log); - free(log); - log = 0; - printf("%s\n", code0); - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; - } - } - result = hiprtcGetCodeSize(prog, &codeSize); - if (result != HIPRTC_SUCCESS) { - printf("hiprtcGetCodeSize error: %s\n", hiprtcGetErrorString(result)); - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_CODE; - } - axis->binarySize = codeSize; - code = (uint32_t*)malloc(codeSize); - if (!code) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - axis->binary = code; - result = hiprtcGetCode(prog, (char*)code); - if (result != HIPRTC_SUCCESS) { - printf("hiprtcGetCode error: %s\n", hiprtcGetErrorString(result)); - free(code); - code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_CODE_SIZE; - } - //printf("%s\n", code); - // Destroy the program. 
- result = hiprtcDestroyProgram(&prog); - if (result != HIPRTC_SUCCESS) { - printf("hiprtcDestroyProgram error: %s\n", hiprtcGetErrorString(result)); - free(code); - code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_DESTROY_PROGRAM; - } - } - hipError_t result2 = hipModuleLoadDataEx(&axis->VkFFTModule, code, 0, 0, 0); - - if (result2 != hipSuccess) { - printf("hipModuleLoadDataEx error: %d\n", result2); - free(code); - code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_LOAD_MODULE; - } - result2 = hipModuleGetFunction(&axis->VkFFTKernel, axis->VkFFTModule, "VkFFT_main_R2C"); - if (result2 != hipSuccess) { - printf("hipModuleGetFunction error: %d\n", result2); - free(code); - code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_FUNCTION; - } - if (axis->specializationConstants.usedSharedMemory > app->configuration.sharedMemorySizeStatic) { - result2 = hipFuncSetAttribute(axis->VkFFTKernel, hipFuncAttributeMaxDynamicSharedMemorySize, (int)axis->specializationConstants.usedSharedMemory); - //result2 = hipFuncSetCacheConfig(axis->VkFFTKernel, hipFuncCachePreferShared); - if (result2 != hipSuccess) { - printf("hipFuncSetAttribute error: %d\n", result2); - free(code); - code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_SET_DYNAMIC_SHARED_MEMORY; - } - } - if (axis->pushConstants.structSize) { - size_t size = axis->pushConstants.structSize; - result2 = hipModuleGetGlobal(&axis->consts_addr, &size, axis->VkFFTModule, "consts"); - if (result2 != hipSuccess) { - printf("hipModuleGetGlobal error: %d\n", result2); - free(code); - code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_MODULE_GET_GLOBAL; - } - } - if (!app->configuration.saveApplicationToString) { - free(code); - code = 0; - } -#elif(VKFFT_BACKEND==3) - if (app->configuration.loadApplicationFromString) { - char* code; - 
uint64_t codeSize; - - char* localStrPointer = (char*)app->configuration.loadApplicationString + app->currentApplicationStringPos; - memcpy(&codeSize, localStrPointer, sizeof(uint64_t)); - code = (char*)malloc(codeSize); - if (!code) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - memcpy(code, localStrPointer + sizeof(uint64_t), codeSize); - app->currentApplicationStringPos += codeSize + sizeof(uint64_t); - - axis->program = clCreateProgramWithBinary(app->configuration.context[0], 1, app->configuration.device, &codeSize, (const unsigned char**)(&code), 0, &res); - if (res != CL_SUCCESS) { - free(code); - code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM; - } - free(code); - code = 0; - } - else { - size_t codelen = strlen(code0); - axis->program = clCreateProgramWithSource(app->configuration.context[0], 1, (const char**)&code0, &codelen, &res); - if (res != CL_SUCCESS) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM; - } - } - res = clBuildProgram(axis->program, 1, app->configuration.device, 0, 0, 0); - if (res != CL_SUCCESS) { - size_t log_size; - clGetProgramBuildInfo(axis->program, app->configuration.device[0], CL_PROGRAM_BUILD_LOG, 0, 0, &log_size); - char* log = (char*)malloc(log_size); - if (!log) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; - } - else { - clGetProgramBuildInfo(axis->program, app->configuration.device[0], CL_PROGRAM_BUILD_LOG, log_size, log, 0); - printf("%s\n", log); - free(log); - log = 0; - printf("%s\n", code0); - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; - } - } - if (app->configuration.saveApplicationToString) { - size_t codeSize; - res = clGetProgramInfo(axis->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &codeSize, NULL); - if (res != CL_SUCCESS) { - free(code0); - code0 = 0; - 
deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; - } - axis->binarySize = codeSize; - axis->binary = (char*)malloc(axis->binarySize); - if (!axis->binary) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - res = clGetProgramInfo(axis->program, CL_PROGRAM_BINARIES, axis->binarySize, &axis->binary, NULL); - if (res != CL_SUCCESS) { - if (app->configuration.saveApplicationToString) { - free(axis->binary); - axis->binary = 0; - } - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; - } - } - axis->kernel = clCreateKernel(axis->program, "VkFFT_main_R2C", &res); - if (res != CL_SUCCESS) { - if (app->configuration.saveApplicationToString) { - free(axis->binary); - axis->binary = 0; - } - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_CREATE_SHADER_MODULE; - } -#elif(VKFFT_BACKEND==4) - uint32_t* code; - uint64_t codeSize; - if (app->configuration.loadApplicationFromString) { - char* localStrPointer = (char*)app->configuration.loadApplicationString + app->currentApplicationStringPos; - memcpy(&codeSize, localStrPointer, sizeof(uint64_t)); - code = (uint32_t*)malloc(codeSize); - if (!code) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - memcpy(code, localStrPointer + sizeof(uint64_t), codeSize); - app->currentApplicationStringPos += codeSize + sizeof(uint64_t); - - const char* pBuildFlags = (app->configuration.useUint64) ? 
"-ze-opt-greater-than-4GB-buffer-required" : 0; - ze_module_desc_t moduleDesc = { - ZE_STRUCTURE_TYPE_MODULE_DESC, - 0, - ZE_MODULE_FORMAT_NATIVE, - codeSize, - (uint8_t*)code, - pBuildFlags, - 0 - }; - res = zeModuleCreate(app->configuration.context[0], app->configuration.device[0], &moduleDesc, &axis->VkFFTModule, 0); - if (res != ZE_RESULT_SUCCESS) { - free(code); - code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM; - } - free(code); - code = 0; - } - else { - size_t codelen = strlen(code0); - uint64_t successOpen = 0; - FILE* temp; - char fname_cl[100]; - char fname_bc[100]; - char fname_spv[100]; - int name_id = 0; - while (!successOpen) { - sprintf(fname_cl, "VkFFT_temp_cl_%d.cl", name_id); - temp = fopen(fname_cl, "r"); - if (temp != 0) { - fclose(temp); - name_id++; - } - else { - successOpen = 1; - sprintf(fname_bc, "VkFFT_temp_bc_%d.spv", name_id); - sprintf(fname_spv, "VkFFT_temp_cl_%d.spv", name_id); - } - } - temp = fopen(fname_cl, "w"); - fwrite(code0, 1, codelen, temp); - fclose(temp); - char system_call[500]; - sprintf(system_call, "clang -c -target spir64 -O0 -emit-llvm -o %s %s", fname_bc, fname_cl); - system(system_call); - sprintf(system_call, "llvm-spirv -o %s %s", fname_spv, fname_bc); - system(system_call); - temp = fopen(fname_spv, "rb"); - fseek(temp, 0L, SEEK_END); - uint64_t spv_size = ftell(temp); - rewind(temp); - - uint8_t* spv_binary = (uint8_t*)malloc(spv_size); - if (!spv_binary) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - fread(spv_binary, 1, spv_size, temp); - fclose(temp); - remove(fname_cl); - remove(fname_bc); - remove(fname_spv); - const char* pBuildFlags = (app->configuration.useUint64) ? 
"-ze-opt-greater-than-4GB-buffer-required" : 0; - - ze_module_desc_t moduleDesc = { - ZE_STRUCTURE_TYPE_MODULE_DESC, - 0, - ZE_MODULE_FORMAT_IL_SPIRV, - spv_size, - spv_binary, - pBuildFlags, - 0 - }; - res = zeModuleCreate(app->configuration.context[0], app->configuration.device[0], &moduleDesc, &axis->VkFFTModule, 0); - if (res != ZE_RESULT_SUCCESS) { - free(spv_binary); - spv_binary = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM; - } - free(spv_binary); - spv_binary = 0; - if (app->configuration.saveApplicationToString) { - size_t codeSize; - res = zeModuleGetNativeBinary(axis->VkFFTModule, &codeSize, 0); - if (res != ZE_RESULT_SUCCESS) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; - } - axis->binarySize = codeSize; - axis->binary = (char*)malloc(axis->binarySize); - if (!axis->binary) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - res = zeModuleGetNativeBinary(axis->VkFFTModule, &codeSize, (uint8_t*)axis->binary); - if (res != ZE_RESULT_SUCCESS) { - free(axis->binary); - axis->binary = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; - } - } - } - ze_kernel_desc_t kernelDesc = { - ZE_STRUCTURE_TYPE_KERNEL_DESC, - 0, - 0, // flags - "VkFFT_main_R2C" - }; - res = zeKernelCreate(axis->VkFFTModule, &kernelDesc, &axis->VkFFTKernel); - if (res != ZE_RESULT_SUCCESS) { - if (app->configuration.saveApplicationToString) { - free(axis->binary); - axis->binary = 0; - } - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_CREATE_SHADER_MODULE; - } -#endif - if (!app->configuration.keepShaderCode) { - free(code0); - code0 = 0; - axis->specializationConstants.code0 = 0; - } - } - return resFFT; -} -static inline VkFFTResult VkFFTPlanAxis(VkFFTApplication* app, VkFFTPlan* FFTPlan, uint64_t axis_id, uint64_t axis_upload_id, uint64_t inverse, uint64_t 
reverseBluesteinMultiUpload) { - //get radix stages - VkFFTResult resFFT = VKFFT_SUCCESS; -#if(VKFFT_BACKEND==0) - VkResult res = VK_SUCCESS; -#elif(VKFFT_BACKEND==1) - cudaError_t res = cudaSuccess; -#elif(VKFFT_BACKEND==2) - hipError_t res = hipSuccess; -#elif(VKFFT_BACKEND==3) - cl_int res = CL_SUCCESS; -#elif(VKFFT_BACKEND==4) - ze_result_t res = ZE_RESULT_SUCCESS; -#endif - VkFFTAxis* axis = (reverseBluesteinMultiUpload) ? &FFTPlan->inverseBluesteinAxes[axis_id][axis_upload_id] : &FFTPlan->axes[axis_id][axis_upload_id]; - - axis->specializationConstants.sourceFFTSize = app->configuration.size[axis_id]; - if ((app->configuration.FFTdim == 1) && (FFTPlan->actualFFTSizePerAxis[axis_id][1] == 1) && ((app->configuration.numberBatches > 1) || (app->actualNumBatches > 1)) && (!app->configuration.performConvolution) && (app->configuration.coordinateFeatures == 1)) { - if (app->configuration.numberBatches > 1) { - app->actualNumBatches = app->configuration.numberBatches; - app->configuration.numberBatches = 1; - } - FFTPlan->actualFFTSizePerAxis[axis_id][1] = app->actualNumBatches; - } - axis->specializationConstants.numBatches = app->configuration.numberBatches; - axis->specializationConstants.warpSize = app->configuration.warpSize; - axis->specializationConstants.numSharedBanks = app->configuration.numSharedBanks; - axis->specializationConstants.useUint64 = app->configuration.useUint64; - axis->specializationConstants.disableSetLocale = app->configuration.disableSetLocale; - - axis->specializationConstants.numAxisUploads = FFTPlan->numAxisUploads[axis_id]; - axis->specializationConstants.fixMinRaderPrimeMult = app->configuration.fixMinRaderPrimeMult; - axis->specializationConstants.fixMaxRaderPrimeMult = app->configuration.fixMaxRaderPrimeMult; - axis->specializationConstants.fixMinRaderPrimeFFT = app->configuration.fixMinRaderPrimeFFT; - axis->specializationConstants.fixMaxRaderPrimeFFT = app->configuration.fixMaxRaderPrimeFFT; - - 
axis->specializationConstants.raderUintLUT = (axis->specializationConstants.useRader) ? app->configuration.useRaderUintLUT : 0; - axis->specializationConstants.inline_rader_g_pow = (axis->specializationConstants.raderUintLUT) ? 2 : 1; - axis->specializationConstants.inline_rader_kernel = (app->configuration.useLUT) ? 0 : 1; - uint64_t complexSize; - if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) - complexSize = (2 * sizeof(double)); - else - if (app->configuration.halfPrecision) - complexSize = (2 * sizeof(float)); - else - complexSize = (2 * sizeof(float)); - axis->specializationConstants.complexSize = complexSize; - axis->specializationConstants.supportAxis = 0; - axis->specializationConstants.symmetricKernel = app->configuration.symmetricKernel; - axis->specializationConstants.conjugateConvolution = app->configuration.conjugateConvolution; - axis->specializationConstants.crossPowerSpectrumNormalization = app->configuration.crossPowerSpectrumNormalization; - - uint64_t allowedSharedMemory = app->configuration.sharedMemorySize; - uint64_t allowedSharedMemoryPow2 = app->configuration.sharedMemorySizePow2; - - if (axis->specializationConstants.useRaderMult) { - allowedSharedMemory -= (axis->specializationConstants.useRaderMult - 1) * complexSize; - allowedSharedMemoryPow2 -= (axis->specializationConstants.useRaderMult - 1) * complexSize; - } - - uint64_t maxSequenceLengthSharedMemory = allowedSharedMemory / complexSize; - uint64_t maxSequenceLengthSharedMemoryPow2 = allowedSharedMemoryPow2 / complexSize; - uint64_t maxSingleSizeStrided = (app->configuration.coalescedMemory > complexSize) ? allowedSharedMemory / (app->configuration.coalescedMemory) : allowedSharedMemory / complexSize; - uint64_t maxSingleSizeStridedPow2 = (app->configuration.coalescedMemory > complexSize) ? 
allowedSharedMemoryPow2 / (app->configuration.coalescedMemory) : allowedSharedMemoryPow2 / complexSize; - - axis->specializationConstants.stageStartSize = 1; - for (uint64_t i = 0; i < axis_upload_id; i++) - axis->specializationConstants.stageStartSize *= FFTPlan->axisSplit[axis_id][i]; - - - axis->specializationConstants.firstStageStartSize = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id] / FFTPlan->axisSplit[axis_id][FFTPlan->numAxisUploads[axis_id] - 1]; - axis->specializationConstants.dispatchZactualFFTSize = (axis_id < 2) ? FFTPlan->actualFFTSizePerAxis[axis_id][2] : FFTPlan->actualFFTSizePerAxis[axis_id][1]; - if (axis_id == 0) { - //configure radix stages - axis->specializationConstants.fft_dim_x = axis->specializationConstants.stageStartSize; - } - else { - axis->specializationConstants.fft_dim_x = FFTPlan->actualFFTSizePerAxis[axis_id][0]; - } - if (app->useBluesteinFFT[axis_id]) { - axis->specializationConstants.useBluesteinFFT = 1; - } - - if (app->configuration.performDCT == 3) { - axis->specializationConstants.actualInverse = inverse; - axis->specializationConstants.inverse = !inverse; - } - else { - if (app->configuration.performDCT == 4) { - axis->specializationConstants.actualInverse = inverse; - axis->specializationConstants.inverse = 1; - } - else { - axis->specializationConstants.actualInverse = inverse; - axis->specializationConstants.inverse = inverse; - } - } - if (app->useBluesteinFFT[axis_id]) { - axis->specializationConstants.actualInverse = inverse; - axis->specializationConstants.inverse = reverseBluesteinMultiUpload; - if (app->configuration.performDCT == 3) { - axis->specializationConstants.inverseBluestein = !inverse; - } - else { - if (app->configuration.performDCT == 4) { - axis->specializationConstants.inverseBluestein = 1; - } - else { - axis->specializationConstants.inverseBluestein = inverse; - } - } - } - axis->specializationConstants.reverseBluesteinMultiUpload = reverseBluesteinMultiUpload; - - 
axis->specializationConstants.reorderFourStep = ((FFTPlan->numAxisUploads[axis_id] > 1) && (!app->useBluesteinFFT[axis_id])) ? app->configuration.reorderFourStep : 0; - - if ((axis_id == 0) && ((FFTPlan->numAxisUploads[axis_id] == 1) || ((axis_upload_id == 0) && (!axis->specializationConstants.reorderFourStep)))) { - maxSequenceLengthSharedMemory *= axis->specializationConstants.registerBoost; - maxSequenceLengthSharedMemoryPow2 = (uint64_t)pow(2, (uint64_t)log2(maxSequenceLengthSharedMemory)); - } - else { - maxSingleSizeStrided *= axis->specializationConstants.registerBoost; - maxSingleSizeStridedPow2 = (uint64_t)pow(2, (uint64_t)log2(maxSingleSizeStrided)); - } - - axis->specializationConstants.performR2C = FFTPlan->actualPerformR2CPerAxis[axis_id]; - axis->specializationConstants.performR2CmultiUpload = FFTPlan->multiUploadR2C; - if (app->configuration.performDCT == 3) { - axis->specializationConstants.performDCT = 2; - } - else { - axis->specializationConstants.performDCT = app->configuration.performDCT; - } - if ((axis->specializationConstants.performR2CmultiUpload) && (app->configuration.size[0] % 2 != 0)) return VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH_R2C; - axis->specializationConstants.mergeSequencesR2C = ((axis->specializationConstants.fftDim < maxSequenceLengthSharedMemory) && ((FFTPlan->actualFFTSizePerAxis[axis_id][1] % 2) == 0) && ((FFTPlan->actualPerformR2CPerAxis[axis_id]) || (((app->configuration.performDCT == 3) || (app->configuration.performDCT == 2) || (app->configuration.performDCT == 1) || ((app->configuration.performDCT == 4) && ((app->configuration.size[axis_id] % 2) != 0))) && (axis_id == 0)))) ? 
(1 - app->configuration.disableMergeSequencesR2C) : 0; - //uint64_t passID = FFTPlan->numAxisUploads[axis_id] - 1 - axis_upload_id; - axis->specializationConstants.fft_dim_full = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; - if ((FFTPlan->numAxisUploads[axis_id] > 1) && (axis->specializationConstants.reorderFourStep || app->useBluesteinFFT[axis_id]) && (!app->configuration.userTempBuffer) && (app->configuration.allocateTempBuffer == 0)) { - app->configuration.allocateTempBuffer = 1; - -#if(VKFFT_BACKEND==0) - app->configuration.tempBuffer = (VkBuffer*)malloc(sizeof(VkBuffer)); - if (!app->configuration.tempBuffer) { - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - resFFT = allocateFFTBuffer(app, app->configuration.tempBuffer, &app->configuration.tempBufferDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, app->configuration.tempBufferSize[0]); - if (resFFT != VKFFT_SUCCESS) { - deleteVkFFT(app); - return resFFT; - } -#elif(VKFFT_BACKEND==1) - app->configuration.tempBuffer = (void**)malloc(sizeof(void*)); - if (!app->configuration.tempBuffer) { - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - res = cudaMalloc(app->configuration.tempBuffer, app->configuration.tempBufferSize[0]); - if (res != cudaSuccess) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } -#elif(VKFFT_BACKEND==2) - app->configuration.tempBuffer = (void**)malloc(sizeof(void*)); - if (!app->configuration.tempBuffer) { - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - res = hipMalloc(app->configuration.tempBuffer, app->configuration.tempBufferSize[0]); - if (res != hipSuccess) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } -#elif(VKFFT_BACKEND==3) - app->configuration.tempBuffer = (cl_mem*)malloc(sizeof(cl_mem)); - if (!app->configuration.tempBuffer) { - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - 
app->configuration.tempBuffer[0] = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_WRITE, app->configuration.tempBufferSize[0], 0, &res); - if (res != CL_SUCCESS) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } -#elif(VKFFT_BACKEND==4) - app->configuration.tempBuffer = (void**)malloc(sizeof(void*)); - if (!app->configuration.tempBuffer) { - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - ze_device_mem_alloc_desc_t device_desc = {}; - device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; - res = zeMemAllocDevice(app->configuration.context[0], &device_desc, app->configuration.tempBufferSize[0], sizeof(float), app->configuration.device[0], app->configuration.tempBuffer); - if (res != ZE_RESULT_SUCCESS) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } -#endif - } - //generate Rader Kernels - resFFT = VkFFTGenerateRaderFFTKernel(app, axis); - if (resFFT != VKFFT_SUCCESS) { - deleteVkFFT(app); - return resFFT; - } - //allocate LUT - if (app->configuration.useLUT) { - double double_PI = 3.1415926535897932384626433832795; - uint64_t dimMult = 1; - uint64_t maxStageSum = 0; - for (uint64_t i = 0; i < axis->specializationConstants.numStages; i++) { - if (i > 0) { - switch (axis->specializationConstants.stageRadix[i]) { - case 2: - maxStageSum += dimMult; - break; - case 3: - maxStageSum += dimMult * 2; - break; - case 4: - maxStageSum += dimMult * 2; - break; - case 5: - maxStageSum += dimMult * 4; - break; - case 6: - maxStageSum += dimMult * 5; - break; - case 7: - maxStageSum += dimMult * 6; - break; - case 8: - maxStageSum += dimMult * 3; - break; - case 9: - maxStageSum += dimMult * 8; - break; - case 10: - maxStageSum += dimMult * 9; - break; - case 11: - maxStageSum += dimMult * 10; - break; - case 12: - maxStageSum += dimMult * 11; - break; - case 13: - maxStageSum += dimMult * 12; - break; - case 14: - maxStageSum += dimMult * 13; - break; - case 15: - maxStageSum += dimMult * 14; - break; - case 16: 
- maxStageSum += dimMult * 4; - break; - case 32: - maxStageSum += dimMult * 5; - break; - default: - maxStageSum += dimMult * (axis->specializationConstants.stageRadix[i]); - break; - } - } - dimMult *= axis->specializationConstants.stageRadix[i]; - } - axis->specializationConstants.maxStageSumLUT = maxStageSum; - - dimMult = 1; - for (uint64_t k = 0; k < axis->specializationConstants.numRaderPrimes; k++) { - if (axis->specializationConstants.raderContainer[k].type == 0) { - axis->specializationConstants.raderContainer[k].RaderRadixOffsetLUT = maxStageSum; - for (uint64_t i = 0; i < axis->specializationConstants.raderContainer[k].numStages; i++) { - if (i > 0) { - switch (axis->specializationConstants.raderContainer[k].stageRadix[i]) { - case 2: - maxStageSum += dimMult; - break; - case 3: - maxStageSum += dimMult * 2; - break; - case 4: - maxStageSum += dimMult * 2; - break; - case 5: - maxStageSum += dimMult * 4; - break; - case 6: - maxStageSum += dimMult * 5; - break; - case 7: - maxStageSum += dimMult * 6; - break; - case 8: - maxStageSum += dimMult * 3; - break; - case 9: - maxStageSum += dimMult * 8; - break; - case 10: - maxStageSum += dimMult * 9; - break; - case 11: - maxStageSum += dimMult * 10; - break; - case 12: - maxStageSum += dimMult * 11; - break; - case 13: - maxStageSum += dimMult * 12; - break; - case 14: - maxStageSum += dimMult * 13; - break; - case 15: - maxStageSum += dimMult * 14; - break; - case 16: - maxStageSum += dimMult * 4; - break; - case 32: - maxStageSum += dimMult * 5; - break; - default: - maxStageSum += dimMult * (axis->specializationConstants.raderContainer[k].stageRadix[i]); - break; - } - } - dimMult *= axis->specializationConstants.raderContainer[k].stageRadix[i]; - } - axis->specializationConstants.maxStageSumLUT = maxStageSum; - dimMult = 1; - } - } - //iFFT LUT - dimMult = 1; - for (uint64_t k = 0; k < axis->specializationConstants.numRaderPrimes; k++) { - if (axis->specializationConstants.raderContainer[k].type == 0) { 
- axis->specializationConstants.raderContainer[k].RaderRadixOffsetLUTiFFT = maxStageSum; - for (int64_t i = axis->specializationConstants.raderContainer[k].numStages - 1; i >= 0; i--) { - if (i < (int64_t)axis->specializationConstants.raderContainer[k].numStages - 1) { - switch (axis->specializationConstants.raderContainer[k].stageRadix[i]) { - case 2: - maxStageSum += dimMult; - break; - case 3: - maxStageSum += dimMult * 2; - break; - case 4: - maxStageSum += dimMult * 2; - break; - case 5: - maxStageSum += dimMult * 4; - break; - case 6: - maxStageSum += dimMult * 5; - break; - case 7: - maxStageSum += dimMult * 6; - break; - case 8: - maxStageSum += dimMult * 3; - break; - case 9: - maxStageSum += dimMult * 8; - break; - case 10: - maxStageSum += dimMult * 9; - break; - case 11: - maxStageSum += dimMult * 10; - break; - case 12: - maxStageSum += dimMult * 11; - break; - case 13: - maxStageSum += dimMult * 12; - break; - case 14: - maxStageSum += dimMult * 13; - break; - case 15: - maxStageSum += dimMult * 14; - break; - case 16: - maxStageSum += dimMult * 4; - break; - case 32: - maxStageSum += dimMult * 5; - break; - default: - maxStageSum += dimMult * (axis->specializationConstants.raderContainer[k].stageRadix[i]); - break; - } - } - dimMult *= axis->specializationConstants.raderContainer[k].stageRadix[i]; - } - axis->specializationConstants.maxStageSumLUT = maxStageSum; - dimMult = 1; - } - } - - if (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) { - if (axis_upload_id > 0) { - if ((app->configuration.performDCT == 2) || (app->configuration.performDCT == 3)) { - axis->specializationConstants.startDCT3LUT = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim); - axis->bufferLUTSize = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim + (app->configuration.size[axis_id] / 2 + 2)) * 2 * sizeof(double); - } - else { - if 
((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) { - axis->specializationConstants.startDCT3LUT = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim); - axis->specializationConstants.startDCT4LUT = (axis->specializationConstants.startDCT3LUT + (app->configuration.size[axis_id] / 4 + 2)); - axis->bufferLUTSize = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim + (app->configuration.size[axis_id] / 4 + 2) + app->configuration.size[axis_id] / 2) * 2 * sizeof(double); - } - else - axis->bufferLUTSize = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim) * 2 * sizeof(double); - } - } - else { - if ((app->configuration.performDCT == 2) || (app->configuration.performDCT == 3)) { - axis->specializationConstants.startDCT3LUT = (maxStageSum); - axis->bufferLUTSize = (maxStageSum + (app->configuration.size[axis_id] / 2 + 2)) * 2 * sizeof(double); - } - else { - if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) { - axis->specializationConstants.startDCT3LUT = (maxStageSum); - axis->specializationConstants.startDCT4LUT = (axis->specializationConstants.startDCT3LUT + (app->configuration.size[axis_id] / 4 + 2)); - axis->bufferLUTSize = (maxStageSum + (app->configuration.size[axis_id] / 4 + 2) + app->configuration.size[axis_id] / 2) * 2 * sizeof(double); - - } - else - axis->bufferLUTSize = (maxStageSum) * 2 * sizeof(double); - } - } - if (axis->specializationConstants.useRader) { - for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { - if (!axis->specializationConstants.inline_rader_kernel) { - axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT = axis->bufferLUTSize / (2 * sizeof(double)); - axis->bufferLUTSize += (axis->specializationConstants.raderContainer[i].prime - 1) * 2 * sizeof(double); - } - } - } - if 
(axis->bufferLUTSize == 0) axis->bufferLUTSize = sizeof(double); - double* tempLUT = (double*)malloc(axis->bufferLUTSize); - if (!tempLUT) { - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - uint64_t localStageSize = axis->specializationConstants.stageRadix[0]; - uint64_t localStageSum = 0; - for (uint64_t i = 1; i < axis->specializationConstants.numStages; i++) { - if ((axis->specializationConstants.stageRadix[i] & (axis->specializationConstants.stageRadix[i] - 1)) == 0) { - for (uint64_t k = 0; k < log2(axis->specializationConstants.stageRadix[i]); k++) { - for (uint64_t j = 0; j < localStageSize; j++) { - tempLUT[2 * (j + localStageSum)] = cos(j * double_PI / localStageSize / pow(2, k)); - tempLUT[2 * (j + localStageSum) + 1] = sin(j * double_PI / localStageSize / pow(2, k)); - } - localStageSum += localStageSize; - } - } - else if (axis->specializationConstants.rader_generator[i] > 0) { - for (uint64_t j = 0; j < localStageSize; j++) { - for (int64_t k = (axis->specializationConstants.stageRadix[i] - 1); k >= 0; k--) { - tempLUT[2 * (k + localStageSum)] = cos(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); - tempLUT[2 * (k + localStageSum) + 1] = sin(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); - } - localStageSum += (axis->specializationConstants.stageRadix[i]); - } - } - else { - for (uint64_t k = (axis->specializationConstants.stageRadix[i] - 1); k > 0; k--) { - for (uint64_t j = 0; j < localStageSize; j++) { - tempLUT[2 * (j + localStageSum)] = cos(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); - tempLUT[2 * (j + localStageSum) + 1] = sin(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); - } - localStageSum += localStageSize; - } - } - localStageSize *= axis->specializationConstants.stageRadix[i]; - } - - - if (axis->specializationConstants.useRader) { - for (uint64_t i = 0; i < 
axis->specializationConstants.numRaderPrimes; i++) { - if (axis->specializationConstants.raderContainer[i].type) { - if (!axis->specializationConstants.inline_rader_kernel) { - for (uint64_t j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later - uint64_t g_pow = 1; - for (uint64_t t = 0; t < axis->specializationConstants.raderContainer[i].prime - 1 - j; t++) { - g_pow = (g_pow * axis->specializationConstants.raderContainer[i].generator) % axis->specializationConstants.raderContainer[i].prime; - } - tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT)] = cos(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime); - tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT) + 1] = (-sin(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime)); - } - } - } - else { - localStageSize = axis->specializationConstants.raderContainer[i].stageRadix[0]; - localStageSum = 0; - for (uint64_t l = 1; l < axis->specializationConstants.raderContainer[i].numStages; l++) { - if ((axis->specializationConstants.raderContainer[i].stageRadix[l] & (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1)) == 0) { - for (uint64_t k = 0; k < log2(axis->specializationConstants.raderContainer[i].stageRadix[l]); k++) { - for (uint64_t j = 0; j < localStageSize; j++) { - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT)] = cos(j * double_PI / localStageSize / pow(2, k)); - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT) + 1] = sin(j * double_PI / localStageSize / pow(2, k)); - } - localStageSum += localStageSize; - } - } - else { - for (uint64_t k = (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1); k > 0; k--) { - for (uint64_t j = 0; j < localStageSize; j++) { - tempLUT[2 * (j + localStageSum + 
axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT)] = cos(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT) + 1] = sin(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); - } - localStageSum += localStageSize; - } - } - localStageSize *= axis->specializationConstants.raderContainer[i].stageRadix[l]; - } - - localStageSize = axis->specializationConstants.raderContainer[i].stageRadix[axis->specializationConstants.raderContainer[i].numStages - 1]; - localStageSum = 0; - for (int64_t l = (int64_t)axis->specializationConstants.raderContainer[i].numStages - 2; l >= 0; l--) { - if ((axis->specializationConstants.raderContainer[i].stageRadix[l] & (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1)) == 0) { - for (uint64_t k = 0; k < log2(axis->specializationConstants.raderContainer[i].stageRadix[l]); k++) { - for (uint64_t j = 0; j < localStageSize; j++) { - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT)] = cos(j * double_PI / localStageSize / pow(2, k)); - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT) + 1] = sin(j * double_PI / localStageSize / pow(2, k)); - } - localStageSum += localStageSize; - } - } - else { - for (uint64_t k = (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1); k > 0; k--) { - for (uint64_t j = 0; j < localStageSize; j++) { - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT)] = cos(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT) + 1] = sin(j * 2.0 * k / 
axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); - } - localStageSum += localStageSize; - } - } - localStageSize *= axis->specializationConstants.raderContainer[i].stageRadix[l]; - } - - if (!axis->specializationConstants.inline_rader_kernel) { - double* raderFFTkernel = (double*)axis->specializationConstants.raderContainer[i].raderFFTkernel; - for (uint64_t j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later - tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT)] = raderFFTkernel[2 * j] / (axis->specializationConstants.raderContainer[i].prime - 1); - tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT) + 1] = raderFFTkernel[2 * j + 1] / (axis->specializationConstants.raderContainer[i].prime - 1); - } - } - } - } - } - if (axis_upload_id > 0) { - for (uint64_t i = 0; i < axis->specializationConstants.stageStartSize; i++) { - for (uint64_t j = 0; j < axis->specializationConstants.fftDim; j++) { - double angle = 2 * double_PI * ((i * j) / (double)(axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim)); - tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize)] = cos(angle); - tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize) + 1] = sin(angle); - } - } - } - if ((app->configuration.performDCT == 2) || (app->configuration.performDCT == 3)) { - for (uint64_t j = 0; j < app->configuration.size[axis_id] / 2 + 2; j++) { - double angle = (double_PI / 2.0 / (double)(app->configuration.size[axis_id])) * j; - tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j] = cos(angle); - tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j + 1] = sin(angle); - } - } - if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) { - for (uint64_t j = 0; j < app->configuration.size[axis_id] / 4 + 2; 
j++) { - double angle = (double_PI / 2.0 / (double)(app->configuration.size[axis_id] / 2)) * j; - tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j] = cos(angle); - tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j + 1] = sin(angle); - } - for (uint64_t j = 0; j < app->configuration.size[axis_id] / 2; j++) { - double angle = (-double_PI / 8.0 / (double)(app->configuration.size[axis_id] / 2)) * (2 * j + 1); - tempLUT[2 * axis->specializationConstants.startDCT4LUT + 2 * j] = cos(angle); - tempLUT[2 * axis->specializationConstants.startDCT4LUT + 2 * j + 1] = sin(angle); - } - } - axis->referenceLUT = 0; - if (reverseBluesteinMultiUpload == 1) { - axis->bufferLUT = FFTPlan->axes[axis_id][axis_upload_id].bufferLUT; -#if(VKFFT_BACKEND==0) - axis->bufferLUTDeviceMemory = FFTPlan->axes[axis_id][axis_upload_id].bufferLUTDeviceMemory; -#endif - axis->bufferLUTSize = FFTPlan->axes[axis_id][axis_upload_id].bufferLUTSize; - axis->referenceLUT = 1; - } - else { - if ((!inverse) && (!app->configuration.makeForwardPlanOnly)) { - axis->bufferLUT = app->localFFTPlan_inverse->axes[axis_id][axis_upload_id].bufferLUT; -#if(VKFFT_BACKEND==0) - axis->bufferLUTDeviceMemory = app->localFFTPlan_inverse->axes[axis_id][axis_upload_id].bufferLUTDeviceMemory; -#endif - axis->bufferLUTSize = app->localFFTPlan_inverse->axes[axis_id][axis_upload_id].bufferLUTSize; - axis->referenceLUT = 1; - } - else { - uint64_t checkRadixOrder = 1; - for (uint64_t i = 0; i < axis->specializationConstants.numStages; i++) - if (FFTPlan->axes[0][0].specializationConstants.stageRadix[i] != axis->specializationConstants.stageRadix[i]) checkRadixOrder = 0; - if (checkRadixOrder && ((axis_id == 1) || (axis_id == 2)) && (!((!axis->specializationConstants.reorderFourStep) && (FFTPlan->numAxisUploads[axis_id] > 1))) && ((axis->specializationConstants.fft_dim_full == FFTPlan->axes[0][0].specializationConstants.fft_dim_full) && (FFTPlan->numAxisUploads[axis_id] == 1) && 
(axis->specializationConstants.fft_dim_full < maxSingleSizeStrided / axis->specializationConstants.registerBoost)) && ((!app->configuration.performDCT) || (app->configuration.size[axis_id] == app->configuration.size[0]))) { - axis->bufferLUT = FFTPlan->axes[0][axis_upload_id].bufferLUT; -#if(VKFFT_BACKEND==0) - axis->bufferLUTDeviceMemory = FFTPlan->axes[0][axis_upload_id].bufferLUTDeviceMemory; -#endif - axis->bufferLUTSize = FFTPlan->axes[0][axis_upload_id].bufferLUTSize; - axis->referenceLUT = 1; - } - else { - checkRadixOrder = 1; - for (uint64_t i = 0; i < axis->specializationConstants.numStages; i++) - if (FFTPlan->axes[1][0].specializationConstants.stageRadix[i] != axis->specializationConstants.stageRadix[i]) checkRadixOrder = 0; - - if (checkRadixOrder && (axis_id == 2) && (axis->specializationConstants.fft_dim_full == FFTPlan->axes[1][0].specializationConstants.fft_dim_full) && ((!app->configuration.performDCT) || (app->configuration.size[2] == app->configuration.size[1]))) { - axis->bufferLUT = FFTPlan->axes[1][axis_upload_id].bufferLUT; -#if(VKFFT_BACKEND==0) - axis->bufferLUTDeviceMemory = FFTPlan->axes[1][axis_upload_id].bufferLUTDeviceMemory; -#endif - axis->bufferLUTSize = FFTPlan->axes[1][axis_upload_id].bufferLUTSize; - axis->referenceLUT = 1; - } - else { -#if(VKFFT_BACKEND==0) - resFFT = allocateFFTBuffer(app, &axis->bufferLUT, &axis->bufferLUTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, axis->bufferLUTSize); - if (resFFT != VKFFT_SUCCESS) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return resFFT; - } - resFFT = transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); - if (resFFT != VKFFT_SUCCESS) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return resFFT; - } -#elif(VKFFT_BACKEND==1) - res = cudaMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize); - if (res != cudaSuccess) { - deleteVkFFT(app); - 
free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } - res = cudaMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, cudaMemcpyHostToDevice); - if (res != cudaSuccess) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } -#elif(VKFFT_BACKEND==2) - res = hipMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize); - if (res != hipSuccess) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } - res = hipMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, hipMemcpyHostToDevice); - if (res != hipSuccess) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } -#elif(VKFFT_BACKEND==3) - axis->bufferLUT = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, axis->bufferLUTSize, tempLUT, &res); - if (res != CL_SUCCESS) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } -#elif(VKFFT_BACKEND==4) - ze_device_mem_alloc_desc_t device_desc = {}; - device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; - res = zeMemAllocDevice(app->configuration.context[0], &device_desc, axis->bufferLUTSize, sizeof(float), app->configuration.device[0], &axis->bufferLUT); - if (res != ZE_RESULT_SUCCESS) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } - ze_command_queue_desc_t commandQueueCopyDesc = { - ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, - 0, - app->configuration.commandQueueID, - 0, // index - 0, // flags - ZE_COMMAND_QUEUE_MODE_DEFAULT, - ZE_COMMAND_QUEUE_PRIORITY_NORMAL - }; - ze_command_list_handle_t copyCommandList; - res = zeCommandListCreateImmediate(app->configuration.context[0], app->configuration.device[0], &commandQueueCopyDesc, ©CommandList); - if (res != ZE_RESULT_SUCCESS) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; - } - res = 
zeCommandListAppendMemoryCopy(copyCommandList, axis->bufferLUT, tempLUT, axis->bufferLUTSize, 0, 0, 0); - if (res != ZE_RESULT_SUCCESS) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_COPY; - } - res = zeCommandQueueSynchronize(app->configuration.commandQueue[0], UINT32_MAX); - if (res != ZE_RESULT_SUCCESS) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; - } -#endif - } - } - } - } - free(tempLUT); - tempLUT = 0; - } - else { - if (axis_upload_id > 0) { - if ((app->configuration.performDCT == 2) || (app->configuration.performDCT == 3)) { - axis->specializationConstants.startDCT3LUT = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim); - axis->bufferLUTSize = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim + (app->configuration.size[axis_id] / 2 + 2)) * 2 * sizeof(float); - } - else { - if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) { - axis->specializationConstants.startDCT3LUT = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim); - axis->specializationConstants.startDCT4LUT = (axis->specializationConstants.startDCT3LUT + (axis->specializationConstants.fftDim / 4 + 2)); - axis->bufferLUTSize = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim + (app->configuration.size[axis_id] / 4 + 2) + app->configuration.size[axis_id] / 2) * 2 * sizeof(float); - } - else - axis->bufferLUTSize = (maxStageSum + axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim) * 2 * sizeof(float); - } - } - else { - if ((app->configuration.performDCT == 2) || (app->configuration.performDCT == 3)) { - axis->specializationConstants.startDCT3LUT = (maxStageSum); - axis->bufferLUTSize = (maxStageSum + (app->configuration.size[axis_id] / 2 + 2)) * 
2 * sizeof(float); - } - else { - if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) { - axis->specializationConstants.startDCT3LUT = (maxStageSum); - axis->specializationConstants.startDCT4LUT = (axis->specializationConstants.startDCT3LUT + (app->configuration.size[axis_id] / 4 + 2)); - axis->bufferLUTSize = (maxStageSum + (app->configuration.size[axis_id] / 4 + 2) + app->configuration.size[axis_id] / 2) * 2 * sizeof(float); - } - else - axis->bufferLUTSize = (maxStageSum) * 2 * sizeof(float); - } - } - if (axis->specializationConstants.useRader) { - for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { - if (!axis->specializationConstants.inline_rader_kernel) { - axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT = axis->bufferLUTSize / (2 * sizeof(float)); - axis->bufferLUTSize += (axis->specializationConstants.raderContainer[i].prime - 1) * 2 * sizeof(float); - } - } - } - if (axis->bufferLUTSize == 0) axis->bufferLUTSize = sizeof(float); - float* tempLUT = (float*)malloc(axis->bufferLUTSize); - if (!tempLUT) { - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - uint64_t localStageSize = axis->specializationConstants.stageRadix[0]; - uint64_t localStageSum = 0; - for (uint64_t i = 1; i < axis->specializationConstants.numStages; i++) { - if ((axis->specializationConstants.stageRadix[i] & (axis->specializationConstants.stageRadix[i] - 1)) == 0) { - for (uint64_t k = 0; k < log2(axis->specializationConstants.stageRadix[i]); k++) { - for (uint64_t j = 0; j < localStageSize; j++) { - tempLUT[2 * (j + localStageSum)] = (float)cos(j * double_PI / localStageSize / pow(2, k)); - tempLUT[2 * (j + localStageSum) + 1] = (float)sin(j * double_PI / localStageSize / pow(2, k)); - } - localStageSum += localStageSize; - } - } - else if (axis->specializationConstants.rader_generator[i] > 0) { - for (uint64_t j = 0; j < localStageSize; j++) { - for (int64_t k = 
(axis->specializationConstants.stageRadix[i] - 1); k >= 0; k--) { - tempLUT[2 * (k + localStageSum)] = (float)cos(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); - tempLUT[2 * (k + localStageSum) + 1] = (float)sin(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); - } - localStageSum += (axis->specializationConstants.stageRadix[i]); - } - } - else { - for (uint64_t k = (axis->specializationConstants.stageRadix[i] - 1); k > 0; k--) { - for (uint64_t j = 0; j < localStageSize; j++) { - tempLUT[2 * (j + localStageSum)] = (float)cos(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); - tempLUT[2 * (j + localStageSum) + 1] = (float)sin(j * 2.0 * k / axis->specializationConstants.stageRadix[i] * double_PI / localStageSize); - } - localStageSum += localStageSize; - } - } - localStageSize *= axis->specializationConstants.stageRadix[i]; - } - - if (axis->specializationConstants.useRader) { - for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { - if (axis->specializationConstants.raderContainer[i].type) { - if (!axis->specializationConstants.inline_rader_kernel) { - for (uint64_t j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later - uint64_t g_pow = 1; - for (uint64_t t = 0; t < axis->specializationConstants.raderContainer[i].prime - 1 - j; t++) { - g_pow = (g_pow * axis->specializationConstants.raderContainer[i].generator) % axis->specializationConstants.raderContainer[i].prime; - } - tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT)] = (float)(cos(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime)); - tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT) + 1] = (float)(-sin(2.0 * g_pow * double_PI / axis->specializationConstants.raderContainer[i].prime)); - } - } - } - else { - localStageSize = 
axis->specializationConstants.raderContainer[i].stageRadix[0]; - localStageSum = 0; - for (uint64_t l = 1; l < axis->specializationConstants.raderContainer[i].numStages; l++) { - if ((axis->specializationConstants.raderContainer[i].stageRadix[l] & (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1)) == 0) { - for (uint64_t k = 0; k < log2(axis->specializationConstants.raderContainer[i].stageRadix[l]); k++) { - for (uint64_t j = 0; j < localStageSize; j++) { - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT)] = (float)cos(j * double_PI / localStageSize / pow(2, k)); - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT) + 1] = (float)sin(j * double_PI / localStageSize / pow(2, k)); - } - localStageSum += localStageSize; - } - } - else { - for (uint64_t k = (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1); k > 0; k--) { - for (uint64_t j = 0; j < localStageSize; j++) { - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT)] = (float)cos(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUT) + 1] = (float)sin(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); - } - localStageSum += localStageSize; - } - } - localStageSize *= axis->specializationConstants.raderContainer[i].stageRadix[l]; - } - localStageSize = axis->specializationConstants.raderContainer[i].stageRadix[axis->specializationConstants.raderContainer[i].numStages - 1]; - localStageSum = 0; - for (int64_t l = (int64_t)axis->specializationConstants.raderContainer[i].numStages - 2; l >= 0; l--) { - if ((axis->specializationConstants.raderContainer[i].stageRadix[l] & 
(axis->specializationConstants.raderContainer[i].stageRadix[l] - 1)) == 0) { - for (uint64_t k = 0; k < log2(axis->specializationConstants.raderContainer[i].stageRadix[l]); k++) { - for (uint64_t j = 0; j < localStageSize; j++) { - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT)] = (float)cos(j * double_PI / localStageSize / pow(2, k)); - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT) + 1] = (float)sin(j * double_PI / localStageSize / pow(2, k)); - } - localStageSum += localStageSize; - } - } - else { - for (uint64_t k = (axis->specializationConstants.raderContainer[i].stageRadix[l] - 1); k > 0; k--) { - for (uint64_t j = 0; j < localStageSize; j++) { - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT)] = (float)cos(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); - tempLUT[2 * (j + localStageSum + axis->specializationConstants.raderContainer[i].RaderRadixOffsetLUTiFFT) + 1] = (float)sin(j * 2.0 * k / axis->specializationConstants.raderContainer[i].stageRadix[l] * double_PI / localStageSize); - } - localStageSum += localStageSize; - } - } - localStageSize *= axis->specializationConstants.raderContainer[i].stageRadix[l]; - } - if (!axis->specializationConstants.inline_rader_kernel) { - float* raderFFTkernel = (float*)axis->specializationConstants.raderContainer[i].raderFFTkernel; - for (uint64_t j = 0; j < (axis->specializationConstants.raderContainer[i].prime - 1); j++) {//fix later - tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT)] = (float)(raderFFTkernel[2 * j] / (axis->specializationConstants.raderContainer[i].prime - 1)); - tempLUT[2 * (j + axis->specializationConstants.raderContainer[i].RaderKernelOffsetLUT) + 1] = (float)(raderFFTkernel[2 * j + 1] / 
(axis->specializationConstants.raderContainer[i].prime - 1)); - } - } - } - } - } - - if (axis_upload_id > 0) { - for (uint64_t i = 0; i < axis->specializationConstants.stageStartSize; i++) { - for (uint64_t j = 0; j < axis->specializationConstants.fftDim; j++) { - double angle = 2 * double_PI * ((i * j) / (double)(axis->specializationConstants.stageStartSize * axis->specializationConstants.fftDim)); - tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize)] = (float)cos(angle); - tempLUT[maxStageSum * 2 + 2 * (i + j * axis->specializationConstants.stageStartSize) + 1] = (float)sin(angle); - } - } - } - if ((app->configuration.performDCT == 2) || (app->configuration.performDCT == 3)) { - for (uint64_t j = 0; j < app->configuration.size[axis_id] / 2 + 2; j++) { - double angle = (double_PI / 2.0 / (double)(app->configuration.size[axis_id])) * j; - tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j] = (float)cos(angle); - tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j + 1] = (float)sin(angle); - } - } - if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) { - for (uint64_t j = 0; j < app->configuration.size[axis_id] / 4 + 2; j++) { - double angle = (double_PI / 2.0 / (double)(app->configuration.size[axis_id] / 2)) * j; - tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j] = (float)cos(angle); - tempLUT[2 * axis->specializationConstants.startDCT3LUT + 2 * j + 1] = (float)sin(angle); - } - for (uint64_t j = 0; j < app->configuration.size[axis_id] / 2; j++) { - double angle = (-double_PI / 8.0 / (double)(app->configuration.size[axis_id] / 2)) * (2 * j + 1); - tempLUT[2 * axis->specializationConstants.startDCT4LUT + 2 * j] = (float)cos(angle); - tempLUT[2 * axis->specializationConstants.startDCT4LUT + 2 * j + 1] = (float)sin(angle); - } - } - axis->referenceLUT = 0; - - if (reverseBluesteinMultiUpload == 1) { - axis->bufferLUT = 
FFTPlan->axes[axis_id][axis_upload_id].bufferLUT; -#if(VKFFT_BACKEND==0) - axis->bufferLUTDeviceMemory = FFTPlan->axes[axis_id][axis_upload_id].bufferLUTDeviceMemory; -#endif - axis->bufferLUTSize = FFTPlan->axes[axis_id][axis_upload_id].bufferLUTSize; - axis->referenceLUT = 1; - } - else { - if ((!inverse) && (!app->configuration.makeForwardPlanOnly)) { - axis->bufferLUT = app->localFFTPlan_inverse->axes[axis_id][axis_upload_id].bufferLUT; -#if(VKFFT_BACKEND==0) - axis->bufferLUTDeviceMemory = app->localFFTPlan_inverse->axes[axis_id][axis_upload_id].bufferLUTDeviceMemory; -#endif - axis->bufferLUTSize = app->localFFTPlan_inverse->axes[axis_id][axis_upload_id].bufferLUTSize; - axis->referenceLUT = 1; - } - else { - uint64_t checkRadixOrder = 1; - for (uint64_t i = 0; i < axis->specializationConstants.numStages; i++) - if (FFTPlan->axes[0][0].specializationConstants.stageRadix[i] != axis->specializationConstants.stageRadix[i]) checkRadixOrder = 0; - if (checkRadixOrder && ((axis_id == 1) || (axis_id == 2)) && (!((!axis->specializationConstants.reorderFourStep) && (FFTPlan->numAxisUploads[axis_id] > 1))) && ((axis->specializationConstants.fft_dim_full == FFTPlan->axes[0][0].specializationConstants.fft_dim_full) && (FFTPlan->numAxisUploads[axis_id] == 1) && (axis->specializationConstants.fft_dim_full < maxSingleSizeStrided / axis->specializationConstants.registerBoost)) && ((!app->configuration.performDCT) || (app->configuration.size[axis_id] == app->configuration.size[0]))) { - axis->bufferLUT = FFTPlan->axes[0][axis_upload_id].bufferLUT; -#if(VKFFT_BACKEND==0) - axis->bufferLUTDeviceMemory = FFTPlan->axes[0][axis_upload_id].bufferLUTDeviceMemory; -#endif - axis->bufferLUTSize = FFTPlan->axes[0][axis_upload_id].bufferLUTSize; - axis->referenceLUT = 1; - } - else { - checkRadixOrder = 1; - for (uint64_t i = 0; i < axis->specializationConstants.numStages; i++) - if (FFTPlan->axes[1][0].specializationConstants.stageRadix[i] != 
axis->specializationConstants.stageRadix[i]) checkRadixOrder = 0; - if (checkRadixOrder && (axis_id == 2) && (axis->specializationConstants.fft_dim_full == FFTPlan->axes[1][0].specializationConstants.fft_dim_full) && ((!app->configuration.performDCT) || (app->configuration.size[2] == app->configuration.size[1]))) { - axis->bufferLUT = FFTPlan->axes[1][axis_upload_id].bufferLUT; -#if(VKFFT_BACKEND==0) - axis->bufferLUTDeviceMemory = FFTPlan->axes[1][axis_upload_id].bufferLUTDeviceMemory; -#endif - axis->bufferLUTSize = FFTPlan->axes[1][axis_upload_id].bufferLUTSize; - axis->referenceLUT = 1; - } - else { -#if(VKFFT_BACKEND==0) - resFFT = allocateFFTBuffer(app, &axis->bufferLUT, &axis->bufferLUTDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, axis->bufferLUTSize); - if (resFFT != VKFFT_SUCCESS) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return resFFT; - } - resFFT = transferDataFromCPU(app, tempLUT, &axis->bufferLUT, axis->bufferLUTSize); - if (resFFT != VKFFT_SUCCESS) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return resFFT; - } -#elif(VKFFT_BACKEND==1) - res = cudaMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize); - if (res != cudaSuccess) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } - res = cudaMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, cudaMemcpyHostToDevice); - if (res != cudaSuccess) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } -#elif(VKFFT_BACKEND==2) - res = hipMalloc((void**)&axis->bufferLUT, axis->bufferLUTSize); - if (res != hipSuccess) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } - res = hipMemcpy(axis->bufferLUT, tempLUT, axis->bufferLUTSize, hipMemcpyHostToDevice); - if (res != hipSuccess) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return 
VKFFT_ERROR_FAILED_TO_ALLOCATE; - } -#elif(VKFFT_BACKEND==3) - axis->bufferLUT = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, axis->bufferLUTSize, tempLUT, &res); - if (res != CL_SUCCESS) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } -#elif(VKFFT_BACKEND==4) - ze_device_mem_alloc_desc_t device_desc = {}; - device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; - res = zeMemAllocDevice(app->configuration.context[0], &device_desc, axis->bufferLUTSize, sizeof(float), app->configuration.device[0], &axis->bufferLUT); - if (res != ZE_RESULT_SUCCESS) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } - ze_command_queue_desc_t commandQueueCopyDesc = { - ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, - 0, - app->configuration.commandQueueID, - 0, // index - 0, // flags - ZE_COMMAND_QUEUE_MODE_DEFAULT, - ZE_COMMAND_QUEUE_PRIORITY_NORMAL - }; - ze_command_list_handle_t copyCommandList; - res = zeCommandListCreateImmediate(app->configuration.context[0], app->configuration.device[0], &commandQueueCopyDesc, ©CommandList); - if (res != ZE_RESULT_SUCCESS) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; - } - res = zeCommandListAppendMemoryCopy(copyCommandList, axis->bufferLUT, tempLUT, axis->bufferLUTSize, 0, 0, 0); - if (res != ZE_RESULT_SUCCESS) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_COPY; - } - res = zeCommandQueueSynchronize(app->configuration.commandQueue[0], UINT32_MAX); - if (res != ZE_RESULT_SUCCESS) { - deleteVkFFT(app); - free(tempLUT); - tempLUT = 0; - return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; - } -#endif - } - } - } - } - free(tempLUT); - tempLUT = 0; - } - } - if (axis->specializationConstants.useRaderMult) axis->specializationConstants.additionalRaderSharedSize = (axis->specializationConstants.useRaderMult - 1); - - //allocate 
RaderUintLUT - if (axis->specializationConstants.raderUintLUT) { - if (app->bufferRaderUintLUT[axis_id][axis_upload_id] == 0) { - app->bufferRaderUintLUTSize[axis_id][axis_upload_id] = 0; - for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { - app->bufferRaderUintLUTSize[axis_id][axis_upload_id] += axis->specializationConstants.raderContainer[i].prime * sizeof(uint32_t); - } - uint32_t* tempRaderUintLUT = (uint32_t*)malloc(app->bufferRaderUintLUTSize[axis_id][axis_upload_id]); - if (!tempRaderUintLUT) { - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - uint64_t current_offset = 0; - for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { - if (axis->specializationConstants.raderContainer[i].prime > 0) { - axis->specializationConstants.raderContainer[i].raderUintLUToffset = current_offset; - uint64_t g_pow = 1; - tempRaderUintLUT[current_offset] = 1; - current_offset++; - for (uint64_t t = 0; t < axis->specializationConstants.raderContainer[i].prime - 1; t++) { - g_pow = (g_pow * axis->specializationConstants.raderContainer[i].generator) % axis->specializationConstants.raderContainer[i].prime; - tempRaderUintLUT[current_offset] = (uint32_t)g_pow; - current_offset++; - } - } - } - -#if(VKFFT_BACKEND==0) - resFFT = allocateFFTBuffer(app, &app->bufferRaderUintLUT[axis_id][axis_upload_id], &app->bufferRaderUintLUTDeviceMemory[axis_id][axis_upload_id], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, app->bufferRaderUintLUTSize[axis_id][axis_upload_id]); - if (resFFT != VKFFT_SUCCESS) { - deleteVkFFT(app); - free(tempRaderUintLUT); - tempRaderUintLUT = 0; - return resFFT; - } - resFFT = transferDataFromCPU(app, tempRaderUintLUT, &app->bufferRaderUintLUT[axis_id][axis_upload_id], app->bufferRaderUintLUTSize[axis_id][axis_upload_id]); - if (resFFT != VKFFT_SUCCESS) { - deleteVkFFT(app); - free(tempRaderUintLUT); - 
tempRaderUintLUT = 0; - return resFFT; - } -#elif(VKFFT_BACKEND==1) - res = cudaMalloc((void**)&app->bufferRaderUintLUT[axis_id][axis_upload_id], app->bufferRaderUintLUTSize[axis_id][axis_upload_id]); - if (res != cudaSuccess) { - deleteVkFFT(app); - free(tempRaderUintLUT); - tempRaderUintLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } - res = cudaMemcpy(app->bufferRaderUintLUT[axis_id][axis_upload_id], tempRaderUintLUT, app->bufferRaderUintLUTSize[axis_id][axis_upload_id], cudaMemcpyHostToDevice); - if (res != cudaSuccess) { - deleteVkFFT(app); - free(tempRaderUintLUT); - tempRaderUintLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } -#elif(VKFFT_BACKEND==2) - res = hipMalloc((void**)&app->bufferRaderUintLUT[axis_id][axis_upload_id], app->bufferRaderUintLUTSize[axis_id][axis_upload_id]); - if (res != hipSuccess) { - deleteVkFFT(app); - free(tempRaderUintLUT); - tempRaderUintLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } - res = hipMemcpy(app->bufferRaderUintLUT[axis_id][axis_upload_id], tempRaderUintLUT, app->bufferRaderUintLUTSize[axis_id][axis_upload_id], hipMemcpyHostToDevice); - if (res != hipSuccess) { - deleteVkFFT(app); - free(tempRaderUintLUT); - tempRaderUintLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } -#elif(VKFFT_BACKEND==3) - app->bufferRaderUintLUT[axis_id][axis_upload_id] = clCreateBuffer(app->configuration.context[0], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, app->bufferRaderUintLUTSize[axis_id][axis_upload_id], tempRaderUintLUT, &res); - if (res != CL_SUCCESS) { - deleteVkFFT(app); - free(tempRaderUintLUT); - tempRaderUintLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } -#elif(VKFFT_BACKEND==4) - ze_device_mem_alloc_desc_t device_desc = {}; - device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; - res = zeMemAllocDevice(app->configuration.context[0], &device_desc, app->bufferRaderUintLUTSize[axis_id][axis_upload_id], sizeof(uint32_t), app->configuration.device[0], 
&app->bufferRaderUintLUT[axis_id][axis_upload_id]); - if (res != ZE_RESULT_SUCCESS) { - deleteVkFFT(app); - free(tempRaderUintLUT); - tempRaderUintLUT = 0; - return VKFFT_ERROR_FAILED_TO_ALLOCATE; - } - ze_command_queue_desc_t commandQueueCopyDesc = { - ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, - 0, - app->configuration.commandQueueID, - 0, // index - 0, // flags - ZE_COMMAND_QUEUE_MODE_DEFAULT, - ZE_COMMAND_QUEUE_PRIORITY_NORMAL - }; - ze_command_list_handle_t copyCommandList; - res = zeCommandListCreateImmediate(app->configuration.context[0], app->configuration.device[0], &commandQueueCopyDesc, ©CommandList); - if (res != ZE_RESULT_SUCCESS) { - deleteVkFFT(app); - free(tempRaderUintLUT); - tempRaderUintLUT = 0; - return VKFFT_ERROR_FAILED_TO_CREATE_COMMAND_LIST; - } - res = zeCommandListAppendMemoryCopy(copyCommandList, app->bufferRaderUintLUT[axis_id][axis_upload_id], tempRaderUintLUT, app->bufferRaderUintLUTSize[axis_id][axis_upload_id], 0, 0, 0); - if (res != ZE_RESULT_SUCCESS) { - deleteVkFFT(app); - free(tempRaderUintLUT); - tempRaderUintLUT = 0; - return VKFFT_ERROR_FAILED_TO_COPY; - } - res = zeCommandQueueSynchronize(app->configuration.commandQueue[0], UINT32_MAX); - if (res != ZE_RESULT_SUCCESS) { - deleteVkFFT(app); - free(tempRaderUintLUT); - tempRaderUintLUT = 0; - return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; - } -#endif - free(tempRaderUintLUT); - tempRaderUintLUT = 0; - } - - axis->bufferRaderUintLUT = app->bufferRaderUintLUT[axis_id][axis_upload_id]; -#if(VKFFT_BACKEND==0) - axis->bufferRaderUintLUTDeviceMemory = app->bufferRaderUintLUTDeviceMemory[axis_id][axis_upload_id]; -#endif - axis->bufferRaderUintLUTSize = app->bufferRaderUintLUTSize[axis_id][axis_upload_id]; - } - //configure strides - - uint64_t* axisStride = axis->specializationConstants.inputStride; - uint64_t* usedStride = app->configuration.bufferStride; - if ((!inverse) && (axis_id == app->firstAxis) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && 
(app->configuration.isInputFormatted)) usedStride = app->configuration.inputBufferStride; - if ((inverse) && (axis_id == app->lastAxis) && ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && ((app->useBluesteinFFT[axis_id] && (reverseBluesteinMultiUpload == 0)) || (!app->useBluesteinFFT[axis_id])) && (!app->configuration.performConvolution)) && (app->configuration.isInputFormatted) && (!app->configuration.inverseReturnToInputBuffer)) usedStride = app->configuration.inputBufferStride; - - axisStride[0] = 1; - - if (axis_id == 0) { - axisStride[1] = usedStride[0]; - axisStride[2] = usedStride[1]; - } - if (axis_id == 1) - { - axisStride[1] = usedStride[0]; - axisStride[2] = usedStride[1]; - } - if (axis_id == 2) - { - axisStride[1] = usedStride[1]; - axisStride[2] = usedStride[0]; - } - - axisStride[3] = usedStride[2]; - - axisStride[4] = axisStride[3] * app->configuration.coordinateFeatures; - if (app->useBluesteinFFT[axis_id] && (FFTPlan->numAxisUploads[axis_id] > 1) && (!((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (reverseBluesteinMultiUpload == 0)))) { - axisStride[0] = 1; - - if (axis_id == 0) { - axisStride[1] = FFTPlan->actualFFTSizePerAxis[axis_id][0]; - axisStride[2] = FFTPlan->actualFFTSizePerAxis[axis_id][0] * FFTPlan->actualFFTSizePerAxis[axis_id][1]; - } - if (axis_id == 1) - { - axisStride[1] = FFTPlan->actualFFTSizePerAxis[axis_id][0]; - axisStride[2] = FFTPlan->actualFFTSizePerAxis[axis_id][0] * FFTPlan->actualFFTSizePerAxis[axis_id][1]; - } - if (axis_id == 2) - { - axisStride[1] = FFTPlan->actualFFTSizePerAxis[axis_id][0] * FFTPlan->actualFFTSizePerAxis[axis_id][1]; - axisStride[2] = FFTPlan->actualFFTSizePerAxis[axis_id][0]; - } - - axisStride[3] = axisStride[2] * FFTPlan->actualFFTSizePerAxis[axis_id][2]; - - axisStride[4] = axisStride[3] * app->configuration.coordinateFeatures; - } - if ((!inverse) && (axis_id == 0) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (reverseBluesteinMultiUpload == 0) && 
(axis->specializationConstants.performR2C || FFTPlan->multiUploadR2C) && (!(app->configuration.isInputFormatted))) { - axisStride[1] *= 2; - axisStride[2] *= 2; - axisStride[3] *= 2; - axisStride[4] *= 2; - } - if ((FFTPlan->multiUploadR2C) && (!inverse) && (axis_id == 0) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (reverseBluesteinMultiUpload == 0)) { - for (uint64_t i = 1; i < 5; i++) { - axisStride[i] /= 2; - } - } - axisStride = axis->specializationConstants.outputStride; - usedStride = app->configuration.bufferStride; - if ((!inverse) && (axis_id == app->lastAxis) && (axis_upload_id == 0) && (app->configuration.isOutputFormatted)) usedStride = app->configuration.outputBufferStride; - if ((inverse) && (axis_id == app->firstAxis) && (((axis_upload_id == 0) && (!app->configuration.performConvolution)) || ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && ((reverseBluesteinMultiUpload == 1) || (app->configuration.performConvolution)))) && ((app->configuration.isOutputFormatted))) usedStride = app->configuration.outputBufferStride; - if ((inverse) && (axis_id == app->firstAxis) && (((axis_upload_id == 0) && (app->configuration.isInputFormatted)) || ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (!axis->specializationConstants.reorderFourStep))) && (app->configuration.inverseReturnToInputBuffer)) usedStride = app->configuration.inputBufferStride; - - axisStride[0] = 1; - - if (axis_id == 0) { - axisStride[1] = usedStride[0]; - axisStride[2] = usedStride[1]; - } - if (axis_id == 1) - { - axisStride[1] = usedStride[0]; - axisStride[2] = usedStride[1]; - } - if (axis_id == 2) - { - axisStride[1] = usedStride[1]; - axisStride[2] = usedStride[0]; - } - - axisStride[3] = usedStride[2]; - - axisStride[4] = axisStride[3] * app->configuration.coordinateFeatures; - if (app->useBluesteinFFT[axis_id] && (FFTPlan->numAxisUploads[axis_id] > 1) && (!((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (reverseBluesteinMultiUpload 
== 1)))) { - axisStride[0] = 1; - - if (axis_id == 0) { - axisStride[1] = FFTPlan->actualFFTSizePerAxis[axis_id][0]; - axisStride[2] = FFTPlan->actualFFTSizePerAxis[axis_id][0] * FFTPlan->actualFFTSizePerAxis[axis_id][1]; - } - if (axis_id == 1) - { - axisStride[1] = FFTPlan->actualFFTSizePerAxis[axis_id][0]; - axisStride[2] = FFTPlan->actualFFTSizePerAxis[axis_id][0] * FFTPlan->actualFFTSizePerAxis[axis_id][1]; - } - if (axis_id == 2) - { - axisStride[1] = FFTPlan->actualFFTSizePerAxis[axis_id][0] * FFTPlan->actualFFTSizePerAxis[axis_id][1]; - axisStride[2] = FFTPlan->actualFFTSizePerAxis[axis_id][0]; - } - - axisStride[3] = axisStride[2] * FFTPlan->actualFFTSizePerAxis[axis_id][2]; - - axisStride[4] = axisStride[3] * app->configuration.coordinateFeatures; - } - if ((inverse) && (axis_id == 0) && (((!app->useBluesteinFFT[axis_id]) && (axis_upload_id == 0)) || ((app->useBluesteinFFT[axis_id]) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && ((reverseBluesteinMultiUpload == 1) || (FFTPlan->numAxisUploads[axis_id] == 1)))) && (axis->specializationConstants.performR2C || FFTPlan->multiUploadR2C) && (!((app->configuration.isInputFormatted) && (app->configuration.inverseReturnToInputBuffer))) && (!app->configuration.isOutputFormatted)) { - axisStride[1] *= 2; - axisStride[2] *= 2; - axisStride[3] *= 2; - axisStride[4] *= 2; - } - if ((FFTPlan->multiUploadR2C) && (inverse) && (axis_id == 0) && (((!app->useBluesteinFFT[axis_id]) && (axis_upload_id == 0)) || ((app->useBluesteinFFT[axis_id]) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && ((reverseBluesteinMultiUpload == 1) || (FFTPlan->numAxisUploads[axis_id] == 1))))) { - for (uint64_t i = 1; i < 5; i++) { - axisStride[i] /= 2; - } - } - - /*axis->specializationConstants.inputStride[3] = (app->configuration.coordinateFeatures == 1) ? 0 : axis->specializationConstants.inputStride[3]; - axis->specializationConstants.outputStride[3] = (app->configuration.coordinateFeatures == 1) ? 
0 : axis->specializationConstants.outputStride[3]; - - axis->specializationConstants.inputStride[4] = ((app->configuration.numberBatches == 1) && (app->configuration.numberKernels == 1)) ? 0 : axis->specializationConstants.inputStride[3] * app->configuration.coordinateFeatures; - axis->specializationConstants.outputStride[4] = ((app->configuration.numberBatches == 1) && (app->configuration.numberKernels == 1)) ? 0 : axis->specializationConstants.outputStride[3] * app->configuration.coordinateFeatures; - */ - - - uint64_t storageComplexSize; - if (app->configuration.doublePrecision) - storageComplexSize = (2 * sizeof(double)); - else - if (app->configuration.halfPrecision) - storageComplexSize = (2 * 2); - else - storageComplexSize = (2 * sizeof(float)); - - uint64_t initPageSize = -1; - uint64_t locBufferNum = 1; - uint64_t locBufferSize = -1; - /*for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { - initPageSize += app->configuration.bufferSize[i]; - }*/ - /*if (app->configuration.performConvolution) { - uint64_t initPageSizeKernel = 0; - for (uint64_t i = 0; i < app->configuration.kernelNum; i++) { - initPageSizeKernel += app->configuration.kernelSize[i]; - } - if (initPageSizeKernel > initPageSize) initPageSize = initPageSizeKernel; - } - if (axis_id == 0) { - if ((!((!axis->specializationConstants.reorderFourStep) && (axis_upload_id == 0))) && (axis->specializationConstants.inputStride[1] * storageComplexSize > app->configuration.devicePageSize * 1024) && (app->configuration.devicePageSize > 0)) { - initPageSize = app->configuration.localPageSize * 1024; - } - } - if (axis_id == 1) { - if ((app->configuration.bufferStride[1] * storageComplexSize > app->configuration.devicePageSize * 1024) && (app->configuration.devicePageSize > 0)) { - initPageSize = app->configuration.localPageSize * 1024; - } - } - if (axis_id == 2) { - if ((app->configuration.bufferStride[2] * storageComplexSize > app->configuration.devicePageSize * 1024) && 
(app->configuration.devicePageSize > 0)) { - initPageSize = app->configuration.localPageSize * 1024; - } - } - */ - if ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->configuration.isInputFormatted) && (!axis->specializationConstants.reverseBluesteinMultiUpload) && ( - ((axis_id == app->firstAxis) && (!inverse)) - || ((axis_id == app->lastAxis) && (inverse) && (!((axis_id == 0) && (axis->specializationConstants.performR2CmultiUpload))) && (!app->configuration.performConvolution) && (!app->configuration.inverseReturnToInputBuffer))) - ) { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; - locBufferNum = app->configuration.inputBufferNum; - if (app->configuration.inputBufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.inputBufferSize[0] / (double)storageComplexSize); - for (uint64_t i = 0; i < app->configuration.inputBufferNum; i++) { - totalSize += app->configuration.inputBufferSize[i]; - if (app->configuration.inputBufferSize[i] < locPageSize) locPageSize = app->configuration.inputBufferSize[i]; - } - } - axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); - axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 
1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); - //if (axis->specializationConstants.inputBufferBlockNum == 1) axis->specializationConstants.inputBufferBlockSize = totalSize / storageComplexSize; - - } - else { - if ((axis_upload_id == 0) && (app->configuration.numberKernels > 1) && (inverse) && (!app->configuration.performConvolution)) { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; - locBufferNum = app->configuration.outputBufferNum; - if (app->configuration.outputBufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.outputBufferSize[0] / (double)storageComplexSize); - for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) { - totalSize += app->configuration.outputBufferSize[i]; - if (app->configuration.outputBufferSize[i] < locPageSize) locPageSize = app->configuration.outputBufferSize[i]; - } - } - axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); - axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 
1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); - //if (axis->specializationConstants.inputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize; - - } - else { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; - if (((axis->specializationConstants.reorderFourStep == 1) || (app->useBluesteinFFT[axis_id])) && (FFTPlan->numAxisUploads[axis_id] > 1)) { - if (((axis->specializationConstants.reorderFourStep == 1) && (axis_upload_id > 0)) || (app->useBluesteinFFT[axis_id] && (reverseBluesteinMultiUpload == 0) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1))) { - locBufferNum = app->configuration.bufferNum; - if (app->configuration.bufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)storageComplexSize); - for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { - totalSize += app->configuration.bufferSize[i]; - if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i]; - - } - } - } - else { - locBufferNum = app->configuration.tempBufferNum; - if (app->configuration.tempBufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.tempBufferSize[0] / (double)storageComplexSize); - for (uint64_t i = 0; i < app->configuration.tempBufferNum; i++) { - totalSize += app->configuration.tempBufferSize[i]; - if (app->configuration.tempBufferSize[i] < locPageSize) locPageSize = app->configuration.tempBufferSize[i]; - - } - } - } - } - else { - locBufferNum = app->configuration.bufferNum; - if (app->configuration.bufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)storageComplexSize); - for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { - totalSize += app->configuration.bufferSize[i]; - if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i]; - - } - } - } - - 
axis->specializationConstants.inputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); - axis->specializationConstants.inputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.inputBufferBlockSize * storageComplexSize)); - //if (axis->specializationConstants.inputBufferBlockNum == 1) axis->specializationConstants.inputBufferBlockSize = totalSize / storageComplexSize; - - } - } - initPageSize = -1; - locBufferNum = 1; - locBufferSize = -1; - if (((axis_upload_id == 0) && (!app->useBluesteinFFT[axis_id]) && (app->configuration.isOutputFormatted && ( - ((axis_id == app->firstAxis) && (inverse)) - || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution)) - || ((axis_id == app->firstAxis) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1))) - )) || - ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (app->useBluesteinFFT[axis_id]) && (axis->specializationConstants.reverseBluesteinMultiUpload || (FFTPlan->numAxisUploads[axis_id] == 1)) && (app->configuration.isOutputFormatted && ( - ((axis_id == app->firstAxis) && (inverse)) - || ((axis_id == app->lastAxis) && (!inverse) && (!app->configuration.performConvolution))) - )) || - ((app->configuration.numberKernels > 1) && ( - (inverse) - || (axis_id == app->lastAxis))) - ) { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; - locBufferNum = app->configuration.outputBufferNum; - if (app->configuration.outputBufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.outputBufferSize[0] / (double)storageComplexSize); - for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) { - totalSize += app->configuration.outputBufferSize[i]; - if (app->configuration.outputBufferSize[i] < locPageSize) locPageSize = app->configuration.outputBufferSize[i]; - } - } - axis->specializationConstants.outputBufferBlockSize = 
(locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); - axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); - //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize; - - } - else { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; - if (((axis->specializationConstants.reorderFourStep == 1) || (app->useBluesteinFFT[axis_id])) && (FFTPlan->numAxisUploads[axis_id] > 1)) { - if (((axis->specializationConstants.reorderFourStep == 1) && (axis_upload_id == 1)) || (app->useBluesteinFFT[axis_id] && (!((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (axis->specializationConstants.reverseBluesteinMultiUpload == 1))))) { - locBufferNum = app->configuration.tempBufferNum; - if (app->configuration.tempBufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.tempBufferSize[0] / (double)storageComplexSize); - for (uint64_t i = 0; i < app->configuration.tempBufferNum; i++) { - totalSize += app->configuration.tempBufferSize[i]; - if (app->configuration.tempBufferSize[i] < locPageSize) locPageSize = app->configuration.tempBufferSize[i]; - } - } - } - else { - locBufferNum = app->configuration.bufferNum; - if (app->configuration.bufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)storageComplexSize); - for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { - totalSize += app->configuration.bufferSize[i]; - if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i]; - } - } - } - } - else { - locBufferNum = app->configuration.bufferNum; - if (app->configuration.bufferSize) { - locBufferSize = (uint64_t)ceil(app->configuration.bufferSize[0] / (double)storageComplexSize); - for (uint64_t i = 0; i < 
app->configuration.bufferNum; i++) { - totalSize += app->configuration.bufferSize[i]; - if (app->configuration.bufferSize[i] < locPageSize) locPageSize = app->configuration.bufferSize[i]; - } - } - } - axis->specializationConstants.outputBufferBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); - axis->specializationConstants.outputBufferBlockNum = (locBufferNum == 1) ? 1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.outputBufferBlockSize * storageComplexSize)); - //if (axis->specializationConstants.outputBufferBlockNum == 1) axis->specializationConstants.outputBufferBlockSize = totalSize / storageComplexSize; - - } - if (axis->specializationConstants.inputBufferBlockNum == 0) axis->specializationConstants.inputBufferBlockNum = 1; - if (axis->specializationConstants.outputBufferBlockNum == 0) axis->specializationConstants.outputBufferBlockNum = 1; - if (app->configuration.performConvolution) { - uint64_t totalSize = 0; - uint64_t locPageSize = initPageSize; - locBufferNum = app->configuration.kernelNum; - if (app->configuration.kernelSize) { - locBufferSize = (uint64_t)ceil(app->configuration.kernelSize[0] / (double)storageComplexSize); - for (uint64_t i = 0; i < app->configuration.kernelNum; i++) { - totalSize += app->configuration.kernelSize[i]; - if (app->configuration.kernelSize[i] < locPageSize) locPageSize = app->configuration.kernelSize[i]; - } - } - axis->specializationConstants.kernelBlockSize = (locBufferNum == 1) ? locBufferSize : (uint64_t)ceil(locPageSize / (double)storageComplexSize); - axis->specializationConstants.kernelBlockNum = (locBufferNum == 1) ? 
1 : (uint64_t)ceil(totalSize / (double)(axis->specializationConstants.kernelBlockSize * storageComplexSize)); - //if (axis->specializationConstants.kernelBlockNum == 1) axis->specializationConstants.inputBufferBlockSize = totalSize / storageComplexSize; - if (axis->specializationConstants.kernelBlockNum == 0) axis->specializationConstants.kernelBlockNum = 1; - } - else { - axis->specializationConstants.kernelBlockSize = 0; - axis->specializationConstants.kernelBlockNum = 0; - } - axis->numBindings = 2; - axis->specializationConstants.numBuffersBound[0] = axis->specializationConstants.inputBufferBlockNum; - axis->specializationConstants.numBuffersBound[1] = axis->specializationConstants.outputBufferBlockNum; - axis->specializationConstants.numBuffersBound[2] = 0; - axis->specializationConstants.numBuffersBound[3] = 0; -#if(VKFFT_BACKEND==0) - VkDescriptorPoolSize descriptorPoolSize = { VK_DESCRIPTOR_TYPE_STORAGE_BUFFER }; - descriptorPoolSize.descriptorCount = (uint32_t)(axis->specializationConstants.inputBufferBlockNum + axis->specializationConstants.outputBufferBlockNum); -#endif - axis->specializationConstants.convolutionBindingID = -1; - if ((axis_id == 0) && (axis_upload_id == 0) && (app->configuration.FFTdim == 1) && (app->configuration.performConvolution)) { - axis->specializationConstants.convolutionBindingID = axis->numBindings; - axis->specializationConstants.numBuffersBound[axis->numBindings] = axis->specializationConstants.kernelBlockNum; -#if(VKFFT_BACKEND==0) - descriptorPoolSize.descriptorCount += (uint32_t)axis->specializationConstants.kernelBlockNum; -#endif - axis->numBindings++; - } - if ((axis_id == 1) && (axis_upload_id == 0) && (app->configuration.FFTdim == 2) && (app->configuration.performConvolution)) { - axis->specializationConstants.convolutionBindingID = axis->numBindings; - axis->specializationConstants.numBuffersBound[axis->numBindings] = axis->specializationConstants.kernelBlockNum; -#if(VKFFT_BACKEND==0) - 
descriptorPoolSize.descriptorCount += (uint32_t)axis->specializationConstants.kernelBlockNum; -#endif - axis->numBindings++; - } - if ((axis_id == 2) && (axis_upload_id == 0) && (app->configuration.FFTdim == 3) && (app->configuration.performConvolution)) { - axis->specializationConstants.convolutionBindingID = axis->numBindings; - axis->specializationConstants.numBuffersBound[axis->numBindings] = axis->specializationConstants.kernelBlockNum; -#if(VKFFT_BACKEND==0) - descriptorPoolSize.descriptorCount += (uint32_t)axis->specializationConstants.kernelBlockNum; -#endif - axis->numBindings++; - } - if (app->configuration.useLUT) { - axis->specializationConstants.LUTBindingID = axis->numBindings; - axis->specializationConstants.numBuffersBound[axis->numBindings] = 1; -#if(VKFFT_BACKEND==0) - descriptorPoolSize.descriptorCount++; -#endif - axis->numBindings++; - } - if (axis->specializationConstants.raderUintLUT) { - axis->specializationConstants.RaderUintLUTBindingID = axis->numBindings; - axis->specializationConstants.numBuffersBound[axis->numBindings] = 1; -#if(VKFFT_BACKEND==0) - descriptorPoolSize.descriptorCount++; -#endif - axis->numBindings++; - } - if ((app->useBluesteinFFT[axis_id]) && (axis_upload_id == 0)) { - if (axis->specializationConstants.inverseBluestein) - axis->bufferBluesteinFFT = &app->bufferBluesteinIFFT[axis_id]; - else - axis->bufferBluesteinFFT = &app->bufferBluesteinFFT[axis_id]; - axis->specializationConstants.BluesteinConvolutionBindingID = axis->numBindings; - axis->specializationConstants.numBuffersBound[axis->numBindings] = 1; -#if(VKFFT_BACKEND==0) - descriptorPoolSize.descriptorCount++; -#endif - axis->numBindings++; - } - if ((app->useBluesteinFFT[axis_id]) && (axis_upload_id == (FFTPlan->numAxisUploads[axis_id] - 1))) { - axis->bufferBluestein = &app->bufferBluestein[axis_id]; - axis->specializationConstants.BluesteinMultiplicationBindingID = axis->numBindings; - axis->specializationConstants.numBuffersBound[axis->numBindings] = 1; 
-#if(VKFFT_BACKEND==0) - descriptorPoolSize.descriptorCount++; -#endif - axis->numBindings++; - } -#if(VKFFT_BACKEND==0) - VkDescriptorPoolCreateInfo descriptorPoolCreateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO }; - descriptorPoolCreateInfo.poolSizeCount = 1; - descriptorPoolCreateInfo.pPoolSizes = &descriptorPoolSize; - descriptorPoolCreateInfo.maxSets = 1; - res = vkCreateDescriptorPool(app->configuration.device[0], &descriptorPoolCreateInfo, 0, &axis->descriptorPool); - if (res != VK_SUCCESS) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_CREATE_DESCRIPTOR_POOL; - } - const VkDescriptorType descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; - VkDescriptorSetLayoutBinding* descriptorSetLayoutBindings; - descriptorSetLayoutBindings = (VkDescriptorSetLayoutBinding*)malloc(axis->numBindings * sizeof(VkDescriptorSetLayoutBinding)); - if (!descriptorSetLayoutBindings) { - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - for (uint64_t i = 0; i < axis->numBindings; ++i) { - descriptorSetLayoutBindings[i].binding = (uint32_t)i; - descriptorSetLayoutBindings[i].descriptorType = descriptorType; - descriptorSetLayoutBindings[i].descriptorCount = (uint32_t)axis->specializationConstants.numBuffersBound[i]; - descriptorSetLayoutBindings[i].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; - } - - VkDescriptorSetLayoutCreateInfo descriptorSetLayoutCreateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO }; - descriptorSetLayoutCreateInfo.bindingCount = (uint32_t)axis->numBindings; - descriptorSetLayoutCreateInfo.pBindings = descriptorSetLayoutBindings; - - res = vkCreateDescriptorSetLayout(app->configuration.device[0], &descriptorSetLayoutCreateInfo, 0, &axis->descriptorSetLayout); - if (res != VK_SUCCESS) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_CREATE_DESCRIPTOR_SET_LAYOUT; - } - free(descriptorSetLayoutBindings); - descriptorSetLayoutBindings = 0; - VkDescriptorSetAllocateInfo descriptorSetAllocateInfo = { 
VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO }; - descriptorSetAllocateInfo.descriptorPool = axis->descriptorPool; - descriptorSetAllocateInfo.descriptorSetCount = 1; - descriptorSetAllocateInfo.pSetLayouts = &axis->descriptorSetLayout; - res = vkAllocateDescriptorSets(app->configuration.device[0], &descriptorSetAllocateInfo, &axis->descriptorSet); - if (res != VK_SUCCESS) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_ALLOCATE_DESCRIPTOR_SETS; - } -#endif - if (app->configuration.specifyOffsetsAtLaunch) { - axis->specializationConstants.performPostCompilationInputOffset = 1; - axis->specializationConstants.performPostCompilationOutputOffset = 1; - if (app->configuration.performConvolution) - axis->specializationConstants.performPostCompilationKernelOffset = 1; - } - resFFT = VkFFTCheckUpdateBufferSet(app, axis, 1, 0); - if (resFFT != VKFFT_SUCCESS) { - deleteVkFFT(app); - return resFFT; - } - resFFT = VkFFTUpdateBufferSet(app, FFTPlan, axis, axis_id, axis_upload_id, inverse); - if (resFFT != VKFFT_SUCCESS) { - deleteVkFFT(app); - return resFFT; - } - { - uint64_t maxBatchCoalesced = app->configuration.coalescedMemory / complexSize; - axis->groupedBatch = maxBatchCoalesced; - /*if ((FFTPlan->actualFFTSizePerAxis[axis_id][0] < 4096) && (FFTPlan->actualFFTSizePerAxis[axis_id][1] < 512) && (FFTPlan->actualFFTSizePerAxis[axis_id][2] == 1)) { - if (app->configuration.sharedMemorySize / axis->specializationConstants.fftDim >= app->configuration.coalescedMemory) { - if (1024 / axis->specializationConstants.fftDim < maxSequenceLengthSharedMemory / axis->specializationConstants.fftDim) { - if (1024 / axis->specializationConstants.fftDim > axis->groupedBatch) - axis->groupedBatch = 1024 / axis->specializationConstants.fftDim; - else - axis->groupedBatch = maxSequenceLengthSharedMemory / axis->specializationConstants.fftDim; - } - } - } - else { - axis->groupedBatch = (app->configuration.sharedMemorySize / axis->specializationConstants.fftDim >= 
app->configuration.coalescedMemory) ? maxSequenceLengthSharedMemory / axis->specializationConstants.fftDim : axis->groupedBatch; - }*/ - //if (axis->groupedBatch * (uint64_t)ceil(axis->specializationConstants.fftDim / 8.0) < app->configuration.warpSize) axis->groupedBatch = app->configuration.warpSize / (uint64_t)ceil(axis->specializationConstants.fftDim / 8.0); - //axis->groupedBatch = (app->configuration.sharedMemorySize / axis->specializationConstants.fftDim >= app->configuration.coalescedMemory) ? maxSequenceLengthSharedMemory / axis->specializationConstants.fftDim : axis->groupedBatch; - if (((FFTPlan->numAxisUploads[axis_id] == 1) && (axis_id == 0)) || ((axis_id == 0) && (!axis->specializationConstants.reorderFourStep) && (axis_upload_id == 0))) { - axis->groupedBatch = (maxSequenceLengthSharedMemoryPow2 / axis->specializationConstants.fftDim > axis->groupedBatch) ? maxSequenceLengthSharedMemoryPow2 / axis->specializationConstants.fftDim : axis->groupedBatch; - } - else { - axis->groupedBatch = (maxSingleSizeStridedPow2 / axis->specializationConstants.fftDim > 1) ? maxSingleSizeStridedPow2 / axis->specializationConstants.fftDim * axis->groupedBatch : axis->groupedBatch; - } - //axis->groupedBatch = 8; - //shared memory bank conflict resolve -//#if(VKFFT_BACKEND!=2)//for some reason, hip doesn't get performance increase from having variable shared memory strides. 
- if ((FFTPlan->numAxisUploads[axis_id] == 2) && (axis_upload_id == 0) && (axis->specializationConstants.fftDim * maxBatchCoalesced <= maxSequenceLengthSharedMemory)) { - axis->groupedBatch = (uint64_t)ceil(axis->groupedBatch / 2.0); - } - //#endif - if ((FFTPlan->numAxisUploads[axis_id] == 3) && (axis_upload_id == 0) && (axis->specializationConstants.fftDim < maxSequenceLengthSharedMemory / (2 * complexSize))) { - axis->groupedBatch = (uint64_t)ceil(axis->groupedBatch / 2.0); - } - if (axis->groupedBatch < maxBatchCoalesced) axis->groupedBatch = maxBatchCoalesced; - axis->groupedBatch = (axis->groupedBatch / maxBatchCoalesced) * maxBatchCoalesced; - //half bandiwdth technique - if (!((axis_id == 0) && (FFTPlan->numAxisUploads[axis_id] == 1)) && !((axis_id == 0) && (axis_upload_id == 0) && (!axis->specializationConstants.reorderFourStep)) && (axis->specializationConstants.fftDim > maxSingleSizeStrided)) { - axis->groupedBatch = maxSequenceLengthSharedMemory / axis->specializationConstants.fftDim; - if (axis->groupedBatch == 0) axis->groupedBatch = 1; - } - - if ((app->configuration.halfThreads) && (axis->groupedBatch * axis->specializationConstants.fftDim * complexSize >= app->configuration.sharedMemorySize)) - axis->groupedBatch = (uint64_t)ceil(axis->groupedBatch / 2.0); - if (axis->groupedBatch > app->configuration.warpSize) axis->groupedBatch = (axis->groupedBatch / app->configuration.warpSize) * app->configuration.warpSize; - if (axis->groupedBatch > 2 * maxBatchCoalesced) axis->groupedBatch = (axis->groupedBatch / (2 * maxBatchCoalesced)) * (2 * maxBatchCoalesced); - if (axis->groupedBatch > 4 * maxBatchCoalesced) axis->groupedBatch = (axis->groupedBatch / (4 * maxBatchCoalesced)) * (2 * maxBatchCoalesced); - //uint64_t maxThreadNum = (axis_id) ? 
(maxSingleSizeStrided * app->configuration.coalescedMemory / complexSize) / (axis->specializationConstants.min_registers_per_thread * axis->specializationConstants.registerBoost) : maxSequenceLengthSharedMemory / (axis->specializationConstants.min_registers_per_thread * axis->specializationConstants.registerBoost); - //if (maxThreadNum > app->configuration.maxThreadsNum) maxThreadNum = app->configuration.maxThreadsNum; - uint64_t maxThreadNum = app->configuration.maxThreadsNum; - axis->specializationConstants.axisSwapped = 0; - uint64_t r2cmult = (axis->specializationConstants.mergeSequencesR2C) ? 2 : 1; - if (axis_id == 0) { - if (axis_upload_id == 0) { - axis->axisBlock[0] = (((uint64_t)ceil(axis->specializationConstants.fftDim / (double)axis->specializationConstants.min_registers_per_thread)) / axis->specializationConstants.registerBoost > 1) ? ((uint64_t)ceil(axis->specializationConstants.fftDim / (double)axis->specializationConstants.min_registers_per_thread)) / axis->specializationConstants.registerBoost : 1; - if (axis->specializationConstants.useRaderMult) { - uint64_t locMaxBatchCoalesced = ((axis_id == 0) && (((axis_upload_id == 0) && ((!app->configuration.reorderFourStep) || (app->useBluesteinFFT[axis_id]))) || (axis->specializationConstants.numAxisUploads == 1))) ? 
1 : maxBatchCoalesced; - uint64_t final_rader_thread_count = 0; - for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { - if (axis->specializationConstants.raderContainer[i].type == 1) { - uint64_t temp_rader = (uint64_t)ceil((axis->specializationConstants.fftDim / (double)((axis->specializationConstants.rader_min_registers / 2) * 2)) / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2)); - uint64_t active_rader = (uint64_t)ceil((axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader); - if (active_rader > 1) { - if ((((double)active_rader - (axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader) >= 0.5) && ((((uint64_t)ceil((axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)(active_rader - 1)) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2)) * locMaxBatchCoalesced) <= app->configuration.maxThreadsNum)) active_rader--; - } - uint64_t local_estimate_rader_threadnum = (uint64_t)ceil((axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)active_rader) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); - - uint64_t temp_rader_thread_count = ((uint64_t)ceil(axis->axisBlock[0] / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2))) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); - if (temp_rader_thread_count < local_estimate_rader_threadnum) temp_rader_thread_count = local_estimate_rader_threadnum; - if (temp_rader_thread_count > final_rader_thread_count) final_rader_thread_count = temp_rader_thread_count; - } - } - axis->axisBlock[0] = final_rader_thread_count; - if (axis->axisBlock[0] * axis->groupedBatch > maxThreadNum) axis->groupedBatch = locMaxBatchCoalesced; - } - if (axis->specializationConstants.useRaderFFT) { - if 
(axis->axisBlock[0] < axis->specializationConstants.minRaderFFTThreadNum) axis->axisBlock[0] = axis->specializationConstants.minRaderFFTThreadNum; - } - if (axis->axisBlock[0] > maxThreadNum) axis->axisBlock[0] = maxThreadNum; - if (axis->axisBlock[0] > app->configuration.maxComputeWorkGroupSize[0]) axis->axisBlock[0] = app->configuration.maxComputeWorkGroupSize[0]; - if (axis->specializationConstants.reorderFourStep && (FFTPlan->numAxisUploads[axis_id] > 1)) - axis->axisBlock[1] = axis->groupedBatch; - else { - //axis->axisBlock[1] = (axis->axisBlock[0] < app->configuration.warpSize) ? app->configuration.warpSize / axis->axisBlock[0] : 1; - uint64_t estimate_batch = (((axis->axisBlock[0] / app->configuration.warpSize) == 1) && ((axis->axisBlock[0] / (double)app->configuration.warpSize) < 1.5)) ? app->configuration.aimThreads / app->configuration.warpSize : app->configuration.aimThreads / axis->axisBlock[0]; - if (estimate_batch == 0) estimate_batch = 1; - axis->axisBlock[1] = ((axis->axisBlock[0] < app->configuration.aimThreads) && ((axis->axisBlock[0] < app->configuration.warpSize) || (axis->specializationConstants.useRader))) ? 
estimate_batch : 1; - } - - uint64_t currentAxisBlock1 = axis->axisBlock[1]; - for (uint64_t i = currentAxisBlock1; i < 2 * currentAxisBlock1; i++) { - if (((FFTPlan->numAxisUploads[0] > 1) && (!(((FFTPlan->actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim) % axis->axisBlock[1]) == 0))) || ((FFTPlan->numAxisUploads[0] == 1) && (!(((FFTPlan->actualFFTSizePerAxis[axis_id][1] / r2cmult) % axis->axisBlock[1]) == 0)))) { - if (i * axis->specializationConstants.fftDim * complexSize <= allowedSharedMemory) axis->axisBlock[1] = i; - i = 2 * currentAxisBlock1; - } - } - if (((axis->specializationConstants.fftDim % 2 == 0) || (axis->axisBlock[0] < app->configuration.numSharedBanks / 4)) && (!(((!axis->specializationConstants.reorderFourStep) || (axis->specializationConstants.useBluesteinFFT)) && (FFTPlan->numAxisUploads[0] > 1))) && (axis->axisBlock[1] > 1) && (axis->axisBlock[1] * axis->specializationConstants.fftDim < maxSequenceLengthSharedMemoryPow2) && (!((app->configuration.performZeropadding[0] || app->configuration.performZeropadding[1] || app->configuration.performZeropadding[2])))) { - //we plan to swap - this reduces bank conflicts - axis->axisBlock[1] = (uint64_t)pow(2, (uint64_t)ceil(log2((double)axis->axisBlock[1]))); - } - if ((FFTPlan->numAxisUploads[0] > 1) && ((uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim) < axis->axisBlock[1])) axis->axisBlock[1] = (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim); - if ((axis->specializationConstants.mergeSequencesR2C != 0) && (axis->specializationConstants.fftDim * axis->axisBlock[1] >= maxSequenceLengthSharedMemory)) { - axis->specializationConstants.mergeSequencesR2C = 0; - /*if ((!inverse) && (axis_id == 0) && (axis_upload_id == 0) && (!(app->configuration.isInputFormatted))) { - axis->specializationConstants.inputStride[1] /= 2; - axis->specializationConstants.inputStride[2] /= 2; - 
axis->specializationConstants.inputStride[3] /= 2; - axis->specializationConstants.inputStride[4] /= 2; - } - if ((inverse) && (axis_id == 0) && (axis_upload_id == 0) && (!((app->configuration.isInputFormatted) && (app->configuration.inverseReturnToInputBuffer))) && (!app->configuration.isOutputFormatted)) { - axis->specializationConstants.outputStride[1] /= 2; - axis->specializationConstants.outputStride[2] /= 2; - axis->specializationConstants.outputStride[3] /= 2; - axis->specializationConstants.outputStride[4] /= 2; - }*/ - r2cmult = 1; - } - if ((FFTPlan->numAxisUploads[0] == 1) && ((uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][1] / (double)r2cmult) < axis->axisBlock[1])) axis->axisBlock[1] = (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][1] / (double)r2cmult); - - if (axis->axisBlock[1] > app->configuration.maxComputeWorkGroupSize[1]) axis->axisBlock[1] = app->configuration.maxComputeWorkGroupSize[1]; - //if (axis->axisBlock[0] * axis->axisBlock[1] > app->configuration.maxThreadsNum) axis->axisBlock[1] /= 2; - if (axis->axisBlock[0] * axis->axisBlock[1] > maxThreadNum) { - for (uint64_t i = 1; i <= axis->axisBlock[1]; i++) { - if ((axis->axisBlock[1] / i) * axis->axisBlock[0] <= maxThreadNum) - { - axis->axisBlock[1] /= i; - i = axis->axisBlock[1] + 1; - } - - } - } - while ((axis->axisBlock[1] * (axis->specializationConstants.fftDim / axis->specializationConstants.registerBoost)) > maxSequenceLengthSharedMemory) axis->axisBlock[1] /= 2; - if (((axis->specializationConstants.fftDim % 2 == 0) || (axis->axisBlock[0] < app->configuration.numSharedBanks / 4)) && (!(((!axis->specializationConstants.reorderFourStep) || (axis->specializationConstants.useBluesteinFFT)) && (FFTPlan->numAxisUploads[0] > 1))) && (axis->axisBlock[1] > 1) && (axis->axisBlock[1] * axis->specializationConstants.fftDim < maxSequenceLengthSharedMemoryPow2) && (!((app->configuration.performZeropadding[0] || app->configuration.performZeropadding[1] || 
app->configuration.performZeropadding[2])))) { - /*#if (VKFFT_BACKEND==0) - if (((axis->specializationConstants.fftDim & (axis->specializationConstants.fftDim - 1)) != 0)) { - uint64_t temp = axis->axisBlock[1]; - axis->axisBlock[1] = axis->axisBlock[0]; - axis->axisBlock[0] = temp; - axis->specializationConstants.axisSwapped = 1; - } - #else*/ - uint64_t temp = axis->axisBlock[1]; - axis->axisBlock[1] = axis->axisBlock[0]; - axis->axisBlock[0] = temp; - axis->specializationConstants.axisSwapped = 1; - //#endif - } - axis->axisBlock[2] = 1; - axis->axisBlock[3] = axis->specializationConstants.fftDim; - } - else { - axis->axisBlock[1] = ((uint64_t)ceil(axis->specializationConstants.fftDim / (double)axis->specializationConstants.min_registers_per_thread) / axis->specializationConstants.registerBoost > 1) ? (uint64_t)ceil(axis->specializationConstants.fftDim / (double)axis->specializationConstants.min_registers_per_thread) / axis->specializationConstants.registerBoost : 1; - if (axis->specializationConstants.useRaderMult) { - uint64_t final_rader_thread_count = 0; - for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { - if (axis->specializationConstants.raderContainer[i].type == 1) { - uint64_t temp_rader = (uint64_t)ceil((axis->specializationConstants.fftDim / (double)((axis->specializationConstants.rader_min_registers / 2) * 2)) / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2)); - uint64_t active_rader = (uint64_t)ceil((axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader); - if (active_rader > 1) { - if ((((double)active_rader - (axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader) >= 0.5) && ((((uint64_t)ceil((axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)(active_rader - 1)) * ((axis->specializationConstants.raderContainer[i].prime + 
1) / 2)) * maxBatchCoalesced) <= app->configuration.maxThreadsNum)) active_rader--; - } - uint64_t local_estimate_rader_threadnum = (uint64_t)ceil((axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)active_rader) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); - - uint64_t temp_rader_thread_count = ((uint64_t)ceil(axis->axisBlock[1] / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2))) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); - if (temp_rader_thread_count < local_estimate_rader_threadnum) temp_rader_thread_count = local_estimate_rader_threadnum; - if (temp_rader_thread_count > final_rader_thread_count) final_rader_thread_count = temp_rader_thread_count; - } - } - axis->axisBlock[1] = final_rader_thread_count; - if (axis->groupedBatch * axis->axisBlock[1] > maxThreadNum) axis->groupedBatch = maxBatchCoalesced; - } - - uint64_t scale = app->configuration.aimThreads / axis->axisBlock[1] / axis->groupedBatch; - if (scale > 1) axis->groupedBatch *= scale; - axis->axisBlock[0] = (axis->specializationConstants.stageStartSize > axis->groupedBatch) ? axis->groupedBatch : axis->specializationConstants.stageStartSize; - if (axis->axisBlock[0] > app->configuration.maxComputeWorkGroupSize[0]) axis->axisBlock[0] = app->configuration.maxComputeWorkGroupSize[0]; - if (axis->axisBlock[0] * axis->axisBlock[1] > maxThreadNum) { - for (uint64_t i = 1; i <= axis->axisBlock[0]; i++) { - if ((axis->axisBlock[0] / i) * axis->axisBlock[1] <= maxThreadNum) - { - axis->axisBlock[0] /= i; - i = axis->axisBlock[0] + 1; - } - - } - } - axis->axisBlock[2] = 1; - axis->axisBlock[3] = axis->specializationConstants.fftDim; - } - - } - if (axis_id == 1) { - - axis->axisBlock[1] = ((uint64_t)ceil(axis->specializationConstants.fftDim / (double)axis->specializationConstants.min_registers_per_thread) / axis->specializationConstants.registerBoost > 1) ? 
((uint64_t)ceil(axis->specializationConstants.fftDim / (double)axis->specializationConstants.min_registers_per_thread)) / axis->specializationConstants.registerBoost : 1; - if (axis->specializationConstants.useRaderMult) { - uint64_t final_rader_thread_count = 0; - for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { - if (axis->specializationConstants.raderContainer[i].type == 1) { - uint64_t temp_rader = (uint64_t)ceil((axis->specializationConstants.fftDim / (double)((axis->specializationConstants.rader_min_registers / 2) * 2)) / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2)); - uint64_t active_rader = (uint64_t)ceil((axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader); - if (active_rader > 1) { - if ((((double)active_rader - (axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader) >= 0.5) && ((((uint64_t)ceil((axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)(active_rader - 1)) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2)) * maxBatchCoalesced) <= app->configuration.maxThreadsNum)) active_rader--; - } - uint64_t local_estimate_rader_threadnum = (uint64_t)ceil((axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)active_rader) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); - - uint64_t temp_rader_thread_count = ((uint64_t)ceil(axis->axisBlock[1] / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2))) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); - if (temp_rader_thread_count < local_estimate_rader_threadnum) temp_rader_thread_count = local_estimate_rader_threadnum; - if (temp_rader_thread_count > final_rader_thread_count) final_rader_thread_count = temp_rader_thread_count; - } - } - 
axis->axisBlock[1] = final_rader_thread_count; - if (axis->groupedBatch * axis->axisBlock[1] > maxThreadNum) axis->groupedBatch = maxBatchCoalesced; - } - axis->axisBlock[0] = (FFTPlan->actualFFTSizePerAxis[axis_id][0] > axis->groupedBatch) ? axis->groupedBatch : FFTPlan->actualFFTSizePerAxis[axis_id][0]; - if (axis->axisBlock[0] > app->configuration.maxComputeWorkGroupSize[0]) axis->axisBlock[0] = app->configuration.maxComputeWorkGroupSize[0]; - if (axis->axisBlock[0] * axis->axisBlock[1] > maxThreadNum) { - for (uint64_t i = 1; i <= axis->axisBlock[0]; i++) { - if ((axis->axisBlock[0] / i) * axis->axisBlock[1] <= maxThreadNum) - { - axis->axisBlock[0] /= i; - i = axis->axisBlock[0] + 1; - } - - } - } - axis->axisBlock[2] = 1; - axis->axisBlock[3] = axis->specializationConstants.fftDim; - - } - if (axis_id == 2) { - axis->axisBlock[1] = ((uint64_t)ceil(axis->specializationConstants.fftDim / (double)axis->specializationConstants.min_registers_per_thread) / axis->specializationConstants.registerBoost > 1) ? 
((uint64_t)ceil(axis->specializationConstants.fftDim / (double)axis->specializationConstants.min_registers_per_thread)) / axis->specializationConstants.registerBoost : 1; - if (axis->specializationConstants.useRaderMult) { - uint64_t final_rader_thread_count = 0; - for (uint64_t i = 0; i < axis->specializationConstants.numRaderPrimes; i++) { - if (axis->specializationConstants.raderContainer[i].type == 1) { - uint64_t temp_rader = (uint64_t)ceil((axis->specializationConstants.fftDim / (double)((axis->specializationConstants.rader_min_registers / 2) * 2)) / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2)); - uint64_t active_rader = (uint64_t)ceil((axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader); - if (active_rader > 1) { - if ((((double)active_rader - (axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)temp_rader) >= 0.5) && ((((uint64_t)ceil((axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)(active_rader - 1)) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2)) * maxBatchCoalesced) <= app->configuration.maxThreadsNum)) active_rader--; - } - uint64_t local_estimate_rader_threadnum = (uint64_t)ceil((axis->specializationConstants.fftDim / axis->specializationConstants.raderContainer[i].prime) / (double)active_rader) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); - - uint64_t temp_rader_thread_count = ((uint64_t)ceil(axis->axisBlock[1] / (double)((axis->specializationConstants.raderContainer[i].prime + 1) / 2))) * ((axis->specializationConstants.raderContainer[i].prime + 1) / 2); - if (temp_rader_thread_count < local_estimate_rader_threadnum) temp_rader_thread_count = local_estimate_rader_threadnum; - if (temp_rader_thread_count > final_rader_thread_count) final_rader_thread_count = temp_rader_thread_count; - } - } - 
axis->axisBlock[1] = final_rader_thread_count; - if (axis->groupedBatch * axis->axisBlock[1] > maxThreadNum) axis->groupedBatch = maxBatchCoalesced; - } - axis->axisBlock[0] = (FFTPlan->actualFFTSizePerAxis[axis_id][0] > axis->groupedBatch) ? axis->groupedBatch : FFTPlan->actualFFTSizePerAxis[axis_id][0]; - - if (axis->axisBlock[0] > app->configuration.maxComputeWorkGroupSize[0]) axis->axisBlock[0] = app->configuration.maxComputeWorkGroupSize[0]; - if (axis->axisBlock[0] * axis->axisBlock[1] > maxThreadNum) { - for (uint64_t i = 1; i <= axis->axisBlock[0]; i++) { - if ((axis->axisBlock[0] / i) * axis->axisBlock[1] <= maxThreadNum) - { - axis->axisBlock[0] /= i; - i = axis->axisBlock[0] + 1; - } - - } - } - axis->axisBlock[2] = 1; - axis->axisBlock[3] = axis->specializationConstants.fftDim; - } - - - - /*VkSpecializationMapEntry specializationMapEntries[36] = { {} }; - for (uint64_t i = 0; i < 36; i++) { - specializationMapEntries[i].constantID = i + 1; - specializationMapEntries[i].size = sizeof(uint64_t); - specializationMapEntries[i].offset = i * sizeof(uint64_t); - } - VkSpecializationInfo specializationInfo = { 0 }; - specializationInfo.dataSize = 36 * sizeof(uint64_t); - specializationInfo.mapEntryCount = 36; - specializationInfo.pMapEntries = specializationMapEntries;*/ - axis->specializationConstants.localSize[0] = axis->axisBlock[0]; - axis->specializationConstants.localSize[1] = axis->axisBlock[1]; - axis->specializationConstants.localSize[2] = axis->axisBlock[2]; - axis->specializationConstants.numSubgroups = (uint64_t)ceil(axis->axisBlock[0] * axis->axisBlock[1] * axis->axisBlock[2] / (double)app->configuration.warpSize); - //specializationInfo.pData = &axis->specializationConstants; - //uint64_t registerBoost = (FFTPlan->numAxisUploads[axis_id] > 1) ? app->configuration.registerBoost4Step : app->configuration.registerBoost; - - axis->specializationConstants.numCoordinates = (app->configuration.matrixConvolution > 1) ? 
1 : app->configuration.coordinateFeatures; - axis->specializationConstants.matrixConvolution = app->configuration.matrixConvolution; - axis->specializationConstants.numKernels = app->configuration.numberKernels; - axis->specializationConstants.sharedMemSize = app->configuration.sharedMemorySize; - axis->specializationConstants.sharedMemSizePow2 = app->configuration.sharedMemorySizePow2; - axis->specializationConstants.normalize = (reverseBluesteinMultiUpload) ? 1 : app->configuration.normalize; - axis->specializationConstants.size[0] = FFTPlan->actualFFTSizePerAxis[axis_id][0]; - axis->specializationConstants.size[1] = FFTPlan->actualFFTSizePerAxis[axis_id][1]; - axis->specializationConstants.size[2] = FFTPlan->actualFFTSizePerAxis[axis_id][2]; - axis->specializationConstants.axis_id = axis_id; - axis->specializationConstants.axis_upload_id = axis_upload_id; - - for (uint64_t i = 0; i < 3; i++) { - axis->specializationConstants.frequencyZeropadding = app->configuration.frequencyZeroPadding; - axis->specializationConstants.performZeropaddingFull[i] = app->configuration.performZeropadding[i]; // don't read if input is zeropadded (0 - off, 1 - on) - axis->specializationConstants.fft_zeropad_left_full[i] = app->configuration.fft_zeropad_left[i]; - axis->specializationConstants.fft_zeropad_right_full[i] = app->configuration.fft_zeropad_right[i]; - } - if (axis->specializationConstants.useBluesteinFFT && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && ((reverseBluesteinMultiUpload == 0) || (FFTPlan->numAxisUploads[axis_id] == 1))) { - axis->specializationConstants.zeropadBluestein[0] = 1; - axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id] = app->configuration.size[axis_id]; - if ((FFTPlan->multiUploadR2C) && (axis_id == 0)) axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id] /= 2; - if (app->configuration.performDCT == 1) axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id] = 2 * 
axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id] - 2; - if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) axis->specializationConstants.fft_zeropad_Bluestein_left_read[axis_id] /= 2; - axis->specializationConstants.fft_zeropad_Bluestein_right_read[axis_id] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; - } - if (axis->specializationConstants.useBluesteinFFT && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && ((reverseBluesteinMultiUpload == 1) || (FFTPlan->numAxisUploads[axis_id] == 1))) { - axis->specializationConstants.zeropadBluestein[1] = 1; - axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id] = app->configuration.size[axis_id]; - if ((FFTPlan->multiUploadR2C) && (axis_id == 0)) axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id] /= 2; - if (app->configuration.performDCT == 1) axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id] = 2 * axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id] - 2; - if ((app->configuration.performDCT == 4) && (app->configuration.size[axis_id] % 2 == 0)) axis->specializationConstants.fft_zeropad_Bluestein_left_write[axis_id] /= 2; - axis->specializationConstants.fft_zeropad_Bluestein_right_write[axis_id] = FFTPlan->actualFFTSizePerAxis[axis_id][axis_id]; - } - uint64_t zeropad_r2c_multiupload_scale = ((axis_id == 0) && (FFTPlan->multiUploadR2C)) ? 
2 : 1; - if ((inverse)) { - if ((app->configuration.frequencyZeroPadding) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (reverseBluesteinMultiUpload != 1)) { - axis->specializationConstants.zeropad[0] = app->configuration.performZeropadding[axis_id]; - axis->specializationConstants.fft_zeropad_left_read[axis_id] = app->configuration.fft_zeropad_left[axis_id] / zeropad_r2c_multiupload_scale; - axis->specializationConstants.fft_zeropad_right_read[axis_id] = app->configuration.fft_zeropad_right[axis_id] / zeropad_r2c_multiupload_scale; - } - else - axis->specializationConstants.zeropad[0] = 0; - if ((!app->configuration.frequencyZeroPadding) && (((axis_upload_id == 0) && (!((axis->specializationConstants.useBluesteinFFT) || (app->configuration.performConvolution)))) || ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && ((((reverseBluesteinMultiUpload == 1) || (FFTPlan->numAxisUploads[axis_id] == 1)) || (app->configuration.performConvolution)))))) { - axis->specializationConstants.zeropad[1] = app->configuration.performZeropadding[axis_id]; - axis->specializationConstants.fft_zeropad_left_write[axis_id] = app->configuration.fft_zeropad_left[axis_id] / zeropad_r2c_multiupload_scale; - axis->specializationConstants.fft_zeropad_right_write[axis_id] = app->configuration.fft_zeropad_right[axis_id] / zeropad_r2c_multiupload_scale; - } - else - axis->specializationConstants.zeropad[1] = 0; - } - else { - if ((!app->configuration.frequencyZeroPadding) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (reverseBluesteinMultiUpload != 1)) { - axis->specializationConstants.zeropad[0] = app->configuration.performZeropadding[axis_id]; - axis->specializationConstants.fft_zeropad_left_read[axis_id] = app->configuration.fft_zeropad_left[axis_id] / zeropad_r2c_multiupload_scale; - axis->specializationConstants.fft_zeropad_right_read[axis_id] = app->configuration.fft_zeropad_right[axis_id] / zeropad_r2c_multiupload_scale; - } - else - 
axis->specializationConstants.zeropad[0] = 0; - if (((app->configuration.frequencyZeroPadding) && (((axis_upload_id == 0) && (!axis->specializationConstants.useBluesteinFFT)) || ((axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (axis->specializationConstants.useBluesteinFFT && ((reverseBluesteinMultiUpload == 1) || (FFTPlan->numAxisUploads[axis_id] == 1)))))) || (((!app->configuration.frequencyZeroPadding) && (app->configuration.FFTdim - 1 == axis_id) && (axis_upload_id == 0) && (FFTPlan->numAxisUploads[axis_id] == 1) && (app->configuration.performConvolution)))) { - axis->specializationConstants.zeropad[1] = app->configuration.performZeropadding[axis_id]; - axis->specializationConstants.fft_zeropad_left_write[axis_id] = app->configuration.fft_zeropad_left[axis_id] / zeropad_r2c_multiupload_scale; - axis->specializationConstants.fft_zeropad_right_write[axis_id] = app->configuration.fft_zeropad_right[axis_id] / zeropad_r2c_multiupload_scale; - } - else - axis->specializationConstants.zeropad[1] = 0; - } - if ((app->configuration.FFTdim - 1 == axis_id) && (axis_upload_id == 0) && (app->configuration.performConvolution)) { - axis->specializationConstants.convolutionStep = 1; - } - else - axis->specializationConstants.convolutionStep = 0; - if (app->useBluesteinFFT[axis_id] && (axis_upload_id == 0)) - axis->specializationConstants.BluesteinConvolutionStep = 1; - else - axis->specializationConstants.BluesteinConvolutionStep = 0; - - if (app->useBluesteinFFT[axis_id] && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (reverseBluesteinMultiUpload == 0)) - axis->specializationConstants.BluesteinPreMultiplication = 1; - else - axis->specializationConstants.BluesteinPreMultiplication = 0; - if (app->useBluesteinFFT[axis_id] && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && ((reverseBluesteinMultiUpload == 1) || (FFTPlan->numAxisUploads[axis_id] == 1))) - axis->specializationConstants.BluesteinPostMultiplication = 1; - else - 
axis->specializationConstants.BluesteinPostMultiplication = 0; - - - uint64_t tempSize[3] = { FFTPlan->actualFFTSizePerAxis[axis_id][0], FFTPlan->actualFFTSizePerAxis[axis_id][1], FFTPlan->actualFFTSizePerAxis[axis_id][2] }; - - - if (axis_id == 0) { - if (axis_upload_id == 0) - tempSize[0] = FFTPlan->actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim / axis->axisBlock[1]; - else - tempSize[0] = FFTPlan->actualFFTSizePerAxis[axis_id][0] / axis->specializationConstants.fftDim / axis->axisBlock[0]; - if ((FFTPlan->actualPerformR2CPerAxis[axis_id] == 1) && (axis->specializationConstants.mergeSequencesR2C)) tempSize[1] = (uint64_t)ceil(tempSize[1] / 2.0); - tempSize[2] *= app->configuration.numberKernels * app->configuration.numberBatches; - if (!(axis->specializationConstants.convolutionStep && (app->configuration.matrixConvolution > 1))) tempSize[2] *= app->configuration.coordinateFeatures; - //if (app->configuration.performZeropadding[1]) tempSize[1] = (uint64_t)ceil(tempSize[1] / 2.0); - //if (app->configuration.performZeropadding[2]) tempSize[2] = (uint64_t)ceil(tempSize[2] / 2.0); - } - if (axis_id == 1) { - tempSize[0] = (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][0] / (double)axis->axisBlock[0] * FFTPlan->actualFFTSizePerAxis[axis_id][1] / (double)axis->specializationConstants.fftDim); - tempSize[1] = 1; - tempSize[2] = FFTPlan->actualFFTSizePerAxis[axis_id][2]; - tempSize[2] *= app->configuration.numberKernels * app->configuration.numberBatches; - if (!(axis->specializationConstants.convolutionStep && (app->configuration.matrixConvolution > 1))) tempSize[2] *= app->configuration.coordinateFeatures; - //if (app->configuration.actualPerformR2C == 1) tempSize[0] = (uint64_t)ceil(tempSize[0] / 2.0); - //if (app->configuration.performZeropadding[2]) tempSize[2] = (uint64_t)ceil(tempSize[2] / 2.0); - } - if (axis_id == 2) { - tempSize[0] = (uint64_t)ceil(FFTPlan->actualFFTSizePerAxis[axis_id][0] / (double)axis->axisBlock[0] * 
FFTPlan->actualFFTSizePerAxis[axis_id][2] / (double)axis->specializationConstants.fftDim); - tempSize[1] = 1; - tempSize[2] = FFTPlan->actualFFTSizePerAxis[axis_id][1]; - tempSize[2] *= app->configuration.numberKernels * app->configuration.numberBatches; - if (!(axis->specializationConstants.convolutionStep && (app->configuration.matrixConvolution > 1))) tempSize[2] *= app->configuration.coordinateFeatures; - //if (app->configuration.actualPerformR2C == 1) tempSize[0] = (uint64_t)ceil(tempSize[0] / 2.0); - - } - if ((app->configuration.maxComputeWorkGroupCount[0] > app->configuration.maxComputeWorkGroupCount[1]) && (tempSize[1] > app->configuration.maxComputeWorkGroupCount[1]) && (tempSize[1] > tempSize[0]) && (tempSize[1] >= tempSize[2])) { - uint64_t temp_tempSize = tempSize[0]; - tempSize[0] = tempSize[1]; - tempSize[1] = temp_tempSize; - axis->specializationConstants.swapComputeWorkGroupID = 1; - } - else { - if ((app->configuration.maxComputeWorkGroupCount[0] > app->configuration.maxComputeWorkGroupCount[2]) && (tempSize[2] > app->configuration.maxComputeWorkGroupCount[2]) && (tempSize[2] > tempSize[0]) && (tempSize[2] >= tempSize[1])) { - uint64_t temp_tempSize = tempSize[0]; - tempSize[0] = tempSize[2]; - tempSize[2] = temp_tempSize; - axis->specializationConstants.swapComputeWorkGroupID = 2; - } - } - if (tempSize[0] > app->configuration.maxComputeWorkGroupCount[0]) axis->specializationConstants.performWorkGroupShift[0] = 1; - else axis->specializationConstants.performWorkGroupShift[0] = 0; - if (tempSize[1] > app->configuration.maxComputeWorkGroupCount[1]) axis->specializationConstants.performWorkGroupShift[1] = 1; - else axis->specializationConstants.performWorkGroupShift[1] = 0; - if (tempSize[2] > app->configuration.maxComputeWorkGroupCount[2]) axis->specializationConstants.performWorkGroupShift[2] = 1; - else axis->specializationConstants.performWorkGroupShift[2] = 0; - - char floatTypeInputMemory[10]; - char floatTypeOutputMemory[10]; - char 
floatTypeKernelMemory[10]; - char floatType[10]; - axis->specializationConstants.unroll = 1; - axis->specializationConstants.LUT = app->configuration.useLUT; - if (app->configuration.doublePrecision) { - sprintf(floatType, "double"); - sprintf(floatTypeInputMemory, "double"); - sprintf(floatTypeOutputMemory, "double"); - sprintf(floatTypeKernelMemory, "double"); - //axis->specializationConstants.unroll = 1; - } - else { - //axis->specializationConstants.unroll = 0; - if (app->configuration.halfPrecision) { - sprintf(floatType, "float"); - if (app->configuration.halfPrecisionMemoryOnly) { - //only out of place mode, input/output buffer must be different - sprintf(floatTypeKernelMemory, "float"); - if ((axis_id == app->firstAxis) && (axis_upload_id == FFTPlan->numAxisUploads[axis_id] - 1) && (!axis->specializationConstants.actualInverse)) - sprintf(floatTypeInputMemory, "half"); - else - sprintf(floatTypeInputMemory, "float"); - if ((axis_id == app->firstAxis) && (axis_upload_id == 0) && (axis->specializationConstants.actualInverse)) - sprintf(floatTypeOutputMemory, "half"); - else - sprintf(floatTypeOutputMemory, "float"); - } - else { - sprintf(floatTypeInputMemory, "half"); - sprintf(floatTypeOutputMemory, "half"); - sprintf(floatTypeKernelMemory, "half"); - } - - } - else { - if (app->configuration.doublePrecisionFloatMemory) { - sprintf(floatType, "double"); - sprintf(floatTypeInputMemory, "float"); - sprintf(floatTypeOutputMemory, "float"); - sprintf(floatTypeKernelMemory, "float"); - } - else { - sprintf(floatType, "float"); - sprintf(floatTypeInputMemory, "float"); - sprintf(floatTypeOutputMemory, "float"); - sprintf(floatTypeKernelMemory, "float"); - } - } - } - char uintType[20] = ""; - if (!app->configuration.useUint64) { -#if(VKFFT_BACKEND==0) - sprintf(uintType, "uint"); -#elif(VKFFT_BACKEND==1) - sprintf(uintType, "unsigned int"); -#elif(VKFFT_BACKEND==2) - sprintf(uintType, "unsigned int"); -#elif(VKFFT_BACKEND==3) - sprintf(uintType, "unsigned int"); 
-#elif(VKFFT_BACKEND==4) - sprintf(uintType, "unsigned int"); -#endif - } - else { -#if(VKFFT_BACKEND==0) - sprintf(uintType, "uint64_t"); -#elif(VKFFT_BACKEND==1) - sprintf(uintType, "unsigned long long"); -#elif(VKFFT_BACKEND==2) - sprintf(uintType, "unsigned long long"); -#elif(VKFFT_BACKEND==3) - sprintf(uintType, "unsigned long"); -#elif(VKFFT_BACKEND==4) - sprintf(uintType, "unsigned long"); -#endif - } - { - axis->pushConstants.structSize = 0; - if (axis->specializationConstants.performWorkGroupShift[0]) { - axis->pushConstants.performWorkGroupShift[0] = 1; - axis->pushConstants.structSize += 1; - } - if (axis->specializationConstants.performWorkGroupShift[1]) { - axis->pushConstants.performWorkGroupShift[1] = 1; - axis->pushConstants.structSize += 1; - } - if (axis->specializationConstants.performWorkGroupShift[2]) { - axis->pushConstants.performWorkGroupShift[2] = 1; - axis->pushConstants.structSize += 1; - } - if (axis->specializationConstants.performPostCompilationInputOffset) { - axis->pushConstants.performPostCompilationInputOffset = 1; - axis->pushConstants.structSize += 1; - } - if (axis->specializationConstants.performPostCompilationOutputOffset) { - axis->pushConstants.performPostCompilationOutputOffset = 1; - axis->pushConstants.structSize += 1; - } - if (axis->specializationConstants.performPostCompilationKernelOffset) { - axis->pushConstants.performPostCompilationKernelOffset = 1; - axis->pushConstants.structSize += 1; - } - if (app->configuration.useUint64) - axis->pushConstants.structSize *= sizeof(uint64_t); - else - axis->pushConstants.structSize *= sizeof(uint32_t); - axis->specializationConstants.pushConstantsStructSize = axis->pushConstants.structSize; - } - //uint64_t LUT = app->configuration.useLUT; - uint64_t type = 0; - if ((axis_id == 0) && (axis_upload_id == 0)) type = 0; - if (axis_id != 0) type = 1; - if ((axis_id == 0) && (axis_upload_id > 0)) type = 2; - //if ((axis->specializationConstants.fftDim == 8 * 
maxSequenceLengthSharedMemory) && (app->configuration.registerBoost >= 8)) axis->specializationConstants.registerBoost = 8; - if ((axis_id == 0) && (!axis->specializationConstants.actualInverse) && (FFTPlan->actualPerformR2CPerAxis[axis_id])) type = 5; - if ((axis_id == 0) && (axis->specializationConstants.actualInverse) && (FFTPlan->actualPerformR2CPerAxis[axis_id])) type = 6; - if ((axis_id == 0) && (app->configuration.performDCT == 1)) type = 110; - if ((axis_id != 0) && (app->configuration.performDCT == 1)) type = 111; - if ((axis_id == 0) && (((app->configuration.performDCT == 2) && (!inverse)) || ((app->configuration.performDCT == 3) && (inverse)))) type = 120; - if ((axis_id != 0) && (((app->configuration.performDCT == 2) && (!inverse)) || ((app->configuration.performDCT == 3) && (inverse)))) type = 121; - if ((axis_id == 0) && (((app->configuration.performDCT == 2) && (inverse)) || ((app->configuration.performDCT == 3) && (!inverse)))) type = 130; - if ((axis_id != 0) && (((app->configuration.performDCT == 2) && (inverse)) || ((app->configuration.performDCT == 3) && (!inverse)))) type = 131; - if ((axis_id == 0) && (app->configuration.performDCT == 4) && ((app->configuration.size[axis_id] % 2) == 0)) type = 142; - if ((axis_id == 0) && (app->configuration.performDCT == 4) && ((app->configuration.size[axis_id] % 2) == 1)) type = 144; - if ((axis_id != 0) && (app->configuration.performDCT == 4) && ((app->configuration.size[axis_id] % 2) == 0)) type = 143; - if ((axis_id != 0) && (app->configuration.performDCT == 4) && ((app->configuration.size[axis_id] % 2) == 1)) type = 145; -#if(VKFFT_BACKEND==0) - axis->specializationConstants.cacheShuffle = 0; //((FFTPlan->numAxisUploads[axis_id] > 1) && ((axis->specializationConstants.fftDim & (axis->specializationConstants.fftDim - 1)) == 0) && (!app->configuration.doublePrecision) && (!axis->specializationConstants.useBluesteinFFT) && (!app->configuration.doublePrecisionFloatMemory) && ((type == 0) || (type == 5) || 
(type == 6))) ? 1 : 0; -#elif(VKFFT_BACKEND==1) - axis->specializationConstants.cacheShuffle = 0; -#elif(VKFFT_BACKEND==2) - axis->specializationConstants.cacheShuffle = 0; -#elif(VKFFT_BACKEND==3) - axis->specializationConstants.cacheShuffle = 0; -#elif(VKFFT_BACKEND==4) - axis->specializationConstants.cacheShuffle = 0; -#endif - axis->specializationConstants.maxCodeLength = app->configuration.maxCodeLength; - axis->specializationConstants.maxTempLength = app->configuration.maxTempLength; - axis->specializationConstants.code0 = (char*)malloc(sizeof(char) * app->configuration.maxCodeLength); - char* code0 = axis->specializationConstants.code0; - if (!code0) { - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - resFFT = shaderGenVkFFT(code0, &axis->specializationConstants, floatType, floatTypeInputMemory, floatTypeOutputMemory, floatTypeKernelMemory, uintType, type); - freeShaderGenVkFFT(&axis->specializationConstants); - if (resFFT != VKFFT_SUCCESS) { - deleteVkFFT(app); - return resFFT; - } -#if(VKFFT_BACKEND==0) - uint32_t* code; - uint64_t codeSize; - if (app->configuration.loadApplicationFromString) { - char* localStrPointer = (char*)app->configuration.loadApplicationString + app->currentApplicationStringPos; - memcpy(&codeSize, localStrPointer, sizeof(uint64_t)); - code = (uint32_t*)malloc(codeSize); - if (!code) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - memcpy(code, localStrPointer + sizeof(uint64_t), codeSize); - app->currentApplicationStringPos += codeSize + sizeof(uint64_t); - } - else - { - glslang_resource_t default_resource = {}; - default_resource.max_lights = 32; - default_resource.max_clip_planes = 6; - default_resource.max_texture_units = 32; - default_resource.max_texture_coords = 32; - default_resource.max_vertex_attribs = 64; - default_resource.max_vertex_uniform_components = 4096; - default_resource.max_varying_floats = 64; - default_resource.max_vertex_texture_image_units = 32; - 
default_resource.max_combined_texture_image_units = 80; - default_resource.max_texture_image_units = 32; - default_resource.max_fragment_uniform_components = 4096; - default_resource.max_draw_buffers = 32; - default_resource.max_vertex_uniform_vectors = 128; - default_resource.max_varying_vectors = 8; - default_resource.max_fragment_uniform_vectors = 16; - default_resource.max_vertex_output_vectors = 16; - default_resource.max_fragment_input_vectors = 15; - default_resource.min_program_texel_offset = -8; - default_resource.max_program_texel_offset = 7; - default_resource.max_clip_distances = 8; - default_resource.max_compute_work_group_count_x = (int)app->configuration.maxComputeWorkGroupCount[0]; - default_resource.max_compute_work_group_count_y = (int)app->configuration.maxComputeWorkGroupCount[1]; - default_resource.max_compute_work_group_count_z = (int)app->configuration.maxComputeWorkGroupCount[2]; - default_resource.max_compute_work_group_size_x = (int)app->configuration.maxComputeWorkGroupSize[0]; - default_resource.max_compute_work_group_size_y = (int)app->configuration.maxComputeWorkGroupSize[1]; - default_resource.max_compute_work_group_size_z = (int)app->configuration.maxComputeWorkGroupSize[2]; - default_resource.max_compute_uniform_components = 1024; - default_resource.max_compute_texture_image_units = 16; - default_resource.max_compute_image_uniforms = 8; - default_resource.max_compute_atomic_counters = 8; - default_resource.max_compute_atomic_counter_buffers = 1; - default_resource.max_varying_components = 60; - default_resource.max_vertex_output_components = 64; - default_resource.max_geometry_input_components = 64; - default_resource.max_geometry_output_components = 128; - default_resource.max_fragment_input_components = 128; - default_resource.max_image_units = 8; - default_resource.max_combined_image_units_and_fragment_outputs = 8; - default_resource.max_combined_shader_output_resources = 8; - default_resource.max_image_samples = 0; - 
default_resource.max_vertex_image_uniforms = 0; - default_resource.max_tess_control_image_uniforms = 0; - default_resource.max_tess_evaluation_image_uniforms = 0; - default_resource.max_geometry_image_uniforms = 0; - default_resource.max_fragment_image_uniforms = 8; - default_resource.max_combined_image_uniforms = 8; - default_resource.max_geometry_texture_image_units = 16; - default_resource.max_geometry_output_vertices = 256; - default_resource.max_geometry_total_output_components = 1024; - default_resource.max_geometry_uniform_components = 1024; - default_resource.max_geometry_varying_components = 64; - default_resource.max_tess_control_input_components = 128; - default_resource.max_tess_control_output_components = 128; - default_resource.max_tess_control_texture_image_units = 16; - default_resource.max_tess_control_uniform_components = 1024; - default_resource.max_tess_control_total_output_components = 4096; - default_resource.max_tess_evaluation_input_components = 128; - default_resource.max_tess_evaluation_output_components = 128; - default_resource.max_tess_evaluation_texture_image_units = 16; - default_resource.max_tess_evaluation_uniform_components = 1024; - default_resource.max_tess_patch_components = 120; - default_resource.max_patch_vertices = 32; - default_resource.max_tess_gen_level = 64; - default_resource.max_viewports = 16; - default_resource.max_vertex_atomic_counters = 0; - default_resource.max_tess_control_atomic_counters = 0; - default_resource.max_tess_evaluation_atomic_counters = 0; - default_resource.max_geometry_atomic_counters = 0; - default_resource.max_fragment_atomic_counters = 8; - default_resource.max_combined_atomic_counters = 8; - default_resource.max_atomic_counter_bindings = 1; - default_resource.max_vertex_atomic_counter_buffers = 0; - default_resource.max_tess_control_atomic_counter_buffers = 0; - default_resource.max_tess_evaluation_atomic_counter_buffers = 0; - default_resource.max_geometry_atomic_counter_buffers = 0; - 
default_resource.max_fragment_atomic_counter_buffers = 1; - default_resource.max_combined_atomic_counter_buffers = 1; - default_resource.max_atomic_counter_buffer_size = 16384; - default_resource.max_transform_feedback_buffers = 4; - default_resource.max_transform_feedback_interleaved_components = 64; - default_resource.max_cull_distances = 8; - default_resource.max_combined_clip_and_cull_distances = 8; - default_resource.max_samples = 4; - default_resource.max_mesh_output_vertices_nv = 256; - default_resource.max_mesh_output_primitives_nv = 512; - default_resource.max_mesh_work_group_size_x_nv = 32; - default_resource.max_mesh_work_group_size_y_nv = 1; - default_resource.max_mesh_work_group_size_z_nv = 1; - default_resource.max_task_work_group_size_x_nv = 32; - default_resource.max_task_work_group_size_y_nv = 1; - default_resource.max_task_work_group_size_z_nv = 1; - default_resource.max_mesh_view_count_nv = 4; - - default_resource.limits.non_inductive_for_loops = 1; - default_resource.limits.while_loops = 1; - default_resource.limits.do_while_loops = 1; - default_resource.limits.general_uniform_indexing = 1; - default_resource.limits.general_attribute_matrix_vector_indexing = 1; - default_resource.limits.general_varying_indexing = 1; - default_resource.limits.general_sampler_indexing = 1; - default_resource.limits.general_variable_indexing = 1; - default_resource.limits.general_constant_matrix_vector_indexing = 1; - glslang_target_client_version_t client_version = (app->configuration.halfPrecision) ? GLSLANG_TARGET_VULKAN_1_1 : GLSLANG_TARGET_VULKAN_1_0; - glslang_target_language_version_t target_language_version = (app->configuration.halfPrecision) ? 
GLSLANG_TARGET_SPV_1_3 : GLSLANG_TARGET_SPV_1_0; - glslang_input_t input = - { - GLSLANG_SOURCE_GLSL, - GLSLANG_STAGE_COMPUTE, - GLSLANG_CLIENT_VULKAN, - client_version, - GLSLANG_TARGET_SPV, - target_language_version, - code0, - 450, - GLSLANG_NO_PROFILE, - 1, - 0, - GLSLANG_MSG_DEFAULT_BIT, - (const glslang_resource_t*)&default_resource, - }; - //printf("%s\n", code0); - glslang_shader_t* shader = glslang_shader_create((const glslang_input_t*)&input); - const char* err; - if (!glslang_shader_preprocess(shader, &input)) - { - err = glslang_shader_get_info_log(shader); - printf("%s\n", code0); - printf("%s\nVkFFT shader type: %" PRIu64 "\n", err, type); - glslang_shader_delete(shader); - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_SHADER_PREPROCESS; - - } - - if (!glslang_shader_parse(shader, &input)) - { - err = glslang_shader_get_info_log(shader); - printf("%s\n", code0); - printf("%s\nVkFFT shader type: %" PRIu64 "\n", err, type); - glslang_shader_delete(shader); - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_SHADER_PARSE; - - } - glslang_program_t* program = glslang_program_create(); - glslang_program_add_shader(program, shader); - if (!glslang_program_link(program, GLSLANG_MSG_SPV_RULES_BIT | GLSLANG_MSG_VULKAN_RULES_BIT)) - { - err = glslang_program_get_info_log(program); - printf("%s\n", code0); - printf("%s\nVkFFT shader type: %" PRIu64 "\n", err, type); - glslang_shader_delete(shader); - glslang_program_delete(program); - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_SHADER_LINK; - - } - - glslang_program_SPIRV_generate(program, input.stage); - - if (glslang_program_SPIRV_get_messages(program)) - { - printf("%s", glslang_program_SPIRV_get_messages(program)); - glslang_shader_delete(shader); - glslang_program_delete(program); - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_SPIRV_GENERATE; - } - - glslang_shader_delete(shader); - uint32_t* 
tempCode = glslang_program_SPIRV_get_ptr(program); - codeSize = glslang_program_SPIRV_get_size(program) * sizeof(uint32_t); - axis->binarySize = codeSize; - code = (uint32_t*)malloc(codeSize); - if (!code) { - free(code0); - code0 = 0; - glslang_program_delete(program); - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - axis->binary = code; - memcpy(code, tempCode, codeSize); - glslang_program_delete(program); - } - VkPipelineShaderStageCreateInfo pipelineShaderStageCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO }; - VkComputePipelineCreateInfo computePipelineCreateInfo = { VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO }; - pipelineShaderStageCreateInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT; - VkShaderModuleCreateInfo createInfo = { VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO }; - createInfo.pCode = code; - createInfo.codeSize = codeSize; - res = vkCreateShaderModule(app->configuration.device[0], &createInfo, 0, &pipelineShaderStageCreateInfo.module); - if (res != VK_SUCCESS) { - free(code); - code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_CREATE_SHADER_MODULE; - } - VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO }; - pipelineLayoutCreateInfo.setLayoutCount = 1; - pipelineLayoutCreateInfo.pSetLayouts = &axis->descriptorSetLayout; - VkPushConstantRange pushConstantRange = { VK_SHADER_STAGE_COMPUTE_BIT }; - pushConstantRange.offset = 0; - pushConstantRange.size = (uint32_t)axis->pushConstants.structSize; - // Push constant ranges are part of the pipeline layout - if (axis->pushConstants.structSize) { - pipelineLayoutCreateInfo.pushConstantRangeCount = 1; - pipelineLayoutCreateInfo.pPushConstantRanges = &pushConstantRange; - } - res = vkCreatePipelineLayout(app->configuration.device[0], &pipelineLayoutCreateInfo, 0, &axis->pipelineLayout); - if (res != VK_SUCCESS) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_CREATE_PIPELINE_LAYOUT; - } 
- - pipelineShaderStageCreateInfo.pName = "main"; - pipelineShaderStageCreateInfo.pSpecializationInfo = 0;// &specializationInfo; - computePipelineCreateInfo.stage = pipelineShaderStageCreateInfo; - computePipelineCreateInfo.layout = axis->pipelineLayout; - res = vkCreateComputePipelines(app->configuration.device[0], VK_NULL_HANDLE, 1, &computePipelineCreateInfo, 0, &axis->pipeline); - if (res != VK_SUCCESS) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_CREATE_PIPELINE; - } - vkDestroyShaderModule(app->configuration.device[0], pipelineShaderStageCreateInfo.module, 0); - if (!app->configuration.saveApplicationToString) { - free(code); - code = 0; - } -#elif(VKFFT_BACKEND==1) - char* code; - uint64_t codeSize; - if (app->configuration.loadApplicationFromString) { - char* localStrPointer = (char*)app->configuration.loadApplicationString + app->currentApplicationStringPos; - memcpy(&codeSize, localStrPointer, sizeof(uint64_t)); - code = (char*)malloc(codeSize); - if (!code) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - memcpy(code, localStrPointer + sizeof(uint64_t), codeSize); - app->currentApplicationStringPos += codeSize + sizeof(uint64_t); - } - else { - nvrtcProgram prog; - nvrtcResult result = nvrtcCreateProgram(&prog, // prog - code0, // buffer - "VkFFT.cu", // name - 0, // numHeaders - 0, // headers - 0); // includeNames - //free(includeNames); - //free(headers); - if (result != NVRTC_SUCCESS) { - printf("nvrtcCreateProgram error: %s\n", nvrtcGetErrorString(result)); - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM; - } - char* opts[5]; - opts[0] = (char*)malloc(sizeof(char) * 50); - if (!opts[0]) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - sprintf(opts[0], "--gpu-architecture=sm_%" PRIu64 "%" PRIu64 "", app->configuration.computeCapabilityMajor, app->configuration.computeCapabilityMinor); - //result = 
nvrtcAddNameExpression(prog, "&consts"); - //if (result != NVRTC_SUCCESS) printf("1.5 error: %s\n", nvrtcGetErrorString(result)); - result = nvrtcCompileProgram(prog, // prog - 1, // numOptions - (const char* const*)opts); // options - free(opts[0]); - - if (result != NVRTC_SUCCESS) { - printf("nvrtcCompileProgram error: %s\n", nvrtcGetErrorString(result)); - char* log = (char*)malloc(sizeof(char) * 4000000); - if (!log) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; - } - else { - nvrtcGetProgramLog(prog, log); - printf("%s\n", log); - free(log); - log = 0; - printf("%s\n", code0); - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; - } - } -#if (CUDA_VERSION >= 11030) - result = nvrtcGetCUBINSize(prog, &codeSize); -#else - result = nvrtcGetPTXSize(prog, &codeSize); -#endif - if (result != NVRTC_SUCCESS) { -#if (CUDA_VERSION >= 11030) - printf("nvrtcGetCUBINSize error: %s\n", nvrtcGetErrorString(result)); -#else - printf("nvrtcGetPTXSize error: %s\n", nvrtcGetErrorString(result)); -#endif - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_CODE_SIZE; - } - axis->binarySize = codeSize; - code = (char*)malloc(codeSize); - if (!code) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - axis->binary = code; -#if (CUDA_VERSION >= 11030) - result = nvrtcGetCUBIN(prog, code); -#else - result = nvrtcGetPTX(prog, code); -#endif - if (result != NVRTC_SUCCESS) { -#if (CUDA_VERSION >= 11030) - printf("nvrtcGetCUBIN error: %s\n", nvrtcGetErrorString(result)); -#else - printf("nvrtcGetPTX error: %s\n", nvrtcGetErrorString(result)); -#endif - free(code); - code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_CODE; - } - result = nvrtcDestroyProgram(&prog); - if (result != NVRTC_SUCCESS) { - printf("nvrtcDestroyProgram error: %s\n", nvrtcGetErrorString(result)); - free(code); - 
code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_DESTROY_PROGRAM; - } - } - CUresult result2 = cuModuleLoadDataEx(&axis->VkFFTModule, code, 0, 0, 0); - - if (result2 != CUDA_SUCCESS) { - printf("cuModuleLoadDataEx error: %d\n", result2); - free(code); - code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_LOAD_MODULE; - } - result2 = cuModuleGetFunction(&axis->VkFFTKernel, axis->VkFFTModule, "VkFFT_main"); - if (result2 != CUDA_SUCCESS) { - printf("cuModuleGetFunction error: %d\n", result2); - free(code); - code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_FUNCTION; - } - - /*result2 = cuFuncSetCacheConfig(axis->VkFFTKernel, CU_FUNC_CACHE_PREFER_SHARED); - if (result2 != CUDA_SUCCESS) { - printf("cuFuncSetAttribute error: %d\n", result2); - free(code); - code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_SET_DYNAMIC_SHARED_MEMORY; - }*/ - - if (axis->specializationConstants.usedSharedMemory > app->configuration.sharedMemorySizeStatic) { - result2 = cuFuncSetAttribute(axis->VkFFTKernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, (int)axis->specializationConstants.usedSharedMemory); - if (result2 != CUDA_SUCCESS) { - printf("cuFuncSetAttribute error: %d\n", result2); - free(code); - code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_SET_DYNAMIC_SHARED_MEMORY; - } - } - if (axis->pushConstants.structSize) { - size_t size = axis->pushConstants.structSize; - result2 = cuModuleGetGlobal(&axis->consts_addr, &size, axis->VkFFTModule, "consts"); - if (result2 != CUDA_SUCCESS) { - printf("cuModuleGetGlobal error: %d\n", result2); - free(code); - code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_MODULE_GET_GLOBAL; - } - } - if (!app->configuration.saveApplicationToString) { - free(code); - code = 0; - } -#elif(VKFFT_BACKEND==2) - uint32_t* 
code; - uint64_t codeSize; - if (app->configuration.loadApplicationFromString) { - char* localStrPointer = (char*)app->configuration.loadApplicationString + app->currentApplicationStringPos; - memcpy(&codeSize, localStrPointer, sizeof(uint64_t)); - code = (uint32_t*)malloc(codeSize); - if (!code) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - memcpy(code, localStrPointer + sizeof(uint64_t), codeSize); - app->currentApplicationStringPos += codeSize + sizeof(uint64_t); - } - else - { - hiprtcProgram prog; - enum hiprtcResult result = hiprtcCreateProgram(&prog, // prog - code0, // buffer - "VkFFT.hip", // name - 0, // numHeaders - 0, // headers - 0); // includeNames - if (result != HIPRTC_SUCCESS) { - printf("hiprtcCreateProgram error: %s\n", hiprtcGetErrorString(result)); - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM; - } - if (axis->pushConstants.structSize) { - result = hiprtcAddNameExpression(prog, "&consts"); - if (result != HIPRTC_SUCCESS) { - printf("hiprtcAddNameExpression error: %s\n", hiprtcGetErrorString(result)); - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_ADD_NAME_EXPRESSION; - } - } - result = hiprtcCompileProgram(prog, // prog - 0, // numOptions - 0); // options - if (result != HIPRTC_SUCCESS) { - printf("hiprtcCompileProgram error: %s\n", hiprtcGetErrorString(result)); - char* log = (char*)malloc(sizeof(char) * 100000); - if (!log) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; - } - else { - hiprtcGetProgramLog(prog, log); - printf("%s\n", log); - free(log); - log = 0; - printf("%s\n", code0); - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; - } - } - result = hiprtcGetCodeSize(prog, &codeSize); - if (result != HIPRTC_SUCCESS) { - printf("hiprtcGetCodeSize error: %s\n", hiprtcGetErrorString(result)); - free(code0); - code0 = 0; - 
deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_CODE; - } - axis->binarySize = codeSize; - code = (uint32_t*)malloc(codeSize); - if (!code) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - axis->binary = code; - result = hiprtcGetCode(prog, (char*)code); - if (result != HIPRTC_SUCCESS) { - printf("hiprtcGetCode error: %s\n", hiprtcGetErrorString(result)); - free(code); - code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_CODE_SIZE; - } - //printf("%s\n", code); - // Destroy the program. - result = hiprtcDestroyProgram(&prog); - if (result != HIPRTC_SUCCESS) { - printf("hiprtcDestroyProgram error: %s\n", hiprtcGetErrorString(result)); - free(code); - code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_DESTROY_PROGRAM; - } - } - hipError_t result2 = hipModuleLoadDataEx(&axis->VkFFTModule, code, 0, 0, 0); - - if (result2 != hipSuccess) { - printf("hipModuleLoadDataEx error: %d\n", result2); - free(code); - code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_LOAD_MODULE; - } - result2 = hipModuleGetFunction(&axis->VkFFTKernel, axis->VkFFTModule, "VkFFT_main"); - if (result2 != hipSuccess) { - printf("hipModuleGetFunction error: %d\n", result2); - free(code); - code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_FUNCTION; - } - - /*result2 = hipFuncSetCacheConfig(axis->VkFFTKernel, hipFuncCachePreferShared); - if (result2 != hipSuccess) { - printf("hipFuncSetAttribute error: %d\n", result2); - free(code); - code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_SET_DYNAMIC_SHARED_MEMORY; - }*/ - - if (axis->specializationConstants.usedSharedMemory > app->configuration.sharedMemorySizeStatic) { - result2 = hipFuncSetAttribute(axis->VkFFTKernel, hipFuncAttributeMaxDynamicSharedMemorySize, 
(int)axis->specializationConstants.usedSharedMemory); - //result2 = hipFuncSetCacheConfig(axis->VkFFTKernel, hipFuncCachePreferShared); - if (result2 != hipSuccess) { - printf("hipFuncSetAttribute error: %d\n", result2); - free(code); - code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_SET_DYNAMIC_SHARED_MEMORY; - } - } - if (axis->pushConstants.structSize) { - size_t size = axis->pushConstants.structSize; - result2 = hipModuleGetGlobal(&axis->consts_addr, &size, axis->VkFFTModule, "consts"); - if (result2 != hipSuccess) { - printf("hipModuleGetGlobal error: %d\n", result2); - free(code); - code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_MODULE_GET_GLOBAL; - } - } - if (!app->configuration.saveApplicationToString) { - free(code); - code = 0; - } -#elif(VKFFT_BACKEND==3) - if (app->configuration.loadApplicationFromString) { - char* code; - uint64_t codeSize; - char* localStrPointer = (char*)app->configuration.loadApplicationString + app->currentApplicationStringPos; - memcpy(&codeSize, localStrPointer, sizeof(uint64_t)); - code = (char*)malloc(codeSize); - if (!code) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - memcpy(code, localStrPointer + sizeof(uint64_t), codeSize); - app->currentApplicationStringPos += codeSize + sizeof(uint64_t); - - axis->program = clCreateProgramWithBinary(app->configuration.context[0], 1, app->configuration.device, &codeSize, (const unsigned char**)(&code), 0, &res); - if (res != CL_SUCCESS) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM; - } - free(code); - code = 0; - } - else { - size_t codelen = strlen(code0); - axis->program = clCreateProgramWithSource(app->configuration.context[0], 1, (const char**)&code0, &codelen, &res); - if (res != CL_SUCCESS) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM; - } - } - res = 
clBuildProgram(axis->program, 1, app->configuration.device, 0, 0, 0); - if (res != CL_SUCCESS) { - size_t log_size; - clGetProgramBuildInfo(axis->program, app->configuration.device[0], CL_PROGRAM_BUILD_LOG, 0, 0, &log_size); - char* log = (char*)malloc(log_size); - if (!log) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; - } - else { - clGetProgramBuildInfo(axis->program, app->configuration.device[0], CL_PROGRAM_BUILD_LOG, log_size, log, 0); - printf("%s\n", log); - free(log); - log = 0; - printf("%s\n", code0); - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; - } - } - if (app->configuration.saveApplicationToString) { - size_t codeSize; - res = clGetProgramInfo(axis->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &codeSize, NULL); - if (res != CL_SUCCESS) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; - } - axis->binarySize = codeSize; - axis->binary = (char*)malloc(axis->binarySize); - if (!axis->binary) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - res = clGetProgramInfo(axis->program, CL_PROGRAM_BINARIES, axis->binarySize, &axis->binary, NULL); - if (res != CL_SUCCESS) { - free(axis->binary); - axis->binary = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; - } - } - axis->kernel = clCreateKernel(axis->program, "VkFFT_main", &res); - if (res != CL_SUCCESS) { - if (app->configuration.saveApplicationToString) { - free(axis->binary); - axis->binary = 0; - } - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_CREATE_SHADER_MODULE; - } -#elif(VKFFT_BACKEND==4) - uint32_t* code; - uint64_t codeSize; - if (app->configuration.loadApplicationFromString) { - char* localStrPointer = (char*)app->configuration.loadApplicationString + app->currentApplicationStringPos; - memcpy(&codeSize, localStrPointer, 
sizeof(uint64_t)); - code = (uint32_t*)malloc(codeSize); - if (!code) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - memcpy(code, localStrPointer + sizeof(uint64_t), codeSize); - app->currentApplicationStringPos += codeSize + sizeof(uint64_t); - - const char* pBuildFlags = (app->configuration.useUint64) ? "-ze-opt-greater-than-4GB-buffer-required" : 0; - ze_module_desc_t moduleDesc = { - ZE_STRUCTURE_TYPE_MODULE_DESC, - 0, - ZE_MODULE_FORMAT_NATIVE, - codeSize, - (uint8_t*)code, - pBuildFlags, - 0 - }; - res = zeModuleCreate(app->configuration.context[0], app->configuration.device[0], &moduleDesc, &axis->VkFFTModule, 0); - if (res != ZE_RESULT_SUCCESS) { - free(code); - code = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM; - } - free(code); - code = 0; - } - else { - size_t codelen = strlen(code0); - uint64_t successOpen = 0; - FILE* temp; - char fname_cl[100]; - char fname_bc[100]; - char fname_spv[100]; - int name_id = 0; - while (!successOpen) { - sprintf(fname_cl, "VkFFT_temp_cl_%d.cl", name_id); - temp = fopen(fname_cl, "r"); - if (temp != 0) { - fclose(temp); - name_id++; - } - else { - successOpen = 1; - sprintf(fname_bc, "VkFFT_temp_bc_%d.spv", name_id); - sprintf(fname_spv, "VkFFT_temp_cl_%d.spv", name_id); - } - } - temp = fopen(fname_cl, "w"); - fwrite(code0, 1, codelen, temp); - fclose(temp); - char system_call[500]; - sprintf(system_call, "clang -c -target spir64 -O0 -emit-llvm -o %s %s", fname_bc, fname_cl); - system(system_call); - sprintf(system_call, "llvm-spirv -o %s %s", fname_spv, fname_bc); - system(system_call); - temp = fopen(fname_spv, "rb"); - fseek(temp, 0L, SEEK_END); - uint64_t spv_size = ftell(temp); - rewind(temp); - - uint8_t* spv_binary = (uint8_t*)malloc(spv_size); - if (!spv_binary) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - fread(spv_binary, 1, spv_size, temp); - fclose(temp); - 
remove(fname_cl); - remove(fname_bc); - remove(fname_spv); - const char* pBuildFlags = (app->configuration.useUint64) ? "-ze-opt-greater-than-4GB-buffer-required" : 0; - - ze_module_desc_t moduleDesc = { - ZE_STRUCTURE_TYPE_MODULE_DESC, - 0, - ZE_MODULE_FORMAT_IL_SPIRV, - spv_size, - spv_binary, - pBuildFlags, - 0 - }; - res = zeModuleCreate(app->configuration.context[0], app->configuration.device[0], &moduleDesc, &axis->VkFFTModule, 0); - if (res != ZE_RESULT_SUCCESS) { - free(spv_binary); - spv_binary = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_CREATE_PROGRAM; - } - free(spv_binary); - spv_binary = 0; - if (app->configuration.saveApplicationToString) { - size_t codeSize; - res = zeModuleGetNativeBinary(axis->VkFFTModule, &codeSize, 0); - if (res != ZE_RESULT_SUCCESS) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; - } - axis->binarySize = codeSize; - axis->binary = (char*)malloc(axis->binarySize); - if (!axis->binary) { - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - res = zeModuleGetNativeBinary(axis->VkFFTModule, &codeSize, (uint8_t*)axis->binary); - if (res != ZE_RESULT_SUCCESS) { - free(axis->binary); - axis->binary = 0; - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_COMPILE_PROGRAM; - } - } - } - ze_kernel_desc_t kernelDesc = { - ZE_STRUCTURE_TYPE_KERNEL_DESC, - 0, - 0, // flags - "VkFFT_main" - }; - res = zeKernelCreate(axis->VkFFTModule, &kernelDesc, &axis->VkFFTKernel); - if (res != ZE_RESULT_SUCCESS) { - if (app->configuration.saveApplicationToString) { - free(axis->binary); - axis->binary = 0; - } - free(code0); - code0 = 0; - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_CREATE_SHADER_MODULE; - } -#endif - if (!app->configuration.keepShaderCode) { - free(code0); - code0 = 0; - axis->specializationConstants.code0 = 0; - } - } - if (axis->specializationConstants.axisSwapped) {//swap back 
for correct dispatch - uint64_t temp = axis->axisBlock[1]; - axis->axisBlock[1] = axis->axisBlock[0]; - axis->axisBlock[0] = temp; - axis->specializationConstants.axisSwapped = 0; - } - return resFFT; -} -static inline VkFFTResult initializeBluesteinAutoPadding(VkFFTApplication* app) { - VkFFTResult resFFT = VKFFT_SUCCESS; - if (!app->configuration.useCustomBluesteinPaddingPattern) { - switch (app->configuration.vendorID) { - case 0x10DE://NVIDIA - if (app->configuration.doublePrecision) { - app->configuration.autoCustomBluesteinPaddingPattern = 48; - } - else { - app->configuration.autoCustomBluesteinPaddingPattern = 45; - } - break; - default: //have not done a test run for Intel, so everything else uses AMD profile - if (app->configuration.doublePrecision) { - app->configuration.autoCustomBluesteinPaddingPattern = 54; - } - else { - app->configuration.autoCustomBluesteinPaddingPattern = 29; - } - break; - } - app->configuration.primeSizes = (uint64_t*)malloc(app->configuration.autoCustomBluesteinPaddingPattern * sizeof(uint64_t)); - if (!app->configuration.primeSizes) return VKFFT_ERROR_MALLOC_FAILED; - app->configuration.paddedSizes = (uint64_t*)malloc(app->configuration.autoCustomBluesteinPaddingPattern * sizeof(uint64_t)); - if (!app->configuration.paddedSizes) return VKFFT_ERROR_MALLOC_FAILED; - switch (app->configuration.vendorID) { - case 0x10DE://Nvidia - if (app->configuration.doublePrecision) { - app->configuration.primeSizes[0] = 17; - app->configuration.paddedSizes[0] = 36; - app->configuration.primeSizes[1] = 19; - app->configuration.paddedSizes[1] = 40; - app->configuration.primeSizes[2] = 23; - app->configuration.paddedSizes[2] = 48; - app->configuration.primeSizes[3] = 29; - app->configuration.paddedSizes[3] = 64; - app->configuration.primeSizes[4] = 34; - app->configuration.paddedSizes[4] = 70; - app->configuration.primeSizes[5] = 37; - app->configuration.paddedSizes[5] = 80; - app->configuration.primeSizes[6] = 41; - 
app->configuration.paddedSizes[6] = 90; - app->configuration.primeSizes[7] = 46; - app->configuration.paddedSizes[7] = 96; - app->configuration.primeSizes[8] = 51; - app->configuration.paddedSizes[8] = 104; - app->configuration.primeSizes[9] = 53; - app->configuration.paddedSizes[9] = 128; - app->configuration.primeSizes[10] = 67; - app->configuration.paddedSizes[10] = 144; - app->configuration.primeSizes[11] = 73; - app->configuration.paddedSizes[11] = 160; - app->configuration.primeSizes[12] = 82; - app->configuration.paddedSizes[12] = 256; - app->configuration.primeSizes[13] = 129; - app->configuration.paddedSizes[13] = 288; - app->configuration.primeSizes[14] = 145; - app->configuration.paddedSizes[14] = 512; - app->configuration.primeSizes[15] = 257; - app->configuration.paddedSizes[15] = 625; - app->configuration.primeSizes[16] = 314; - app->configuration.paddedSizes[16] = 750; - app->configuration.primeSizes[17] = 376; - app->configuration.paddedSizes[17] = 756; - app->configuration.primeSizes[18] = 379; - app->configuration.paddedSizes[18] = 768; - app->configuration.primeSizes[19] = 386; - app->configuration.paddedSizes[19] = 1024; - app->configuration.primeSizes[20] = 513; - app->configuration.paddedSizes[20] = 1056; - app->configuration.primeSizes[21] = 529; - app->configuration.paddedSizes[21] = 1200; - app->configuration.primeSizes[22] = 601; - app->configuration.paddedSizes[22] = 1225; - app->configuration.primeSizes[23] = 614; - app->configuration.paddedSizes[23] = 1250; - app->configuration.primeSizes[24] = 626; - app->configuration.paddedSizes[24] = 1296; - app->configuration.primeSizes[25] = 649; - app->configuration.paddedSizes[25] = 1331; - app->configuration.primeSizes[26] = 667; - app->configuration.paddedSizes[26] = 1440; - app->configuration.primeSizes[27] = 721; - app->configuration.paddedSizes[27] = 1456; - app->configuration.primeSizes[28] = 730; - app->configuration.paddedSizes[28] = 1560; - app->configuration.primeSizes[29] = 781; - 
app->configuration.paddedSizes[29] = 2048; - app->configuration.primeSizes[30] = 1025; - app->configuration.paddedSizes[30] = 2187; - app->configuration.primeSizes[31] = 1095; - app->configuration.paddedSizes[31] = 2304; - app->configuration.primeSizes[32] = 1153; - app->configuration.paddedSizes[32] = 2688; - app->configuration.primeSizes[33] = 1345; - app->configuration.paddedSizes[33] = 2730; - app->configuration.primeSizes[34] = 1366; - app->configuration.paddedSizes[34] = 2925; - app->configuration.primeSizes[35] = 1464; - app->configuration.paddedSizes[35] = 3000; - app->configuration.primeSizes[36] = 1501; - app->configuration.paddedSizes[36] = 4096; - app->configuration.primeSizes[37] = 2049; - app->configuration.paddedSizes[37] = 4368; - app->configuration.primeSizes[38] = 2185; - app->configuration.paddedSizes[38] = 4608; - app->configuration.primeSizes[39] = 2305; - app->configuration.paddedSizes[39] = 4900; - app->configuration.primeSizes[40] = 2364; - app->configuration.paddedSizes[40] = 4900; - app->configuration.primeSizes[41] = 2451; - app->configuration.paddedSizes[41] = 5184; - app->configuration.primeSizes[42] = 2593; - app->configuration.paddedSizes[42] = 5625; - app->configuration.primeSizes[43] = 2814; - app->configuration.paddedSizes[43] = 5760; - app->configuration.primeSizes[44] = 2881; - app->configuration.paddedSizes[44] = 6000; - app->configuration.primeSizes[45] = 3001; - app->configuration.paddedSizes[45] = 6048; - app->configuration.primeSizes[46] = 3026; - app->configuration.paddedSizes[46] = 6561; - app->configuration.primeSizes[47] = 3282; - app->configuration.paddedSizes[47] = 8192; - } - else { - app->configuration.primeSizes[0] = 17; - app->configuration.paddedSizes[0] = 36; - app->configuration.primeSizes[1] = 19; - app->configuration.paddedSizes[1] = 40; - app->configuration.primeSizes[2] = 23; - app->configuration.paddedSizes[2] = 48; - app->configuration.primeSizes[3] = 29; - app->configuration.paddedSizes[3] = 64; - 
app->configuration.primeSizes[4] = 34; - app->configuration.paddedSizes[4] = 70; - app->configuration.primeSizes[5] = 37; - app->configuration.paddedSizes[5] = 80; - app->configuration.primeSizes[6] = 41; - app->configuration.paddedSizes[6] = 96; - app->configuration.primeSizes[7] = 51; - app->configuration.paddedSizes[7] = 104; - app->configuration.primeSizes[8] = 53; - app->configuration.paddedSizes[8] = 112; - app->configuration.primeSizes[9] = 57; - app->configuration.paddedSizes[9] = 120; - app->configuration.primeSizes[10] = 61; - app->configuration.paddedSizes[10] = 128; - app->configuration.primeSizes[11] = 67; - app->configuration.paddedSizes[11] = 144; - app->configuration.primeSizes[12] = 73; - app->configuration.paddedSizes[12] = 150; - app->configuration.primeSizes[13] = 76; - app->configuration.paddedSizes[13] = 160; - app->configuration.primeSizes[14] = 82; - app->configuration.paddedSizes[14] = 256; - app->configuration.primeSizes[15] = 129; - app->configuration.paddedSizes[15] = 384; - app->configuration.primeSizes[16] = 193; - app->configuration.paddedSizes[16] = 512; - app->configuration.primeSizes[17] = 257; - app->configuration.paddedSizes[17] = 567; - app->configuration.primeSizes[18] = 285; - app->configuration.paddedSizes[18] = 625; - app->configuration.primeSizes[19] = 314; - app->configuration.paddedSizes[19] = 768; - app->configuration.primeSizes[20] = 386; - app->configuration.paddedSizes[20] = 832; - app->configuration.primeSizes[21] = 417; - app->configuration.paddedSizes[21] = 1024; - app->configuration.primeSizes[22] = 513; - app->configuration.paddedSizes[22] = 1152; - app->configuration.primeSizes[23] = 577; - app->configuration.paddedSizes[23] = 1200; - app->configuration.primeSizes[24] = 601; - app->configuration.paddedSizes[24] = 1296; - app->configuration.primeSizes[25] = 649; - app->configuration.paddedSizes[25] = 1536; - app->configuration.primeSizes[26] = 769; - app->configuration.paddedSizes[26] = 2048; - 
app->configuration.primeSizes[27] = 1025; - app->configuration.paddedSizes[27] = 2187; - app->configuration.primeSizes[28] = 1095; - app->configuration.paddedSizes[28] = 2304; - app->configuration.primeSizes[29] = 1153; - app->configuration.paddedSizes[29] = 2500; - app->configuration.primeSizes[30] = 1251; - app->configuration.paddedSizes[30] = 2592; - app->configuration.primeSizes[31] = 1297; - app->configuration.paddedSizes[31] = 2816; - app->configuration.primeSizes[32] = 1409; - app->configuration.paddedSizes[32] = 3072; - app->configuration.primeSizes[33] = 1537; - app->configuration.paddedSizes[33] = 4096; - app->configuration.primeSizes[34] = 2049; - app->configuration.paddedSizes[34] = 4368; - app->configuration.primeSizes[35] = 2185; - app->configuration.paddedSizes[35] = 4563; - app->configuration.primeSizes[36] = 2283; - app->configuration.paddedSizes[36] = 4576; - app->configuration.primeSizes[37] = 2289; - app->configuration.paddedSizes[37] = 4608; - app->configuration.primeSizes[38] = 2305; - app->configuration.paddedSizes[38] = 5184; - app->configuration.primeSizes[39] = 2593; - app->configuration.paddedSizes[39] = 5625; - app->configuration.primeSizes[40] = 2814; - app->configuration.paddedSizes[40] = 5632; - app->configuration.primeSizes[41] = 2817; - app->configuration.paddedSizes[41] = 6000; - app->configuration.primeSizes[42] = 3001; - app->configuration.paddedSizes[42] = 6144; - app->configuration.primeSizes[43] = 3073; - app->configuration.paddedSizes[43] = 6561; - app->configuration.primeSizes[44] = 3282; - app->configuration.paddedSizes[44] = 8192; - } - break; - default: //have not done a test run for Intel, so everything else uses AMD profile - if (app->configuration.doublePrecision) { - app->configuration.primeSizes[0] = 17; - app->configuration.paddedSizes[0] = 36; - app->configuration.primeSizes[1] = 19; - app->configuration.paddedSizes[1] = 40; - app->configuration.primeSizes[2] = 23; - app->configuration.paddedSizes[2] = 56; - 
app->configuration.primeSizes[3] = 29; - app->configuration.paddedSizes[3] = 64; - app->configuration.primeSizes[4] = 34; - app->configuration.paddedSizes[4] = 70; - app->configuration.primeSizes[5] = 37; - app->configuration.paddedSizes[5] = 78; - app->configuration.primeSizes[6] = 41; - app->configuration.paddedSizes[6] = 81; - app->configuration.primeSizes[7] = 43; - app->configuration.paddedSizes[7] = 90; - app->configuration.primeSizes[8] = 46; - app->configuration.paddedSizes[8] = 125; - app->configuration.primeSizes[9] = 67; - app->configuration.paddedSizes[9] = 150; - app->configuration.primeSizes[10] = 76; - app->configuration.paddedSizes[10] = 175; - app->configuration.primeSizes[11] = 89; - app->configuration.paddedSizes[11] = 189; - app->configuration.primeSizes[12] = 97; - app->configuration.paddedSizes[12] = 198; - app->configuration.primeSizes[13] = 101; - app->configuration.paddedSizes[13] = 243; - app->configuration.primeSizes[14] = 123; - app->configuration.paddedSizes[14] = 256; - app->configuration.primeSizes[15] = 129; - app->configuration.paddedSizes[15] = 270; - app->configuration.primeSizes[16] = 136; - app->configuration.paddedSizes[16] = 512; - app->configuration.primeSizes[17] = 257; - app->configuration.paddedSizes[17] = 625; - app->configuration.primeSizes[18] = 314; - app->configuration.paddedSizes[18] = 640; - app->configuration.primeSizes[19] = 321; - app->configuration.paddedSizes[19] = 702; - app->configuration.primeSizes[20] = 353; - app->configuration.paddedSizes[20] = 750; - app->configuration.primeSizes[21] = 376; - app->configuration.paddedSizes[21] = 756; - app->configuration.primeSizes[22] = 379; - app->configuration.paddedSizes[22] = 768; - app->configuration.primeSizes[23] = 386; - app->configuration.paddedSizes[23] = 875; - app->configuration.primeSizes[24] = 439; - app->configuration.paddedSizes[24] = 1024; - app->configuration.primeSizes[25] = 513; - app->configuration.paddedSizes[25] = 1296; - 
app->configuration.primeSizes[26] = 649; - app->configuration.paddedSizes[26] = 1300; - app->configuration.primeSizes[27] = 651; - app->configuration.paddedSizes[27] = 1323; - app->configuration.primeSizes[28] = 663; - app->configuration.paddedSizes[28] = 1344; - app->configuration.primeSizes[29] = 673; - app->configuration.paddedSizes[29] = 1512; - app->configuration.primeSizes[30] = 757; - app->configuration.paddedSizes[30] = 1792; - app->configuration.primeSizes[31] = 897; - app->configuration.paddedSizes[31] = 2016; - app->configuration.primeSizes[32] = 1009; - app->configuration.paddedSizes[32] = 2048; - app->configuration.primeSizes[33] = 1025; - app->configuration.paddedSizes[33] = 2187; - app->configuration.primeSizes[34] = 1095; - app->configuration.paddedSizes[34] = 3136; - app->configuration.primeSizes[35] = 1569; - app->configuration.paddedSizes[35] = 3159; - app->configuration.primeSizes[36] = 1581; - app->configuration.paddedSizes[36] = 3430; - app->configuration.primeSizes[37] = 1717; - app->configuration.paddedSizes[37] = 3584; - app->configuration.primeSizes[38] = 1793; - app->configuration.paddedSizes[38] = 4096; - app->configuration.primeSizes[39] = 2049; - app->configuration.paddedSizes[39] = 4224; - app->configuration.primeSizes[40] = 2113; - app->configuration.paddedSizes[40] = 4375; - app->configuration.primeSizes[41] = 2189; - app->configuration.paddedSizes[41] = 4480; - app->configuration.primeSizes[42] = 2241; - app->configuration.paddedSizes[42] = 4704; - app->configuration.primeSizes[43] = 2353; - app->configuration.paddedSizes[43] = 4928; - app->configuration.primeSizes[44] = 2465; - app->configuration.paddedSizes[44] = 4992; - app->configuration.primeSizes[45] = 2497; - app->configuration.paddedSizes[45] = 5005; - app->configuration.primeSizes[46] = 2504; - app->configuration.paddedSizes[46] = 5103; - app->configuration.primeSizes[47] = 2553; - app->configuration.paddedSizes[47] = 5376; - app->configuration.primeSizes[48] = 2689; - 
app->configuration.paddedSizes[48] = 5632; - app->configuration.primeSizes[49] = 2817; - app->configuration.paddedSizes[49] = 5824; - app->configuration.primeSizes[50] = 2913; - app->configuration.paddedSizes[50] = 6048; - app->configuration.primeSizes[51] = 3026; - app->configuration.paddedSizes[51] = 6144; - app->configuration.primeSizes[52] = 3073; - app->configuration.paddedSizes[52] = 6875; - app->configuration.primeSizes[53] = 3439; - app->configuration.paddedSizes[53] = 8192; - } - else { - app->configuration.primeSizes[0] = 17; - app->configuration.paddedSizes[0] = 36; - app->configuration.primeSizes[1] = 19; - app->configuration.paddedSizes[1] = 42; - app->configuration.primeSizes[2] = 23; - app->configuration.paddedSizes[2] = 64; - app->configuration.primeSizes[3] = 34; - app->configuration.paddedSizes[3] = 81; - app->configuration.primeSizes[4] = 43; - app->configuration.paddedSizes[4] = 88; - app->configuration.primeSizes[5] = 46; - app->configuration.paddedSizes[5] = 125; - app->configuration.primeSizes[6] = 67; - app->configuration.paddedSizes[6] = 150; - app->configuration.primeSizes[7] = 76; - app->configuration.paddedSizes[7] = 162; - app->configuration.primeSizes[8] = 82; - app->configuration.paddedSizes[8] = 175; - app->configuration.primeSizes[9] = 89; - app->configuration.paddedSizes[9] = 256; - app->configuration.primeSizes[10] = 129; - app->configuration.paddedSizes[10] = 512; - app->configuration.primeSizes[11] = 257; - app->configuration.paddedSizes[11] = 625; - app->configuration.primeSizes[12] = 314; - app->configuration.paddedSizes[12] = 768; - app->configuration.primeSizes[13] = 386; - app->configuration.paddedSizes[13] = 1024; - app->configuration.primeSizes[14] = 513; - app->configuration.paddedSizes[14] = 1296; - app->configuration.primeSizes[15] = 649; - app->configuration.paddedSizes[15] = 2048; - app->configuration.primeSizes[16] = 1025; - app->configuration.paddedSizes[16] = 2187; - app->configuration.primeSizes[17] = 1095; - 
app->configuration.paddedSizes[17] = 2304; - app->configuration.primeSizes[18] = 1153; - app->configuration.paddedSizes[18] = 2500; - app->configuration.primeSizes[19] = 1251; - app->configuration.paddedSizes[19] = 2592; - app->configuration.primeSizes[20] = 1297; - app->configuration.paddedSizes[20] = 3072; - app->configuration.primeSizes[21] = 1537; - app->configuration.paddedSizes[21] = 3125; - app->configuration.primeSizes[22] = 1564; - app->configuration.paddedSizes[22] = 3136; - app->configuration.primeSizes[23] = 1569; - app->configuration.paddedSizes[23] = 4096; - app->configuration.primeSizes[24] = 2049; - app->configuration.paddedSizes[24] = 4375; - app->configuration.primeSizes[25] = 2189; - app->configuration.paddedSizes[25] = 4608; - app->configuration.primeSizes[26] = 2305; - app->configuration.paddedSizes[26] = 5184; - app->configuration.primeSizes[27] = 2593; - app->configuration.paddedSizes[27] = 6561; - app->configuration.primeSizes[28] = 3282; - app->configuration.paddedSizes[28] = 8192; - } - break; - } - } - return resFFT; -} -static inline VkFFTResult initializeVkFFT(VkFFTApplication* app, VkFFTConfiguration inputLaunchConfiguration) { - VkFFTResult resFFT = VKFFT_SUCCESS; - //app->configuration = {};// inputLaunchConfiguration; - if (inputLaunchConfiguration.doublePrecision != 0) app->configuration.doublePrecision = inputLaunchConfiguration.doublePrecision; - if (inputLaunchConfiguration.doublePrecisionFloatMemory != 0) app->configuration.doublePrecisionFloatMemory = inputLaunchConfiguration.doublePrecisionFloatMemory; - if (inputLaunchConfiguration.halfPrecision != 0) app->configuration.halfPrecision = inputLaunchConfiguration.halfPrecision; - if (inputLaunchConfiguration.halfPrecisionMemoryOnly != 0) app->configuration.halfPrecisionMemoryOnly = inputLaunchConfiguration.halfPrecisionMemoryOnly; - if (inputLaunchConfiguration.useCustomBluesteinPaddingPattern != 0) { - app->configuration.useCustomBluesteinPaddingPattern = 
inputLaunchConfiguration.useCustomBluesteinPaddingPattern; - app->configuration.primeSizes = inputLaunchConfiguration.primeSizes; - if (!app->configuration.primeSizes) return VKFFT_ERROR_EMPRY_useCustomBluesteinPaddingPattern_arrays; - app->configuration.paddedSizes = inputLaunchConfiguration.paddedSizes; - if (!app->configuration.paddedSizes) return VKFFT_ERROR_EMPRY_useCustomBluesteinPaddingPattern_arrays; - } - //set device parameters -#if(VKFFT_BACKEND==0) - if (!inputLaunchConfiguration.isCompilerInitialized) { - if (!app->configuration.isCompilerInitialized) { - int resGlslangInitialize = glslang_initialize_process(); - if (!resGlslangInitialize) return VKFFT_ERROR_FAILED_TO_INITIALIZE; - app->configuration.isCompilerInitialized = 1; - } - } - if (inputLaunchConfiguration.physicalDevice == 0) { - deleteVkFFT(app); - return VKFFT_ERROR_INVALID_PHYSICAL_DEVICE; - } - app->configuration.physicalDevice = inputLaunchConfiguration.physicalDevice; - if (inputLaunchConfiguration.device == 0) { - deleteVkFFT(app); - return VKFFT_ERROR_INVALID_DEVICE; - } - app->configuration.device = inputLaunchConfiguration.device; - if (inputLaunchConfiguration.queue == 0) { - deleteVkFFT(app); - return VKFFT_ERROR_INVALID_QUEUE; - } - app->configuration.queue = inputLaunchConfiguration.queue; - if (inputLaunchConfiguration.commandPool == 0) { - deleteVkFFT(app); - return VKFFT_ERROR_INVALID_COMMAND_POOL; - } - app->configuration.commandPool = inputLaunchConfiguration.commandPool; - if (inputLaunchConfiguration.fence == 0) { - deleteVkFFT(app); - return VKFFT_ERROR_INVALID_FENCE; - } - app->configuration.fence = inputLaunchConfiguration.fence; - - VkPhysicalDeviceProperties physicalDeviceProperties = { 0 }; - vkGetPhysicalDeviceProperties(app->configuration.physicalDevice[0], &physicalDeviceProperties); - app->configuration.maxThreadsNum = physicalDeviceProperties.limits.maxComputeWorkGroupInvocations; - if (physicalDeviceProperties.vendorID == 0x8086) 
app->configuration.maxThreadsNum = 256; //Intel fix - app->configuration.maxComputeWorkGroupCount[0] = physicalDeviceProperties.limits.maxComputeWorkGroupCount[0]; - app->configuration.maxComputeWorkGroupCount[1] = physicalDeviceProperties.limits.maxComputeWorkGroupCount[1]; - app->configuration.maxComputeWorkGroupCount[2] = physicalDeviceProperties.limits.maxComputeWorkGroupCount[2]; - app->configuration.maxComputeWorkGroupSize[0] = physicalDeviceProperties.limits.maxComputeWorkGroupSize[0]; - app->configuration.maxComputeWorkGroupSize[1] = physicalDeviceProperties.limits.maxComputeWorkGroupSize[1]; - app->configuration.maxComputeWorkGroupSize[2] = physicalDeviceProperties.limits.maxComputeWorkGroupSize[2]; - //if ((physicalDeviceProperties.vendorID == 0x8086) && (!app->configuration.doublePrecision) && (!app->configuration.doublePrecisionFloatMemory)) app->configuration.halfThreads = 1; - app->configuration.sharedMemorySize = physicalDeviceProperties.limits.maxComputeSharedMemorySize; - app->configuration.sharedMemorySizePow2 = (uint64_t)pow(2, (uint64_t)log2(physicalDeviceProperties.limits.maxComputeSharedMemorySize)); - app->configuration.vendorID = physicalDeviceProperties.vendorID; - app->configuration.useRaderUintLUT = 1; - switch (physicalDeviceProperties.vendorID) { - case 0x10DE://NVIDIA - app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 64 : 32;//the coalesced memory is equal to 32 bytes between L2 and VRAM. - app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) ? 1 : 0; - app->configuration.warpSize = 32; - app->configuration.registerBoostNonPow2 = 0; - app->configuration.registerBoost = 4; - app->configuration.registerBoost4Step = 1; - app->configuration.swapTo3Stage4Step = 0; - break; - case 0x8086://INTEL - app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 
128 : 64; - app->configuration.useLUT = 1; - app->configuration.warpSize = 32; - app->configuration.registerBoostNonPow2 = 0; - app->configuration.registerBoost = (physicalDeviceProperties.limits.maxComputeSharedMemorySize >= 65536) ? 1 : 2; - app->configuration.registerBoost4Step = 1; - app->configuration.swapTo3Stage4Step = 0; - break; - case 0x1002://AMD - app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 64 : 32; - app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) ? 1 : 0; - app->configuration.warpSize = 64; - app->configuration.registerBoostNonPow2 = 0; - app->configuration.registerBoost = (physicalDeviceProperties.limits.maxComputeSharedMemorySize >= 65536) ? 2 : 4; - app->configuration.registerBoost4Step = 1; - app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision) ? 20 : 21; - break; - default: - app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 128 : 64; - app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) ? 
1 : 0; - app->configuration.warpSize = 32; - app->configuration.registerBoostNonPow2 = 0; - app->configuration.registerBoost = 1; - app->configuration.registerBoost4Step = 1; - app->configuration.swapTo3Stage4Step = 0; - break; - } -#elif(VKFFT_BACKEND==1) - CUresult res = CUDA_SUCCESS; - cudaError_t res_t = cudaSuccess; - if (inputLaunchConfiguration.device == 0) { - deleteVkFFT(app); - return VKFFT_ERROR_INVALID_DEVICE; - } - app->configuration.device = inputLaunchConfiguration.device; - if (inputLaunchConfiguration.num_streams != 0) app->configuration.num_streams = inputLaunchConfiguration.num_streams; - if (inputLaunchConfiguration.stream != 0) app->configuration.stream = inputLaunchConfiguration.stream; - app->configuration.streamID = 0; - int value = 0; - res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, app->configuration.device[0]); - if (res != CUDA_SUCCESS) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - } - app->configuration.computeCapabilityMajor = value; - - res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, app->configuration.device[0]); - if (res != CUDA_SUCCESS) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - } - app->configuration.computeCapabilityMinor = value; - - res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, app->configuration.device[0]); - if (res != CUDA_SUCCESS) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - } - app->configuration.maxThreadsNum = value; - - res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, app->configuration.device[0]); - if (res != CUDA_SUCCESS) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - } - app->configuration.maxComputeWorkGroupCount[0] = value; - res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, app->configuration.device[0]); - if (res != CUDA_SUCCESS) { - deleteVkFFT(app); - return 
VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - } - app->configuration.maxComputeWorkGroupCount[1] = value; - res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, app->configuration.device[0]); - if (res != CUDA_SUCCESS) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - } - app->configuration.maxComputeWorkGroupCount[2] = value; - res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, app->configuration.device[0]); - if (res != CUDA_SUCCESS) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - } - app->configuration.maxComputeWorkGroupSize[0] = value; - res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, app->configuration.device[0]); - if (res != CUDA_SUCCESS) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - } - app->configuration.maxComputeWorkGroupSize[1] = value; - res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, app->configuration.device[0]); - if (res != CUDA_SUCCESS) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - } - app->configuration.maxComputeWorkGroupSize[2] = value; - res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, app->configuration.device[0]); - if (res != CUDA_SUCCESS) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - } - app->configuration.sharedMemorySizeStatic = value; - res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, app->configuration.device[0]); - if (res != CUDA_SUCCESS) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - } - app->configuration.sharedMemorySize = value;// (value > 65536) ? 
65536 : value; - res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_WARP_SIZE, app->configuration.device[0]); - if (res != CUDA_SUCCESS) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - } - app->configuration.warpSize = value; - //we don't need this in CUDA - app->configuration.sharedMemorySizePow2 = (uint64_t)pow(2, (uint64_t)log2(app->configuration.sharedMemorySize)); - app->configuration.useRaderUintLUT = 0; - if (app->configuration.num_streams > 1) { - app->configuration.stream_event = (cudaEvent_t*)malloc(app->configuration.num_streams * sizeof(cudaEvent_t)); - if (!app->configuration.stream_event) { - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - for (uint64_t i = 0; i < app->configuration.num_streams; i++) { - res_t = cudaEventCreate(&app->configuration.stream_event[i]); - if (res != CUDA_SUCCESS) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_CREATE_EVENT; - } - } - } - - app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 64 : 32;//the coalesced memory is equal to 32 bytes between L2 and VRAM. - app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) ? 
1 : 0; - app->configuration.registerBoostNonPow2 = 0; - app->configuration.registerBoost = 1; - app->configuration.registerBoost4Step = 1; - app->configuration.swapTo3Stage4Step = 0; - app->configuration.vendorID = 0x10DE; -#elif(VKFFT_BACKEND==2) - hipError_t res = hipSuccess; - if (inputLaunchConfiguration.device == 0) { - deleteVkFFT(app); - return VKFFT_ERROR_INVALID_DEVICE; - } - app->configuration.device = inputLaunchConfiguration.device; - if (inputLaunchConfiguration.num_streams != 0) app->configuration.num_streams = inputLaunchConfiguration.num_streams; - if (inputLaunchConfiguration.stream != 0) app->configuration.stream = inputLaunchConfiguration.stream; - app->configuration.streamID = 0; - int value = 0; - res = hipDeviceGetAttribute(&value, hipDeviceAttributeComputeCapabilityMajor, app->configuration.device[0]); - if (res != hipSuccess) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - } - app->configuration.computeCapabilityMajor = value; - - res = hipDeviceGetAttribute(&value, hipDeviceAttributeComputeCapabilityMinor, app->configuration.device[0]); - if (res != hipSuccess) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - } - app->configuration.computeCapabilityMinor = value; - - res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxThreadsPerBlock, app->configuration.device[0]); - if (res != hipSuccess) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - } - app->configuration.maxThreadsNum = value; - - res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimX, app->configuration.device[0]); - if (res != hipSuccess) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - } - app->configuration.maxComputeWorkGroupCount[0] = value; - res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimY, app->configuration.device[0]); - if (res != hipSuccess) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - } - 
app->configuration.maxComputeWorkGroupCount[1] = value; - res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimZ, app->configuration.device[0]); - if (res != hipSuccess) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - } - app->configuration.maxComputeWorkGroupCount[2] = value; - res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxBlockDimX, app->configuration.device[0]); - if (res != hipSuccess) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - } - app->configuration.maxComputeWorkGroupSize[0] = value; - res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxBlockDimY, app->configuration.device[0]); - if (res != hipSuccess) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - } - app->configuration.maxComputeWorkGroupSize[1] = value; - res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxBlockDimZ, app->configuration.device[0]); - if (res != hipSuccess) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - } - app->configuration.maxComputeWorkGroupSize[2] = value; - res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxSharedMemoryPerBlock, app->configuration.device[0]); - if (res != hipSuccess) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - } - app->configuration.sharedMemorySizeStatic = value; - //hipDeviceGetAttribute(&value, hipDeviceAttributeMaxSharedMemoryPerBlockOptin, app->configuration.device[0]); - app->configuration.sharedMemorySize = value;// (value > 65536) ? 
65536 : value; - res = hipDeviceGetAttribute(&value, hipDeviceAttributeWarpSize, app->configuration.device[0]); - if (res != hipSuccess) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - } - app->configuration.warpSize = value; - app->configuration.sharedMemorySizePow2 = (uint64_t)pow(2, (uint64_t)log2(app->configuration.sharedMemorySize)); - app->configuration.useRaderUintLUT = 0; - if (app->configuration.num_streams > 1) { - app->configuration.stream_event = (hipEvent_t*)malloc(app->configuration.num_streams * sizeof(hipEvent_t)); - if (!app->configuration.stream_event) { - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - for (uint64_t i = 0; i < app->configuration.num_streams; i++) { - res = hipEventCreate(&app->configuration.stream_event[i]); - if (res != hipSuccess) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_CREATE_EVENT; - } - } - } - app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 64 : 32; - app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) ? 1 : 0; - app->configuration.registerBoostNonPow2 = 0; - app->configuration.registerBoost = 1; - app->configuration.registerBoost4Step = 1; - app->configuration.swapTo3Stage4Step = (app->configuration.doublePrecision) ? 
20 : 21; - app->configuration.vendorID = 0x1002; -#elif(VKFFT_BACKEND==3) - cl_int res = 0; - if (inputLaunchConfiguration.device == 0) { - deleteVkFFT(app); - return VKFFT_ERROR_INVALID_DEVICE; - } - app->configuration.device = inputLaunchConfiguration.device; - if (inputLaunchConfiguration.context == 0) { - deleteVkFFT(app); - return VKFFT_ERROR_INVALID_CONTEXT; - } - app->configuration.context = inputLaunchConfiguration.context; - cl_uint vendorID; - size_t value_int64; - cl_uint value_cl_uint; - res = clGetDeviceInfo(app->configuration.device[0], CL_DEVICE_VENDOR_ID, sizeof(cl_int), &vendorID, 0); - if (res != 0) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - } - res = clGetDeviceInfo(app->configuration.device[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &value_int64, 0); - if (res != 0) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - } - app->configuration.maxThreadsNum = value_int64; - - res = clGetDeviceInfo(app->configuration.device[0], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &value_cl_uint, 0); - if (res != 0) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - } - size_t* dims = (size_t*)malloc(sizeof(size_t) * value_cl_uint); - if (dims) { - res = clGetDeviceInfo(app->configuration.device[0], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * value_cl_uint, dims, 0); - if (res != 0) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - } - app->configuration.maxComputeWorkGroupSize[0] = dims[0]; - app->configuration.maxComputeWorkGroupSize[1] = dims[1]; - app->configuration.maxComputeWorkGroupSize[2] = dims[2]; - free(dims); - dims = 0; - } - else { - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - app->configuration.maxComputeWorkGroupCount[0] = UINT64_MAX; - app->configuration.maxComputeWorkGroupCount[1] = UINT64_MAX; - app->configuration.maxComputeWorkGroupCount[2] = UINT64_MAX; - //if ((vendorID == 0x8086) && 
(!app->configuration.doublePrecision) && (!app->configuration.doublePrecisionFloatMemory)) app->configuration.halfThreads = 1; - cl_ulong sharedMemorySize; - res = clGetDeviceInfo(app->configuration.device[0], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &sharedMemorySize, 0); - if (res != 0) { - deleteVkFFT(app); - return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - } - app->configuration.sharedMemorySize = sharedMemorySize; - app->configuration.sharedMemorySizePow2 = (uint64_t)pow(2, (uint64_t)log2(sharedMemorySize)); - app->configuration.vendorID = vendorID; - app->configuration.useRaderUintLUT = 1; - switch (vendorID) { - case 0x10DE://NVIDIA - app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 64 : 32;//the coalesced memory is equal to 32 bytes between L2 and VRAM. - app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) ? 1 : 0; - app->configuration.warpSize = 32; - app->configuration.registerBoostNonPow2 = 0; - app->configuration.registerBoost = 4; - app->configuration.registerBoost4Step = 1; - app->configuration.swapTo3Stage4Step = 0; - app->configuration.sharedMemorySize -= 0x10;//reserved by system - app->configuration.sharedMemorySizePow2 = (uint64_t)pow(2, (uint64_t)log2(app->configuration.sharedMemorySize)); - break; - case 0x8086://INTEL - app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 128 : 64; - app->configuration.useLUT = 1; - app->configuration.warpSize = 32; - app->configuration.registerBoostNonPow2 = 0; - app->configuration.registerBoost = (sharedMemorySize >= 65536) ? 1 : 2; - app->configuration.registerBoost4Step = 1; - app->configuration.swapTo3Stage4Step = 0; - break; - case 0x1002://AMD - app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 64 : 32; - app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) ? 
1 : 0; - app->configuration.warpSize = 64; - app->configuration.registerBoostNonPow2 = 0; - app->configuration.registerBoost = (sharedMemorySize >= 65536) ? 2 : 4; - app->configuration.registerBoost4Step = 1; - app->configuration.swapTo3Stage4Step = 0; - break; - default: - app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 128 : 64; - app->configuration.useLUT = (app->configuration.doublePrecision || app->configuration.doublePrecisionFloatMemory) ? 1 : 0; - app->configuration.warpSize = 32; - app->configuration.registerBoostNonPow2 = 0; - app->configuration.registerBoost = 1; - app->configuration.registerBoost4Step = 1; - app->configuration.swapTo3Stage4Step = 0; - break; - } -#elif(VKFFT_BACKEND==4) - ze_result_t res = ZE_RESULT_SUCCESS; - if (inputLaunchConfiguration.device == 0) { - deleteVkFFT(app); - return VKFFT_ERROR_INVALID_DEVICE; - } - app->configuration.device = inputLaunchConfiguration.device; - if (inputLaunchConfiguration.context == 0) { - deleteVkFFT(app); - return VKFFT_ERROR_INVALID_CONTEXT; - } - app->configuration.context = inputLaunchConfiguration.context; - if (inputLaunchConfiguration.commandQueue == 0) { - deleteVkFFT(app); - return VKFFT_ERROR_INVALID_QUEUE; - } - app->configuration.commandQueue = inputLaunchConfiguration.commandQueue; - app->configuration.commandQueueID = inputLaunchConfiguration.commandQueueID; - ze_device_properties_t device_properties; - ze_device_compute_properties_t compute_properties; - res = zeDeviceGetProperties(app->configuration.device[0], &device_properties); - if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - res = zeDeviceGetComputeProperties(app->configuration.device[0], &compute_properties); - if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_GET_ATTRIBUTE; - uint32_t vendorID = device_properties.vendorId; - app->configuration.maxThreadsNum = compute_properties.maxTotalGroupSize; - app->configuration.maxComputeWorkGroupSize[0] = 
compute_properties.maxGroupSizeX; - app->configuration.maxComputeWorkGroupSize[1] = compute_properties.maxGroupSizeY; - app->configuration.maxComputeWorkGroupSize[2] = compute_properties.maxGroupSizeZ; - - app->configuration.maxComputeWorkGroupCount[0] = compute_properties.maxGroupCountX; - app->configuration.maxComputeWorkGroupCount[1] = compute_properties.maxGroupCountY; - app->configuration.maxComputeWorkGroupCount[2] = compute_properties.maxGroupCountZ; - //if ((vendorID == 0x8086) && (!app->configuration.doublePrecision) && (!app->configuration.doublePrecisionFloatMemory)) app->configuration.halfThreads = 1; - app->configuration.sharedMemorySize = compute_properties.maxSharedLocalMemory; - app->configuration.sharedMemorySizePow2 = (uint64_t)pow(2, (uint64_t)log2(app->configuration.sharedMemorySize)); - - app->configuration.coalescedMemory = (app->configuration.halfPrecision) ? 128 : 64; - app->configuration.useLUT = 1; - app->configuration.warpSize = device_properties.physicalEUSimdWidth; - app->configuration.registerBoostNonPow2 = 0; - app->configuration.registerBoost = (app->configuration.sharedMemorySize >= 65536) ? 
1 : 2; - app->configuration.registerBoost4Step = 1; - app->configuration.swapTo3Stage4Step = 0; - app->configuration.vendorID = 0x8086; - app->configuration.useRaderUintLUT = 1; -#endif - - resFFT = initializeBluesteinAutoPadding(app); - if (resFFT != VKFFT_SUCCESS) { - deleteVkFFT(app); - return resFFT; - } - //set main parameters: - if (inputLaunchConfiguration.FFTdim == 0) { - deleteVkFFT(app); - return VKFFT_ERROR_EMPTY_FFTdim; - } - app->configuration.FFTdim = inputLaunchConfiguration.FFTdim; - if (inputLaunchConfiguration.size[0] == 0) { - deleteVkFFT(app); - return VKFFT_ERROR_EMPTY_size; - } - - app->configuration.size[0] = inputLaunchConfiguration.size[0]; - - if (inputLaunchConfiguration.bufferStride[0] == 0) { - if (inputLaunchConfiguration.performR2C) - app->configuration.bufferStride[0] = app->configuration.size[0] / 2 + 1; - else - app->configuration.bufferStride[0] = app->configuration.size[0]; - } - else - app->configuration.bufferStride[0] = inputLaunchConfiguration.bufferStride[0]; - - if (inputLaunchConfiguration.inputBufferStride[0] == 0) { - if (inputLaunchConfiguration.performR2C) - app->configuration.inputBufferStride[0] = app->configuration.size[0] + 2; - else - app->configuration.inputBufferStride[0] = app->configuration.size[0]; - } - else - app->configuration.inputBufferStride[0] = inputLaunchConfiguration.inputBufferStride[0]; - - if (inputLaunchConfiguration.outputBufferStride[0] == 0) { - if (inputLaunchConfiguration.performR2C) - app->configuration.outputBufferStride[0] = app->configuration.size[0] + 2; - else - app->configuration.outputBufferStride[0] = app->configuration.size[0]; - } - else - app->configuration.outputBufferStride[0] = inputLaunchConfiguration.outputBufferStride[0]; - for (uint64_t i = 1; i < 3; i++) { - if (inputLaunchConfiguration.size[i] == 0) - app->configuration.size[i] = 1; - else - app->configuration.size[i] = inputLaunchConfiguration.size[i]; - - if (inputLaunchConfiguration.bufferStride[i] == 0) - 
app->configuration.bufferStride[i] = app->configuration.bufferStride[i - 1] * app->configuration.size[i]; - else - app->configuration.bufferStride[i] = inputLaunchConfiguration.bufferStride[i]; - - if (inputLaunchConfiguration.inputBufferStride[i] == 0) - app->configuration.inputBufferStride[i] = app->configuration.inputBufferStride[i - 1] * app->configuration.size[i]; - else - app->configuration.inputBufferStride[i] = inputLaunchConfiguration.inputBufferStride[i]; - - if (inputLaunchConfiguration.outputBufferStride[i] == 0) - app->configuration.outputBufferStride[i] = app->configuration.outputBufferStride[i - 1] * app->configuration.size[i]; - else - app->configuration.outputBufferStride[i] = inputLaunchConfiguration.outputBufferStride[i]; - } - - app->configuration.isInputFormatted = inputLaunchConfiguration.isInputFormatted; - app->configuration.isOutputFormatted = inputLaunchConfiguration.isOutputFormatted; - app->configuration.performConvolution = inputLaunchConfiguration.performConvolution; - - if (inputLaunchConfiguration.bufferNum == 0) app->configuration.bufferNum = 1; - else app->configuration.bufferNum = inputLaunchConfiguration.bufferNum; -#if(VKFFT_BACKEND==0) - if (inputLaunchConfiguration.bufferSize == 0) { - deleteVkFFT(app); - return VKFFT_ERROR_EMPTY_bufferSize; - } -#endif - app->configuration.bufferSize = inputLaunchConfiguration.bufferSize; - if (app->configuration.bufferSize != 0) { - for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { - if (app->configuration.bufferSize[i] == 0) { - deleteVkFFT(app); - return VKFFT_ERROR_EMPTY_bufferSize; - } - } - } - app->configuration.buffer = inputLaunchConfiguration.buffer; - - if (inputLaunchConfiguration.userTempBuffer != 0) app->configuration.userTempBuffer = inputLaunchConfiguration.userTempBuffer; - - if (app->configuration.userTempBuffer != 0) { - if (inputLaunchConfiguration.tempBufferNum == 0) app->configuration.tempBufferNum = 1; - else app->configuration.tempBufferNum = 
inputLaunchConfiguration.tempBufferNum; -#if(VKFFT_BACKEND==0) - if (inputLaunchConfiguration.tempBufferSize == 0) { - deleteVkFFT(app); - return VKFFT_ERROR_EMPTY_tempBufferSize; - } -#endif - app->configuration.tempBufferSize = inputLaunchConfiguration.tempBufferSize; - if (app->configuration.tempBufferSize != 0) { - for (uint64_t i = 0; i < app->configuration.tempBufferNum; i++) { - if (app->configuration.tempBufferSize[i] == 0) { - deleteVkFFT(app); - return VKFFT_ERROR_EMPTY_tempBufferSize; - } - } - } - app->configuration.tempBuffer = inputLaunchConfiguration.tempBuffer; - } - else { - app->configuration.tempBufferNum = 1; - app->configuration.tempBufferSize = (uint64_t*)malloc(sizeof(uint64_t)); - if (!app->configuration.tempBufferSize) { - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - app->configuration.tempBufferSize[0] = 0; - - } - - if (app->configuration.isInputFormatted) { - if (inputLaunchConfiguration.inputBufferNum == 0) app->configuration.inputBufferNum = 1; - else app->configuration.inputBufferNum = inputLaunchConfiguration.inputBufferNum; -#if(VKFFT_BACKEND==0) - if (inputLaunchConfiguration.inputBufferSize == 0) { - deleteVkFFT(app); - return VKFFT_ERROR_EMPTY_inputBufferSize; - } -#endif - app->configuration.inputBufferSize = inputLaunchConfiguration.inputBufferSize; - if (app->configuration.inputBufferSize != 0) { - for (uint64_t i = 0; i < app->configuration.inputBufferNum; i++) { - if (app->configuration.inputBufferSize[i] == 0) { - deleteVkFFT(app); - return VKFFT_ERROR_EMPTY_inputBufferSize; - } - } - } - app->configuration.inputBuffer = inputLaunchConfiguration.inputBuffer; - } - else { - app->configuration.inputBufferNum = app->configuration.bufferNum; - - app->configuration.inputBufferSize = app->configuration.bufferSize; - app->configuration.inputBuffer = app->configuration.buffer; - } - if (app->configuration.isOutputFormatted) { - if (inputLaunchConfiguration.outputBufferNum == 0) app->configuration.outputBufferNum = 1; 
- else - app->configuration.outputBufferNum = inputLaunchConfiguration.outputBufferNum; -#if(VKFFT_BACKEND==0) - if (inputLaunchConfiguration.outputBufferSize == 0) { - deleteVkFFT(app); - return VKFFT_ERROR_EMPTY_outputBufferSize; - } -#endif - app->configuration.outputBufferSize = inputLaunchConfiguration.outputBufferSize; - if (app->configuration.outputBufferSize != 0) { - for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) { - if (app->configuration.outputBufferSize[i] == 0) { - deleteVkFFT(app); - return VKFFT_ERROR_EMPTY_outputBufferSize; - } - } - } - app->configuration.outputBuffer = inputLaunchConfiguration.outputBuffer; - } - else { - app->configuration.outputBufferNum = app->configuration.bufferNum; - - app->configuration.outputBufferSize = app->configuration.bufferSize; - app->configuration.outputBuffer = app->configuration.buffer; - } - if (app->configuration.performConvolution) { - if (inputLaunchConfiguration.kernelNum == 0) app->configuration.kernelNum = 1; - else app->configuration.kernelNum = inputLaunchConfiguration.kernelNum; -#if(VKFFT_BACKEND==0) - if (inputLaunchConfiguration.kernelSize == 0) { - deleteVkFFT(app); - return VKFFT_ERROR_EMPTY_kernelSize; - } -#endif - app->configuration.kernelSize = inputLaunchConfiguration.kernelSize; - if (app->configuration.kernelSize != 0) { - for (uint64_t i = 0; i < app->configuration.kernelNum; i++) { - if (app->configuration.kernelSize[i] == 0) { - deleteVkFFT(app); - return VKFFT_ERROR_EMPTY_kernelSize; - } - } - } - app->configuration.kernel = inputLaunchConfiguration.kernel; - } - - if (inputLaunchConfiguration.bufferOffset != 0) app->configuration.bufferOffset = inputLaunchConfiguration.bufferOffset; - if (inputLaunchConfiguration.tempBufferOffset != 0) app->configuration.tempBufferOffset = inputLaunchConfiguration.tempBufferOffset; - if (inputLaunchConfiguration.inputBufferOffset != 0) app->configuration.inputBufferOffset = inputLaunchConfiguration.inputBufferOffset; - if 
(inputLaunchConfiguration.outputBufferOffset != 0) app->configuration.outputBufferOffset = inputLaunchConfiguration.outputBufferOffset; - if (inputLaunchConfiguration.kernelOffset != 0) app->configuration.kernelOffset = inputLaunchConfiguration.kernelOffset; - if (inputLaunchConfiguration.specifyOffsetsAtLaunch != 0) app->configuration.specifyOffsetsAtLaunch = inputLaunchConfiguration.specifyOffsetsAtLaunch; - //set optional parameters: - uint64_t checkBufferSizeFor64BitAddressing = 0; - for (uint64_t i = 0; i < app->configuration.bufferNum; i++) { - if (app->configuration.bufferSize) - checkBufferSizeFor64BitAddressing += app->configuration.bufferSize[i]; - else { - checkBufferSizeFor64BitAddressing = app->configuration.size[0] * app->configuration.size[1] * app->configuration.size[2] * 8; - if (app->configuration.coordinateFeatures > 0) checkBufferSizeFor64BitAddressing *= app->configuration.coordinateFeatures; - if (app->configuration.numberBatches > 0) checkBufferSizeFor64BitAddressing *= app->configuration.numberBatches; - if (app->configuration.numberKernels > 0) checkBufferSizeFor64BitAddressing *= app->configuration.numberKernels; - if (app->configuration.doublePrecision) checkBufferSizeFor64BitAddressing *= 2; - } - } - if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)34)) app->configuration.useUint64 = 1; - checkBufferSizeFor64BitAddressing = 0; - for (uint64_t i = 0; i < app->configuration.inputBufferNum; i++) { - if (app->configuration.inputBufferSize) - checkBufferSizeFor64BitAddressing += app->configuration.inputBufferSize[i]; - } - if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)34)) app->configuration.useUint64 = 1; - - checkBufferSizeFor64BitAddressing = 0; - for (uint64_t i = 0; i < app->configuration.outputBufferNum; i++) { - if (app->configuration.outputBufferSize) - checkBufferSizeFor64BitAddressing += app->configuration.outputBufferSize[i]; - } - if (checkBufferSizeFor64BitAddressing 
>= (uint64_t)pow((uint64_t)2, (uint64_t)34)) app->configuration.useUint64 = 1; - - checkBufferSizeFor64BitAddressing = 0; - for (uint64_t i = 0; i < app->configuration.kernelNum; i++) { - if (app->configuration.kernelSize) - checkBufferSizeFor64BitAddressing += app->configuration.kernelSize[i]; - } - if (checkBufferSizeFor64BitAddressing >= (uint64_t)pow((uint64_t)2, (uint64_t)34)) app->configuration.useUint64 = 1; - if (inputLaunchConfiguration.useUint64 != 0) app->configuration.useUint64 = inputLaunchConfiguration.useUint64; - - if (inputLaunchConfiguration.coalescedMemory != 0) app->configuration.coalescedMemory = inputLaunchConfiguration.coalescedMemory; - app->configuration.aimThreads = 128; - if (inputLaunchConfiguration.aimThreads != 0) app->configuration.aimThreads = inputLaunchConfiguration.aimThreads; - app->configuration.numSharedBanks = 32; - if (inputLaunchConfiguration.numSharedBanks != 0) app->configuration.numSharedBanks = inputLaunchConfiguration.numSharedBanks; - if (inputLaunchConfiguration.inverseReturnToInputBuffer != 0) app->configuration.inverseReturnToInputBuffer = inputLaunchConfiguration.inverseReturnToInputBuffer; - - if (inputLaunchConfiguration.useLUT != 0) app->configuration.useLUT = inputLaunchConfiguration.useLUT; - - if (inputLaunchConfiguration.fixMaxRadixBluestein != 0) app->configuration.fixMaxRadixBluestein = inputLaunchConfiguration.fixMaxRadixBluestein; - if (inputLaunchConfiguration.forceBluesteinSequenceSize != 0) app->configuration.forceBluesteinSequenceSize = inputLaunchConfiguration.forceBluesteinSequenceSize; - - app->configuration.fixMinRaderPrimeMult = 17; - app->configuration.fixMaxRaderPrimeMult = 89; - if (inputLaunchConfiguration.fixMinRaderPrimeMult != 0) app->configuration.fixMinRaderPrimeMult = inputLaunchConfiguration.fixMinRaderPrimeMult; - if (inputLaunchConfiguration.fixMaxRaderPrimeMult != 0) app->configuration.fixMaxRaderPrimeMult = inputLaunchConfiguration.fixMaxRaderPrimeMult; - - switch 
(app->configuration.vendorID) { - case 0x10DE://NVIDIA - app->configuration.fixMinRaderPrimeFFT = 17; - break; - case 0x1002://AMD profile - app->configuration.fixMinRaderPrimeFFT = 29; - break; - default: - app->configuration.fixMinRaderPrimeFFT = 17; - break; - } - app->configuration.fixMaxRaderPrimeFFT = 16384; - if (inputLaunchConfiguration.fixMinRaderPrimeFFT != 0) app->configuration.fixMinRaderPrimeFFT = inputLaunchConfiguration.fixMinRaderPrimeFFT; - if (inputLaunchConfiguration.fixMaxRaderPrimeFFT != 0) app->configuration.fixMaxRaderPrimeFFT = inputLaunchConfiguration.fixMaxRaderPrimeFFT; - - if (inputLaunchConfiguration.performR2C != 0) { - app->configuration.performR2C = inputLaunchConfiguration.performR2C; - } - if (inputLaunchConfiguration.performDCT != 0) { - app->configuration.performDCT = inputLaunchConfiguration.performDCT; - } - if (inputLaunchConfiguration.disableMergeSequencesR2C != 0) { - app->configuration.disableMergeSequencesR2C = inputLaunchConfiguration.disableMergeSequencesR2C; - } - - app->configuration.normalize = 0; - if (inputLaunchConfiguration.normalize != 0) app->configuration.normalize = inputLaunchConfiguration.normalize; - if (inputLaunchConfiguration.makeForwardPlanOnly != 0) app->configuration.makeForwardPlanOnly = inputLaunchConfiguration.makeForwardPlanOnly; - if (inputLaunchConfiguration.makeInversePlanOnly != 0) app->configuration.makeInversePlanOnly = inputLaunchConfiguration.makeInversePlanOnly; - - app->configuration.reorderFourStep = 1; - if (inputLaunchConfiguration.disableReorderFourStep != 0) app->configuration.reorderFourStep = 0; - if (inputLaunchConfiguration.frequencyZeroPadding != 0) app->configuration.frequencyZeroPadding = inputLaunchConfiguration.frequencyZeroPadding; - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { - if (inputLaunchConfiguration.performZeropadding[i] != 0) { - app->configuration.performZeropadding[i] = inputLaunchConfiguration.performZeropadding[i]; - 
app->configuration.fft_zeropad_left[i] = inputLaunchConfiguration.fft_zeropad_left[i]; - app->configuration.fft_zeropad_right[i] = inputLaunchConfiguration.fft_zeropad_right[i]; - } - } - if (inputLaunchConfiguration.registerBoost != 0) app->configuration.registerBoost = inputLaunchConfiguration.registerBoost; - if (inputLaunchConfiguration.registerBoostNonPow2 != 0) app->configuration.registerBoostNonPow2 = inputLaunchConfiguration.registerBoostNonPow2; - if (inputLaunchConfiguration.registerBoost4Step != 0) app->configuration.registerBoost4Step = inputLaunchConfiguration.registerBoost4Step; - - if (app->configuration.performR2C != 0) { - app->configuration.registerBoost = 1; - app->configuration.registerBoostNonPow2 = 0; - app->configuration.registerBoost4Step = 1; - } - - app->configuration.coordinateFeatures = 1; - app->configuration.numberBatches = 1; - if (inputLaunchConfiguration.coordinateFeatures != 0) app->configuration.coordinateFeatures = inputLaunchConfiguration.coordinateFeatures; - if (inputLaunchConfiguration.numberBatches != 0) app->configuration.numberBatches = inputLaunchConfiguration.numberBatches; - - app->configuration.matrixConvolution = 1; - app->configuration.numberKernels = 1; - if (inputLaunchConfiguration.kernelConvolution != 0) { - app->configuration.kernelConvolution = inputLaunchConfiguration.kernelConvolution; - app->configuration.reorderFourStep = 0; - app->configuration.registerBoost = 1; - app->configuration.registerBoostNonPow2 = 0; - app->configuration.registerBoost4Step = 1; - } - - if (app->configuration.performConvolution) { - - if (inputLaunchConfiguration.matrixConvolution != 0) app->configuration.matrixConvolution = inputLaunchConfiguration.matrixConvolution; - if (inputLaunchConfiguration.numberKernels != 0) app->configuration.numberKernels = inputLaunchConfiguration.numberKernels; - - if (inputLaunchConfiguration.symmetricKernel != 0) app->configuration.symmetricKernel = inputLaunchConfiguration.symmetricKernel; - if 
(inputLaunchConfiguration.conjugateConvolution != 0) app->configuration.conjugateConvolution = inputLaunchConfiguration.conjugateConvolution; - if (inputLaunchConfiguration.crossPowerSpectrumNormalization != 0) app->configuration.crossPowerSpectrumNormalization = inputLaunchConfiguration.crossPowerSpectrumNormalization; - - app->configuration.reorderFourStep = 0; - app->configuration.registerBoost = 1; - app->configuration.registerBoostNonPow2 = 0; - app->configuration.registerBoost4Step = 1; - if (app->configuration.matrixConvolution > 1) app->configuration.coordinateFeatures = app->configuration.matrixConvolution; - } - app->firstAxis = 0; - app->lastAxis = app->configuration.FFTdim - 1; - if (inputLaunchConfiguration.omitDimension[0] != 0) { - app->configuration.omitDimension[0] = inputLaunchConfiguration.omitDimension[0]; - app->firstAxis++; - if (app->configuration.performConvolution) { - deleteVkFFT(app); - return VKFFT_ERROR_UNSUPPORTED_FFT_OMIT; - } - if (app->configuration.performR2C) { - deleteVkFFT(app); - return VKFFT_ERROR_UNSUPPORTED_FFT_OMIT; - } - } - if (inputLaunchConfiguration.omitDimension[2] != 0) { - app->configuration.omitDimension[2] = inputLaunchConfiguration.omitDimension[2]; - app->lastAxis--; - if (app->configuration.performConvolution) { - deleteVkFFT(app); - return VKFFT_ERROR_UNSUPPORTED_FFT_OMIT; - } - } - if (inputLaunchConfiguration.omitDimension[1] != 0) { - app->configuration.omitDimension[1] = inputLaunchConfiguration.omitDimension[1]; - if (app->configuration.omitDimension[0] == 1) app->firstAxis++; - if (app->configuration.omitDimension[2] == 1) app->lastAxis--; - if (app->configuration.performConvolution) { - deleteVkFFT(app); - return VKFFT_ERROR_UNSUPPORTED_FFT_OMIT; - } - } - if (app->firstAxis > app->lastAxis) { - deleteVkFFT(app); - return VKFFT_ERROR_UNSUPPORTED_FFT_OMIT; - } - if (inputLaunchConfiguration.reorderFourStep != 0) app->configuration.reorderFourStep = inputLaunchConfiguration.reorderFourStep; - 
app->configuration.maxCodeLength = 4000000; - if (inputLaunchConfiguration.maxCodeLength != 0) app->configuration.maxCodeLength = inputLaunchConfiguration.maxCodeLength; - app->configuration.maxTempLength = 5000; - if (inputLaunchConfiguration.maxTempLength != 0) app->configuration.maxTempLength = inputLaunchConfiguration.maxTempLength; - - if (inputLaunchConfiguration.useRaderUintLUT != 0) app->configuration.useRaderUintLUT = inputLaunchConfiguration.useRaderUintLUT; - if (inputLaunchConfiguration.halfThreads != 0) app->configuration.halfThreads = inputLaunchConfiguration.halfThreads; - if (inputLaunchConfiguration.swapTo3Stage4Step != 0) app->configuration.swapTo3Stage4Step = inputLaunchConfiguration.swapTo3Stage4Step; - if (app->configuration.performDCT > 0) app->configuration.performBandwidthBoost = -1; - if (inputLaunchConfiguration.performBandwidthBoost != 0) app->configuration.performBandwidthBoost = inputLaunchConfiguration.performBandwidthBoost; - if (inputLaunchConfiguration.devicePageSize != 0) app->configuration.devicePageSize = inputLaunchConfiguration.devicePageSize; - if (inputLaunchConfiguration.localPageSize != 0) app->configuration.localPageSize = inputLaunchConfiguration.localPageSize; - if (inputLaunchConfiguration.keepShaderCode != 0) app->configuration.keepShaderCode = inputLaunchConfiguration.keepShaderCode; - if (inputLaunchConfiguration.printMemoryLayout != 0) app->configuration.printMemoryLayout = inputLaunchConfiguration.printMemoryLayout; - if (inputLaunchConfiguration.considerAllAxesStrided != 0) app->configuration.considerAllAxesStrided = inputLaunchConfiguration.considerAllAxesStrided; - - if (inputLaunchConfiguration.loadApplicationString != 0) app->configuration.loadApplicationString = inputLaunchConfiguration.loadApplicationString; - if (inputLaunchConfiguration.saveApplicationToString != 0) app->configuration.saveApplicationToString = inputLaunchConfiguration.saveApplicationToString; - if (inputLaunchConfiguration.disableSetLocale 
!= 0) app->configuration.disableSetLocale = inputLaunchConfiguration.disableSetLocale; - - if (inputLaunchConfiguration.loadApplicationFromString != 0) { - app->configuration.loadApplicationFromString = inputLaunchConfiguration.loadApplicationFromString; - if (app->configuration.saveApplicationToString != 0) { - deleteVkFFT(app); - return VKFFT_ERROR_ENABLED_saveApplicationToString; - } - if (app->configuration.loadApplicationString == 0) { - deleteVkFFT(app); - return VKFFT_ERROR_EMPTY_applicationString; - } - memcpy(&app->applicationStringSize, app->configuration.loadApplicationString, sizeof(uint64_t)); - memcpy(&app->applicationStringOffsetRader, (char*)app->configuration.loadApplicationString + 2 * sizeof(uint64_t), sizeof(uint64_t)); - app->currentApplicationStringPos = 5 * sizeof(uint64_t); - } - //temporary set: - app->configuration.registerBoost4Step = 1; -#if(VKFFT_BACKEND==0) - app->configuration.useUint64 = 0; //No physical addressing mode in Vulkan shaders. Use multiple-buffer support to achieve emulation of physical addressing. -#endif - //uint64_t initSharedMemory = app->configuration.sharedMemorySize; - if (!app->configuration.makeForwardPlanOnly) { - app->localFFTPlan_inverse = (VkFFTPlan*)calloc(1, sizeof(VkFFTPlan)); - if (app->localFFTPlan_inverse) { - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { - //app->configuration.sharedMemorySize = ((app->configuration.size[i] & (app->configuration.size[i] - 1)) == 0) ? 
app->configuration.sharedMemorySizePow2 : initSharedMemory; - resFFT = VkFFTScheduler(app, app->localFFTPlan_inverse, i); - if (resFFT == VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH) { - //try again with Rader disabled - sequences like 89^4 can still be done with Bluestein FFT - uint64_t temp_fixMaxRaderPrimeFFT = app->configuration.fixMaxRaderPrimeFFT; - app->configuration.fixMaxRaderPrimeFFT = app->configuration.fixMinRaderPrimeFFT; - uint64_t temp_fixMaxRaderPrimeMult = app->configuration.fixMaxRaderPrimeMult; - app->configuration.fixMaxRaderPrimeMult = app->configuration.fixMinRaderPrimeMult; - resFFT = VkFFTScheduler(app, app->localFFTPlan_inverse, i); - app->configuration.fixMaxRaderPrimeFFT = temp_fixMaxRaderPrimeFFT; - app->configuration.fixMaxRaderPrimeMult = temp_fixMaxRaderPrimeMult; - } - if (resFFT != VKFFT_SUCCESS) { - deleteVkFFT(app); - return resFFT; - } - if (app->useBluesteinFFT[i] && (app->localFFTPlan_inverse->numAxisUploads[i] > 1)) { - for (uint64_t j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { - app->localFFTPlan_inverse->inverseBluesteinAxes[i][j] = app->localFFTPlan_inverse->axes[i][j]; - } - } - } - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { - //app->configuration.sharedMemorySize = ((app->configuration.size[i] & (app->configuration.size[i] - 1)) == 0) ? 
app->configuration.sharedMemorySizePow2 : initSharedMemory; - for (uint64_t j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { - resFFT = VkFFTPlanAxis(app, app->localFFTPlan_inverse, i, j, 1, 0); - if (resFFT != VKFFT_SUCCESS) { - deleteVkFFT(app); - return resFFT; - } - } - if (app->useBluesteinFFT[i] && (app->localFFTPlan_inverse->numAxisUploads[i] > 1)) { - for (uint64_t j = 1; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { - resFFT = VkFFTPlanAxis(app, app->localFFTPlan_inverse, i, j, 1, 1); - if (resFFT != VKFFT_SUCCESS) { - deleteVkFFT(app); - return resFFT; - } - } - } - if ((app->localFFTPlan_inverse->multiUploadR2C) && (i == 0)) { - resFFT = VkFFTPlanR2CMultiUploadDecomposition(app, app->localFFTPlan_inverse, 1); - if (resFFT != VKFFT_SUCCESS) { - deleteVkFFT(app); - return resFFT; - } - } - } - } - else { - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - } - if (!app->configuration.makeInversePlanOnly) { - app->localFFTPlan = (VkFFTPlan*)calloc(1, sizeof(VkFFTPlan)); - if (app->localFFTPlan) { - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { - //app->configuration.sharedMemorySize = ((app->configuration.size[i] & (app->configuration.size[i] - 1)) == 0) ? 
app->configuration.sharedMemorySizePow2 : initSharedMemory; - resFFT = VkFFTScheduler(app, app->localFFTPlan, i); - if (resFFT == VKFFT_ERROR_UNSUPPORTED_FFT_LENGTH) { - //try again with Rader disabled - sequences like 89^4 can still be done with Bluestein FFT - uint64_t temp_fixMaxRaderPrimeFFT = app->configuration.fixMaxRaderPrimeFFT; - app->configuration.fixMaxRaderPrimeFFT = app->configuration.fixMinRaderPrimeFFT; - uint64_t temp_fixMaxRaderPrimeMult = app->configuration.fixMaxRaderPrimeMult; - app->configuration.fixMaxRaderPrimeMult = app->configuration.fixMinRaderPrimeMult; - resFFT = VkFFTScheduler(app, app->localFFTPlan, i); - app->configuration.fixMaxRaderPrimeFFT = temp_fixMaxRaderPrimeFFT; - app->configuration.fixMaxRaderPrimeMult = temp_fixMaxRaderPrimeMult; - } - if (resFFT != VKFFT_SUCCESS) { - deleteVkFFT(app); - return resFFT; - } - if (app->useBluesteinFFT[i] && (app->localFFTPlan->numAxisUploads[i] > 1)) { - for (uint64_t j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) { - app->localFFTPlan->inverseBluesteinAxes[i][j] = app->localFFTPlan->axes[i][j]; - } - } - } - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { - //app->configuration.sharedMemorySize = ((app->configuration.size[i] & (app->configuration.size[i] - 1)) == 0) ? 
app->configuration.sharedMemorySizePow2 : initSharedMemory; - for (uint64_t j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) { - resFFT = VkFFTPlanAxis(app, app->localFFTPlan, i, j, 0, 0); - if (resFFT != VKFFT_SUCCESS) { - deleteVkFFT(app); - return resFFT; - } - } - if (app->useBluesteinFFT[i] && (app->localFFTPlan->numAxisUploads[i] > 1)) { - for (uint64_t j = 1; j < app->localFFTPlan->numAxisUploads[i]; j++) { - resFFT = VkFFTPlanAxis(app, app->localFFTPlan, i, j, 0, 1); - if (resFFT != VKFFT_SUCCESS) { - deleteVkFFT(app); - return resFFT; - } - } - } - if ((app->localFFTPlan->multiUploadR2C) && (i == 0)) { - resFFT = VkFFTPlanR2CMultiUploadDecomposition(app, app->localFFTPlan, 0); - if (resFFT != VKFFT_SUCCESS) { - deleteVkFFT(app); - return resFFT; - } - } - } - } - else { - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - } - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { - if (app->useBluesteinFFT[i]) { - if (!app->configuration.makeInversePlanOnly) - resFFT = VkFFTGeneratePhaseVectors(app, app->localFFTPlan, i); - else - resFFT = VkFFTGeneratePhaseVectors(app, app->localFFTPlan_inverse, i); - if (resFFT != VKFFT_SUCCESS) { - deleteVkFFT(app); - return resFFT; - } - } - } - - if (inputLaunchConfiguration.saveApplicationToString != 0) { - uint64_t totalBinarySize = 5 * sizeof(uint64_t); - if (!app->configuration.makeForwardPlanOnly) { - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { - for (uint64_t j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { - totalBinarySize += app->localFFTPlan_inverse->axes[i][j].binarySize + sizeof(uint64_t); - } - if (app->useBluesteinFFT[i] && (app->localFFTPlan_inverse->numAxisUploads[i] > 1)) { - for (uint64_t j = 1; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { - totalBinarySize += app->localFFTPlan_inverse->inverseBluesteinAxes[i][j].binarySize + sizeof(uint64_t); - } - } - if ((app->localFFTPlan_inverse->multiUploadR2C) && (i == 0)) { - totalBinarySize += 
app->localFFTPlan_inverse->R2Cdecomposition.binarySize + sizeof(uint64_t); - } - } - } - if (!app->configuration.makeInversePlanOnly) { - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { - for (uint64_t j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) { - totalBinarySize += app->localFFTPlan->axes[i][j].binarySize + sizeof(uint64_t); - } - if (app->useBluesteinFFT[i] && (app->localFFTPlan->numAxisUploads[i] > 1)) { - for (uint64_t j = 1; j < app->localFFTPlan->numAxisUploads[i]; j++) { - totalBinarySize += app->localFFTPlan->inverseBluesteinAxes[i][j].binarySize + sizeof(uint64_t); - } - } - if ((app->localFFTPlan->multiUploadR2C) && (i == 0)) { - totalBinarySize += app->localFFTPlan->R2Cdecomposition.binarySize + sizeof(uint64_t); - } - } - } - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { - if (app->useBluesteinFFT[i]) { - totalBinarySize += app->applicationBluesteinStringSize[i]; - } - } - if (app->numRaderFFTPrimes > 0) { - app->applicationStringOffsetRader = totalBinarySize; - for (uint64_t i = 0; i < app->numRaderFFTPrimes; i++) { - totalBinarySize += app->rader_buffer_size[i]; - } - } - app->saveApplicationString = calloc(totalBinarySize, 1); - if (!app->saveApplicationString) { - deleteVkFFT(app); - return VKFFT_ERROR_MALLOC_FAILED; - } - app->applicationStringSize = totalBinarySize; - char* localApplicationStringCast = (char*)app->saveApplicationString; - memcpy(localApplicationStringCast, &totalBinarySize, sizeof(uint64_t)); - memcpy(localApplicationStringCast + 2, &app->applicationStringOffsetRader, sizeof(uint64_t)); - uint64_t currentPos = 5 * sizeof(uint64_t); - if (!app->configuration.makeForwardPlanOnly) { - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { - for (uint64_t j = 0; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { - memcpy(localApplicationStringCast + currentPos, &app->localFFTPlan_inverse->axes[i][j].binarySize, sizeof(uint64_t)); - currentPos += sizeof(uint64_t); - 
memcpy(localApplicationStringCast + currentPos, app->localFFTPlan_inverse->axes[i][j].binary, app->localFFTPlan_inverse->axes[i][j].binarySize); - currentPos += app->localFFTPlan_inverse->axes[i][j].binarySize; - } - if (app->useBluesteinFFT[i] && (app->localFFTPlan_inverse->numAxisUploads[i] > 1)) { - for (uint64_t j = 1; j < app->localFFTPlan_inverse->numAxisUploads[i]; j++) { - memcpy(localApplicationStringCast + currentPos, &app->localFFTPlan_inverse->inverseBluesteinAxes[i][j].binarySize, sizeof(uint64_t)); - currentPos += sizeof(uint64_t); - memcpy(localApplicationStringCast + currentPos, app->localFFTPlan_inverse->inverseBluesteinAxes[i][j].binary, app->localFFTPlan_inverse->inverseBluesteinAxes[i][j].binarySize); - currentPos += app->localFFTPlan_inverse->inverseBluesteinAxes[i][j].binarySize; - } - } - if ((app->localFFTPlan_inverse->multiUploadR2C) && (i == 0)) { - memcpy(localApplicationStringCast + currentPos, &app->localFFTPlan_inverse->R2Cdecomposition.binarySize, sizeof(uint64_t)); - currentPos += sizeof(uint64_t); - memcpy(localApplicationStringCast + currentPos, app->localFFTPlan_inverse->R2Cdecomposition.binary, app->localFFTPlan_inverse->R2Cdecomposition.binarySize); - currentPos += app->localFFTPlan_inverse->R2Cdecomposition.binarySize; - } - } - } - if (!app->configuration.makeInversePlanOnly) { - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { - for (uint64_t j = 0; j < app->localFFTPlan->numAxisUploads[i]; j++) { - memcpy(localApplicationStringCast + currentPos, &app->localFFTPlan->axes[i][j].binarySize, sizeof(uint64_t)); - currentPos += sizeof(uint64_t); - memcpy(localApplicationStringCast + currentPos, app->localFFTPlan->axes[i][j].binary, app->localFFTPlan->axes[i][j].binarySize); - currentPos += app->localFFTPlan->axes[i][j].binarySize; - } - if (app->useBluesteinFFT[i] && (app->localFFTPlan->numAxisUploads[i] > 1)) { - for (uint64_t j = 1; j < app->localFFTPlan->numAxisUploads[i]; j++) { - memcpy(localApplicationStringCast + 
currentPos, &app->localFFTPlan->inverseBluesteinAxes[i][j].binarySize, sizeof(uint64_t)); - currentPos += sizeof(uint64_t); - memcpy(localApplicationStringCast + currentPos, app->localFFTPlan->inverseBluesteinAxes[i][j].binary, app->localFFTPlan->inverseBluesteinAxes[i][j].binarySize); - currentPos += app->localFFTPlan->inverseBluesteinAxes[i][j].binarySize; - } - } - if ((app->localFFTPlan->multiUploadR2C) && (i == 0)) { - memcpy(localApplicationStringCast + currentPos, &app->localFFTPlan->R2Cdecomposition.binarySize, sizeof(uint64_t)); - currentPos += sizeof(uint64_t); - memcpy(localApplicationStringCast + currentPos, app->localFFTPlan->R2Cdecomposition.binary, app->localFFTPlan->R2Cdecomposition.binarySize); - currentPos += app->localFFTPlan->R2Cdecomposition.binarySize; - } - } - } - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { - if (app->useBluesteinFFT[i]) { - memcpy(localApplicationStringCast + currentPos, app->applicationBluesteinString[i], app->applicationBluesteinStringSize[i]); - currentPos += app->applicationBluesteinStringSize[i]; - } - } - if (app->numRaderFFTPrimes > 0) { - for (uint64_t i = 0; i < app->numRaderFFTPrimes; i++) { - memcpy(localApplicationStringCast + currentPos, app->raderFFTkernel[i], app->rader_buffer_size[i]); - currentPos += app->rader_buffer_size[i]; - } - } - for (uint64_t i = 0; i < app->configuration.FFTdim; i++) { - if (app->applicationBluesteinString[i] != 0) { - free(app->applicationBluesteinString[i]); - app->applicationBluesteinString[i] = 0; - } - } - } -#if(VKFFT_BACKEND==0) - if (app->configuration.isCompilerInitialized) { - glslang_finalize_process(); - app->configuration.isCompilerInitialized = 0; - } -#endif - return resFFT; -} -static inline VkFFTResult dispatchEnhanced(VkFFTApplication* app, VkFFTAxis* axis, uint64_t* dispatchBlock) { - VkFFTResult resFFT = VKFFT_SUCCESS; - if (axis->specializationConstants.swapComputeWorkGroupID == 1) { - uint64_t temp = dispatchBlock[0]; - dispatchBlock[0] = 
dispatchBlock[1]; - dispatchBlock[1] = temp; - } - if (axis->specializationConstants.swapComputeWorkGroupID == 2) { - uint64_t temp = dispatchBlock[0]; - dispatchBlock[0] = dispatchBlock[2]; - dispatchBlock[2] = temp; - } - uint64_t blockNumber[3] = { (uint64_t)ceil(dispatchBlock[0] / (double)app->configuration.maxComputeWorkGroupCount[0]),(uint64_t)ceil(dispatchBlock[1] / (double)app->configuration.maxComputeWorkGroupCount[1]),(uint64_t)ceil(dispatchBlock[2] / (double)app->configuration.maxComputeWorkGroupCount[2]) }; - uint64_t blockSize[3] = { (uint64_t)ceil(dispatchBlock[0] / (double)blockNumber[0]), (uint64_t)ceil(dispatchBlock[1] / (double)blockNumber[1]), (uint64_t)ceil(dispatchBlock[2] / (double)blockNumber[2]) }; - uint64_t lastBlockSize[3] = { blockSize[0],blockSize[1],blockSize[2] }; - uint64_t dispatchSize[3] = { 1,1,1 }; - if (blockNumber[0] == 0) blockNumber[0] = 1; - if (blockNumber[1] == 0) blockNumber[1] = 1; - if (blockNumber[2] == 0) blockNumber[2] = 1; - if ((blockNumber[0] > 1) && (blockNumber[0] * blockSize[0] != dispatchBlock[0])) { - lastBlockSize[0] = dispatchBlock[0] % blockSize[0]; - } - if ((blockNumber[1] > 1) && (blockNumber[1] * blockSize[1] != dispatchBlock[1])) { - lastBlockSize[1] = dispatchBlock[1] % blockSize[1]; - } - if ((blockNumber[2] > 1) && (blockNumber[2] * blockSize[2] != dispatchBlock[2])) { - lastBlockSize[2] = dispatchBlock[2] % blockSize[2]; - } - if (app->configuration.specifyOffsetsAtLaunch) { - axis->updatePushConstants = 1; - } - //printf("%" PRIu64 " %" PRIu64 " %" PRIu64 "\n", dispatchBlock[0], dispatchBlock[1], dispatchBlock[2]); - //printf("%" PRIu64 " %" PRIu64 " %" PRIu64 "\n", blockNumber[0], blockNumber[1], blockNumber[2]); - for (uint64_t i = 0; i < 3; i++) - if (blockNumber[i] == 1) blockSize[i] = dispatchBlock[i]; - for (uint64_t i = 0; i < blockNumber[0]; i++) { - for (uint64_t j = 0; j < blockNumber[1]; j++) { - for (uint64_t k = 0; k < blockNumber[2]; k++) { - if 
(axis->pushConstants.workGroupShift[0] != i * blockSize[0]) { - axis->pushConstants.workGroupShift[0] = i * blockSize[0]; - axis->updatePushConstants = 1; - } - if (axis->pushConstants.workGroupShift[1] != j * blockSize[1]) { - axis->pushConstants.workGroupShift[1] = j * blockSize[1]; - axis->updatePushConstants = 1; - } - if (axis->pushConstants.workGroupShift[2] != k * blockSize[2]) { - axis->pushConstants.workGroupShift[2] = k * blockSize[2]; - axis->updatePushConstants = 1; - } - if (axis->updatePushConstants) { - if (app->configuration.useUint64) { - uint64_t pushConstID = 0; - if (axis->specializationConstants.performWorkGroupShift[0]) { - axis->pushConstants.dataUint64[pushConstID] = axis->pushConstants.workGroupShift[0]; - pushConstID++; - } - if (axis->specializationConstants.performWorkGroupShift[1]) { - axis->pushConstants.dataUint64[pushConstID] = axis->pushConstants.workGroupShift[1]; - pushConstID++; - } - if (axis->specializationConstants.performWorkGroupShift[2]) { - axis->pushConstants.dataUint64[pushConstID] = axis->pushConstants.workGroupShift[2]; - pushConstID++; - } - if (axis->specializationConstants.performPostCompilationInputOffset) { - axis->pushConstants.dataUint64[pushConstID] = axis->specializationConstants.inputOffset / axis->specializationConstants.inputNumberByteSize; - pushConstID++; - } - if (axis->specializationConstants.performPostCompilationOutputOffset) { - axis->pushConstants.dataUint64[pushConstID] = axis->specializationConstants.outputOffset / axis->specializationConstants.outputNumberByteSize; - pushConstID++; - } - if (axis->specializationConstants.performPostCompilationKernelOffset) { - if (axis->specializationConstants.kernelNumberByteSize != 0) - axis->pushConstants.dataUint64[pushConstID] = axis->specializationConstants.kernelOffset / axis->specializationConstants.kernelNumberByteSize; - else - axis->pushConstants.dataUint64[pushConstID] = 0; - pushConstID++; - } - } - else { - uint64_t pushConstID = 0; - if 
(axis->specializationConstants.performWorkGroupShift[0]) { - axis->pushConstants.dataUint32[pushConstID] = (uint32_t)axis->pushConstants.workGroupShift[0]; - pushConstID++; - } - if (axis->specializationConstants.performWorkGroupShift[1]) { - axis->pushConstants.dataUint32[pushConstID] = (uint32_t)axis->pushConstants.workGroupShift[1]; - pushConstID++; - } - if (axis->specializationConstants.performWorkGroupShift[2]) { - axis->pushConstants.dataUint32[pushConstID] = (uint32_t)axis->pushConstants.workGroupShift[2]; - pushConstID++; - } - if (axis->specializationConstants.performPostCompilationInputOffset) { - axis->pushConstants.dataUint32[pushConstID] = (uint32_t)(axis->specializationConstants.inputOffset / axis->specializationConstants.inputNumberByteSize); - pushConstID++; - } - if (axis->specializationConstants.performPostCompilationOutputOffset) { - axis->pushConstants.dataUint32[pushConstID] = (uint32_t)(axis->specializationConstants.outputOffset / axis->specializationConstants.outputNumberByteSize); - pushConstID++; - } - if (axis->specializationConstants.performPostCompilationKernelOffset) { - if (axis->specializationConstants.kernelNumberByteSize != 0) - axis->pushConstants.dataUint32[pushConstID] = (uint32_t)(axis->specializationConstants.kernelOffset / axis->specializationConstants.kernelNumberByteSize); - else - axis->pushConstants.dataUint64[pushConstID] = 0; - pushConstID++; - } - } - } - dispatchSize[0] = (i == blockNumber[0] - 1) ? lastBlockSize[0] : blockSize[0]; - dispatchSize[1] = (j == blockNumber[1] - 1) ? lastBlockSize[1] : blockSize[1]; - dispatchSize[2] = (k == blockNumber[2] - 1) ? 
lastBlockSize[2] : blockSize[2]; -#if(VKFFT_BACKEND==0) - if (axis->pushConstants.structSize > 0) { - if (app->configuration.useUint64) { - vkCmdPushConstants(app->configuration.commandBuffer[0], axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, (uint32_t)axis->pushConstants.structSize, axis->pushConstants.dataUint64); - } - else { - vkCmdPushConstants(app->configuration.commandBuffer[0], axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, (uint32_t)axis->pushConstants.structSize, axis->pushConstants.dataUint32); - } - } - vkCmdDispatch(app->configuration.commandBuffer[0], (uint32_t)dispatchSize[0], (uint32_t)dispatchSize[1], (uint32_t)dispatchSize[2]); -#elif(VKFFT_BACKEND==1) - void* args[10]; - CUresult result = CUDA_SUCCESS; - args[0] = axis->inputBuffer; - args[1] = axis->outputBuffer; - uint64_t args_id = 2; - if (axis->specializationConstants.convolutionStep) { - args[args_id] = app->configuration.kernel; - args_id++; - } - if (axis->specializationConstants.LUT) { - args[args_id] = &axis->bufferLUT; - args_id++; - } - if (axis->specializationConstants.raderUintLUT) { - args[args_id] = &axis->bufferRaderUintLUT; - args_id++; - } - if (axis->specializationConstants.useBluesteinFFT && axis->specializationConstants.BluesteinConvolutionStep) { - if (axis->specializationConstants.inverseBluestein) - args[args_id] = &app->bufferBluesteinIFFT[axis->specializationConstants.axis_id]; - else - args[args_id] = &app->bufferBluesteinFFT[axis->specializationConstants.axis_id]; - args_id++; - } - if (axis->specializationConstants.useBluesteinFFT && (axis->specializationConstants.BluesteinPreMultiplication || axis->specializationConstants.BluesteinPostMultiplication)) { - args[args_id] = &app->bufferBluestein[axis->specializationConstants.axis_id]; - args_id++; - } - //args[args_id] = &axis->pushConstants; - if (axis->updatePushConstants) { - axis->updatePushConstants = 0; - if (axis->pushConstants.structSize > 0) { - if (app->configuration.useUint64) { - result = 
cuMemcpyHtoD(axis->consts_addr, axis->pushConstants.dataUint64, axis->pushConstants.structSize); - } - else { - result = cuMemcpyHtoD(axis->consts_addr, axis->pushConstants.dataUint32, axis->pushConstants.structSize); - } - if (result != CUDA_SUCCESS) { - printf("cuMemcpyHtoD error: %d\n", result); - return VKFFT_ERROR_FAILED_TO_COPY; - } - } - } - if (app->configuration.num_streams >= 1) { - result = cuLaunchKernel(axis->VkFFTKernel, - (unsigned int)dispatchSize[0], (unsigned int)dispatchSize[1], (unsigned int)dispatchSize[2], // grid dim - (unsigned int)axis->specializationConstants.localSize[0], (unsigned int)axis->specializationConstants.localSize[1], (unsigned int)axis->specializationConstants.localSize[2], // block dim - (unsigned int)axis->specializationConstants.usedSharedMemory, app->configuration.stream[app->configuration.streamID], // shared mem and stream - args, 0); - } - else { - result = cuLaunchKernel(axis->VkFFTKernel, - (unsigned int)dispatchSize[0], (unsigned int)dispatchSize[1], (unsigned int)dispatchSize[2], // grid dim - (unsigned int)axis->specializationConstants.localSize[0], (unsigned int)axis->specializationConstants.localSize[1], (unsigned int)axis->specializationConstants.localSize[2], // block dim - (unsigned int)axis->specializationConstants.usedSharedMemory, 0, // shared mem and stream - args, 0); - } - if (result != CUDA_SUCCESS) { - printf("cuLaunchKernel error: %d, %" PRIu64 " %" PRIu64 " %" PRIu64 " - %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", result, dispatchSize[0], dispatchSize[1], dispatchSize[2], axis->specializationConstants.localSize[0], axis->specializationConstants.localSize[1], axis->specializationConstants.localSize[2]); - return VKFFT_ERROR_FAILED_TO_LAUNCH_KERNEL; - } - if (app->configuration.num_streams > 1) { - app->configuration.streamID = app->configuration.streamCounter % app->configuration.num_streams; - if (app->configuration.streamCounter == 0) { - cudaError_t res2 = 
cudaEventRecord(app->configuration.stream_event[app->configuration.streamID], app->configuration.stream[app->configuration.streamID]); - if (res2 != cudaSuccess) return VKFFT_ERROR_FAILED_TO_EVENT_RECORD; - } - app->configuration.streamCounter++; - } -#elif(VKFFT_BACKEND==2) - hipError_t result = hipSuccess; - void* args[10]; - args[0] = axis->inputBuffer; - args[1] = axis->outputBuffer; - uint64_t args_id = 2; - if (axis->specializationConstants.convolutionStep) { - args[args_id] = app->configuration.kernel; - args_id++; - } - if (axis->specializationConstants.LUT) { - args[args_id] = &axis->bufferLUT; - args_id++; - } - if (axis->specializationConstants.raderUintLUT) { - args[args_id] = &axis->bufferRaderUintLUT; - args_id++; - } - if (axis->specializationConstants.useBluesteinFFT && axis->specializationConstants.BluesteinConvolutionStep) { - if (axis->specializationConstants.inverseBluestein) - args[args_id] = &app->bufferBluesteinIFFT[axis->specializationConstants.axis_id]; - else - args[args_id] = &app->bufferBluesteinFFT[axis->specializationConstants.axis_id]; - args_id++; - } - if (axis->specializationConstants.useBluesteinFFT && (axis->specializationConstants.BluesteinPreMultiplication || axis->specializationConstants.BluesteinPostMultiplication)) { - args[args_id] = &app->bufferBluestein[axis->specializationConstants.axis_id]; - args_id++; - } - //args[args_id] = &axis->pushConstants; - if (axis->updatePushConstants) { - axis->updatePushConstants = 0; - if (axis->pushConstants.structSize > 0) { - if (app->configuration.useUint64) { - result = hipMemcpyHtoD(axis->consts_addr, axis->pushConstants.dataUint64, axis->pushConstants.structSize); - } - else { - result = hipMemcpyHtoD(axis->consts_addr, axis->pushConstants.dataUint32, axis->pushConstants.structSize); - } - if (result != hipSuccess) { - printf("hipMemcpyHtoD error: %d\n", result); - return VKFFT_ERROR_FAILED_TO_COPY; - } - } - } - //printf("%" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " 
%" PRIu64 "\n",maxBlockSize[0], maxBlockSize[1], maxBlockSize[2], axis->specializationConstants.localSize[0], axis->specializationConstants.localSize[1], axis->specializationConstants.localSize[2]); - if (app->configuration.num_streams >= 1) { - result = hipModuleLaunchKernel(axis->VkFFTKernel, - (unsigned int)dispatchSize[0], (unsigned int)dispatchSize[1], (unsigned int)dispatchSize[2], // grid dim - (unsigned int)axis->specializationConstants.localSize[0], (unsigned int)axis->specializationConstants.localSize[1], (unsigned int)axis->specializationConstants.localSize[2], // block dim - (unsigned int)axis->specializationConstants.usedSharedMemory, app->configuration.stream[app->configuration.streamID], // shared mem and stream - args, 0); - } - else { - result = hipModuleLaunchKernel(axis->VkFFTKernel, - (unsigned int)dispatchSize[0], (unsigned int)dispatchSize[1], (unsigned int)dispatchSize[2], // grid dim - (unsigned int)axis->specializationConstants.localSize[0], (unsigned int)axis->specializationConstants.localSize[1], (unsigned int)axis->specializationConstants.localSize[2], // block dim - (unsigned int)axis->specializationConstants.usedSharedMemory, 0, // shared mem and stream - args, 0); - } - if (result != hipSuccess) { - printf("hipModuleLaunchKernel error: %d, %" PRIu64 " %" PRIu64 " %" PRIu64 " - %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", result, dispatchSize[0], dispatchSize[1], dispatchSize[2], axis->specializationConstants.localSize[0], axis->specializationConstants.localSize[1], axis->specializationConstants.localSize[2]); - return VKFFT_ERROR_FAILED_TO_LAUNCH_KERNEL; - } - if (app->configuration.num_streams > 1) { - app->configuration.streamID = app->configuration.streamCounter % app->configuration.num_streams; - if (app->configuration.streamCounter == 0) { - result = hipEventRecord(app->configuration.stream_event[app->configuration.streamID], app->configuration.stream[app->configuration.streamID]); - if (result != hipSuccess) return 
VKFFT_ERROR_FAILED_TO_EVENT_RECORD; - } - app->configuration.streamCounter++; - } -#elif(VKFFT_BACKEND==3) - cl_int result = CL_SUCCESS; - void* args[10]; - args[0] = axis->inputBuffer; - result = clSetKernelArg(axis->kernel, 0, sizeof(cl_mem), args[0]); - if (result != CL_SUCCESS) { - return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; - } - args[1] = axis->outputBuffer; - result = clSetKernelArg(axis->kernel, 1, sizeof(cl_mem), args[1]); - if (result != CL_SUCCESS) { - return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; - } - uint64_t args_id = 2; - if (axis->specializationConstants.convolutionStep) { - args[args_id] = app->configuration.kernel; - result = clSetKernelArg(axis->kernel, (cl_uint)args_id, sizeof(cl_mem), args[args_id]); - if (result != CL_SUCCESS) { - return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; - } - args_id++; - } - if (axis->specializationConstants.LUT) { - args[args_id] = &axis->bufferLUT; - result = clSetKernelArg(axis->kernel, (cl_uint)args_id, sizeof(cl_mem), args[args_id]); - if (result != CL_SUCCESS) { - return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; - } - args_id++; - } - if (axis->specializationConstants.raderUintLUT) { - args[args_id] = &axis->bufferRaderUintLUT; - result = clSetKernelArg(axis->kernel, (cl_uint)args_id, sizeof(cl_mem), args[args_id]); - if (result != CL_SUCCESS) { - return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; - } - args_id++; - } - if (axis->specializationConstants.useBluesteinFFT && axis->specializationConstants.BluesteinConvolutionStep) { - if (axis->specializationConstants.inverseBluestein) - args[args_id] = &app->bufferBluesteinIFFT[axis->specializationConstants.axis_id]; - else - args[args_id] = &app->bufferBluesteinFFT[axis->specializationConstants.axis_id]; - result = clSetKernelArg(axis->kernel, (cl_uint)args_id, sizeof(cl_mem), args[args_id]); - if (result != CL_SUCCESS) { - return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; - } - args_id++; - } - if (axis->specializationConstants.useBluesteinFFT && 
(axis->specializationConstants.BluesteinPreMultiplication || axis->specializationConstants.BluesteinPostMultiplication)) { - args[args_id] = &app->bufferBluestein[axis->specializationConstants.axis_id]; - result = clSetKernelArg(axis->kernel, (cl_uint)args_id, sizeof(cl_mem), args[args_id]); - if (result != CL_SUCCESS) { - return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; - } - args_id++; - } - - if (axis->pushConstants.structSize > 0) { - if (app->configuration.useUint64) { - result = clSetKernelArg(axis->kernel, (cl_uint)args_id, axis->pushConstants.structSize, axis->pushConstants.dataUint64); - } - else { - result = clSetKernelArg(axis->kernel, (cl_uint)args_id, axis->pushConstants.structSize, axis->pushConstants.dataUint32); - } - if (result != CL_SUCCESS) { - return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; - } - args_id++; - } - size_t local_work_size[3] = { (size_t)axis->specializationConstants.localSize[0], (size_t)axis->specializationConstants.localSize[1],(size_t)axis->specializationConstants.localSize[2] }; - size_t global_work_size[3] = { (size_t)dispatchSize[0] * local_work_size[0] , (size_t)dispatchSize[1] * local_work_size[1] ,(size_t)dispatchSize[2] * local_work_size[2] }; - result = clEnqueueNDRangeKernel(app->configuration.commandQueue[0], axis->kernel, 3, 0, global_work_size, local_work_size, 0, 0, 0); - //printf("%" PRIu64 " %" PRIu64 " %" PRIu64 " - %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", maxBlockSize[0], maxBlockSize[1], maxBlockSize[2], axis->specializationConstants.localSize[0], axis->specializationConstants.localSize[1], axis->specializationConstants.localSize[2]); - - if (result != CL_SUCCESS) { - return VKFFT_ERROR_FAILED_TO_LAUNCH_KERNEL; - } -#elif(VKFFT_BACKEND==4) - ze_result_t result = ZE_RESULT_SUCCESS; - void* args[10]; - args[0] = axis->inputBuffer; - result = zeKernelSetArgumentValue(axis->VkFFTKernel, 0, sizeof(void*), args[0]); - if (result != ZE_RESULT_SUCCESS) { - return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; - } - args[1] = 
axis->outputBuffer; - result = zeKernelSetArgumentValue(axis->VkFFTKernel, 1, sizeof(void*), args[1]); - if (result != ZE_RESULT_SUCCESS) { - return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; - } - uint64_t args_id = 2; - if (axis->specializationConstants.convolutionStep) { - args[args_id] = app->configuration.kernel; - result = zeKernelSetArgumentValue(axis->VkFFTKernel, (uint32_t)args_id, sizeof(void*), args[args_id]); - if (result != ZE_RESULT_SUCCESS) { - return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; - } - args_id++; - } - if (axis->specializationConstants.LUT) { - args[args_id] = &axis->bufferLUT; - result = zeKernelSetArgumentValue(axis->VkFFTKernel, (uint32_t)args_id, sizeof(void*), args[args_id]); - if (result != ZE_RESULT_SUCCESS) { - return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; - } - args_id++; - } - if (axis->specializationConstants.raderUintLUT) { - args[args_id] = &axis->bufferRaderUintLUT; - result = zeKernelSetArgumentValue(axis->VkFFTKernel, (uint32_t)args_id, sizeof(void*), args[args_id]); - if (result != ZE_RESULT_SUCCESS) { - return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; - } - args_id++; - } - if (axis->specializationConstants.useBluesteinFFT && axis->specializationConstants.BluesteinConvolutionStep) { - if (axis->specializationConstants.inverseBluestein) - args[args_id] = &app->bufferBluesteinIFFT[axis->specializationConstants.axis_id]; - else - args[args_id] = &app->bufferBluesteinFFT[axis->specializationConstants.axis_id]; - result = zeKernelSetArgumentValue(axis->VkFFTKernel, (uint32_t)args_id, sizeof(void*), args[args_id]); - if (result != ZE_RESULT_SUCCESS) { - return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; - } - args_id++; - } - if (axis->specializationConstants.useBluesteinFFT && (axis->specializationConstants.BluesteinPreMultiplication || axis->specializationConstants.BluesteinPostMultiplication)) { - args[args_id] = &app->bufferBluestein[axis->specializationConstants.axis_id]; - result = zeKernelSetArgumentValue(axis->VkFFTKernel, 
(uint32_t)args_id, sizeof(void*), args[args_id]); - if (result != ZE_RESULT_SUCCESS) { - return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; - } - args_id++; - } - - if (axis->pushConstants.structSize > 0) { - if (app->configuration.useUint64) { - result = zeKernelSetArgumentValue(axis->VkFFTKernel, (uint32_t)args_id, axis->pushConstants.structSize, axis->pushConstants.dataUint64); - } - else { - result = zeKernelSetArgumentValue(axis->VkFFTKernel, (uint32_t)args_id, axis->pushConstants.structSize, axis->pushConstants.dataUint32); - } - if (result != ZE_RESULT_SUCCESS) { - return VKFFT_ERROR_FAILED_TO_SET_KERNEL_ARG; - } - args_id++; - } - size_t local_work_size[3] = { (size_t)axis->specializationConstants.localSize[0], (size_t)axis->specializationConstants.localSize[1],(size_t)axis->specializationConstants.localSize[2] }; - ze_group_count_t launchArgs = { (uint32_t)dispatchSize[0], (uint32_t)dispatchSize[1],(uint32_t)dispatchSize[2] }; - result = zeCommandListAppendLaunchKernel(app->configuration.commandList[0], axis->VkFFTKernel, &launchArgs, 0, 0, 0); - //printf("%" PRIu64 " %" PRIu64 " %" PRIu64 " - %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", maxBlockSize[0], maxBlockSize[1], maxBlockSize[2], axis->specializationConstants.localSize[0], axis->specializationConstants.localSize[1], axis->specializationConstants.localSize[2]); - - if (result != ZE_RESULT_SUCCESS) { - return VKFFT_ERROR_FAILED_TO_LAUNCH_KERNEL; - } -#endif - } - } - } - return resFFT; -} -static inline VkFFTResult VkFFTSync(VkFFTApplication* app) { -#if(VKFFT_BACKEND==0) - vkCmdPipelineBarrier(app->configuration.commandBuffer[0], VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, app->configuration.memory_barrier, 0, 0, 0, 0); -#elif(VKFFT_BACKEND==1) - if (app->configuration.num_streams > 1) { - cudaError_t res = cudaSuccess; - for (uint64_t s = 0; s < app->configuration.num_streams; s++) { - res = cudaEventSynchronize(app->configuration.stream_event[s]); - if (res != 
cudaSuccess) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; - } - app->configuration.streamCounter = 0; - } -#elif(VKFFT_BACKEND==2) - if (app->configuration.num_streams > 1) { - hipError_t res = hipSuccess; - for (uint64_t s = 0; s < app->configuration.num_streams; s++) { - res = hipEventSynchronize(app->configuration.stream_event[s]); - if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_SYNCHRONIZE; - } - app->configuration.streamCounter = 0; - } -#elif(VKFFT_BACKEND==3) -#elif(VKFFT_BACKEND==4) - ze_result_t res = ZE_RESULT_SUCCESS; - res = zeCommandListAppendBarrier(app->configuration.commandList[0], nullptr, 0, nullptr); - if (res != ZE_RESULT_SUCCESS) return VKFFT_ERROR_FAILED_TO_SUBMIT_BARRIER; -#endif - return VKFFT_SUCCESS; -} -static inline void printDebugInformation(VkFFTApplication* app, VkFFTAxis* axis) { - if (app->configuration.keepShaderCode) printf("%s\n", axis->specializationConstants.code0); - if (app->configuration.printMemoryLayout) { - if ((axis->inputBuffer == app->configuration.inputBuffer) && (app->configuration.inputBuffer != app->configuration.buffer)) - printf("read: inputBuffer\n"); - if (axis->inputBuffer == app->configuration.buffer) - printf("read: buffer\n"); - if (axis->inputBuffer == app->configuration.tempBuffer) - printf("read: tempBuffer\n"); - if ((axis->inputBuffer == app->configuration.outputBuffer) && (app->configuration.outputBuffer != app->configuration.buffer)) - printf("read: outputBuffer\n"); - if ((axis->outputBuffer == app->configuration.inputBuffer) && (app->configuration.inputBuffer != app->configuration.buffer)) - printf("write: inputBuffer\n"); - if (axis->outputBuffer == app->configuration.buffer) - printf("write: buffer\n"); - if (axis->outputBuffer == app->configuration.tempBuffer) - printf("write: tempBuffer\n"); - if ((axis->outputBuffer == app->configuration.outputBuffer) && (app->configuration.outputBuffer != app->configuration.buffer)) - printf("write: outputBuffer\n"); - } -} -static inline VkFFTResult 
VkFFTAppend(VkFFTApplication* app, int inverse, VkFFTLaunchParams* launchParams) { - VkFFTResult resFFT = VKFFT_SUCCESS; -#if(VKFFT_BACKEND==0) - app->configuration.commandBuffer = launchParams->commandBuffer; - VkMemoryBarrier memory_barrier = { - VK_STRUCTURE_TYPE_MEMORY_BARRIER, - 0, - VK_ACCESS_SHADER_WRITE_BIT, - VK_ACCESS_SHADER_READ_BIT, - }; - app->configuration.memory_barrier = &memory_barrier; -#elif(VKFFT_BACKEND==1) - app->configuration.streamCounter = 0; -#elif(VKFFT_BACKEND==2) - app->configuration.streamCounter = 0; -#elif(VKFFT_BACKEND==3) - app->configuration.commandQueue = launchParams->commandQueue; -#elif(VKFFT_BACKEND==4) - app->configuration.commandList = launchParams->commandList; -#endif - uint64_t localSize0[3]; - if ((inverse != 1) && (app->configuration.makeInversePlanOnly)) return VKFFT_ERROR_ONLY_INVERSE_FFT_INITIALIZED; - if ((inverse == 1) && (app->configuration.makeForwardPlanOnly)) return VKFFT_ERROR_ONLY_FORWARD_FFT_INITIALIZED; - if ((inverse != 1) && (!app->configuration.makeInversePlanOnly) && (!app->localFFTPlan)) return VKFFT_ERROR_PLAN_NOT_INITIALIZED; - if ((inverse == 1) && (!app->configuration.makeForwardPlanOnly) && (!app->localFFTPlan_inverse)) return VKFFT_ERROR_PLAN_NOT_INITIALIZED; - if (inverse == 1) { - localSize0[0] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0]; - localSize0[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[1][0]; - localSize0[2] = app->localFFTPlan_inverse->actualFFTSizePerAxis[2][0]; - } - else { - localSize0[0] = app->localFFTPlan->actualFFTSizePerAxis[0][0]; - localSize0[1] = app->localFFTPlan->actualFFTSizePerAxis[1][0]; - localSize0[2] = app->localFFTPlan->actualFFTSizePerAxis[2][0]; - } - resFFT = VkFFTCheckUpdateBufferSet(app, 0, 0, launchParams); - if (resFFT != VKFFT_SUCCESS) { - return resFFT; - } - if (inverse != 1) { - //FFT axis 0 - if (!app->configuration.omitDimension[0]) { - for (int64_t l = (int64_t)app->localFFTPlan->numAxisUploads[0] - 1; l >= 0; l--) { - VkFFTAxis* 
axis = &app->localFFTPlan->axes[0][l]; - resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, 0, l, 0); - if (resFFT != VKFFT_SUCCESS) return resFFT; - uint64_t maxCoordinate = ((app->configuration.matrixConvolution > 1) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1) && (l == 0)) ? 1 : app->configuration.coordinateFeatures; -#if(VKFFT_BACKEND==0) - vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); - vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); -#endif - uint64_t dispatchBlock[3]; - if (l == 0) { - if (app->localFFTPlan->numAxisUploads[0] > 2) { - dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1]) / (double)app->localFFTPlan->axisSplit[0][1]) * app->localFFTPlan->axisSplit[0][1]; - dispatchBlock[1] = app->localFFTPlan->actualFFTSizePerAxis[0][1]; - } - else { - if (app->localFFTPlan->numAxisUploads[0] > 1) { - dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1])); - dispatchBlock[1] = app->localFFTPlan->actualFFTSizePerAxis[0][1]; - } - else { - dispatchBlock[0] = app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim; - dispatchBlock[1] = (uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][1] / (double)axis->axisBlock[1]); - } - } - } - else { - dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[0]); - dispatchBlock[1] = app->localFFTPlan->actualFFTSizePerAxis[0][1]; - } - dispatchBlock[2] = app->localFFTPlan->actualFFTSizePerAxis[0][2] * maxCoordinate * app->configuration.numberBatches; - if 
(axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); - //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); - //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); - resFFT = dispatchEnhanced(app, axis, dispatchBlock); - if (resFFT != VKFFT_SUCCESS) return resFFT; - printDebugInformation(app, axis); - resFFT = VkFFTSync(app); - if (resFFT != VKFFT_SUCCESS) return resFFT; - } - if (app->useBluesteinFFT[0] && (app->localFFTPlan->numAxisUploads[0] > 1)) { - for (int64_t l = 1; l < (int64_t)app->localFFTPlan->numAxisUploads[0]; l++) { - VkFFTAxis* axis = &app->localFFTPlan->inverseBluesteinAxes[0][l]; - resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, 0, l, 0); - if (resFFT != VKFFT_SUCCESS) return resFFT; - uint64_t maxCoordinate = ((app->configuration.matrixConvolution > 1) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1)) ? 
1 : app->configuration.coordinateFeatures; -#if(VKFFT_BACKEND==0) - vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); - vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); -#endif - uint64_t dispatchBlock[3]; - if (l == 0) { - if (app->localFFTPlan->numAxisUploads[0] > 2) { - dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1]) / (double)app->localFFTPlan->axisSplit[0][1]) * app->localFFTPlan->axisSplit[0][1]; - dispatchBlock[1] = app->localFFTPlan->actualFFTSizePerAxis[0][1]; - } - else { - if (app->localFFTPlan->numAxisUploads[0] > 1) { - dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1])); - dispatchBlock[1] = app->localFFTPlan->actualFFTSizePerAxis[0][1]; - } - else { - dispatchBlock[0] = app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim; - dispatchBlock[1] = (uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][1] / (double)axis->axisBlock[1]); - } - } - } - else { - dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[0]); - dispatchBlock[1] = app->localFFTPlan->actualFFTSizePerAxis[0][1]; - } - dispatchBlock[2] = app->localFFTPlan->actualFFTSizePerAxis[0][2] * maxCoordinate * app->configuration.numberBatches; - if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); - //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); - //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); - resFFT = 
dispatchEnhanced(app, axis, dispatchBlock); - if (resFFT != VKFFT_SUCCESS) return resFFT; - printDebugInformation(app, axis); - resFFT = VkFFTSync(app); - if (resFFT != VKFFT_SUCCESS) return resFFT; - } - } - if (app->localFFTPlan->multiUploadR2C) { - VkFFTAxis* axis = &app->localFFTPlan->R2Cdecomposition; - resFFT = VkFFTUpdateBufferSetR2CMultiUploadDecomposition(app, app->localFFTPlan, axis, 0, 0, 0); - if (resFFT != VKFFT_SUCCESS) return resFFT; - uint64_t maxCoordinate = ((app->configuration.matrixConvolution > 1) && (app->configuration.performConvolution) && (app->configuration.FFTdim == 1)) ? 1 : app->configuration.coordinateFeatures; - -#if(VKFFT_BACKEND==0) - vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); - vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); -#endif - uint64_t dispatchBlock[3]; - - dispatchBlock[0] = (uint64_t)ceil(((app->configuration.size[0] / 2 + 1) * app->configuration.size[1] * app->configuration.size[2]) / (double)(2 * axis->axisBlock[0])); - dispatchBlock[1] = 1; - dispatchBlock[2] = maxCoordinate * axis->specializationConstants.numBatches; - resFFT = dispatchEnhanced(app, axis, dispatchBlock); - if (resFFT != VKFFT_SUCCESS) return resFFT; - printDebugInformation(app, axis); - resFFT = VkFFTSync(app); - if (resFFT != VKFFT_SUCCESS) return resFFT; - //app->configuration.size[0] *= 2; - } - } - if (app->configuration.FFTdim > 1) { - - //FFT axis 1 - if (!app->configuration.omitDimension[1]) { - if ((app->configuration.FFTdim == 2) && (app->configuration.performConvolution)) { - - for (int64_t l = (int64_t)app->localFFTPlan->numAxisUploads[1] - 1; l >= 0; l--) { - VkFFTAxis* axis = &app->localFFTPlan->axes[1][l]; - resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, 1, l, 0); - if (resFFT != VKFFT_SUCCESS) return resFFT; - uint64_t maxCoordinate = 
((app->configuration.matrixConvolution > 1) && (l == 0)) ? 1 : app->configuration.coordinateFeatures; - -#if(VKFFT_BACKEND==0) - vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); - vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); -#endif - uint64_t dispatchBlock[3]; - dispatchBlock[0] = (uint64_t)ceil(localSize0[1] / (double)axis->axisBlock[0] * app->localFFTPlan->actualFFTSizePerAxis[1][1] / (double)axis->specializationConstants.fftDim); - dispatchBlock[1] = 1; - dispatchBlock[2] = app->localFFTPlan->actualFFTSizePerAxis[1][2] * maxCoordinate * app->configuration.numberBatches; - //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); - //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); - resFFT = dispatchEnhanced(app, axis, dispatchBlock); - if (resFFT != VKFFT_SUCCESS) return resFFT; - printDebugInformation(app, axis); - resFFT = VkFFTSync(app); - if (resFFT != VKFFT_SUCCESS) return resFFT; - } - } - else { - - for (int64_t l = (int64_t)app->localFFTPlan->numAxisUploads[1] - 1; l >= 0; l--) { - VkFFTAxis* axis = &app->localFFTPlan->axes[1][l]; - resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, 1, l, 0); - if (resFFT != VKFFT_SUCCESS) return resFFT; - -#if(VKFFT_BACKEND==0) - vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); - vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); -#endif - uint64_t dispatchBlock[3]; - - dispatchBlock[0] = (uint64_t)ceil(localSize0[1] / (double)axis->axisBlock[0] * app->localFFTPlan->actualFFTSizePerAxis[1][1] / (double)axis->specializationConstants.fftDim); - dispatchBlock[1] = 1; - dispatchBlock[2] = 
app->localFFTPlan->actualFFTSizePerAxis[1][2] * app->configuration.coordinateFeatures * app->configuration.numberBatches; - //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); - //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); - resFFT = dispatchEnhanced(app, axis, dispatchBlock); - if (resFFT != VKFFT_SUCCESS) return resFFT; - printDebugInformation(app, axis); - - resFFT = VkFFTSync(app); - if (resFFT != VKFFT_SUCCESS) return resFFT; - } - if (app->useBluesteinFFT[1] && (app->localFFTPlan->numAxisUploads[1] > 1)) { - for (int64_t l = 1; l < (int64_t)app->localFFTPlan->numAxisUploads[1]; l++) { - VkFFTAxis* axis = &app->localFFTPlan->inverseBluesteinAxes[1][l]; - resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, 1, l, 0); - if (resFFT != VKFFT_SUCCESS) return resFFT; -#if(VKFFT_BACKEND==0) - vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); - vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); -#endif - uint64_t dispatchBlock[3]; - dispatchBlock[0] = (uint64_t)ceil(localSize0[1] / (double)axis->axisBlock[0] * app->localFFTPlan->actualFFTSizePerAxis[1][1] / (double)axis->specializationConstants.fftDim); - dispatchBlock[1] = 1; - dispatchBlock[2] = app->localFFTPlan->actualFFTSizePerAxis[1][2] * app->configuration.coordinateFeatures * app->configuration.numberBatches; - - //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); - //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); - resFFT = dispatchEnhanced(app, axis, dispatchBlock); - if (resFFT != VKFFT_SUCCESS) return resFFT; - printDebugInformation(app, axis); - resFFT = VkFFTSync(app); - if (resFFT != VKFFT_SUCCESS) return resFFT; - } - 
} - } - } - } - //FFT axis 2 - if (app->configuration.FFTdim > 2) { - if (!app->configuration.omitDimension[2]) { - if ((app->configuration.FFTdim == 3) && (app->configuration.performConvolution)) { - - for (int64_t l = (int64_t)app->localFFTPlan->numAxisUploads[2] - 1; l >= 0; l--) { - - VkFFTAxis* axis = &app->localFFTPlan->axes[2][l]; - resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, 2, l, 0); - if (resFFT != VKFFT_SUCCESS) return resFFT; - uint64_t maxCoordinate = ((app->configuration.matrixConvolution > 1) && (l == 0)) ? 1 : app->configuration.coordinateFeatures; -#if(VKFFT_BACKEND==0) - vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); - vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); -#endif - uint64_t dispatchBlock[3]; - dispatchBlock[0] = (uint64_t)ceil(localSize0[2] / (double)axis->axisBlock[0] * app->localFFTPlan->actualFFTSizePerAxis[2][2] / (double)axis->specializationConstants.fftDim); - dispatchBlock[1] = 1; - dispatchBlock[2] = app->localFFTPlan->actualFFTSizePerAxis[2][1] * maxCoordinate * app->configuration.numberBatches; - //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); - resFFT = dispatchEnhanced(app, axis, dispatchBlock); - if (resFFT != VKFFT_SUCCESS) return resFFT; - printDebugInformation(app, axis); - resFFT = VkFFTSync(app); - if (resFFT != VKFFT_SUCCESS) return resFFT; - } - } - else { - - for (int64_t l = (int64_t)app->localFFTPlan->numAxisUploads[2] - 1; l >= 0; l--) { - VkFFTAxis* axis = &app->localFFTPlan->axes[2][l]; - resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, 2, l, 0); - if (resFFT != VKFFT_SUCCESS) return resFFT; -#if(VKFFT_BACKEND==0) - vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); - 
vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); -#endif - uint64_t dispatchBlock[3]; - dispatchBlock[0] = (uint64_t)ceil(localSize0[2] / (double)axis->axisBlock[0] * app->localFFTPlan->actualFFTSizePerAxis[2][2] / (double)axis->specializationConstants.fftDim); - dispatchBlock[1] = 1; - dispatchBlock[2] = app->localFFTPlan->actualFFTSizePerAxis[2][1] * app->configuration.coordinateFeatures * app->configuration.numberBatches; - //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); - resFFT = dispatchEnhanced(app, axis, dispatchBlock); - if (resFFT != VKFFT_SUCCESS) return resFFT; - printDebugInformation(app, axis); - resFFT = VkFFTSync(app); - if (resFFT != VKFFT_SUCCESS) return resFFT; - } - if (app->useBluesteinFFT[2] && (app->localFFTPlan->numAxisUploads[2] > 1)) { - for (int64_t l = 1; l < (int64_t)app->localFFTPlan->numAxisUploads[2]; l++) { - VkFFTAxis* axis = &app->localFFTPlan->inverseBluesteinAxes[2][l]; - resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan, axis, 2, l, 0); - if (resFFT != VKFFT_SUCCESS) return resFFT; -#if(VKFFT_BACKEND==0) - vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); - vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); -#endif - uint64_t dispatchBlock[3]; - dispatchBlock[0] = (uint64_t)ceil(localSize0[2] / (double)axis->axisBlock[0] * app->localFFTPlan->actualFFTSizePerAxis[2][2] / (double)axis->specializationConstants.fftDim); - dispatchBlock[1] = 1; - dispatchBlock[2] = app->localFFTPlan->actualFFTSizePerAxis[2][1] * app->configuration.coordinateFeatures * app->configuration.numberBatches; - - //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); - //if 
(app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); - resFFT = dispatchEnhanced(app, axis, dispatchBlock); - if (resFFT != VKFFT_SUCCESS) return resFFT; - printDebugInformation(app, axis); - resFFT = VkFFTSync(app); - if (resFFT != VKFFT_SUCCESS) return resFFT; - } - } - } - } - } - } - if (app->configuration.performConvolution) { - if (app->configuration.FFTdim > 2) { - - //multiple upload ifft leftovers - if (app->configuration.FFTdim == 3) { - - for (int64_t l = (int64_t)1; l < (int64_t)app->localFFTPlan_inverse->numAxisUploads[2]; l++) { - VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[2][l]; - resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 2, l, 1); - if (resFFT != VKFFT_SUCCESS) return resFFT; - -#if(VKFFT_BACKEND==0) - vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); - vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); -#endif - uint64_t dispatchBlock[3]; - dispatchBlock[0] = (uint64_t)ceil(localSize0[2] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[2][2] / (double)axis->specializationConstants.fftDim); - dispatchBlock[1] = 1; - dispatchBlock[2] = app->localFFTPlan_inverse->actualFFTSizePerAxis[2][1] * app->configuration.coordinateFeatures * app->configuration.numberKernels; - //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); - resFFT = dispatchEnhanced(app, axis, dispatchBlock); - if (resFFT != VKFFT_SUCCESS) return resFFT; - printDebugInformation(app, axis); - resFFT = VkFFTSync(app); - if (resFFT != VKFFT_SUCCESS) return resFFT; - } - } - - for (int64_t l = 0; l < (int64_t)app->localFFTPlan_inverse->numAxisUploads[1]; l++) { - VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[1][l]; - resFFT = VkFFTUpdateBufferSet(app, 
app->localFFTPlan_inverse, axis, 1, l, 1); - if (resFFT != VKFFT_SUCCESS) return resFFT; - -#if(VKFFT_BACKEND==0) - vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); - vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); -#endif - uint64_t dispatchBlock[3]; - dispatchBlock[0] = (uint64_t)ceil(localSize0[2] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[1][1] / (double)axis->specializationConstants.fftDim); - dispatchBlock[1] = 1; - dispatchBlock[2] = app->localFFTPlan_inverse->actualFFTSizePerAxis[1][2] * app->configuration.coordinateFeatures * app->configuration.numberKernels; - //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); - //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); - resFFT = dispatchEnhanced(app, axis, dispatchBlock); - if (resFFT != VKFFT_SUCCESS) return resFFT; - printDebugInformation(app, axis); - resFFT = VkFFTSync(app); - if (resFFT != VKFFT_SUCCESS) return resFFT; - } - - } - if (app->configuration.FFTdim > 1) { - if (app->configuration.FFTdim == 2) { - - for (int64_t l = (int64_t)1; l < (int64_t)app->localFFTPlan_inverse->numAxisUploads[1]; l++) { - VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[1][l]; - resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 1, l, 1); - if (resFFT != VKFFT_SUCCESS) return resFFT; - -#if(VKFFT_BACKEND==0) - vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); - vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); -#endif - uint64_t dispatchBlock[3]; - dispatchBlock[0] = (uint64_t)ceil(localSize0[1] / (double)axis->axisBlock[0] * 
app->localFFTPlan_inverse->actualFFTSizePerAxis[1][1] / (double)axis->specializationConstants.fftDim); - dispatchBlock[1] = 1; - dispatchBlock[2] = app->localFFTPlan_inverse->actualFFTSizePerAxis[1][2] * app->configuration.coordinateFeatures * app->configuration.numberKernels; - //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); - //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); - resFFT = dispatchEnhanced(app, axis, dispatchBlock); - if (resFFT != VKFFT_SUCCESS) return resFFT; - printDebugInformation(app, axis); - resFFT = VkFFTSync(app); - if (resFFT != VKFFT_SUCCESS) return resFFT; - } - } - if (app->localFFTPlan_inverse->multiUploadR2C) { - //app->configuration.size[0] /= 2; - VkFFTAxis* axis = &app->localFFTPlan_inverse->R2Cdecomposition; - resFFT = VkFFTUpdateBufferSetR2CMultiUploadDecomposition(app, app->localFFTPlan_inverse, axis, 0, 0, 1); - if (resFFT != VKFFT_SUCCESS) return resFFT; - -#if(VKFFT_BACKEND==0) - vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); - vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); -#endif - uint64_t dispatchBlock[3]; - - dispatchBlock[0] = (uint64_t)ceil(((app->configuration.size[0] / 2 + 1) * app->configuration.size[1] * app->configuration.size[2]) / (double)(2 * axis->axisBlock[0])); - dispatchBlock[1] = 1; - dispatchBlock[2] = app->configuration.coordinateFeatures * app->configuration.numberBatches; - resFFT = dispatchEnhanced(app, axis, dispatchBlock); - if (resFFT != VKFFT_SUCCESS) return resFFT; - printDebugInformation(app, axis); - - resFFT = VkFFTSync(app); - if (resFFT != VKFFT_SUCCESS) return resFFT; - } - for (int64_t l = 0; l < (int64_t)app->localFFTPlan_inverse->numAxisUploads[0]; l++) { - VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[0][l]; - 
resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 0, l, 1); - if (resFFT != VKFFT_SUCCESS) return resFFT; - -#if(VKFFT_BACKEND==0) - vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); - vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); -#endif - uint64_t dispatchBlock[3]; - if (l == 0) { - if (app->localFFTPlan_inverse->numAxisUploads[0] > 2) { - dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1]) / (double)app->localFFTPlan_inverse->axisSplit[0][1]) * app->localFFTPlan_inverse->axisSplit[0][1]; - dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1]; - } - else { - if (app->localFFTPlan_inverse->numAxisUploads[0] > 1) { - dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1])); - dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1]; - } - else { - dispatchBlock[0] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim; - dispatchBlock[1] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1] / (double)axis->axisBlock[1]); - } - } - } - else { - dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[0]); - dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1]; - } - dispatchBlock[2] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][2] * app->configuration.coordinateFeatures * app->configuration.numberKernels; - if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); - //if 
(app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); - //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); - resFFT = dispatchEnhanced(app, axis, dispatchBlock); - if (resFFT != VKFFT_SUCCESS) return resFFT; - printDebugInformation(app, axis); - resFFT = VkFFTSync(app); - if (resFFT != VKFFT_SUCCESS) return resFFT; - } - } - if (app->configuration.FFTdim == 1) { - for (int64_t l = (int64_t)1; l < (int64_t)app->localFFTPlan_inverse->numAxisUploads[0]; l++) { - VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[0][l]; - resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 0, l, 1); - if (resFFT != VKFFT_SUCCESS) return resFFT; - -#if(VKFFT_BACKEND==0) - vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); - vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); -#endif - uint64_t dispatchBlock[3]; - dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1] / (double)axis->specializationConstants.fftDim); - dispatchBlock[1] = 1; - dispatchBlock[2] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][2] * app->configuration.coordinateFeatures * app->configuration.numberKernels; - //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); - //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); - resFFT = dispatchEnhanced(app, axis, dispatchBlock); - if (resFFT != VKFFT_SUCCESS) return resFFT; - printDebugInformation(app, axis); - resFFT = VkFFTSync(app); - if (resFFT != VKFFT_SUCCESS) return resFFT; - } - } - } - - if (inverse == 1) { - //we start from axis 2 and go back to axis 0 - //FFT axis 2 - 
if (app->configuration.FFTdim > 2) { - if (!app->configuration.omitDimension[2]) { - for (int64_t l = (int64_t)app->localFFTPlan_inverse->numAxisUploads[2] - 1; l >= 0; l--) { - //if ((!app->configuration.reorderFourStep) && (!app->useBluesteinFFT[2])) l = app->localFFTPlan_inverse->numAxisUploads[2] - 1 - l; - VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[2][l]; - resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 2, l, 1); - if (resFFT != VKFFT_SUCCESS) return resFFT; - -#if(VKFFT_BACKEND==0) - vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); - vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); -#endif - uint64_t dispatchBlock[3]; - dispatchBlock[0] = (uint64_t)ceil(localSize0[2] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[2][2] / (double)axis->specializationConstants.fftDim); - dispatchBlock[1] = 1; - dispatchBlock[2] = app->localFFTPlan_inverse->actualFFTSizePerAxis[2][1] * app->configuration.coordinateFeatures * app->configuration.numberBatches; - //if (app->configuration.performZeropaddingInverse[0]) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); - //if (app->configuration.performZeropaddingInverse[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); - - //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); - resFFT = dispatchEnhanced(app, axis, dispatchBlock); - if (resFFT != VKFFT_SUCCESS) return resFFT; - printDebugInformation(app, axis); - resFFT = VkFFTSync(app); - if (resFFT != VKFFT_SUCCESS) return resFFT; - //if ((!app->configuration.reorderFourStep) && (!app->useBluesteinFFT[2])) l = app->localFFTPlan_inverse->numAxisUploads[2] - 1 - l; - } - if (app->useBluesteinFFT[2] && (app->localFFTPlan_inverse->numAxisUploads[2] > 1)) { - for (int64_t l = 1; l < 
(int64_t)app->localFFTPlan_inverse->numAxisUploads[2]; l++) { - VkFFTAxis* axis = &app->localFFTPlan_inverse->inverseBluesteinAxes[2][l]; - resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 2, l, 1); - if (resFFT != VKFFT_SUCCESS) return resFFT; -#if(VKFFT_BACKEND==0) - vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); - vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); -#endif - uint64_t dispatchBlock[3]; - dispatchBlock[0] = (uint64_t)ceil(localSize0[2] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[2][2] / (double)axis->specializationConstants.fftDim); - dispatchBlock[1] = 1; - dispatchBlock[2] = app->localFFTPlan_inverse->actualFFTSizePerAxis[2][1] * app->configuration.coordinateFeatures * app->configuration.numberBatches; - - //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); - //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); - resFFT = dispatchEnhanced(app, axis, dispatchBlock); - if (resFFT != VKFFT_SUCCESS) return resFFT; - printDebugInformation(app, axis); - resFFT = VkFFTSync(app); - if (resFFT != VKFFT_SUCCESS) return resFFT; - } - } - } - } - if (app->configuration.FFTdim > 1) { - - //FFT axis 1 - if (!app->configuration.omitDimension[1]) { - for (int64_t l = (int64_t)app->localFFTPlan_inverse->numAxisUploads[1] - 1; l >= 0; l--) { - //if ((!app->configuration.reorderFourStep) && (!app->useBluesteinFFT[1])) l = app->localFFTPlan_inverse->numAxisUploads[1] - 1 - l; - VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[1][l]; - resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 1, l, 1); - if (resFFT != VKFFT_SUCCESS) return resFFT; - -#if(VKFFT_BACKEND==0) - vkCmdBindPipeline(app->configuration.commandBuffer[0], 
VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); - vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); -#endif - uint64_t dispatchBlock[3]; - dispatchBlock[0] = (uint64_t)ceil(localSize0[1] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[1][1] / (double)axis->specializationConstants.fftDim); - dispatchBlock[1] = 1; - dispatchBlock[2] = app->localFFTPlan_inverse->actualFFTSizePerAxis[1][2] * app->configuration.coordinateFeatures * app->configuration.numberBatches; - //if (app->configuration.mergeSequencesR2C == 1) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); - //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); - //if (app->configuration.performZeropaddingInverse[0]) dispatchBlock[0] = (uint64_t)ceil(dispatchBlock[0] / 2.0); - - resFFT = dispatchEnhanced(app, axis, dispatchBlock); - if (resFFT != VKFFT_SUCCESS) return resFFT; - printDebugInformation(app, axis); - //if ((!app->configuration.reorderFourStep) && (!app->useBluesteinFFT[1])) l = app->localFFTPlan_inverse->numAxisUploads[1] - 1 - l; - resFFT = VkFFTSync(app); - if (resFFT != VKFFT_SUCCESS) return resFFT; - } - if (app->useBluesteinFFT[1] && (app->localFFTPlan_inverse->numAxisUploads[1] > 1)) { - for (int64_t l = 1; l < (int64_t)app->localFFTPlan_inverse->numAxisUploads[1]; l++) { - VkFFTAxis* axis = &app->localFFTPlan_inverse->inverseBluesteinAxes[1][l]; - resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 1, l, 1); - if (resFFT != VKFFT_SUCCESS) return resFFT; -#if(VKFFT_BACKEND==0) - vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); - vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); -#endif - uint64_t dispatchBlock[3]; - dispatchBlock[0] = 
(uint64_t)ceil(localSize0[1] / (double)axis->axisBlock[0] * app->localFFTPlan_inverse->actualFFTSizePerAxis[1][1] / (double)axis->specializationConstants.fftDim); - dispatchBlock[1] = 1; - dispatchBlock[2] = app->localFFTPlan_inverse->actualFFTSizePerAxis[1][2] * app->configuration.coordinateFeatures * app->configuration.numberBatches; - - //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); - //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); - resFFT = dispatchEnhanced(app, axis, dispatchBlock); - if (resFFT != VKFFT_SUCCESS) return resFFT; - printDebugInformation(app, axis); - resFFT = VkFFTSync(app); - if (resFFT != VKFFT_SUCCESS) return resFFT; - } - } - } - } - if (!app->configuration.omitDimension[0]) { - if (app->localFFTPlan_inverse->multiUploadR2C) { - //app->configuration.size[0] /= 2; - VkFFTAxis* axis = &app->localFFTPlan_inverse->R2Cdecomposition; - resFFT = VkFFTUpdateBufferSetR2CMultiUploadDecomposition(app, app->localFFTPlan_inverse, axis, 0, 0, 1); - if (resFFT != VKFFT_SUCCESS) return resFFT; - -#if(VKFFT_BACKEND==0) - vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); - vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); -#endif - uint64_t dispatchBlock[3]; - - dispatchBlock[0] = (uint64_t)ceil(((app->configuration.size[0] / 2 + 1) * app->configuration.size[1] * app->configuration.size[2]) / (double)(2 * axis->axisBlock[0])); - dispatchBlock[1] = 1; - dispatchBlock[2] = app->configuration.coordinateFeatures * axis->specializationConstants.numBatches; - resFFT = dispatchEnhanced(app, axis, dispatchBlock); - if (resFFT != VKFFT_SUCCESS) return resFFT; - printDebugInformation(app, axis); - - resFFT = VkFFTSync(app); - if (resFFT != VKFFT_SUCCESS) return resFFT; - } - //FFT axis 0 - for 
(int64_t l = (int64_t)app->localFFTPlan_inverse->numAxisUploads[0] - 1; l >= 0; l--) { - //if ((!app->configuration.reorderFourStep) && (!app->useBluesteinFFT[0])) l = app->localFFTPlan_inverse->numAxisUploads[0] - 1 - l; - VkFFTAxis* axis = &app->localFFTPlan_inverse->axes[0][l]; - resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 0, l, 1); - if (resFFT != VKFFT_SUCCESS) return resFFT; -#if(VKFFT_BACKEND==0) - vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); - vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); -#endif - uint64_t dispatchBlock[3]; - if (l == 0) { - if (app->localFFTPlan_inverse->numAxisUploads[0] > 2) { - dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1]) / (double)app->localFFTPlan_inverse->axisSplit[0][1]) * app->localFFTPlan_inverse->axisSplit[0][1]; - dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1]; - } - else { - if (app->localFFTPlan_inverse->numAxisUploads[0] > 1) { - dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1])); - dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1]; - } - else { - dispatchBlock[0] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim; - dispatchBlock[1] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1] / (double)axis->axisBlock[1]); - } - } - } - else { - dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[0]); - dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1]; - } - 
dispatchBlock[2] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][2] * app->configuration.coordinateFeatures * app->configuration.numberBatches; - if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); - //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); - //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); - resFFT = dispatchEnhanced(app, axis, dispatchBlock); - if (resFFT != VKFFT_SUCCESS) return resFFT; - printDebugInformation(app, axis); - //if ((!app->configuration.reorderFourStep) && (!app->useBluesteinFFT[0])) l = app->localFFTPlan_inverse->numAxisUploads[0] - 1 - l; - resFFT = VkFFTSync(app); - if (resFFT != VKFFT_SUCCESS) return resFFT; - } - if (app->useBluesteinFFT[0] && (app->localFFTPlan_inverse->numAxisUploads[0] > 1)) { - for (int64_t l = 1; l < (int64_t)app->localFFTPlan_inverse->numAxisUploads[0]; l++) { - VkFFTAxis* axis = &app->localFFTPlan_inverse->inverseBluesteinAxes[0][l]; - resFFT = VkFFTUpdateBufferSet(app, app->localFFTPlan_inverse, axis, 0, l, 1); - if (resFFT != VKFFT_SUCCESS) return resFFT; - -#if(VKFFT_BACKEND==0) - vkCmdBindPipeline(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); - vkCmdBindDescriptorSets(app->configuration.commandBuffer[0], VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 0); -#endif - uint64_t dispatchBlock[3]; - if (l == 0) { - if (app->localFFTPlan_inverse->numAxisUploads[0] > 2) { - dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1]) / (double)app->localFFTPlan_inverse->axisSplit[0][1]) * app->localFFTPlan_inverse->axisSplit[0][1]; - dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1]; - } - else { - if 
(app->localFFTPlan_inverse->numAxisUploads[0] > 1) { - dispatchBlock[0] = (uint64_t)ceil((uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[1])); - dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1]; - } - else { - dispatchBlock[0] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim; - dispatchBlock[1] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1] / (double)axis->axisBlock[1]); - } - } - } - else { - dispatchBlock[0] = (uint64_t)ceil(app->localFFTPlan_inverse->actualFFTSizePerAxis[0][0] / axis->specializationConstants.fftDim / (double)axis->axisBlock[0]); - dispatchBlock[1] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][1]; - } - dispatchBlock[2] = app->localFFTPlan_inverse->actualFFTSizePerAxis[0][2] * app->configuration.coordinateFeatures * app->configuration.numberBatches; - if (axis->specializationConstants.mergeSequencesR2C == 1) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); - //if (app->configuration.performZeropadding[1]) dispatchBlock[1] = (uint64_t)ceil(dispatchBlock[1] / 2.0); - //if (app->configuration.performZeropadding[2]) dispatchBlock[2] = (uint64_t)ceil(dispatchBlock[2] / 2.0); - resFFT = dispatchEnhanced(app, axis, dispatchBlock); - if (resFFT != VKFFT_SUCCESS) return resFFT; - printDebugInformation(app, axis); - resFFT = VkFFTSync(app); - if (resFFT != VKFFT_SUCCESS) return resFFT; - } - } - } - //if (app->localFFTPlan_inverse->multiUploadR2C) app->configuration.size[0] *= 2; - - } - return resFFT; -} -static inline int VkFFTGetVersion() { - return 10228; //X.XX.XX format -} -#endif