/* Copyright 2005-2021 NVIDIA Corporation.  All rights reserved.
  *
  * NOTICE TO LICENSEE:
  *
  * The source code and/or documentation ("Licensed Deliverables") are
  * subject to NVIDIA intellectual property rights under U.S. and
  * international Copyright laws.
  *
  * The Licensed Deliverables contained herein are PROPRIETARY and
  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
  * conditions of a form of NVIDIA software license agreement by and
  * between NVIDIA and Licensee ("License Agreement") or electronically
  * accepted by Licensee.  Notwithstanding any terms or conditions to
  * the contrary in the License Agreement, reproduction or disclosure
  * of the Licensed Deliverables to any third party without the express
  * written consent of NVIDIA is prohibited.
  *
  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  * OF THESE LICENSED DELIVERABLES.
  *
  * U.S. Government End Users.  These Licensed Deliverables are a
  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
  * 1995), consisting of "commercial computer software" and "commercial
  * computer software documentation" as such terms are used in 48
  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
  * U.S. Government End Users acquire the Licensed Deliverables with
  * only those rights set forth herein.
  *
  * Any use of the Licensed Deliverables in individual and commercial
  * software must include, in the user documentation and internal
  * comments to the code, the above Disclaimer and U.S. Government End
  * Users Notice.
  */

/*!
* \file cufftXt.h
* \brief Public header file for the NVIDIA CUDA FFT library (CUFFT)
*/

#ifndef _CUFFTXT_H_
#define _CUFFTXT_H_
#include "cudalibxt.h"
#include "cufft.h"


#ifndef CUFFTAPI
#ifdef _WIN32
#define CUFFTAPI __stdcall
#else
#define CUFFTAPI
#endif
#endif

#ifdef __cplusplus
extern "C" {
#endif

//
// cufftXtSubFormat identifies the data layout of
// a memory descriptor owned by cufft.
// note that multi GPU cufft does not yet support out-of-place transforms
//

typedef enum cufftXtSubFormat_t {
    CUFFT_XT_FORMAT_INPUT = 0x00,              //by default input is in linear order across GPUs
    CUFFT_XT_FORMAT_OUTPUT = 0x01,             //by default output is in scrambled order depending on transform
    CUFFT_XT_FORMAT_INPLACE = 0x02,            //by default inplace is input order, which is linear across GPUs
    CUFFT_XT_FORMAT_INPLACE_SHUFFLED = 0x03,   //shuffled output order after execution of the transform
    CUFFT_XT_FORMAT_1D_INPUT_SHUFFLED = 0x04,  //shuffled input order prior to execution of 1D transforms
    CUFFT_XT_FORMAT_DISTRIBUTED_INPUT = 0x05,
    CUFFT_XT_FORMAT_DISTRIBUTED_OUTPUT = 0x06,
    CUFFT_FORMAT_UNDEFINED = 0x07
} cufftXtSubFormat;

//
// cufftXtCopyType specifies the type of copy for cufftXtMemcpy
//
typedef enum cufftXtCopyType_t {
    CUFFT_COPY_HOST_TO_DEVICE = 0x00,
    CUFFT_COPY_DEVICE_TO_HOST = 0x01,
    CUFFT_COPY_DEVICE_TO_DEVICE = 0x02,
    CUFFT_COPY_UNDEFINED = 0x03
} cufftXtCopyType;

//
// cufftXtQueryType specifies the type of query for cufftXtQueryPlan
//
typedef enum cufftXtQueryType_t {
    CUFFT_QUERY_1D_FACTORS = 0x00,
    CUFFT_QUERY_UNDEFINED = 0x01
} cufftXtQueryType;

typedef struct cufftXt1dFactors_t {
    long long int size;
    long long int stringCount;
    long long int stringLength;
    long long int substringLength;
    long long int factor1;
    long long int factor2;
    long long int stringMask;
    long long int substringMask;
    long long int factor1Mask;
    long long int factor2Mask;
    int stringShift;
    int substringShift;
    int factor1Shift;
    int factor2Shift;
} cufftXt1dFactors;

//
// cufftXtWorkAreaPolicy specifies policy for cufftXtSetWorkAreaPolicy
//
typedef enum cufftXtWorkAreaPolicy_t {
    CUFFT_WORKAREA_MINIMAL = 0, /* maximum reduction */
    CUFFT_WORKAREA_USER = 1, /* use workSize parameter as limit */
    CUFFT_WORKAREA_PERFORMANCE = 2, /* default - 1x overhead or more, maximum performance */
} cufftXtWorkAreaPolicy;

// multi-GPU routines
cufftResult CUFFTAPI cufftXtSetGPUs(cufftHandle handle, int nGPUs, int *whichGPUs);

cufftResult CUFFTAPI cufftXtMalloc(cufftHandle plan,
                                   cudaLibXtDesc ** descriptor,
                                   cufftXtSubFormat format);

cufftResult CUFFTAPI cufftXtMemcpy(cufftHandle plan,
                                   void *dstPointer,
                                   void *srcPointer,
                                   cufftXtCopyType type);

cufftResult CUFFTAPI cufftXtFree(cudaLibXtDesc *descriptor);

cufftResult CUFFTAPI cufftXtSetWorkArea(cufftHandle plan, void **workArea);

cufftResult CUFFTAPI cufftXtExecDescriptorC2C(cufftHandle plan,
                                              cudaLibXtDesc *input,
                                              cudaLibXtDesc *output,
                                              int direction);

cufftResult CUFFTAPI cufftXtExecDescriptorR2C(cufftHandle plan,
                                              cudaLibXtDesc *input,
                                              cudaLibXtDesc *output);

cufftResult CUFFTAPI cufftXtExecDescriptorC2R(cufftHandle plan,
                                              cudaLibXtDesc *input,
                                              cudaLibXtDesc *output);

cufftResult CUFFTAPI cufftXtExecDescriptorZ2Z(cufftHandle plan,
                                              cudaLibXtDesc *input,
                                              cudaLibXtDesc *output,
                                              int direction);

cufftResult CUFFTAPI cufftXtExecDescriptorD2Z(cufftHandle plan,
                                              cudaLibXtDesc *input,
                                              cudaLibXtDesc *output);

cufftResult CUFFTAPI cufftXtExecDescriptorZ2D(cufftHandle plan,
                                              cudaLibXtDesc *input,
                                              cudaLibXtDesc *output);

// Utility functions

cufftResult CUFFTAPI cufftXtQueryPlan(cufftHandle plan, void *queryStruct, cufftXtQueryType queryType);


// callbacks


typedef enum cufftXtCallbackType_t {
    CUFFT_CB_LD_COMPLEX = 0x0,
    CUFFT_CB_LD_COMPLEX_DOUBLE = 0x1,
    CUFFT_CB_LD_REAL = 0x2,
    CUFFT_CB_LD_REAL_DOUBLE = 0x3,
    CUFFT_CB_ST_COMPLEX = 0x4,
    CUFFT_CB_ST_COMPLEX_DOUBLE = 0x5,
    CUFFT_CB_ST_REAL = 0x6,
    CUFFT_CB_ST_REAL_DOUBLE = 0x7,
    CUFFT_CB_UNDEFINED = 0x8

} cufftXtCallbackType;

typedef cufftComplex (*cufftCallbackLoadC)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
typedef cufftDoubleComplex (*cufftCallbackLoadZ)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
typedef cufftReal (*cufftCallbackLoadR)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
typedef cufftDoubleReal(*cufftCallbackLoadD)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);

typedef void (*cufftCallbackStoreC)(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPointer);
typedef void (*cufftCallbackStoreZ)(void *dataOut, size_t offset, cufftDoubleComplex element, void *callerInfo, void *sharedPointer);
typedef void (*cufftCallbackStoreR)(void *dataOut, size_t offset, cufftReal element, void *callerInfo, void *sharedPointer);
typedef void (*cufftCallbackStoreD)(void *dataOut, size_t offset, cufftDoubleReal element, void *callerInfo, void *sharedPointer);


cufftResult CUFFTAPI cufftXtSetCallback(cufftHandle plan, void **callback_routine, cufftXtCallbackType cbType, void **caller_info);
cufftResult CUFFTAPI cufftXtClearCallback(cufftHandle plan, cufftXtCallbackType cbType);
cufftResult CUFFTAPI cufftXtSetCallbackSharedSize(cufftHandle plan, cufftXtCallbackType cbType, size_t sharedSize);

cufftResult CUFFTAPI cufftXtMakePlanMany(cufftHandle plan,
                                         int rank,
                                         long long int *n,
                                         long long int *inembed,
                                         long long int istride,
                                         long long int idist,
                                         cudaDataType inputtype,
                                         long long int *onembed,
                                         long long int ostride,
                                         long long int odist,
                                         cudaDataType outputtype,
                                         long long int batch,
                                         size_t *workSize,
                                       	 cudaDataType executiontype);

cufftResult CUFFTAPI cufftXtGetSizeMany(cufftHandle plan,
                                        int rank,
                                        long long int *n,
                                        long long int *inembed,
                                        long long int istride,
                                        long long int idist,
                                        cudaDataType inputtype,
                                        long long int *onembed,
                                        long long int ostride,
                                        long long int odist,
                                        cudaDataType outputtype,
                                        long long int batch,
                                        size_t *workSize,
                                        cudaDataType executiontype);


cufftResult CUFFTAPI cufftXtExec(cufftHandle plan,
                                 void *input,
                                 void *output,
                                 int direction);

cufftResult CUFFTAPI cufftXtExecDescriptor(cufftHandle plan,
                                           cudaLibXtDesc *input,
                                           cudaLibXtDesc *output,
                                           int direction);

cufftResult CUFFTAPI cufftXtSetWorkAreaPolicy(cufftHandle plan, cufftXtWorkAreaPolicy policy, size_t *workSize);

cufftResult CUFFTAPI cufftXtSetDistribution(cufftHandle plan, 
                                            int rank, 
                                            const long long int* lower_input,  
                                            const long long int* upper_input,
                                            const long long int* lower_output,
                                            const long long int* upper_output,
                                            const long long int* strides_input, 
                                            const long long int* strides_output);

#ifdef __cplusplus
}
#endif

#endif