// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2023 NVIDIA Corporation. All rights reserved.

#ifndef PX_CUDA_CONTEX_H
#define PX_CUDA_CONTEX_H

#include "foundation/PxPreprocessor.h"

#if PX_SUPPORT_GPU_PHYSX

#include "PxCudaTypes.h"

#if !PX_DOXYGEN
namespace physx
{
#endif
	/**
	\brief Describes one kernel argument as an untyped address plus a byte count.

	Kept as a plain aggregate (no constructors, no default member initialisers) so that a brace list
	of the form { address, size } - the shape PX_CUDA_KERNEL_PARAM expands to - can initialise it.
	*/
	struct PxCudaKernelParam
	{
		void*	data;	//!< Address of the argument value.
		size_t	size;	//!< Size of the argument value, in bytes.
	};

	// workaround for not being able to forward declare enums in PxCudaTypes.h. 
	// provides different automatic casting depending on whether cuda.h was included beforehand or not.
	template<typename CUenum>
	struct PxCUenum
	{
		PxU32 value;

		PxCUenum(CUenum e) { value = PxU32(e); }
		operator CUenum() const { return CUenum(value); }
	};

#if defined(CUDA_VERSION)
	// CUDA_VERSION is defined, i.e. cuda.h was included beforehand: wrap the real CUDA enum types so
	// the wrappers convert to and from CUjit_option / CUresult automatically.
	typedef PxCUenum<CUjit_option>	PxCUjit_option;
	typedef PxCUenum<CUresult>		PxCUresult;
#else // !defined(CUDA_VERSION)
	// The CUDA enum types are not visible here: fall back to wrappers around plain PxU32 values.
	typedef PxCUenum<PxU32>			PxCUjit_option;
	typedef PxCUenum<PxU32>			PxCUresult;
#endif // defined(CUDA_VERSION)

// PX_CUDA_KERNEL_PARAM(X)  expands to the brace list { address of X, sizeof(X) }, matching the
//                          { data, size } layout of PxCudaKernelParam.
// PX_CUDA_KERNEL_PARAM2(X) expands to just the untyped address of X.
// X must be an lvalue. It is parenthesised inside the expansion so that an argument whose top-level
// operator binds more loosely than unary '&' (e.g. "cond ? a : b") is addressed as a whole instead
// of being split apart by the address-of operator and the cast.
#define PX_CUDA_KERNEL_PARAM(X)		{ (void*)&(X), sizeof(X) }
#define PX_CUDA_KERNEL_PARAM2(X)	(void*)&(X)

	class PxDeviceAllocatorCallback;

	/**
	\brief Abstract interface for issuing CUDA memory, module, stream, event, kernel-launch and
	copy/fill operations.

	Apart from release() and getAllocatorCallback(), every method is pure virtual and returns a
	PxCUresult. The method names and parameter lists parallel the CUDA driver API entry points of the
	same name with a "cu" prefix (memAlloc / cuMemAlloc, streamCreate / cuStreamCreate, ...).
	NOTE(review): the actual forwarding, and any additional bookkeeping, is decided by the implementing
	class, which is not part of this header - confirm there before relying on driver-API semantics.

	The destructor is protected, so client code cannot delete an instance through this interface;
	release() is the public way to dispose of one.
	*/
	class PxCudaContext
	{
	protected:
		// Start without an allocator callback so that getAllocatorCallback() never hands out an
		// indeterminate pointer; implementations assign mAllocatorCallback once they have one.
		PxCudaContext() : mAllocatorCallback(NULL) {}

		virtual ~PxCudaContext() {}

		//! Value returned by getAllocatorCallback(); NULL until an implementation assigns it.
		PxDeviceAllocatorCallback* mAllocatorCallback;

	public:
		//! Disposes of this context object. NOTE(review): teardown details are implementation-defined.
		virtual void release() = 0;

		// ---- Memory allocation -------------------------------------------------------------------
		// Signatures follow cuMemAlloc / cuMemFree / cuMemHostAlloc / cuMemFreeHost /
		// cuMemHostGetDevicePointer (presumed equivalents - confirm against the implementation).

		virtual PxCUresult memAlloc(CUdeviceptr *dptr, size_t bytesize) = 0;

		virtual PxCUresult memFree(CUdeviceptr dptr) = 0;

		virtual PxCUresult memHostAlloc(void **pp, size_t bytesize, unsigned int Flags) = 0;

		virtual PxCUresult memFreeHost(void *p) = 0;

		virtual PxCUresult memHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags) = 0;

		// ---- Modules and functions ---------------------------------------------------------------
		// Note that the JIT options are passed as PxCUjit_option wrappers, not raw CUjit_option values.

		virtual PxCUresult moduleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, PxCUjit_option *options, void **optionValues) = 0;

		virtual PxCUresult moduleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name) = 0;

		virtual PxCUresult moduleUnload(CUmodule hmod) = 0;

		// ---- Streams -----------------------------------------------------------------------------

		virtual PxCUresult streamCreate(CUstream *phStream, unsigned int Flags) = 0;

		virtual PxCUresult streamCreateWithPriority(CUstream *phStream, unsigned int flags, int priority) = 0;

		// NOTE(review): no driver API function of this exact name is known; see the implementation
		// for what "flush" means here.
		virtual PxCUresult streamFlush(CUstream hStream) = 0;

		virtual PxCUresult streamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags) = 0;

		virtual PxCUresult streamDestroy(CUstream hStream) = 0;

		virtual PxCUresult streamSynchronize(CUstream hStream) = 0;

		// ---- Events ------------------------------------------------------------------------------

		virtual PxCUresult eventCreate(CUevent *phEvent, unsigned int Flags) = 0;

		virtual PxCUresult eventRecord(CUevent hEvent, CUstream hStream) = 0;

		virtual PxCUresult eventQuery(CUevent hEvent) = 0;

		virtual PxCUresult eventSynchronize(CUevent hEvent) = 0;

		virtual PxCUresult eventDestroy(CUevent hEvent) = 0;

		// ---- Kernel launch -----------------------------------------------------------------------

		/**
		\brief Launches kernel f, taking the arguments as PxCudaKernelParam (address, size) entries.

		Per the note on the overload below, this variant copies the kernel params to a local stack
		before the launch.

		\param[in] f						Kernel function handle.
		\param[in] gridDimX,gridDimY,gridDimZ	Grid dimensions.
		\param[in] blockDimX,blockDimY,blockDimZ	Block dimensions.
		\param[in] sharedMemBytes			Shared memory size in bytes.
		\param[in] hStream					Stream handle for the launch.
		\param[in] kernelParams				Argument descriptors (see PX_CUDA_KERNEL_PARAM).
		\param[in] kernelParamsSizeInBytes	Size in bytes associated with kernelParams
											(presumably the size of the descriptor array - TODO confirm).
		\param[in] extra					Optional extra launch options; defaults to NULL.

		\note The default for extra sits on a virtual function, so it is taken from the static type
		used at the call site; overriders should repeat the same default.
		*/
		virtual PxCUresult launchKernel(
			CUfunction f,
			unsigned int gridDimX,
			unsigned int gridDimY,
			unsigned int gridDimZ,
			unsigned int blockDimX,
			unsigned int blockDimY,
			unsigned int blockDimZ,
			unsigned int sharedMemBytes,
			CUstream hStream,
			PxCudaKernelParam* kernelParams,
			size_t kernelParamsSizeInBytes,
			void** extra = NULL
		) = 0;

		// PT: same as above but without copying the kernel params to a local stack before the launch
		// i.e. the kernelParams data is passed directly to the kernel.
		// kernelParams is an array of untyped argument addresses (see PX_CUDA_KERNEL_PARAM2).
		virtual PxCUresult launchKernel(
			CUfunction f,
			PxU32 gridDimX, PxU32 gridDimY, PxU32 gridDimZ,
			PxU32 blockDimX, PxU32 blockDimY, PxU32 blockDimZ,
			PxU32 sharedMemBytes,
			CUstream hStream,
			void** kernelParams,
			void** extra = NULL
		) = 0;

		// ---- Memory copies -----------------------------------------------------------------------
		// Naming: H = host pointer, D = CUdeviceptr; the *Async variants additionally take a stream.

		virtual PxCUresult memcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount) = 0;

		virtual PxCUresult memcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream) = 0;

		virtual PxCUresult memcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount) = 0;

		virtual PxCUresult memcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream) = 0;

		virtual PxCUresult memcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream) = 0;

		virtual PxCUresult memcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount) = 0;

		virtual PxCUresult memcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream) = 0;

		// ---- Memory fills ------------------------------------------------------------------------
		// NOTE(review): in the matching driver API calls N counts elements of the given width
		// (32/16/8 bit), not bytes - confirm the implementation keeps that convention.

		virtual PxCUresult memsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream) = 0;

		virtual PxCUresult memsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream) = 0;

		virtual PxCUresult memsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N) = 0;

		virtual PxCUresult memsetD16(CUdeviceptr dstDevice, unsigned short uh, size_t N) = 0;

		virtual PxCUresult memsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N) = 0;

		// ---- Diagnostics -------------------------------------------------------------------------

		//! NOTE(review): which error is tracked, and whether reading it clears it, is implementation-defined.
		virtual PxCUresult getLastError() = 0;

		//! Returns the allocator callback pointer stored in this context (may be NULL).
		PxDeviceAllocatorCallback* getAllocatorCallback() { return mAllocatorCallback; }
	};

#if !PX_DOXYGEN
} // namespace physx
#endif

#endif // PX_SUPPORT_GPU_PHYSX
#endif