* [dpdk-dev] [RFC PATCH] gpu/cuda: introduce CUDA driver
@ 2021-10-05 22:49 eagostini
  2021-11-04  2:01 ` [dpdk-dev] [PATCH v2 0/1] " eagostini
                   ` (6 more replies)
  0 siblings, 7 replies; 28+ messages in thread
From: eagostini @ 2021-10-05 22:49 UTC (permalink / raw)
  To: dev; +Cc: Elena Agostini
From: Elena Agostini <eagostini@nvidia.com>
This is the CUDA implementation of the gpudev library.
Functionalities implemented through the CUDA Driver API are:
- Device probe and remove
- Manage device memory allocations
- Register/unregister external CPU memory in the device memory area
Signed-off-by: Elena Agostini <eagostini@nvidia.com>
---
 drivers/gpu/cuda/cuda.c      | 751 +++++++++++++++++++++++++++++++++++
 drivers/gpu/cuda/meson.build |  30 ++
 drivers/gpu/cuda/version.map |   3 +
 drivers/gpu/meson.build      |   2 +-
 4 files changed, 785 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/cuda/cuda.c
 create mode 100644 drivers/gpu/cuda/meson.build
 create mode 100644 drivers/gpu/cuda/version.map
diff --git a/drivers/gpu/cuda/cuda.c b/drivers/gpu/cuda/cuda.c
new file mode 100644
index 0000000000..202f0a0c0c
--- /dev/null
+++ b/drivers/gpu/cuda/cuda.c
@@ -0,0 +1,751 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2021 NVIDIA Corporation & Affiliates
+*/
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_malloc.h>
+#include <rte_errno.h>
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_byteorder.h>
+#include <rte_dev.h>
+
+#include <gpudev_driver.h>
+#include <cuda.h>
+
+/* NVIDIA GPU vendor (PCI vendor ID used to build the PCI ID table) */
+#define NVIDIA_GPU_VENDOR_ID (0x10de)
+
+/* NVIDIA GPU device IDs (PCI device IDs used to build the PCI ID table) */
+#define NVIDIA_GPU_A100_40GB_DEVICE_ID (0x20f1)
+#define NVIDIA_GPU_V100_32GB_DEVICE_ID (0x1db6)
+
+/* NOTE(review): not referenced anywhere else in this file */
+#define CUDA_MAX_ALLOCATION_NUM 512
+
+/* 1UL << 16 = 64 KiB; NOTE(review): not referenced anywhere else in this file */
+#define GPU_PAGE_SHIFT 16
+#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
+
+/* Log type used by this driver only: keep it out of the global namespace */
+static RTE_LOG_REGISTER_DEFAULT(gpu_logtype, NOTICE);
+
+/** Helper macro for logging; a trailing newline is appended to fmt */
+#define rte_gpu_log(level, fmt, ...) \
+	rte_log(RTE_LOG_ ## level, gpu_logtype, fmt "\n", ##__VA_ARGS__)
+
+/** Debug-level logging prefixed with the source line and function name */
+#define rte_gpu_log_debug(fmt, ...) \
+	rte_gpu_log(DEBUG, RTE_STR(__LINE__) ":%s() " fmt, __func__, \
+		##__VA_ARGS__)
+
+/*
+ * PCI IDs of the NVIDIA GPUs claimed by this driver.
+ * The table must be terminated by a zeroed entry: the PCI bus scans
+ * id_table until it reaches an entry whose vendor_id is 0.
+ */
+static const struct rte_pci_id pci_id_cuda_map[] = {
+	{
+		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
+				NVIDIA_GPU_A100_40GB_DEVICE_ID)
+	},
+	{
+		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
+				NVIDIA_GPU_V100_32GB_DEVICE_ID)
+	},
+	{
+		.vendor_id = 0 /* sentinel */
+	},
+};
+
+/* Per-device private data, stored in dev->mpshared->dev_private */
+struct cuda_info {
+	/* Device name as returned by cuDeviceGetName() */
+	char gpu_name[RTE_DEV_NAME_MAX_LEN];
+	/* CUDA driver handle of the device */
+	CUdevice cu_dev;
+};
+
+/* Kind of memory tracked by an entry of the driver memory list */
+enum mem_type {
+	/* Device memory obtained through cuda_mem_alloc() */
+	GPU_MEM = 0,
+	/* Host memory registered through cuda_mem_register() */
+	CPU_REGISTERED = 1,
+	/* Not used yet */
+	GPU_REGISTERED = 2
+};
+
+/* key associated to a memory address */
+typedef uintptr_t ptr_key;
+
+/* Node of the doubly linked list tracking every allocation/registration */
+struct mem_entry {
+	/* Device-side address of the memory area */
+	CUdeviceptr ptr_d;
+	/* Host-side address; NULL for GPU_MEM entries */
+	void *ptr_h;
+	/* Size of the memory area in bytes */
+	size_t size;
+	/* Device the memory was allocated/registered for */
+	struct rte_gpu *dev;
+	/* CUDA context taken from the device info */
+	CUcontext ctx;
+	/* Lookup key generated by get_hash_from_ptr() */
+	ptr_key pkey;
+	/* Kind of memory described by this entry */
+	enum mem_type mtype;
+	/* Neighbours in the list; NULL at the ends */
+	struct mem_entry *prev;
+	struct mem_entry *next;
+};
+
+/*
+ * Memory list state. Only this file uses it, so the symbols are static
+ * to avoid polluting the global namespace.
+ */
+static struct mem_entry *mem_alloc_list_head;
+static struct mem_entry *mem_alloc_list_tail;
+/* Number of entries currently in the list */
+static uint32_t mem_alloc_list_last_elem;
+
+/* Build the list lookup key of a pointer: the key is the address itself */
+static ptr_key
+get_hash_from_ptr(void *ptr)
+{
+	const uintptr_t address = (uintptr_t)ptr;
+
+	return address;
+}
+
+/* Return the number of entries currently stored in the memory list */
+static uint32_t
+mem_list_count_item(void)
+{
+	const uint32_t item_count = mem_alloc_list_last_elem;
+
+	return item_count;
+}
+
+/*
+ * Append a new zero-initialized entry at the tail of the memory list,
+ * creating the list when it is still empty.
+ *
+ * Returns the new entry (which is also the new list tail), or NULL when
+ * the entry cannot be allocated; the list is left untouched in that case.
+ */
+static struct mem_entry *
+mem_list_add_item(void)
+{
+	struct mem_entry *entry;
+
+	entry = rte_zmalloc(NULL, sizeof(*entry), RTE_CACHE_LINE_SIZE);
+	if (entry == NULL) {
+		/* rte_gpu_log() already appends the trailing newline */
+		rte_gpu_log(ERR, "Failed to allocate memory for memory list.");
+		return NULL;
+	}
+
+	/* The entry is zeroed, so prev and next already start as NULL */
+	if (mem_alloc_list_head == NULL) {
+		mem_alloc_list_head = entry;
+	} else {
+		entry->prev = mem_alloc_list_tail;
+		mem_alloc_list_tail->next = entry;
+	}
+	mem_alloc_list_tail = entry;
+
+	mem_alloc_list_last_elem++;
+
+	return entry;
+}
+
+/*
+ * Look up the memory list entry whose key equals pk.
+ * Returns the entry, or NULL when the list is empty or holds no such key.
+ */
+static struct mem_entry *
+mem_list_find_item(ptr_key pk)
+{
+	struct mem_entry *it;
+
+	if (mem_alloc_list_head == NULL) {
+		rte_gpu_log(ERR, "Memory list doesn't exist\n");
+		return NULL;
+	}
+
+	if (mem_list_count_item() == 0) {
+		rte_gpu_log(ERR, "No items in memory list\n");
+		return NULL;
+	}
+
+	/* Walk from the head; it ends up NULL when no key matches */
+	for (it = mem_alloc_list_head; it != NULL; it = it->next) {
+		if (it->pkey == pk)
+			break;
+	}
+
+	return it;
+}
+
+/*
+ * Unlink and free the memory list entry whose key equals pk.
+ *
+ * Both neighbours and both list ends are updated, so that neither the
+ * head, the tail nor any prev/next link keeps pointing at the freed entry.
+ *
+ * Returns 0 on success, -EINVAL when no entry has that key.
+ */
+static int
+mem_list_del_item(ptr_key pk)
+{
+	struct mem_entry *entry;
+
+	entry = mem_list_find_item(pk);
+	if (entry == NULL)
+		return -EINVAL;
+
+	/* Detach from the previous entry, or move the head forward */
+	if (entry->prev != NULL)
+		entry->prev->next = entry->next;
+	else
+		mem_alloc_list_head = entry->next;
+
+	/* Detach from the next entry, or move the tail backward */
+	if (entry->next != NULL)
+		entry->next->prev = entry->prev;
+	else
+		mem_alloc_list_tail = entry->prev;
+
+	rte_free(entry);
+
+	mem_alloc_list_last_elem--;
+
+	return 0;
+}
+
+/*
+ * First-time initialization of a child device, i.e. a CUDA context
+ * registered on top of a physical GPU.
+ *
+ * Temporarily makes the child context current to query its SM count and
+ * its CUDA device, then restores the context that was current on entry.
+ * The device state (info and dev_private) is only updated when every step
+ * succeeded, so a failed call can simply be retried.
+ *
+ * Returns 0 on success, -ENODEV if the parent info cannot be read,
+ * -1 on any other failure.
+ */
+static int
+cuda_child_info_init(struct rte_gpu *dev)
+{
+	int ret = 0;
+	CUresult res;
+	struct rte_gpu_info parent_info;
+	CUexecAffinityParam affinityPrm;
+	const char *err_string;
+	struct cuda_info *private = NULL;
+	CUcontext current_ctx;
+	CUcontext input_ctx;
+
+	/* Store current ctx */
+	res = cuCtxGetCurrent(&current_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &err_string);
+		rte_gpu_log(ERR, "cuCtxGetCurrent failed with %s.", err_string);
+		return -1;
+	}
+
+	/* Set child ctx as current ctx */
+	input_ctx = (CUcontext)(uintptr_t)dev->mpshared->info.context;
+	res = cuCtxSetCurrent(input_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &err_string);
+		rte_gpu_log(ERR,
+			"cuda_dev_info_get cuCtxSetCurrent input failed with %s.",
+			err_string);
+		return -1;
+	}
+
+	/* Ctx capacity info (MPS compatible) */
+	res = cuCtxGetExecAffinity(&affinityPrm, CU_EXEC_AFFINITY_TYPE_SM_COUNT);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &err_string);
+		rte_gpu_log(ERR, "cuCtxGetExecAffinity failed with %s.", err_string);
+		/* affinityPrm is not valid: do not go on with garbage */
+		ret = -1;
+		goto restore_ctx;
+	}
+
+	if (rte_gpu_info_get(dev->mpshared->info.parent, &parent_info) != 0) {
+		ret = -ENODEV;
+		goto restore_ctx;
+	}
+
+	/* GPU Device private info */
+	private = rte_zmalloc(NULL, sizeof(*private), RTE_CACHE_LINE_SIZE);
+	if (private == NULL) {
+		rte_gpu_log(ERR, "Failed to allocate memory for GPU process private.");
+		ret = -1;
+		goto restore_ctx;
+	}
+
+	res = cuCtxGetDevice(&private->cu_dev);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &err_string);
+		rte_gpu_log(ERR, "cuCtxGetDevice failed with %s.", err_string);
+		ret = -1;
+		goto restore_ctx;
+	}
+
+	res = cuDeviceGetName(private->gpu_name, RTE_DEV_NAME_MAX_LEN,
+			private->cu_dev);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &err_string);
+		rte_gpu_log(ERR, "cuDeviceGetName failed with %s.", err_string);
+		ret = -1;
+	}
+
+restore_ctx:
+	/* Restore original ctx as current ctx, on success and on failure */
+	res = cuCtxSetCurrent(current_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &err_string);
+		rte_gpu_log(ERR,
+			"cuda_dev_info_get cuCtxSetCurrent current failed with %s.",
+			err_string);
+		if (ret == 0)
+			ret = -1;
+	}
+
+	if (ret != 0) {
+		/* rte_free(NULL) is a no-op */
+		rte_free(private);
+		return ret;
+	}
+
+	/* Publish the new state only once it is complete */
+	dev->mpshared->info.processor_count =
+		(uint32_t)affinityPrm.param.smCount.val;
+	dev->mpshared->info.total_memory = parent_info.total_memory;
+	dev->mpshared->dev_private = private;
+
+	return 0;
+}
+
+/*
+ * gpudev dev_info_get callback: copy the device information into *info.
+ *
+ * A child device that has no private data yet is initialized first
+ * (this is probably the call issued by rte_gpu_add_child()).
+ *
+ * Returns 0 on success, -EINVAL on NULL arguments, or the error of the
+ * child initialization.
+ */
+static int
+cuda_dev_info_get(struct rte_gpu *dev, struct rte_gpu_info *info)
+{
+	int ret;
+
+	if (dev == NULL || info == NULL)
+		return -EINVAL;
+
+	if (dev->mpshared->info.parent != RTE_GPU_ID_NONE &&
+			dev->mpshared->dev_private == NULL) {
+		ret = cuda_child_info_init(dev);
+		if (ret != 0)
+			return ret;
+	}
+
+	*info = dev->mpshared->info;
+
+	return 0;
+}
+
+/*
+ * GPU Memory
+ */
+
+/*
+ * gpudev mem_alloc callback: allocate size bytes of device memory in the
+ * CUDA context of dev and track the allocation in the driver memory list.
+ *
+ * The device context is made current only for the duration of the CUDA
+ * calls; the context that was current on entry is restored on every path
+ * after the switch. The list entry is created last, so a failure never
+ * leaves a half-initialized entry in the list nor leaks GPU memory.
+ *
+ * On success *ptr receives the device address and 0 is returned.
+ * Returns -EINVAL on invalid arguments, -ENOMEM when the list entry cannot
+ * be allocated, -1 on CUDA errors.
+ */
+static int
+cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
+{
+	CUresult res;
+	const char *err_string;
+	CUcontext current_ctx;
+	CUcontext input_ctx;
+	CUdeviceptr ptr_d = 0;
+	struct mem_entry *entry;
+	unsigned int flag = 1;
+	int ret = 0;
+
+	if (dev == NULL || size == 0 || ptr == NULL)
+		return -EINVAL;
+
+	/* Store current ctx */
+	res = cuCtxGetCurrent(&current_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &err_string);
+		rte_gpu_log(ERR, "cuCtxGetCurrent failed with %s.", err_string);
+		return -1;
+	}
+
+	/* Set child ctx as current ctx */
+	input_ctx = (CUcontext)(uintptr_t)dev->mpshared->info.context;
+	res = cuCtxSetCurrent(input_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &err_string);
+		rte_gpu_log(ERR,
+			"cuda_mem_alloc cuCtxSetCurrent input failed with %s.",
+			err_string);
+		return -1;
+	}
+
+	/* Allocate memory */
+	res = cuMemAlloc(&ptr_d, size);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &err_string);
+		rte_gpu_log(ERR, "cuda_mem_alloc cuMemAlloc failed with %s.",
+			err_string);
+		ret = -1;
+		goto restore_ctx;
+	}
+
+	/* GPUDirect RDMA attribute required */
+	res = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, ptr_d);
+	if (res != CUDA_SUCCESS) {
+		rte_gpu_log(ERR,
+			"Could not set SYNC MEMOP attribute for GPU memory at %llx , err %d",
+			(unsigned long long)ptr_d, res);
+		cuMemFree(ptr_d);
+		ret = -1;
+	}
+
+restore_ctx:
+	/* Restore original ctx as current ctx */
+	res = cuCtxSetCurrent(current_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &err_string);
+		rte_gpu_log(ERR,
+			"cuda_mem_alloc cuCtxSetCurrent current failed with %s.",
+			err_string);
+		if (ret == 0) {
+			/* The caller never sees the pointer: do not leak it */
+			cuMemFree(ptr_d);
+			ret = -1;
+		}
+	}
+	if (ret != 0)
+		return ret;
+
+	/* Track the allocation; use a local so the list tail is never lost */
+	entry = mem_list_add_item();
+	if (entry == NULL) {
+		cuMemFree(ptr_d);
+		return -ENOMEM;
+	}
+
+	entry->ptr_d = ptr_d;
+	entry->ptr_h = NULL;
+	entry->size = size;
+	entry->dev = dev;
+	entry->ctx = input_ctx;
+	entry->pkey = get_hash_from_ptr((void *)(uintptr_t)ptr_d);
+	entry->mtype = GPU_MEM;
+
+	*ptr = (void *)(uintptr_t)ptr_d;
+
+	return 0;
+}
+
+/*
+ * gpudev mem_register callback: register size bytes of host memory at ptr
+ * with CUDA, so that the device can access it, and track the registration
+ * in the driver memory list.
+ *
+ * The device context is made current only for the duration of the CUDA
+ * calls and the context that was current on entry is restored on every
+ * path after the switch. The registration is rolled back on any failure
+ * and the list entry is created last, so no partial state is left behind.
+ *
+ * Returns 0 on success, -EINVAL on invalid arguments, -ENODEV when the
+ * device has no private data, -ENOMEM when the list entry cannot be
+ * allocated, -1 on CUDA errors.
+ */
+static int
+cuda_mem_register(struct rte_gpu *dev, size_t size, void *ptr)
+{
+	CUresult res;
+	const char *err_string;
+	CUcontext current_ctx;
+	CUcontext input_ctx;
+	CUdeviceptr ptr_d = 0;
+	struct cuda_info *private;
+	struct mem_entry *entry;
+	unsigned int flag = 1;
+	int use_ptr_h = 0;
+	int ret = 0;
+
+	if (dev == NULL || size == 0 || ptr == NULL)
+		return -EINVAL;
+
+	/* cu_dev is read from the private data below: it must exist */
+	private = (struct cuda_info *)dev->mpshared->dev_private;
+	if (private == NULL) {
+		rte_gpu_log(ERR, "cuda_mem_register device private data not initialized.");
+		return -ENODEV;
+	}
+
+	/* Store current ctx */
+	res = cuCtxGetCurrent(&current_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &err_string);
+		rte_gpu_log(ERR, "cuCtxGetCurrent failed with %s.", err_string);
+		return -1;
+	}
+
+	/* Set child ctx as current ctx */
+	input_ctx = (CUcontext)(uintptr_t)dev->mpshared->info.context;
+	res = cuCtxSetCurrent(input_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &err_string);
+		rte_gpu_log(ERR,
+			"cuda_mem_register cuCtxSetCurrent input failed with %s.",
+			err_string);
+		return -1;
+	}
+
+	res = cuMemHostRegister(ptr, size,
+			CU_MEMHOSTREGISTER_PORTABLE | CU_MEMHOSTREGISTER_DEVICEMAP);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &err_string);
+		rte_gpu_log(ERR,
+			"cuda_mem_register cuMemHostRegister failed with %s ptr %p size %zd.",
+			err_string, ptr, size);
+		ret = -1;
+		goto restore_ctx;
+	}
+
+	res = cuDeviceGetAttribute(&use_ptr_h,
+			CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM,
+			private->cu_dev);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &err_string);
+		rte_gpu_log(ERR, "cuDeviceGetAttribute failed with %s.", err_string);
+		goto unregister;
+	}
+
+	if (use_ptr_h == 0) {
+		res = cuMemHostGetDevicePointer(&ptr_d, ptr, 0);
+		if (res != CUDA_SUCCESS) {
+			cuGetErrorString(res, &err_string);
+			rte_gpu_log(ERR, "cuMemHostGetDevicePointer failed with %s.",
+				err_string);
+			goto unregister;
+		}
+
+		/* The list key is the host address: both views must match */
+		if ((uintptr_t)ptr_d != (uintptr_t)ptr) {
+			rte_gpu_log(ERR,
+				"Host input pointer is different wrt GPU registered pointer");
+			goto unregister;
+		}
+	} else {
+		ptr_d = (CUdeviceptr)(uintptr_t)ptr;
+	}
+
+	/* GPUDirect RDMA attribute required */
+	res = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, ptr_d);
+	if (res != CUDA_SUCCESS) {
+		rte_gpu_log(ERR,
+			"Could not set SYNC MEMOP attribute for GPU memory at %llx , err %d",
+			(unsigned long long)ptr_d, res);
+		goto unregister;
+	}
+
+	goto restore_ctx;
+
+unregister:
+	cuMemHostUnregister(ptr);
+	ret = -1;
+
+restore_ctx:
+	/* Restore original ctx as current ctx */
+	res = cuCtxSetCurrent(current_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &err_string);
+		rte_gpu_log(ERR,
+			"cuda_mem_register cuCtxSetCurrent current failed with %s.",
+			err_string);
+		if (ret == 0) {
+			cuMemHostUnregister(ptr);
+			ret = -1;
+		}
+	}
+	if (ret != 0)
+		return ret;
+
+	/* Track the registration; use a local so the list tail is never lost */
+	entry = mem_list_add_item();
+	if (entry == NULL) {
+		cuMemHostUnregister(ptr);
+		return -ENOMEM;
+	}
+
+	entry->ptr_d = ptr_d;
+	entry->ptr_h = ptr;
+	entry->size = size;
+	entry->dev = dev;
+	entry->ctx = input_ctx;
+	entry->pkey = get_hash_from_ptr(ptr);
+	entry->mtype = CPU_REGISTERED;
+
+	return 0;
+}
+
+/*
+ * gpudev mem_free callback: release device memory previously returned by
+ * cuda_mem_alloc() and drop it from the driver memory list.
+ *
+ * Returns 0 on success, -EINVAL on NULL arguments or if the list entry
+ * cannot be removed, -1 when ptr is unknown, is not GPU memory, or
+ * cuMemFree() fails.
+ */
+static int
+cuda_mem_free(struct rte_gpu *dev, void *ptr)
+{
+	CUresult res;
+	struct mem_entry *mem_item;
+	const char *err_string;
+	ptr_key hk;
+
+	if (dev == NULL || ptr == NULL)
+		return -EINVAL;
+
+	hk = get_hash_from_ptr(ptr);
+
+	mem_item = mem_list_find_item(hk);
+	if (mem_item == NULL) {
+		/* No explicit 0x: %p output usually carries the prefix already */
+		rte_gpu_log(ERR, "Memory address %p not found in driver memory", ptr);
+		return -1;
+	}
+
+	/* Only memory allocated by this driver can be freed here */
+	if (mem_item->mtype != GPU_MEM) {
+		rte_gpu_log(ERR, "Memory type %d not supported", mem_item->mtype);
+		return -1;
+	}
+
+	res = cuMemFree(mem_item->ptr_d);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &err_string);
+		rte_gpu_log(ERR, "cuMemFree current failed with %s.", err_string);
+		return -1;
+	}
+
+	return mem_list_del_item(hk);
+}
+
+/*
+ * gpudev mem_unregister callback: unregister host memory previously
+ * registered by cuda_mem_register() and drop it from the driver memory list.
+ *
+ * Returns 0 on success, -EINVAL on NULL arguments or if the list entry
+ * cannot be removed, -1 when ptr is unknown, is not registered CPU memory,
+ * or cuMemHostUnregister() fails.
+ */
+static int
+cuda_mem_unregister(struct rte_gpu *dev, void *ptr)
+{
+	CUresult res;
+	struct mem_entry *mem_item;
+	const char *err_string;
+	ptr_key hk;
+
+	if (dev == NULL || ptr == NULL)
+		return -EINVAL;
+
+	hk = get_hash_from_ptr(ptr);
+
+	mem_item = mem_list_find_item(hk);
+	if (mem_item == NULL) {
+		/* No explicit 0x: %p output usually carries the prefix already */
+		rte_gpu_log(ERR, "Memory address %p not found in driver memory", ptr);
+		return -1;
+	}
+
+	/* Only memory registered by this driver can be unregistered here */
+	if (mem_item->mtype != CPU_REGISTERED) {
+		rte_gpu_log(ERR, "Memory type %d not supported", mem_item->mtype);
+		return -1;
+	}
+
+	res = cuMemHostUnregister(ptr);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &err_string);
+		rte_gpu_log(ERR, "cuMemHostUnregister current failed with %s.",
+			err_string);
+		return -1;
+	}
+
+	return mem_list_del_item(hk);
+}
+
+/*
+ * gpudev dev_close callback: free the driver private data of the device.
+ * Returns 0 on success, -EINVAL when dev is NULL.
+ */
+static int
+cuda_dev_close(struct rte_gpu *dev)
+{
+	if (dev != NULL) {
+		rte_free(dev->mpshared->dev_private);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * PCI probe callback: create the gpudev device for an NVIDIA GPU.
+ *
+ * Initializes the CUDA driver, retains the primary context of the GPU
+ * found at the PCI address, fills the generic device info (NUMA node,
+ * context, processor count, total memory), allocates the private data and
+ * installs the driver callbacks.
+ *
+ * Every resource acquired here is released again when a later step fails.
+ *
+ * Returns 0 on success, -EINVAL when pci_dev is NULL, -ENODEV when the
+ * gpudev device cannot be allocated, -1 on any other failure.
+ */
+static int
+cuda_gpu_probe(__rte_unused struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
+{
+	struct rte_gpu *dev = NULL;
+	CUresult res;
+	CUdevice cu_dev_id;
+	CUcontext pctx;
+	const char *err_string;
+	int processor_count = 0;
+	struct cuda_info *private;
+
+	if (pci_dev == NULL) {
+		rte_gpu_log(ERR, "NULL PCI device");
+		return -EINVAL;
+	}
+
+	/* Allocate memory to be used privately by drivers */
+	dev = rte_gpu_allocate(pci_dev->device.name);
+	if (dev == NULL)
+		return -ENODEV;
+
+	/* Fill HW specific part of device structure */
+	dev->device = &pci_dev->device;
+	dev->mpshared->info.numa_node = pci_dev->device.numa_node;
+
+	/*
+	 * Required to initialize the CUDA Driver.
+	 * Multiple calls of cuInit() will return immediately
+	 * without making any relevant change
+	 */
+	res = cuInit(0);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &err_string);
+		rte_gpu_log(ERR, "cuInit failed with %d: %s.", res, err_string);
+		goto err_release_dev;
+	}
+
+	/* Get NVIDIA GPU Device descriptor */
+	res = cuDeviceGetByPCIBusId(&cu_dev_id, dev->device->name);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &err_string);
+		rte_gpu_log(ERR, "cuDeviceGetByPCIBusId name %s failed with %d: %s.",
+			dev->device->name, res, err_string);
+		goto err_release_dev;
+	}
+
+	res = cuDevicePrimaryCtxRetain(&pctx, cu_dev_id);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &err_string);
+		rte_gpu_log(ERR, "cuDevicePrimaryCtxRetain name %s failed with %d: %s.",
+			dev->device->name, res, err_string);
+		goto err_release_dev;
+	}
+
+	dev->mpshared->info.context = (uint64_t)(uintptr_t)pctx;
+
+	/* Processor count */
+	res = cuDeviceGetAttribute(&processor_count,
+			CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cu_dev_id);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &err_string);
+		rte_gpu_log(ERR, "cuDeviceGetAttribute failed with %s.", err_string);
+		goto err_release_ctx;
+	}
+	dev->mpshared->info.processor_count = (uint32_t)processor_count;
+
+	/* Total memory */
+	res = cuDeviceTotalMem(&dev->mpshared->info.total_memory, cu_dev_id);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &err_string);
+		rte_gpu_log(ERR, "cuDeviceTotalMem failed with %s.", err_string);
+		goto err_release_ctx;
+	}
+
+	/* GPU Device private info */
+	private = rte_zmalloc(NULL, sizeof(*private), RTE_CACHE_LINE_SIZE);
+	if (private == NULL) {
+		rte_gpu_log(ERR, "Failed to allocate memory for GPU process private.");
+		goto err_release_ctx;
+	}
+
+	private->cu_dev = cu_dev_id;
+	res = cuDeviceGetName(private->gpu_name, RTE_DEV_NAME_MAX_LEN, cu_dev_id);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &err_string);
+		rte_gpu_log(ERR, "cuDeviceGetName failed with %s.", err_string);
+		goto err_free_private;
+	}
+	dev->mpshared->dev_private = private;
+
+	dev->ops.mem_alloc = cuda_mem_alloc;
+	dev->ops.mem_free = cuda_mem_free;
+	dev->ops.mem_register = cuda_mem_register;
+	dev->ops.mem_unregister = cuda_mem_unregister;
+	dev->ops.dev_info_get = cuda_dev_info_get;
+	dev->ops.dev_close = cuda_dev_close;
+
+	rte_gpu_complete_new(dev);
+
+	rte_gpu_log_debug("dev id = %u name = %s",
+		dev->mpshared->info.dev_id, private->gpu_name);
+
+	return 0;
+
+	/* Unwind in reverse order of acquisition */
+err_free_private:
+	rte_free(private);
+err_release_ctx:
+	cuDevicePrimaryCtxRelease(cu_dev_id);
+err_release_dev:
+	/* NOTE(review): assumes rte_gpu_release() accepts a not yet completed device */
+	rte_gpu_release(dev);
+	return -1;
+}
+
+/*
+ * PCI remove callback: release the gpudev device bound to pci_dev.
+ *
+ * A failure of rte_gpu_release() is only logged; the function still
+ * returns 0 in that case.
+ *
+ * Returns -EINVAL when pci_dev is NULL, -ENODEV when no gpudev device has
+ * the name of the PCI device, 0 otherwise.
+ */
+static int
+cuda_gpu_remove(struct rte_pci_device *pci_dev)
+{
+	struct rte_gpu *gpu;
+	uint8_t id;
+	int rc;
+
+	if (pci_dev == NULL)
+		return -EINVAL;
+
+	gpu = rte_gpu_get_by_name(pci_dev->device.name);
+	if (gpu == NULL) {
+		rte_gpu_log(ERR,
+				"Couldn't find HW dev \"%s\" to uninitialise it",
+				pci_dev->device.name);
+		return -ENODEV;
+	}
+
+	/* Read the id first: it is still logged after the release below */
+	id = gpu->mpshared->info.dev_id;
+
+	/* release dev from library */
+	rc = rte_gpu_release(gpu);
+	if (rc != 0)
+		rte_gpu_log(ERR, "Device %i failed to uninit: %i", id, rc);
+
+	rte_gpu_log_debug("Destroyed dev = %u", id);
+
+	return 0;
+}
+
+/* PCI driver descriptor tying the probe/remove callbacks to the ID table */
+static struct rte_pci_driver rte_cuda_driver = {
+	.probe = cuda_gpu_probe,
+	.remove = cuda_gpu_remove,
+	.id_table = pci_id_cuda_map,
+	.drv_flags = RTE_PCI_DRV_WC_ACTIVATE,
+};
+
+/* Register the driver, its PCI ID table and its kernel module dependencies */
+RTE_PMD_REGISTER_PCI(gpu_cuda, rte_cuda_driver);
+RTE_PMD_REGISTER_PCI_TABLE(gpu_cuda, pci_id_cuda_map);
+RTE_PMD_REGISTER_KMOD_DEP(gpu_cuda, "* nvidia & (nv_peer_mem | nvpeer_mem)");
+
diff --git a/drivers/gpu/cuda/meson.build b/drivers/gpu/cuda/meson.build
new file mode 100644
index 0000000000..53e40e6832
--- /dev/null
+++ b/drivers/gpu/cuda/meson.build
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright (c) 2021 NVIDIA Corporation & Affiliates
+
+if not is_linux
+        build = false
+        reason = 'only supported on Linux'
+        subdir_done()
+endif
+
+# Only the CUDA Driver API (libcuda) is used; the CUDA Runtime is not needed.
+# The dependency is optional: without it the driver is skipped instead of
+# making the whole configuration fail.
+cuda_dep = dependency('cuda', version : '>=11', modules: ['cuda'], required: false)
+if not cuda_dep.found()
+        build = false
+        reason = 'missing dependency, "cuda"'
+        subdir_done()
+endif
+ext_deps += cuda_dep
+
+deps += ['gpudev', 'pci', 'bus_pci']
+sources = files('cuda.c')
diff --git a/drivers/gpu/cuda/version.map b/drivers/gpu/cuda/version.map
new file mode 100644
index 0000000000..4a76d1d52d
--- /dev/null
+++ b/drivers/gpu/cuda/version.map
@@ -0,0 +1,3 @@
+DPDK_21 {
+	local: *;
+};
diff --git a/drivers/gpu/meson.build b/drivers/gpu/meson.build
index e51ad3381b..601bedcd61 100644
--- a/drivers/gpu/meson.build
+++ b/drivers/gpu/meson.build
@@ -1,4 +1,4 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright (c) 2021 NVIDIA Corporation & Affiliates
 
-drivers = []
+drivers = [ 'cuda' ]
-- 
2.17.1
^ permalink raw reply related	[flat|nested] 28+ messages in thread
* Re: [dpdk-dev] [PATCH v2 1/1] gpu/cuda: introduce CUDA driver
  2021-11-04  2:01   ` [dpdk-dev] [PATCH v2 1/1] " eagostini
@ 2021-11-03 18:15     ` Stephen Hemminger
  2021-11-08 18:35     ` Stephen Hemminger
  1 sibling, 0 replies; 28+ messages in thread
From: Stephen Hemminger @ 2021-11-03 18:15 UTC (permalink / raw)
  To: eagostini; +Cc: dev
On Thu, 4 Nov 2021 02:01:28 +0000
<eagostini@nvidia.com> wrote:
> +
> +#define CUDA_MAX_ALLOCATION_NUM 512
> +
> +#define GPU_PAGE_SHIFT 16
> +#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
> +
> +RTE_LOG_REGISTER_DEFAULT(gpu_logtype, NOTICE);
Use static before this RTE_LOG_REGISTER_DEFAULT.
You don't need to export gpu_log_type as global do you?
> +struct mem_entry *mem_alloc_list_head = NULL;
> +struct mem_entry *mem_alloc_list_tail = NULL;
> +uint32_t mem_alloc_list_last_elem = 0;
These should be static since specific to this driver.
^ permalink raw reply	[flat|nested] 28+ messages in thread
* [dpdk-dev] [PATCH v2 0/1] gpu/cuda: introduce CUDA driver
  2021-10-05 22:49 [dpdk-dev] [RFC PATCH] gpu/cuda: introduce CUDA driver eagostini
@ 2021-11-04  2:01 ` eagostini
  2021-11-04  2:01   ` [dpdk-dev] [PATCH v2 1/1] " eagostini
  2021-11-08 19:02 ` [dpdk-dev] [RFC PATCH] " Stephen Hemminger
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 28+ messages in thread
From: eagostini @ 2021-11-04  2:01 UTC (permalink / raw)
  To: dev; +Cc: Elena Agostini
From: Elena Agostini <eagostini@nvidia.com>
This is the CUDA implementation of the gpudev library.
Functionalities implemented through the CUDA Driver API are:
- Device probe and remove
- Manage device memory allocations
- Register/unregister external CPU memory in the device memory area
Changelog:
- Checkpatch validation
- GPU CUDA driver docs
Elena Agostini (1):
  gpu/cuda: introduce CUDA driver
 doc/guides/gpus/cuda.rst     |  91 +++++
 doc/guides/gpus/index.rst    |   1 +
 drivers/gpu/cuda/cuda.c      | 716 +++++++++++++++++++++++++++++++++++
 drivers/gpu/cuda/meson.build |  30 ++
 drivers/gpu/cuda/version.map |   3 +
 drivers/gpu/meson.build      |   2 +-
 6 files changed, 842 insertions(+), 1 deletion(-)
 create mode 100644 doc/guides/gpus/cuda.rst
 create mode 100644 drivers/gpu/cuda/cuda.c
 create mode 100644 drivers/gpu/cuda/meson.build
 create mode 100644 drivers/gpu/cuda/version.map
-- 
2.17.1
^ permalink raw reply	[flat|nested] 28+ messages in thread
* [dpdk-dev] [PATCH v2 1/1] gpu/cuda: introduce CUDA driver
  2021-11-04  2:01 ` [dpdk-dev] [PATCH v2 0/1] " eagostini
@ 2021-11-04  2:01   ` eagostini
  2021-11-03 18:15     ` Stephen Hemminger
  2021-11-08 18:35     ` Stephen Hemminger
  0 siblings, 2 replies; 28+ messages in thread
From: eagostini @ 2021-11-04  2:01 UTC (permalink / raw)
  To: dev; +Cc: Elena Agostini
From: Elena Agostini <eagostini@nvidia.com>
This is the CUDA implementation of the gpudev library.
Functionalities implemented through the CUDA Driver API are:
- Device probe and remove
- Manage device memory allocations
- Register/unregister external CPU memory in the device memory area
Signed-off-by: Elena Agostini <eagostini@nvidia.com>
---
 doc/guides/gpus/cuda.rst     |  91 +++++
 doc/guides/gpus/index.rst    |   1 +
 drivers/gpu/cuda/cuda.c      | 716 +++++++++++++++++++++++++++++++++++
 drivers/gpu/cuda/meson.build |  30 ++
 drivers/gpu/cuda/version.map |   3 +
 drivers/gpu/meson.build      |   2 +-
 6 files changed, 842 insertions(+), 1 deletion(-)
 create mode 100644 doc/guides/gpus/cuda.rst
 create mode 100644 drivers/gpu/cuda/cuda.c
 create mode 100644 drivers/gpu/cuda/meson.build
 create mode 100644 drivers/gpu/cuda/version.map
diff --git a/doc/guides/gpus/cuda.rst b/doc/guides/gpus/cuda.rst
new file mode 100644
index 0000000000..dec2f92a3e
--- /dev/null
+++ b/doc/guides/gpus/cuda.rst
@@ -0,0 +1,91 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright (c) 2021 NVIDIA Corporation & Affiliates
+
+CUDA GPU driver
+===============
+
+The CUDA GPU driver library (**librte_gpu_cuda**) provides support for NVIDIA GPUs.
+Information and documentation about these devices can be found on the
+`NVIDIA website <http://www.nvidia.com>`__. Help is also provided by the
+`NVIDIA CUDA Toolkit developer zone <https://docs.nvidia.com/cuda>`__.
+
+Design
+------
+
+**librte_gpu_cuda** relies on CUDA Driver API (no need for CUDA Runtime API).
+
+Goal of this driver library is not to provide a wrapper for the whole CUDA Driver API.
+Instead, the scope is to implement the generic features of gpudev API.
+For a CUDA application, integrating the gpudev library functions using the CUDA driver library
+is quite straightforward and doesn't create any compatibility problem.
+
+Initialization
+~~~~~~~~~~~~~~
+
+During initialization, CUDA driver library detects NVIDIA physical GPUs on the
+system or specified via EAL device options (e.g. ``-a b6:00.0``).
+The driver initializes the CUDA driver environment through ``cuInit(0)`` function.
+For this reason, it's required to set any CUDA environment configuration before
+calling ``rte_eal_init`` function in the DPDK application.
+
+If the CUDA driver environment has already been initialized, the ``cuInit(0)``
+in CUDA driver library has no effect.
+
+CUDA Driver sub-contexts
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+After initialization, a CUDA application can create multiple sub-contexts on GPU
+physical devices. Through the gpudev library, it is possible to register these sub-contexts
+in the CUDA driver library as child devices having as parent a GPU physical device.
+
+CUDA driver library also supports `MPS <https://docs.nvidia.com/deploy/pdf/CUDA_Multi_Process_Service_Overview.pdf>`__.
+
+GPU memory management
+~~~~~~~~~~~~~~~~~~~~~
+
+The CUDA driver library maintains a table of GPU memory addresses allocated
+and CPU memory addresses registered associated to the input CUDA context.
+Whenever the application tries to deallocate or deregister a memory address,
+if the address is not in the table the CUDA driver library will return an error.
+
+Features
+--------
+
+- Register new child devices aka new CUDA Driver contexts
+- Allocate memory on the GPU
+- Register CPU memory to make it visible from GPU
+
+Minimal requirements
+--------------------
+
+Minimal requirements to enable the CUDA driver library are:
+
+- NVIDIA GPU Ampere or Volta
+- CUDA 11.4 Driver API or newer
+
+`GPUDirect RDMA Technology <https://docs.nvidia.com/cuda/gpudirect-rdma/index.html>`__
+allows compatible network cards (e.g. Mellanox) to directly send and receive packets
+using GPU memory instead of additional memory copies through the CPU system memory.
+To enable this technology, system requirements are:
+
+- `nvidia-peermem <https://docs.nvidia.com/cuda/gpudirect-rdma/index.html#nvidia-peermem>`__ module running on the system
+- Mellanox Network card ConnectX-5 or newer (BlueField models included)
+- DPDK mlx5 PMD enabled
+- To reach the best performance, a PCIe switch between GPU and NIC is recommended
+
+Limitations
+-----------
+
+Supported only on Linux.
+
+Supported GPUs
+--------------
+
+The following NVIDIA GPU devices are supported by this CUDA driver:
+
+- NVIDIA A100 80GB PCIe
+- NVIDIA A100 40GB PCIe
+- NVIDIA A30 24GB
+- NVIDIA A10 24GB
+- NVIDIA V100 32GB PCIe
+- NVIDIA V100 16GB PCIe
diff --git a/doc/guides/gpus/index.rst b/doc/guides/gpus/index.rst
index 1878423239..4b7a420556 100644
--- a/doc/guides/gpus/index.rst
+++ b/doc/guides/gpus/index.rst
@@ -9,3 +9,4 @@ General-Purpose Graphics Processing Unit Drivers
    :numbered:
 
    overview
+   cuda
diff --git a/drivers/gpu/cuda/cuda.c b/drivers/gpu/cuda/cuda.c
new file mode 100644
index 0000000000..e3f8c1c509
--- /dev/null
+++ b/drivers/gpu/cuda/cuda.c
@@ -0,0 +1,716 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2021 NVIDIA Corporation & Affiliates
+ */
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_malloc.h>
+#include <rte_errno.h>
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_byteorder.h>
+#include <rte_dev.h>
+
+#include <gpudev_driver.h>
+#include <cuda.h>
+
+/* NVIDIA GPU vendor: PCI vendor ID used to build the driver's PCI ID table */
+#define NVIDIA_GPU_VENDOR_ID (0x10de)
+
+/* NVIDIA GPU device IDs: PCI device IDs, one per GPU model named in the macro */
+#define NVIDIA_GPU_A100_40GB_DEVICE_ID (0x20f1)
+#define NVIDIA_GPU_A100_80GB_DEVICE_ID (0x20b5)
+
+#define NVIDIA_GPU_A30_24GB_DEVICE_ID (0x20b7)
+#define NVIDIA_GPU_A10_24GB_DEVICE_ID (0x2236)
+
+#define NVIDIA_GPU_V100_32GB_DEVICE_ID (0x1db6)
+#define NVIDIA_GPU_V100_16GB_DEVICE_ID (0x1db4)
+
+/*
+ * NOTE(review): presumably the intended upper bound on tracked allocations;
+ * it is not referenced anywhere else in this version of the file.
+ */
+#define CUDA_MAX_ALLOCATION_NUM 512
+
+/*
+ * GPU page size expressed as a shift and as a byte count (1 << 16 = 64 KiB).
+ * NOTE(review): neither macro is referenced elsewhere in this version of
+ * the file.
+ */
+#define GPU_PAGE_SHIFT 16
+#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
+
+/* Register the log type used by this driver, with NOTICE passed as the level */
+RTE_LOG_REGISTER_DEFAULT(gpu_logtype, NOTICE);
+
+/**
+ * Helper macro for logging.
+ *
+ * Forwards to rte_log() with the driver log type. A newline is appended to
+ * fmt by the macro itself, so callers do not need to end their format
+ * string with "\n".
+ */
+#define rte_gpu_log(level, fmt, ...) \
+	rte_log(RTE_LOG_ ## level, gpu_logtype, fmt "\n", ##__VA_ARGS__)
+
+/*
+ * Debug-level variant of rte_gpu_log() that prefixes the message with the
+ * source line number and the name of the calling function.
+ */
+#define rte_gpu_log_debug(fmt, ...) \
+	rte_gpu_log(DEBUG, RTE_STR(__LINE__) ":%s() " fmt, __func__, \
+		##__VA_ARGS__)
+
+/*
+ * PCI IDs of the NVIDIA GPUs handled by this driver.
+ *
+ * Every device ID defined above is listed, matching the "Supported GPUs"
+ * section of the driver guide. The table must end with an all-zero entry:
+ * the PCI bus walks it until it meets a zero vendor ID, so without the
+ * sentinel the walk would run past the end of the array.
+ */
+static struct rte_pci_id pci_id_cuda_map[] = {
+	{
+		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
+				NVIDIA_GPU_A100_40GB_DEVICE_ID)
+	},
+	{
+		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
+				NVIDIA_GPU_A100_80GB_DEVICE_ID)
+	},
+	{
+		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
+				NVIDIA_GPU_A30_24GB_DEVICE_ID)
+	},
+	{
+		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
+				NVIDIA_GPU_A10_24GB_DEVICE_ID)
+	},
+	{
+		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
+				NVIDIA_GPU_V100_32GB_DEVICE_ID)
+	},
+	{
+		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
+				NVIDIA_GPU_V100_16GB_DEVICE_ID)
+	},
+	{
+		/* sentinel: terminates the table */
+		.vendor_id = 0
+	},
+};
+
+/*
+ * Device private info.
+ * One instance is allocated per gpudev device and stored in
+ * dev->mpshared->dev_private.
+ */
+struct cuda_info {
+	/* Device name as reported by cuDeviceGetName() */
+	char gpu_name[RTE_DEV_NAME_MAX_LEN];
+	/* CUDA device handle this gpudev device is bound to */
+	CUdevice cu_dev;
+};
+
+/* Type of memory allocated by CUDA driver; stored in every memory list entry */
+enum mem_type {
+	GPU_MEM = 0, /* device memory obtained with cuMemAlloc() */
+	CPU_REGISTERED, /* host memory registered with cuMemHostRegister() */
+	GPU_REGISTERED /* Not used yet */
+};
+
+/*
+ * key associated to a memory address.
+ * It is the address itself converted to an integer (see get_hash_from_ptr()).
+ */
+typedef uintptr_t ptr_key;
+
+/* Single entry of the memory list (doubly linked) */
+struct mem_entry {
+	/* Device-side address of the area */
+	CUdeviceptr ptr_d;
+	/* Host address for registered CPU memory, NULL for GPU allocations */
+	void *ptr_h;
+	/* Size in bytes requested by the caller */
+	size_t size;
+	/* gpudev device the area was allocated/registered through */
+	struct rte_gpu *dev;
+	/* CUDA context taken from the device info at allocation time */
+	CUcontext ctx;
+	/* Lookup key: device address for GPU_MEM, host address for CPU_REGISTERED */
+	ptr_key pkey;
+	/* Kind of memory tracked by this entry */
+	enum mem_type mtype;
+	/* Neighbours in the list; NULL at the respective end */
+	struct mem_entry *prev;
+	struct mem_entry *next;
+};
+
+/*
+ * State of the memory list: first entry, last entry and number of entries.
+ * These are private to this file, so they get internal linkage to keep them
+ * out of the global symbol namespace.
+ */
+static struct mem_entry *mem_alloc_list_head;
+static struct mem_entry *mem_alloc_list_tail;
+static uint32_t mem_alloc_list_last_elem;
+
+/*
+ * Derive the memory list lookup key for an address: the key is the pointer
+ * value converted to an integer.
+ */
+static ptr_key
+get_hash_from_ptr(void *ptr)
+{
+	ptr_key key = (uintptr_t) ptr;
+
+	return key;
+}
+
+/* Report how many entries the memory list currently holds. */
+static uint32_t
+mem_list_count_item(void)
+{
+	uint32_t nb_items = mem_alloc_list_last_elem;
+
+	return nb_items;
+}
+
+/*
+ * Append a new zero-initialised entry at the tail of the memory list,
+ * starting the list when it is still empty.
+ * Returns the new tail entry, or NULL when the entry cannot be allocated.
+ */
+static struct mem_entry *
+mem_list_add_item(void)
+{
+	struct mem_entry *node;
+
+	node = rte_zmalloc(NULL, sizeof(struct mem_entry), RTE_CACHE_LINE_SIZE);
+	if (node == NULL) {
+		rte_gpu_log(ERR, "Failed to allocate memory for memory list.\n");
+		return NULL;
+	}
+
+	node->next = NULL;
+	if (mem_alloc_list_head == NULL) {
+		/* First entry: it is both head and tail */
+		node->prev = NULL;
+		mem_alloc_list_head = node;
+	} else {
+		/* Link behind the current tail */
+		node->prev = mem_alloc_list_tail;
+		mem_alloc_list_tail->next = node;
+	}
+	mem_alloc_list_tail = node;
+
+	mem_alloc_list_last_elem++;
+
+	return mem_alloc_list_tail;
+}
+
+/*
+ * Walk the memory list from the head and return the entry whose key equals
+ * pk. Returns NULL when the list does not exist, holds no items, or has no
+ * entry with that key.
+ */
+static struct mem_entry *
+mem_list_find_item(ptr_key pk)
+{
+	struct mem_entry *it;
+
+	if (mem_alloc_list_head == NULL) {
+		rte_gpu_log(ERR, "Memory list doesn't exist\n");
+		return NULL;
+	}
+
+	if (mem_list_count_item() == 0) {
+		rte_gpu_log(ERR, "No items in memory list\n");
+		return NULL;
+	}
+
+	for (it = mem_alloc_list_head; it != NULL; it = it->next) {
+		if (it->pkey == pk)
+			break;
+	}
+
+	/* NULL here means the whole list was scanned without a match */
+	return it;
+}
+
+/*
+ * Unlink and free the memory list entry whose key equals pk.
+ *
+ * Both neighbours and both list ends are fixed up: removing the head clears
+ * the back pointer of the new head, and removing the tail moves the tail
+ * pointer to the previous entry. Without this, later insertions or removals
+ * would follow pointers into the freed entry.
+ *
+ * Returns 0 on success, -EINVAL when no entry has that key.
+ */
+static int
+mem_list_del_item(ptr_key pk)
+{
+	struct mem_entry *entry;
+
+	entry = mem_list_find_item(pk);
+	if (entry == NULL)
+		return -EINVAL;
+
+	/* Detach from the previous entry, or advance the head */
+	if (entry->prev == NULL)
+		mem_alloc_list_head = entry->next;
+	else
+		entry->prev->next = entry->next;
+
+	/* Detach from the next entry, or pull back the tail */
+	if (entry->next == NULL)
+		mem_alloc_list_tail = entry->prev;
+	else
+		entry->next->prev = entry->prev;
+
+	rte_free(entry);
+
+	mem_alloc_list_last_elem--;
+
+	return 0;
+}
+
+/*
+ * gpudev dev_info_get callback.
+ *
+ * For a parent device, or a child whose private data already exists, the
+ * cached info is copied to *info. For a child device seen for the first
+ * time (parent set, no private data yet) the child CUDA context is made
+ * current to query its SM count, device handle and device name, then the
+ * caller's context is restored.
+ *
+ * Returns 0 on success, -EINVAL on NULL arguments, -ENODEV when the parent
+ * info cannot be read, and -1 on CUDA or allocation failures. On failure
+ * the private data is released, so a later call retries the initialization.
+ */
+static int
+cuda_dev_info_get(struct rte_gpu *dev, struct rte_gpu_info *info)
+{
+	int ret = -1;
+	CUresult res;
+	struct rte_gpu_info parent_info;
+	CUexecAffinityParam affinityPrm;
+	const char *err_string;
+	struct cuda_info *private = NULL;
+	CUcontext current_ctx;
+	CUcontext input_ctx;
+
+	if (dev == NULL || info == NULL)
+		return -EINVAL;
+
+	/* Parent device or already initialized child: cached info is complete */
+	if (dev->mpshared->info.parent == RTE_GPU_ID_NONE ||
+			dev->mpshared->dev_private != NULL) {
+		*info = dev->mpshared->info;
+		return 0;
+	}
+
+	/* Child initialization time probably called by rte_gpu_add_child() */
+
+	/* Store current ctx */
+	res = cuCtxGetCurrent(&current_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_log(ERR, "cuCtxGetCurrent failed with %s.", err_string);
+		return -1;
+	}
+
+	/* Set child ctx as current ctx */
+	input_ctx = (CUcontext)dev->mpshared->info.context;
+	res = cuCtxSetCurrent(input_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_log(ERR, "cuCtxSetCurrent input failed with %s.", err_string);
+		return -1;
+	}
+
+	/*
+	 * Ctx capacity info
+	 */
+
+	/* MPS compatible; on failure affinityPrm holds no valid value */
+	res = cuCtxGetExecAffinity(&affinityPrm, CU_EXEC_AFFINITY_TYPE_SM_COUNT);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_log(ERR, "cuCtxGetExecAffinity failed with %s.", err_string);
+		goto out;
+	}
+	dev->mpshared->info.processor_count = (uint32_t)affinityPrm.param.smCount.val;
+
+	if (rte_gpu_info_get(dev->mpshared->info.parent, &parent_info)) {
+		ret = -ENODEV;
+		goto out;
+	}
+	dev->mpshared->info.total_memory = parent_info.total_memory;
+
+	/*
+	 * GPU Device private info
+	 */
+	private = rte_zmalloc(NULL, sizeof(struct cuda_info), RTE_CACHE_LINE_SIZE);
+	if (private == NULL) {
+		rte_gpu_log(ERR, "Failed to allocate memory for GPU process private.");
+		goto out;
+	}
+
+	res = cuCtxGetDevice(&(private->cu_dev));
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_log(ERR, "cuCtxGetDevice failed with %s.", err_string);
+		goto out;
+	}
+
+	res = cuDeviceGetName(private->gpu_name, RTE_DEV_NAME_MAX_LEN, private->cu_dev);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_log(ERR, "cuDeviceGetName failed with %s.", err_string);
+		goto out;
+	}
+
+	ret = 0;
+
+out:
+	/* Publish the private data only when fully initialized */
+	if (ret == 0)
+		dev->mpshared->dev_private = private;
+	else
+		rte_free(private);
+
+	/* Restore original ctx as current ctx, on success and failure alike */
+	res = cuCtxSetCurrent(current_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_log(ERR, "cuCtxSetCurrent current failed with %s.", err_string);
+		if (ret == 0)
+			ret = -1;
+	}
+
+	if (ret != 0)
+		return ret;
+
+	*info = dev->mpshared->info;
+
+	return 0;
+}
+
+/*
+ * GPU Memory
+ */
+
+/*
+ * gpudev mem_alloc callback: allocate size bytes of GPU memory in the CUDA
+ * context of dev and track the allocation in the memory list.
+ *
+ * The device context is made current for the duration of the call and the
+ * caller's context is restored before returning, including on failure.
+ * The list entry is created only once the CUDA allocation has fully
+ * succeeded, and the GPU memory is released again if a later step fails,
+ * so no half-initialized entry or orphaned allocation is left behind.
+ *
+ * On success *ptr receives the device address and 0 is returned.
+ * Returns -EINVAL on bad arguments, -ENOMEM when the list entry cannot be
+ * allocated, -1 on CUDA failures.
+ */
+static int
+cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
+{
+	CUresult res;
+	const char *err_string;
+	CUcontext current_ctx;
+	CUcontext input_ctx;
+	CUdeviceptr ptr_d = 0;
+	struct mem_entry *entry;
+	unsigned int flag = 1;
+	int allocated = 0;
+	int ret = -1;
+
+	if (dev == NULL || size == 0 || ptr == NULL)
+		return -EINVAL;
+
+	/* Store current ctx */
+	res = cuCtxGetCurrent(&current_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_log(ERR, "cuCtxGetCurrent failed with %s.", err_string);
+		return -1;
+	}
+
+	/* Set child ctx as current ctx */
+	input_ctx = (CUcontext)dev->mpshared->info.context;
+	res = cuCtxSetCurrent(input_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_log(ERR, "cuCtxSetCurrent input failed with %s.", err_string);
+		return -1;
+	}
+
+	/* Allocate memory */
+	res = cuMemAlloc(&ptr_d, size);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_log(ERR, "cuMemAlloc failed with %s.", err_string);
+		goto out;
+	}
+	allocated = 1;
+
+	/* GPUDirect RDMA attribute required */
+	res = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, ptr_d);
+	if (res != CUDA_SUCCESS) {
+		rte_gpu_log(ERR, "Could not set SYNC MEMOP attribute for GPU memory at %llx , err %d", ptr_d, res);
+		goto out;
+	}
+
+	/* Track the allocation in the memory list */
+	entry = mem_list_add_item();
+	if (entry == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	entry->ptr_d = ptr_d;
+	entry->ptr_h = NULL;
+	entry->pkey = get_hash_from_ptr((void *)(uintptr_t) ptr_d);
+	entry->size = size;
+	entry->dev = dev;
+	entry->ctx = input_ctx;
+	entry->mtype = GPU_MEM;
+
+	ret = 0;
+
+out:
+	/* Undo the GPU allocation when a later step failed */
+	if (ret != 0 && allocated)
+		cuMemFree(ptr_d);
+
+	/* Restore original ctx as current ctx */
+	res = cuCtxSetCurrent(current_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_log(ERR, "cuCtxSetCurrent current failed with %s.", err_string);
+		if (ret == 0)
+			ret = -1;
+	}
+
+	if (ret == 0)
+		*ptr = (void *)(uintptr_t) ptr_d;
+
+	return ret;
+}
+
+/*
+ * gpudev mem_register callback: register the CPU memory area [ptr, ptr+size)
+ * with CUDA so that it is visible from the GPU, and track it in the memory
+ * list under its host address.
+ *
+ * The device context is made current for the duration of the call and the
+ * caller's context is restored before returning, including on failure.
+ * The list entry is created only after every CUDA step has succeeded, and
+ * the host registration is undone if a later step fails.
+ *
+ * Returns 0 on success, -EINVAL on bad arguments, -ENOMEM when the list
+ * entry cannot be allocated, -1 on CUDA failures or when the device address
+ * differs from the host address.
+ */
+static int
+cuda_mem_register(struct rte_gpu *dev, size_t size, void *ptr)
+{
+	CUresult res;
+	const char *err_string;
+	CUcontext current_ctx;
+	CUcontext input_ctx;
+	CUdeviceptr ptr_d = 0;
+	struct mem_entry *entry;
+	unsigned int flag = 1;
+	int use_ptr_h = 0;
+	int registered = 0;
+	int ret = -1;
+
+	if (dev == NULL || size == 0 || ptr == NULL)
+		return -EINVAL;
+
+	/* Store current ctx */
+	res = cuCtxGetCurrent(&current_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_log(ERR, "cuCtxGetCurrent failed with %s.", err_string);
+		return -1;
+	}
+
+	/* Set child ctx as current ctx */
+	input_ctx = (CUcontext)dev->mpshared->info.context;
+	res = cuCtxSetCurrent(input_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_log(ERR, "cuCtxSetCurrent input failed with %s.", err_string);
+		return -1;
+	}
+
+	/* Register the host memory */
+	res = cuMemHostRegister(ptr, size, CU_MEMHOSTREGISTER_PORTABLE | CU_MEMHOSTREGISTER_DEVICEMAP);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_log(ERR, "cuMemHostRegister failed with %s ptr %p size %zu.",
+				err_string, ptr, size);
+		goto out;
+	}
+	registered = 1;
+
+	res = cuDeviceGetAttribute(&(use_ptr_h),
+			CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM,
+			((struct cuda_info *)(dev->mpshared->dev_private))->cu_dev);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_log(ERR, "cuDeviceGetAttribute failed with %s.", err_string);
+		goto out;
+	}
+
+	if (use_ptr_h == 0) {
+		/* Device cannot use the host pointer directly: ask for its own */
+		res = cuMemHostGetDevicePointer(&ptr_d, ptr, 0);
+		if (res != CUDA_SUCCESS) {
+			cuGetErrorString(res, &(err_string));
+			rte_gpu_log(ERR, "cuMemHostGetDevicePointer failed with %s.", err_string);
+			goto out;
+		}
+
+		/* The driver only supports identical host and device addresses */
+		if ((uintptr_t) ptr_d != (uintptr_t) ptr) {
+			rte_gpu_log(ERR, "Host input pointer is different wrt GPU registered pointer");
+			goto out;
+		}
+	} else {
+		ptr_d = (CUdeviceptr)(uintptr_t) ptr;
+	}
+
+	/* GPUDirect RDMA attribute required */
+	res = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, ptr_d);
+	if (res != CUDA_SUCCESS) {
+		rte_gpu_log(ERR, "Could not set SYNC MEMOP attribute for GPU memory at %llx , err %d", ptr_d, res);
+		goto out;
+	}
+
+	/* Track the registration in the memory list */
+	entry = mem_list_add_item();
+	if (entry == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	entry->ptr_d = ptr_d;
+	entry->ptr_h = ptr;
+	entry->pkey = get_hash_from_ptr(ptr);
+	entry->size = size;
+	entry->dev = dev;
+	entry->ctx = input_ctx;
+	entry->mtype = CPU_REGISTERED;
+
+	ret = 0;
+
+out:
+	/* Undo the host registration when a later step failed */
+	if (ret != 0 && registered)
+		cuMemHostUnregister(ptr);
+
+	/* Restore original ctx as current ctx */
+	res = cuCtxSetCurrent(current_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_log(ERR, "cuCtxSetCurrent current failed with %s.", err_string);
+		if (ret == 0)
+			ret = -1;
+	}
+
+	return ret;
+}
+
+/*
+ * gpudev mem_free callback: release GPU memory that is tracked in the
+ * memory list as GPU_MEM and drop its list entry.
+ * Returns 0 on success, -EINVAL on NULL arguments, -1 when the address is
+ * unknown, is not GPU memory, or cuMemFree() fails.
+ */
+static int
+cuda_mem_free(struct rte_gpu *dev, void *ptr)
+{
+	const char *err_string;
+	struct mem_entry *entry;
+	ptr_key key;
+	CUresult res;
+
+	if (dev == NULL || ptr == NULL)
+		return -EINVAL;
+
+	key = get_hash_from_ptr((void *) ptr);
+	entry = mem_list_find_item(key);
+	if (entry == NULL) {
+		rte_gpu_log(ERR, "Memory address 0x%p not found in driver memory\n", ptr);
+		return -1;
+	}
+
+	/* Only memory allocated by this driver can be freed here */
+	if (entry->mtype != GPU_MEM) {
+		rte_gpu_log(ERR, "Memory type %d not supported\n", entry->mtype);
+		return -1;
+	}
+
+	res = cuMemFree(entry->ptr_d);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_log(ERR, "cuMemFree current failed with %s.\n", err_string);
+
+		return -1;
+	}
+
+	return mem_list_del_item(key);
+}
+
+/*
+ * gpudev mem_unregister callback: unregister CPU memory that is tracked in
+ * the memory list as CPU_REGISTERED and drop its list entry.
+ * Returns 0 on success, -EINVAL on NULL arguments, -1 when the address is
+ * unknown, is not registered CPU memory, or cuMemHostUnregister() fails.
+ */
+static int
+cuda_mem_unregister(struct rte_gpu *dev, void *ptr)
+{
+	CUresult res;
+	struct mem_entry *mem_item;
+	const char *err_string;
+	ptr_key hk;
+
+	if (dev == NULL || ptr == NULL)
+		return -EINVAL;
+
+	hk = get_hash_from_ptr((void *) ptr);
+
+	mem_item = mem_list_find_item(hk);
+	if (mem_item == NULL) {
+		rte_gpu_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
+		return -1;
+	}
+
+	/* Only registered CPU memory can be unregistered here */
+	if (mem_item->mtype != CPU_REGISTERED) {
+		rte_gpu_log(ERR, "Memory type %d not supported", mem_item->mtype);
+		return -1;
+	}
+
+	res = cuMemHostUnregister(ptr);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_log(ERR, "cuMemHostUnregister current failed with %s.", err_string);
+
+		return -1;
+	}
+
+	return mem_list_del_item(hk);
+}
+
+/*
+ * gpudev dev_close callback: release the driver private data of dev.
+ * The pointer is cleared after the free so that the shared device data
+ * never holds a dangling reference.
+ * Returns 0 on success, -EINVAL when dev is NULL.
+ */
+static int
+cuda_dev_close(struct rte_gpu *dev)
+{
+	if (dev == NULL)
+		return -EINVAL;
+
+	rte_free(dev->mpshared->dev_private);
+	dev->mpshared->dev_private = NULL;
+
+	return 0;
+}
+
+/*
+ * PCI probe callback: create the gpudev device for a matching NVIDIA GPU.
+ *
+ * Allocates the gpudev device, initializes the CUDA driver, retains the
+ * primary context of the GPU found by PCI bus ID, fills the generic info
+ * (context, SM count, total memory) and the private info (CUDA device
+ * handle and name), installs the driver ops and completes the device.
+ *
+ * Every failure after the gpudev allocation goes through a single cleanup
+ * path that releases the private data, the retained primary context and
+ * the gpudev device, so a failed probe leaves nothing behind.
+ *
+ * Returns 0 on success, -EINVAL when pci_dev is NULL, -ENODEV when the
+ * gpudev device cannot be allocated, -1 on CUDA or allocation failures.
+ */
+static int
+cuda_gpu_probe(__rte_unused struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
+{
+	struct rte_gpu *dev = NULL;
+	CUresult res;
+	CUdevice cu_dev_id;
+	CUcontext pctx;
+	const char *err_string;
+	int processor_count = 0;
+	struct cuda_info *private = NULL;
+	int ctx_retained = 0;
+
+	if (pci_dev == NULL) {
+		rte_gpu_log(ERR, "NULL PCI device");
+		return -EINVAL;
+	}
+
+	/* Allocate memory to be used privately by drivers */
+	dev = rte_gpu_allocate(pci_dev->device.name);
+	if (dev == NULL)
+		return -ENODEV;
+
+	/* Fill HW specific part of device structure */
+	dev->device = &pci_dev->device;
+	dev->mpshared->info.numa_node = pci_dev->device.numa_node;
+
+	/*
+	 * GPU Device init
+	 */
+
+	/*
+	 * Required to initialize the CUDA Driver.
+	 * Multiple calls of cuInit() will return immediately
+	 * without making any relevant change
+	 */
+	res = cuInit(0);
+	if (res != CUDA_SUCCESS) {
+		rte_gpu_log(ERR, "cuInit failed with %d.", res);
+		goto error;
+	}
+
+	/* Get NVIDIA GPU Device descriptor */
+	res = cuDeviceGetByPCIBusId(&cu_dev_id, dev->device->name);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_log(ERR, "cuDeviceGetByPCIBusId name %s failed with %d: %s.",
+				dev->device->name, res, err_string);
+		goto error;
+	}
+
+	res = cuDevicePrimaryCtxRetain(&pctx, cu_dev_id);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_log(ERR, "cuDevicePrimaryCtxRetain name %s failed with %d: %s.",
+				dev->device->name, res, err_string);
+		goto error;
+	}
+	ctx_retained = 1;
+
+	dev->mpshared->info.context = (uint64_t)(uintptr_t) pctx;
+
+	/*
+	 * GPU Device generic info
+	 */
+
+	/* Processor count */
+	res = cuDeviceGetAttribute(&(processor_count), CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cu_dev_id);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_log(ERR, "cuDeviceGetAttribute failed with %s.", err_string);
+		goto error;
+	}
+	dev->mpshared->info.processor_count = (uint32_t)processor_count;
+
+	/* Total memory */
+	res = cuDeviceTotalMem(&dev->mpshared->info.total_memory, cu_dev_id);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_log(ERR, "cuDeviceTotalMem failed with %s.", err_string);
+		goto error;
+	}
+
+	/*
+	 * GPU Device private info
+	 */
+	private = rte_zmalloc(NULL, sizeof(struct cuda_info), RTE_CACHE_LINE_SIZE);
+	if (private == NULL) {
+		rte_gpu_log(ERR, "Failed to allocate memory for GPU process private.");
+		goto error;
+	}
+
+	private->cu_dev = cu_dev_id;
+	res = cuDeviceGetName(private->gpu_name, RTE_DEV_NAME_MAX_LEN, cu_dev_id);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_log(ERR, "cuDeviceGetName failed with %s.", err_string);
+		goto error;
+	}
+
+	/* Publish the private data only once it is fully initialized */
+	dev->mpshared->dev_private = private;
+
+	dev->ops.mem_alloc = cuda_mem_alloc;
+	dev->ops.mem_free = cuda_mem_free;
+	dev->ops.mem_register = cuda_mem_register;
+	dev->ops.mem_unregister = cuda_mem_unregister;
+	dev->ops.dev_info_get = cuda_dev_info_get;
+	dev->ops.dev_close = cuda_dev_close;
+
+	rte_gpu_complete_new(dev);
+
+	rte_gpu_log_debug("dev id = %d name = %s", (int)dev->mpshared->info.dev_id, private->gpu_name);
+
+	return 0;
+
+error:
+	/* Undo, in reverse order, whatever was acquired before the failure */
+	rte_free(private);
+	if (ctx_retained)
+		cuDevicePrimaryCtxRelease(cu_dev_id);
+	rte_gpu_release(dev);
+
+	return -1;
+}
+
+/*
+ * PCI remove callback: release the gpudev device bound to pci_dev.
+ * Returns 0 on success, -EINVAL when pci_dev is NULL, -ENODEV when no
+ * gpudev device carries that name, or the error returned by
+ * rte_gpu_release() when the device cannot be released.
+ */
+static int
+cuda_gpu_remove(struct rte_pci_device *pci_dev)
+{
+	struct rte_gpu *dev;
+	int ret;
+	int gpu_id;
+
+	if (pci_dev == NULL)
+		return -EINVAL;
+
+	dev = rte_gpu_get_by_name(pci_dev->device.name);
+	if (dev == NULL) {
+		rte_gpu_log(ERR,
+				"Couldn't find HW dev \"%s\" to uninitialise it",
+				pci_dev->device.name);
+		return -ENODEV;
+	}
+	/* Keep the id: dev must not be dereferenced after the release */
+	gpu_id = dev->mpshared->info.dev_id;
+
+	/* release dev from library */
+	ret = rte_gpu_release(dev);
+	if (ret) {
+		rte_gpu_log(ERR, "Device %d failed to uninit: %d", gpu_id, ret);
+		return ret;
+	}
+
+	rte_gpu_log_debug("Destroyed dev = %d", gpu_id);
+
+	return 0;
+}
+
+/*
+ * PCI driver descriptor for the CUDA GPU driver: the table of supported
+ * PCI IDs plus the probe and remove callbacks defined above.
+ */
+static struct rte_pci_driver rte_cuda_driver = {
+	.id_table = pci_id_cuda_map,
+	/* NOTE(review): presumably requests write-combined BAR mappings - confirm */
+	.drv_flags = RTE_PCI_DRV_WC_ACTIVATE,
+	.probe = cuda_gpu_probe,
+	.remove = cuda_gpu_remove,
+};
+
+/* Register the driver and its PCI ID table under the name gpu_cuda */
+RTE_PMD_REGISTER_PCI(gpu_cuda, rte_cuda_driver);
+RTE_PMD_REGISTER_PCI_TABLE(gpu_cuda, pci_id_cuda_map);
+/*
+ * Kernel module dependency string.
+ * NOTE(review): the driver guide names the module "nvidia-peermem" while
+ * this string lists nv_peer_mem / nvpeer_mem - confirm which names apply.
+ */
+RTE_PMD_REGISTER_KMOD_DEP(gpu_cuda, "* nvidia & (nv_peer_mem | nvpeer_mem)");
diff --git a/drivers/gpu/cuda/meson.build b/drivers/gpu/cuda/meson.build
new file mode 100644
index 0000000000..53e40e6832
--- /dev/null
+++ b/drivers/gpu/cuda/meson.build
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright (c) 2021 NVIDIA Corporation & Affiliates
+
+# Stop processing this directory as soon as the driver is known to be
+# disabled, so that the CUDA lookup below is never attempted elsewhere.
+if not is_linux
+        build = false
+        reason = 'only supported on Linux'
+        subdir_done()
+endif
+
+# The CUDA toolkit is optional: when it is not installed the driver is
+# skipped instead of aborting the configuration of the whole project.
+cuda_dep = dependency('cuda', version : '>=11', modules: ['cuda'], required: false)
+if not cuda_dep.found()
+        build = false
+        reason = 'missing dependency, "cuda"'
+        subdir_done()
+endif
+ext_deps += cuda_dep
+
+deps += ['gpudev','pci','bus_pci', 'hash']
+sources = files('cuda.c')
diff --git a/drivers/gpu/cuda/version.map b/drivers/gpu/cuda/version.map
new file mode 100644
index 0000000000..4a76d1d52d
--- /dev/null
+++ b/drivers/gpu/cuda/version.map
@@ -0,0 +1,3 @@
+DPDK_21 {
+	local: *;
+};
diff --git a/drivers/gpu/meson.build b/drivers/gpu/meson.build
index e51ad3381b..601bedcd61 100644
--- a/drivers/gpu/meson.build
+++ b/drivers/gpu/meson.build
@@ -1,4 +1,4 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright (c) 2021 NVIDIA Corporation & Affiliates
 
-drivers = []
+drivers = [ 'cuda' ]
-- 
2.17.1
^ permalink raw reply related	[flat|nested] 28+ messages in thread
* Re: [dpdk-dev] [PATCH v2 1/1] gpu/cuda: introduce CUDA driver
  2021-11-04  2:01   ` [dpdk-dev] [PATCH v2 1/1] " eagostini
  2021-11-03 18:15     ` Stephen Hemminger
@ 2021-11-08 18:35     ` Stephen Hemminger
  2021-11-08 18:39       ` Elena Agostini
  1 sibling, 1 reply; 28+ messages in thread
From: Stephen Hemminger @ 2021-11-08 18:35 UTC (permalink / raw)
  To: eagostini; +Cc: dev
On Thu, 4 Nov 2021 02:01:28 +0000
<eagostini@nvidia.com> wrote:
> +/* Single entry of the memory list */
> +struct mem_entry {
> +	CUdeviceptr ptr_d;
> +	void *ptr_h;
> +	size_t size;
> +	struct rte_gpu *dev;
> +	CUcontext ctx;
Not sure where these types CUdeviceptr and CUcontext are coming
from, but the code looks like Windows style not DPDK or Linux.
Please don't introduce CamelCase typedef's and never typedefs
for pointers.
^ permalink raw reply	[flat|nested] 28+ messages in thread
* Re: [dpdk-dev] [PATCH v2 1/1] gpu/cuda: introduce CUDA driver
  2021-11-08 18:35     ` Stephen Hemminger
@ 2021-11-08 18:39       ` Elena Agostini
  2021-11-08 18:59         ` Stephen Hemminger
  0 siblings, 1 reply; 28+ messages in thread
From: Elena Agostini @ 2021-11-08 18:39 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev@dpdk.org
> From: Stephen Hemminger <stephen@networkplumber.org>
> Date: Monday, 8 November 2021 at 19:35
> To: Elena Agostini <eagostini@nvidia.com>
> Cc: dev@dpdk.org <dev@dpdk.org>
> Subject: Re: [dpdk-dev] [PATCH v2 1/1] gpu/cuda: introduce CUDA driver
> External email: Use caution opening links or attachments>
>
> On Thu, 4 Nov 2021 02:01:28 +0000
> <eagostini@nvidia.com> wrote:>
> > +/* Single entry of the memory list */
> > +struct mem_entry {
> > +     CUdeviceptr ptr_d;
> > +     void *ptr_h;
> > +     size_t size;
> > +     struct rte_gpu *dev;
> > +     CUcontext ctx;>
> Not sure where these types CUdeviceptr and CUcontext are coming
> from, but the code looks like Windows style not DPDK or Linux.>
> Please don't introduce CamelCase typedef's and never typedefs
> for pointers.
These are CUDA Driver API specific types, I can’t change them.
^ permalink raw reply	[flat|nested] 28+ messages in thread
* Re: [dpdk-dev] [PATCH v2 1/1] gpu/cuda: introduce CUDA driver
  2021-11-08 18:39       ` Elena Agostini
@ 2021-11-08 18:59         ` Stephen Hemminger
  2021-11-08 19:07           ` Elena Agostini
  0 siblings, 1 reply; 28+ messages in thread
From: Stephen Hemminger @ 2021-11-08 18:59 UTC (permalink / raw)
  To: Elena Agostini; +Cc: dev@dpdk.org
On Mon, 8 Nov 2021 18:39:36 +0000
Elena Agostini <eagostini@nvidia.com> wrote:
> > From: Stephen Hemminger <stephen@networkplumber.org>
> > Date: Monday, 8 November 2021 at 19:35
> > To: Elena Agostini <eagostini@nvidia.com>
> > Cc: dev@dpdk.org <dev@dpdk.org>
> > Subject: Re: [dpdk-dev] [PATCH v2 1/1] gpu/cuda: introduce CUDA driver
> > External email: Use caution opening links or attachments>
> >
> > On Thu, 4 Nov 2021 02:01:28 +0000
> > <eagostini@nvidia.com> wrote:>  
> > > +/* Single entry of the memory list */
> > > +struct mem_entry {
> > > +     CUdeviceptr ptr_d;
> > > +     void *ptr_h;
> > > +     size_t size;
> > > +     struct rte_gpu *dev;
> > > +     CUcontext ctx;>  
> > Not sure where these types CUdeviceptr and CUcontext are coming
> > from, but the code looks like Windows style not DPDK or Linux.>
> > Please don't introduce CamelCase typedef's and never typedefs
> > for pointers.  
> 
> These are CUDA Driver API specific types, I can’t change them.
Could you at least avoid using the pointer typedefs.
When a pointer is typedef'd it leads to confusion to readers and
some dumb static analyzers.
^ permalink raw reply	[flat|nested] 28+ messages in thread
* Re: [dpdk-dev] [RFC PATCH] gpu/cuda: introduce CUDA driver
  2021-10-05 22:49 [dpdk-dev] [RFC PATCH] gpu/cuda: introduce CUDA driver eagostini
  2021-11-04  2:01 ` [dpdk-dev] [PATCH v2 0/1] " eagostini
@ 2021-11-08 19:02 ` Stephen Hemminger
  2021-11-08 21:20   ` Elena Agostini
  2021-11-09  2:28 ` [dpdk-dev] [PATCH v3 0/1] " eagostini
                   ` (4 subsequent siblings)
  6 siblings, 1 reply; 28+ messages in thread
From: Stephen Hemminger @ 2021-11-08 19:02 UTC (permalink / raw)
  To: eagostini; +Cc: dev
On Tue, 5 Oct 2021 22:49:05 +0000
<eagostini@nvidia.com> wrote:
> From: Elena Agostini <eagostini@nvidia.com>
> 
> This is the CUDA implementation of the gpudev library.
> Funcitonalities implemented through CUDA Driver API are:
> 
> - Device probe and remove
> - Manage device memory allocations
> - Register/unregister external CPU memory in the device memory area
> 
> Signed-off-by: Elena Agostini <eagostini@nvidia.com>
> ---
What is the license of the CUDA Driver?
^ permalink raw reply	[flat|nested] 28+ messages in thread
* Re: [dpdk-dev] [PATCH v2 1/1] gpu/cuda: introduce CUDA driver
  2021-11-08 18:59         ` Stephen Hemminger
@ 2021-11-08 19:07           ` Elena Agostini
  0 siblings, 0 replies; 28+ messages in thread
From: Elena Agostini @ 2021-11-08 19:07 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev@dpdk.org
> From: Stephen Hemminger <stephen@networkplumber.org>
> Date: Monday, 8 November 2021 at 19:59
> To: Elena Agostini <eagostini@nvidia.com>
> Cc: dev@dpdk.org <dev@dpdk.org>
> Subject: Re: [dpdk-dev] [PATCH v2 1/1] gpu/cuda: introduce CUDA driver>
> On Mon, 8 Nov 2021 18:39:36 +0000
> Elena Agostini <eagostini@nvidia.com> wrote:>
> > > From: Stephen Hemminger <stephen@networkplumber.org>
> > > Date: Monday, 8 November 2021 at 19:35
> > > To: Elena Agostini <eagostini@nvidia.com>
> > > Cc: dev@dpdk.org <dev@dpdk.org>
> > > Subject: Re: [dpdk-dev] [PATCH v2 1/1] gpu/cuda: introduce CUDA driver
> > > External email: Use caution opening links or attachments>
> > >
> > > On Thu, 4 Nov 2021 02:01:28 +0000
> > > <eagostini@nvidia.com> wrote:>
> > > > +/* Single entry of the memory list */
> > > > +struct mem_entry {
> > > > +     CUdeviceptr ptr_d;
> > > > +     void *ptr_h;
> > > > +     size_t size;
> > > > +     struct rte_gpu *dev;
> > > > +     CUcontext ctx;>
> > > Not sure where these types CUdeviceptr and CUcontext are coming
> > > from, but the code looks like Windows style not DPDK or Linux.>
> > > Please don't introduce CamelCase typedef's and never typedefs
> > > for pointers.
> >
> > These are CUDA Driver API specific types, I can’t change them.>
> Could you at least avoid using the pointer typedefs.
> When a pointer is typedef'd it leads to confusion to readers and
> some dumb static analyzers.
There is no pointer typedef here.
The typedef is about uintptr_t
^ permalink raw reply	[flat|nested] 28+ messages in thread
* Re: [dpdk-dev] [PATCH v3 1/1] gpu/cuda: introduce CUDA driver
  2021-11-09  2:28   ` [dpdk-dev] [PATCH v3 1/1] " eagostini
@ 2021-11-08 19:52     ` David Marchand
  0 siblings, 0 replies; 28+ messages in thread
From: David Marchand @ 2021-11-08 19:52 UTC (permalink / raw)
  To: Elena Agostini; +Cc: dev, Thomas Monjalon
On Mon, Nov 8, 2021 at 7:17 PM <eagostini@nvidia.com> wrote:
> diff --git a/drivers/gpu/cuda/meson.build b/drivers/gpu/cuda/meson.build
> new file mode 100644
> index 0000000000..92b30c35b4
> --- /dev/null
> +++ b/drivers/gpu/cuda/meson.build
> @@ -0,0 +1,13 @@
> +# SPDX-License-Identifier: BSD-3-Clause
> +# Copyright (c) 2021 NVIDIA Corporation & Affiliates
> +
> +if not is_linux
> +        build = false
> +        reason = 'only supported on Linux'
> +endif
> +
> +cuda_dep = dependency('cuda', version : '>=11', modules: ['cuda'])
Compiler for C supports arguments -D_DEFAULT_SOURCE: YES
Compiler for C supports arguments -D_XOPEN_SOURCE=600: YES
Compiler for C supports arguments -flax-vector-conversions: YES
Compiler for C supports arguments -Wno-strict-aliasing -Wstrict-aliasing: YES
Compiler for C supports arguments -Wno-format-nonliteral
-Wformat-nonliteral: YES
Library libturbo found: NO
Library libldpc_decoder_5gnr found: NO
Dependency cuda found: NO (tried pkgconfig and cmake)
drivers/gpu/cuda/meson.build:9:0: ERROR:  Dependency "cuda" not found,
tried pkgconfig and cmake
-- 
David Marchand
^ permalink raw reply	[flat|nested] 28+ messages in thread
* Re: [dpdk-dev] [RFC PATCH] gpu/cuda: introduce CUDA driver
  2021-11-08 19:02 ` [dpdk-dev] [RFC PATCH] " Stephen Hemminger
@ 2021-11-08 21:20   ` Elena Agostini
  2021-11-08 22:07     ` Stephen Hemminger
  0 siblings, 1 reply; 28+ messages in thread
From: Elena Agostini @ 2021-11-08 21:20 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev@dpdk.org
> From: Stephen Hemminger <stephen@networkplumber.org>
> Date: Monday, 8 November 2021 at 20:02
> To: Elena Agostini <eagostini@nvidia.com>
> Cc: dev@dpdk.org <dev@dpdk.org>
> Subject: Re: [dpdk-dev] [RFC PATCH] gpu/cuda: introduce CUDA driver
> External email: Use caution opening links or attachments
>
>
> On Tue, 5 Oct 2021 22:49:05 +0000
> <eagostini@nvidia.com> wrote:
>
> > From: Elena Agostini <eagostini@nvidia.com>
> >
> > This is the CUDA implementation of the gpudev library.
> > Funcitonalities implemented through CUDA Driver API are:
> >
> > - Device probe and remove
> > - Manage device memory allocations
> > - Register/unregister external CPU memory in the device memory area
> >
> > Signed-off-by: Elena Agostini <eagostini@nvidia.com>
> > ---
>
> What is the license of the CUDA Driver?
As you can see in the code:
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright (c) 2021 NVIDIA Corporation & Affiliates
*/
^ permalink raw reply	[flat|nested] 28+ messages in thread
* Re: [dpdk-dev] [RFC PATCH] gpu/cuda: introduce CUDA driver
  2021-11-08 21:20   ` Elena Agostini
@ 2021-11-08 22:07     ` Stephen Hemminger
  2021-11-08 23:15       ` Stephen Hemminger
  0 siblings, 1 reply; 28+ messages in thread
From: Stephen Hemminger @ 2021-11-08 22:07 UTC (permalink / raw)
  To: Elena Agostini; +Cc: dev@dpdk.org
On Mon, 8 Nov 2021 21:20:31 +0000
Elena Agostini <eagostini@nvidia.com> wrote:
> > From: Stephen Hemminger <stephen@networkplumber.org>
> > Date: Monday, 8 November 2021 at 20:02
> > To: Elena Agostini <eagostini@nvidia.com>
> > Cc: dev@dpdk.org <dev@dpdk.org>
> > Subject: Re: [dpdk-dev] [RFC PATCH] gpu/cuda: introduce CUDA driver
> > External email: Use caution opening links or attachments
> >
> >
> > On Tue, 5 Oct 2021 22:49:05 +0000
> > <eagostini@nvidia.com> wrote:
> >  
> > > From: Elena Agostini <eagostini@nvidia.com>
> > >
> > > This is the CUDA implementation of the gpudev library.
> > > Funcitonalities implemented through CUDA Driver API are:
> > >
> > > - Device probe and remove
> > > - Manage device memory allocations
> > > - Register/unregister external CPU memory in the device memory area
> > >
> > > Signed-off-by: Elena Agostini <eagostini@nvidia.com>
> > > ---  
> >
> > What is the license of the CUDA Driver?  
> 
I meant the CUDA driver API?
The DPDK policy is that a driver cannot be a wrapper around a
closed-source API.
^ permalink raw reply	[flat|nested] 28+ messages in thread
* Re: [dpdk-dev] [RFC PATCH] gpu/cuda: introduce CUDA driver
  2021-11-08 22:07     ` Stephen Hemminger
@ 2021-11-08 23:15       ` Stephen Hemminger
  0 siblings, 0 replies; 28+ messages in thread
From: Stephen Hemminger @ 2021-11-08 23:15 UTC (permalink / raw)
  To: Elena Agostini; +Cc: dev@dpdk.org
On Mon, 8 Nov 2021 14:07:47 -0800
Stephen Hemminger <stephen@networkplumber.org> wrote:
> On Mon, 8 Nov 2021 21:20:31 +0000
> Elena Agostini <eagostini@nvidia.com> wrote:
> 
> > > From: Stephen Hemminger <stephen@networkplumber.org>
> > > Date: Monday, 8 November 2021 at 20:02
> > > To: Elena Agostini <eagostini@nvidia.com>
> > > Cc: dev@dpdk.org <dev@dpdk.org>
> > > Subject: Re: [dpdk-dev] [RFC PATCH] gpu/cuda: introduce CUDA driver
> > > External email: Use caution opening links or attachments
> > >
> > >
> > > On Tue, 5 Oct 2021 22:49:05 +0000
> > > <eagostini@nvidia.com> wrote:
> > >    
> > > > From: Elena Agostini <eagostini@nvidia.com>
> > > >
> > > > This is the CUDA implementation of the gpudev library.
> > > > Functionalities implemented through CUDA Driver API are:
> > > >
> > > > - Device probe and remove
> > > > - Manage device memory allocations
> > > > - Register/unregister external CPU memory in the device memory area
> > > >
> > > > Signed-off-by: Elena Agostini <eagostini@nvidia.com>
> > > > ---    
> > >
> > > What is the license of the CUDA Driver?    
> >   
> 
> I meant the CUDA driver API?
> The DPDK policy is that a driver cannot be a wrapper around a
> closed-source API.
If it is this license agreement from Nvidia 
https://docs.nvidia.com/cuda/eula/index.html
Then it is clearly not open source and I would have to recommend
against allowing this driver in DPDK. 
Corollary: without an open-source GPU driver, I would also recommend
against including the GPU driver subsystem in DPDK.
Note: these views are my own as part of the open-source community
and do not represent those of my employer.
^ permalink raw reply	[flat|nested] 28+ messages in thread
* [dpdk-dev] [PATCH v3 0/1] gpu/cuda: introduce CUDA driver
  2021-10-05 22:49 [dpdk-dev] [RFC PATCH] gpu/cuda: introduce CUDA driver eagostini
  2021-11-04  2:01 ` [dpdk-dev] [PATCH v2 0/1] " eagostini
  2021-11-08 19:02 ` [dpdk-dev] [RFC PATCH] " Stephen Hemminger
@ 2021-11-09  2:28 ` eagostini
  2021-11-09  2:28   ` [dpdk-dev] [PATCH v3 1/1] " eagostini
  2021-11-09  5:50 ` [dpdk-dev] [PATCH v4 0/1] " eagostini
                   ` (3 subsequent siblings)
  6 siblings, 1 reply; 28+ messages in thread
From: eagostini @ 2021-11-09  2:28 UTC (permalink / raw)
  To: dev; +Cc: Elena Agostini
From: Elena Agostini <eagostini@nvidia.com>
This is the CUDA implementation of the gpudev library.
Functionalities implemented through CUDA Driver API are:
- Device probe and remove
- Manage device memory allocations
- Register/unregister external CPU memory in the device memory area
Changelog:
- CUDA driver implementation of the GPU write memory barrier
- Fixed styling reported by checkpatch
Elena Agostini (1):
  gpu/cuda: introduce CUDA driver
 doc/guides/gpus/cuda.rst     | 110 +++++
 doc/guides/gpus/index.rst    |   1 +
 drivers/gpu/cuda/cuda.c      | 801 +++++++++++++++++++++++++++++++++++
 drivers/gpu/cuda/meson.build |  13 +
 drivers/gpu/cuda/version.map |   3 +
 drivers/gpu/meson.build      |   2 +-
 6 files changed, 929 insertions(+), 1 deletion(-)
 create mode 100644 doc/guides/gpus/cuda.rst
 create mode 100644 drivers/gpu/cuda/cuda.c
 create mode 100644 drivers/gpu/cuda/meson.build
 create mode 100644 drivers/gpu/cuda/version.map
-- 
2.17.1
^ permalink raw reply	[flat|nested] 28+ messages in thread
* [dpdk-dev] [PATCH v3 1/1] gpu/cuda: introduce CUDA driver
  2021-11-09  2:28 ` [dpdk-dev] [PATCH v3 0/1] " eagostini
@ 2021-11-09  2:28   ` eagostini
  2021-11-08 19:52     ` David Marchand
  0 siblings, 1 reply; 28+ messages in thread
From: eagostini @ 2021-11-09  2:28 UTC (permalink / raw)
  To: dev; +Cc: Elena Agostini
From: Elena Agostini <eagostini@nvidia.com>
This is the CUDA implementation of the gpudev library.
Functionalities implemented through CUDA Driver API are:
- Device probe and remove
- Manage device memory allocations
- Register/unregister external CPU memory in the device memory area
Signed-off-by: Elena Agostini <eagostini@nvidia.com>
---
 doc/guides/gpus/cuda.rst     | 110 +++++
 doc/guides/gpus/index.rst    |   1 +
 drivers/gpu/cuda/cuda.c      | 801 +++++++++++++++++++++++++++++++++++
 drivers/gpu/cuda/meson.build |  13 +
 drivers/gpu/cuda/version.map |   3 +
 drivers/gpu/meson.build      |   2 +-
 6 files changed, 929 insertions(+), 1 deletion(-)
 create mode 100644 doc/guides/gpus/cuda.rst
 create mode 100644 drivers/gpu/cuda/cuda.c
 create mode 100644 drivers/gpu/cuda/meson.build
 create mode 100644 drivers/gpu/cuda/version.map
diff --git a/doc/guides/gpus/cuda.rst b/doc/guides/gpus/cuda.rst
new file mode 100644
index 0000000000..64a78bf1e1
--- /dev/null
+++ b/doc/guides/gpus/cuda.rst
@@ -0,0 +1,110 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright (c) 2021 NVIDIA Corporation & Affiliates
+
+CUDA GPU driver
+===============
+
+The CUDA GPU driver library (**librte_gpu_cuda**) provides support for NVIDIA GPUs.
+Information and documentation about these devices can be found on the
+`NVIDIA website <http://www.nvidia.com>`__. Help is also provided by the
+`NVIDIA CUDA Toolkit developer zone <https://docs.nvidia.com/cuda>`__.
+
+Design
+------
+
+**librte_gpu_cuda** relies on CUDA Driver API (no need for CUDA Runtime API).
+
+The goal of this driver library is not to provide a wrapper for the whole CUDA Driver API.
+Instead, the scope is to implement the generic features of gpudev API.
+For a CUDA application, integrating the gpudev library functions using the CUDA driver library
+is quite straightforward and doesn't create any compatibility problem.
+
+Initialization
+~~~~~~~~~~~~~~
+
+During initialization, CUDA driver library detects NVIDIA physical GPUs on the
+system or specified via EAL device options (e.g. ``-a b6:00.0``).
+The driver initializes the CUDA driver environment through ``cuInit(0)`` function.
+For this reason, it's required to set any CUDA environment configuration before
+calling ``rte_eal_init`` function in the DPDK application.
+
+If the CUDA driver environment has been already initialized, the ``cuInit(0)``
+in CUDA driver library has no effect.
+
+CUDA Driver sub-contexts
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+After initialization, a CUDA application can create multiple sub-contexts on GPU
+physical devices. Through the gpudev library, it is possible to register these sub-contexts
+in the CUDA driver library as child devices having as parent a GPU physical device.
+
+CUDA driver library also supports `MPS <https://docs.nvidia.com/deploy/pdf/CUDA_Multi_Process_Service_Overview.pdf>`__.
+
+GPU memory management
+~~~~~~~~~~~~~~~~~~~~~
+
+The CUDA driver library maintains a table of the GPU memory addresses allocated
+and the CPU memory addresses registered, associated with the input CUDA context.
+Whenever the application tries to deallocate or deregister a memory address
+that is not in the table, the CUDA driver library returns an error.
+
+Features
+--------
+
+- Register new child devices aka new CUDA Driver contexts
+- Allocate memory on the GPU
+- Register CPU memory to make it visible from GPU
+
+Minimal requirements
+--------------------
+
+Minimal requirements to enable the CUDA driver library are:
+
+- NVIDIA GPU Ampere or Volta
+- CUDA 11.4 Driver API or newer
+
+`GPUDirect RDMA Technology <https://docs.nvidia.com/cuda/gpudirect-rdma/index.html>`__
+allows compatible network cards (e.g. Mellanox) to directly send and receive packets
+using GPU memory instead of additional memory copies through the CPU system memory.
+To enable this technology, system requirements are:
+
+- `nvidia-peermem <https://docs.nvidia.com/cuda/gpudirect-rdma/index.html#nvidia-peermem>`__ module running on the system
+- Mellanox Network card ConnectX-5 or newer (BlueField models included)
+- DPDK mlx5 PMD enabled
+- To reach the best performance, a PCIe switch between GPU and NIC is recommended
+
+Limitations
+-----------
+
+Supported only on Linux.
+
+Supported GPUs
+--------------
+
+The following NVIDIA GPU devices are supported by this CUDA driver:
+
+- NVIDIA A100 80GB PCIe
+- NVIDIA A100 40GB PCIe
+- NVIDIA A30 24GB
+- NVIDIA A10 24GB
+- NVIDIA V100 32GB PCIe
+- NVIDIA V100 16GB PCIe
+
+External references
+-------------------
+
+A good example of how to use the GPU CUDA driver through the gpudev library
+is the l2fwd-nv application that can be found `here <https://github.com/NVIDIA/l2fwd-nv>`__.
+
+The application is based on vanilla DPDK example l2fwd and it's enhanced with GPU memory
+managed through gpudev library and CUDA to launch the swap of packets' MAC addresses workload
+on the GPU.
+
+l2fwd-nv is not intended to be used for performance measurements (testpmd is a better candidate for this).
+The goal is to show different use-cases about how a CUDA application can use DPDK to:
+
+- allocate memory on GPU device using gpudev library
+- use that memory to create an external GPU memory mempool
+- receive packets directly in GPU memory
+- coordinate the workload on the GPU with the network and CPU activity to receive packets
+- send modified packets directly from the GPU memory
diff --git a/doc/guides/gpus/index.rst b/doc/guides/gpus/index.rst
index 1878423239..4b7a420556 100644
--- a/doc/guides/gpus/index.rst
+++ b/doc/guides/gpus/index.rst
@@ -9,3 +9,4 @@ General-Purpose Graphics Processing Unit Drivers
    :numbered:
 
    overview
+   cuda
diff --git a/drivers/gpu/cuda/cuda.c b/drivers/gpu/cuda/cuda.c
new file mode 100644
index 0000000000..d3d57492db
--- /dev/null
+++ b/drivers/gpu/cuda/cuda.c
@@ -0,0 +1,801 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2021 NVIDIA Corporation & Affiliates
+ */
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_malloc.h>
+#include <rte_errno.h>
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_byteorder.h>
+#include <rte_dev.h>
+
+#include <gpudev_driver.h>
+#include <cuda.h>
+
+/* NVIDIA GPU vendor */
+#define NVIDIA_GPU_VENDOR_ID (0x10de)
+
+/* NVIDIA GPU device IDs */
+#define NVIDIA_GPU_A100_40GB_DEVICE_ID (0x20f1)
+#define NVIDIA_GPU_A100_80GB_DEVICE_ID (0x20b5)
+
+#define NVIDIA_GPU_A30_24GB_DEVICE_ID (0x20b7)
+#define NVIDIA_GPU_A10_24GB_DEVICE_ID (0x2236)
+
+#define NVIDIA_GPU_V100_32GB_DEVICE_ID (0x1db6)
+#define NVIDIA_GPU_V100_16GB_DEVICE_ID (0x1db4)
+
+#define CUDA_MAX_ALLOCATION_NUM 512
+
+#define GPU_PAGE_SHIFT 16
+#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
+
+static RTE_LOG_REGISTER_DEFAULT(cuda_logtype, NOTICE);
+
+/** Helper macro for logging */
+#define rte_gpu_cuda_log(level, fmt, ...) \
+	rte_log(RTE_LOG_ ## level, cuda_logtype, fmt "\n", ##__VA_ARGS__)
+
+#define rte_gpu_cuda_log_debug(fmt, ...) \
+	rte_gpu_cuda_log(DEBUG, RTE_STR(__LINE__) ":%s() " fmt, __func__, \
+		##__VA_ARGS__)
+
+/* NVIDIA GPU address map */
+static struct rte_pci_id pci_id_cuda_map[] = {
+	{
+		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
+				NVIDIA_GPU_A100_40GB_DEVICE_ID)
+	},
+	{
+		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
+				NVIDIA_GPU_V100_32GB_DEVICE_ID)
+	},
+	/* {.device_id = 0}, ?? */
+};
+
+/* Device private info */
+struct cuda_info {
+	char gpu_name[RTE_DEV_NAME_MAX_LEN];
+	CUdevice cu_dev;
+};
+
+/* Type of memory allocated by CUDA driver */
+enum mem_type {
+	GPU_MEM = 0,
+	CPU_REGISTERED,
+	GPU_REGISTERED /* Not used yet */
+};
+
+/* key associated to a memory address */
+typedef uintptr_t cuda_ptr_key;
+
+/* Single entry of the memory list */
+struct mem_entry {
+	CUdeviceptr ptr_d;
+	void *ptr_h;
+	size_t size;
+	struct rte_gpu *dev;
+	CUcontext ctx;
+	cuda_ptr_key pkey;
+	enum mem_type mtype;
+	struct mem_entry *prev;
+	struct mem_entry *next;
+};
+
+static struct mem_entry *mem_alloc_list_head;
+static struct mem_entry *mem_alloc_list_tail;
+static uint32_t mem_alloc_list_last_elem;
+
+/* Generate a key from a memory pointer */
+static cuda_ptr_key
+get_hash_from_ptr(void *ptr)
+{
+	return (uintptr_t) ptr;
+}
+
+static uint32_t
+mem_list_count_item(void)
+{
+	return mem_alloc_list_last_elem;
+}
+
+/* Initiate list of memory allocations if not done yet */
+static struct mem_entry *
+mem_list_add_item(void)
+{
+	/* Initiate list of memory allocations if not done yet */
+	if (mem_alloc_list_head == NULL) {
+		mem_alloc_list_head = rte_zmalloc(NULL,
+						sizeof(struct mem_entry),
+						RTE_CACHE_LINE_SIZE);
+		if (mem_alloc_list_head == NULL) {
+			rte_gpu_cuda_log(ERR, "Failed to allocate memory for memory list.\n");
+			return NULL;
+		}
+
+		mem_alloc_list_head->next = NULL;
+		mem_alloc_list_head->prev = NULL;
+		mem_alloc_list_tail = mem_alloc_list_head;
+	} else {
+		struct mem_entry *mem_alloc_list_cur = rte_zmalloc(NULL,
+								sizeof(struct mem_entry),
+								RTE_CACHE_LINE_SIZE);
+
+		if (mem_alloc_list_cur == NULL) {
+			rte_gpu_cuda_log(ERR, "Failed to allocate memory for memory list.\n");
+			return NULL;
+		}
+
+		mem_alloc_list_tail->next = mem_alloc_list_cur;
+		mem_alloc_list_cur->prev = mem_alloc_list_tail;
+		mem_alloc_list_tail = mem_alloc_list_tail->next;
+		mem_alloc_list_tail->next = NULL;
+	}
+
+	mem_alloc_list_last_elem++;
+
+	return mem_alloc_list_tail;
+}
+
+static struct mem_entry *
+mem_list_find_item(cuda_ptr_key pk)
+{
+	struct mem_entry *mem_alloc_list_cur = NULL;
+
+	if (mem_alloc_list_head == NULL) {
+		rte_gpu_cuda_log(ERR, "Memory list doesn't exist\n");
+		return NULL;
+	}
+
+	if (mem_list_count_item() == 0) {
+		rte_gpu_cuda_log(ERR, "No items in memory list\n");
+		return NULL;
+	}
+
+	mem_alloc_list_cur = mem_alloc_list_head;
+
+	while (mem_alloc_list_cur != NULL) {
+		if (mem_alloc_list_cur->pkey == pk)
+			return mem_alloc_list_cur;
+		mem_alloc_list_cur = mem_alloc_list_cur->next;
+	}
+
+	return mem_alloc_list_cur;
+}
+
+static int
+mem_list_del_item(cuda_ptr_key pk)
+{
+	struct mem_entry *mem_alloc_list_cur = NULL;
+
+	mem_alloc_list_cur = mem_list_find_item(pk);
+	if (mem_alloc_list_cur == NULL)
+		return -EINVAL;
+
+	/* if key is in head */
+	if (mem_alloc_list_cur->prev == NULL)
+		mem_alloc_list_head = mem_alloc_list_cur->next;
+	else {
+		mem_alloc_list_cur->prev->next = mem_alloc_list_cur->next;
+		if (mem_alloc_list_cur->next != NULL)
+			mem_alloc_list_cur->next->prev = mem_alloc_list_cur->prev;
+	}
+
+	rte_free(mem_alloc_list_cur);
+
+	mem_alloc_list_last_elem--;
+
+	return 0;
+}
+
+static int
+cuda_dev_info_get(struct rte_gpu *dev, struct rte_gpu_info *info)
+{
+	int ret = 0;
+	CUresult res;
+	struct rte_gpu_info parent_info;
+	CUexecAffinityParam affinityPrm;
+	const char *err_string;
+	struct cuda_info *private;
+	CUcontext current_ctx;
+	CUcontext input_ctx;
+
+	if (dev == NULL)
+		return -EINVAL;
+
+	/* Child initialization time probably called by rte_gpu_add_child() */
+	if (dev->mpshared->info.parent != RTE_GPU_ID_NONE && dev->mpshared->dev_private == NULL) {
+		/* Store current ctx */
+		res = cuCtxGetCurrent(&current_ctx);
+		if (res != CUDA_SUCCESS) {
+			cuGetErrorString(res, &(err_string));
+			rte_gpu_cuda_log(ERR, "cuCtxGetCurrent failed with %s.\n", err_string);
+
+			return -1;
+		}
+
+		/* Set child ctx as current ctx */
+		input_ctx = (CUcontext)dev->mpshared->info.context;
+		res = cuCtxSetCurrent(input_ctx);
+		if (res != CUDA_SUCCESS) {
+			cuGetErrorString(res, &(err_string));
+			rte_gpu_cuda_log(ERR, "cuCtxSetCurrent input failed with %s.\n", err_string);
+
+			return -1;
+		}
+
+		/*
+		 * Ctx capacity info
+		 */
+
+		/* MPS compatible */
+		res = cuCtxGetExecAffinity(&affinityPrm, CU_EXEC_AFFINITY_TYPE_SM_COUNT);
+		if (res != CUDA_SUCCESS) {
+			cuGetErrorString(res, &(err_string));
+			rte_gpu_cuda_log(ERR, "cuCtxGetExecAffinity failed with %s.\n", err_string);
+		}
+		dev->mpshared->info.processor_count = (uint32_t)affinityPrm.param.smCount.val;
+
+		ret = rte_gpu_info_get(dev->mpshared->info.parent, &parent_info);
+		if (ret)
+			return -ENODEV;
+		dev->mpshared->info.total_memory = parent_info.total_memory;
+
+		/*
+		 * GPU Device private info
+		 */
+		dev->mpshared->dev_private = rte_zmalloc(NULL,
+							sizeof(struct cuda_info),
+							RTE_CACHE_LINE_SIZE);
+		if (dev->mpshared->dev_private == NULL) {
+			rte_gpu_cuda_log(ERR, "Failed to allocate memory for GPU process private.\n");
+
+			return -1;
+		}
+
+		private = (struct cuda_info *)dev->mpshared->dev_private;
+
+		res = cuCtxGetDevice(&(private->cu_dev));
+		if (res != CUDA_SUCCESS) {
+			cuGetErrorString(res, &(err_string));
+			rte_gpu_cuda_log(ERR, "cuCtxGetDevice failed with %s.\n", err_string);
+
+			return -1;
+		}
+
+		res = cuDeviceGetName(private->gpu_name, RTE_DEV_NAME_MAX_LEN, private->cu_dev);
+		if (res != CUDA_SUCCESS) {
+			cuGetErrorString(res, &(err_string));
+			rte_gpu_cuda_log(ERR, "cuDeviceGetName failed with %s.\n", err_string);
+
+			return -1;
+		}
+
+		/* Restore original ctx as current ctx */
+		res = cuCtxSetCurrent(current_ctx);
+		if (res != CUDA_SUCCESS) {
+			cuGetErrorString(res, &(err_string));
+			rte_gpu_cuda_log(ERR, "cuCtxSetCurrent current failed with %s.\n", err_string);
+
+			return -1;
+		}
+	}
+
+	*info = dev->mpshared->info;
+
+	return 0;
+}
+
+/*
+ * GPU Memory
+ */
+
+static int
+cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
+{
+	CUresult res;
+	const char *err_string;
+	CUcontext current_ctx;
+	CUcontext input_ctx;
+	unsigned int flag = 1;
+
+	if (dev == NULL || size == 0)
+		return -EINVAL;
+
+	/* Store current ctx */
+	res = cuCtxGetCurrent(&current_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuCtxGetCurrent failed with %s.\n", err_string);
+
+		return -1;
+	}
+
+	/* Set child ctx as current ctx */
+	input_ctx = (CUcontext)dev->mpshared->info.context;
+	res = cuCtxSetCurrent(input_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuCtxSetCurrent input failed with %s.\n", err_string);
+
+		return -1;
+	}
+
+	/* Get next memory list item */
+	mem_alloc_list_tail = mem_list_add_item();
+	if (mem_alloc_list_tail == NULL)
+		return -ENOMEM;
+
+	/* Allocate memory */
+	mem_alloc_list_tail->size = size;
+	res = cuMemAlloc(&(mem_alloc_list_tail->ptr_d), mem_alloc_list_tail->size);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+				"cuCtxSetCurrent current failed with %s.\n",
+				err_string);
+
+		return -1;
+	}
+
+	/* GPUDirect RDMA attribute required */
+	res = cuPointerSetAttribute(&flag,
+					CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+					mem_alloc_list_tail->ptr_d);
+	if (res != CUDA_SUCCESS) {
+		rte_gpu_cuda_log(ERR,
+				"Could not set SYNC MEMOP attribute for GPU memory at %llx , err %d\n",
+				mem_alloc_list_tail->ptr_d, res);
+		return -1;
+	}
+
+	mem_alloc_list_tail->pkey = get_hash_from_ptr((void *) mem_alloc_list_tail->ptr_d);
+	mem_alloc_list_tail->ptr_h = NULL;
+	mem_alloc_list_tail->size = size;
+	mem_alloc_list_tail->dev = dev;
+	mem_alloc_list_tail->ctx = (CUcontext)dev->mpshared->info.context;
+	mem_alloc_list_tail->mtype = GPU_MEM;
+
+	/* Restore original ctx as current ctx */
+	res = cuCtxSetCurrent(current_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuCtxSetCurrent current failed with %s.\n", err_string);
+
+		return -1;
+	}
+
+	*ptr = (void *) mem_alloc_list_tail->ptr_d;
+
+	return 0;
+}
+
+static int
+cuda_mem_register(struct rte_gpu *dev, size_t size, void *ptr)
+{
+	CUresult res;
+	const char *err_string;
+	CUcontext current_ctx;
+	CUcontext input_ctx;
+	unsigned int flag = 1;
+	int use_ptr_h = 0;
+
+	if (dev == NULL || size == 0 || ptr == NULL)
+		return -EINVAL;
+
+	/* Store current ctx */
+	res = cuCtxGetCurrent(&current_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuCtxGetCurrent failed with %s.\n", err_string);
+
+		return -1;
+	}
+
+	/* Set child ctx as current ctx */
+	input_ctx = (CUcontext)dev->mpshared->info.context;
+	res = cuCtxSetCurrent(input_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuCtxSetCurrent input failed with %s.\n", err_string);
+
+		return -1;
+	}
+
+	/* Get next memory list item */
+	mem_alloc_list_tail = mem_list_add_item();
+	if (mem_alloc_list_tail == NULL)
+		return -ENOMEM;
+
+	/* Allocate memory */
+	mem_alloc_list_tail->size = size;
+	mem_alloc_list_tail->ptr_h = ptr;
+
+	res = cuMemHostRegister(mem_alloc_list_tail->ptr_h, mem_alloc_list_tail->size, CU_MEMHOSTREGISTER_PORTABLE | CU_MEMHOSTREGISTER_DEVICEMAP);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+				"cuMemHostRegister failed with %s ptr %p size %zd.\n",
+				err_string, mem_alloc_list_tail->ptr_h, mem_alloc_list_tail->size);
+
+		return -1;
+	}
+
+	res = cuDeviceGetAttribute(&(use_ptr_h),
+					CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM,
+					((struct cuda_info *)(dev->mpshared->dev_private))->cu_dev);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuDeviceGetAttribute failed with %s.\n",
+					err_string
+			);
+
+		return -1;
+	}
+
+	if (use_ptr_h == 0) {
+		res = cuMemHostGetDevicePointer(&(mem_alloc_list_tail->ptr_d),
+						mem_alloc_list_tail->ptr_h,
+						0);
+		if (res != CUDA_SUCCESS) {
+			cuGetErrorString(res, &(err_string));
+			rte_gpu_cuda_log(ERR,
+					"cuMemHostGetDevicePointer failed with %s.\n",
+					err_string);
+
+			return -1;
+		}
+
+		if ((uintptr_t) mem_alloc_list_tail->ptr_d != (uintptr_t) mem_alloc_list_tail->ptr_h) {
+			rte_gpu_cuda_log(ERR, "Host input pointer is different wrt GPU registered pointer\n");
+			return -1;
+		}
+	} else {
+		mem_alloc_list_tail->ptr_d = (CUdeviceptr) mem_alloc_list_tail->ptr_h;
+	}
+
+	/* GPUDirect RDMA attribute required */
+	res = cuPointerSetAttribute(&flag,
+					CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+					mem_alloc_list_tail->ptr_d);
+	if (res != CUDA_SUCCESS) {
+		rte_gpu_cuda_log(ERR,
+				"Could not set SYNC MEMOP attribute for GPU memory at %llx , err %d\n",
+				mem_alloc_list_tail->ptr_d, res);
+		return -1;
+	}
+
+	mem_alloc_list_tail->pkey = get_hash_from_ptr((void *) mem_alloc_list_tail->ptr_h);
+	mem_alloc_list_tail->size = size;
+	mem_alloc_list_tail->dev = dev;
+	mem_alloc_list_tail->ctx = (CUcontext)dev->mpshared->info.context;
+	mem_alloc_list_tail->mtype = CPU_REGISTERED;
+
+	/* Restore original ctx as current ctx */
+	res = cuCtxSetCurrent(current_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+				"cuCtxSetCurrent current failed with %s.\n",
+				err_string);
+
+		return -1;
+	}
+
+	return 0;
+}
+
+static int
+cuda_mem_free(struct rte_gpu *dev, void *ptr)
+{
+	CUresult res;
+	struct mem_entry *mem_item;
+	const char *err_string;
+	cuda_ptr_key hk;
+
+	if (dev == NULL || ptr == NULL)
+		return -EINVAL;
+
+	hk = get_hash_from_ptr((void *) ptr);
+
+	mem_item = mem_list_find_item(hk);
+	if (mem_item == NULL) {
+		rte_gpu_cuda_log(ERR, "Memory address 0x%p not found in driver memory\n", ptr);
+		return -1;
+	}
+
+	if (mem_item->mtype == GPU_MEM) {
+		res = cuMemFree(mem_item->ptr_d);
+		if (res != CUDA_SUCCESS) {
+			cuGetErrorString(res, &(err_string));
+			rte_gpu_cuda_log(ERR, "cuMemFree current failed with %s.\n", err_string);
+
+			return -1;
+		}
+
+		return mem_list_del_item(hk);
+	}
+
+	rte_gpu_cuda_log(ERR, "Memory type %d not supported\n", mem_item->mtype);
+	return -1;
+}
+
+static int
+cuda_mem_unregister(struct rte_gpu *dev, void *ptr)
+{
+	CUresult res;
+	struct mem_entry *mem_item;
+	const char *err_string;
+	cuda_ptr_key hk;
+
+	if (dev == NULL || ptr == NULL)
+		return -EINVAL;
+
+	hk = get_hash_from_ptr((void *) ptr);
+
+	mem_item = mem_list_find_item(hk);
+	if (mem_item == NULL) {
+		rte_gpu_cuda_log(ERR, "Memory address 0x%p not nd in driver memory\n", ptr);
+		return -1;
+	}
+
+	if (mem_item->mtype == CPU_REGISTERED) {
+		res = cuMemHostUnregister(ptr);
+		if (res != CUDA_SUCCESS) {
+			cuGetErrorString(res, &(err_string));
+			rte_gpu_cuda_log(ERR,
+					"cuMemHostUnregister current failed with %s.\n",
+					err_string);
+
+			return -1;
+		}
+
+		return mem_list_del_item(hk);
+	}
+
+	rte_gpu_cuda_log(ERR, "Memory type %d not supported\n", mem_item->mtype);
+	return -1;
+}
+
+static int
+cuda_dev_close(struct rte_gpu *dev)
+{
+	if (dev == NULL)
+		return -EINVAL;
+
+	rte_free(dev->mpshared->dev_private);
+
+	return 0;
+}
+
+static int
+cuda_wmb(struct rte_gpu *dev)
+{
+	CUresult res;
+	const char *err_string;
+	CUcontext current_ctx;
+	CUcontext input_ctx;
+
+	if (dev == NULL)
+		return -EINVAL;
+
+	/* Store current ctx */
+	res = cuCtxGetCurrent(&current_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuCtxGetCurrent failed with %s.\n", err_string);
+
+		return -1;
+	}
+
+	/* Set child ctx as current ctx */
+	input_ctx = (CUcontext)dev->mpshared->info.context;
+	res = cuCtxSetCurrent(input_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuCtxSetCurrent input failed with %s.\n", err_string);
+
+		return -1;
+	}
+
+	res = cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX, CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuFlushGPUDirectRDMAWrites current failed with %s.\n", err_string);
+
+		return -1;
+	}
+
+	/* Restore original ctx as current ctx */
+	res = cuCtxSetCurrent(current_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuCtxSetCurrent current failed with %s.\n", err_string);
+
+		return -1;
+	}
+
+	return 0;
+}
+
+static int
+cuda_gpu_probe(__rte_unused struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
+{
+	struct rte_gpu *dev = NULL;
+	CUresult res;
+	CUdevice cu_dev_id;
+	CUcontext pctx;
+	char dev_name[RTE_DEV_NAME_MAX_LEN];
+	const char *err_string;
+	int processor_count = 0;
+	struct cuda_info *private;
+
+	if (pci_dev == NULL) {
+		rte_gpu_cuda_log(ERR, "NULL PCI device");
+		return -EINVAL;
+	}
+
+	rte_pci_device_name(&pci_dev->addr, dev_name, sizeof(dev_name));
+
+	/* Allocate memory to be used privately by drivers */
+	dev = rte_gpu_allocate(pci_dev->device.name);
+	if (dev == NULL)
+		return -ENODEV;
+
+	/* Initialize values only for the first CUDA driver call */
+	if (dev->mpshared->info.dev_id == 0) {
+		mem_alloc_list_head = NULL;
+		mem_alloc_list_tail = NULL;
+		mem_alloc_list_last_elem = 0;
+	}
+
+	/* Fill HW specific part of device structure */
+	dev->device = &pci_dev->device;
+	dev->mpshared->info.numa_node = pci_dev->device.numa_node;
+
+	/*
+	 * GPU Device init
+	 */
+
+	/*
+	 * Required to initialize the CUDA Driver.
+	 * Multiple calls of cuInit() will return immediately
+	 * without making any relevant change
+	 */
+	cuInit(0);
+
+	/* Get NVIDIA GPU Device descriptor */
+	res = cuDeviceGetByPCIBusId(&cu_dev_id, dev->device->name);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+				"cuDeviceGetByPCIBusId name %s failed with %d: %s.\n",
+				dev->device->name, res, err_string);
+
+		return -1;
+	}
+
+	res = cuDevicePrimaryCtxRetain(&pctx, cu_dev_id);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+				"cuDevicePrimaryCtxRetain name %s failed with %d: %s.\n",
+				dev->device->name, res, err_string);
+
+		return -1;
+	}
+
+	dev->mpshared->info.context = (uint64_t) pctx;
+
+	/*
+	 * GPU Device generic info
+	 */
+
+	/* Processor count */
+	res = cuDeviceGetAttribute(&(processor_count),
+					CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
+					cu_dev_id);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+				"cuDeviceGetAttribute failed with %s.\n",
+				err_string);
+
+		return -1;
+	}
+	dev->mpshared->info.processor_count = (uint32_t)processor_count;
+
+	/* Total memory */
+	res = cuDeviceTotalMem(&dev->mpshared->info.total_memory, cu_dev_id);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+				"cuDeviceTotalMem failed with %s.\n",
+				err_string);
+
+		return -1;
+	}
+
+	/*
+	 * GPU Device private info
+	 */
+	dev->mpshared->dev_private = rte_zmalloc(NULL,
+						sizeof(struct cuda_info),
+						RTE_CACHE_LINE_SIZE);
+	if (dev->mpshared->dev_private == NULL) {
+		rte_gpu_cuda_log(ERR,
+				"Failed to allocate memory for GPU process private.\n");
+
+		return -1;
+	}
+
+	private = (struct cuda_info *)dev->mpshared->dev_private;
+	private->cu_dev = cu_dev_id;
+	res = cuDeviceGetName(private->gpu_name,
+				RTE_DEV_NAME_MAX_LEN,
+				cu_dev_id);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+				"cuDeviceGetName failed with %s.\n",
+				err_string);
+
+		return -1;
+	}
+
+	dev->ops.dev_info_get = cuda_dev_info_get;
+	dev->ops.dev_close = cuda_dev_close;
+	dev->ops.mem_alloc = cuda_mem_alloc;
+	dev->ops.mem_free = cuda_mem_free;
+	dev->ops.mem_register = cuda_mem_register;
+	dev->ops.mem_unregister = cuda_mem_unregister;
+	dev->ops.wmb = cuda_wmb;
+
+	rte_gpu_complete_new(dev);
+
+	rte_gpu_cuda_log_debug("dev id = %u name = %s\n", dev->mpshared->info.dev_id, private->gpu_name);
+
+	return 0;
+}
+
+static int
+cuda_gpu_remove(struct rte_pci_device *pci_dev)
+{
+	struct rte_gpu *dev;
+	int ret;
+	uint8_t gpu_id;
+
+	if (pci_dev == NULL)
+		return -EINVAL;
+
+	dev = rte_gpu_get_by_name(pci_dev->device.name);
+	if (dev == NULL) {
+		rte_gpu_cuda_log(ERR,
+				"Couldn't find HW dev \"%s\" to uninitialise it",
+				pci_dev->device.name);
+		return -ENODEV;
+	}
+	gpu_id = dev->mpshared->info.dev_id;
+
+	/* release dev from library */
+	ret = rte_gpu_release(dev);
+	if (ret)
+		rte_gpu_cuda_log(ERR, "Device %i failed to uninit: %i", gpu_id, ret);
+
+	rte_gpu_cuda_log_debug("Destroyed dev = %u", gpu_id);
+
+	return 0;
+}
+
+static struct rte_pci_driver rte_cuda_driver = {
+	.id_table = pci_id_cuda_map,
+	.drv_flags = RTE_PCI_DRV_WC_ACTIVATE,
+	.probe = cuda_gpu_probe,
+	.remove = cuda_gpu_remove,
+};
+
+RTE_PMD_REGISTER_PCI(gpu_cuda, rte_cuda_driver);
+RTE_PMD_REGISTER_PCI_TABLE(gpu_cuda, pci_id_cuda_map);
+RTE_PMD_REGISTER_KMOD_DEP(gpu_cuda, "* nvidia & (nv_peer_mem | nvpeer_mem)");
diff --git a/drivers/gpu/cuda/meson.build b/drivers/gpu/cuda/meson.build
new file mode 100644
index 0000000000..92b30c35b4
--- /dev/null
+++ b/drivers/gpu/cuda/meson.build
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright (c) 2021 NVIDIA Corporation & Affiliates
+
+if not is_linux
+        build = false
+        reason = 'only supported on Linux'
+endif
+
+cuda_dep = dependency('cuda', version : '>=11', modules: ['cuda'])
+ext_deps += cuda_dep
+
+deps += ['gpudev','pci','bus_pci', 'hash']
+sources = files('cuda.c')
diff --git a/drivers/gpu/cuda/version.map b/drivers/gpu/cuda/version.map
new file mode 100644
index 0000000000..4a76d1d52d
--- /dev/null
+++ b/drivers/gpu/cuda/version.map
@@ -0,0 +1,3 @@
+DPDK_21 {
+	local: *;
+};
diff --git a/drivers/gpu/meson.build b/drivers/gpu/meson.build
index e51ad3381b..601bedcd61 100644
--- a/drivers/gpu/meson.build
+++ b/drivers/gpu/meson.build
@@ -1,4 +1,4 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright (c) 2021 NVIDIA Corporation & Affiliates
 
-drivers = []
+drivers = [ 'cuda' ]
-- 
2.17.1
^ permalink raw reply related	[flat|nested] 28+ messages in thread
* [dpdk-dev] [PATCH v4 0/1] gpu/cuda: introduce CUDA driver
  2021-10-05 22:49 [dpdk-dev] [RFC PATCH] gpu/cuda: introduce CUDA driver eagostini
                   ` (2 preceding siblings ...)
  2021-11-09  2:28 ` [dpdk-dev] [PATCH v3 0/1] " eagostini
@ 2021-11-09  5:50 ` eagostini
  2021-11-09  5:50   ` [dpdk-dev] [PATCH v4 1/1] " eagostini
  2021-11-15 22:36 ` [PATCH v5 0/1] " eagostini
                   ` (2 subsequent siblings)
  6 siblings, 1 reply; 28+ messages in thread
From: eagostini @ 2021-11-09  5:50 UTC (permalink / raw)
  To: dev; +Cc: Elena Agostini
From: Elena Agostini <eagostini@nvidia.com>
This is the CUDA implementation of the gpudev library.
Functionalities implemented through CUDA Driver API are:
- Device probe and remove
- Manage device memory allocations
- Register/unregister external CPU memory in the device memory area
Changelog:
- CUDA driver implementation of the GPU write memory barrier
- Fixed styling reported by checkpatch
- CUDA driver is optional: not built if CUDA >= 11.4 not found
Elena Agostini (1):
  gpu/cuda: introduce CUDA driver
 doc/guides/gpus/cuda.rst     | 110 +++++
 doc/guides/gpus/index.rst    |   1 +
 drivers/gpu/cuda/cuda.c      | 813 +++++++++++++++++++++++++++++++++++
 drivers/gpu/cuda/meson.build |  20 +
 drivers/gpu/cuda/version.map |   3 +
 drivers/gpu/meson.build      |   2 +-
 6 files changed, 948 insertions(+), 1 deletion(-)
 create mode 100644 doc/guides/gpus/cuda.rst
 create mode 100644 drivers/gpu/cuda/cuda.c
 create mode 100644 drivers/gpu/cuda/meson.build
 create mode 100644 drivers/gpu/cuda/version.map
-- 
2.17.1
^ permalink raw reply	[flat|nested] 28+ messages in thread
* [dpdk-dev] [PATCH v4 1/1] gpu/cuda: introduce CUDA driver
  2021-11-09  5:50 ` [dpdk-dev] [PATCH v4 0/1] " eagostini
@ 2021-11-09  5:50   ` eagostini
  0 siblings, 0 replies; 28+ messages in thread
From: eagostini @ 2021-11-09  5:50 UTC (permalink / raw)
  To: dev; +Cc: Elena Agostini
From: Elena Agostini <eagostini@nvidia.com>
This is the CUDA implementation of the gpudev library.
Functionalities implemented through CUDA Driver API are:
- Device probe and remove
- Manage device memory allocations
- Register/unregister external CPU memory in the device memory area
Signed-off-by: Elena Agostini <eagostini@nvidia.com>
---
 doc/guides/gpus/cuda.rst     | 110 +++++
 doc/guides/gpus/index.rst    |   1 +
 drivers/gpu/cuda/cuda.c      | 813 +++++++++++++++++++++++++++++++++++
 drivers/gpu/cuda/meson.build |  20 +
 drivers/gpu/cuda/version.map |   3 +
 drivers/gpu/meson.build      |   2 +-
 6 files changed, 948 insertions(+), 1 deletion(-)
 create mode 100644 doc/guides/gpus/cuda.rst
 create mode 100644 drivers/gpu/cuda/cuda.c
 create mode 100644 drivers/gpu/cuda/meson.build
 create mode 100644 drivers/gpu/cuda/version.map
diff --git a/doc/guides/gpus/cuda.rst b/doc/guides/gpus/cuda.rst
new file mode 100644
index 0000000000..64a78bf1e1
--- /dev/null
+++ b/doc/guides/gpus/cuda.rst
@@ -0,0 +1,110 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright (c) 2021 NVIDIA Corporation & Affiliates
+
+CUDA GPU driver
+===============
+
+The CUDA GPU driver library (**librte_gpu_cuda**) provides support for NVIDIA GPUs.
+Information and documentation about these devices can be found on the
+`NVIDIA website <http://www.nvidia.com>`__. Help is also provided by the
+`NVIDIA CUDA Toolkit developer zone <https://docs.nvidia.com/cuda>`__.
+
+Design
+------
+
+**librte_gpu_cuda** relies on CUDA Driver API (no need for CUDA Runtime API).
+
+The goal of this driver library is not to provide a wrapper for the whole CUDA Driver API.
+Instead, the scope is to implement the generic features of gpudev API.
+For a CUDA application, integrating the gpudev library functions using the CUDA driver library
+is quite straightforward and doesn't create any compatibility problem.
+
+Initialization
+~~~~~~~~~~~~~~
+
+During initialization, CUDA driver library detects NVIDIA physical GPUs on the
+system or specified via EAL device options (e.g. ``-a b6:00.0``).
+The driver initializes the CUDA driver environment through ``cuInit(0)`` function.
+For this reason, it's required to set any CUDA environment configuration before
+calling ``rte_eal_init`` function in the DPDK application.
+
+If the CUDA driver environment has been already initialized, the ``cuInit(0)``
+in CUDA driver library has no effect.
+
+CUDA Driver sub-contexts
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+After initialization, a CUDA application can create multiple sub-contexts on GPU
+physical devices. Through the gpudev library, it is possible to register these sub-contexts
+in the CUDA driver library as child devices whose parent is a GPU physical device.
+
+CUDA driver library also supports `MPS <https://docs.nvidia.com/deploy/pdf/CUDA_Multi_Process_Service_Overview.pdf>`__.
+
+GPU memory management
+~~~~~~~~~~~~~~~~~~~~~
+
+The CUDA driver library maintains a table of the GPU memory addresses allocated
+and the CPU memory addresses registered, each associated with the input CUDA context.
+Whenever the application tries to deallocate or deregister a memory address
+that is not in the table, the CUDA driver library returns an error.
+
+Features
+--------
+
+- Register new child devices aka new CUDA Driver contexts
+- Allocate memory on the GPU
+- Register CPU memory to make it visible from GPU
+
+Minimal requirements
+--------------------
+
+Minimal requirements to enable the CUDA driver library are:
+
+- NVIDIA GPU Ampere or Volta
+- CUDA 11.4 Driver API or newer
+
+`GPUDirect RDMA Technology <https://docs.nvidia.com/cuda/gpudirect-rdma/index.html>`__
+allows compatible network cards (e.g. Mellanox) to directly send and receive packets
+using GPU memory instead of additional memory copies through the CPU system memory.
+To enable this technology, system requirements are:
+
+- `nvidia-peermem <https://docs.nvidia.com/cuda/gpudirect-rdma/index.html#nvidia-peermem>`__ module running on the system
+- Mellanox Network card ConnectX-5 or newer (BlueField models included)
+- DPDK mlx5 PMD enabled
+- To reach the best performance, a PCIe switch between GPU and NIC is recommended
+
+Limitations
+-----------
+
+Supported only on Linux.
+
+Supported GPUs
+--------------
+
+The following NVIDIA GPU devices are supported by this CUDA driver:
+
+- NVIDIA A100 80GB PCIe
+- NVIDIA A100 40GB PCIe
+- NVIDIA A30 24GB
+- NVIDIA A10 24GB
+- NVIDIA V100 32GB PCIe
+- NVIDIA V100 16GB PCIe
+
+External references
+-------------------
+
+A good example of how to use the GPU CUDA driver through the gpudev library
+is the l2fwd-nv application that can be found `here <https://github.com/NVIDIA/l2fwd-nv>`__.
+
+The application is based on vanilla DPDK example l2fwd and it's enhanced with GPU memory
+managed through gpudev library and CUDA to launch the swap of packets' MAC addresses workload
+on the GPU.
+
+l2fwd-nv is not intended for performance evaluation (testpmd is a good candidate for that).
+The goal is to show different use-cases about how a CUDA application can use DPDK to:
+
+- allocate memory on GPU device using gpudev library
+- use that memory to create an external GPU memory mempool
+- receive packets directly in GPU memory
+- coordinate the workload on the GPU with the network and CPU activity to receive packets
+- send modified packets directly from the GPU memory
diff --git a/doc/guides/gpus/index.rst b/doc/guides/gpus/index.rst
index 1878423239..4b7a420556 100644
--- a/doc/guides/gpus/index.rst
+++ b/doc/guides/gpus/index.rst
@@ -9,3 +9,4 @@ General-Purpose Graphics Processing Unit Drivers
    :numbered:
 
    overview
+   cuda
diff --git a/drivers/gpu/cuda/cuda.c b/drivers/gpu/cuda/cuda.c
new file mode 100644
index 0000000000..5108785bb7
--- /dev/null
+++ b/drivers/gpu/cuda/cuda.c
@@ -0,0 +1,813 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2021 NVIDIA Corporation & Affiliates
+ */
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_malloc.h>
+#include <rte_errno.h>
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_byteorder.h>
+#include <rte_dev.h>
+
+#include <gpudev_driver.h>
+#include <cuda.h>
+
+/* NVIDIA GPU vendor: PCI vendor ID matched by the driver's ID table */
+#define NVIDIA_GPU_VENDOR_ID (0x10de)
+
+/* NVIDIA GPU device IDs: PCI device IDs of the GPU models named in each macro */
+#define NVIDIA_GPU_A100_40GB_DEVICE_ID (0x20f1)
+#define NVIDIA_GPU_A100_80GB_DEVICE_ID (0x20b5)
+
+#define NVIDIA_GPU_A30_24GB_DEVICE_ID (0x20b7)
+#define NVIDIA_GPU_A10_24GB_DEVICE_ID (0x2236)
+
+#define NVIDIA_GPU_V100_32GB_DEVICE_ID (0x1db6)
+#define NVIDIA_GPU_V100_16GB_DEVICE_ID (0x1db4)
+
+/* NOTE(review): no code in this file references this limit - confirm it is still needed */
+#define CUDA_MAX_ALLOCATION_NUM 512
+
+/* GPU page granularity: 1 << 16 bytes (64 KiB) */
+#define GPU_PAGE_SHIFT 16
+#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
+
+/* Log type used by every message of this driver; default level is NOTICE */
+static RTE_LOG_REGISTER_DEFAULT(cuda_logtype, NOTICE);
+
+/**
+ * Helper macro for logging.
+ * Emits fmt at RTE_LOG_<level> on cuda_logtype. A "\n" is appended to fmt
+ * here, so a format string that already ends with "\n" prints an empty line.
+ */
+#define rte_gpu_cuda_log(level, fmt, ...) \
+	rte_log(RTE_LOG_ ## level, cuda_logtype, fmt "\n", ##__VA_ARGS__)
+
+/* DEBUG-level variant prefixing the message with "<line>:<function>() " */
+#define rte_gpu_cuda_log_debug(fmt, ...) \
+	rte_gpu_cuda_log(DEBUG, RTE_STR(__LINE__) ":%s() " fmt, __func__, \
+		##__VA_ARGS__)
+
+/*
+ * NVIDIA GPU address map: PCI IDs this driver binds to.
+ * One entry per supported GPU model, closed by an all-zero entry.
+ */
+static const struct rte_pci_id pci_id_cuda_map[] = {
+	{
+		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
+				NVIDIA_GPU_A100_40GB_DEVICE_ID)
+	},
+	{
+		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
+				NVIDIA_GPU_A100_80GB_DEVICE_ID)
+	},
+	{
+		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
+				NVIDIA_GPU_A30_24GB_DEVICE_ID)
+	},
+	{
+		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
+				NVIDIA_GPU_A10_24GB_DEVICE_ID)
+	},
+	{
+		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
+				NVIDIA_GPU_V100_32GB_DEVICE_ID)
+	},
+	{
+		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
+				NVIDIA_GPU_V100_16GB_DEVICE_ID)
+	},
+	/*
+	 * Mandatory terminator: the PCI bus walks the table until it meets
+	 * an entry whose vendor_id is 0. Without it the scan reads past the
+	 * end of the array.
+	 */
+	{ .vendor_id = 0 },
+};
+
+/* Device private info, stored in dev->mpshared->dev_private */
+struct cuda_info {
+	char gpu_name[RTE_DEV_NAME_MAX_LEN]; /* name reported by cuDeviceGetName() */
+	CUdevice cu_dev; /* CUDA device handle backing this gpudev device */
+};
+
+/* Type of memory allocated by CUDA driver; recorded in mem_entry.mtype */
+enum mem_type {
+	GPU_MEM = 0, /* device memory obtained by cuda_mem_alloc() */
+	CPU_REGISTERED, /* host memory registered by cuda_mem_register() */
+	GPU_REGISTERED /* Not used yet */
+};
+
+/* key associated to a memory address (the address value itself, see get_hash_from_ptr()) */
+typedef uintptr_t cuda_ptr_key;
+
+/* Single entry of the memory list: one tracked allocation or registration */
+struct mem_entry {
+	CUdeviceptr ptr_d; /* address as seen by the GPU */
+	void *ptr_h; /* host address; NULL for GPU_MEM entries */
+	size_t size; /* size in bytes requested by the caller */
+	struct rte_gpu *dev; /* gpudev device the memory belongs to */
+	CUcontext ctx; /* CUDA context taken from dev->mpshared->info.context */
+	cuda_ptr_key pkey; /* lookup key used by mem_list_find_item() */
+	enum mem_type mtype; /* kind of memory, see enum mem_type */
+	struct mem_entry *prev; /* previous entry, NULL for the head */
+	struct mem_entry *next; /* next entry, NULL for the tail */
+};
+
+/* Doubly linked list of tracked memory; a single list is shared by all devices */
+static struct mem_entry *mem_alloc_list_head;
+static struct mem_entry *mem_alloc_list_tail;
+/* Number of entries in the list: incremented on add, decremented on delete */
+static uint32_t mem_alloc_list_last_elem;
+
+/* Build the list lookup key of an address: the pointer value, unchanged */
+static cuda_ptr_key
+get_hash_from_ptr(void *ptr)
+{
+	cuda_ptr_key key = (uintptr_t)ptr;
+
+	return key;
+}
+
+/* Report how many entries the memory list currently holds */
+static uint32_t
+mem_list_count_item(void)
+{
+	const uint32_t nb_items = mem_alloc_list_last_elem;
+
+	return nb_items;
+}
+
+/*
+ * Append a new zeroed entry at the tail of the memory list, creating the
+ * list when it is empty.
+ * Returns the new tail entry, or NULL when the allocation fails.
+ */
+static struct mem_entry *
+mem_list_add_item(void)
+{
+	struct mem_entry *entry;
+
+	entry = rte_zmalloc(NULL, sizeof(struct mem_entry), RTE_CACHE_LINE_SIZE);
+	if (entry == NULL) {
+		rte_gpu_cuda_log(ERR, "Failed to allocate memory for memory list.\n");
+		return NULL;
+	}
+
+	if (mem_alloc_list_head == NULL) {
+		/* First entry: it is both head and tail */
+		entry->prev = NULL;
+		mem_alloc_list_head = entry;
+	} else {
+		/* Chain the entry after the current tail */
+		entry->prev = mem_alloc_list_tail;
+		mem_alloc_list_tail->next = entry;
+	}
+
+	entry->next = NULL;
+	mem_alloc_list_tail = entry;
+	mem_alloc_list_last_elem++;
+
+	return mem_alloc_list_tail;
+}
+
+/*
+ * Look up the entry whose key equals pk.
+ * Returns the entry, or NULL when the list is empty or holds no such key
+ * (an error is logged only in the empty cases).
+ */
+static struct mem_entry *
+mem_list_find_item(cuda_ptr_key pk)
+{
+	struct mem_entry *entry;
+
+	if (mem_alloc_list_head == NULL) {
+		rte_gpu_cuda_log(ERR, "Memory list doesn't exist\n");
+		return NULL;
+	}
+
+	if (mem_list_count_item() == 0) {
+		rte_gpu_cuda_log(ERR, "No items in memory list\n");
+		return NULL;
+	}
+
+	/* Walk from the head; entry ends up NULL when nothing matches */
+	for (entry = mem_alloc_list_head; entry != NULL; entry = entry->next) {
+		if (entry->pkey == pk)
+			break;
+	}
+
+	return entry;
+}
+
+/*
+ * Unlink and free the entry whose key is pk.
+ * Both neighbours and both list ends are updated, so neither
+ * mem_alloc_list_head nor mem_alloc_list_tail is left pointing at the
+ * freed entry.
+ * Returns 0 on success, -EINVAL when no entry has that key.
+ */
+static int
+mem_list_del_item(cuda_ptr_key pk)
+{
+	struct mem_entry *mem_alloc_list_cur = NULL;
+
+	mem_alloc_list_cur = mem_list_find_item(pk);
+	if (mem_alloc_list_cur == NULL)
+		return -EINVAL;
+
+	/* Detach from the predecessor, or advance the head */
+	if (mem_alloc_list_cur->prev == NULL)
+		mem_alloc_list_head = mem_alloc_list_cur->next;
+	else
+		mem_alloc_list_cur->prev->next = mem_alloc_list_cur->next;
+
+	/* Detach from the successor, or pull the tail back */
+	if (mem_alloc_list_cur->next == NULL)
+		mem_alloc_list_tail = mem_alloc_list_cur->prev;
+	else
+		mem_alloc_list_cur->next->prev = mem_alloc_list_cur->prev;
+
+	rte_free(mem_alloc_list_cur);
+
+	mem_alloc_list_last_elem--;
+
+	return 0;
+}
+
+/*
+ * gpudev dev_info_get callback.
+ *
+ * For a child device (info.parent set) whose private data does not exist
+ * yet, the child CUDA context is made current to read its SM count, device
+ * handle and name, and total_memory is copied from the parent. The private
+ * data is published only once completely filled. The context that was
+ * current on entry is restored on every path that changed it.
+ *
+ * Returns 0 on success and copies the device info into *info,
+ * -EINVAL on NULL arguments, -ENODEV when the parent info cannot be read,
+ * -1 on any CUDA or allocation failure.
+ */
+static int
+cuda_dev_info_get(struct rte_gpu *dev, struct rte_gpu_info *info)
+{
+	int ret = 0;
+	CUresult res;
+	struct rte_gpu_info parent_info;
+	CUexecAffinityParam affinityPrm;
+	const char *err_string;
+	struct cuda_info *private = NULL;
+	CUcontext current_ctx;
+	CUcontext input_ctx;
+
+	if (dev == NULL || info == NULL)
+		return -EINVAL;
+
+	/* Child initialization time probably called by rte_gpu_add_child() */
+	if (dev->mpshared->info.parent != RTE_GPU_ID_NONE &&
+			dev->mpshared->dev_private == NULL) {
+		/* Store current ctx */
+		res = cuCtxGetCurrent(&current_ctx);
+		if (res != CUDA_SUCCESS) {
+			cuGetErrorString(res, &(err_string));
+			rte_gpu_cuda_log(ERR, "cuCtxGetCurrent failed with %s.", err_string);
+
+			return -1;
+		}
+
+		/* Set child ctx as current ctx */
+		input_ctx = (CUcontext)dev->mpshared->info.context;
+		res = cuCtxSetCurrent(input_ctx);
+		if (res != CUDA_SUCCESS) {
+			cuGetErrorString(res, &(err_string));
+			rte_gpu_cuda_log(ERR,
+					"cuCtxSetCurrent input failed with %s.",
+					err_string);
+
+			return -1;
+		}
+
+		/*
+		 * Ctx capacity info
+		 */
+
+		/* MPS compatible */
+		res = cuCtxGetExecAffinity(&affinityPrm, CU_EXEC_AFFINITY_TYPE_SM_COUNT);
+		if (res != CUDA_SUCCESS) {
+			cuGetErrorString(res, &(err_string));
+			rte_gpu_cuda_log(ERR, "cuCtxGetExecAffinity failed with %s.", err_string);
+
+			/* affinityPrm was not filled: do not read it */
+			ret = -1;
+			goto restore_ctx;
+		}
+		dev->mpshared->info.processor_count = (uint32_t)affinityPrm.param.smCount.val;
+
+		ret = rte_gpu_info_get(dev->mpshared->info.parent, &parent_info);
+		if (ret) {
+			ret = -ENODEV;
+			goto restore_ctx;
+		}
+		dev->mpshared->info.total_memory = parent_info.total_memory;
+
+		/*
+		 * GPU Device private info
+		 */
+		private = rte_zmalloc(NULL,
+					sizeof(struct cuda_info),
+					RTE_CACHE_LINE_SIZE);
+		if (private == NULL) {
+			rte_gpu_cuda_log(ERR, "Failed to allocate memory for GPU process private.");
+
+			ret = -1;
+			goto restore_ctx;
+		}
+
+		res = cuCtxGetDevice(&(private->cu_dev));
+		if (res != CUDA_SUCCESS) {
+			cuGetErrorString(res, &(err_string));
+			rte_gpu_cuda_log(ERR, "cuCtxGetDevice failed with %s.", err_string);
+
+			ret = -1;
+			goto free_private;
+		}
+
+		res = cuDeviceGetName(private->gpu_name, RTE_DEV_NAME_MAX_LEN, private->cu_dev);
+		if (res != CUDA_SUCCESS) {
+			cuGetErrorString(res, &(err_string));
+			rte_gpu_cuda_log(ERR, "cuDeviceGetName failed with %s.", err_string);
+
+			ret = -1;
+			goto free_private;
+		}
+
+		/* Publish only fully initialized private data */
+		dev->mpshared->dev_private = private;
+
+		/* Restore original ctx as current ctx */
+		res = cuCtxSetCurrent(current_ctx);
+		if (res != CUDA_SUCCESS) {
+			cuGetErrorString(res, &(err_string));
+			rte_gpu_cuda_log(ERR,
+					"cuCtxSetCurrent current failed with %s.",
+					err_string);
+
+			return -1;
+		}
+	}
+
+	*info = dev->mpshared->info;
+
+	return 0;
+
+free_private:
+	rte_free(private);
+restore_ctx:
+	/* Best effort: the original failure is what gets reported */
+	cuCtxSetCurrent(current_ctx);
+	return ret;
+}
+
+/*
+ * GPU Memory
+ */
+
+/*
+ * gpudev mem_alloc callback: allocate size bytes of GPU memory in the CUDA
+ * context of dev and track the allocation in the memory list.
+ *
+ * The list entry is created last, once every CUDA call has succeeded, so a
+ * failure never leaves a half-initialized entry in the list nor leaks the
+ * GPU memory. The context that was current on entry is restored before
+ * returning.
+ *
+ * Returns 0 and stores the device address in *ptr on success,
+ * -EINVAL on invalid arguments, -ENOMEM when the list entry cannot be
+ * allocated, -1 on any CUDA failure.
+ */
+static int
+cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
+{
+	CUresult res;
+	const char *err_string;
+	CUcontext current_ctx;
+	CUcontext input_ctx;
+	CUdeviceptr ptr_d = 0;
+	struct mem_entry *entry;
+	unsigned int flag = 1;
+
+	if (dev == NULL || size == 0 || ptr == NULL)
+		return -EINVAL;
+
+	/* Store current ctx */
+	res = cuCtxGetCurrent(&current_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuCtxGetCurrent failed with %s.", err_string);
+
+		return -1;
+	}
+
+	/* Set child ctx as current ctx */
+	input_ctx = (CUcontext)dev->mpshared->info.context;
+	res = cuCtxSetCurrent(input_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuCtxSetCurrent input failed with %s.", err_string);
+
+		return -1;
+	}
+
+	/* Allocate memory */
+	res = cuMemAlloc(&ptr_d, size);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuMemAlloc failed with %s.", err_string);
+
+		cuCtxSetCurrent(current_ctx);
+		return -1;
+	}
+
+	/* GPUDirect RDMA attribute required */
+	res = cuPointerSetAttribute(&flag,
+					CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+					ptr_d);
+	if (res != CUDA_SUCCESS) {
+		rte_gpu_cuda_log(ERR,
+				"Could not set SYNC MEMOP attribute for GPU memory at %p, err %d",
+				(void *)(uintptr_t)ptr_d, (int)res);
+
+		cuMemFree(ptr_d);
+		cuCtxSetCurrent(current_ctx);
+		return -1;
+	}
+
+	/* Restore original ctx as current ctx */
+	res = cuCtxSetCurrent(current_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuCtxSetCurrent current failed with %s.", err_string);
+
+		cuMemFree(ptr_d);
+		return -1;
+	}
+
+	/* Track the allocation only now that it is fully set up */
+	entry = mem_list_add_item();
+	if (entry == NULL) {
+		cuMemFree(ptr_d);
+		return -ENOMEM;
+	}
+
+	entry->ptr_d = ptr_d;
+	entry->ptr_h = NULL;
+	entry->size = size;
+	entry->dev = dev;
+	entry->ctx = (CUcontext)dev->mpshared->info.context;
+	entry->pkey = get_hash_from_ptr((void *)(uintptr_t)ptr_d);
+	entry->mtype = GPU_MEM;
+
+	*ptr = (void *)(uintptr_t)ptr_d;
+
+	return 0;
+}
+
+/*
+ * gpudev mem_register callback: register the host range [ptr, ptr + size)
+ * with CUDA in the context of dev and track it in the memory list.
+ *
+ * The registration is accepted only when the GPU can use the very same
+ * address as the CPU. The list entry is created last, and every failure
+ * after cuMemHostRegister() unregisters the range again. The context that
+ * was current on entry is restored before returning.
+ *
+ * Returns 0 on success, -EINVAL on invalid arguments, -ENOMEM when the list
+ * entry cannot be allocated, -1 on any CUDA failure.
+ */
+static int
+cuda_mem_register(struct rte_gpu *dev, size_t size, void *ptr)
+{
+	CUresult res;
+	const char *err_string;
+	CUcontext current_ctx;
+	CUcontext input_ctx;
+	CUdeviceptr ptr_d = 0;
+	struct mem_entry *entry;
+	unsigned int flag = 1;
+	int use_ptr_h = 0;
+
+	if (dev == NULL || size == 0 || ptr == NULL)
+		return -EINVAL;
+
+	/* Store current ctx */
+	res = cuCtxGetCurrent(&current_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuCtxGetCurrent failed with %s.", err_string);
+
+		return -1;
+	}
+
+	/* Set child ctx as current ctx */
+	input_ctx = (CUcontext)dev->mpshared->info.context;
+	res = cuCtxSetCurrent(input_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuCtxSetCurrent input failed with %s.", err_string);
+
+		return -1;
+	}
+
+	/* Register the host memory */
+	res = cuMemHostRegister(ptr,
+				size,
+				CU_MEMHOSTREGISTER_PORTABLE | CU_MEMHOSTREGISTER_DEVICEMAP);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+				"cuMemHostRegister failed with %s ptr %p size %zu.",
+				err_string, ptr, size);
+
+		cuCtxSetCurrent(current_ctx);
+		return -1;
+	}
+
+	res = cuDeviceGetAttribute(&(use_ptr_h),
+					CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM,
+					((struct cuda_info *)(dev->mpshared->dev_private))->cu_dev);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuDeviceGetAttribute failed with %s.", err_string);
+
+		goto error;
+	}
+
+	if (use_ptr_h == 0) {
+		/* Ask CUDA for the device view and insist that it matches */
+		res = cuMemHostGetDevicePointer(&ptr_d, ptr, 0);
+		if (res != CUDA_SUCCESS) {
+			cuGetErrorString(res, &(err_string));
+			rte_gpu_cuda_log(ERR,
+					"cuMemHostGetDevicePointer failed with %s.",
+					err_string);
+
+			goto error;
+		}
+
+		if ((uintptr_t)ptr_d != (uintptr_t)ptr) {
+			rte_gpu_cuda_log(ERR,
+					"Host input pointer is different wrt GPU registered pointer");
+			goto error;
+		}
+	} else {
+		ptr_d = (CUdeviceptr)(uintptr_t)ptr;
+	}
+
+	/* GPUDirect RDMA attribute required */
+	res = cuPointerSetAttribute(&flag,
+					CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+					ptr_d);
+	if (res != CUDA_SUCCESS) {
+		rte_gpu_cuda_log(ERR,
+				"Could not set SYNC MEMOP attribute for GPU memory at %p, err %d",
+				(void *)(uintptr_t)ptr_d, (int)res);
+		goto error;
+	}
+
+	/* Restore original ctx as current ctx */
+	res = cuCtxSetCurrent(current_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+				"cuCtxSetCurrent current failed with %s.",
+				err_string);
+
+		cuMemHostUnregister(ptr);
+		return -1;
+	}
+
+	/* Track the registration only now that it is fully set up */
+	entry = mem_list_add_item();
+	if (entry == NULL) {
+		cuMemHostUnregister(ptr);
+		return -ENOMEM;
+	}
+
+	entry->ptr_d = ptr_d;
+	entry->ptr_h = ptr;
+	entry->size = size;
+	entry->dev = dev;
+	entry->ctx = (CUcontext)dev->mpshared->info.context;
+	entry->pkey = get_hash_from_ptr(ptr);
+	entry->mtype = CPU_REGISTERED;
+
+	return 0;
+
+error:
+	/* Undo the registration, then give the caller its context back */
+	cuMemHostUnregister(ptr);
+	cuCtxSetCurrent(current_ctx);
+	return -1;
+}
+
+/*
+ * gpudev mem_free callback: release GPU memory previously returned by
+ * cuda_mem_alloc() and drop its entry from the memory list.
+ *
+ * Returns 0 on success, -EINVAL on NULL arguments (or when the entry
+ * cannot be removed from the list), -1 when ptr is not tracked, is not
+ * GPU memory, or cuMemFree() fails.
+ */
+static int
+cuda_mem_free(struct rte_gpu *dev, void *ptr)
+{
+	CUresult res;
+	struct mem_entry *mem_item;
+	const char *err_string;
+	cuda_ptr_key hk;
+
+	if (dev == NULL || ptr == NULL)
+		return -EINVAL;
+
+	hk = get_hash_from_ptr((void *) ptr);
+
+	mem_item = mem_list_find_item(hk);
+	if (mem_item == NULL) {
+		/* %p already prints its own prefix; the log macro adds the newline */
+		rte_gpu_cuda_log(ERR, "Memory address %p not found in driver memory", ptr);
+		return -1;
+	}
+
+	if (mem_item->mtype == GPU_MEM) {
+		res = cuMemFree(mem_item->ptr_d);
+		if (res != CUDA_SUCCESS) {
+			cuGetErrorString(res, &(err_string));
+			rte_gpu_cuda_log(ERR, "cuMemFree current failed with %s.", err_string);
+
+			return -1;
+		}
+
+		return mem_list_del_item(hk);
+	}
+
+	rte_gpu_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
+	return -1;
+}
+
+/*
+ * gpudev mem_unregister callback: unregister host memory previously
+ * registered by cuda_mem_register() and drop its entry from the memory list.
+ *
+ * Returns 0 on success, -EINVAL on NULL arguments (or when the entry
+ * cannot be removed from the list), -1 when ptr is not tracked, is not
+ * registered CPU memory, or cuMemHostUnregister() fails.
+ */
+static int
+cuda_mem_unregister(struct rte_gpu *dev, void *ptr)
+{
+	CUresult res;
+	struct mem_entry *mem_item;
+	const char *err_string;
+	cuda_ptr_key hk;
+
+	if (dev == NULL || ptr == NULL)
+		return -EINVAL;
+
+	hk = get_hash_from_ptr((void *) ptr);
+
+	mem_item = mem_list_find_item(hk);
+	if (mem_item == NULL) {
+		/* %p already prints its own prefix; the log macro adds the newline */
+		rte_gpu_cuda_log(ERR, "Memory address %p not found in driver memory", ptr);
+		return -1;
+	}
+
+	if (mem_item->mtype == CPU_REGISTERED) {
+		res = cuMemHostUnregister(ptr);
+		if (res != CUDA_SUCCESS) {
+			cuGetErrorString(res, &(err_string));
+			rte_gpu_cuda_log(ERR,
+					"cuMemHostUnregister current failed with %s.",
+					err_string);
+
+			return -1;
+		}
+
+		return mem_list_del_item(hk);
+	}
+
+	rte_gpu_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
+	return -1;
+}
+
+/*
+ * gpudev dev_close callback: free the driver private data of dev.
+ * Returns 0 on success, -EINVAL when dev is NULL.
+ */
+static int
+cuda_dev_close(struct rte_gpu *dev)
+{
+	if (dev != NULL) {
+		rte_free(dev->mpshared->dev_private);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * gpudev wmb callback: flush GPUDirect RDMA writes for the CUDA context of
+ * dev. The context that was current on entry is restored on every path
+ * that changed it.
+ *
+ * Returns 0 on success, -EINVAL when dev is NULL, -1 on any CUDA failure.
+ */
+static int
+cuda_wmb(struct rte_gpu *dev)
+{
+	CUresult res;
+	const char *err_string;
+	CUcontext current_ctx;
+	CUcontext input_ctx;
+
+	if (dev == NULL)
+		return -EINVAL;
+
+	/* Store current ctx */
+	res = cuCtxGetCurrent(&current_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuCtxGetCurrent failed with %s.", err_string);
+
+		return -1;
+	}
+
+	/* Set child ctx as current ctx */
+	input_ctx = (CUcontext)dev->mpshared->info.context;
+	res = cuCtxSetCurrent(input_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuCtxSetCurrent input failed with %s.", err_string);
+
+		return -1;
+	}
+
+	res = cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
+					CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+				"cuFlushGPUDirectRDMAWrites current failed with %s.",
+				err_string);
+
+		/* Do not leave the caller running in the device context */
+		cuCtxSetCurrent(current_ctx);
+		return -1;
+	}
+
+	/* Restore original ctx as current ctx */
+	res = cuCtxSetCurrent(current_ctx);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+				"cuCtxSetCurrent current failed with %s.",
+				err_string);
+
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * PCI probe callback: create the gpudev device of one NVIDIA GPU.
+ *
+ * Allocates the gpudev device named after the PCI device, initializes the
+ * CUDA driver, retains the primary context of the GPU, fills the generic
+ * and private device info, installs the driver callbacks and announces the
+ * device with rte_gpu_complete_new().
+ *
+ * Returns 0 on success, -EINVAL when pci_dev is NULL, -ENODEV when the
+ * gpudev device cannot be allocated, -1 on any CUDA or allocation failure.
+ * After the primary context has been retained, every failure releases it.
+ *
+ * NOTE(review): the device obtained from rte_gpu_allocate() is not handed
+ * back on the error paths - confirm whether rte_gpu_release() may be called
+ * before rte_gpu_complete_new().
+ */
+static int
+cuda_gpu_probe(__rte_unused struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
+{
+	struct rte_gpu *dev = NULL;
+	CUresult res;
+	CUdevice cu_dev_id;
+	CUcontext pctx;
+	const char *err_string;
+	int processor_count = 0;
+	struct cuda_info *private;
+
+	if (pci_dev == NULL) {
+		rte_gpu_cuda_log(ERR, "NULL PCI device");
+		return -EINVAL;
+	}
+
+	/* Allocate memory to be used privately by drivers */
+	dev = rte_gpu_allocate(pci_dev->device.name);
+	if (dev == NULL)
+		return -ENODEV;
+
+	/* Initialize values only for the first CUDA driver call */
+	if (dev->mpshared->info.dev_id == 0) {
+		mem_alloc_list_head = NULL;
+		mem_alloc_list_tail = NULL;
+		mem_alloc_list_last_elem = 0;
+	}
+
+	/* Fill HW specific part of device structure */
+	dev->device = &pci_dev->device;
+	dev->mpshared->info.numa_node = pci_dev->device.numa_node;
+
+	/*
+	 * GPU Device init
+	 */
+
+	/*
+	 * Required to initialize the CUDA Driver.
+	 * Multiple calls of cuInit() will return immediately
+	 * without making any relevant change
+	 */
+	res = cuInit(0);
+	if (res != CUDA_SUCCESS) {
+		rte_gpu_cuda_log(ERR, "cuInit failed with %d.", (int)res);
+
+		return -1;
+	}
+
+	/* Get NVIDIA GPU Device descriptor */
+	res = cuDeviceGetByPCIBusId(&cu_dev_id, dev->device->name);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+				"cuDeviceGetByPCIBusId name %s failed with %d: %s.",
+				dev->device->name, (int)res, err_string);
+
+		return -1;
+	}
+
+	res = cuDevicePrimaryCtxRetain(&pctx, cu_dev_id);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+				"cuDevicePrimaryCtxRetain name %s failed with %d: %s.",
+				dev->device->name, (int)res, err_string);
+
+		return -1;
+	}
+
+	dev->mpshared->info.context = (uint64_t)(uintptr_t)pctx;
+
+	/*
+	 * GPU Device generic info
+	 */
+
+	/* Processor count */
+	res = cuDeviceGetAttribute(&(processor_count),
+					CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
+					cu_dev_id);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+				"cuDeviceGetAttribute failed with %s.",
+				err_string);
+
+		goto release_ctx;
+	}
+	dev->mpshared->info.processor_count = (uint32_t)processor_count;
+
+	/* Total memory */
+	res = cuDeviceTotalMem(&dev->mpshared->info.total_memory, cu_dev_id);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+				"cuDeviceTotalMem failed with %s.",
+				err_string);
+
+		goto release_ctx;
+	}
+
+	/*
+	 * GPU Device private info
+	 */
+	private = rte_zmalloc(NULL,
+				sizeof(struct cuda_info),
+				RTE_CACHE_LINE_SIZE);
+	if (private == NULL) {
+		rte_gpu_cuda_log(ERR,
+				"Failed to allocate memory for GPU process private.");
+
+		goto release_ctx;
+	}
+
+	private->cu_dev = cu_dev_id;
+	res = cuDeviceGetName(private->gpu_name,
+				RTE_DEV_NAME_MAX_LEN,
+				cu_dev_id);
+	if (res != CUDA_SUCCESS) {
+		cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+				"cuDeviceGetName failed with %s.",
+				err_string);
+
+		rte_free(private);
+		goto release_ctx;
+	}
+
+	/* Publish only fully initialized private data */
+	dev->mpshared->dev_private = private;
+
+	dev->ops.dev_info_get = cuda_dev_info_get;
+	dev->ops.dev_close = cuda_dev_close;
+	dev->ops.mem_alloc = cuda_mem_alloc;
+	dev->ops.mem_free = cuda_mem_free;
+	dev->ops.mem_register = cuda_mem_register;
+	dev->ops.mem_unregister = cuda_mem_unregister;
+	dev->ops.wmb = cuda_wmb;
+
+	rte_gpu_complete_new(dev);
+
+	rte_gpu_cuda_log_debug("dev id = %d name = %s",
+			(int)dev->mpshared->info.dev_id, private->gpu_name);
+
+	return 0;
+
+release_ctx:
+	/* Drop the reference taken by cuDevicePrimaryCtxRetain() */
+	cuDevicePrimaryCtxRelease(cu_dev_id);
+	dev->mpshared->info.context = 0;
+	return -1;
+}
+
+/*
+ * PCI remove callback: release the gpudev device created for pci_dev.
+ *
+ * Returns 0 on success, -EINVAL when pci_dev is NULL, -ENODEV when no
+ * gpudev device carries the PCI device name, or the error reported by
+ * rte_gpu_release() when the device cannot be released.
+ */
+static int
+cuda_gpu_remove(struct rte_pci_device *pci_dev)
+{
+	struct rte_gpu *dev;
+	int ret;
+	int gpu_id;
+
+	if (pci_dev == NULL)
+		return -EINVAL;
+
+	dev = rte_gpu_get_by_name(pci_dev->device.name);
+	if (dev == NULL) {
+		rte_gpu_cuda_log(ERR,
+				"Couldn't find HW dev \"%s\" to uninitialise it",
+				pci_dev->device.name);
+		return -ENODEV;
+	}
+	/* Read the id first: dev must not be used once it has been released */
+	gpu_id = dev->mpshared->info.dev_id;
+
+	/* release dev from library */
+	ret = rte_gpu_release(dev);
+	if (ret) {
+		rte_gpu_cuda_log(ERR, "Device %d failed to uninit: %d", gpu_id, ret);
+		return ret;
+	}
+
+	rte_gpu_cuda_log_debug("Destroyed dev = %d", gpu_id);
+
+	return 0;
+}
+
+/* PCI driver descriptor: ties the probe/remove callbacks to the supported ID table */
+static struct rte_pci_driver rte_cuda_driver = {
+	.id_table = pci_id_cuda_map,
+	.drv_flags = RTE_PCI_DRV_WC_ACTIVATE,
+	.probe = cuda_gpu_probe,
+	.remove = cuda_gpu_remove,
+};
+
+/* Register the driver, its PCI ID table and its kernel module dependencies as gpu_cuda */
+RTE_PMD_REGISTER_PCI(gpu_cuda, rte_cuda_driver);
+RTE_PMD_REGISTER_PCI_TABLE(gpu_cuda, pci_id_cuda_map);
+RTE_PMD_REGISTER_KMOD_DEP(gpu_cuda, "* nvidia & (nv_peer_mem | nvpeer_mem)");
diff --git a/drivers/gpu/cuda/meson.build b/drivers/gpu/cuda/meson.build
new file mode 100644
index 0000000000..f084bbcf83
--- /dev/null
+++ b/drivers/gpu/cuda/meson.build
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright (c) 2021 NVIDIA Corporation & Affiliates
+
+if not is_linux
+        build = false
+        reason = 'only supported on Linux'
+        # Stop here: the CUDA lookup below must not run on other platforms
+        subdir_done()
+endif
+
+# CUDA is optional: disable this driver instead of failing the whole configuration
+cuda_dep = dependency('cuda', version : '>=11.4', modules: ['cuda'], required: false)
+
+if not cuda_dep.found()
+         build = false
+         reason = 'missing dependency, "cuda >= 11.4"'
+         subdir_done()
+endif
+
+ext_deps += cuda_dep
+
+# cuda.c uses no rte_hash API, so the hash library is not listed
+deps += ['gpudev', 'pci', 'bus_pci']
+sources = files('cuda.c')
diff --git a/drivers/gpu/cuda/version.map b/drivers/gpu/cuda/version.map
new file mode 100644
index 0000000000..4a76d1d52d
--- /dev/null
+++ b/drivers/gpu/cuda/version.map
@@ -0,0 +1,3 @@
+DPDK_22 {
+	local: *;
+};
diff --git a/drivers/gpu/meson.build b/drivers/gpu/meson.build
index e51ad3381b..601bedcd61 100644
--- a/drivers/gpu/meson.build
+++ b/drivers/gpu/meson.build
@@ -1,4 +1,4 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright (c) 2021 NVIDIA Corporation & Affiliates
 
-drivers = []
+drivers = [ 'cuda' ]
-- 
2.17.1
^ permalink raw reply related	[flat|nested] 28+ messages in thread
* [PATCH v5 0/1] gpu/cuda: introduce CUDA driver
  2021-10-05 22:49 [dpdk-dev] [RFC PATCH] gpu/cuda: introduce CUDA driver eagostini
                   ` (3 preceding siblings ...)
  2021-11-09  5:50 ` [dpdk-dev] [PATCH v4 0/1] " eagostini
@ 2021-11-15 22:36 ` eagostini
  2021-11-15 22:36   ` [PATCH v5 1/1] " eagostini
  2021-11-16 20:47 ` [PATCH v6 0/1] " eagostini
  2021-11-16 22:50 ` [PATCH v7 0/1] " eagostini
  6 siblings, 1 reply; 28+ messages in thread
From: eagostini @ 2021-11-15 22:36 UTC (permalink / raw)
  To: dev; +Cc: Elena Agostini
From: Elena Agostini <eagostini@nvidia.com>
This is the CUDA implementation of the gpudev library.
Functionalities implemented through CUDA Driver API are:
- Device probe and remove
- Manage device memory allocations
- Register/unregister external CPU memory in the device memory area
Changelog:
- CUDA driver implementation of the GPU write memory barrier
- Fixed styling reported by checkpatch
- CUDA driver library not required at build time
- CUDA driver library is loaded at runtime through dlopen
- Documentation updated
Elena Agostini (1):
  gpu/cuda: introduce CUDA driver
 doc/guides/gpus/cuda.rst               |  127 +++
 doc/guides/gpus/index.rst              |    1 +
 doc/guides/rel_notes/release_21_11.rst |    2 +
 drivers/gpu/cuda/cuda.c                | 1132 ++++++++++++++++++++++++
 drivers/gpu/cuda/cuda_loader.h         |  301 +++++++
 drivers/gpu/cuda/meson.build           |   10 +
 drivers/gpu/cuda/version.map           |    3 +
 drivers/gpu/meson.build                |    2 +-
 8 files changed, 1577 insertions(+), 1 deletion(-)
 create mode 100644 doc/guides/gpus/cuda.rst
 create mode 100644 drivers/gpu/cuda/cuda.c
 create mode 100644 drivers/gpu/cuda/cuda_loader.h
 create mode 100644 drivers/gpu/cuda/meson.build
 create mode 100644 drivers/gpu/cuda/version.map
-- 
2.17.1
^ permalink raw reply	[flat|nested] 28+ messages in thread
* [PATCH v5 1/1] gpu/cuda: introduce CUDA driver
  2021-11-15 22:36 ` [PATCH v5 0/1] " eagostini
@ 2021-11-15 22:36   ` eagostini
  0 siblings, 0 replies; 28+ messages in thread
From: eagostini @ 2021-11-15 22:36 UTC (permalink / raw)
  To: dev; +Cc: Elena Agostini
From: Elena Agostini <eagostini@nvidia.com>
This is the CUDA implementation of the gpudev library.
Functionalities implemented through the CUDA Driver API are:
- Device probe and remove
- Manage device memory allocations
- Register/unregister external CPU memory in the device memory area
Signed-off-by: Elena Agostini <eagostini@nvidia.com>
---
 doc/guides/gpus/cuda.rst               |  127 +++
 doc/guides/gpus/index.rst              |    1 +
 doc/guides/rel_notes/release_21_11.rst |    2 +
 drivers/gpu/cuda/cuda.c                | 1132 ++++++++++++++++++++++++
 drivers/gpu/cuda/cuda_loader.h         |  301 +++++++
 drivers/gpu/cuda/meson.build           |   10 +
 drivers/gpu/cuda/version.map           |    3 +
 drivers/gpu/meson.build                |    2 +-
 8 files changed, 1577 insertions(+), 1 deletion(-)
 create mode 100644 doc/guides/gpus/cuda.rst
 create mode 100644 drivers/gpu/cuda/cuda.c
 create mode 100644 drivers/gpu/cuda/cuda_loader.h
 create mode 100644 drivers/gpu/cuda/meson.build
 create mode 100644 drivers/gpu/cuda/version.map
diff --git a/doc/guides/gpus/cuda.rst b/doc/guides/gpus/cuda.rst
new file mode 100644
index 0000000000..313fcfeffc
--- /dev/null
+++ b/doc/guides/gpus/cuda.rst
@@ -0,0 +1,127 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright (c) 2021 NVIDIA Corporation & Affiliates
+
+CUDA GPU driver
+===============
+
+The CUDA GPU driver library (**librte_gpu_cuda**) provides support for NVIDIA GPUs.
+Information and documentation about these devices can be found on the
+`NVIDIA website <http://www.nvidia.com>`__. Help is also provided by the
+`NVIDIA CUDA Toolkit developer zone <https://docs.nvidia.com/cuda>`__.
+
+CUDA Shared Library
+-------------------
+
+To avoid any system configuration issue, the CUDA API **libcuda.so** shared library
+is not linked at build time because of a Meson bug that looks
+for the `cudart` module even if the `meson.build` file only requires the default `cuda` module.
+
+**libcuda.so** is loaded at runtime in the ``cuda_gpu_probe`` function through ``dlopen``
+when the very first GPU is detected.
+If your CUDA installation resides in a custom directory you need to set
+the environment variable ``CUDA_PATH`` to specify where ``dlopen``
+can look for your **libcuda.so**.
+
+All CUDA API symbols are loaded at runtime as well.
+For this reason, to build the CUDA driver library
+you don't need to have the CUDA library installed on your system.
+
+Design
+------
+
+**librte_gpu_cuda** relies on CUDA Driver API (no need for CUDA Runtime API).
+
+The goal of this driver library is not to provide a wrapper for the whole CUDA Driver API.
+Instead, the scope is to implement the generic features of the gpudev API.
+For a CUDA application, integrating the gpudev library functions using the CUDA driver library
+is quite straightforward and doesn't create any compatibility problem.
+
+Initialization
+~~~~~~~~~~~~~~
+
+During initialization, CUDA driver library detects NVIDIA physical GPUs on the
+system or specified via EAL device options (e.g. ``-a b6:00.0``).
+The driver initializes the CUDA driver environment through ``cuInit(0)`` function.
+For this reason, it's required to set any CUDA environment configuration before
+calling ``rte_eal_init`` function in the DPDK application.
+
+If the CUDA driver environment has been already initialized, the ``cuInit(0)``
+in CUDA driver library has no effect.
+
+CUDA Driver sub-contexts
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+After initialization, a CUDA application can create multiple sub-contexts on GPU
+physical devices. Through the gpudev library, it is possible to register these sub-contexts
+in the CUDA driver library as child devices having a GPU physical device as their parent.
+
+CUDA driver library also supports `MPS <https://docs.nvidia.com/deploy/pdf/CUDA_Multi_Process_Service_Overview.pdf>`__.
+
+GPU memory management
+~~~~~~~~~~~~~~~~~~~~~
+
+The CUDA driver library maintains a table of the GPU memory addresses allocated
+and the CPU memory addresses registered, each associated with the input CUDA context.
+Whenever the application tries to deallocate or deregister a memory address
+that is not in the table, the CUDA driver library returns an error.
+
+Features
+--------
+
+- Register new child devices aka new CUDA Driver contexts
+- Allocate memory on the GPU
+- Register CPU memory to make it visible from GPU
+
+Minimal requirements
+--------------------
+
+Minimal requirements to enable the CUDA driver library are:
+
+- NVIDIA GPU Ampere or Volta
+- CUDA 11.4 Driver API or newer
+
+`GPUDirect RDMA Technology <https://docs.nvidia.com/cuda/gpudirect-rdma/index.html>`__
+allows compatible network cards (e.g. Mellanox) to directly send and receive packets
+using GPU memory instead of additional memory copies through the CPU system memory.
+To enable this technology, system requirements are:
+
+- `nvidia-peermem <https://docs.nvidia.com/cuda/gpudirect-rdma/index.html#nvidia-peermem>`__ module running on the system
+- Mellanox Network card ConnectX-5 or newer (BlueField models included)
+- DPDK mlx5 PMD enabled
+- To reach the best performance, an additional PCIe switch between GPU and NIC is recommended
+
+Limitations
+-----------
+
+Supported only on Linux.
+
+Supported GPUs
+--------------
+
+The following NVIDIA GPU devices are supported by this CUDA driver library:
+
+- NVIDIA A100 80GB PCIe
+- NVIDIA A100 40GB PCIe
+- NVIDIA A30 24GB
+- NVIDIA A10 24GB
+- NVIDIA V100 32GB PCIe
+- NVIDIA V100 16GB PCIe
+
+External references
+-------------------
+
+A good example of how to use the GPU CUDA driver library through the gpudev library
+is the l2fwd-nv application that can be found `here <https://github.com/NVIDIA/l2fwd-nv>`__.
+
+The application is based on vanilla DPDK example l2fwd and it's enhanced with GPU memory
+managed through gpudev library and CUDA to launch the swap of packets' MAC addresses workload
+on the GPU.
+
+l2fwd-nv is not intended to be used for performance measurements (testpmd is a better candidate for this).
+The goal is to show different use-cases about how a CUDA application can use DPDK to:
+
+- allocate memory on GPU device using gpudev library
+- use that memory to create an external GPU memory mempool
+- receive packets directly in GPU memory
+- coordinate the workload on the GPU with the network and CPU activity to receive packets
+- send modified packets directly from the GPU memory
diff --git a/doc/guides/gpus/index.rst b/doc/guides/gpus/index.rst
index 1878423239..4b7a420556 100644
--- a/doc/guides/gpus/index.rst
+++ b/doc/guides/gpus/index.rst
@@ -9,3 +9,4 @@ General-Purpose Graphics Processing Unit Drivers
    :numbered:
 
    overview
+   cuda
diff --git a/doc/guides/rel_notes/release_21_11.rst b/doc/guides/rel_notes/release_21_11.rst
index 7d60b554d8..c628deaeea 100644
--- a/doc/guides/rel_notes/release_21_11.rst
+++ b/doc/guides/rel_notes/release_21_11.rst
@@ -111,6 +111,8 @@ New Features
   * Memory management
   * Communication flag & list
 
+* **Added NVIDIA GPU driver implemented with CUDA library.**
+
 * **Added new RSS offload types for IPv4/L4 checksum in RSS flow.**
 
   Added macros ETH_RSS_IPV4_CHKSUM and ETH_RSS_L4_CHKSUM, now IPv4 and
diff --git a/drivers/gpu/cuda/cuda.c b/drivers/gpu/cuda/cuda.c
new file mode 100644
index 0000000000..4f60c1932d
--- /dev/null
+++ b/drivers/gpu/cuda/cuda.c
@@ -0,0 +1,1132 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2021 NVIDIA Corporation & Affiliates
+ */
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_malloc.h>
+#include <rte_errno.h>
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_byteorder.h>
+#include <rte_dev.h>
+
+#include <gpudev_driver.h>
+#include "cuda_loader.h"
+#include <dlfcn.h>
+
+#define CUDA_DRIVER_MIN_VERSION 11040
+#define CUDA_API_MIN_VERSION 3020
+
+/*
+ * CUDA Driver entry points resolved directly with dlsym().
+ * They are file-local: only this driver uses them, and generic names such
+ * as sym_cuInit should not leak into the global symbol namespace.
+ * Static storage is zero-initialised, so every pointer starts as NULL.
+ */
+static enum cuError (*sym_cuInit)(unsigned int flags);
+static enum cuError (*sym_cuDriverGetVersion)(int *driverVersion);
+static enum cuError (*sym_cuGetProcAddress)(const char *symbol,
+		void **pfn, int cudaVersion, uint64_t flags);
+
+/* CUDA Driver functions loaded with cuGetProcAddress for versioning */
+static PFN_cuGetErrorString pfn_cuGetErrorString;
+static PFN_cuGetErrorName pfn_cuGetErrorName;
+static PFN_cuPointerSetAttribute pfn_cuPointerSetAttribute;
+static PFN_cuDeviceGetAttribute pfn_cuDeviceGetAttribute;
+static PFN_cuDeviceGetByPCIBusId pfn_cuDeviceGetByPCIBusId;
+static PFN_cuDevicePrimaryCtxRetain pfn_cuDevicePrimaryCtxRetain;
+static PFN_cuDevicePrimaryCtxRelease pfn_cuDevicePrimaryCtxRelease;
+static PFN_cuDeviceTotalMem pfn_cuDeviceTotalMem;
+static PFN_cuDeviceGetName pfn_cuDeviceGetName;
+static PFN_cuCtxGetApiVersion pfn_cuCtxGetApiVersion;
+static PFN_cuCtxSetCurrent pfn_cuCtxSetCurrent;
+static PFN_cuCtxGetCurrent pfn_cuCtxGetCurrent;
+static PFN_cuCtxGetDevice pfn_cuCtxGetDevice;
+static PFN_cuCtxGetExecAffinity pfn_cuCtxGetExecAffinity;
+static PFN_cuMemAlloc pfn_cuMemAlloc;
+static PFN_cuMemFree pfn_cuMemFree;
+static PFN_cuMemHostRegister pfn_cuMemHostRegister;
+static PFN_cuMemHostUnregister pfn_cuMemHostUnregister;
+static PFN_cuMemHostGetDevicePointer pfn_cuMemHostGetDevicePointer;
+static PFN_cuFlushGPUDirectRDMAWrites pfn_cuFlushGPUDirectRDMAWrites;
+
+static void *cudalib;
+static unsigned int cuda_api_version;
+static int cuda_driver_version;
+
+/* NVIDIA GPU vendor */
+#define NVIDIA_GPU_VENDOR_ID (0x10de)
+
+/* NVIDIA GPU device IDs */
+#define NVIDIA_GPU_A100_40GB_DEVICE_ID (0x20f1)
+#define NVIDIA_GPU_A100_80GB_DEVICE_ID (0x20b5)
+
+#define NVIDIA_GPU_A30_24GB_DEVICE_ID (0x20b7)
+#define NVIDIA_GPU_A10_24GB_DEVICE_ID (0x2236)
+
+#define NVIDIA_GPU_V100_32GB_DEVICE_ID (0x1db6)
+#define NVIDIA_GPU_V100_16GB_DEVICE_ID (0x1db4)
+
+#define CUDA_MAX_ALLOCATION_NUM 512
+
+#define GPU_PAGE_SHIFT 16
+#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
+
+static RTE_LOG_REGISTER_DEFAULT(cuda_logtype, NOTICE);
+
+/**
+ * Helper macro for logging.
+ * A "\n" is appended to fmt here, so a message passed to this macro
+ * does not need to end with its own newline.
+ */
+#define rte_gpu_cuda_log(level, fmt, ...) \
+	rte_log(RTE_LOG_ ## level, cuda_logtype, fmt "\n", ##__VA_ARGS__)
+
+/*
+ * DEBUG-level variant: prefixes the message with the expansion of
+ * RTE_STR(__LINE__) and the calling function name (__func__).
+ */
+#define rte_gpu_cuda_log_debug(fmt, ...) \
+	rte_gpu_cuda_log(DEBUG, RTE_STR(__LINE__) ":%s() " fmt, __func__, \
+		##__VA_ARGS__)
+
+/*
+ * PCI IDs of the NVIDIA GPUs handled by this driver.
+ * Every device ID defined above is listed, and the table is closed by an
+ * all-zero entry that marks its end for the PCI bus matching code.
+ */
+static struct rte_pci_id pci_id_cuda_map[] = {
+	{
+		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
+				NVIDIA_GPU_A100_40GB_DEVICE_ID)
+	},
+	{
+		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
+				NVIDIA_GPU_A100_80GB_DEVICE_ID)
+	},
+	{
+		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
+				NVIDIA_GPU_A30_24GB_DEVICE_ID)
+	},
+	{
+		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
+				NVIDIA_GPU_A10_24GB_DEVICE_ID)
+	},
+	{
+		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
+				NVIDIA_GPU_V100_32GB_DEVICE_ID)
+	},
+	{
+		RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID,
+				NVIDIA_GPU_V100_16GB_DEVICE_ID)
+	},
+	{
+		/* Sentinel: terminates the table */
+		.vendor_id = 0
+	},
+};
+
+/* Device private info, stored in dev->mpshared->dev_private */
+struct cuda_info {
+	/* Device name, filled by cuDeviceGetName() */
+	char gpu_name[RTE_DEV_NAME_MAX_LEN];
+	/* CUDA device handle */
+	cuDev cu_dev;
+	/* NOTE(review): presumably non-zero when GPUDirect RDMA is usable - confirm where it is set */
+	int gdr_supported;
+	/* Compared against CU_GDR_WRITES_ORDERING_NONE in cuda_wmb() */
+	int gdr_write_ordering;
+	/* Compared against CU_FLUSH_GDR_WRITES_OPTION_HOST in cuda_wmb() */
+	int gdr_flush_type;
+};
+
+/* Type of memory allocated by CUDA driver; recorded in each memory list entry */
+enum mem_type {
+	GPU_MEM = 0, /* Set by cuda_mem_alloc(); the only type cuda_mem_free() accepts */
+	CPU_REGISTERED, /* Set by cuda_mem_register(); the only type cuda_mem_unregister() accepts */
+	GPU_REGISTERED /* Not used yet */
+};
+
+/* key associated to a memory address */
+typedef uintptr_t cuda_ptr_key;
+
+/* Single entry of the memory list (doubly linked through prev/next) */
+struct mem_entry {
+	cuDevPtr ptr_d; /* Device-side address */
+	void *ptr_h; /* Host address; NULL for GPU_MEM entries */
+	size_t size; /* Size passed by the caller */
+	struct rte_gpu *dev; /* Device the entry was created for */
+	CUcontext ctx; /* Copied from dev->mpshared->info.context */
+	cuda_ptr_key pkey; /* Lookup key: ptr_d for GPU_MEM, ptr_h for CPU_REGISTERED */
+	enum mem_type mtype;
+	struct mem_entry *prev;
+	struct mem_entry *next;
+};
+
+/* First and last entry of the memory list; a NULL head means the list is empty */
+static struct mem_entry *mem_alloc_list_head;
+static struct mem_entry *mem_alloc_list_tail;
+/* Number of entries currently linked in the list */
+static uint32_t mem_alloc_list_last_elem;
+
+/*
+ * Load the CUDA Driver shared library (libcuda.so) with dlopen().
+ *
+ * When the CUDA_PATH environment variable is set it is used as the
+ * directory containing libcuda.so; otherwise the default dlopen()
+ * search path is used.
+ *
+ * Returns 0 on success and stores the handle in cudalib, -1 otherwise.
+ */
+static int
+cuda_loader(void)
+{
+	char cuda_path[1024];
+	const char *cuda_dir = getenv("CUDA_PATH");
+	int len;
+
+	if (cuda_dir == NULL)
+		len = snprintf(cuda_path, sizeof(cuda_path), "%s", "libcuda.so");
+	else
+		/* A path separator is needed between the directory and the file name */
+		len = snprintf(cuda_path, sizeof(cuda_path), "%s/%s", cuda_dir, "libcuda.so");
+
+	/* Refuse to dlopen() a truncated path */
+	if (len < 0 || (size_t)len >= sizeof(cuda_path)) {
+		rte_gpu_cuda_log(ERR, "CUDA library path is too long (CUDA_PATH=%s).",
+				cuda_dir != NULL ? cuda_dir : "(unset)");
+		return -1;
+	}
+
+	cudalib = dlopen(cuda_path, RTLD_LAZY);
+	if (cudalib == NULL) {
+		rte_gpu_cuda_log(ERR, "Failed to find CUDA library in %s (CUDA_PATH=%s).",
+				cuda_path, cuda_dir != NULL ? cuda_dir : "(unset)");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Resolve with dlsym() the entry points that are needed before
+ * cuGetProcAddress can be used: cuInit, cuDriverGetVersion and
+ * cuGetProcAddress itself.
+ *
+ * Returns 0 on success, -1 if the library is not loaded or a symbol
+ * is missing.
+ */
+static int
+cuda_sym_func_loader(void)
+{
+	if (cudalib == NULL)
+		return -1;
+
+	/* The log macro appends the newline, messages must not carry one */
+	sym_cuInit = dlsym(cudalib, "cuInit");
+	if (sym_cuInit == NULL) {
+		rte_gpu_cuda_log(ERR, "Failed to load CUDA missing symbol cuInit");
+		return -1;
+	}
+
+	sym_cuDriverGetVersion = dlsym(cudalib, "cuDriverGetVersion");
+	if (sym_cuDriverGetVersion == NULL) {
+		rte_gpu_cuda_log(ERR, "Failed to load CUDA missing symbol cuDriverGetVersion");
+		return -1;
+	}
+
+	sym_cuGetProcAddress = dlsym(cudalib, "cuGetProcAddress");
+	if (sym_cuGetProcAddress == NULL) {
+		rte_gpu_cuda_log(ERR, "Failed to load CUDA missing symbol cuGetProcAddress");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Resolve every versioned CUDA Driver function used by this driver
+ * through cuGetProcAddress, requesting the detected driver version.
+ *
+ * Symbols are resolved in table order and the first failure aborts
+ * the load.
+ *
+ * Returns 0 when all symbols are resolved, -1 otherwise.
+ */
+static int
+cuda_pfn_func_loader(void)
+{
+	/* CUDA symbol name paired with the function pointer receiving its address */
+	static const struct {
+		const char *name;
+		void **pfn;
+	} cuda_syms[] = {
+		{ "cuGetErrorString", (void **)&pfn_cuGetErrorString },
+		{ "cuGetErrorName", (void **)&pfn_cuGetErrorName },
+		{ "cuPointerSetAttribute", (void **)&pfn_cuPointerSetAttribute },
+		{ "cuDeviceGetAttribute", (void **)&pfn_cuDeviceGetAttribute },
+		{ "cuDeviceGetByPCIBusId", (void **)&pfn_cuDeviceGetByPCIBusId },
+		{ "cuDeviceGetName", (void **)&pfn_cuDeviceGetName },
+		{ "cuDevicePrimaryCtxRetain", (void **)&pfn_cuDevicePrimaryCtxRetain },
+		{ "cuDevicePrimaryCtxRelease", (void **)&pfn_cuDevicePrimaryCtxRelease },
+		{ "cuDeviceTotalMem", (void **)&pfn_cuDeviceTotalMem },
+		{ "cuCtxGetApiVersion", (void **)&pfn_cuCtxGetApiVersion },
+		{ "cuCtxGetDevice", (void **)&pfn_cuCtxGetDevice },
+		{ "cuCtxSetCurrent", (void **)&pfn_cuCtxSetCurrent },
+		{ "cuCtxGetCurrent", (void **)&pfn_cuCtxGetCurrent },
+		{ "cuCtxGetExecAffinity", (void **)&pfn_cuCtxGetExecAffinity },
+		{ "cuMemAlloc", (void **)&pfn_cuMemAlloc },
+		{ "cuMemFree", (void **)&pfn_cuMemFree },
+		{ "cuMemHostRegister", (void **)&pfn_cuMemHostRegister },
+		{ "cuMemHostUnregister", (void **)&pfn_cuMemHostUnregister },
+		{ "cuMemHostGetDevicePointer", (void **)&pfn_cuMemHostGetDevicePointer },
+		{ "cuFlushGPUDirectRDMAWrites", (void **)&pfn_cuFlushGPUDirectRDMAWrites },
+	};
+	enum cuError res;
+	unsigned int i;
+
+	for (i = 0; i < RTE_DIM(cuda_syms); i++) {
+		res = sym_cuGetProcAddress(cuda_syms[i].name, cuda_syms[i].pfn,
+				cuda_driver_version, 0);
+		if (res != 0) {
+			/* The log macro appends the newline */
+			rte_gpu_cuda_log(ERR, "Retrieve pfn_%s failed with %d",
+					cuda_syms[i].name, res);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/* Derive the memory list lookup key of an address: the address value itself */
+static cuda_ptr_key
+get_hash_from_ptr(void *ptr)
+{
+	const uintptr_t addr = (uintptr_t)ptr;
+
+	return addr;
+}
+
+/* Report how many entries the memory list currently holds */
+static uint32_t
+mem_list_count_item(void)
+{
+	const uint32_t nb_items = mem_alloc_list_last_elem;
+
+	return nb_items;
+}
+
+/*
+ * Append a new zero-filled entry at the tail of the memory list,
+ * creating the list when it is still empty.
+ *
+ * Returns the new entry (also the new list tail), or NULL when the
+ * allocation fails; the list is left untouched in that case.
+ */
+static struct mem_entry *
+mem_list_add_item(void)
+{
+	struct mem_entry *item;
+
+	item = rte_zmalloc(NULL, sizeof(*item), RTE_CACHE_LINE_SIZE);
+	if (item == NULL) {
+		/* The log macro appends the newline */
+		rte_gpu_cuda_log(ERR, "Failed to allocate memory for memory list.");
+		return NULL;
+	}
+
+	item->next = NULL;
+	if (mem_alloc_list_head == NULL) {
+		/* First entry: it is both head and tail */
+		item->prev = NULL;
+		mem_alloc_list_head = item;
+	} else {
+		item->prev = mem_alloc_list_tail;
+		mem_alloc_list_tail->next = item;
+	}
+	mem_alloc_list_tail = item;
+
+	mem_alloc_list_last_elem++;
+
+	return item;
+}
+
+/*
+ * Look up the memory list entry whose key equals pk.
+ *
+ * Returns the matching entry, or NULL when the list does not exist,
+ * is empty, or holds no entry with that key. The first two cases are
+ * logged as errors.
+ */
+static struct mem_entry *
+mem_list_find_item(cuda_ptr_key pk)
+{
+	struct mem_entry *entry;
+
+	/* The log macro appends the newline, messages must not carry one */
+	if (mem_alloc_list_head == NULL) {
+		rte_gpu_cuda_log(ERR, "Memory list doesn't exist");
+		return NULL;
+	}
+
+	if (mem_list_count_item() == 0) {
+		rte_gpu_cuda_log(ERR, "No items in memory list");
+		return NULL;
+	}
+
+	for (entry = mem_alloc_list_head; entry != NULL; entry = entry->next) {
+		if (entry->pkey == pk)
+			return entry;
+	}
+
+	return NULL;
+}
+
+/*
+ * Unlink and free the memory list entry whose key equals pk.
+ *
+ * Both neighbours and both list ends are updated, so removing the
+ * head, the tail, or the only entry leaves the list consistent.
+ *
+ * Returns 0 on success, -EINVAL when no entry has that key.
+ */
+static int
+mem_list_del_item(cuda_ptr_key pk)
+{
+	struct mem_entry *entry;
+
+	entry = mem_list_find_item(pk);
+	if (entry == NULL)
+		return -EINVAL;
+
+	/* Detach from the previous entry, or move the head forward */
+	if (entry->prev == NULL)
+		mem_alloc_list_head = entry->next;
+	else
+		entry->prev->next = entry->next;
+
+	/* Detach from the next entry, or move the tail backward */
+	if (entry->next == NULL)
+		mem_alloc_list_tail = entry->prev;
+	else
+		entry->next->prev = entry->prev;
+
+	rte_free(entry);
+
+	mem_alloc_list_last_elem--;
+
+	return 0;
+}
+
+/*
+ * Copy the device info into *info.
+ *
+ * For a child device (a CUDA context registered on top of a physical
+ * GPU) whose private data is not set yet, the private data and the
+ * context-specific fields are initialised first, with the child context
+ * temporarily made current.
+ *
+ * Returns 0 on success, -EINVAL on NULL arguments, -ENODEV when the
+ * parent info cannot be read, -1 on CUDA or allocation failures.
+ * On failure the private data is not published and the caller's CUDA
+ * context is restored on a best-effort basis.
+ */
+static int
+cuda_dev_info_get(struct rte_gpu *dev, struct rte_gpu_info *info)
+{
+	int ret = -1;
+	enum cuError res;
+	struct rte_gpu_info parent_info;
+	struct cuExecAffinityParams affinityPrm;
+	const char *err_string;
+	struct cuda_info *private = NULL;
+	CUcontext current_ctx;
+	CUcontext input_ctx;
+
+	if (dev == NULL || info == NULL)
+		return -EINVAL;
+
+	/* Child initialization time probably called by rte_gpu_add_child() */
+	if (dev->mpshared->info.parent != RTE_GPU_ID_NONE && dev->mpshared->dev_private == NULL) {
+		/* Store current ctx */
+		res = pfn_cuCtxGetCurrent(&current_ctx);
+		if (res != 0) {
+			pfn_cuGetErrorString(res, &(err_string));
+			rte_gpu_cuda_log(ERR, "cuCtxGetCurrent failed with %s.", err_string);
+
+			return -1;
+		}
+
+		/* Set child ctx as current ctx */
+		input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
+		res = pfn_cuCtxSetCurrent(input_ctx);
+		if (res != 0) {
+			pfn_cuGetErrorString(res, &(err_string));
+			rte_gpu_cuda_log(ERR,
+					"cuCtxSetCurrent input failed with %s.",
+					err_string);
+
+			return -1;
+		}
+
+		/*
+		 * Ctx capacity info
+		 */
+
+		/* MPS compatible */
+		res = pfn_cuCtxGetExecAffinity(&affinityPrm, CU_EXEC_AFFINITY_TYPE_SM_COUNT);
+		if (res != 0) {
+			pfn_cuGetErrorString(res, &(err_string));
+			rte_gpu_cuda_log(ERR, "cuCtxGetExecAffinity failed with %s.", err_string);
+
+			/* affinityPrm holds no valid data, do not read it */
+			goto restore_ctx;
+		}
+		dev->mpshared->info.processor_count = (uint32_t)affinityPrm.param.smCount.val;
+
+		if (rte_gpu_info_get(dev->mpshared->info.parent, &parent_info)) {
+			ret = -ENODEV;
+			goto restore_ctx;
+		}
+		dev->mpshared->info.total_memory = parent_info.total_memory;
+
+		/*
+		 * GPU Device private info
+		 */
+		private = rte_zmalloc(NULL, sizeof(*private), RTE_CACHE_LINE_SIZE);
+		if (private == NULL) {
+			rte_gpu_cuda_log(ERR, "Failed to allocate memory for GPU process private.");
+
+			goto restore_ctx;
+		}
+
+		res = pfn_cuCtxGetDevice(&(private->cu_dev));
+		if (res != 0) {
+			pfn_cuGetErrorString(res, &(err_string));
+			rte_gpu_cuda_log(ERR, "cuCtxGetDevice failed with %s.", err_string);
+
+			goto free_private;
+		}
+
+		res = pfn_cuDeviceGetName(private->gpu_name, RTE_DEV_NAME_MAX_LEN, private->cu_dev);
+		if (res != 0) {
+			pfn_cuGetErrorString(res, &(err_string));
+			rte_gpu_cuda_log(ERR, "cuDeviceGetName failed with %s.", err_string);
+
+			goto free_private;
+		}
+
+		/* Publish the private data only once it is fully initialised */
+		dev->mpshared->dev_private = private;
+
+		/* Restore original ctx as current ctx */
+		res = pfn_cuCtxSetCurrent(current_ctx);
+		if (res != 0) {
+			pfn_cuGetErrorString(res, &(err_string));
+			rte_gpu_cuda_log(ERR,
+					"cuCtxSetCurrent current failed with %s.",
+					err_string);
+
+			return -1;
+		}
+	}
+
+	*info = dev->mpshared->info;
+
+	return 0;
+
+free_private:
+	rte_free(private);
+restore_ctx:
+	/* Best effort: the original failure is what gets reported */
+	(void)pfn_cuCtxSetCurrent(current_ctx);
+
+	return ret;
+}
+
+/*
+ * GPU Memory
+ */
+
+/*
+ * Allocate size bytes of device memory in the CUDA context of dev,
+ * flag it for synchronous memory operations (required by GPUDirect
+ * RDMA) and track it in the driver memory list.
+ *
+ * The device context is made current for the duration of the call and
+ * the caller's context is restored afterwards.
+ *
+ * On success returns 0 and stores the device address in *ptr.
+ * Returns -EINVAL on invalid arguments, -ENOMEM when the tracking
+ * entry cannot be allocated, -1 on CUDA failures. Nothing stays
+ * allocated or tracked when an error is returned.
+ */
+static int
+cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
+{
+	enum cuError res;
+	const char *err_string;
+	CUcontext current_ctx;
+	CUcontext input_ctx;
+	struct mem_entry *entry;
+	cuDevPtr ptr_d;
+	unsigned int flag = 1;
+	int ret = -1;
+
+	if (dev == NULL || size == 0 || ptr == NULL)
+		return -EINVAL;
+
+	/* Store current ctx */
+	res = pfn_cuCtxGetCurrent(&current_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuCtxGetCurrent failed with %s.", err_string);
+
+		return -1;
+	}
+
+	/* Set child ctx as current ctx */
+	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
+	res = pfn_cuCtxSetCurrent(input_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuCtxSetCurrent input failed with %s.", err_string);
+
+		return -1;
+	}
+
+	/* Allocate memory */
+	res = pfn_cuMemAlloc(&ptr_d, size);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuMemAlloc failed with %s.", err_string);
+
+		goto restore_ctx;
+	}
+
+	/* GPUDirect RDMA attribute required */
+	res = pfn_cuPointerSetAttribute(&flag, CU_PTR_ATTR_SYNC_MEMOPS, ptr_d);
+	if (res != 0) {
+		rte_gpu_cuda_log(ERR,
+				"Could not set SYNC MEMOP attribute for GPU memory at %p, err %d",
+				(void *)(uintptr_t)ptr_d, res);
+
+		goto free_mem;
+	}
+
+	/*
+	 * Track the allocation only once it is usable, so a failure above
+	 * never leaves a half-initialised entry in the list.
+	 */
+	entry = mem_list_add_item();
+	if (entry == NULL) {
+		ret = -ENOMEM;
+		goto free_mem;
+	}
+
+	entry->ptr_d = ptr_d;
+	entry->ptr_h = NULL;
+	entry->size = size;
+	entry->dev = dev;
+	entry->ctx = input_ctx;
+	entry->pkey = get_hash_from_ptr((void *) ptr_d);
+	entry->mtype = GPU_MEM;
+
+	/* Restore original ctx as current ctx */
+	res = pfn_cuCtxSetCurrent(current_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuCtxSetCurrent current failed with %s.", err_string);
+
+		/* The caller never receives the address: release everything */
+		mem_list_del_item(entry->pkey);
+		(void)pfn_cuMemFree(ptr_d);
+
+		return -1;
+	}
+
+	*ptr = (void *) ptr_d;
+
+	return 0;
+
+free_mem:
+	(void)pfn_cuMemFree(ptr_d);
+restore_ctx:
+	/* Best effort: the original failure is what gets reported */
+	(void)pfn_cuCtxSetCurrent(current_ctx);
+
+	return ret;
+}
+
+/*
+ * Register the host memory area [ptr, ptr + size) with CUDA in the
+ * context of dev so that it can be accessed from the GPU, and track it
+ * in the driver memory list.
+ *
+ * The registration is accepted only when the device-side address is the
+ * same as the host address. The device context is made current for the
+ * duration of the call and the caller's context is restored afterwards.
+ *
+ * Returns 0 on success, -EINVAL on invalid arguments, -ENOMEM when the
+ * tracking entry cannot be allocated, -1 on CUDA failures. The memory is
+ * neither registered nor tracked when an error is returned.
+ */
+static int
+cuda_mem_register(struct rte_gpu *dev, size_t size, void *ptr)
+{
+	enum cuError res;
+	const char *err_string;
+	CUcontext current_ctx;
+	CUcontext input_ctx;
+	struct mem_entry *entry;
+	cuDevPtr ptr_d;
+	unsigned int flag = 1;
+	int use_ptr_h = 0;
+	int ret = -1;
+
+	if (dev == NULL || size == 0 || ptr == NULL)
+		return -EINVAL;
+
+	/* Store current ctx */
+	res = pfn_cuCtxGetCurrent(&current_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuCtxGetCurrent failed with %s.", err_string);
+
+		return -1;
+	}
+
+	/* Set child ctx as current ctx */
+	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
+	res = pfn_cuCtxSetCurrent(input_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuCtxSetCurrent input failed with %s.", err_string);
+
+		return -1;
+	}
+
+	res = pfn_cuMemHostRegister(ptr, size,
+				CU_MHOST_REGISTER_PORTABLE | CU_MHOST_REGISTER_DEVICEMAP);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+				"cuMemHostRegister failed with %s ptr %p size %zu.",
+				err_string, ptr, size);
+
+		goto restore_ctx;
+	}
+
+	res = pfn_cuDeviceGetAttribute(&(use_ptr_h),
+					CU_DEV_ATTR_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM,
+					((struct cuda_info *)(dev->mpshared->dev_private))->cu_dev);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuDeviceGetAttribute failed with %s.", err_string);
+
+		goto unregister;
+	}
+
+	if (use_ptr_h == 0) {
+		res = pfn_cuMemHostGetDevicePointer(&ptr_d, ptr, 0);
+		if (res != 0) {
+			pfn_cuGetErrorString(res, &(err_string));
+			rte_gpu_cuda_log(ERR,
+					"cuMemHostGetDevicePointer failed with %s.",
+					err_string);
+
+			goto unregister;
+		}
+
+		/* Only identical host and device addresses are supported */
+		if ((uintptr_t) ptr_d != (uintptr_t) ptr) {
+			rte_gpu_cuda_log(ERR,
+					"Host input pointer is different wrt GPU registered pointer");
+
+			goto unregister;
+		}
+	} else {
+		ptr_d = (cuDevPtr) ptr;
+	}
+
+	/* GPUDirect RDMA attribute required */
+	res = pfn_cuPointerSetAttribute(&flag, CU_PTR_ATTR_SYNC_MEMOPS, ptr_d);
+	if (res != 0) {
+		rte_gpu_cuda_log(ERR,
+				"Could not set SYNC MEMOP attribute for GPU memory at %p, err %d",
+				(void *)(uintptr_t)ptr_d, res);
+
+		goto unregister;
+	}
+
+	/*
+	 * Track the registration only once it is usable, so a failure above
+	 * never leaves a half-initialised entry in the list.
+	 */
+	entry = mem_list_add_item();
+	if (entry == NULL) {
+		ret = -ENOMEM;
+		goto unregister;
+	}
+
+	entry->ptr_d = ptr_d;
+	entry->ptr_h = ptr;
+	entry->size = size;
+	entry->dev = dev;
+	entry->ctx = input_ctx;
+	entry->pkey = get_hash_from_ptr(ptr);
+	entry->mtype = CPU_REGISTERED;
+
+	/* Restore original ctx as current ctx */
+	res = pfn_cuCtxSetCurrent(current_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+				"cuCtxSetCurrent current failed with %s.",
+				err_string);
+
+		/* The call is reported as failed: undo the registration */
+		mem_list_del_item(entry->pkey);
+		(void)pfn_cuMemHostUnregister(ptr);
+
+		return -1;
+	}
+
+	return 0;
+
+unregister:
+	(void)pfn_cuMemHostUnregister(ptr);
+restore_ctx:
+	/* Best effort: the original failure is what gets reported */
+	(void)pfn_cuCtxSetCurrent(current_ctx);
+
+	return ret;
+}
+
+/*
+ * Free device memory previously returned by cuda_mem_alloc() and drop
+ * it from the driver memory list.
+ *
+ * Returns 0 on success, -EINVAL on NULL arguments, -1 when the address
+ * is not tracked, is not of type GPU_MEM, or cuMemFree fails.
+ */
+static int
+cuda_mem_free(struct rte_gpu *dev, void *ptr)
+{
+	enum cuError res;
+	struct mem_entry *mem_item;
+	const char *err_string;
+	cuda_ptr_key hk;
+
+	if (dev == NULL || ptr == NULL)
+		return -EINVAL;
+
+	hk = get_hash_from_ptr(ptr);
+
+	mem_item = mem_list_find_item(hk);
+	if (mem_item == NULL) {
+		rte_gpu_cuda_log(ERR, "Memory address %p not found in driver memory", ptr);
+		return -1;
+	}
+
+	if (mem_item->mtype != GPU_MEM) {
+		rte_gpu_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
+		return -1;
+	}
+
+	res = pfn_cuMemFree(mem_item->ptr_d);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuMemFree failed with %s.", err_string);
+
+		return -1;
+	}
+
+	return mem_list_del_item(hk);
+}
+
+/*
+ * Unregister host memory previously registered with
+ * cuda_mem_register() and drop it from the driver memory list.
+ *
+ * Returns 0 on success, -EINVAL on NULL arguments, -1 when the address
+ * is not tracked, is not of type CPU_REGISTERED, or
+ * cuMemHostUnregister fails.
+ */
+static int
+cuda_mem_unregister(struct rte_gpu *dev, void *ptr)
+{
+	enum cuError res;
+	struct mem_entry *mem_item;
+	const char *err_string;
+	cuda_ptr_key hk;
+
+	if (dev == NULL || ptr == NULL)
+		return -EINVAL;
+
+	hk = get_hash_from_ptr(ptr);
+
+	mem_item = mem_list_find_item(hk);
+	if (mem_item == NULL) {
+		rte_gpu_cuda_log(ERR, "Memory address %p not found in driver memory", ptr);
+		return -1;
+	}
+
+	if (mem_item->mtype != CPU_REGISTERED) {
+		rte_gpu_cuda_log(ERR, "Memory type %d not supported", mem_item->mtype);
+		return -1;
+	}
+
+	res = pfn_cuMemHostUnregister(ptr);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+				"cuMemHostUnregister failed with %s.",
+				err_string);
+
+		return -1;
+	}
+
+	return mem_list_del_item(hk);
+}
+
+/*
+ * Release the driver private data attached to dev.
+ *
+ * Returns 0 on success, -EINVAL when dev is NULL.
+ */
+static int
+cuda_dev_close(struct rte_gpu *dev)
+{
+	if (dev == NULL)
+		return -EINVAL;
+
+	rte_free(dev->mpshared->dev_private);
+	/*
+	 * Clear the pointer: cuda_dev_info_get() tests it against NULL and
+	 * a stale value would be dereferenced or freed a second time.
+	 */
+	dev->mpshared->dev_private = NULL;
+
+	return 0;
+}
+
+/*
+ * GPU write memory barrier: flush outstanding GPUDirect RDMA writes
+ * with cuFlushGPUDirectRDMAWrites in the CUDA context of dev.
+ *
+ * Nothing is done when the recorded write ordering differs from
+ * CU_GDR_WRITES_ORDERING_NONE. The device context is made current for
+ * the flush and the caller's context is restored afterwards, also when
+ * the flush fails.
+ *
+ * Returns 0 on success, -EINVAL when dev is NULL, -ENOTSUP when the
+ * recorded flush type is not CU_FLUSH_GDR_WRITES_OPTION_HOST, -1 on
+ * CUDA failures.
+ */
+static int
+cuda_wmb(struct rte_gpu *dev)
+{
+	enum cuError res;
+	const char *err_string;
+	CUcontext current_ctx;
+	CUcontext input_ctx;
+	struct cuda_info *private;
+
+	if (dev == NULL)
+		return -EINVAL;
+
+	private = (struct cuda_info *)dev->mpshared->dev_private;
+
+	if (private->gdr_write_ordering != CU_GDR_WRITES_ORDERING_NONE) {
+		/*
+		 * No need to explicitly force the write ordering because
+		 * the device natively supports it
+		 */
+		return 0;
+	}
+
+	if (private->gdr_flush_type != CU_FLUSH_GDR_WRITES_OPTION_HOST) {
+		/*
+		 * Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function.
+		 * Application needs to use alternative methods.
+		 */
+		return -ENOTSUP;
+	}
+
+	/* Store current ctx */
+	res = pfn_cuCtxGetCurrent(&current_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuCtxGetCurrent failed with %s.", err_string);
+
+		return -1;
+	}
+
+	/* Set child ctx as current ctx */
+	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
+	res = pfn_cuCtxSetCurrent(input_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR, "cuCtxSetCurrent input failed with %s.", err_string);
+
+		return -1;
+	}
+
+	res = pfn_cuFlushGPUDirectRDMAWrites(CU_FLUSH_GDR_WRITES_TARGET_CURRENT_CTX,
+					CU_FLUSH_GDR_WRITES_TO_ALL_DEVICES);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+				"cuFlushGPUDirectRDMAWrites failed with %s.",
+				err_string);
+
+		/* Best effort: do not leave the caller on the device context */
+		(void)pfn_cuCtxSetCurrent(current_ctx);
+
+		return -1;
+	}
+
+	/* Restore original ctx as current ctx */
+	res = pfn_cuCtxSetCurrent(current_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+				"cuCtxSetCurrent current failed with %s.",
+				err_string);
+
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * PCI probe callback: register one NVIDIA GPU with the gpudev library.
+ *
+ * Allocates the gpudev device, runs the one-time CUDA driver loading when the
+ * allocated device has dev_id 0, retains the GPU primary context, fills the
+ * generic and driver-private device info, installs the device ops and
+ * completes the registration.
+ *
+ * Returns 0 on success, -EINVAL for a NULL PCI device, -ENODEV when the
+ * gpudev device cannot be allocated, -ENOTSUP when the CUDA library or a
+ * required version is missing, and -1 on any other CUDA or allocation
+ * failure. On every failure after the allocation, the gpudev device (and the
+ * private data, if already allocated) is released before returning.
+ */
+static int
+cuda_gpu_probe(__rte_unused struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
+{
+	struct rte_gpu *dev = NULL;
+	enum cuError res;
+	cuDev cu_dev_id;
+	CUcontext pctx;
+	const char *err_string;
+	int processor_count = 0;
+	struct cuda_info *private;
+	/* Default error code; -ENOTSUP sites override it before jumping */
+	int ret = -1;
+
+	if (pci_dev == NULL) {
+		rte_gpu_cuda_log(ERR, "NULL PCI device");
+		return -EINVAL;
+	}
+
+	/* Allocate memory to be used privately by drivers */
+	dev = rte_gpu_allocate(pci_dev->device.name);
+	if (dev == NULL)
+		return -ENODEV;
+
+	/* Initialize values only for the first CUDA driver call */
+	if (dev->mpshared->info.dev_id == 0) {
+		mem_alloc_list_head = NULL;
+		mem_alloc_list_tail = NULL;
+		mem_alloc_list_last_elem = 0;
+
+		/* Load libcuda.so library */
+		if (cuda_loader()) {
+			rte_gpu_cuda_log(ERR, "CUDA Driver library not found.\n");
+			ret = -ENOTSUP;
+			goto err_release;
+		}
+
+		/* Load initial CUDA functions */
+		if (cuda_sym_func_loader()) {
+			rte_gpu_cuda_log(ERR, "CUDA functions not found in library.\n");
+			ret = -ENOTSUP;
+			goto err_release;
+		}
+
+		/*
+		 * Required to initialize the CUDA Driver.
+		 * Multiple calls of cuInit() will return immediately
+		 * without making any relevant change
+		 */
+		res = sym_cuInit(0);
+		if (res != 0) {
+			rte_gpu_cuda_log(ERR, "cuInit failed with %d\n", res);
+			ret = -ENOTSUP;
+			goto err_release;
+		}
+
+		res = sym_cuDriverGetVersion(&cuda_driver_version);
+		if (res != 0) {
+			rte_gpu_cuda_log(ERR, "cuDriverGetVersion failed with %d\n", res);
+			ret = -ENOTSUP;
+			goto err_release;
+		}
+
+		if (cuda_driver_version < CUDA_DRIVER_MIN_VERSION) {
+			rte_gpu_cuda_log(ERR, "CUDA Driver version found is %d Minimum requirement is %d\n",
+							cuda_driver_version, CUDA_DRIVER_MIN_VERSION);
+			ret = -ENOTSUP;
+			goto err_release;
+		}
+
+		if (cuda_pfn_func_loader()) {
+			rte_gpu_cuda_log(ERR, "CUDA PFN functions not found in library.\n");
+			ret = -ENOTSUP;
+			goto err_release;
+		}
+	}
+
+	/* Fill HW specific part of device structure */
+	dev->device = &pci_dev->device;
+	dev->mpshared->info.numa_node = pci_dev->device.numa_node;
+
+	/* Get NVIDIA GPU Device descriptor */
+	res = pfn_cuDeviceGetByPCIBusId(&cu_dev_id, dev->device->name);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+				"cuDeviceGetByPCIBusId name %s failed with %d: %s.\n",
+				dev->device->name, res, err_string);
+
+		goto err_release;
+	}
+
+	res = pfn_cuDevicePrimaryCtxRetain(&pctx, cu_dev_id);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+				"cuDevicePrimaryCtxRetain name %s failed with %d: %s.\n",
+				dev->device->name, res, err_string);
+
+		goto err_release;
+	}
+
+	res = pfn_cuCtxGetApiVersion(pctx, &cuda_api_version);
+	if (res != 0) {
+		rte_gpu_cuda_log(ERR, "cuCtxGetApiVersion failed with %d\n", res);
+		ret = -ENOTSUP;
+		goto err_release;
+	}
+
+	if (cuda_api_version < CUDA_API_MIN_VERSION) {
+		rte_gpu_cuda_log(ERR, "CUDA API version found is %d Minimum requirement is %d\n",
+						cuda_api_version, CUDA_API_MIN_VERSION);
+		ret = -ENOTSUP;
+		goto err_release;
+	}
+
+	/* Go through uintptr_t so the pointer-to-integer cast is size-safe */
+	dev->mpshared->info.context = (uint64_t)(uintptr_t)pctx;
+
+	/*
+	 * GPU Device generic info
+	 */
+
+	/* Processor count */
+	res = pfn_cuDeviceGetAttribute(&(processor_count),
+					CU_DEV_ATTR_MULTIPROCESSOR_COUNT,
+					cu_dev_id);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+				"cuDeviceGetAttribute failed with %s.\n",
+				err_string);
+
+		goto err_release;
+	}
+	dev->mpshared->info.processor_count = (uint32_t)processor_count;
+
+	/* Total memory */
+	res = pfn_cuDeviceTotalMem(&dev->mpshared->info.total_memory, cu_dev_id);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+				"cuDeviceTotalMem failed with %s.\n",
+				err_string);
+
+		goto err_release;
+	}
+
+	/*
+	 * GPU Device private info
+	 */
+	dev->mpshared->dev_private = rte_zmalloc(NULL,
+						sizeof(struct cuda_info),
+						RTE_CACHE_LINE_SIZE);
+	if (dev->mpshared->dev_private == NULL) {
+		rte_gpu_cuda_log(ERR,
+				"Failed to allocate memory for GPU process private.\n");
+
+		goto err_release;
+	}
+
+	private = (struct cuda_info *)dev->mpshared->dev_private;
+	private->cu_dev = cu_dev_id;
+	res = pfn_cuDeviceGetName(private->gpu_name,
+				RTE_DEV_NAME_MAX_LEN,
+				cu_dev_id);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+				"cuDeviceGetName failed with %s.\n",
+				err_string);
+
+		goto err_free_private;
+	}
+
+	res = pfn_cuDeviceGetAttribute(&(private->gdr_supported),
+					CU_DEV_ATTR_GPU_DIRECT_RDMA_SUPPORTED,
+					cu_dev_id);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+					"cuDeviceGetAttribute failed with %s.\n",
+					err_string);
+
+		goto err_free_private;
+	}
+
+	/* Missing GPUDirect RDMA support is only reported, probing continues */
+	if (private->gdr_supported == 0)
+		rte_gpu_cuda_log(WARNING,
+					"GPU %s doesn't support GPUDirect RDMA.\n",
+					pci_dev->device.name);
+
+	res = pfn_cuDeviceGetAttribute(&(private->gdr_write_ordering),
+					CU_DEV_ATTR_GPU_DIRECT_RDMA_WRITES_ORDERING,
+					cu_dev_id);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_gpu_cuda_log(ERR,
+					"cuDeviceGetAttribute failed with %s.\n",
+					err_string);
+
+		goto err_free_private;
+	}
+
+	/* The flush type is only queried when no native ordering is reported */
+	if (private->gdr_write_ordering == CU_GDR_WRITES_ORDERING_NONE) {
+		res = pfn_cuDeviceGetAttribute(&(private->gdr_flush_type),
+					CU_DEV_ATTR_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS,
+					cu_dev_id);
+		if (res != 0) {
+			pfn_cuGetErrorString(res, &(err_string));
+			rte_gpu_cuda_log(ERR,
+						"cuDeviceGetAttribute failed with %s.\n",
+						err_string);
+
+			goto err_free_private;
+		}
+
+		/* Logged only: cuda_wmb() reports -ENOTSUP later in this case */
+		if (private->gdr_flush_type != CU_FLUSH_GDR_WRITES_OPTION_HOST) {
+			rte_gpu_cuda_log(ERR,
+						"GPUDirect RDMA flush writes API is not supported.\n");
+		}
+	}
+
+	dev->ops.dev_info_get = cuda_dev_info_get;
+	dev->ops.dev_close = cuda_dev_close;
+	dev->ops.mem_alloc = cuda_mem_alloc;
+	dev->ops.mem_free = cuda_mem_free;
+	dev->ops.mem_register = cuda_mem_register;
+	dev->ops.mem_unregister = cuda_mem_unregister;
+	dev->ops.wmb = cuda_wmb;
+
+	rte_gpu_complete_new(dev);
+
+	rte_gpu_cuda_log_debug("dev id = %u name = %s\n", dev->mpshared->info.dev_id, private->gpu_name);
+
+	return 0;
+
+err_free_private:
+	rte_free(dev->mpshared->dev_private);
+	dev->mpshared->dev_private = NULL;
+err_release:
+	/*
+	 * NOTE(review): a primary context retained above is not released on
+	 * this path; confirm whether cuDevicePrimaryCtxRelease must be called.
+	 */
+	rte_gpu_release(dev);
+	return ret;
+}
+
+/*
+ * PCI remove callback: release the gpudev device registered under the name
+ * of the given PCI device.
+ *
+ * Returns -EINVAL for a NULL PCI device and -ENODEV when no gpudev device
+ * carries that name. A failure of rte_gpu_release() is logged but not
+ * propagated: the function returns 0 in that case too.
+ */
+static int
+cuda_gpu_remove(struct rte_pci_device *pci_dev)
+{
+	struct rte_gpu *gpu;
+	uint8_t id;
+	int rc;
+
+	if (pci_dev == NULL)
+		return -EINVAL;
+
+	gpu = rte_gpu_get_by_name(pci_dev->device.name);
+	if (gpu == NULL) {
+		rte_gpu_cuda_log(ERR,
+				"Couldn't find HW dev \"%s\" to uninitialise it",
+				pci_dev->device.name);
+		return -ENODEV;
+	}
+
+	/* Read the id before the release: it is still needed for the logs */
+	id = gpu->mpshared->info.dev_id;
+
+	/* Hand the device back to the gpudev library */
+	rc = rte_gpu_release(gpu);
+	if (rc != 0)
+		rte_gpu_cuda_log(ERR, "Device %i failed to uninit: %i", id, rc);
+
+	rte_gpu_cuda_log_debug("Destroyed dev = %u", id);
+
+	return 0;
+}
+
+/*
+ * PCI driver descriptor: ties the probe/remove callbacks above to the
+ * pci_id_cuda_map device table.
+ */
+static struct rte_pci_driver rte_cuda_driver = {
+	.probe = cuda_gpu_probe,
+	.remove = cuda_gpu_remove,
+	.id_table = pci_id_cuda_map,
+	.drv_flags = RTE_PCI_DRV_WC_ACTIVATE,
+};
+
+/* Register the driver, its PCI id table and its kernel module dependencies */
+RTE_PMD_REGISTER_PCI(gpu_cuda, rte_cuda_driver);
+RTE_PMD_REGISTER_PCI_TABLE(gpu_cuda, pci_id_cuda_map);
+RTE_PMD_REGISTER_KMOD_DEP(gpu_cuda, "* nvidia & (nv_peer_mem | nvpeer_mem)");
diff --git a/drivers/gpu/cuda/cuda_loader.h b/drivers/gpu/cuda/cuda_loader.h
new file mode 100644
index 0000000000..7d12ed5c8a
--- /dev/null
+++ b/drivers/gpu/cuda/cuda_loader.h
@@ -0,0 +1,301 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2021 NVIDIA Corporation & Affiliates
+ */
+
+/*
+ * This header is inspired from cuda.h and cudaTypes.h
+ * typically found in /usr/local/cuda/include
+ */
+
+#ifndef DPDK_CUDA_LOADER_H
+#define DPDK_CUDA_LOADER_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <rte_bitops.h>
+
+/*
+ * Integer type carrying a device address (see the cuMemAlloc/cuMemFree
+ * prototypes below): unsigned long long when __LP64__ is defined,
+ * unsigned int otherwise.
+ */
+#if defined(__LP64__)
+typedef unsigned long long cuDevPtr_v2;
+#else
+typedef unsigned int cuDevPtr_v2;
+#endif
+typedef cuDevPtr_v2 cuDevPtr;
+
+/* Device handle, a plain int */
+typedef int cuDev_v1;
+typedef cuDev_v1 cuDev;
+/* Context handle: pointer to a struct that is never defined in this header */
+typedef struct CUctx_st *CUcontext;
+
+/*
+ * Status code returned by every CUDA entry point declared in this header.
+ * SUCCESS is 0; the driver code tests results against 0 to detect a failure.
+ * NOTE(review): the numeric values presumably mirror CUresult from cuda.h,
+ * confirm against the CUDA headers before adding or changing an entry.
+ */
+enum cuError {
+	SUCCESS = 0,
+	ERROR_INVALID_VALUE = 1,
+	ERROR_OUT_OF_MEMORY = 2,
+	ERROR_NOT_INITIALIZED = 3,
+	ERROR_DEINITIALIZED = 4,
+	ERROR_PROFILER_DISABLED = 5,
+	ERROR_PROFILER_NOT_INITIALIZED = 6,
+	ERROR_PROFILER_ALREADY_STARTED = 7,
+	ERROR_PROFILER_ALREADY_STOPPED = 8,
+	ERROR_STUB_LIBRARY = 34,
+	ERROR_NO_DEVICE = 100,
+	ERROR_INVALID_DEVICE = 101,
+	ERROR_DEVICE_NOT_LICENSED = 102,
+	ERROR_INVALID_IMAGE = 200,
+	ERROR_INVALID_CONTEXT = 201,
+	ERROR_CONTEXT_ALREADY_CURRENT = 202,
+	ERROR_MAP_FAILED = 205,
+	ERROR_UNMAP_FAILED = 206,
+	ERROR_ARRAY_IS_MAPPED = 207,
+	ERROR_ALREADY_MAPPED = 208,
+	ERROR_NO_BINARY_FOR_GPU = 209,
+	ERROR_ALREADY_ACQUIRED = 210,
+	ERROR_NOT_MAPPED = 211,
+	ERROR_NOT_MAPPED_AS_ARRAY = 212,
+	ERROR_NOT_MAPPED_AS_POINTER = 213,
+	ERROR_ECC_UNCORRECTABLE = 214,
+	ERROR_UNSUPPORTED_LIMIT = 215,
+	ERROR_CONTEXT_ALREADY_IN_USE = 216,
+	ERROR_PEER_ACCESS_UNSUPPORTED = 217,
+	ERROR_INVALID_PTX = 218,
+	ERROR_INVALID_GRAPHICS_CONTEXT = 219,
+	ERROR_NVLINK_UNCORRECTABLE = 220,
+	ERROR_JIT_COMPILER_NOT_FOUND = 221,
+	ERROR_UNSUPPORTED_PTX_VERSION = 222,
+	ERROR_JIT_COMPILATION_DISABLED = 223,
+	ERROR_UNSUPPORTED_EXEC_AFFINITY = 224,
+	ERROR_INVALID_SOURCE = 300,
+	ERROR_FILE_NOT_FOUND = 301,
+	ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302,
+	ERROR_SHARED_OBJECT_INIT_FAILED = 303,
+	ERROR_OPERATING_SYSTEM = 304,
+	ERROR_INVALID_HANDLE = 400,
+	ERROR_ILLEGAL_STATE = 401,
+	ERROR_NOT_FOUND = 500,
+	ERROR_NOT_READY = 600,
+	ERROR_ILLEGAL_ADDRESS = 700,
+	ERROR_LAUNCH_OUT_OF_RESOURCES = 701,
+	ERROR_LAUNCH_TIMEOUT = 702,
+	ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703,
+	ERROR_PEER_ACCESS_ALREADY_ENABLED = 704,
+	ERROR_PEER_ACCESS_NOT_ENABLED = 705,
+	ERROR_PRIMARY_CONTEXT_ACTIVE = 708,
+	ERROR_CONTEXT_IS_DESTROYED = 709,
+	ERROR_ASSERT = 710,
+	ERROR_TOO_MANY_PEERS = 711,
+	ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712,
+	ERROR_HOST_MEMORY_NOT_REGISTERED = 713,
+	ERROR_HARDWARE_STACK_ERROR = 714,
+	ERROR_ILLEGAL_INSTRUCTION = 715,
+	ERROR_MISALIGNED_ADDRESS = 716,
+	ERROR_INVALID_ADDRESS_SPACE = 717,
+	ERROR_INVALID_PC = 718,
+	ERROR_LAUNCH_FAILED = 719,
+	ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720,
+	ERROR_NOT_PERMITTED = 800,
+	ERROR_NOT_SUPPORTED = 801,
+	ERROR_SYSTEM_NOT_READY = 802,
+	ERROR_SYSTEM_DRIVER_MISMATCH = 803,
+	ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804,
+	ERROR_MPS_CONNECTION_FAILED = 805,
+	ERROR_MPS_RPC_FAILURE = 806,
+	ERROR_MPS_SERVER_NOT_READY = 807,
+	ERROR_MPS_MAX_CLIENTS_REACHED = 808,
+	ERROR_MPS_MAX_CONNECTIONS_REACHED = 809,
+	ERROR_STREAM_CAPTURE_UNSUPPORTED = 900,
+	ERROR_STREAM_CAPTURE_INVALIDATED = 901,
+	ERROR_STREAM_CAPTURE_MERGE = 902,
+	ERROR_STREAM_CAPTURE_UNMATCHED = 903,
+	ERROR_STREAM_CAPTURE_UNJOINED = 904,
+	ERROR_STREAM_CAPTURE_ISOLATION = 905,
+	ERROR_STREAM_CAPTURE_IMPLICIT = 906,
+	ERROR_CAPTURED_EVENT = 907,
+	ERROR_STREAM_CAPTURE_WRONG_THREAD = 908,
+	ERROR_TIMEOUT = 909,
+	ERROR_GRAPH_EXEC_UPDATE_FAILURE = 910,
+	ERROR_EXTERNAL_DEVICE = 911,
+	ERROR_UNKNOWN = 999
+};
+
+/*
+ * Execution Affinity Types. Useful for MPS to detect number of SMs
+ * associated to a CUDA context v3.
+ * Used as the selector of struct cuExecAffinityParams and as the type
+ * argument of cuCtxGetExecAffinity.
+ */
+enum cuExecAffinityParamType {
+	/* The parameter carries an SM count (struct cuExecAffinitySMCount) */
+	CU_EXEC_AFFINITY_TYPE_SM_COUNT = 0,
+	/* Number of defined types, not a type by itself */
+	CU_EXEC_AFFINITY_TYPE_MAX
+};
+
+/*
+ * Number of SMs associated to a context.
+ *
+ * Only the struct type is declared: a header must not define an object,
+ * otherwise every file including it defines the same global variable.
+ */
+struct cuExecAffinitySMCount {
+	/* The number of SMs the context is limited to use. */
+	unsigned int val;
+};
+
+/**
+ * Execution Affinity Parameters
+ *
+ * Tagged union: 'type' tells which member of 'param' is meaningful.
+ * Filled through the cuCtxGetExecAffinity prototype declared below.
+ */
+struct cuExecAffinityParams {
+	enum cuExecAffinityParamType type;
+	union {
+		/* Presumably valid when type is CU_EXEC_AFFINITY_TYPE_SM_COUNT */
+		struct cuExecAffinitySMCount smCount;
+	} param;
+};
+
+/* GPU device properties to query */
+enum cuDevAttr {
+	/* Number of multiprocessors on device */
+	CU_DEV_ATTR_MULTIPROCESSOR_COUNT = 16,
+	/*
+	 * Device can access host registered memory at the same virtual
+	 * address as the CPU
+	 */
+	CU_DEV_ATTR_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91,
+	/*
+	 * Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages
+	 * (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information)
+	 */
+	CU_DEV_ATTR_GPU_DIRECT_RDMA_SUPPORTED = 116,
+	/*
+	 * The returned attribute shall be interpreted as a bitmask, where the
+	 * individual bits are the CU_FLUSH_GDR_WRITES_OPTION_* macros below
+	 */
+	CU_DEV_ATTR_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS = 117,
+	/*
+	 * GPUDirect RDMA writes to the device do not need to be flushed for
+	 * consumers within the scope indicated by the returned attribute.
+	 * See the CU_GDR_WRITES_ORDERING_* macros below for the numerical
+	 * values returned here.
+	 */
+	CU_DEV_ATTR_GPU_DIRECT_RDMA_WRITES_ORDERING = 118,
+};
+
+/* Memory pointer info: attribute selector of cuPointerSetAttribute */
+enum cuPtrAttr {
+	/* The CUcontext on which a pointer was allocated or registered */
+	CU_PTR_ATTR_CONTEXT = 1,
+	/* The CUmemorytype describing the physical location of a pointer */
+	CU_PTR_ATTR_MEMORY_TYPE = 2,
+	/* The address at which a pointer's memory may be accessed on the device */
+	CU_PTR_ATTR_DEVICE_POINTER = 3,
+	/* The address at which a pointer's memory may be accessed on the host */
+	CU_PTR_ATTR_HOST_POINTER = 4,
+	/* A pair of tokens for use with the nv-p2p.h Linux kernel interface */
+	CU_PTR_ATTR_P2P_TOKENS = 5,
+	/* Synchronize every synchronous memory operation initiated on this region */
+	CU_PTR_ATTR_SYNC_MEMOPS = 6,
+	/* A process-wide unique ID for an allocated memory region */
+	CU_PTR_ATTR_BUFFER_ID = 7,
+	/* Indicates if the pointer points to managed memory */
+	CU_PTR_ATTR_IS_MANAGED = 8,
+	/* A device ordinal of a device on which a pointer was allocated or registered */
+	CU_PTR_ATTR_DEVICE_ORDINAL = 9,
+	/*
+	 * 1 if this pointer maps to an allocation that is suitable for
+	 * cudaIpcGetMemHandle, 0 otherwise
+	 */
+	CU_PTR_ATTR_IS_LEGACY_CUDA_IPC_CAPABLE = 10,
+	/* Starting address for this requested pointer */
+	CU_PTR_ATTR_RANGE_START_ADDR = 11,
+	/* Size of the address range for this requested pointer */
+	CU_PTR_ATTR_RANGE_SIZE = 12,
+	/*
+	 * 1 if this pointer is in a valid address range that is mapped to a
+	 * backing allocation, 0 otherwise
+	 */
+	CU_PTR_ATTR_MAPPED = 13,
+	/* Bitmask of allowed CUmemAllocationHandleType for this allocation */
+	CU_PTR_ATTR_ALLOWED_HANDLE_TYPES = 14,
+	/*
+	 * 1 if the memory this pointer is referencing can be used with the
+	 * GPUDirect RDMA API
+	 */
+	CU_PTR_ATTR_IS_GPU_DIRECT_RDMA_CAPABLE = 15,
+	/*
+	 * Returns the access flags the device associated with the current
+	 * context has on the corresponding memory referenced by the pointer given
+	 */
+	CU_PTR_ATTR_ACCESS_FLAGS = 16,
+	/*
+	 * Returns the mempool handle for the allocation if it was allocated
+	 * from a mempool. Otherwise returns NULL.
+	 */
+	CU_PTR_ATTR_MEMPOOL_HANDLE = 17
+};
+
+/* GPUDirect RDMA flush option types (bits of a bitmask) */
+
+/*
+ * cuFlushGPUDirectRDMAWrites() and its CUDA Runtime API counterpart are
+ * supported on the device.
+ */
+#define CU_FLUSH_GDR_WRITES_OPTION_HOST RTE_BIT32(0)
+/*
+ * The CU_STREAM_WAIT_VALUE_FLUSH flag and the
+ * CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device.
+ */
+#define CU_FLUSH_GDR_WRITES_OPTION_MEMOPS RTE_BIT32(1)
+
+/* Type of platform native ordering for GPUDirect RDMA writes */
+
+/*
+ * The device does not natively support ordering of remote writes.
+ * cuFlushGPUDirectRDMAWrites() can be leveraged if supported.
+ */
+#define CU_GDR_WRITES_ORDERING_NONE 0
+/*
+ * Natively, the device can consistently consume remote writes, although
+ * other CUDA devices may not.
+ */
+#define CU_GDR_WRITES_ORDERING_OWNER 100
+/*
+ * Any CUDA device in the system can consistently consume remote writes to
+ * this device.
+ */
+#define CU_GDR_WRITES_ORDERING_ALL_DEVICES 200
+
+/* Device scope for cuFlushGPUDirectRDMAWrites */
+enum cuFlushGDRScope {
+	/*
+	 * Blocks until remote writes are visible to the CUDA device context
+	 * owning the data.
+	 */
+	CU_FLUSH_GDR_WRITES_TO_OWNER = 100,
+	/* Blocks until remote writes are visible to all CUDA device contexts. */
+	CU_FLUSH_GDR_WRITES_TO_ALL_DEVICES = 200
+};
+
+/* Targets for cuFlushGPUDirectRDMAWrites (only one target is defined here) */
+enum cuFlushGDRTarget {
+	/* Target is currently active CUDA device context. */
+	CU_FLUSH_GDR_WRITES_TARGET_CURRENT_CTX = 0
+};
+
+/*
+ * Single-bit flag values.
+ * NOTE(review): presumably meant for the Flags argument of
+ * cuMemHostRegister declared below; confirm against cuda.h.
+ */
+#define CU_MHOST_REGISTER_PORTABLE 0x01
+#define CU_MHOST_REGISTER_DEVICEMAP 0x02
+#define CU_MHOST_REGISTER_IOMEMORY 0x04
+#define CU_MHOST_REGISTER_READ_ONLY 0x08
+
+/*
+ * Function pointers to the first CUDA driver entry points; declared here
+ * only, the definitions live outside this header.
+ * NOTE(review): presumably resolved with dlsym() by the driver loader.
+ */
+extern enum cuError (*sym_cuInit)(unsigned int flags);
+extern enum cuError (*sym_cuDriverGetVersion)(int *driverVersion);
+extern enum cuError (*sym_cuGetProcAddress)(const char *symbol, void **pfn, int cudaVersion, uint64_t flags);
+
+/* Dynamically loaded symbols with cuGetProcAddress with proper API version */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Unversioned aliases: each PFN_<name> expands to the versioned
+ * function-pointer typedef PFN_<name>_v<version> declared further down.
+ */
+
+/* Generic */
+#define PFN_cuGetErrorString  PFN_cuGetErrorString_v6000
+#define PFN_cuGetErrorName  PFN_cuGetErrorName_v6000
+#define PFN_cuPointerSetAttribute  PFN_cuPointerSetAttribute_v6000
+#define PFN_cuDeviceGetAttribute  PFN_cuDeviceGetAttribute_v2000
+
+/* cuDevice */
+#define PFN_cuDeviceGetByPCIBusId  PFN_cuDeviceGetByPCIBusId_v4010
+#define PFN_cuDevicePrimaryCtxRetain  PFN_cuDevicePrimaryCtxRetain_v7000
+#define PFN_cuDevicePrimaryCtxRelease  PFN_cuDevicePrimaryCtxRelease_v11000
+#define PFN_cuDeviceTotalMem  PFN_cuDeviceTotalMem_v3020
+#define PFN_cuDeviceGetName  PFN_cuDeviceGetName_v2000
+
+/* cuCtx */
+#define PFN_cuCtxGetApiVersion  PFN_cuCtxGetApiVersion_v3020
+#define PFN_cuCtxSetCurrent  PFN_cuCtxSetCurrent_v4000
+#define PFN_cuCtxGetCurrent  PFN_cuCtxGetCurrent_v4000
+#define PFN_cuCtxGetDevice  PFN_cuCtxGetDevice_v2000
+#define PFN_cuCtxGetExecAffinity  PFN_cuCtxGetExecAffinity_v11040
+
+/* cuMem */
+#define PFN_cuMemAlloc PFN_cuMemAlloc_v3020
+#define PFN_cuMemFree PFN_cuMemFree_v3020
+#define PFN_cuMemHostRegister  PFN_cuMemHostRegister_v6050
+#define PFN_cuMemHostUnregister  PFN_cuMemHostUnregister_v4000
+#define PFN_cuMemHostGetDevicePointer  PFN_cuMemHostGetDevicePointer_v3020
+#define PFN_cuFlushGPUDirectRDMAWrites PFN_cuFlushGPUDirectRDMAWrites_v11030
+
+/*
+ * Versioned function-pointer types, one per CUDA entry point used by the
+ * driver. All of them return an enum cuError status.
+ */
+
+/* Generic */
+typedef enum cuError (*PFN_cuGetErrorString_v6000)(enum cuError error, const char **pStr);
+typedef enum cuError (*PFN_cuGetErrorName_v6000)(enum cuError error, const char **pStr);
+typedef enum cuError (*PFN_cuPointerSetAttribute_v6000)(const void *value, enum cuPtrAttr attribute, cuDevPtr_v2 ptr);
+typedef enum cuError (*PFN_cuDeviceGetAttribute_v2000)(int *pi, enum cuDevAttr attrib, cuDev_v1 dev);
+
+/* Device */
+typedef enum cuError (*PFN_cuDeviceGetByPCIBusId_v4010)(cuDev_v1 *dev, const char *pciBusId);
+typedef enum cuError (*PFN_cuDevicePrimaryCtxRetain_v7000)(CUcontext *pctx, cuDev_v1 dev);
+typedef enum cuError (*PFN_cuDevicePrimaryCtxRelease_v11000)(cuDev_v1 dev);
+typedef enum cuError (*PFN_cuDeviceTotalMem_v3020)(size_t *bytes, cuDev_v1 dev);
+typedef enum cuError (*PFN_cuDeviceGetName_v2000)(char *name, int len, cuDev_v1 dev);
+
+/* Context */
+typedef enum cuError (*PFN_cuCtxGetApiVersion_v3020)(CUcontext ctx, unsigned int *version);
+typedef enum cuError (*PFN_cuCtxSetCurrent_v4000)(CUcontext ctx);
+typedef enum cuError (*PFN_cuCtxGetCurrent_v4000)(CUcontext *pctx);
+typedef enum cuError (*PFN_cuCtxGetDevice_v2000)(cuDev_v1 *device);
+typedef enum cuError (*PFN_cuCtxGetExecAffinity_v11040)(struct cuExecAffinityParams *pExecAffinity, enum cuExecAffinityParamType type);
+
+/* Memory */
+typedef enum cuError (*PFN_cuMemAlloc_v3020)(cuDevPtr_v2 *dptr, size_t bytesize);
+typedef enum cuError (*PFN_cuMemFree_v3020)(cuDevPtr_v2 dptr);
+typedef enum cuError (*PFN_cuMemHostRegister_v6050)(void *p, size_t bytesize, unsigned int Flags);
+typedef enum cuError (*PFN_cuMemHostUnregister_v4000)(void *p);
+typedef enum cuError (*PFN_cuMemHostGetDevicePointer_v3020)(cuDevPtr_v2 *pdptr, void *p, unsigned int Flags);
+typedef enum cuError (*PFN_cuFlushGPUDirectRDMAWrites_v11030)(enum cuFlushGDRTarget target, enum cuFlushGDRScope scope);
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif
diff --git a/drivers/gpu/cuda/meson.build b/drivers/gpu/cuda/meson.build
new file mode 100644
index 0000000000..f2a3095d8d
--- /dev/null
+++ b/drivers/gpu/cuda/meson.build
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright (c) 2021 NVIDIA Corporation & Affiliates
+
+if not is_linux
+        build = false
+        reason = 'only supported on Linux'
+endif
+
+deps += ['gpudev','pci','bus_pci']
+sources = files('cuda.c')
diff --git a/drivers/gpu/cuda/version.map b/drivers/gpu/cuda/version.map
new file mode 100644
index 0000000000..4a76d1d52d
--- /dev/null
+++ b/drivers/gpu/cuda/version.map
@@ -0,0 +1,3 @@
+DPDK_21 {
+	local: *;
+};
diff --git a/drivers/gpu/meson.build b/drivers/gpu/meson.build
index e51ad3381b..601bedcd61 100644
--- a/drivers/gpu/meson.build
+++ b/drivers/gpu/meson.build
@@ -1,4 +1,4 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright (c) 2021 NVIDIA Corporation & Affiliates
 
-drivers = []
+drivers = [ 'cuda' ]
-- 
2.17.1
^ permalink raw reply related	[flat|nested] 28+ messages in thread
* Re: [PATCH v7 1/1] gpu/cuda: introduce CUDA driver
  2021-11-16 22:50   ` [PATCH v7 1/1] " eagostini
@ 2021-11-16 15:58     ` Stephen Hemminger
  2021-11-16 16:35       ` Thomas Monjalon
  2021-11-16 16:40       ` Thomas Monjalon
  2021-11-16 16:30     ` Thomas Monjalon
  1 sibling, 2 replies; 28+ messages in thread
From: Stephen Hemminger @ 2021-11-16 15:58 UTC (permalink / raw)
  To: eagostini; +Cc: dev
Minor comments, overall looks fine.
+/* CUDA Driver functions loaded with dlsym() */
+CUresult CUDAAPI (*sym_cuInit)(unsigned int flags) = NULL;
+CUresult CUDAAPI (*sym_cuDriverGetVersion)(int *driverVersion) = NULL;
+CUresult CUDAAPI (*sym_cuGetProcAddress)(const char *symbol,
+		void **pfn, int cudaVersion, uint64_t flags) = NULL;
+
Can these be local (static) since not prefixed with driver specific name.
Also global variables are always loaded as zero so NULL initialization is
unnecessary. For other types checkpatch would complain.
> +/* NVIDIA GPU address map */
> +static struct rte_pci_id pci_id_cuda_map[] = {
Can this be const?
^ permalink raw reply	[flat|nested] 28+ messages in thread
* Re: [PATCH v7 1/1] gpu/cuda: introduce CUDA driver
  2021-11-16 22:50   ` [PATCH v7 1/1] " eagostini
  2021-11-16 15:58     ` Stephen Hemminger
@ 2021-11-16 16:30     ` Thomas Monjalon
  2021-11-16 16:44       ` Thomas Monjalon
  1 sibling, 1 reply; 28+ messages in thread
From: Thomas Monjalon @ 2021-11-16 16:30 UTC (permalink / raw)
  To: Elena Agostini; +Cc: dev
16/11/2021 23:50, eagostini@nvidia.com:
> From: Elena Agostini <eagostini@nvidia.com>
> 
> This is the CUDA implementation of the gpudev library.
> Functionalities implemented through CUDA Driver API are:
> - Device probe and remove
> - Manage device memory allocations
> - Register/unregister external CPU memory in the device memory area
> 
> Signed-off-by: Elena Agostini <eagostini@nvidia.com>
> ---
> --- /dev/null
> +++ b/drivers/gpu/cuda/version.map
> @@ -0,0 +1,3 @@
> +DPDK_21 {
> +	local: *;
> +};
Should be DPDK_22
Applied with some minor fixes, thanks.
^ permalink raw reply	[flat|nested] 28+ messages in thread
* Re: [PATCH v7 1/1] gpu/cuda: introduce CUDA driver
  2021-11-16 15:58     ` Stephen Hemminger
@ 2021-11-16 16:35       ` Thomas Monjalon
  2021-11-16 16:40       ` Thomas Monjalon
  1 sibling, 0 replies; 28+ messages in thread
From: Thomas Monjalon @ 2021-11-16 16:35 UTC (permalink / raw)
  To: eagostini, Stephen Hemminger; +Cc: dev
16/11/2021 16:58, Stephen Hemminger:
> Minor comments, overall looks fine.
> 
> 
> +/* CUDA Driver functions loaded with dlsym() */
> +CUresult CUDAAPI (*sym_cuInit)(unsigned int flags) = NULL;
> +CUresult CUDAAPI (*sym_cuDriverGetVersion)(int *driverVersion) = NULL;
> +CUresult CUDAAPI (*sym_cuGetProcAddress)(const char *symbol,
> +		void **pfn, int cudaVersion, uint64_t flags) = NULL;
> +
> 
> Can these be local (static) since not prefixed with driver specific name.
> Also global variables are always loaded as zero so NULL initialization is
> unnecessary. For other types checkpatch would complain.
> 
> 
> > +/* NVIDIA GPU address map */
> > +static struct rte_pci_id pci_id_cuda_map[] = {
> 
> Can this be const?
I didn't really push yet, so I can do this fixups now.
^ permalink raw reply	[flat|nested] 28+ messages in thread
* Re: [PATCH v7 1/1] gpu/cuda: introduce CUDA driver
  2021-11-16 15:58     ` Stephen Hemminger
  2021-11-16 16:35       ` Thomas Monjalon
@ 2021-11-16 16:40       ` Thomas Monjalon
  1 sibling, 0 replies; 28+ messages in thread
From: Thomas Monjalon @ 2021-11-16 16:40 UTC (permalink / raw)
  To: eagostini, Stephen Hemminger; +Cc: dev, david.marchand
16/11/2021 16:58, Stephen Hemminger:
> Minor comments, overall looks fine.
> 
> 
> +/* CUDA Driver functions loaded with dlsym() */
> +CUresult CUDAAPI (*sym_cuInit)(unsigned int flags) = NULL;
> +CUresult CUDAAPI (*sym_cuDriverGetVersion)(int *driverVersion) = NULL;
> +CUresult CUDAAPI (*sym_cuGetProcAddress)(const char *symbol,
> +		void **pfn, int cudaVersion, uint64_t flags) = NULL;
> +
> 
> Can these be local (static) since not prefixed with driver specific name.
> Also global variables are always loaded as zero so NULL initialization is
> unnecessary. For other types checkpatch would complain.
> 
> 
> > +/* NVIDIA GPU address map */
> > +static struct rte_pci_id pci_id_cuda_map[] = {
> 
> Can this be const?
On a separate note, Elena did what you suggested for the logtype variable:
	static RTE_LOG_REGISTER_DEFAULT(cuda_logtype, NOTICE);
So cuda_logtype is static.
I think we should do the same for all DPDK logtypes.
Volunteer for a patch?
^ permalink raw reply	[flat|nested] 28+ messages in thread
* Re: [PATCH v7 1/1] gpu/cuda: introduce CUDA driver
  2021-11-16 16:30     ` Thomas Monjalon
@ 2021-11-16 16:44       ` Thomas Monjalon
  0 siblings, 0 replies; 28+ messages in thread
From: Thomas Monjalon @ 2021-11-16 16:44 UTC (permalink / raw)
  To: Elena Agostini; +Cc: dev, stephen
16/11/2021 17:30, Thomas Monjalon:
> 16/11/2021 23:50, eagostini@nvidia.com:
> > From: Elena Agostini <eagostini@nvidia.com>
> > 
> > This is the CUDA implementation of the gpudev library.
> > Functionalities implemented through CUDA Driver API are:
> > - Device probe and remove
> > - Manage device memory allocations
> > - Register/unregister external CPU memory in the device memory area
> > 
> > Signed-off-by: Elena Agostini <eagostini@nvidia.com>
> > ---
> > --- /dev/null
> > +++ b/drivers/gpu/cuda/version.map
> > @@ -0,0 +1,3 @@
> > +DPDK_21 {
> > +	local: *;
> > +};
> 
> Should be DPDK_22
> 
> Applied with some minor fixes, thanks.
Did the static/const fixups suggested by Stephen before pushing to the repo.
^ permalink raw reply	[flat|nested] 28+ messages in thread
* [PATCH v6 0/1] gpu/cuda: introduce CUDA driver
  2021-10-05 22:49 [dpdk-dev] [RFC PATCH] gpu/cuda: introduce CUDA driver eagostini
                   ` (4 preceding siblings ...)
  2021-11-15 22:36 ` [PATCH v5 0/1] " eagostini
@ 2021-11-16 20:47 ` eagostini
  2021-11-16 20:47   ` [PATCH v6 1/1] " eagostini
  2021-11-16 22:50 ` [PATCH v7 0/1] " eagostini
  6 siblings, 1 reply; 28+ messages in thread
From: eagostini @ 2021-11-16 20:47 UTC (permalink / raw)
  To: dev; +Cc: Elena Agostini
From: Elena Agostini <eagostini@nvidia.com>
This is the CUDA implementation of the gpudev library.
Functionalities implemented through CUDA Driver API are:
- Device probe and remove
- Manage device memory allocations
- Register/unregister external CPU memory in the device memory area
Changelog:
- CUDA driver implementation of the GPU write memory barrier
- Fixed styling reported by checkpatch
- CUDA driver shared library libcuda.so is not required at build time
- CUDA driver shared library libcuda.so is loaded at runtime
- CUDA driver headers are required at build time
- Documentation updated
Elena Agostini (1):
  gpu/cuda: introduce CUDA driver
 doc/guides/gpus/cuda.rst               |  151 ++++
 doc/guides/gpus/index.rst              |    1 +
 doc/guides/rel_notes/release_21_11.rst |    2 +
 drivers/gpu/cuda/cuda.c                | 1145 ++++++++++++++++++++++++
 drivers/gpu/cuda/meson.build           |   18 +
 drivers/gpu/cuda/version.map           |    3 +
 drivers/gpu/meson.build                |    2 +-
 7 files changed, 1321 insertions(+), 1 deletion(-)
 create mode 100644 doc/guides/gpus/cuda.rst
 create mode 100644 drivers/gpu/cuda/cuda.c
 create mode 100644 drivers/gpu/cuda/meson.build
 create mode 100644 drivers/gpu/cuda/version.map
-- 
2.17.1
^ permalink raw reply	[flat|nested] 28+ messages in thread
* [PATCH v6 1/1] gpu/cuda: introduce CUDA driver
  2021-11-16 20:47 ` [PATCH v6 0/1] " eagostini
@ 2021-11-16 20:47   ` eagostini
  0 siblings, 0 replies; 28+ messages in thread
From: eagostini @ 2021-11-16 20:47 UTC (permalink / raw)
  To: dev; +Cc: Elena Agostini
From: Elena Agostini <eagostini@nvidia.com>
This is the CUDA implementation of the gpudev library.
Functionalities implemented through CUDA Driver API are:
- Device probe and remove
- Manage device memory allocations
- Register/unregister external CPU memory in the device memory area
Signed-off-by: Elena Agostini <eagostini@nvidia.com>
---
 doc/guides/gpus/cuda.rst               |  151 ++++
 doc/guides/gpus/index.rst              |    1 +
 doc/guides/rel_notes/release_21_11.rst |    2 +
 drivers/gpu/cuda/cuda.c                | 1145 ++++++++++++++++++++++++
 drivers/gpu/cuda/meson.build           |   18 +
 drivers/gpu/cuda/version.map           |    3 +
 drivers/gpu/meson.build                |    2 +-
 7 files changed, 1321 insertions(+), 1 deletion(-)
 create mode 100644 doc/guides/gpus/cuda.rst
 create mode 100644 drivers/gpu/cuda/cuda.c
 create mode 100644 drivers/gpu/cuda/meson.build
 create mode 100644 drivers/gpu/cuda/version.map
diff --git a/doc/guides/gpus/cuda.rst b/doc/guides/gpus/cuda.rst
new file mode 100644
index 0000000000..9897d52d06
--- /dev/null
+++ b/doc/guides/gpus/cuda.rst
@@ -0,0 +1,151 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright (c) 2021 NVIDIA Corporation & Affiliates
+
+CUDA GPU driver
+===============
+
+The CUDA GPU driver library (**librte_gpu_cuda**) provides support for NVIDIA GPUs.
+Information and documentation about these devices can be found on the
+`NVIDIA website <http://www.nvidia.com>`_. Help is also provided by the
+`NVIDIA CUDA Toolkit developer zone <https://docs.nvidia.com/cuda>`_.
+
+Build dependencies
+------------------
+
+The CUDA GPU driver library has a header-only dependency on ``cuda.h`` and ``cudaTypedefs.h``.
+To get these headers there are two options:
+
+- Install `CUDA Toolkit <https://developer.nvidia.com/cuda-toolkit>`_ 
+  (either regular or stubs installation).
+- Download these two headers from this `CUDA headers
+  <https://gitlab.com/nvidia/headers/cuda-individual/cudart>`_ public repo.
+
+You need to tell meson where the CUDA header files are through the CFLAGS variable.
+There are two ways to do this:
+
+- Set ``export CFLAGS=-I/usr/local/cuda/include`` before building
+- Add CFLAGS in the meson command line ``CFLAGS=-I/usr/local/cuda/include meson build``
+
+If headers are not found, the CUDA GPU driver library is not built.
+
+CUDA Shared Library
+-------------------
+
+To avoid any system configuration issue, the CUDA API **libcuda.so** shared library
+is not linked at build time because of a Meson bug that looks
+for the ``cudart`` module even if the ``meson.build`` file only requires the default ``cuda`` module.
+
+**libcuda.so** is loaded at runtime in the ``cuda_gpu_probe`` function through ``dlopen``
+when the very first GPU is detected.
+If CUDA installation resides in a custom directory,
+the environment variable ``CUDA_PATH_L`` should specify where ``dlopen``
+can look for **libcuda.so**.
+
+All CUDA API symbols are loaded at runtime as well.
+For this reason, there is no need to install the CUDA library
+in order to build the CUDA driver library.
+
+Design
+------
+
+**librte_gpu_cuda** relies on CUDA Driver API (no need for CUDA Runtime API).
+
+Goal of this driver library is not to provide a wrapper for the whole CUDA Driver API.
+Instead, the scope is to implement the generic features of gpudev API.
+For a CUDA application, integrating the gpudev library functions
+using the CUDA driver library is quite straightforward
+and doesn't create any compatibility problem.
+
+Initialization
+~~~~~~~~~~~~~~
+
+During initialization, CUDA driver library detects NVIDIA physical GPUs
+on the system or specified via EAL device options (e.g. ``-a b6:00.0``).
+The driver initializes the CUDA driver environment through ``cuInit(0)`` function.
+For this reason, it's required to set any CUDA environment configuration before
+calling ``rte_eal_init`` function in the DPDK application.
+
+If the CUDA driver environment has been already initialized, the ``cuInit(0)``
+in CUDA driver library has no effect.
+
+CUDA Driver sub-contexts
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+After initialization, a CUDA application can create multiple sub-contexts
+on GPU physical devices.
+Through the gpudev library, it is possible to register these sub-contexts
+in the CUDA driver library as child devices having as parent a GPU physical device.
+
+CUDA driver library also supports `MPS
+<https://docs.nvidia.com/deploy/pdf/CUDA_Multi_Process_Service_Overview.pdf>`__.
+
+GPU memory management
+~~~~~~~~~~~~~~~~~~~~~
+
+The CUDA driver library maintains a table of GPU memory addresses allocated
+and CPU memory addresses registered, each associated with the input CUDA context.
+Whenever the application tries to deallocate or deregister a memory address
+that is not in the table, the CUDA driver library returns an error.
+
+Features
+--------
+
+- Register new child devices aka new CUDA Driver contexts.
+- Allocate memory on the GPU.
+- Register CPU memory to make it visible from GPU.
+
+Minimal requirements
+--------------------
+
+Minimal requirements to enable the CUDA driver library are:
+
+- NVIDIA GPU Ampere or Volta
+- CUDA 11.4 Driver API or newer
+
+`GPUDirect RDMA Technology <https://docs.nvidia.com/cuda/gpudirect-rdma/index.html>`_
+allows compatible network cards (e.g. Mellanox) to directly send and receive packets
+using GPU memory instead of additional memory copies through the CPU system memory.
+To enable this technology, system requirements are:
+
+- `nvidia-peermem <https://docs.nvidia.com/cuda/gpudirect-rdma/index.html#nvidia-peermem>`_
+  module running on the system;
+- Mellanox network card ConnectX-5 or newer (BlueField models included);
+- DPDK mlx5 PMD enabled;
+- To reach the best performance, an additional PCIe switch between GPU and NIC is recommended.
+
+Limitations
+-----------
+
+Supported only on Linux.
+
+Supported GPUs
+--------------
+
+The following NVIDIA GPU devices are supported by this CUDA driver library:
+
+- NVIDIA A100 80GB PCIe
+- NVIDIA A100 40GB PCIe
+- NVIDIA A30 24GB
+- NVIDIA A10 24GB
+- NVIDIA V100 32GB PCIe
+- NVIDIA V100 16GB PCIe
+
+External references
+-------------------
+
+A good example of how to use the GPU CUDA driver library through the gpudev library
+is the l2fwd-nv application that can be found `here <https://github.com/NVIDIA/l2fwd-nv>`_.
+
+The application is based on vanilla DPDK example l2fwd
+and is enhanced with GPU memory managed through gpudev library
+and CUDA to launch the swap of packets MAC addresses workload on the GPU.
+
+l2fwd-nv is not intended to be used for performance measurements
+(testpmd is a better candidate for that).
+The goal is to show different use-cases about how a CUDA application can use DPDK to:
+
+- Allocate memory on GPU device using gpudev library.
+- Use that memory to create an external GPU memory mempool.
+- Receive packets directly in GPU memory.
+- Coordinate the workload on the GPU with the network and CPU activity to receive packets.
+- Send modified packets directly from the GPU memory.
diff --git a/doc/guides/gpus/index.rst b/doc/guides/gpus/index.rst
index 1878423239..4b7a420556 100644
--- a/doc/guides/gpus/index.rst
+++ b/doc/guides/gpus/index.rst
@@ -9,3 +9,4 @@ General-Purpose Graphics Processing Unit Drivers
    :numbered:
 
    overview
+   cuda
diff --git a/doc/guides/rel_notes/release_21_11.rst b/doc/guides/rel_notes/release_21_11.rst
index cd4dcd0077..d76bba2fe3 100644
--- a/doc/guides/rel_notes/release_21_11.rst
+++ b/doc/guides/rel_notes/release_21_11.rst
@@ -111,6 +111,8 @@ New Features
   * Memory management
   * Communication flag & list
 
+* **Added NVIDIA GPU driver implemented with CUDA library.**
+
 * **Added new RSS offload types for IPv4/L4 checksum in RSS flow.**
 
   Added macros ETH_RSS_IPV4_CHKSUM and ETH_RSS_L4_CHKSUM, now IPv4 and
diff --git a/drivers/gpu/cuda/cuda.c b/drivers/gpu/cuda/cuda.c
new file mode 100644
index 0000000000..25556e0591
--- /dev/null
+++ b/drivers/gpu/cuda/cuda.c
@@ -0,0 +1,1145 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2021 NVIDIA Corporation & Affiliates
+ */
+
+#include <dlfcn.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_malloc.h>
+#include <rte_errno.h>
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_byteorder.h>
+#include <rte_dev.h>
+
+#include <gpudev_driver.h>
+#include <cuda.h>
+#include <cudaTypedefs.h>
+
+#define CUDA_DRIVER_MIN_VERSION 11040
+#define CUDA_API_MIN_VERSION 3020
+
+/*
+ * CUDA Driver functions loaded with dlsym().
+ * These three entry points are resolved from the libcuda.so handle by
+ * cuda_sym_func_loader(); sym_cuGetProcAddress is then used by
+ * cuda_pfn_func_loader() to retrieve every pfn_* pointer below.
+ */
+CUresult CUDAAPI (*sym_cuInit)(unsigned int flags) = NULL;
+CUresult CUDAAPI (*sym_cuDriverGetVersion)(int *driverVersion) = NULL;
+CUresult CUDAAPI (*sym_cuGetProcAddress)(const char *symbol,
+		void **pfn, int cudaVersion, uint64_t flags) = NULL;
+
+/*
+ * CUDA Driver functions loaded with cuGetProcAddress for versioning.
+ * Each pointer is filled by cuda_pfn_func_loader(), which requests the
+ * symbol for the version stored in cuda_driver_version.
+ */
+PFN_cuGetErrorString pfn_cuGetErrorString;
+PFN_cuGetErrorName pfn_cuGetErrorName;
+PFN_cuPointerSetAttribute pfn_cuPointerSetAttribute;
+PFN_cuDeviceGetAttribute pfn_cuDeviceGetAttribute;
+PFN_cuDeviceGetByPCIBusId pfn_cuDeviceGetByPCIBusId;
+PFN_cuDevicePrimaryCtxRetain pfn_cuDevicePrimaryCtxRetain;
+PFN_cuDevicePrimaryCtxRelease pfn_cuDevicePrimaryCtxRelease;
+PFN_cuDeviceTotalMem pfn_cuDeviceTotalMem;
+PFN_cuDeviceGetName pfn_cuDeviceGetName;
+PFN_cuCtxGetApiVersion pfn_cuCtxGetApiVersion;
+PFN_cuCtxSetCurrent pfn_cuCtxSetCurrent;
+PFN_cuCtxGetCurrent pfn_cuCtxGetCurrent;
+PFN_cuCtxGetDevice pfn_cuCtxGetDevice;
+PFN_cuCtxGetExecAffinity pfn_cuCtxGetExecAffinity;
+PFN_cuMemAlloc pfn_cuMemAlloc;
+PFN_cuMemFree pfn_cuMemFree;
+PFN_cuMemHostRegister pfn_cuMemHostRegister;
+PFN_cuMemHostUnregister pfn_cuMemHostUnregister;
+PFN_cuMemHostGetDevicePointer pfn_cuMemHostGetDevicePointer;
+PFN_cuFlushGPUDirectRDMAWrites pfn_cuFlushGPUDirectRDMAWrites;
+
+static void *cudalib; /* Handle returned by dlopen() in cuda_loader() */
+static unsigned int cuda_api_version; /* NOTE(review): not written in the visible part of this file - confirm where it is set */
+static int cuda_driver_version; /* Filled by sym_cuDriverGetVersion() in cuda_gpu_probe(), passed to cuGetProcAddress */
+
+/* NVIDIA GPU PCI vendor ID */
+#define NVIDIA_GPU_VENDOR_ID (0x10de)
+
+/* NVIDIA GPU PCI device IDs, listed in pci_id_cuda_map */
+#define NVIDIA_GPU_A100_40GB_DEVICE_ID (0x20f1)
+#define NVIDIA_GPU_A100_80GB_DEVICE_ID (0x20b5)
+
+#define NVIDIA_GPU_A30_24GB_DEVICE_ID (0x20b7)
+#define NVIDIA_GPU_A10_24GB_DEVICE_ID (0x2236)
+
+#define NVIDIA_GPU_V100_32GB_DEVICE_ID (0x1db6)
+#define NVIDIA_GPU_V100_16GB_DEVICE_ID (0x1db4)
+
+#define CUDA_MAX_ALLOCATION_NUM 512 /* NOTE(review): not referenced in the visible code - confirm it is still needed */
+
+#define GPU_PAGE_SHIFT 16
+#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT) /* 1 << 16 = 64 KiB */
+
+static RTE_LOG_REGISTER_DEFAULT(cuda_logtype, NOTICE);
+
+/*
+ * Helper macro for logging.
+ * rte_cuda_log() appends "\n" to fmt through string-literal concatenation,
+ * so fmt must be a string literal and callers omit the trailing newline.
+ * rte_cuda_debug() logs at DEBUG level and prefixes the message with the
+ * source line number and the calling function name.
+ */
+#define rte_cuda_log(level, fmt, ...) \
+	rte_log(RTE_LOG_ ## level, cuda_logtype, fmt "\n", ##__VA_ARGS__)
+
+#define rte_cuda_debug(fmt, ...) \
+	rte_cuda_log(DEBUG, RTE_STR(__LINE__) ":%s() " fmt, __func__, \
+		##__VA_ARGS__)
+
+/*
+ * PCI IDs of the NVIDIA GPUs handled by this driver.
+ * The last entry has device_id set to 0.
+ */
+static struct rte_pci_id pci_id_cuda_map[] = {
+	{ RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID, NVIDIA_GPU_A100_40GB_DEVICE_ID) },
+	{ RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID, NVIDIA_GPU_A100_80GB_DEVICE_ID) },
+	{ RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID, NVIDIA_GPU_A30_24GB_DEVICE_ID) },
+	{ RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID, NVIDIA_GPU_A10_24GB_DEVICE_ID) },
+	{ RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID, NVIDIA_GPU_V100_32GB_DEVICE_ID) },
+	{ RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID, NVIDIA_GPU_V100_16GB_DEVICE_ID) },
+	{ .device_id = 0 }
+};
+
+/*
+ * Device private info.
+ * An instance is stored in dev->mpshared->dev_private.
+ */
+struct cuda_info {
+	char gpu_name[RTE_DEV_NAME_MAX_LEN]; /* Filled by cuDeviceGetName() */
+	CUdevice cu_dev; /* CUDA device handle */
+	int gdr_supported; /* NOTE(review): presumably a GPUDirect RDMA support flag; set outside the visible code - confirm */
+	int gdr_write_ordering; /* Compared with CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE in cuda_wmb() */
+	int gdr_flush_type; /* Compared with CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST in cuda_wmb() */
+};
+
+/* Type of memory tracked in the CUDA driver memory list */
+enum mem_type {
+	GPU_MEM = 0, /* Device memory obtained by cuda_mem_alloc() */
+	CPU_REGISTERED, /* Host memory registered by cuda_mem_register() */
+	GPU_REGISTERED /* Not used yet */
+};
+
+/* Key associated to a memory address: the address value itself, see get_hash_from_ptr() */
+typedef uintptr_t cuda_ptr_key;
+
+/* Single entry of the memory list, doubly linked through prev and next */
+struct mem_entry {
+	CUdeviceptr ptr_d; /* Device-side address */
+	void *ptr_h; /* Host address for CPU_REGISTERED entries, NULL for GPU_MEM */
+	size_t size; /* Size passed to cuMemAlloc() or cuMemHostRegister() */
+	struct rte_gpu *dev; /* Device the memory was allocated or registered on */
+	CUcontext ctx; /* CUDA context taken from dev->mpshared->info.context */
+	cuda_ptr_key pkey; /* Lookup key: ptr_d for GPU_MEM, ptr_h for CPU_REGISTERED */
+	enum mem_type mtype;
+	struct mem_entry *prev;
+	struct mem_entry *next;
+};
+
+static struct mem_entry *mem_alloc_list_head; /* First entry of the memory list */
+static struct mem_entry *mem_alloc_list_tail; /* Entry most recently appended by mem_list_add_item() */
+static uint32_t mem_alloc_list_last_elem; /* Number of entries currently in the list */
+
+/*
+ * Open the CUDA driver shared library and keep its handle in cudalib.
+ *
+ * When the CUDA_PATH_L environment variable is set it is used as the
+ * directory that contains libcuda.so; otherwise the plain library name is
+ * handed to dlopen(), which applies its default search rules.
+ *
+ * Returns 0 on success, -1 when the path does not fit in the local buffer
+ * or when dlopen() fails.
+ */
+
+static int
+cuda_loader(void)
+{
+	char cuda_path[1024];
+	const char *cuda_dir = getenv("CUDA_PATH_L");
+	int len;
+
+	if (cuda_dir == NULL)
+		len = snprintf(cuda_path, sizeof(cuda_path), "%s", "libcuda.so");
+	else
+		/* A separator is required: CUDA_PATH_L is a directory name */
+		len = snprintf(cuda_path, sizeof(cuda_path), "%s/%s", cuda_dir, "libcuda.so");
+
+	/* Refuse a truncated path instead of trying to open the wrong file */
+	if (len < 0 || (size_t)len >= sizeof(cuda_path)) {
+		rte_cuda_log(ERR, "CUDA library path is too long (CUDA_PATH_L=%s)",
+				cuda_dir != NULL ? cuda_dir : "(null)");
+		return -1;
+	}
+
+	cudalib = dlopen(cuda_path, RTLD_LAZY);
+	if (cudalib == NULL) {
+		/* Never pass a NULL pointer to %s */
+		rte_cuda_log(ERR, "Failed to find CUDA library in %s (CUDA_PATH_L=%s)",
+				cuda_path, cuda_dir != NULL ? cuda_dir : "(null)");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Resolve with dlsym() the bootstrap CUDA entry points (cuInit,
+ * cuDriverGetVersion, cuGetProcAddress) from the library handle in cudalib.
+ * Returns 0 on success, -1 when the library is not open or a symbol is missing.
+ */
+static int
+cuda_sym_func_loader(void)
+{
+	if (!cudalib)
+		return -1;
+
+	if ((sym_cuInit = dlsym(cudalib, "cuInit")) == NULL) {
+		rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuInit");
+		return -1;
+	}
+
+	if ((sym_cuDriverGetVersion = dlsym(cudalib, "cuDriverGetVersion")) == NULL) {
+		rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuDriverGetVersion");
+		return -1;
+	}
+
+	if ((sym_cuGetProcAddress = dlsym(cudalib, "cuGetProcAddress")) == NULL) {
+		rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuGetProcAddress");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Retrieve every versioned CUDA Driver entry point used by this driver
+ * through cuGetProcAddress, requesting the version in cuda_driver_version.
+ *
+ * Symbols are resolved in table order and the first failure stops the load.
+ * Returns 0 when all pointers are retrieved, -1 on the first failure.
+ */
+static int
+cuda_pfn_func_loader(void)
+{
+	/* Symbol name and the global pointer that receives its address */
+	static const struct {
+		const char *symbol;
+		void **pfn;
+	} pfn_table[] = {
+		{ "cuGetErrorString", (void **)&pfn_cuGetErrorString },
+		{ "cuGetErrorName", (void **)&pfn_cuGetErrorName },
+		{ "cuPointerSetAttribute", (void **)&pfn_cuPointerSetAttribute },
+		{ "cuDeviceGetAttribute", (void **)&pfn_cuDeviceGetAttribute },
+		{ "cuDeviceGetByPCIBusId", (void **)&pfn_cuDeviceGetByPCIBusId },
+		{ "cuDeviceGetName", (void **)&pfn_cuDeviceGetName },
+		{ "cuDevicePrimaryCtxRetain", (void **)&pfn_cuDevicePrimaryCtxRetain },
+		{ "cuDevicePrimaryCtxRelease", (void **)&pfn_cuDevicePrimaryCtxRelease },
+		{ "cuDeviceTotalMem", (void **)&pfn_cuDeviceTotalMem },
+		{ "cuCtxGetApiVersion", (void **)&pfn_cuCtxGetApiVersion },
+		{ "cuCtxGetDevice", (void **)&pfn_cuCtxGetDevice },
+		{ "cuCtxSetCurrent", (void **)&pfn_cuCtxSetCurrent },
+		{ "cuCtxGetCurrent", (void **)&pfn_cuCtxGetCurrent },
+		{ "cuCtxGetExecAffinity", (void **)&pfn_cuCtxGetExecAffinity },
+		{ "cuMemAlloc", (void **)&pfn_cuMemAlloc },
+		{ "cuMemFree", (void **)&pfn_cuMemFree },
+		{ "cuMemHostRegister", (void **)&pfn_cuMemHostRegister },
+		{ "cuMemHostUnregister", (void **)&pfn_cuMemHostUnregister },
+		{ "cuMemHostGetDevicePointer", (void **)&pfn_cuMemHostGetDevicePointer },
+		{ "cuFlushGPUDirectRDMAWrites", (void **)&pfn_cuFlushGPUDirectRDMAWrites },
+	};
+	CUresult res;
+	size_t i;
+
+	for (i = 0; i < RTE_DIM(pfn_table); i++) {
+		res = sym_cuGetProcAddress(pfn_table[i].symbol, pfn_table[i].pfn,
+				cuda_driver_version, 0);
+		if (res != 0) {
+			rte_cuda_log(ERR, "Retrieve pfn_%s failed with %d",
+					pfn_table[i].symbol, res);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/* Build the memory list lookup key for a pointer: the key is the address value */
+static cuda_ptr_key
+get_hash_from_ptr(void *ptr)
+{
+	cuda_ptr_key key = (uintptr_t)ptr;
+
+	return key;
+}
+
+/* Report how many entries the memory list currently holds */
+static uint32_t
+mem_list_count_item(void)
+{
+	const uint32_t count = mem_alloc_list_last_elem;
+
+	return count;
+}
+
+/*
+ * Append a zero-initialized entry at the tail of the memory list,
+ * starting the list when it has no head yet.
+ * Returns the new tail entry, or NULL when the allocation fails.
+ */
+static struct mem_entry *
+mem_list_add_item(void)
+{
+	struct mem_entry *item;
+
+	item = rte_zmalloc(NULL, sizeof(struct mem_entry), RTE_CACHE_LINE_SIZE);
+	if (item == NULL) {
+		rte_cuda_log(ERR, "Failed to allocate memory for memory list");
+		return NULL;
+	}
+
+	if (mem_alloc_list_head == NULL) {
+		/* First entry: it is both head and tail */
+		item->prev = NULL;
+		mem_alloc_list_head = item;
+	} else {
+		/* Link the new entry after the current tail */
+		item->prev = mem_alloc_list_tail;
+		mem_alloc_list_tail->next = item;
+	}
+	item->next = NULL;
+	mem_alloc_list_tail = item;
+
+	mem_alloc_list_last_elem++;
+
+	return mem_alloc_list_tail;
+}
+
+/*
+ * Look up the memory list entry whose key equals pk.
+ * Returns the entry, or NULL when the list has no head, holds no items
+ * or contains no entry with that key.
+ */
+static struct mem_entry *
+mem_list_find_item(cuda_ptr_key pk)
+{
+	struct mem_entry *item;
+
+	if (mem_alloc_list_head == NULL) {
+		rte_cuda_log(ERR, "Memory list doesn't exist");
+		return NULL;
+	}
+
+	if (mem_list_count_item() == 0) {
+		rte_cuda_log(ERR, "No items in memory list");
+		return NULL;
+	}
+
+	/* Walk from the head; item ends up NULL when nothing matches */
+	for (item = mem_alloc_list_head; item != NULL; item = item->next) {
+		if (item->pkey == pk)
+			break;
+	}
+
+	return item;
+}
+
+/*
+ * Unlink and free the memory list entry whose key equals pk.
+ *
+ * Both neighbours and both list ends are updated: removing the head clears
+ * the prev link of the new head, and removing the tail moves
+ * mem_alloc_list_tail back, so no list pointer refers to the freed entry.
+ *
+ * Returns 0 on success, -EINVAL when no entry has that key.
+ */
+static int
+mem_list_del_item(cuda_ptr_key pk)
+{
+	struct mem_entry *mem_alloc_list_cur = NULL;
+
+	mem_alloc_list_cur = mem_list_find_item(pk);
+	if (mem_alloc_list_cur == NULL)
+		return -EINVAL;
+
+	/* Detach from the previous entry, or advance the head */
+	if (mem_alloc_list_cur->prev == NULL)
+		mem_alloc_list_head = mem_alloc_list_cur->next;
+	else
+		mem_alloc_list_cur->prev->next = mem_alloc_list_cur->next;
+
+	/* Detach from the next entry, or move the tail back */
+	if (mem_alloc_list_cur->next == NULL)
+		mem_alloc_list_tail = mem_alloc_list_cur->prev;
+	else
+		mem_alloc_list_cur->next->prev = mem_alloc_list_cur->prev;
+
+	rte_free(mem_alloc_list_cur);
+
+	mem_alloc_list_last_elem--;
+
+	return 0;
+}
+
+/*
+ * Report device information and, on the first call for a child device,
+ * build its private data.
+ *
+ * A child device (info.parent != RTE_GPU_ID_NONE) whose dev_private is still
+ * NULL is initialized here: its CUDA context is made current, the SM count
+ * is queried, total memory is copied from the parent, the private structure
+ * is allocated and filled, and the caller's context is made current again.
+ * The private structure is published in dev_private only once it is
+ * complete; on failure it is freed and the caller's context is restored.
+ *
+ * Returns 0 on success, -ENODEV when dev is NULL or the parent cannot be
+ * queried, -EINVAL when info is NULL, -EPERM on a CUDA or allocation failure.
+ */
+static int
+cuda_dev_info_get(struct rte_gpu *dev, struct rte_gpu_info *info)
+{
+	int ret = 0;
+	CUresult res;
+	struct rte_gpu_info parent_info;
+	CUexecAffinityParam affinityPrm;
+	const char *err_string;
+	struct cuda_info *private = NULL;
+	CUcontext current_ctx;
+	CUcontext input_ctx;
+
+	if (dev == NULL)
+		return -ENODEV;
+
+	if (info == NULL)
+		return -EINVAL;
+
+	/* Child initialization time probably called by rte_gpu_add_child() */
+	if (dev->mpshared->info.parent != RTE_GPU_ID_NONE && dev->mpshared->dev_private == NULL) {
+		/* Store current ctx */
+		res = pfn_cuCtxGetCurrent(&current_ctx);
+		if (res != 0) {
+			pfn_cuGetErrorString(res, &(err_string));
+			rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s", err_string);
+			return -EPERM;
+		}
+
+		/* Set child ctx as current ctx */
+		input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
+		res = pfn_cuCtxSetCurrent(input_ctx);
+		if (res != 0) {
+			pfn_cuGetErrorString(res, &(err_string));
+			rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
+					err_string);
+			return -EPERM;
+		}
+
+		/*
+		 * Ctx capacity info
+		 */
+
+		/*
+		 * MPS compatible. A failure is logged but not fatal; the SM
+		 * count is only stored when the query succeeded, so an
+		 * uninitialized value is never read.
+		 */
+		res = pfn_cuCtxGetExecAffinity(&affinityPrm, CU_EXEC_AFFINITY_TYPE_SM_COUNT);
+		if (res != 0) {
+			pfn_cuGetErrorString(res, &(err_string));
+			rte_cuda_log(ERR, "cuCtxGetExecAffinity failed with %s", err_string);
+		} else {
+			dev->mpshared->info.processor_count =
+					(uint32_t)affinityPrm.param.smCount.val;
+		}
+
+		ret = rte_gpu_info_get(dev->mpshared->info.parent, &parent_info);
+		if (ret) {
+			ret = -ENODEV;
+			goto error;
+		}
+		dev->mpshared->info.total_memory = parent_info.total_memory;
+
+		/*
+		 * GPU Device private info
+		 */
+		private = rte_zmalloc(NULL,
+					sizeof(struct cuda_info),
+					RTE_CACHE_LINE_SIZE);
+		if (private == NULL) {
+			rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
+			ret = -EPERM;
+			goto error;
+		}
+
+		res = pfn_cuCtxGetDevice(&(private->cu_dev));
+		if (res != 0) {
+			pfn_cuGetErrorString(res, &(err_string));
+			rte_cuda_log(ERR, "cuCtxGetDevice failed with %s", err_string);
+			ret = -EPERM;
+			goto error;
+		}
+
+		res = pfn_cuDeviceGetName(private->gpu_name, RTE_DEV_NAME_MAX_LEN, private->cu_dev);
+		if (res != 0) {
+			pfn_cuGetErrorString(res, &(err_string));
+			rte_cuda_log(ERR, "cuDeviceGetName failed with %s", err_string);
+			ret = -EPERM;
+			goto error;
+		}
+
+		/* Publish the private data only now that it is fully filled */
+		dev->mpshared->dev_private = private;
+
+		/* Restore original ctx as current ctx */
+		res = pfn_cuCtxSetCurrent(current_ctx);
+		if (res != 0) {
+			pfn_cuGetErrorString(res, &(err_string));
+			rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
+					err_string);
+			return -EPERM;
+		}
+	}
+
+	*info = dev->mpshared->info;
+
+	return 0;
+
+error:
+	/* rte_free() accepts NULL, so this also covers the early failures */
+	rte_free(private);
+
+	/* Do not leave the child ctx current on the caller's thread */
+	res = pfn_cuCtxSetCurrent(current_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
+				err_string);
+	}
+
+	return ret;
+}
+
+/*
+ * GPU Memory
+ */
+
+/*
+ * Allocate size bytes of GPU memory in the CUDA context of dev and track
+ * the allocation in the driver memory list.
+ *
+ * The device context is made current for the duration of the call and the
+ * caller's context is made current again before returning, on failure too.
+ * The list entry is created only after the CUDA calls succeeded, so a
+ * failure leaves neither a stale entry nor allocated GPU memory behind.
+ *
+ * On success *ptr receives the device address.
+ * Returns 0 on success, -ENODEV when dev is NULL, -EINVAL when size is 0 or
+ * ptr is NULL, -ENOMEM when the list entry cannot be allocated, -EPERM on a
+ * CUDA failure.
+ */
+static int
+cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
+{
+	CUresult res;
+	const char *err_string;
+	CUcontext current_ctx;
+	CUcontext input_ctx;
+	CUdeviceptr ptr_d = 0;
+	struct mem_entry *entry;
+	unsigned int flag = 1;
+	int ret;
+
+	if (dev == NULL)
+		return -ENODEV;
+	if (size == 0 || ptr == NULL)
+		return -EINVAL;
+
+	/* Store current ctx */
+	res = pfn_cuCtxGetCurrent(&current_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
+				err_string);
+		return -EPERM;
+	}
+
+	/* Set child ctx as current ctx */
+	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
+	res = pfn_cuCtxSetCurrent(input_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
+				err_string);
+		return -EPERM;
+	}
+
+	/* Allocate memory */
+	res = pfn_cuMemAlloc(&ptr_d, size);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuMemAlloc failed with %s",
+				err_string);
+		ret = -EPERM;
+		goto restore_ctx;
+	}
+
+	/* GPUDirect RDMA attribute required */
+	res = pfn_cuPointerSetAttribute(&flag,
+					CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+					ptr_d);
+	if (res != 0) {
+		rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for "
+				"GPU memory at %p, err %d",
+				(void *)(uintptr_t)ptr_d, res);
+		ret = -EPERM;
+		goto free_mem;
+	}
+
+	/* Track the allocation; mem_list_add_item() updates the list tail */
+	entry = mem_list_add_item();
+	if (entry == NULL) {
+		ret = -ENOMEM;
+		goto free_mem;
+	}
+
+	entry->ptr_d = ptr_d;
+	entry->ptr_h = NULL;
+	entry->size = size;
+	entry->dev = dev;
+	entry->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
+	entry->pkey = get_hash_from_ptr((void *)(uintptr_t)ptr_d);
+	entry->mtype = GPU_MEM;
+
+	/* Restore original ctx as current ctx */
+	res = pfn_cuCtxSetCurrent(current_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
+				err_string);
+		return -EPERM;
+	}
+
+	*ptr = (void *)(uintptr_t)ptr_d;
+
+	return 0;
+
+free_mem:
+	res = pfn_cuMemFree(ptr_d);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuMemFree current failed with %s",
+				err_string);
+	}
+restore_ctx:
+	/* Do not leave the device ctx current on the caller's thread */
+	res = pfn_cuCtxSetCurrent(current_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
+				err_string);
+	}
+
+	return ret;
+}
+
+/*
+ * Register size bytes of host memory at ptr with CUDA in the context of dev
+ * and track the registration in the driver memory list.
+ *
+ * The device context is made current for the duration of the call and the
+ * caller's context is made current again before returning, on failure too.
+ * If a step fails after cuMemHostRegister() succeeded, the registration is
+ * undone, and the list entry is created only once every CUDA call succeeded.
+ *
+ * Returns 0 on success, -ENODEV when dev is NULL, -EINVAL when size is 0 or
+ * ptr is NULL, -ENOMEM when the list entry cannot be allocated, -ENOTSUP
+ * when the device address differs from the host address, -EPERM on a CUDA
+ * failure.
+ */
+static int
+cuda_mem_register(struct rte_gpu *dev, size_t size, void *ptr)
+{
+	CUresult res;
+	const char *err_string;
+	CUcontext current_ctx;
+	CUcontext input_ctx;
+	CUdeviceptr ptr_d = 0;
+	struct mem_entry *entry;
+	unsigned int flag = 1;
+	int use_ptr_h = 0;
+	int ret;
+
+	if (dev == NULL)
+		return -ENODEV;
+
+	if (size == 0 || ptr == NULL)
+		return -EINVAL;
+
+	/* Store current ctx */
+	res = pfn_cuCtxGetCurrent(&current_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
+				err_string);
+		return -EPERM;
+	}
+
+	/* Set child ctx as current ctx */
+	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
+	res = pfn_cuCtxSetCurrent(input_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
+				err_string);
+		return -EPERM;
+	}
+
+	/* Register the host memory */
+	res = pfn_cuMemHostRegister(ptr, size,
+				CU_MEMHOSTREGISTER_PORTABLE | CU_MEMHOSTREGISTER_DEVICEMAP);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuMemHostRegister failed with %s ptr %p size %zu",
+				err_string, ptr, size);
+		ret = -EPERM;
+		goto restore_ctx;
+	}
+
+	res = pfn_cuDeviceGetAttribute(&(use_ptr_h),
+			CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM,
+			((struct cuda_info *)(dev->mpshared->dev_private))->cu_dev);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
+				err_string);
+		ret = -EPERM;
+		goto unregister;
+	}
+
+	if (use_ptr_h == 0) {
+		res = pfn_cuMemHostGetDevicePointer(&ptr_d, ptr, 0);
+		if (res != 0) {
+			pfn_cuGetErrorString(res, &(err_string));
+			rte_cuda_log(ERR, "cuMemHostGetDevicePointer failed with %s",
+					err_string);
+			ret = -EPERM;
+			goto unregister;
+		}
+
+		/* Only identical host and device addresses are supported */
+		if ((uintptr_t)ptr_d != (uintptr_t)ptr) {
+			rte_cuda_log(ERR, "Host input pointer is different wrt GPU registered pointer");
+			ret = -ENOTSUP;
+			goto unregister;
+		}
+	} else {
+		ptr_d = (CUdeviceptr)(uintptr_t)ptr;
+	}
+
+	/* GPUDirect RDMA attribute required */
+	res = pfn_cuPointerSetAttribute(&flag,
+					CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+					ptr_d);
+	if (res != 0) {
+		rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for GPU memory at %p, err %d",
+				(void *)(uintptr_t)ptr_d, res);
+		ret = -EPERM;
+		goto unregister;
+	}
+
+	/* Track the registration; mem_list_add_item() updates the list tail */
+	entry = mem_list_add_item();
+	if (entry == NULL) {
+		ret = -ENOMEM;
+		goto unregister;
+	}
+
+	entry->ptr_d = ptr_d;
+	entry->ptr_h = ptr;
+	entry->size = size;
+	entry->dev = dev;
+	entry->ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
+	entry->pkey = get_hash_from_ptr(ptr);
+	entry->mtype = CPU_REGISTERED;
+
+	/* Restore original ctx as current ctx */
+	res = pfn_cuCtxSetCurrent(current_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
+				err_string);
+		return -EPERM;
+	}
+
+	return 0;
+
+unregister:
+	res = pfn_cuMemHostUnregister(ptr);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuMemHostUnregister current failed with %s",
+				err_string);
+	}
+restore_ctx:
+	/* Do not leave the device ctx current on the caller's thread */
+	res = pfn_cuCtxSetCurrent(current_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
+				err_string);
+	}
+
+	return ret;
+}
+
+/*
+ * Free GPU memory previously returned by cuda_mem_alloc() and drop its
+ * entry from the driver memory list.
+ * Returns 0 on success, -ENODEV when dev is NULL, -EINVAL when ptr is NULL,
+ * -EPERM when the address is unknown, is not of type GPU_MEM or cuMemFree()
+ * fails; otherwise the result of mem_list_del_item().
+ */
+static int
+cuda_mem_free(struct rte_gpu *dev, void *ptr)
+{
+	struct mem_entry *entry;
+	const char *cuda_err;
+	cuda_ptr_key key;
+	CUresult rc;
+
+	if (dev == NULL)
+		return -ENODEV;
+
+	if (ptr == NULL)
+		return -EINVAL;
+
+	key = get_hash_from_ptr(ptr);
+	entry = mem_list_find_item(key);
+	if (entry == NULL) {
+		rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
+		return -EPERM;
+	}
+
+	/* Only device allocations can be released through this call */
+	if (entry->mtype != GPU_MEM) {
+		rte_cuda_log(ERR, "Memory type %d not supported", entry->mtype);
+		return -EPERM;
+	}
+
+	rc = pfn_cuMemFree(entry->ptr_d);
+	if (rc != 0) {
+		pfn_cuGetErrorString(rc, &cuda_err);
+		rte_cuda_log(ERR, "cuMemFree current failed with %s",
+				cuda_err);
+		return -EPERM;
+	}
+
+	return mem_list_del_item(key);
+}
+
+/*
+ * Unregister host memory previously registered by cuda_mem_register() and
+ * drop its entry from the driver memory list.
+ * Returns 0 on success, -ENODEV when dev is NULL, -EINVAL when ptr is NULL,
+ * -EPERM when the address is unknown, is not of type CPU_REGISTERED or
+ * cuMemHostUnregister() fails; otherwise the result of mem_list_del_item().
+ */
+static int
+cuda_mem_unregister(struct rte_gpu *dev, void *ptr)
+{
+	struct mem_entry *entry;
+	const char *cuda_err;
+	cuda_ptr_key key;
+	CUresult rc;
+
+	if (dev == NULL)
+		return -ENODEV;
+
+	if (ptr == NULL)
+		return -EINVAL;
+
+	key = get_hash_from_ptr(ptr);
+	entry = mem_list_find_item(key);
+	if (entry == NULL) {
+		rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
+		return -EPERM;
+	}
+
+	/* Only registered host memory can be released through this call */
+	if (entry->mtype != CPU_REGISTERED) {
+		rte_cuda_log(ERR, "Memory type %d not supported", entry->mtype);
+		return -EPERM;
+	}
+
+	rc = pfn_cuMemHostUnregister(ptr);
+	if (rc != 0) {
+		pfn_cuGetErrorString(rc, &cuda_err);
+		rte_cuda_log(ERR, "cuMemHostUnregister current failed with %s",
+				cuda_err);
+		return -EPERM;
+	}
+
+	return mem_list_del_item(key);
+}
+
+/*
+ * Release the driver private data attached to dev.
+ * Returns 0 on success, -EINVAL when dev is NULL.
+ */
+static int
+cuda_dev_close(struct rte_gpu *dev)
+{
+	void *priv;
+
+	if (!dev)
+		return -EINVAL;
+
+	priv = dev->mpshared->dev_private;
+	rte_free(priv);
+
+	return 0;
+}
+
+/*
+ * Enforce ordering of GPUDirect RDMA writes for dev.
+ *
+ * Nothing is done when the device reports a write ordering other than
+ * CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE. Otherwise the writes are flushed
+ * with cuFlushGPUDirectRDMAWrites() in the device context, provided the
+ * device supports the host flush option. The caller's context is made
+ * current again even when the flush fails.
+ *
+ * Returns 0 on success, -ENODEV when dev is NULL, -ENOTSUP when the host
+ * flush option is not available, -EPERM on a CUDA failure.
+ */
+static int
+cuda_wmb(struct rte_gpu *dev)
+{
+	CUresult res;
+	const char *err_string;
+	CUcontext current_ctx;
+	CUcontext input_ctx;
+	struct cuda_info *private;
+	int ret = 0;
+
+	if (dev == NULL)
+		return -ENODEV;
+
+	private = (struct cuda_info *)dev->mpshared->dev_private;
+
+	if (private->gdr_write_ordering != CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
+		/*
+		 * No need to explicitly force the write ordering because
+		 * the device natively supports it
+		 */
+		return 0;
+	}
+
+	if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) {
+		/*
+		 * Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function.
+		 * Application needs to use alternative methods.
+		 */
+		rte_cuda_log(WARNING, "Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function. "
+				"Application needs to use alternative methods.");
+		return -ENOTSUP;
+	}
+
+	/* Store current ctx */
+	res = pfn_cuCtxGetCurrent(&current_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
+				err_string);
+		return -EPERM;
+	}
+
+	/* Set child ctx as current ctx */
+	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
+	res = pfn_cuCtxSetCurrent(input_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
+				err_string);
+		return -EPERM;
+	}
+
+	res = pfn_cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
+					CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuFlushGPUDirectRDMAWrites current failed with %s",
+				err_string);
+		/* Remember the failure but still restore the caller's ctx */
+		ret = -EPERM;
+	}
+
+	/* Restore original ctx as current ctx */
+	res = pfn_cuCtxSetCurrent(current_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
+				err_string);
+		return -EPERM;
+	}
+
+	return ret;
+}
+
+/*
+ * PCI probe callback: register one NVIDIA GPU with the gpudev library.
+ *
+ * On the first device (dev_id 0) the CUDA driver library and its symbols
+ * are loaded and the driver version is validated. For every device the
+ * primary CUDA context is retained, generic and private device info are
+ * collected and the gpudev operations are installed.
+ *
+ * Returns 0 on success or a negative errno value. On failure everything
+ * acquired so far (private data, primary context, gpudev device slot) is
+ * released, so a failed probe does not keep a device ID busy.
+ */
+static int
+cuda_gpu_probe(__rte_unused struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
+{
+	struct rte_gpu *dev = NULL;
+	CUresult res;
+	CUdevice cu_dev_id;
+	CUcontext pctx;
+	const char *err_string;
+	int processor_count = 0;
+	struct cuda_info *private;
+	int ret;
+
+	if (pci_dev == NULL) {
+		rte_cuda_log(ERR, "NULL PCI device");
+		return -EINVAL;
+	}
+
+	/* Allocate memory to be used privately by drivers */
+	dev = rte_gpu_allocate(pci_dev->device.name);
+	if (dev == NULL)
+		return -ENODEV;
+
+	/* Initialize values only for the first CUDA driver call */
+	if (dev->mpshared->info.dev_id == 0) {
+		mem_alloc_list_head = NULL;
+		mem_alloc_list_tail = NULL;
+		mem_alloc_list_last_elem = 0;
+
+		/* Load libcuda.so library */
+		if (cuda_loader()) {
+			rte_cuda_log(ERR, "CUDA Driver library not found");
+			ret = -ENOTSUP;
+			goto err_release_dev;
+		}
+
+		/* Load initial CUDA functions */
+		if (cuda_sym_func_loader()) {
+			rte_cuda_log(ERR, "CUDA functions not found in library");
+			ret = -ENOTSUP;
+			goto err_release_dev;
+		}
+
+		/*
+		 * Required to initialize the CUDA Driver.
+		 * Multiple calls of cuInit() will return immediately
+		 * without making any relevant change
+		 */
+		res = sym_cuInit(0);
+		if (res != 0) {
+			rte_cuda_log(ERR, "cuInit failed with %d", res);
+			ret = -ENOTSUP;
+			goto err_release_dev;
+		}
+
+		res = sym_cuDriverGetVersion(&cuda_driver_version);
+		if (res != 0) {
+			rte_cuda_log(ERR, "cuDriverGetVersion failed with %d", res);
+			ret = -ENOTSUP;
+			goto err_release_dev;
+		}
+
+		if (cuda_driver_version < CUDA_DRIVER_MIN_VERSION) {
+			rte_cuda_log(ERR, "CUDA Driver version found is %d. "
+					"Minimum requirement is %d",
+					cuda_driver_version, CUDA_DRIVER_MIN_VERSION);
+			ret = -ENOTSUP;
+			goto err_release_dev;
+		}
+
+		if (cuda_pfn_func_loader()) {
+			rte_cuda_log(ERR, "CUDA PFN functions not found in library");
+			ret = -ENOTSUP;
+			goto err_release_dev;
+		}
+	}
+
+	/* Fill HW specific part of device structure */
+	dev->device = &pci_dev->device;
+	dev->mpshared->info.numa_node = pci_dev->device.numa_node;
+
+	/* Get NVIDIA GPU Device descriptor */
+	res = pfn_cuDeviceGetByPCIBusId(&cu_dev_id, dev->device->name);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuDeviceGetByPCIBusId name %s failed with %d: %s",
+				dev->device->name, res, err_string);
+		ret = -EPERM;
+		goto err_release_dev;
+	}
+
+	res = pfn_cuDevicePrimaryCtxRetain(&pctx, cu_dev_id);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuDevicePrimaryCtxRetain name %s failed with %d: %s",
+				dev->device->name, res, err_string);
+		ret = -EPERM;
+		goto err_release_dev;
+	}
+
+	/* From here on, failures must also drop the primary context reference */
+	res = pfn_cuCtxGetApiVersion(pctx, &cuda_api_version);
+	if (res != 0) {
+		rte_cuda_log(ERR, "cuCtxGetApiVersion failed with %d", res);
+		ret = -ENOTSUP;
+		goto err_release_ctx;
+	}
+
+	if (cuda_api_version < CUDA_API_MIN_VERSION) {
+		rte_cuda_log(ERR, "CUDA API version found is %d Minimum requirement is %d",
+				cuda_api_version, CUDA_API_MIN_VERSION);
+		ret = -ENOTSUP;
+		goto err_release_ctx;
+	}
+
+	/* Go through uintptr_t, mirroring how the context is read back */
+	dev->mpshared->info.context = (uint64_t)(uintptr_t)pctx;
+
+	/*
+	 * GPU Device generic info
+	 */
+
+	/* Processor count */
+	res = pfn_cuDeviceGetAttribute(&(processor_count),
+			CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
+			cu_dev_id);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
+				err_string);
+		ret = -EPERM;
+		goto err_release_ctx;
+	}
+	dev->mpshared->info.processor_count = (uint32_t)processor_count;
+
+	/* Total memory */
+	res = pfn_cuDeviceTotalMem(&dev->mpshared->info.total_memory, cu_dev_id);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuDeviceTotalMem failed with %s",
+				err_string);
+		ret = -EPERM;
+		goto err_release_ctx;
+	}
+
+	/*
+	 * GPU Device private info
+	 */
+	dev->mpshared->dev_private = rte_zmalloc(NULL,
+			sizeof(struct cuda_info),
+			RTE_CACHE_LINE_SIZE);
+	if (dev->mpshared->dev_private == NULL) {
+		rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
+		ret = -ENOMEM;
+		goto err_release_ctx;
+	}
+
+	private = (struct cuda_info *)dev->mpshared->dev_private;
+	private->cu_dev = cu_dev_id;
+	res = pfn_cuDeviceGetName(private->gpu_name,
+			RTE_DEV_NAME_MAX_LEN,
+			cu_dev_id);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
+				err_string);
+		ret = -EPERM;
+		goto err_free_private;
+	}
+
+	res = pfn_cuDeviceGetAttribute(&(private->gdr_supported),
+			CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED,
+			cu_dev_id);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
+				err_string);
+		ret = -EPERM;
+		goto err_free_private;
+	}
+
+	/* Missing GPUDirect RDMA is only reported, the device is still usable */
+	if (private->gdr_supported == 0)
+		rte_cuda_log(WARNING, "GPU %s doesn't support GPUDirect RDMA",
+				pci_dev->device.name);
+
+	res = pfn_cuDeviceGetAttribute(&(private->gdr_write_ordering),
+			CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING,
+			cu_dev_id);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR,
+				"cuDeviceGetAttribute failed with %s",
+				err_string);
+		ret = -EPERM;
+		goto err_free_private;
+	}
+
+	/* The flush option is only relevant when the device gives no ordering */
+	if (private->gdr_write_ordering == CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
+		res = pfn_cuDeviceGetAttribute(&(private->gdr_flush_type),
+					CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS,
+					cu_dev_id);
+		if (res != 0) {
+			pfn_cuGetErrorString(res, &(err_string));
+			rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
+					err_string);
+			ret = -EPERM;
+			goto err_free_private;
+		}
+
+		/* Reported but not fatal: the probe still succeeds */
+		if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) {
+			rte_cuda_log(ERR, "GPUDirect RDMA flush writes API is not supported");
+		}
+	}
+
+	dev->ops.dev_info_get = cuda_dev_info_get;
+	dev->ops.dev_close = cuda_dev_close;
+	dev->ops.mem_alloc = cuda_mem_alloc;
+	dev->ops.mem_free = cuda_mem_free;
+	dev->ops.mem_register = cuda_mem_register;
+	dev->ops.mem_unregister = cuda_mem_unregister;
+	dev->ops.wmb = cuda_wmb;
+
+	rte_gpu_complete_new(dev);
+
+	rte_cuda_debug("dev id = %u name = %s",
+			dev->mpshared->info.dev_id, private->gpu_name);
+
+	return 0;
+
+err_free_private:
+	rte_free(dev->mpshared->dev_private);
+	dev->mpshared->dev_private = NULL;
+err_release_ctx:
+	pfn_cuDevicePrimaryCtxRelease(cu_dev_id);
+err_release_dev:
+	/* Give the device slot back so its ID can be reused by a later probe */
+	rte_gpu_release(dev);
+	return ret;
+}
+
+/*
+ * PCI remove callback: release the gpudev device bound to pci_dev.
+ *
+ * Returns 0 on success, -EINVAL for a NULL PCI device, -ENODEV when no
+ * gpudev device carries that name, or the error reported by
+ * rte_gpu_release() when the device could not be released.
+ */
+static int
+cuda_gpu_remove(struct rte_pci_device *pci_dev)
+{
+	struct rte_gpu *dev;
+	int ret;
+	uint8_t gpu_id;
+
+	if (pci_dev == NULL)
+		return -EINVAL;
+
+	dev = rte_gpu_get_by_name(pci_dev->device.name);
+	if (dev == NULL) {
+		rte_cuda_log(ERR, "Couldn't find HW dev \"%s\" to uninitialise it",
+				pci_dev->device.name);
+		return -ENODEV;
+	}
+	/* Remember the ID now: dev must not be dereferenced after release */
+	gpu_id = dev->mpshared->info.dev_id;
+
+	/* release dev from library */
+	ret = rte_gpu_release(dev);
+	if (ret) {
+		rte_cuda_log(ERR, "Device %i failed to uninit: %i", gpu_id, ret);
+		/* Do not claim success for a device that is still registered */
+		return ret;
+	}
+
+	rte_cuda_debug("Destroyed dev = %u", gpu_id);
+
+	return 0;
+}
+
+/* PCI driver descriptor: ties the NVIDIA ID table to the probe/remove callbacks */
+static struct rte_pci_driver rte_cuda_driver = {
+	.probe = cuda_gpu_probe,
+	.remove = cuda_gpu_remove,
+	.id_table = pci_id_cuda_map,
+	.drv_flags = RTE_PCI_DRV_WC_ACTIVATE,
+};
+
+/* Register the driver, its PCI ID table and its kernel module dependencies */
+RTE_PMD_REGISTER_PCI(gpu_cuda, rte_cuda_driver);
+RTE_PMD_REGISTER_PCI_TABLE(gpu_cuda, pci_id_cuda_map);
+RTE_PMD_REGISTER_KMOD_DEP(gpu_cuda, "* nvidia & (nv_peer_mem | nvpeer_mem)");
diff --git a/drivers/gpu/cuda/meson.build b/drivers/gpu/cuda/meson.build
new file mode 100644
index 0000000000..4bb3e14cd2
--- /dev/null
+++ b/drivers/gpu/cuda/meson.build
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright (c) 2021 NVIDIA Corporation & Affiliates
+
+# The driver is disabled on anything but Linux.
+if not is_linux
+        build = false
+        reason = 'only supported on Linux'
+endif
+
+# Only the CUDA headers are looked up here; no CUDA library is searched for.
+cuda_h_found = cc.has_header('cuda.h')
+cuda_typeh_found = cc.has_header('cudaTypedefs.h')
+
+# Both headers must be present, otherwise the driver is disabled.
+if not cuda_h_found or not cuda_typeh_found
+        build = false
+        reason = 'missing dependency cuda headers cuda.h and cudaTypedefs.h'
+endif
+
+# DPDK components the driver depends on, and its single source file.
+deps += ['gpudev','pci','bus_pci']
+sources = files('cuda.c')
diff --git a/drivers/gpu/cuda/version.map b/drivers/gpu/cuda/version.map
new file mode 100644
index 0000000000..4a76d1d52d
--- /dev/null
+++ b/drivers/gpu/cuda/version.map
@@ -0,0 +1,3 @@
+DPDK_21 {
+	local: *;
+};
diff --git a/drivers/gpu/meson.build b/drivers/gpu/meson.build
index e51ad3381b..601bedcd61 100644
--- a/drivers/gpu/meson.build
+++ b/drivers/gpu/meson.build
@@ -1,4 +1,4 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright (c) 2021 NVIDIA Corporation & Affiliates
 
-drivers = []
+drivers = [ 'cuda' ]
-- 
2.17.1
^ permalink raw reply related	[flat|nested] 28+ messages in thread
* [PATCH v7 0/1] gpu/cuda: introduce CUDA driver
  2021-10-05 22:49 [dpdk-dev] [RFC PATCH] gpu/cuda: introduce CUDA driver eagostini
                   ` (5 preceding siblings ...)
  2021-11-16 20:47 ` [PATCH v6 0/1] " eagostini
@ 2021-11-16 22:50 ` eagostini
  2021-11-16 22:50   ` [PATCH v7 1/1] " eagostini
  6 siblings, 1 reply; 28+ messages in thread
From: eagostini @ 2021-11-16 22:50 UTC (permalink / raw)
  To: dev; +Cc: Elena Agostini
From: Elena Agostini <eagostini@nvidia.com>
This is the CUDA implementation of the gpudev library.
Functionalities implemented through CUDA Driver API are:
- Device probe and remove
- Manage device memory allocations
- Register/unregister external CPU memory in the device memory area
Changelog:
- CUDA driver implementation of the GPU write memory barrier
- Fixed styling reported by checkpatch
- CUDA driver shared library libcuda.so is not required at build time
- CUDA driver shared library libcuda.so is loaded at runtime
- CUDA driver headers are required at build time
- Documentation updated
- NVIDIA GPU T4 support added
Elena Agostini (1):
  gpu/cuda: introduce CUDA driver
 doc/guides/gpus/cuda.rst               |  152 ++++
 doc/guides/gpus/index.rst              |    1 +
 doc/guides/rel_notes/release_21_11.rst |    2 +
 drivers/gpu/cuda/cuda.c                | 1150 ++++++++++++++++++++++++
 drivers/gpu/cuda/meson.build           |   18 +
 drivers/gpu/cuda/version.map           |    3 +
 drivers/gpu/meson.build                |    2 +-
 7 files changed, 1327 insertions(+), 1 deletion(-)
 create mode 100644 doc/guides/gpus/cuda.rst
 create mode 100644 drivers/gpu/cuda/cuda.c
 create mode 100644 drivers/gpu/cuda/meson.build
 create mode 100644 drivers/gpu/cuda/version.map
-- 
2.17.1
^ permalink raw reply	[flat|nested] 28+ messages in thread
* [PATCH v7 1/1] gpu/cuda: introduce CUDA driver
  2021-11-16 22:50 ` [PATCH v7 0/1] " eagostini
@ 2021-11-16 22:50   ` eagostini
  2021-11-16 15:58     ` Stephen Hemminger
  2021-11-16 16:30     ` Thomas Monjalon
  0 siblings, 2 replies; 28+ messages in thread
From: eagostini @ 2021-11-16 22:50 UTC (permalink / raw)
  To: dev; +Cc: Elena Agostini
From: Elena Agostini <eagostini@nvidia.com>
This is the CUDA implementation of the gpudev library.
Functionalities implemented through CUDA Driver API are:
- Device probe and remove
- Manage device memory allocations
- Register/unregister external CPU memory in the device memory area
Signed-off-by: Elena Agostini <eagostini@nvidia.com>
---
 doc/guides/gpus/cuda.rst               |  152 ++++
 doc/guides/gpus/index.rst              |    1 +
 doc/guides/rel_notes/release_21_11.rst |    2 +
 drivers/gpu/cuda/cuda.c                | 1150 ++++++++++++++++++++++++
 drivers/gpu/cuda/meson.build           |   18 +
 drivers/gpu/cuda/version.map           |    3 +
 drivers/gpu/meson.build                |    2 +-
 7 files changed, 1327 insertions(+), 1 deletion(-)
 create mode 100644 doc/guides/gpus/cuda.rst
 create mode 100644 drivers/gpu/cuda/cuda.c
 create mode 100644 drivers/gpu/cuda/meson.build
 create mode 100644 drivers/gpu/cuda/version.map
diff --git a/doc/guides/gpus/cuda.rst b/doc/guides/gpus/cuda.rst
new file mode 100644
index 0000000000..d007c9ff2c
--- /dev/null
+++ b/doc/guides/gpus/cuda.rst
@@ -0,0 +1,152 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+   Copyright (c) 2021 NVIDIA Corporation & Affiliates
+
+CUDA GPU driver
+===============
+
+The CUDA GPU driver library (**librte_gpu_cuda**) provides support for NVIDIA GPUs.
+Information and documentation about these devices can be found on the
+`NVIDIA website <http://www.nvidia.com>`_. Help is also provided by the
+`NVIDIA CUDA Toolkit developer zone <https://docs.nvidia.com/cuda>`_.
+
+Build dependencies
+------------------
+
+The CUDA GPU driver library has a header-only dependency on ``cuda.h`` and ``cudaTypedefs.h``.
+To get these headers there are two options:
+
+- Install `CUDA Toolkit <https://developer.nvidia.com/cuda-toolkit>`_ 
+  (either regular or stubs installation).
+- Download these two headers from this `CUDA headers
+  <https://gitlab.com/nvidia/headers/cuda-individual/cudart>`_ public repo.
+
+You need to tell meson where the CUDA header files are through the CFLAGS variable.
+There are three ways to do it:
+
+- Set ``export CFLAGS=-I/usr/local/cuda/include`` before building
+- Add CFLAGS in the meson command line ``CFLAGS=-I/usr/local/cuda/include meson build``
+- Add the ``-Dc_args`` in meson command line ``meson build -Dc_args=-I/usr/local/cuda/include``
+
+If headers are not found, the CUDA GPU driver library is not built.
+
+CUDA Shared Library
+-------------------
+
+To avoid any system configuration issue, the CUDA API **libcuda.so** shared library
+is not linked at building time because of a Meson bug that looks
+for `cudart` module even if the `meson.build` file only requires default `cuda` module.
+
+**libcuda.so** is loaded at runtime in the ``cuda_gpu_probe`` function through ``dlopen``
+when the very first GPU is detected.
+If CUDA installation resides in a custom directory,
+the environment variable ``CUDA_PATH_L`` should specify where ``dlopen``
+can look for **libcuda.so**.
+
+All CUDA API symbols are loaded at runtime as well.
+For this reason, there is no need to install the CUDA library
+in order to build the CUDA driver library.
+
+Design
+------
+
+**librte_gpu_cuda** relies on CUDA Driver API (no need for CUDA Runtime API).
+
+The goal of this driver library is not to provide a wrapper for the whole CUDA Driver API.
+Instead, the scope is to implement the generic features of gpudev API.
+For a CUDA application, integrating the gpudev library functions
+using the CUDA driver library is quite straightforward
+and doesn't create any compatibility problem.
+
+Initialization
+~~~~~~~~~~~~~~
+
+During initialization, CUDA driver library detects NVIDIA physical GPUs
+on the system or specified via EAL device options (e.g. ``-a b6:00.0``).
+The driver initializes the CUDA driver environment through ``cuInit(0)`` function.
+For this reason, it's required to set any CUDA environment configuration before
+calling ``rte_eal_init`` function in the DPDK application.
+
+If the CUDA driver environment has been already initialized, the ``cuInit(0)``
+in CUDA driver library has no effect.
+
+CUDA Driver sub-contexts
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+After initialization, a CUDA application can create multiple sub-contexts
+on GPU physical devices.
+Through the gpudev library, it is possible to register these sub-contexts
+in the CUDA driver library as child devices having as parent a GPU physical device.
+
+CUDA driver library also supports `MPS
+<https://docs.nvidia.com/deploy/pdf/CUDA_Multi_Process_Service_Overview.pdf>`__.
+
+GPU memory management
+~~~~~~~~~~~~~~~~~~~~~
+
+The CUDA driver library maintains a table of the GPU memory addresses allocated
+and the CPU memory addresses registered, each associated with the input CUDA context.
+Whenever the application tries to deallocate or deregister a memory address
+that is not in the table, the CUDA driver library returns an error.
+
+Features
+--------
+
+- Register new child devices aka new CUDA Driver contexts.
+- Allocate memory on the GPU.
+- Register CPU memory to make it visible from GPU.
+
+Minimal requirements
+--------------------
+
+Minimal requirements to enable the CUDA driver library are:
+
+- NVIDIA GPU Ampere or Volta
+- CUDA 11.4 Driver API or newer
+
+`GPUDirect RDMA Technology <https://docs.nvidia.com/cuda/gpudirect-rdma/index.html>`_
+allows compatible network cards (e.g. Mellanox) to directly send and receive packets
+using GPU memory instead of additional memory copies through the CPU system memory.
+To enable this technology, system requirements are:
+
+- `nvidia-peermem <https://docs.nvidia.com/cuda/gpudirect-rdma/index.html#nvidia-peermem>`_
+  module running on the system;
+- Mellanox network card ConnectX-5 or newer (BlueField models included);
+- DPDK mlx5 PMD enabled;
+- To reach the best performance, an additional PCIe switch between GPU and NIC is recommended.
+
+Limitations
+-----------
+
+Supported only on Linux.
+
+Supported GPUs
+--------------
+
+The following NVIDIA GPU devices are supported by this CUDA driver library:
+
+- NVIDIA A100 80GB PCIe
+- NVIDIA A100 40GB PCIe
+- NVIDIA A30 24GB
+- NVIDIA A10 24GB
+- NVIDIA V100 32GB PCIe
+- NVIDIA V100 16GB PCIe
+- NVIDIA T4 16GB
+
+External references
+-------------------
+
+A good example of how to use the GPU CUDA driver library through the gpudev library
+is the l2fwd-nv application that can be found `here <https://github.com/NVIDIA/l2fwd-nv>`_.
+
+The application is based on vanilla DPDK example l2fwd
+and is enhanced with GPU memory managed through gpudev library
+and CUDA to launch the swap of packets MAC addresses workload on the GPU.
+
+l2fwd-nv is not intended to be used for performance measurements
+(testpmd is a better candidate for this).
+The goal is to show different use-cases about how a CUDA application can use DPDK to:
+
+- Allocate memory on GPU device using gpudev library.
+- Use that memory to create an external GPU memory mempool.
+- Receive packets directly in GPU memory.
+- Coordinate the workload on the GPU with the network and CPU activity to receive packets.
+- Send modified packets directly from the GPU memory.
diff --git a/doc/guides/gpus/index.rst b/doc/guides/gpus/index.rst
index 1878423239..4b7a420556 100644
--- a/doc/guides/gpus/index.rst
+++ b/doc/guides/gpus/index.rst
@@ -9,3 +9,4 @@ General-Purpose Graphics Processing Unit Drivers
    :numbered:
 
    overview
+   cuda
diff --git a/doc/guides/rel_notes/release_21_11.rst b/doc/guides/rel_notes/release_21_11.rst
index cd4dcd0077..d76bba2fe3 100644
--- a/doc/guides/rel_notes/release_21_11.rst
+++ b/doc/guides/rel_notes/release_21_11.rst
@@ -111,6 +111,8 @@ New Features
   * Memory management
   * Communication flag & list
 
+* **Added NVIDIA GPU driver implemented with CUDA library.**
+
 * **Added new RSS offload types for IPv4/L4 checksum in RSS flow.**
 
   Added macros ETH_RSS_IPV4_CHKSUM and ETH_RSS_L4_CHKSUM, now IPv4 and
diff --git a/drivers/gpu/cuda/cuda.c b/drivers/gpu/cuda/cuda.c
new file mode 100644
index 0000000000..12217c48c6
--- /dev/null
+++ b/drivers/gpu/cuda/cuda.c
@@ -0,0 +1,1150 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2021 NVIDIA Corporation & Affiliates
+ */
+
+#include <dlfcn.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_malloc.h>
+#include <rte_errno.h>
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_byteorder.h>
+#include <rte_dev.h>
+
+#include <gpudev_driver.h>
+#include <cuda.h>
+#include <cudaTypedefs.h>
+
+#define CUDA_DRIVER_MIN_VERSION 11040
+#define CUDA_API_MIN_VERSION 3020
+
+/*
+ * CUDA Driver functions loaded with dlsym().
+ * File-private: kept static so they do not leak into the global namespace.
+ */
+static CUresult CUDAAPI (*sym_cuInit)(unsigned int flags);
+static CUresult CUDAAPI (*sym_cuDriverGetVersion)(int *driverVersion);
+static CUresult CUDAAPI (*sym_cuGetProcAddress)(const char *symbol,
+		void **pfn, int cudaVersion, uint64_t flags);
+
+/*
+ * CUDA Driver functions loaded with cuGetProcAddress for versioning.
+ * They are filled by cuda_pfn_func_loader() and stay NULL until then.
+ */
+static PFN_cuGetErrorString pfn_cuGetErrorString;
+static PFN_cuGetErrorName pfn_cuGetErrorName;
+static PFN_cuPointerSetAttribute pfn_cuPointerSetAttribute;
+static PFN_cuDeviceGetAttribute pfn_cuDeviceGetAttribute;
+static PFN_cuDeviceGetByPCIBusId pfn_cuDeviceGetByPCIBusId;
+static PFN_cuDevicePrimaryCtxRetain pfn_cuDevicePrimaryCtxRetain;
+static PFN_cuDevicePrimaryCtxRelease pfn_cuDevicePrimaryCtxRelease;
+static PFN_cuDeviceTotalMem pfn_cuDeviceTotalMem;
+static PFN_cuDeviceGetName pfn_cuDeviceGetName;
+static PFN_cuCtxGetApiVersion pfn_cuCtxGetApiVersion;
+static PFN_cuCtxSetCurrent pfn_cuCtxSetCurrent;
+static PFN_cuCtxGetCurrent pfn_cuCtxGetCurrent;
+static PFN_cuCtxGetDevice pfn_cuCtxGetDevice;
+static PFN_cuCtxGetExecAffinity pfn_cuCtxGetExecAffinity;
+static PFN_cuMemAlloc pfn_cuMemAlloc;
+static PFN_cuMemFree pfn_cuMemFree;
+static PFN_cuMemHostRegister pfn_cuMemHostRegister;
+static PFN_cuMemHostUnregister pfn_cuMemHostUnregister;
+static PFN_cuMemHostGetDevicePointer pfn_cuMemHostGetDevicePointer;
+static PFN_cuFlushGPUDirectRDMAWrites pfn_cuFlushGPUDirectRDMAWrites;
+
+/* Handle returned by dlopen() for the CUDA driver library */
+static void *cudalib;
+/* API version of the primary context, filled by cuCtxGetApiVersion() */
+static unsigned int cuda_api_version;
+/* Driver version, filled by cuDriverGetVersion() */
+static int cuda_driver_version;
+
+/* NVIDIA GPU vendor (PCI vendor ID) */
+#define NVIDIA_GPU_VENDOR_ID (0x10de)
+
+/* NVIDIA GPU device IDs (PCI device IDs listed in pci_id_cuda_map) */
+#define NVIDIA_GPU_A100_40GB_DEVICE_ID (0x20f1)
+#define NVIDIA_GPU_A100_80GB_DEVICE_ID (0x20b5)
+
+#define NVIDIA_GPU_A30_24GB_DEVICE_ID (0x20b7)
+#define NVIDIA_GPU_A10_24GB_DEVICE_ID (0x2236)
+
+#define NVIDIA_GPU_V100_32GB_DEVICE_ID (0x1db6)
+#define NVIDIA_GPU_V100_16GB_DEVICE_ID (0x1db4)
+
+#define NVIDIA_GPU_T4_16GB_DEVICE_ID (0x1eb8)
+
+/* NOTE(review): presumably an upper bound on tracked allocations - confirm against its users */
+#define CUDA_MAX_ALLOCATION_NUM 512
+
+/* A shift of 16 gives a GPU page size of 64 KiB */
+#define GPU_PAGE_SHIFT 16
+#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
+
+/* Log type of this driver; the default level is NOTICE */
+static RTE_LOG_REGISTER_DEFAULT(cuda_logtype, NOTICE);
+
+/* Helper macro for logging: appends a newline to every message */
+#define rte_cuda_log(level, fmt, ...) \
+	rte_log(RTE_LOG_ ## level, cuda_logtype, fmt "\n", ##__VA_ARGS__)
+
+/* Debug logging helper: prefixes the message with the source line and function name */
+#define rte_cuda_debug(fmt, ...) \
+	rte_cuda_log(DEBUG, RTE_STR(__LINE__) ":%s() " fmt, __func__, \
+		##__VA_ARGS__)
+
+/*
+ * NVIDIA GPU address map: PCI vendor/device pairs handled by this driver.
+ * The table is never modified, hence const. The all-zero last entry marks
+ * the end of the table.
+ */
+static const struct rte_pci_id pci_id_cuda_map[] = {
+	{ RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID, NVIDIA_GPU_A100_40GB_DEVICE_ID) },
+	{ RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID, NVIDIA_GPU_A100_80GB_DEVICE_ID) },
+	{ RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID, NVIDIA_GPU_A30_24GB_DEVICE_ID) },
+	{ RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID, NVIDIA_GPU_A10_24GB_DEVICE_ID) },
+	{ RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID, NVIDIA_GPU_V100_32GB_DEVICE_ID) },
+	{ RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID, NVIDIA_GPU_V100_16GB_DEVICE_ID) },
+	{ RTE_PCI_DEVICE(NVIDIA_GPU_VENDOR_ID, NVIDIA_GPU_T4_16GB_DEVICE_ID) },
+	{ .device_id = 0 } /* sentinel */
+};
+
+/* Device private info, stored in dev->mpshared->dev_private */
+struct cuda_info {
+	/* Device name as returned by cuDeviceGetName() */
+	char gpu_name[RTE_DEV_NAME_MAX_LEN];
+	/* CUDA device handle */
+	CUdevice cu_dev;
+	/* Value of CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED (0: no GPUDirect RDMA) */
+	int gdr_supported;
+	/* Value of CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING */
+	int gdr_write_ordering;
+	/* Value of CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS */
+	int gdr_flush_type;
+};
+
+/*
+ * Type of memory allocated by CUDA driver.
+ * NOTE(review): GPU_MEM presumably tags device allocations and CPU_REGISTERED
+ * host memory registered with CUDA - confirm against the users of mem_entry.
+ */
+enum mem_type {
+	GPU_MEM = 0,
+	CPU_REGISTERED,
+	GPU_REGISTERED /* Not used yet */
+};
+
+/* key associated to a memory address (the address value, see get_hash_from_ptr()) */
+typedef uintptr_t cuda_ptr_key;
+
+/* Single entry of the memory list, a doubly linked list of tracked addresses */
+struct mem_entry {
+	CUdeviceptr ptr_d;
+	void *ptr_h;
+	size_t size;
+	struct rte_gpu *dev;
+	CUcontext ctx;
+	/* Lookup key compared by mem_list_find_item() */
+	cuda_ptr_key pkey;
+	enum mem_type mtype;
+	/* Neighbours in the list; NULL at the respective end */
+	struct mem_entry *prev;
+	struct mem_entry *next;
+};
+
+/* First and last entry of the memory list; head is NULL while the list is empty */
+static struct mem_entry *mem_alloc_list_head;
+static struct mem_entry *mem_alloc_list_tail;
+/* Number of entries currently in the list */
+static uint32_t mem_alloc_list_last_elem;
+
+/* Load the CUDA symbols */
+
+/*
+ * Open the CUDA driver library with dlopen() and store the handle in cudalib.
+ *
+ * When the CUDA_PATH_L environment variable is set it names the directory
+ * holding libcuda.so; otherwise the default dlopen() search path is used.
+ *
+ * Returns 0 on success, -1 when the path does not fit the buffer or the
+ * library cannot be opened.
+ */
+static int
+cuda_loader(void)
+{
+	char cuda_path[1024];
+	const char *env_path = getenv("CUDA_PATH_L");
+	int len;
+
+	if (env_path == NULL)
+		len = snprintf(cuda_path, sizeof(cuda_path), "%s", "libcuda.so");
+	else
+		/* Add the separator: CUDA_PATH_L need not end with a slash */
+		len = snprintf(cuda_path, sizeof(cuda_path), "%s/%s",
+				env_path, "libcuda.so");
+
+	/* Refuse a truncated path instead of opening some other file */
+	if (len < 0 || (size_t)len >= sizeof(cuda_path)) {
+		rte_cuda_log(ERR, "CUDA library path is too long (CUDA_PATH_L=%s)",
+				env_path != NULL ? env_path : "<unset>");
+		return -1;
+	}
+
+	cudalib = dlopen(cuda_path, RTLD_LAZY);
+	if (cudalib == NULL) {
+		/* Never hand a NULL pointer to %s */
+		rte_cuda_log(ERR, "Failed to find CUDA library in %s (CUDA_PATH_L=%s)",
+				cuda_path, env_path != NULL ? env_path : "<unset>");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Resolve with dlsym() the three entry points needed before anything else:
+ * cuInit, cuDriverGetVersion and cuGetProcAddress.
+ *
+ * Returns 0 when all of them are found, -1 when the library has not been
+ * opened or a symbol is missing.
+ */
+static int
+cuda_sym_func_loader(void)
+{
+	void *lib = cudalib;
+
+	if (!lib)
+		return -1;
+
+	sym_cuInit = dlsym(lib, "cuInit");
+	if (!sym_cuInit) {
+		rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuInit");
+		return -1;
+	}
+
+	sym_cuDriverGetVersion = dlsym(lib, "cuDriverGetVersion");
+	if (!sym_cuDriverGetVersion) {
+		rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuDriverGetVersion");
+		return -1;
+	}
+
+	sym_cuGetProcAddress = dlsym(lib, "cuGetProcAddress");
+	if (!sym_cuGetProcAddress) {
+		rte_cuda_log(ERR, "Failed to load CUDA missing symbol cuGetProcAddress");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Resolve every versioned CUDA Driver entry point through cuGetProcAddress,
+ * asking for the version reported in cuda_driver_version.
+ *
+ * Returns 0 when all symbols are resolved, -1 at the first failure or when
+ * cuGetProcAddress itself has not been loaded yet.
+ */
+static int
+cuda_pfn_func_loader(void)
+{
+	/* Symbol name and destination pointer, in resolution order */
+	static const struct {
+		const char *name;
+		void **pfn;
+	} symbols[] = {
+		{ "cuGetErrorString", (void **)&pfn_cuGetErrorString },
+		{ "cuGetErrorName", (void **)&pfn_cuGetErrorName },
+		{ "cuPointerSetAttribute", (void **)&pfn_cuPointerSetAttribute },
+		{ "cuDeviceGetAttribute", (void **)&pfn_cuDeviceGetAttribute },
+		{ "cuDeviceGetByPCIBusId", (void **)&pfn_cuDeviceGetByPCIBusId },
+		{ "cuDeviceGetName", (void **)&pfn_cuDeviceGetName },
+		{ "cuDevicePrimaryCtxRetain", (void **)&pfn_cuDevicePrimaryCtxRetain },
+		{ "cuDevicePrimaryCtxRelease", (void **)&pfn_cuDevicePrimaryCtxRelease },
+		{ "cuDeviceTotalMem", (void **)&pfn_cuDeviceTotalMem },
+		{ "cuCtxGetApiVersion", (void **)&pfn_cuCtxGetApiVersion },
+		{ "cuCtxGetDevice", (void **)&pfn_cuCtxGetDevice },
+		{ "cuCtxSetCurrent", (void **)&pfn_cuCtxSetCurrent },
+		{ "cuCtxGetCurrent", (void **)&pfn_cuCtxGetCurrent },
+		{ "cuCtxGetExecAffinity", (void **)&pfn_cuCtxGetExecAffinity },
+		{ "cuMemAlloc", (void **)&pfn_cuMemAlloc },
+		{ "cuMemFree", (void **)&pfn_cuMemFree },
+		{ "cuMemHostRegister", (void **)&pfn_cuMemHostRegister },
+		{ "cuMemHostUnregister", (void **)&pfn_cuMemHostUnregister },
+		{ "cuMemHostGetDevicePointer", (void **)&pfn_cuMemHostGetDevicePointer },
+		{ "cuFlushGPUDirectRDMAWrites", (void **)&pfn_cuFlushGPUDirectRDMAWrites },
+	};
+	CUresult res;
+	unsigned int i;
+
+	/* cuda_sym_func_loader() must have succeeded first */
+	if (sym_cuGetProcAddress == NULL)
+		return -1;
+
+	for (i = 0; i < RTE_DIM(symbols); i++) {
+		res = sym_cuGetProcAddress(symbols[i].name, symbols[i].pfn,
+				cuda_driver_version, 0);
+		if (res != 0) {
+			rte_cuda_log(ERR, "Retrieve pfn_%s failed with %d",
+					symbols[i].name, res);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/* Build the list key of a memory pointer: the address value itself */
+static cuda_ptr_key
+get_hash_from_ptr(void *ptr)
+{
+	cuda_ptr_key key = (uintptr_t) ptr;
+
+	return key;
+}
+
+/* Number of entries currently held in the memory list */
+static uint32_t
+mem_list_count_item(void)
+{
+	const uint32_t count = mem_alloc_list_last_elem;
+
+	return count;
+}
+
+/*
+ * Append a new zeroed entry at the end of the memory list, creating the
+ * list when it is still empty.
+ * Returns the new entry, or NULL when it cannot be allocated.
+ */
+static struct mem_entry *
+mem_list_add_item(void)
+{
+	struct mem_entry *entry;
+
+	entry = rte_zmalloc(NULL, sizeof(struct mem_entry), RTE_CACHE_LINE_SIZE);
+	if (entry == NULL) {
+		rte_cuda_log(ERR, "Failed to allocate memory for memory list");
+		return NULL;
+	}
+
+	entry->next = NULL;
+	entry->prev = NULL;
+
+	if (mem_alloc_list_head == NULL) {
+		/* First entry: it is both head and tail */
+		mem_alloc_list_head = entry;
+	} else {
+		/* Link the entry behind the current tail */
+		entry->prev = mem_alloc_list_tail;
+		mem_alloc_list_tail->next = entry;
+	}
+
+	mem_alloc_list_tail = entry;
+	mem_alloc_list_last_elem++;
+
+	return entry;
+}
+
+/*
+ * Look up the entry whose key equals pk.
+ * Returns the entry, or NULL when the list is missing, empty or holds no
+ * such key.
+ */
+static struct mem_entry *
+mem_list_find_item(cuda_ptr_key pk)
+{
+	struct mem_entry *entry;
+
+	if (mem_alloc_list_head == NULL) {
+		rte_cuda_log(ERR, "Memory list doesn't exist");
+		return NULL;
+	}
+
+	if (mem_list_count_item() == 0) {
+		rte_cuda_log(ERR, "No items in memory list");
+		return NULL;
+	}
+
+	/* Walk from the head; entry ends up NULL when nothing matches */
+	for (entry = mem_alloc_list_head; entry != NULL; entry = entry->next) {
+		if (entry->pkey == pk)
+			break;
+	}
+
+	return entry;
+}
+
+/*
+ * Unlink and free the entry whose key equals pk.
+ *
+ * Both neighbours and both list ends are updated, so that neither the head,
+ * the tail nor any remaining entry keeps a pointer to the freed entry.
+ *
+ * Returns 0 on success, -EINVAL when no entry carries that key.
+ */
+static int
+mem_list_del_item(cuda_ptr_key pk)
+{
+	struct mem_entry *mem_alloc_list_cur = NULL;
+
+	mem_alloc_list_cur = mem_list_find_item(pk);
+	if (mem_alloc_list_cur == NULL)
+		return -EINVAL;
+
+	/* Fix the forward link: the previous entry, or the head when first */
+	if (mem_alloc_list_cur->prev == NULL)
+		mem_alloc_list_head = mem_alloc_list_cur->next;
+	else
+		mem_alloc_list_cur->prev->next = mem_alloc_list_cur->next;
+
+	/* Fix the backward link: the next entry, or the tail when last */
+	if (mem_alloc_list_cur->next == NULL)
+		mem_alloc_list_tail = mem_alloc_list_cur->prev;
+	else
+		mem_alloc_list_cur->next->prev = mem_alloc_list_cur->prev;
+
+	rte_free(mem_alloc_list_cur);
+
+	mem_alloc_list_last_elem--;
+
+	return 0;
+}
+
+/*
+ * Fill the info and the private data of a child device, i.e. a device with
+ * a parent whose CUDA context is stored in info.context.
+ *
+ * The child context is made current while it is queried, and the context
+ * that was current on entry is restored on every path past that point.
+ * dev_private is only published once everything succeeded, so a failed
+ * attempt is retried on the next call.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+cuda_child_dev_init(struct rte_gpu *dev)
+{
+	int ret = 0;
+	CUresult res;
+	struct rte_gpu_info parent_info;
+	CUexecAffinityParam affinityPrm;
+	const char *err_string;
+	struct cuda_info *private = NULL;
+	CUcontext current_ctx;
+	CUcontext input_ctx;
+
+	/* Store current ctx */
+	res = pfn_cuCtxGetCurrent(&current_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s", err_string);
+		return -EPERM;
+	}
+
+	/* Set child ctx as current ctx */
+	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
+	res = pfn_cuCtxSetCurrent(input_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
+				err_string);
+		return -EPERM;
+	}
+
+	/* The total memory is inherited from the parent device */
+	if (rte_gpu_info_get(dev->mpshared->info.parent, &parent_info)) {
+		ret = -ENODEV;
+		goto restore_ctx;
+	}
+	dev->mpshared->info.total_memory = parent_info.total_memory;
+
+	/*
+	 * Ctx capacity info
+	 */
+
+	/* MPS compatible */
+	res = pfn_cuCtxGetExecAffinity(&affinityPrm, CU_EXEC_AFFINITY_TYPE_SM_COUNT);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuCtxGetExecAffinity failed with %s", err_string);
+		/* affinityPrm is not valid here: fall back to the parent count */
+		dev->mpshared->info.processor_count = parent_info.processor_count;
+	} else {
+		dev->mpshared->info.processor_count =
+				(uint32_t)affinityPrm.param.smCount.val;
+	}
+
+	/*
+	 * GPU Device private info
+	 */
+	private = rte_zmalloc(NULL, sizeof(struct cuda_info), RTE_CACHE_LINE_SIZE);
+	if (private == NULL) {
+		rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
+		ret = -ENOMEM;
+		goto restore_ctx;
+	}
+
+	res = pfn_cuCtxGetDevice(&(private->cu_dev));
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuCtxGetDevice failed with %s", err_string);
+		ret = -EPERM;
+		goto restore_ctx;
+	}
+
+	res = pfn_cuDeviceGetName(private->gpu_name, RTE_DEV_NAME_MAX_LEN, private->cu_dev);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuDeviceGetName failed with %s", err_string);
+		ret = -EPERM;
+	}
+
+restore_ctx:
+	/* Restore original ctx as current ctx */
+	res = pfn_cuCtxSetCurrent(current_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
+				err_string);
+		if (ret == 0)
+			ret = -EPERM;
+	}
+
+	if (ret != 0) {
+		rte_free(private);
+		return ret;
+	}
+
+	dev->mpshared->dev_private = private;
+
+	return 0;
+}
+
+/*
+ * gpudev dev_info_get operation: copy the device info into *info.
+ *
+ * A child device that has no private data yet is initialized first.
+ *
+ * Returns 0 on success, -ENODEV for a NULL device, -EINVAL for a NULL info
+ * pointer, or the error of the child initialization.
+ */
+static int
+cuda_dev_info_get(struct rte_gpu *dev, struct rte_gpu_info *info)
+{
+	int ret;
+
+	if (dev == NULL)
+		return -ENODEV;
+	if (info == NULL)
+		return -EINVAL;
+
+	/* Child initialization time probably called by rte_gpu_add_child() */
+	if (dev->mpshared->info.parent != RTE_GPU_ID_NONE &&
+			dev->mpshared->dev_private == NULL) {
+		ret = cuda_child_dev_init(dev);
+		if (ret != 0)
+			return ret;
+	}
+
+	*info = dev->mpshared->info;
+
+	return 0;
+}
+
+/*
+ * GPU Memory
+ */
+
+/*
+ * Allocate size bytes of GPU memory in the CUDA context of dev and track the
+ * allocation in the driver memory list.
+ *
+ * The device memory is allocated and flagged with
+ * CU_POINTER_ATTRIBUTE_SYNC_MEMOPS before a list entry is reserved, so a
+ * failing CUDA call never leaves a half-filled entry in the list. On failure
+ * the device memory is released again and the context that was current on
+ * entry is restored.
+ *
+ * dev:  device owning the context to allocate in
+ * size: number of bytes, must not be 0
+ * ptr:  receives the device address on success
+ *
+ * Returns 0 on success, -ENODEV if dev is NULL, -EINVAL if size is 0 or ptr
+ * is NULL, -ENOMEM if no list entry can be reserved, -EPERM on CUDA failure.
+ */
+static int
+cuda_mem_alloc(struct rte_gpu *dev, size_t size, void **ptr)
+{
+	CUresult res;
+	const char *err_string;
+	CUcontext current_ctx;
+	CUcontext input_ctx;
+	CUdeviceptr ptr_d = 0;
+	struct mem_entry *entry;
+	unsigned int flag = 1;
+	int ret;
+
+	if (dev == NULL)
+		return -ENODEV;
+	if (size == 0 || ptr == NULL)
+		return -EINVAL;
+
+	/* Store current ctx */
+	res = pfn_cuCtxGetCurrent(&current_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
+				err_string);
+		return -EPERM;
+	}
+
+	/* Set child ctx as current ctx */
+	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
+	res = pfn_cuCtxSetCurrent(input_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
+				err_string);
+		return -EPERM;
+	}
+
+	/* Allocate memory */
+	res = pfn_cuMemAlloc(&ptr_d, size);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuMemAlloc failed with %s",
+				err_string);
+		ret = -EPERM;
+		goto restore_ctx;
+	}
+
+	/* GPUDirect RDMA attribute required */
+	res = pfn_cuPointerSetAttribute(&flag,
+					CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+					ptr_d);
+	if (res != 0) {
+		/* Print the whole address: a 32-bit cast would truncate it */
+		rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for "
+				"GPU memory at %p, err %d",
+				(void *)(uintptr_t)ptr_d, res);
+		ret = -EPERM;
+		goto free_mem;
+	}
+
+	/*
+	 * Get next memory list item. A local is used so that a failure does
+	 * not overwrite the global tail pointer with NULL.
+	 */
+	entry = mem_list_add_item();
+	if (entry == NULL) {
+		ret = -ENOMEM;
+		goto free_mem;
+	}
+	mem_alloc_list_tail = entry;
+
+	entry->ptr_d = ptr_d;
+	entry->ptr_h = NULL;
+	entry->size = size;
+	entry->dev = dev;
+	entry->ctx = input_ctx;
+	entry->mtype = GPU_MEM;
+	entry->pkey = get_hash_from_ptr((void *)(uintptr_t)ptr_d);
+
+	/* Restore original ctx as current ctx */
+	res = pfn_cuCtxSetCurrent(current_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
+				err_string);
+		return -EPERM;
+	}
+
+	*ptr = (void *)(uintptr_t)ptr_d;
+
+	return 0;
+
+free_mem:
+	/* Best effort: the error reported to the caller is the original one */
+	res = pfn_cuMemFree(ptr_d);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuMemFree current failed with %s",
+				err_string);
+	}
+restore_ctx:
+	res = pfn_cuCtxSetCurrent(current_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
+				err_string);
+	}
+
+	return ret;
+}
+
+/*
+ * Register size bytes of CPU memory starting at ptr with CUDA, so that the
+ * device can access them, and track the registration in the driver memory
+ * list.
+ *
+ * The registration is only accepted when the device address equals the host
+ * address. All CUDA calls are done before a list entry is reserved; on any
+ * failure the memory is unregistered again and the context that was current
+ * on entry is restored.
+ *
+ * dev:  device owning the context to register in
+ * size: number of bytes, must not be 0
+ * ptr:  start of the CPU memory area, must not be NULL
+ *
+ * Returns 0 on success, -ENODEV if dev is NULL, -EINVAL on bad size or ptr,
+ * -ENOMEM if no list entry can be reserved, -ENOTSUP if host and device
+ * addresses differ, -EPERM on CUDA failure.
+ */
+static int
+cuda_mem_register(struct rte_gpu *dev, size_t size, void *ptr)
+{
+	CUresult res;
+	const char *err_string;
+	CUcontext current_ctx;
+	CUcontext input_ctx;
+	CUdeviceptr ptr_d = 0;
+	struct mem_entry *entry;
+	unsigned int flag = 1;
+	int use_ptr_h = 0;
+	int ret;
+
+	if (dev == NULL)
+		return -ENODEV;
+
+	if (size == 0 || ptr == NULL)
+		return -EINVAL;
+
+	/* Store current ctx */
+	res = pfn_cuCtxGetCurrent(&current_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
+				err_string);
+		return -EPERM;
+	}
+
+	/* Set child ctx as current ctx */
+	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
+	res = pfn_cuCtxSetCurrent(input_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
+				err_string);
+		return -EPERM;
+	}
+
+	/* Register the CPU memory */
+	res = pfn_cuMemHostRegister(ptr, size,
+				CU_MEMHOSTREGISTER_PORTABLE | CU_MEMHOSTREGISTER_DEVICEMAP);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuMemHostRegister failed with %s ptr %p size %zu",
+				err_string, ptr, size);
+		ret = -EPERM;
+		goto restore_ctx;
+	}
+
+	res = pfn_cuDeviceGetAttribute(&(use_ptr_h),
+			CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM,
+			((struct cuda_info *)(dev->mpshared->dev_private))->cu_dev);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
+				err_string);
+		ret = -EPERM;
+		goto unregister;
+	}
+
+	if (use_ptr_h == 0) {
+		res = pfn_cuMemHostGetDevicePointer(&ptr_d, ptr, 0);
+		if (res != 0) {
+			pfn_cuGetErrorString(res, &(err_string));
+			rte_cuda_log(ERR, "cuMemHostGetDevicePointer failed with %s",
+					err_string);
+			ret = -EPERM;
+			goto unregister;
+		}
+
+		if ((uintptr_t)ptr_d != (uintptr_t)ptr) {
+			rte_cuda_log(ERR, "Host input pointer is different wrt GPU registered pointer");
+			ret = -ENOTSUP;
+			goto unregister;
+		}
+	} else {
+		ptr_d = (CUdeviceptr)(uintptr_t)ptr;
+	}
+
+	/* GPUDirect RDMA attribute required */
+	res = pfn_cuPointerSetAttribute(&flag,
+					CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+					ptr_d);
+	if (res != 0) {
+		/* Print the whole address: a 32-bit cast would truncate it */
+		rte_cuda_log(ERR, "Could not set SYNC MEMOP attribute for GPU memory at %p, err %d",
+				(void *)(uintptr_t)ptr_d, res);
+		ret = -EPERM;
+		goto unregister;
+	}
+
+	/*
+	 * Get next memory list item. A local is used so that a failure does
+	 * not overwrite the global tail pointer with NULL.
+	 */
+	entry = mem_list_add_item();
+	if (entry == NULL) {
+		ret = -ENOMEM;
+		goto unregister;
+	}
+	mem_alloc_list_tail = entry;
+
+	entry->ptr_h = ptr;
+	entry->ptr_d = ptr_d;
+	entry->size = size;
+	entry->dev = dev;
+	entry->ctx = input_ctx;
+	entry->mtype = CPU_REGISTERED;
+	entry->pkey = get_hash_from_ptr(ptr);
+
+	/* Restore original ctx as current ctx */
+	res = pfn_cuCtxSetCurrent(current_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
+				err_string);
+		return -EPERM;
+	}
+
+	return 0;
+
+unregister:
+	/* Best effort: the error reported to the caller is the original one */
+	res = pfn_cuMemHostUnregister(ptr);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuMemHostUnregister current failed with %s",
+				err_string);
+	}
+restore_ctx:
+	res = pfn_cuCtxSetCurrent(current_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
+				err_string);
+	}
+
+	return ret;
+}
+
+/*
+ * Free GPU memory previously tracked in the driver memory list.
+ *
+ * The address is looked up by its key; only entries of type GPU_MEM are
+ * released with cuMemFree and then dropped from the list.
+ *
+ * Returns 0 on success, -ENODEV if dev is NULL, -EINVAL if ptr is NULL or the
+ * list removal fails, -EPERM if the address is unknown, of another memory
+ * type, or cuMemFree fails.
+ */
+static int
+cuda_mem_free(struct rte_gpu *dev, void *ptr)
+{
+	struct mem_entry *entry;
+	const char *cu_err;
+	cuda_ptr_key key;
+	CUresult rc;
+
+	if (dev == NULL)
+		return -ENODEV;
+
+	if (ptr == NULL)
+		return -EINVAL;
+
+	key = get_hash_from_ptr(ptr);
+	entry = mem_list_find_item(key);
+	if (entry == NULL) {
+		rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
+		return -EPERM;
+	}
+
+	/* Anything that is not device memory is rejected up front */
+	if (entry->mtype != GPU_MEM) {
+		rte_cuda_log(ERR, "Memory type %d not supported", entry->mtype);
+		return -EPERM;
+	}
+
+	rc = pfn_cuMemFree(entry->ptr_d);
+	if (rc != 0) {
+		pfn_cuGetErrorString(rc, &cu_err);
+		rte_cuda_log(ERR, "cuMemFree current failed with %s",
+				cu_err);
+		return -EPERM;
+	}
+
+	return mem_list_del_item(key);
+}
+
+/*
+ * Unregister CPU memory previously tracked in the driver memory list.
+ *
+ * The address is looked up by its key; only entries of type CPU_REGISTERED
+ * are passed to cuMemHostUnregister and then dropped from the list.
+ *
+ * Returns 0 on success, -ENODEV if dev is NULL, -EINVAL if ptr is NULL or the
+ * list removal fails, -EPERM if the address is unknown, of another memory
+ * type, or cuMemHostUnregister fails.
+ */
+static int
+cuda_mem_unregister(struct rte_gpu *dev, void *ptr)
+{
+	struct mem_entry *entry;
+	const char *cu_err;
+	cuda_ptr_key key;
+	CUresult rc;
+
+	if (dev == NULL)
+		return -ENODEV;
+
+	if (ptr == NULL)
+		return -EINVAL;
+
+	key = get_hash_from_ptr(ptr);
+	entry = mem_list_find_item(key);
+	if (entry == NULL) {
+		rte_cuda_log(ERR, "Memory address 0x%p not found in driver memory", ptr);
+		return -EPERM;
+	}
+
+	/* Anything that is not registered CPU memory is rejected up front */
+	if (entry->mtype != CPU_REGISTERED) {
+		rte_cuda_log(ERR, "Memory type %d not supported", entry->mtype);
+		return -EPERM;
+	}
+
+	rc = pfn_cuMemHostUnregister(ptr);
+	if (rc != 0) {
+		pfn_cuGetErrorString(rc, &cu_err);
+		rte_cuda_log(ERR, "cuMemHostUnregister current failed with %s",
+				cu_err);
+		return -EPERM;
+	}
+
+	return mem_list_del_item(key);
+}
+
+/*
+ * Release the driver-private data attached to a device.
+ *
+ * The pointer is cleared after the free so that a second close, or any later
+ * reader of dev->mpshared->dev_private, never sees freed memory.
+ *
+ * Returns 0 on success, -EINVAL if dev is NULL.
+ */
+static int
+cuda_dev_close(struct rte_gpu *dev)
+{
+	if (dev == NULL)
+		return -EINVAL;
+
+	rte_free(dev->mpshared->dev_private);
+	dev->mpshared->dev_private = NULL;
+
+	return 0;
+}
+
+/*
+ * Make GPUDirect RDMA writes to the device visible (write memory barrier).
+ *
+ * Nothing is done when the device reports a write ordering other than
+ * CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE. Otherwise the writes are flushed
+ * with cuFlushGPUDirectRDMAWrites from within the device context, provided
+ * the device reports the host flush option. The context that was current on
+ * entry is restored whether or not the flush succeeds.
+ *
+ * Returns 0 on success, -ENODEV if dev is NULL, -ENOTSUP if the flush API
+ * cannot be used on this device, -EPERM on CUDA failure.
+ */
+static int
+cuda_wmb(struct rte_gpu *dev)
+{
+	CUresult res;
+	const char *err_string;
+	CUcontext current_ctx;
+	CUcontext input_ctx;
+	struct cuda_info *private;
+	int ret = 0;
+
+	if (dev == NULL)
+		return -ENODEV;
+
+	private = (struct cuda_info *)dev->mpshared->dev_private;
+
+	if (private->gdr_write_ordering != CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
+		/*
+		 * No need to explicitly force the write ordering because
+		 * the device natively supports it
+		 */
+		return 0;
+	}
+
+	if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) {
+		/*
+		 * Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function.
+		 * Application needs to use alternative methods.
+		 */
+		rte_cuda_log(WARNING, "Can't flush GDR writes with cuFlushGPUDirectRDMAWrites CUDA function. "
+				"Application needs to use alternative methods.");
+		return -ENOTSUP;
+	}
+
+	/* Store current ctx */
+	res = pfn_cuCtxGetCurrent(&current_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuCtxGetCurrent failed with %s",
+				err_string);
+		return -EPERM;
+	}
+
+	/* Set child ctx as current ctx */
+	input_ctx = (CUcontext)((uintptr_t)dev->mpshared->info.context);
+	res = pfn_cuCtxSetCurrent(input_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuCtxSetCurrent input failed with %s",
+				err_string);
+		return -EPERM;
+	}
+
+	res = pfn_cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
+					CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuFlushGPUDirectRDMAWrites current failed with %s",
+				err_string);
+		/* Remember the failure but still switch back to the caller ctx */
+		ret = -EPERM;
+	}
+
+	/* Restore original ctx as current ctx */
+	res = pfn_cuCtxSetCurrent(current_ctx);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuCtxSetCurrent current failed with %s",
+				err_string);
+		return -EPERM;
+	}
+
+	return ret;
+}
+
+/*
+ * PCI probe callback: create the gpudev device for one NVIDIA GPU.
+ *
+ * A device slot is reserved with rte_gpu_allocate(). On the device that gets
+ * dev_id 0 the memory list is reset and the CUDA driver library is loaded,
+ * initialized and version-checked. Then the primary context of the GPU is
+ * retained and stored in info.context, the generic info (NUMA node, SM count,
+ * total memory) and the private info (CUDA device, name, GPUDirect RDMA
+ * capabilities) are filled in, the driver ops are installed and the device is
+ * announced with rte_gpu_complete_new().
+ *
+ * On every failure after the slot was reserved, the private data is freed and
+ * the slot is given back with rte_gpu_release(), so a failed probe does not
+ * leave a half-initialized device behind.
+ * NOTE(review): the primary context retained below is not released on
+ * failure; no pfn_ wrapper for that is visible in this part of the file.
+ *
+ * Returns 0 on success, -EINVAL if pci_dev is NULL, -ENODEV if no device slot
+ * is available, -ENOTSUP if the CUDA library or its version is unsuitable,
+ * -ENOMEM on allocation failure, -EPERM on CUDA failure.
+ */
+static int
+cuda_gpu_probe(__rte_unused struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
+{
+	struct rte_gpu *dev = NULL;
+	CUresult res;
+	CUdevice cu_dev_id;
+	CUcontext pctx;
+	const char *err_string;
+	int processor_count = 0;
+	struct cuda_info *private = NULL;
+	int ret;
+
+	if (pci_dev == NULL) {
+		rte_cuda_log(ERR, "NULL PCI device");
+		return -EINVAL;
+	}
+
+	/* Allocate memory to be used privately by drivers */
+	dev = rte_gpu_allocate(pci_dev->device.name);
+	if (dev == NULL)
+		return -ENODEV;
+
+	/* Initialize values only for the first CUDA driver call */
+	if (dev->mpshared->info.dev_id == 0) {
+		mem_alloc_list_head = NULL;
+		mem_alloc_list_tail = NULL;
+		mem_alloc_list_last_elem = 0;
+
+		/* Load libcuda.so library */
+		if (cuda_loader()) {
+			rte_cuda_log(ERR, "CUDA Driver library not found");
+			ret = -ENOTSUP;
+			goto error;
+		}
+
+		/* Load initial CUDA functions */
+		if (cuda_sym_func_loader()) {
+			rte_cuda_log(ERR, "CUDA functions not found in library");
+			ret = -ENOTSUP;
+			goto error;
+		}
+
+		/*
+		 * Required to initialize the CUDA Driver.
+		 * Multiple calls of cuInit() will return immediately
+		 * without making any relevant change
+		 */
+		res = sym_cuInit(0);
+		if (res != 0) {
+			rte_cuda_log(ERR, "cuInit failed with %d", res);
+			ret = -ENOTSUP;
+			goto error;
+		}
+
+		res = sym_cuDriverGetVersion(&cuda_driver_version);
+		if (res != 0) {
+			rte_cuda_log(ERR, "cuDriverGetVersion failed with %d", res);
+			ret = -ENOTSUP;
+			goto error;
+		}
+
+		if (cuda_driver_version < CUDA_DRIVER_MIN_VERSION) {
+			rte_cuda_log(ERR, "CUDA Driver version found is %d. "
+					"Minimum requirement is %d",
+					cuda_driver_version, CUDA_DRIVER_MIN_VERSION);
+			ret = -ENOTSUP;
+			goto error;
+		}
+
+		if (cuda_pfn_func_loader()) {
+			rte_cuda_log(ERR, "CUDA PFN functions not found in library");
+			ret = -ENOTSUP;
+			goto error;
+		}
+	}
+
+	/* Fill HW specific part of device structure */
+	dev->device = &pci_dev->device;
+	dev->mpshared->info.numa_node = pci_dev->device.numa_node;
+
+	/* Get NVIDIA GPU Device descriptor */
+	res = pfn_cuDeviceGetByPCIBusId(&cu_dev_id, dev->device->name);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuDeviceGetByPCIBusId name %s failed with %d: %s",
+				dev->device->name, res, err_string);
+		ret = -EPERM;
+		goto error;
+	}
+
+	res = pfn_cuDevicePrimaryCtxRetain(&pctx, cu_dev_id);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuDevicePrimaryCtxRetain name %s failed with %d: %s",
+				dev->device->name, res, err_string);
+		ret = -EPERM;
+		goto error;
+	}
+
+	res = pfn_cuCtxGetApiVersion(pctx, &cuda_api_version);
+	if (res != 0) {
+		rte_cuda_log(ERR, "cuCtxGetApiVersion failed with %d", res);
+		ret = -ENOTSUP;
+		goto error;
+	}
+
+	if (cuda_api_version < CUDA_API_MIN_VERSION) {
+		rte_cuda_log(ERR, "CUDA API version found is %d Minimum requirement is %d",
+				cuda_api_version, CUDA_API_MIN_VERSION);
+		ret = -ENOTSUP;
+		goto error;
+	}
+
+	dev->mpshared->info.context = (uint64_t)(uintptr_t)pctx;
+
+	/*
+	 * GPU Device generic info
+	 */
+
+	/* Processor count */
+	res = pfn_cuDeviceGetAttribute(&(processor_count),
+			CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
+			cu_dev_id);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
+				err_string);
+		ret = -EPERM;
+		goto error;
+	}
+	dev->mpshared->info.processor_count = (uint32_t)processor_count;
+
+	/* Total memory */
+	res = pfn_cuDeviceTotalMem(&dev->mpshared->info.total_memory, cu_dev_id);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuDeviceTotalMem failed with %s",
+				err_string);
+		ret = -EPERM;
+		goto error;
+	}
+
+	/*
+	 * GPU Device private info
+	 */
+	private = rte_zmalloc(NULL,
+			sizeof(struct cuda_info),
+			RTE_CACHE_LINE_SIZE);
+	if (private == NULL) {
+		rte_cuda_log(ERR, "Failed to allocate memory for GPU process private");
+		ret = -ENOMEM;
+		goto error;
+	}
+	dev->mpshared->dev_private = private;
+
+	private->cu_dev = cu_dev_id;
+	res = pfn_cuDeviceGetName(private->gpu_name,
+			RTE_DEV_NAME_MAX_LEN,
+			cu_dev_id);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuDeviceGetName failed with %s",
+				err_string);
+		ret = -EPERM;
+		goto error;
+	}
+
+	res = pfn_cuDeviceGetAttribute(&(private->gdr_supported),
+			CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED,
+			cu_dev_id);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
+				err_string);
+		ret = -EPERM;
+		goto error;
+	}
+
+	if (private->gdr_supported == 0)
+		rte_cuda_log(WARNING, "GPU %s doesn't support GPUDirect RDMA",
+				pci_dev->device.name);
+
+	res = pfn_cuDeviceGetAttribute(&(private->gdr_write_ordering),
+			CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING,
+			cu_dev_id);
+	if (res != 0) {
+		pfn_cuGetErrorString(res, &(err_string));
+		rte_cuda_log(ERR,
+				"cuDeviceGetAttribute failed with %s",
+				err_string);
+		ret = -EPERM;
+		goto error;
+	}
+
+	/* The flush option only matters when the device does not order writes itself */
+	if (private->gdr_write_ordering == CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE) {
+		res = pfn_cuDeviceGetAttribute(&(private->gdr_flush_type),
+					CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS,
+					cu_dev_id);
+		if (res != 0) {
+			pfn_cuGetErrorString(res, &(err_string));
+			rte_cuda_log(ERR, "cuDeviceGetAttribute failed with %s",
+					err_string);
+			ret = -EPERM;
+			goto error;
+		}
+
+		if (private->gdr_flush_type != CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST)
+			rte_cuda_log(ERR, "GPUDirect RDMA flush writes API is not supported");
+	}
+
+	dev->ops.dev_info_get = cuda_dev_info_get;
+	dev->ops.dev_close = cuda_dev_close;
+	dev->ops.mem_alloc = cuda_mem_alloc;
+	dev->ops.mem_free = cuda_mem_free;
+	dev->ops.mem_register = cuda_mem_register;
+	dev->ops.mem_unregister = cuda_mem_unregister;
+	dev->ops.wmb = cuda_wmb;
+
+	rte_gpu_complete_new(dev);
+
+	rte_cuda_debug("dev id = %u name = %s",
+			dev->mpshared->info.dev_id, private->gpu_name);
+
+	return 0;
+
+error:
+	/* Undo what was set up so far; private is NULL until it is allocated */
+	if (private != NULL) {
+		rte_free(private);
+		dev->mpshared->dev_private = NULL;
+	}
+	rte_gpu_release(dev);
+
+	return ret;
+}
+
+/*
+ * PCI remove callback: release the gpudev device created for pci_dev.
+ *
+ * The device is looked up by the PCI device name and handed to
+ * rte_gpu_release(). A failing release is reported to the caller instead of
+ * being logged and then announced as a successful destruction.
+ *
+ * Returns 0 on success, -EINVAL if pci_dev is NULL, -ENODEV if no device with
+ * that name exists, or the error returned by rte_gpu_release().
+ */
+static int
+cuda_gpu_remove(struct rte_pci_device *pci_dev)
+{
+	struct rte_gpu *dev;
+	int ret;
+	uint8_t gpu_id;
+
+	if (pci_dev == NULL)
+		return -EINVAL;
+
+	dev = rte_gpu_get_by_name(pci_dev->device.name);
+	if (dev == NULL) {
+		rte_cuda_log(ERR, "Couldn't find HW dev \"%s\" to uninitialise it",
+				pci_dev->device.name);
+		return -ENODEV;
+	}
+	/* Read the id first: dev must not be touched after the release */
+	gpu_id = dev->mpshared->info.dev_id;
+
+	/* release dev from library */
+	ret = rte_gpu_release(dev);
+	if (ret) {
+		rte_cuda_log(ERR, "Device %i failed to uninit: %i", gpu_id, ret);
+		return ret;
+	}
+
+	rte_cuda_debug("Destroyed dev = %u", gpu_id);
+
+	return 0;
+}
+
+/*
+ * PCI driver descriptor of the CUDA GPU driver: it ties the PCI ID table
+ * pci_id_cuda_map to the cuda_gpu_probe() and cuda_gpu_remove() callbacks.
+ */
+static struct rte_pci_driver rte_cuda_driver = {
+	.id_table = pci_id_cuda_map,
+	/* NOTE(review): presumably asks the PCI bus for a write-combining mapping - confirm */
+	.drv_flags = RTE_PCI_DRV_WC_ACTIVATE,
+	.probe = cuda_gpu_probe,
+	.remove = cuda_gpu_remove,
+};
+
+/*
+ * Register the driver, its PCI ID table and its kernel module dependency
+ * string under the name gpu_cuda.
+ */
+RTE_PMD_REGISTER_PCI(gpu_cuda, rte_cuda_driver);
+RTE_PMD_REGISTER_PCI_TABLE(gpu_cuda, pci_id_cuda_map);
+RTE_PMD_REGISTER_KMOD_DEP(gpu_cuda, "* nvidia & (nv_peer_mem | nvpeer_mem)");
diff --git a/drivers/gpu/cuda/meson.build b/drivers/gpu/cuda/meson.build
new file mode 100644
index 0000000000..4bb3e14cd2
--- /dev/null
+++ b/drivers/gpu/cuda/meson.build
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright (c) 2021 NVIDIA Corporation & Affiliates
+
+if not is_linux
+        build = false  # the driver is disabled on every non-Linux host
+        reason = 'only supported on Linux'
+endif
+
+cuda_h_found = cc.has_header('cuda.h')  # can the C compiler find cuda.h?
+cuda_typeh_found = cc.has_header('cudaTypedefs.h')  # can the C compiler find cudaTypedefs.h?
+
+if not cuda_h_found or not cuda_typeh_found
+        build = false  # both headers are needed; this reason replaces the one set above
+        reason = 'missing dependency cuda headers cuda.h and cudaTypedefs.h'
+endif
+
+deps += ['gpudev','pci','bus_pci']  # DPDK components this driver depends on
+sources = files('cuda.c')  # the driver is a single source file
diff --git a/drivers/gpu/cuda/version.map b/drivers/gpu/cuda/version.map
new file mode 100644
index 0000000000..4a76d1d52d
--- /dev/null
+++ b/drivers/gpu/cuda/version.map
@@ -0,0 +1,3 @@
+DPDK_21 {
+	local: *;
+};
diff --git a/drivers/gpu/meson.build b/drivers/gpu/meson.build
index e51ad3381b..601bedcd61 100644
--- a/drivers/gpu/meson.build
+++ b/drivers/gpu/meson.build
@@ -1,4 +1,4 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright (c) 2021 NVIDIA Corporation & Affiliates
 
-drivers = []
+drivers = [ 'cuda' ]
-- 
2.17.1
^ permalink raw reply related	[flat|nested] 28+ messages in thread
end of thread, other threads:[~2021-11-16 16:44 UTC | newest]
Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-05 22:49 [dpdk-dev] [RFC PATCH] gpu/cuda: introduce CUDA driver eagostini
2021-11-04  2:01 ` [dpdk-dev] [PATCH v2 0/1] " eagostini
2021-11-04  2:01   ` [dpdk-dev] [PATCH v2 1/1] " eagostini
2021-11-03 18:15     ` Stephen Hemminger
2021-11-08 18:35     ` Stephen Hemminger
2021-11-08 18:39       ` Elena Agostini
2021-11-08 18:59         ` Stephen Hemminger
2021-11-08 19:07           ` Elena Agostini
2021-11-08 19:02 ` [dpdk-dev] [RFC PATCH] " Stephen Hemminger
2021-11-08 21:20   ` Elena Agostini
2021-11-08 22:07     ` Stephen Hemminger
2021-11-08 23:15       ` Stephen Hemminger
2021-11-09  2:28 ` [dpdk-dev] [PATCH v3 0/1] " eagostini
2021-11-09  2:28   ` [dpdk-dev] [PATCH v3 1/1] " eagostini
2021-11-08 19:52     ` David Marchand
2021-11-09  5:50 ` [dpdk-dev] [PATCH v4 0/1] " eagostini
2021-11-09  5:50   ` [dpdk-dev] [PATCH v4 1/1] " eagostini
2021-11-15 22:36 ` [PATCH v5 0/1] " eagostini
2021-11-15 22:36   ` [PATCH v5 1/1] " eagostini
2021-11-16 20:47 ` [PATCH v6 0/1] " eagostini
2021-11-16 20:47   ` [PATCH v6 1/1] " eagostini
2021-11-16 22:50 ` [PATCH v7 0/1] " eagostini
2021-11-16 22:50   ` [PATCH v7 1/1] " eagostini
2021-11-16 15:58     ` Stephen Hemminger
2021-11-16 16:35       ` Thomas Monjalon
2021-11-16 16:40       ` Thomas Monjalon
2021-11-16 16:30     ` Thomas Monjalon
2021-11-16 16:44       ` Thomas Monjalon
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).