All of lore.kernel.org
 help / color / mirror / Atom feed
From: Oded Gabbay <ogabbay@kernel.org>
To: linux-kernel@vger.kernel.org
Cc: Tal Cohen <talcohen@habana.ai>
Subject: [PATCH 3/3] habanalabs: add support for notification via eventfd
Date: Mon,  9 May 2022 11:22:18 +0300	[thread overview]
Message-ID: <20220509082218.956916-3-ogabbay@kernel.org> (raw)
In-Reply-To: <20220509082218.956916-1-ogabbay@kernel.org>

From: Tal Cohen <talcohen@habana.ai>

The driver will be able to send notification events towards
a user process, using user's registered event file descriptor.
The driver uses the notification mechanism to inform the
user about an occurred event.
A user thread can wait until a notification is received from
the driver.

The driver stores the occurred event until the user reads it,
using HL_INFO_GET_EVENTS - new ioctl opcode in the INFO ioctl.

Gaudi specific implementation includes sending a notification
on a TPC assertion event that is received from f/w.

Signed-off-by: Tal Cohen <talcohen@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
---
 drivers/misc/habanalabs/common/device.c       | 52 +++++++++++++++
 drivers/misc/habanalabs/common/habanalabs.h   | 40 ++++++++----
 .../misc/habanalabs/common/habanalabs_drv.c   |  9 +++
 .../misc/habanalabs/common/habanalabs_ioctl.c | 65 +++++++++++++++++++
 drivers/misc/habanalabs/gaudi/gaudi.c         | 14 +++-
 include/uapi/misc/habanalabs.h                | 15 +++++
 6 files changed, 182 insertions(+), 13 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 15df89b31e1b..315510aaca35 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -285,6 +285,14 @@ static void hpriv_release(struct kref *ref)
 
 	hdev->compute_ctx_in_release = 0;
 
+	/* release the eventfd */
+	if (hpriv->notifier_event.eventfd) {
+		eventfd_ctx_put(hpriv->notifier_event.eventfd);
+		hpriv->notifier_event.eventfd = 0;
+	}
+
+	mutex_destroy(&hpriv->notifier_event.lock);
+
 	kfree(hpriv);
 }
 
@@ -355,6 +363,13 @@ static int hl_device_release_ctrl(struct inode *inode, struct file *filp)
 	list_del(&hpriv->dev_node);
 	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
 out:
+	/* release the eventfd */
+	if (hpriv->notifier_event.eventfd) {
+		eventfd_ctx_put(hpriv->notifier_event.eventfd);
+		hpriv->notifier_event.eventfd = 0;
+	}
+
+	mutex_destroy(&hpriv->notifier_event.lock);
 	put_pid(hpriv->taskpid);
 
 	kfree(hpriv);
@@ -1506,6 +1521,43 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 	return rc;
 }
 
+static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 event)
+{
+	mutex_lock(&notifier_event->lock);
+	notifier_event->events_mask |= event;
+	if (notifier_event->eventfd)
+		eventfd_signal(notifier_event->eventfd, 1);
+
+	mutex_unlock(&notifier_event->lock);
+}
+
+/*
+ * hl_notifier_event_send_all - notify all user processes via eventfd
+ *
+ * @hdev: pointer to habanalabs device structure
+ * @event: the occurred event
+ * Returns 0 for success or an error on failure.
+ */
+void hl_notifier_event_send_all(struct hl_device *hdev, u64 event)
+{
+	struct hl_fpriv	*hpriv;
+
+	mutex_lock(&hdev->fpriv_list_lock);
+
+	list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node)
+		hl_notifier_event_send(&hpriv->notifier_event, event);
+
+	mutex_unlock(&hdev->fpriv_list_lock);
+
+	/* control device */
+	mutex_lock(&hdev->fpriv_ctrl_list_lock);
+
+	list_for_each_entry(hpriv, &hdev->fpriv_ctrl_list, dev_node)
+		hl_notifier_event_send(&hpriv->notifier_event, event);
+
+	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
+}
+
 /*
  * hl_device_init - main initialization function for habanalabs device
  *
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 918e8a04acab..8977ec67dba7 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -21,6 +21,7 @@
 #include <linux/hashtable.h>
 #include <linux/debugfs.h>
 #include <linux/rwsem.h>
+#include <linux/eventfd.h>
 #include <linux/bitfield.h>
 #include <linux/genalloc.h>
 #include <linux/sched/signal.h>
@@ -1932,6 +1933,18 @@ struct hl_debug_params {
 	bool enable;
 };
 
+/**
+ * struct hl_notifier_event - holds the notifier data structure
+ * @eventfd: the event file descriptor to raise the notifications
+ * @lock: mutex lock to protect the notifier data flows
+ * @events_mask: indicates the bitmap events
+ */
+struct hl_notifier_event {
+	struct eventfd_ctx	*eventfd;
+	struct mutex		lock;
+	u64			events_mask;
+};
+
 /*
  * FILE PRIVATE STRUCTURE
  */
@@ -1943,24 +1956,25 @@ struct hl_debug_params {
  * @taskpid: current process ID.
  * @ctx: current executing context. TODO: remove for multiple ctx per process
  * @ctx_mgr: context manager to handle multiple context for this FD.
- * @cb_mgr: command buffer manager to handle multiple buffers for this FD.
  * @mem_mgr: manager descriptor for memory exportable via mmap
+ * @notifier_event: notifier eventfd towards user process
  * @debugfs_list: list of relevant ASIC debugfs.
  * @dev_node: node in the device list of file private data
  * @refcount: number of related contexts.
  * @restore_phase_mutex: lock for context switch and restore phase.
  */
 struct hl_fpriv {
-	struct hl_device	*hdev;
-	struct file		*filp;
-	struct pid		*taskpid;
-	struct hl_ctx		*ctx;
-	struct hl_ctx_mgr	ctx_mgr;
-	struct hl_mem_mgr	mem_mgr;
-	struct list_head	debugfs_list;
-	struct list_head	dev_node;
-	struct kref		refcount;
-	struct mutex		restore_phase_mutex;
+	struct hl_device		*hdev;
+	struct file			*filp;
+	struct pid			*taskpid;
+	struct hl_ctx			*ctx;
+	struct hl_ctx_mgr		ctx_mgr;
+	struct hl_mem_mgr		mem_mgr;
+	struct hl_notifier_event	notifier_event;
+	struct list_head		debugfs_list;
+	struct list_head		dev_node;
+	struct kref			refcount;
+	struct mutex			restore_phase_mutex;
 };
 
 
@@ -2676,8 +2690,8 @@ struct hl_reset_info {
  * @state_dump_specs: constants and dictionaries needed to dump system state.
  * @multi_cs_completion: array of multi-CS completion.
  * @clk_throttling: holds information about current/previous clock throttling events
- * @reset_info: holds current device reset information.
  * @last_error: holds information about last session in which CS timeout or razwi error occurred.
+ * @reset_info: holds current device reset information.
  * @stream_master_qid_arr: pointer to array with QIDs of master streams.
  * @fw_major_version: major version of current loaded preboot
  * @dram_used_mem: current DRAM memory consumption.
@@ -3071,6 +3085,8 @@ int hl_device_utilization(struct hl_device *hdev, u32 *utilization);
 int hl_build_hwmon_channel_info(struct hl_device *hdev,
 		struct cpucp_sensor *sensors_arr);
 
+void hl_notifier_event_send_all(struct hl_device *hdev, u64 event);
+
 int hl_sysfs_init(struct hl_device *hdev);
 void hl_sysfs_fini(struct hl_device *hdev);
 
diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c
index 1210de39d661..c97173e9507d 100644
--- a/drivers/misc/habanalabs/common/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@@ -134,6 +134,10 @@ int hl_device_open(struct inode *inode, struct file *filp)
 	hpriv->hdev = hdev;
 	filp->private_data = hpriv;
 	hpriv->filp = filp;
+	hpriv->notifier_event.events_mask = 0;
+	hpriv->notifier_event.eventfd = 0;
+
+	mutex_init(&hpriv->notifier_event.lock);
 	mutex_init(&hpriv->restore_phase_mutex);
 	kref_init(&hpriv->refcount);
 	nonseekable_open(inode, filp);
@@ -208,6 +212,7 @@ int hl_device_open(struct inode *inode, struct file *filp)
 	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
 	filp->private_data = NULL;
 	mutex_destroy(&hpriv->restore_phase_mutex);
+	mutex_destroy(&hpriv->notifier_event.lock);
 	put_pid(hpriv->taskpid);
 
 	kfree(hpriv);
@@ -241,6 +246,10 @@ int hl_device_open_ctrl(struct inode *inode, struct file *filp)
 	hpriv->hdev = hdev;
 	filp->private_data = hpriv;
 	hpriv->filp = filp;
+	hpriv->notifier_event.events_mask = 0;
+	hpriv->notifier_event.eventfd = 0;
+
+	mutex_init(&hpriv->notifier_event.lock);
 	nonseekable_open(inode, filp);
 
 	hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index bfb5cfe68110..d1ef56a8d3ac 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -116,6 +116,25 @@ static int hw_events_info(struct hl_device *hdev, bool aggregate,
 	return copy_to_user(out, arr, min(max_size, size)) ? -EFAULT : 0;
 }
 
+static int events_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+	int rc;
+	u32 max_size = args->return_size;
+	u64 events_mask;
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+
+	if ((max_size < sizeof(u64)) || (!out))
+		return -EINVAL;
+
+	mutex_lock(&hpriv->notifier_event.lock);
+	events_mask = hpriv->notifier_event.events_mask;
+	hpriv->notifier_event.events_mask = 0;
+	mutex_unlock(&hpriv->notifier_event.lock);
+
+	rc = copy_to_user(out, &events_mask, sizeof(u64));
+	return rc;
+}
+
 static int dram_usage_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 {
 	struct hl_device *hdev = hpriv->hdev;
@@ -614,6 +633,43 @@ static int dev_mem_alloc_page_sizes_info(struct hl_fpriv *hpriv, struct hl_info_
 	return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
 }
 
+static int eventfd_register(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+	int rc;
+
+	/* check if there is already a registered on that process */
+	mutex_lock(&hpriv->notifier_event.lock);
+	if (hpriv->notifier_event.eventfd) {
+		mutex_unlock(&hpriv->notifier_event.lock);
+		return -EINVAL;
+	}
+
+	hpriv->notifier_event.eventfd = eventfd_ctx_fdget(args->eventfd);
+	if (IS_ERR(hpriv->notifier_event.eventfd)) {
+		rc = PTR_ERR(hpriv->notifier_event.eventfd);
+		hpriv->notifier_event.eventfd = 0;
+		mutex_unlock(&hpriv->notifier_event.lock);
+		return rc;
+	}
+
+	mutex_unlock(&hpriv->notifier_event.lock);
+	return 0;
+}
+
+static int eventfd_unregister(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+	mutex_lock(&hpriv->notifier_event.lock);
+	if (!hpriv->notifier_event.eventfd) {
+		mutex_unlock(&hpriv->notifier_event.lock);
+		return -EINVAL;
+	}
+
+	eventfd_ctx_put(hpriv->notifier_event.eventfd);
+	hpriv->notifier_event.eventfd = 0;
+	mutex_unlock(&hpriv->notifier_event.lock);
+	return 0;
+}
+
 static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 				struct device *dev)
 {
@@ -667,6 +723,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 	case HL_INFO_DEV_MEM_ALLOC_PAGE_SIZES:
 		return dev_mem_alloc_page_sizes_info(hpriv, args);
 
+	case HL_INFO_GET_EVENTS:
+		return events_info(hpriv, args);
+
 	default:
 		break;
 	}
@@ -717,6 +776,12 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 	case HL_INFO_DRAM_PENDING_ROWS:
 		return dram_pending_rows_info(hpriv, args);
 
+	case HL_INFO_REGISTER_EVENTFD:
+		return eventfd_register(hpriv, args);
+
+	case HL_INFO_UNREGISTER_EVENTFD:
+		return eventfd_unregister(hpriv, args);
+
 	default:
 		dev_err(dev, "Invalid request %d\n", args->op);
 		rc = -EINVAL;
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 08cd60300b4f..1c388537de33 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -7879,7 +7879,6 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 	case GAUDI_EVENT_MMU_PAGE_FAULT:
 	case GAUDI_EVENT_MMU_WR_PERM:
 	case GAUDI_EVENT_RAZWI_OR_ADC:
-	case GAUDI_EVENT_TPC0_QM ... GAUDI_EVENT_TPC7_QM:
 	case GAUDI_EVENT_MME0_QM ... GAUDI_EVENT_MME2_QM:
 	case GAUDI_EVENT_DMA0_QM ... GAUDI_EVENT_DMA7_QM:
 		fallthrough;
@@ -7899,6 +7898,19 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 		hl_fw_unmask_irq(hdev, event_type);
 		break;
 
+	case GAUDI_EVENT_TPC0_QM ... GAUDI_EVENT_TPC7_QM:
+		gaudi_print_irq_info(hdev, event_type, true);
+		gaudi_handle_qman_err(hdev, event_type);
+		hl_fw_unmask_irq(hdev, event_type);
+
+		/* In TPC QM event, notify on TPC assertion. While there isn't
+		 * a specific event for assertion yet, the FW generates QM event.
+		 * The SW upper layer will inspect an internal mapped area to indicate
+		 * if the event is a tpc assertion or tpc QM.
+		 */
+		hl_notifier_event_send_all(hdev, HL_NOTIFIER_EVENT_TPC_ASSERT);
+		break;
+
 	case GAUDI_EVENT_RAZWI_OR_ADC_SW:
 		gaudi_print_irq_info(hdev, event_type, true);
 		goto reset_device;
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 3576bf2b4841..52540d5b4fc9 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -349,6 +349,9 @@ enum hl_server_type {
  *                            Razwi initiator.
  *                            Razwi cause, was it a page fault or MMU access error.
  * HL_INFO_DEV_MEM_ALLOC_PAGE_SIZES - Retrieve valid page sizes for device memory allocation
+ * HL_INFO_REGISTER_EVENTFD   - Register eventfd for event notifications.
+ * HL_INFO_UNREGISTER_EVENTFD - Unregister eventfd
+ * HL_INFO_GET_EVENTS         - Retrieve the last occurred events
  */
 #define HL_INFO_HW_IP_INFO			0
 #define HL_INFO_HW_EVENTS			1
@@ -374,6 +377,9 @@ enum hl_server_type {
 #define HL_INFO_CS_TIMEOUT_EVENT		24
 #define HL_INFO_RAZWI_EVENT			25
 #define HL_INFO_DEV_MEM_ALLOC_PAGE_SIZES	26
+#define HL_INFO_REGISTER_EVENTFD		28
+#define HL_INFO_UNREGISTER_EVENTFD		29
+#define HL_INFO_GET_EVENTS			30
 
 #define HL_INFO_VERSION_MAX_LEN			128
 #define HL_INFO_CARD_NAME_MAX_LEN		16
@@ -679,6 +685,7 @@ enum gaudi_dcores {
  * @period_ms: Period value, in milliseconds, for utilization rate in range 100ms - 1000ms in 100 ms
  *             resolution. Currently not in use.
  * @pll_index: Index as defined in hl_<asic type>_pll_index enumeration.
+ * @eventfd: event file descriptor for event notifications.
  * @pad: Padding to 64 bit.
  */
 struct hl_info_args {
@@ -691,6 +698,7 @@ struct hl_info_args {
 		__u32 ctx_id;
 		__u32 period_ms;
 		__u32 pll_index;
+		__u32 eventfd;
 	};
 
 	__u32 pad;
@@ -1390,6 +1398,13 @@ struct hl_debug_args {
 	__u32 ctx_id;
 };
 
+/*
+ * Notifier event values - for the notification mechanism and the HL_INFO_GET_EVENTS command
+ *
+ * HL_NOTIFIER_EVENT_TPC_ASSERT - Indicates TPC assert event
+ */
+#define HL_NOTIFIER_EVENT_TPC_ASSERT  (1 << 0)
+
 /*
  * Various information operations such as:
  * - H/W IP information
-- 
2.25.1


      parent reply	other threads:[~2022-05-09  8:57 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-05-09  8:22 [PATCH 1/3] habanalabs: handle race in driver fini Oded Gabbay
2022-05-09  8:22 ` [PATCH 2/3] habanalabs: add topic to memory manager buffer Oded Gabbay
2022-05-09  8:22 ` Oded Gabbay [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220509082218.956916-3-ogabbay@kernel.org \
    --to=ogabbay@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=talcohen@habana.ai \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.