* [PATCH v2] drm/amdkfd: Provide SMI events watch
@ 2020-04-02 20:46 Amber Lin
2020-04-02 23:51 ` Felix Kuehling
0 siblings, 1 reply; 5+ messages in thread
From: Amber Lin @ 2020-04-02 20:46 UTC (permalink / raw)
To: amd-gfx; +Cc: Amber Lin
When the compute is malfunctioning or performance drops, the system admin
will use SMI (System Management Interface) tool to monitor/diagnostic what
went wrong. This patch provides an event watch interface for the user
space to register events they are interested. After the event is
registered, the user can use annoymous file descriptor's poll function
with wait-time specified to wait for the event to happen. Once the event
happens, the user can use read() to retrieve information related to the
event.
VM fault event is done in this patch.
v2: - remove UNREGISTER and add event ENABLE/DISABLE
- correct kfifo usage
- move event message API to kfd_ioctl.h
Signed-off-by: Amber Lin <Amber.Lin@amd.com>
---
drivers/gpu/drm/amd/amdkfd/Makefile | 3 +-
drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 2 +
drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 30 ++++
drivers/gpu/drm/amd/amdkfd/kfd_device.c | 1 +
drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 2 +
drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 12 ++
drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 177 +++++++++++++++++++++++
drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 31 ++++
include/uapi/linux/kfd_ioctl.h | 30 +++-
9 files changed, 286 insertions(+), 2 deletions(-)
create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
index 6147462..cc98b4a 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,7 +53,8 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
$(AMDKFD_PATH)/kfd_dbgdev.o \
$(AMDKFD_PATH)/kfd_dbgmgr.o \
- $(AMDKFD_PATH)/kfd_crat.o
+ $(AMDKFD_PATH)/kfd_crat.o \
+ $(AMDKFD_PATH)/kfd_smi_events.o
ifneq ($(CONFIG_AMD_IOMMU_V2),)
AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
#include "kfd_events.h"
#include "cik_int.h"
#include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
static bool cik_event_interrupt_isr(struct kfd_dev *dev,
const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
struct kfd_vm_fault_info info;
+ kfd_smi_event_update_vmfault(dev, pasid);
kfd_process_vm_fault(dev->dqm, pasid);
memset(&info, 0, sizeof(info));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f8fa03a..591ac28 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
#include "kfd_device_queue_manager.h"
#include "kfd_dbgmgr.h"
#include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
static long kfd_ioctl(struct file *, unsigned int, unsigned long);
static int kfd_open(struct inode *, struct file *);
@@ -1243,6 +1244,32 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p,
return ret;
}
+/* Handle requests for watching SMI events */
+static int kfd_ioctl_smi_events(struct file *filep,
+ struct kfd_process *p, void *data)
+{
+ struct kfd_ioctl_smi_events_args *args = data;
+ struct kfd_dev *dev;
+
+ dev = kfd_device_by_id(args->gpu_id);
+ if (!dev)
+ return -EINVAL;
+
+ switch (args->op) {
+ case KFD_SMI_EVENTS_REGISTER:
+ /* register the device */
+ return kfd_smi_event_register(dev, &args->data);
+ case KFD_SMI_EVENTS_ENABLE:
+ /* subscribe events to the device */
+ return kfd_smi_event_enable(dev, args->events);
+ case KFD_SMI_EVENTS_DISABLE:
+ /* unsubscribe events */
+ return kfd_smi_event_disable(dev, args->events);
+ }
+
+ return -EINVAL;
+}
+
bool kfd_dev_is_large_bar(struct kfd_dev *dev)
{
struct kfd_local_mem_info mem_info;
@@ -1827,6 +1854,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
kfd_ioctl_alloc_queue_gws, 0),
+
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
+ kfd_ioctl_smi_events, 0),
};
#define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 0491ab2..6ac6f31 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -532,6 +532,7 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
kfd->device_info = device_info;
kfd->pdev = pdev;
kfd->init_complete = false;
+ kfd->smi_events.registered = false;
kfd->kfd2kgd = f2g;
atomic_set(&kfd->compute_profile, 0);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index e05d75e..151e83e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -24,6 +24,7 @@
#include "kfd_events.h"
#include "soc15_int.h"
#include "kfd_device_queue_manager.h"
+#include "kfd_smi_events.h"
static bool event_interrupt_isr_v9(struct kfd_dev *dev,
const uint32_t *ih_ring_entry,
@@ -117,6 +118,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
info.prot_read = ring_id & 0x10;
info.prot_write = ring_id & 0x20;
+ kfd_smi_event_update_vmfault(dev, pasid);
kfd_process_vm_fault(dev->dqm, pasid);
kfd_signal_vm_fault_event(dev, pasid, &info);
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 43b888b..b10b665 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -223,6 +223,15 @@ struct kfd_vmid_info {
uint32_t vmid_num_kfd;
};
+struct kfd_smi_events {
+ /* device is registered to watch events */
+ bool registered;
+ /* events enabled */
+ uint32_t events;
+ DECLARE_KFIFO_PTR(fifo, uint32_t);
+ wait_queue_head_t wait_queue;
+};
+
struct kfd_dev {
struct kgd_dev *kgd;
@@ -309,6 +318,9 @@ struct kfd_dev {
/* Global GWS resource shared b/t processes*/
void *gws;
+
+ /* if this device is in SMI events watch */
+ struct kfd_smi_events smi_events;
};
enum kfd_mempool {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
new file mode 100644
index 0000000..aab4dac
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -0,0 +1,177 @@
+/*
+ * Copyright 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <linux/poll.h>
+#include <linux/wait.h>
+#include <linux/anon_inodes.h>
+#include <uapi/linux/kfd_ioctl.h>
+#include "amdgpu_vm.h"
+#include "kfd_priv.h"
+#include "kfd_smi_events.h"
+
+static DEFINE_MUTEX(kfd_smi_mutex);
+struct mutex *kfd_get_smi_mutex(void)
+{
+ return &kfd_smi_mutex;
+}
+
+static __poll_t kfd_smi_ev_poll(struct file *, struct poll_table_struct *);
+static ssize_t kfd_smi_ev_read(struct file *, char __user *, size_t, loff_t *);
+static int kfd_smi_ev_release(struct inode *, struct file *);
+
+static uint32_t smi_fifo_len;
+static const char kfd_smi_name[] = "kfd_smi_ev";
+
+static const struct file_operations kfd_smi_ev_fops = {
+ .owner = THIS_MODULE,
+ .poll = kfd_smi_ev_poll,
+ .read = kfd_smi_ev_read,
+ .release = kfd_smi_ev_release
+};
+
+static __poll_t kfd_smi_ev_poll(struct file *filep,
+ struct poll_table_struct *wait)
+{
+ __poll_t mask;
+ struct kfd_dev *dev = filep->private_data;
+
+ poll_wait(filep, &dev->smi_events.wait_queue, wait);
+ mask = kfifo_is_empty(&dev->smi_events.fifo) ? 0: POLLIN | POLLRDNORM;
+
+ return mask;
+}
+
+static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user,
+ size_t size, loff_t *offset)
+{
+ int ret, copied = 0;
+ struct kfd_dev *dev = filep->private_data;
+
+ mutex_lock(kfd_get_smi_mutex());
+ ret = kfifo_to_user(&dev->smi_events.fifo, user, size, &copied);
+ mutex_unlock(kfd_get_smi_mutex());
+ if (ret || !copied) {
+ pr_debug("smi-events: fail to send msg (%i) (%i)\n",
+ ret, copied);
+ return ret ? ret : -EAGAIN;
+ }
+
+ return copied;
+}
+
+static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
+{
+ struct kfd_dev *dev = filep->private_data;
+
+ dev->smi_events.events = 0;
+ kfifo_free(&dev->smi_events.fifo);
+
+ return 0;
+}
+
+void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
+ struct amdgpu_task_info task_info;
+ struct kfd_smi_msg_vmfault msg;
+ uint32_t *fifo_in = (uint32_t *)&msg;
+ uint32_t fifo_in_len = sizeof(msg) / sizeof(uint32_t);
+
+ if (!(dev->smi_events.events & KFD_SMI_EVENT_VMFAULT))
+ return;
+
+ amdgpu_vm_get_task_info(adev, pasid, &task_info);
+ msg.pid = task_info.pid;
+ strncpy(msg.task_name, task_info.task_name, 16);
+
+ mutex_lock(kfd_get_smi_mutex());
+ if ((fifo_in_len + 1) >
+ (smi_fifo_len - kfifo_len(&dev->smi_events.fifo))) {
+ pr_err("smi_event: no space left for vmfault event\n");
+ mutex_unlock(kfd_get_smi_mutex());
+ return;
+ }
+ /* Always send the event type first to function as a header */
+ kfifo_put(&dev->smi_events.fifo, KFD_SMI_EVENT_VMFAULT);
+ /* Then the msg as the payload. Event type reveals the payload size. */
+ kfifo_in(&dev->smi_events.fifo, fifo_in, fifo_in_len);
+ mutex_unlock(kfd_get_smi_mutex());
+
+ wake_up_all(&dev->smi_events.wait_queue);
+}
+
+int kfd_smi_event_disable(struct kfd_dev *dev, uint64_t events)
+{
+ mutex_lock(kfd_get_smi_mutex());
+ dev->smi_events.events &= ~events;
+ mutex_unlock(kfd_get_smi_mutex());
+
+ return 0;
+}
+
+int kfd_smi_event_enable(struct kfd_dev *dev, uint64_t events)
+{
+ /* If the user didn't register SMI events for this device, kfifo is not
+ * created to report events.
+ */
+ if (!dev->smi_events.registered)
+ return -EINVAL;
+
+ mutex_lock(kfd_get_smi_mutex());
+ dev->smi_events.events |= events;
+ mutex_unlock(kfd_get_smi_mutex());
+
+ return 0;
+}
+
+static void assign_fifo_len(void)
+{
+ /* When a new event is added into support and this new event msg uses
+ * more memory, replace the msg struct here.
+ */
+ uint32_t max_msg = sizeof(struct kfd_smi_msg_vmfault)/sizeof(uint32_t);
+ /* +1 for the event type in front of event message. up to 32 entries */
+ smi_fifo_len = (++max_msg) * 32;
+}
+
+int kfd_smi_event_register(struct kfd_dev *dev, int *fd)
+{
+ int ret = 0;
+
+ if (!smi_fifo_len)
+ assign_fifo_len();
+
+ ret = kfifo_alloc(&dev->smi_events.fifo, smi_fifo_len, GFP_KERNEL);
+ if (ret) {
+ pr_err("smi_event: fail to allocate kfifo\n");
+ return ret;
+ }
+ init_waitqueue_head(&dev->smi_events.wait_queue);
+ dev->smi_events.events = 0;
+ dev->smi_events.registered = true;
+
+ ret = anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops,
+ (void *)dev, 0);
+ *fd = ret;
+
+ return ret;
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
new file mode 100644
index 0000000..e10bb5d
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef KFD_SMI_EVENTS_H_INCLUDED
+#define KFD_SMI_EVENTS_H_INCLUDED
+
+int kfd_smi_event_register(struct kfd_dev *dev, int *fd);
+int kfd_smi_event_enable(struct kfd_dev *dev, uint64_t events);
+int kfd_smi_event_disable(struct kfd_dev *dev, uint64_t events);
+void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
+
+#endif
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 4f66764..dc9309e 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -442,6 +442,31 @@ struct kfd_ioctl_import_dmabuf_args {
__u32 dmabuf_fd; /* to KFD */
};
+/*
+ * KFD SMI(System Management Interface) events
+ */
+enum kfd_smi_events_op {
+ KFD_SMI_EVENTS_REGISTER = 1,
+ KFD_SMI_EVENTS_ENABLE,
+ KFD_SMI_EVENTS_DISABLE
+};
+
+/* Event type (defined by bitmask) */
+#define KFD_SMI_EVENT_VMFAULT 0x00000001
+
+struct kfd_ioctl_smi_events_args {
+ __u32 op; /* to KFD */
+ __u32 events; /* to KFD */
+ __u32 gpu_id; /* to KFD */
+ __u32 data; /* from KFD */
+};
+
+/* Message to user space for each event */
+struct kfd_smi_msg_vmfault {
+ uint32_t pid;
+ char task_name[16];
+};
+
/* Register offset inside the remapped mmio page
*/
enum kfd_mmio_remap {
@@ -546,7 +571,10 @@ enum kfd_mmio_remap {
#define AMDKFD_IOC_ALLOC_QUEUE_GWS \
AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args)
+#define AMDKFD_IOC_SMI_EVENTS \
+ AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args)
+
#define AMDKFD_COMMAND_START 0x01
-#define AMDKFD_COMMAND_END 0x1F
+#define AMDKFD_COMMAND_END 0x20
#endif
--
2.7.4
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 5+ messages in thread* Re: [PATCH v2] drm/amdkfd: Provide SMI events watch 2020-04-02 20:46 [PATCH v2] drm/amdkfd: Provide SMI events watch Amber Lin @ 2020-04-02 23:51 ` Felix Kuehling 2020-04-03 13:38 ` Amber Lin 0 siblings, 1 reply; 5+ messages in thread From: Felix Kuehling @ 2020-04-02 23:51 UTC (permalink / raw) To: Amber Lin, amd-gfx On 2020-04-02 4:46 p.m., Amber Lin wrote: > When the compute is malfunctioning or performance drops, the system admin > will use SMI (System Management Interface) tool to monitor/diagnostic what > went wrong. This patch provides an event watch interface for the user > space to register events they are interested. After the event is > registered, the user can use annoymous file descriptor's poll function > with wait-time specified to wait for the event to happen. Once the event > happens, the user can use read() to retrieve information related to the > event. > > VM fault event is done in this patch. > > v2: - remove UNREGISTER and add event ENABLE/DISABLE > - correct kfifo usage > - move event message API to kfd_ioctl.h > > Signed-off-by: Amber Lin <Amber.Lin@amd.com> > --- > drivers/gpu/drm/amd/amdkfd/Makefile | 3 +- > drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 2 + > drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 30 ++++ > drivers/gpu/drm/amd/amdkfd/kfd_device.c | 1 + > drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 2 + > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 12 ++ > drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 177 +++++++++++++++++++++++ > drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 31 ++++ > include/uapi/linux/kfd_ioctl.h | 30 +++- > 9 files changed, 286 insertions(+), 2 deletions(-) > create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c > create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h > > diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile > index 6147462..cc98b4a 100644 > --- a/drivers/gpu/drm/amd/amdkfd/Makefile > +++ b/drivers/gpu/drm/amd/amdkfd/Makefile > @@ -53,7 +53,8 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ > $(AMDKFD_PATH)/kfd_int_process_v9.o \ > $(AMDKFD_PATH)/kfd_dbgdev.o \ > $(AMDKFD_PATH)/kfd_dbgmgr.o \ > - $(AMDKFD_PATH)/kfd_crat.o > + $(AMDKFD_PATH)/kfd_crat.o \ > + $(AMDKFD_PATH)/kfd_smi_events.o > > ifneq ($(CONFIG_AMD_IOMMU_V2),) > AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o > diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c > index 9f59ba9..24b4717 100644 > --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c > +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c > @@ -24,6 +24,7 @@ > #include "kfd_events.h" > #include "cik_int.h" > #include "amdgpu_amdkfd.h" > +#include "kfd_smi_events.h" > > static bool cik_event_interrupt_isr(struct kfd_dev *dev, > const uint32_t *ih_ring_entry, > @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, > ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { > struct kfd_vm_fault_info info; > > + kfd_smi_event_update_vmfault(dev, pasid); > kfd_process_vm_fault(dev->dqm, pasid); > > memset(&info, 0, sizeof(info)); > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > index f8fa03a..591ac28 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > @@ -39,6 +39,7 @@ > #include "kfd_device_queue_manager.h" > #include "kfd_dbgmgr.h" > #include "amdgpu_amdkfd.h" > +#include "kfd_smi_events.h" > > static long kfd_ioctl(struct file *, unsigned int, unsigned long); > static int kfd_open(struct inode *, struct file *); > @@ -1243,6 +1244,32 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p, > return ret; > } > > +/* Handle requests for watching SMI events */ > +static int kfd_ioctl_smi_events(struct file *filep, > + struct kfd_process *p, void *data) > +{ > + struct kfd_ioctl_smi_events_args *args = data; > + struct kfd_dev *dev; > + > + dev = kfd_device_by_id(args->gpu_id); > + if (!dev) > + return -EINVAL; > + > + switch (args->op) { > + case KFD_SMI_EVENTS_REGISTER: > + /* register the device */ > + return kfd_smi_event_register(dev, &args->data); > + case KFD_SMI_EVENTS_ENABLE: > + /* subscribe events to the device */ > + return kfd_smi_event_enable(dev, args->events); > + case KFD_SMI_EVENTS_DISABLE: > + /* unsubscribe events */ > + return kfd_smi_event_disable(dev, args->events); > + } > + > + return -EINVAL; > +} > + > bool kfd_dev_is_large_bar(struct kfd_dev *dev) > { > struct kfd_local_mem_info mem_info; > @@ -1827,6 +1854,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { > > AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS, > kfd_ioctl_alloc_queue_gws, 0), > + > + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS, > + kfd_ioctl_smi_events, 0), > }; > > #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > index 0491ab2..6ac6f31 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > @@ -532,6 +532,7 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, > kfd->device_info = device_info; > kfd->pdev = pdev; > kfd->init_complete = false; > + kfd->smi_events.registered = false; > kfd->kfd2kgd = f2g; > atomic_set(&kfd->compute_profile, 0); > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c > index e05d75e..151e83e 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c > @@ -24,6 +24,7 @@ > #include "kfd_events.h" > #include "soc15_int.h" > #include "kfd_device_queue_manager.h" > +#include "kfd_smi_events.h" > > static bool event_interrupt_isr_v9(struct kfd_dev *dev, > const uint32_t *ih_ring_entry, > @@ -117,6 +118,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev, > info.prot_read = ring_id & 0x10; > info.prot_write = ring_id & 0x20; > > + kfd_smi_event_update_vmfault(dev, pasid); > kfd_process_vm_fault(dev->dqm, pasid); > kfd_signal_vm_fault_event(dev, pasid, &info); > } > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > index 43b888b..b10b665 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > @@ -223,6 +223,15 @@ struct kfd_vmid_info { > uint32_t vmid_num_kfd; > }; > > +struct kfd_smi_events { > + /* device is registered to watch events */ > + bool registered; > + /* events enabled */ > + uint32_t events; This should be 64-bit to allow for more future expansion. It doesn't matter as much here, because you can always change the internal header later. But in the ioctl API we can't change it later, so we should define it as 64-bit from the start. > + DECLARE_KFIFO_PTR(fifo, uint32_t); > + wait_queue_head_t wait_queue; > +}; > + > struct kfd_dev { > struct kgd_dev *kgd; > > @@ -309,6 +318,9 @@ struct kfd_dev { > > /* Global GWS resource shared b/t processes*/ > void *gws; > + > + /* if this device is in SMI events watch */ > + struct kfd_smi_events smi_events; > }; > > enum kfd_mempool { > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c > new file mode 100644 > index 0000000..aab4dac > --- /dev/null > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c > @@ -0,0 +1,177 @@ > +/* > + * Copyright 2020 Advanced Micro Devices, Inc. > + * > + * Permission is hereby granted, free of charge, to any person obtaining a > + * copy of this software and associated documentation files (the "Software"), > + * to deal in the Software without restriction, including without limitation > + * the rights to use, copy, modify, merge, publish, distribute, sublicense, > + * and/or sell copies of the Software, and to permit persons to whom the > + * Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice shall be included in > + * all copies or substantial portions of the Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR > + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, > + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR > + * OTHER DEALINGS IN THE SOFTWARE. > + */ > + > +#include <linux/poll.h> > +#include <linux/wait.h> > +#include <linux/anon_inodes.h> > +#include <uapi/linux/kfd_ioctl.h> > +#include "amdgpu_vm.h" > +#include "kfd_priv.h" > +#include "kfd_smi_events.h" > + > +static DEFINE_MUTEX(kfd_smi_mutex); > +struct mutex *kfd_get_smi_mutex(void) > +{ > + return &kfd_smi_mutex; > +} Why do you need a function for this. Just reference kfd_smi_mutex directly below. But since the fifo is per device, should there also be one mutex per device? > + > +static __poll_t kfd_smi_ev_poll(struct file *, struct poll_table_struct *); > +static ssize_t kfd_smi_ev_read(struct file *, char __user *, size_t, loff_t *); > +static int kfd_smi_ev_release(struct inode *, struct file *); > + > +static uint32_t smi_fifo_len; This could be statically initialized rather than doing it in a function below because this can be calculated at compile time. It's also constant: static const uint32_t smi_fifo_len = 32 * (sizeof(struct kfd_smi_msg_vmfault)/sizeof(uint32_t) + 1); Or you could probably even make is a #define. > +static const char kfd_smi_name[] = "kfd_smi_ev"; > + > +static const struct file_operations kfd_smi_ev_fops = { > + .owner = THIS_MODULE, > + .poll = kfd_smi_ev_poll, > + .read = kfd_smi_ev_read, > + .release = kfd_smi_ev_release > +}; > + > +static __poll_t kfd_smi_ev_poll(struct file *filep, > + struct poll_table_struct *wait) > +{ > + __poll_t mask; > + struct kfd_dev *dev = filep->private_data; > + > + poll_wait(filep, &dev->smi_events.wait_queue, wait); > + mask = kfifo_is_empty(&dev->smi_events.fifo) ? 0: POLLIN | POLLRDNORM; > + > + return mask; > +} > + > +static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user, > + size_t size, loff_t *offset) > +{ > + int ret, copied = 0; > + struct kfd_dev *dev = filep->private_data; > + > + mutex_lock(kfd_get_smi_mutex()); > + ret = kfifo_to_user(&dev->smi_events.fifo, user, size, &copied); > + mutex_unlock(kfd_get_smi_mutex()); > + if (ret || !copied) { > + pr_debug("smi-events: fail to send msg (%i) (%i)\n", > + ret, copied); > + return ret ? ret : -EAGAIN; > + } > + > + return copied; > +} > + > +static int kfd_smi_ev_release(struct inode *inode, struct file *filep) > +{ > + struct kfd_dev *dev = filep->private_data; > + > + dev->smi_events.events = 0; > + kfifo_free(&dev->smi_events.fifo); This can cause race conditions. You should do this under the fifo mutex. Also set dev->smi_events.registered = false to allow a new registration after this. > + > + return 0; > +} > + > +void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid) > +{ > + struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd; > + struct amdgpu_task_info task_info; > + struct kfd_smi_msg_vmfault msg; > + uint32_t *fifo_in = (uint32_t *)&msg; > + uint32_t fifo_in_len = sizeof(msg) / sizeof(uint32_t); > + > + if (!(dev->smi_events.events & KFD_SMI_EVENT_VMFAULT)) > + return; You need to do this while holding the fifo mutex, otherwise the fifo can be destroyed before you access it below. > + > + amdgpu_vm_get_task_info(adev, pasid, &task_info); > + msg.pid = task_info.pid; > + strncpy(msg.task_name, task_info.task_name, 16); > + > + mutex_lock(kfd_get_smi_mutex()); > + if ((fifo_in_len + 1) > > + (smi_fifo_len - kfifo_len(&dev->smi_events.fifo))) { You could use kfifo_avail to get the space available in the fifo. > + pr_err("smi_event: no space left for vmfault event\n"); > + mutex_unlock(kfd_get_smi_mutex()); > + return; > + } > + /* Always send the event type first to function as a header */ > + kfifo_put(&dev->smi_events.fifo, KFD_SMI_EVENT_VMFAULT); > + /* Then the msg as the payload. Event type reveals the payload size. */ > + kfifo_in(&dev->smi_events.fifo, fifo_in, fifo_in_len); > + mutex_unlock(kfd_get_smi_mutex()); > + > + wake_up_all(&dev->smi_events.wait_queue); > +} > + > +int kfd_smi_event_disable(struct kfd_dev *dev, uint64_t events) > +{ > + mutex_lock(kfd_get_smi_mutex()); > + dev->smi_events.events &= ~events; > + mutex_unlock(kfd_get_smi_mutex()); > + > + return 0; > +} > + > +int kfd_smi_event_enable(struct kfd_dev *dev, uint64_t events) > +{ > + /* If the user didn't register SMI events for this device, kfifo is not > + * created to report events. > + */ > + if (!dev->smi_events.registered) > + return -EINVAL; > + > + mutex_lock(kfd_get_smi_mutex()); > + dev->smi_events.events |= events; > + mutex_unlock(kfd_get_smi_mutex()); > + > + return 0; > +} > + > +static void assign_fifo_len(void) > +{ > + /* When a new event is added into support and this new event msg uses > + * more memory, replace the msg struct here. > + */ > + uint32_t max_msg = sizeof(struct kfd_smi_msg_vmfault)/sizeof(uint32_t); > + /* +1 for the event type in front of event message. up to 32 entries */ > + smi_fifo_len = (++max_msg) * 32; See above. This could be statically initialized. > +} > + > +int kfd_smi_event_register(struct kfd_dev *dev, int *fd) > +{ > + int ret = 0; This function should return failure if the event was already registered. You can only allow one FD to be created per device, because that FD owns the FIFO. This entire function must be under the fifo mutex. > + > + if (!smi_fifo_len) > + assign_fifo_len(); > + > + ret = kfifo_alloc(&dev->smi_events.fifo, smi_fifo_len, GFP_KERNEL); > + if (ret) { > + pr_err("smi_event: fail to allocate kfifo\n"); > + return ret; > + } > + init_waitqueue_head(&dev->smi_events.wait_queue); > + dev->smi_events.events = 0; > + dev->smi_events.registered = true; > + > + ret = anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops, > + (void *)dev, 0); > + *fd = ret; > + > + return ret; > +} > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h > new file mode 100644 > index 0000000..e10bb5d > --- /dev/null > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h > @@ -0,0 +1,31 @@ > +/* > + * Copyright 2020 Advanced Micro Devices, Inc. > + * > + * Permission is hereby granted, free of charge, to any person obtaining a > + * copy of this software and associated documentation files (the "Software"), > + * to deal in the Software without restriction, including without limitation > + * the rights to use, copy, modify, merge, publish, distribute, sublicense, > + * and/or sell copies of the Software, and to permit persons to whom the > + * Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice shall be included in > + * all copies or substantial portions of the Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR > + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, > + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR > + * OTHER DEALINGS IN THE SOFTWARE. > + */ > + > +#ifndef KFD_SMI_EVENTS_H_INCLUDED > +#define KFD_SMI_EVENTS_H_INCLUDED > + > +int kfd_smi_event_register(struct kfd_dev *dev, int *fd); > +int kfd_smi_event_enable(struct kfd_dev *dev, uint64_t events); > +int kfd_smi_event_disable(struct kfd_dev *dev, uint64_t events); > +void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid); > + > +#endif > diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h > index 4f66764..dc9309e 100644 > --- a/include/uapi/linux/kfd_ioctl.h > +++ b/include/uapi/linux/kfd_ioctl.h > @@ -442,6 +442,31 @@ struct kfd_ioctl_import_dmabuf_args { > __u32 dmabuf_fd; /* to KFD */ > }; > > +/* > + * KFD SMI(System Management Interface) events > + */ > +enum kfd_smi_events_op { > + KFD_SMI_EVENTS_REGISTER = 1, > + KFD_SMI_EVENTS_ENABLE, > + KFD_SMI_EVENTS_DISABLE > +}; > + > +/* Event type (defined by bitmask) */ > +#define KFD_SMI_EVENT_VMFAULT 0x00000001 Since you can only have 64 events, you only need one byte. You have plenty of space in the header dword to add the message size. That would make it easier to parse the events and allow variable size events in the future. Then you should define the header as a struct. Something like this: struct kfd_smi_msg_header { __u8 type; __u8 pad; __u16 size; }; > + > +struct kfd_ioctl_smi_events_args { > + __u32 op; /* to KFD */ > + __u32 events; /* to KFD */ I thought this should be 64-bit. > + __u32 gpu_id; /* to KFD */ > + __u32 data; /* from KFD */ > +}; > + > +/* Message to user space for each event */ > +struct kfd_smi_msg_vmfault { If you define the header like suggested above, you could include it here and at the start of all message structures. Might simplify the code a little: struct kfd_smi_msg_header header; Regards, Felix > + uint32_t pid; > + char task_name[16]; > +}; > + > /* Register offset inside the remapped mmio page > */ > enum kfd_mmio_remap { > @@ -546,7 +571,10 @@ enum kfd_mmio_remap { > #define AMDKFD_IOC_ALLOC_QUEUE_GWS \ > AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args) > > +#define AMDKFD_IOC_SMI_EVENTS \ > + AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args) > + > #define AMDKFD_COMMAND_START 0x01 > -#define AMDKFD_COMMAND_END 0x1F > +#define AMDKFD_COMMAND_END 0x20 > > #endif _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH v2] drm/amdkfd: Provide SMI events watch 2020-04-02 23:51 ` Felix Kuehling @ 2020-04-03 13:38 ` Amber Lin 2020-04-03 15:38 ` Amber Lin 0 siblings, 1 reply; 5+ messages in thread From: Amber Lin @ 2020-04-03 13:38 UTC (permalink / raw) To: Felix Kuehling, amd-gfx Thanks Felix. I'll make changes accordingly but please pay attention to my last reply inline. On 2020-04-02 7:51 p.m., Felix Kuehling wrote: > On 2020-04-02 4:46 p.m., Amber Lin wrote: >> When the compute is malfunctioning or performance drops, the system >> admin >> will use SMI (System Management Interface) tool to monitor/diagnostic >> what >> went wrong. This patch provides an event watch interface for the user >> space to register events they are interested. After the event is >> registered, the user can use annoymous file descriptor's poll function >> with wait-time specified to wait for the event to happen. Once the event >> happens, the user can use read() to retrieve information related to the >> event. >> >> VM fault event is done in this patch. >> >> v2: - remove UNREGISTER and add event ENABLE/DISABLE >> - correct kfifo usage >> - move event message API to kfd_ioctl.h >> >> Signed-off-by: Amber Lin <Amber.Lin@amd.com> >> --- >> drivers/gpu/drm/amd/amdkfd/Makefile | 3 +- >> drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 2 + >> drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 30 ++++ >> drivers/gpu/drm/amd/amdkfd/kfd_device.c | 1 + >> drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 2 + >> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 12 ++ >> drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 177 >> +++++++++++++++++++++++ >> drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 31 ++++ >> include/uapi/linux/kfd_ioctl.h | 30 +++- >> 9 files changed, 286 insertions(+), 2 deletions(-) >> create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c >> create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h >> >> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile >> b/drivers/gpu/drm/amd/amdkfd/Makefile >> index 6147462..cc98b4a 100644 >> --- a/drivers/gpu/drm/amd/amdkfd/Makefile >> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile >> @@ -53,7 +53,8 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ >> $(AMDKFD_PATH)/kfd_int_process_v9.o \ >> $(AMDKFD_PATH)/kfd_dbgdev.o \ >> $(AMDKFD_PATH)/kfd_dbgmgr.o \ >> - $(AMDKFD_PATH)/kfd_crat.o >> + $(AMDKFD_PATH)/kfd_crat.o \ >> + $(AMDKFD_PATH)/kfd_smi_events.o >> ifneq ($(CONFIG_AMD_IOMMU_V2),) >> AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o >> diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c >> b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c >> index 9f59ba9..24b4717 100644 >> --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c >> +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c >> @@ -24,6 +24,7 @@ >> #include "kfd_events.h" >> #include "cik_int.h" >> #include "amdgpu_amdkfd.h" >> +#include "kfd_smi_events.h" >> static bool cik_event_interrupt_isr(struct kfd_dev *dev, >> const uint32_t *ih_ring_entry, >> @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev >> *dev, >> ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { >> struct kfd_vm_fault_info info; >> + kfd_smi_event_update_vmfault(dev, pasid); >> kfd_process_vm_fault(dev->dqm, pasid); >> memset(&info, 0, sizeof(info)); >> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c >> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c >> index f8fa03a..591ac28 100644 >> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c >> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c >> @@ -39,6 +39,7 @@ >> #include "kfd_device_queue_manager.h" >> #include "kfd_dbgmgr.h" >> #include "amdgpu_amdkfd.h" >> +#include "kfd_smi_events.h" >> static long kfd_ioctl(struct file *, unsigned int, unsigned long); >> static int kfd_open(struct inode *, struct file *); >> @@ -1243,6 +1244,32 @@ static int kfd_ioctl_acquire_vm(struct file >> *filep, struct kfd_process *p, >> return ret; >> } >> +/* Handle requests for watching SMI events */ >> +static int kfd_ioctl_smi_events(struct file *filep, >> + struct kfd_process *p, void *data) >> +{ >> + struct kfd_ioctl_smi_events_args *args = data; >> + struct kfd_dev *dev; >> + >> + dev = kfd_device_by_id(args->gpu_id); >> + if (!dev) >> + return -EINVAL; >> + >> + switch (args->op) { >> + case KFD_SMI_EVENTS_REGISTER: >> + /* register the device */ >> + return kfd_smi_event_register(dev, &args->data); >> + case KFD_SMI_EVENTS_ENABLE: >> + /* subscribe events to the device */ >> + return kfd_smi_event_enable(dev, args->events); >> + case KFD_SMI_EVENTS_DISABLE: >> + /* unsubscribe events */ >> + return kfd_smi_event_disable(dev, args->events); >> + } >> + >> + return -EINVAL; >> +} >> + >> bool kfd_dev_is_large_bar(struct kfd_dev *dev) >> { >> struct kfd_local_mem_info mem_info; >> @@ -1827,6 +1854,9 @@ static const struct amdkfd_ioctl_desc >> amdkfd_ioctls[] = { >> AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS, >> kfd_ioctl_alloc_queue_gws, 0), >> + >> + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS, >> + kfd_ioctl_smi_events, 0), >> }; >> #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) >> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c >> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c >> index 0491ab2..6ac6f31 100644 >> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c >> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c >> @@ -532,6 +532,7 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, >> kfd->device_info = device_info; >> kfd->pdev = pdev; >> kfd->init_complete = false; >> + kfd->smi_events.registered = false; >> kfd->kfd2kgd = f2g; >> atomic_set(&kfd->compute_profile, 0); >> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c >> b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c >> index e05d75e..151e83e 100644 >> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c >> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c >> @@ -24,6 +24,7 @@ >> #include "kfd_events.h" >> #include "soc15_int.h" >> #include "kfd_device_queue_manager.h" >> +#include "kfd_smi_events.h" >> static bool event_interrupt_isr_v9(struct kfd_dev *dev, >> const uint32_t *ih_ring_entry, >> @@ -117,6 +118,7 @@ static void event_interrupt_wq_v9(struct kfd_dev >> *dev, >> info.prot_read = ring_id & 0x10; >> info.prot_write = ring_id & 0x20; >> + kfd_smi_event_update_vmfault(dev, pasid); >> kfd_process_vm_fault(dev->dqm, pasid); >> kfd_signal_vm_fault_event(dev, pasid, &info); >> } >> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >> index 43b888b..b10b665 100644 >> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >> @@ -223,6 +223,15 @@ struct kfd_vmid_info { >> uint32_t vmid_num_kfd; >> }; >> +struct kfd_smi_events { >> + /* device is registered to watch events */ >> + bool registered; >> + /* events enabled */ >> + uint32_t events; > > This should be 64-bit to allow for more future expansion. It doesn't > matter as much here, because you can always change the internal header > later. But in the ioctl API we can't change it later, so we should > define it as 64-bit from the start. > Ok, I'll change back to 64 bits. I was thinking to use "data" as group if we have to run out of bits anyway, but with further thinking, we can squish several events into one since it's event "type", not "ID". 64 will be quite sufficient and no need to worry about running out of it. > >> + DECLARE_KFIFO_PTR(fifo, uint32_t); >> + wait_queue_head_t wait_queue; >> +}; >> + >> struct kfd_dev { >> struct kgd_dev *kgd; >> @@ -309,6 +318,9 @@ struct kfd_dev { >> /* Global GWS resource shared b/t processes*/ >> void *gws; >> + >> + /* if this device is in SMI events watch */ >> + struct kfd_smi_events smi_events; >> }; >> enum kfd_mempool { >> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c >> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c >> new file mode 100644 >> index 0000000..aab4dac >> --- /dev/null >> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c >> @@ -0,0 +1,177 @@ >> +/* >> + * Copyright 2020 Advanced Micro Devices, Inc. >> + * >> + * Permission is hereby granted, free of charge, to any person >> obtaining a >> + * copy of this software and associated documentation files (the >> "Software"), >> + * to deal in the Software without restriction, including without >> limitation >> + * the rights to use, copy, modify, merge, publish, distribute, >> sublicense, >> + * and/or sell copies of the Software, and to permit persons to whom >> the >> + * Software is furnished to do so, subject to the following conditions: >> + * >> + * The above copyright notice and this permission notice shall be >> included in >> + * all copies or substantial portions of the Software. >> + * >> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, >> EXPRESS OR >> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF >> MERCHANTABILITY, >> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO >> EVENT SHALL >> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, >> DAMAGES OR >> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR >> OTHERWISE, >> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE >> USE OR >> + * OTHER DEALINGS IN THE SOFTWARE. >> + */ >> + >> +#include <linux/poll.h> >> +#include <linux/wait.h> >> +#include <linux/anon_inodes.h> >> +#include <uapi/linux/kfd_ioctl.h> >> +#include "amdgpu_vm.h" >> +#include "kfd_priv.h" >> +#include "kfd_smi_events.h" >> + >> +static DEFINE_MUTEX(kfd_smi_mutex); >> +struct mutex *kfd_get_smi_mutex(void) >> +{ >> + return &kfd_smi_mutex; >> +} > > Why do you need a function for this. Just reference kfd_smi_mutex > directly below. But since the fifo is per device, should there also be > one mutex per device? > I'm not sure how to create mutex for smi_event so I referenced the debugger code. >> + >> +static __poll_t kfd_smi_ev_poll(struct file *, struct >> poll_table_struct *); >> +static ssize_t kfd_smi_ev_read(struct file *, char __user *, size_t, >> loff_t *); >> +static int kfd_smi_ev_release(struct inode *, struct file *); >> + >> +static uint32_t smi_fifo_len; > > This could be statically initialized rather than doing it in a > function below because this can be calculated at compile time. It's > also constant: > > static const uint32_t smi_fifo_len = 32 * (sizeof(struct > kfd_smi_msg_vmfault)/sizeof(uint32_t) + 1); > > Or you could probably even make is a #define. > True. I'll change it. > >> +static const char kfd_smi_name[] = "kfd_smi_ev"; >> + >> +static const struct file_operations kfd_smi_ev_fops = { >> + .owner = THIS_MODULE, >> + .poll = kfd_smi_ev_poll, >> + .read = kfd_smi_ev_read, >> + .release = kfd_smi_ev_release >> +}; >> + >> +static __poll_t kfd_smi_ev_poll(struct file *filep, >> + struct poll_table_struct *wait) >> +{ >> + __poll_t mask; >> + struct kfd_dev *dev = filep->private_data; >> + >> + poll_wait(filep, &dev->smi_events.wait_queue, wait); >> + mask = kfifo_is_empty(&dev->smi_events.fifo) ? 0: POLLIN | >> POLLRDNORM; >> + >> + return mask; >> +} >> + >> +static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user, >> + size_t size, loff_t *offset) >> +{ >> + int ret, copied = 0; >> + struct kfd_dev *dev = filep->private_data; >> + >> + mutex_lock(kfd_get_smi_mutex()); >> + ret = kfifo_to_user(&dev->smi_events.fifo, user, size, &copied); >> + mutex_unlock(kfd_get_smi_mutex()); >> + if (ret || !copied) { >> + pr_debug("smi-events: fail to send msg (%i) (%i)\n", >> + ret, copied); >> + return ret ? ret : -EAGAIN; >> + } >> + >> + return copied; >> +} >> + >> +static int kfd_smi_ev_release(struct inode *inode, struct file *filep) >> +{ >> + struct kfd_dev *dev = filep->private_data; >> + >> + dev->smi_events.events = 0; >> + kfifo_free(&dev->smi_events.fifo); > > This can cause race conditions. You should do this under the fifo > mutex. Also set dev->smi_events.registered = false to allow a new > registration after this. > Good catch. I'll add them. >> + >> + return 0; >> +} >> + >> +void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid) >> +{ >> + struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd; >> + struct amdgpu_task_info task_info; >> + struct kfd_smi_msg_vmfault msg; >> + uint32_t *fifo_in = (uint32_t *)&msg; >> + uint32_t fifo_in_len = sizeof(msg) / sizeof(uint32_t); >> + >> + if (!(dev->smi_events.events & KFD_SMI_EVENT_VMFAULT)) >> + return; > > You need to do this while holding the fifo mutex, otherwise the fifo > can be destroyed before you access it below. > Ok > >> + >> + amdgpu_vm_get_task_info(adev, pasid, &task_info); >> + msg.pid = task_info.pid; >> + strncpy(msg.task_name, task_info.task_name, 16); >> + >> + mutex_lock(kfd_get_smi_mutex()); >> + if ((fifo_in_len + 1) > >> + (smi_fifo_len - kfifo_len(&dev->smi_events.fifo))) { > > You could use kfifo_avail to get the space available in the fifo. > Ah. Thanks. > >> + pr_err("smi_event: no space left for vmfault event\n"); >> + mutex_unlock(kfd_get_smi_mutex()); >> + return; >> + } >> + /* Always send the event type first to function as a header */ >> + kfifo_put(&dev->smi_events.fifo, KFD_SMI_EVENT_VMFAULT); >> + /* Then the msg as the payload. Event type reveals the payload >> size. */ >> + kfifo_in(&dev->smi_events.fifo, fifo_in, fifo_in_len); >> + mutex_unlock(kfd_get_smi_mutex()); >> + >> + wake_up_all(&dev->smi_events.wait_queue); >> +} >> + >> +int kfd_smi_event_disable(struct kfd_dev *dev, uint64_t events) >> +{ >> + mutex_lock(kfd_get_smi_mutex()); >> + dev->smi_events.events &= ~events; >> + mutex_unlock(kfd_get_smi_mutex()); >> + >> + return 0; >> +} >> + >> +int kfd_smi_event_enable(struct kfd_dev *dev, uint64_t events) >> +{ >> + /* If the user didn't register SMI events for this device, kfifo >> is not >> + * created to report events. >> + */ >> + if (!dev->smi_events.registered) >> + return -EINVAL; >> + >> + mutex_lock(kfd_get_smi_mutex()); >> + dev->smi_events.events |= events; >> + mutex_unlock(kfd_get_smi_mutex()); >> + >> + return 0; >> +} >> + >> +static void assign_fifo_len(void) >> +{ >> + /* When a new event is added into support and this new event msg >> uses >> + * more memory, replace the msg struct here. >> + */ >> + uint32_t max_msg = sizeof(struct >> kfd_smi_msg_vmfault)/sizeof(uint32_t); >> + /* +1 for the event type in front of event message. up to 32 >> entries */ >> + smi_fifo_len = (++max_msg) * 32; > > See above. This could be statically initialized. > > >> +} >> + >> +int kfd_smi_event_register(struct kfd_dev *dev, int *fd) >> +{ >> + int ret = 0; > > This function should return failure if the event was already > registered. You can only allow one FD to be created per device, > because that FD owns the FIFO. > > This entire function must be under the fifo mutex. > Ok > >> + >> + if (!smi_fifo_len) >> + assign_fifo_len(); >> + >> + ret = kfifo_alloc(&dev->smi_events.fifo, smi_fifo_len, GFP_KERNEL); >> + if (ret) { >> + pr_err("smi_event: fail to allocate kfifo\n"); >> + return ret; >> + } >> + init_waitqueue_head(&dev->smi_events.wait_queue); >> + dev->smi_events.events = 0; >> + dev->smi_events.registered = true; >> + >> + ret = anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops, >> + (void *)dev, 0); >> + *fd = ret; >> + >> + return ret; >> +} >> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h >> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h >> new file mode 100644 >> index 0000000..e10bb5d >> --- /dev/null >> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h >> @@ -0,0 +1,31 @@ >> +/* >> + * Copyright 2020 Advanced Micro Devices, Inc. >> + * >> + * Permission is hereby granted, free of charge, to any person >> obtaining a >> + * copy of this software and associated documentation files (the >> "Software"), >> + * to deal in the Software without restriction, including without >> limitation >> + * the rights to use, copy, modify, merge, publish, distribute, >> sublicense, >> + * and/or sell copies of the Software, and to permit persons to whom >> the >> + * Software is furnished to do so, subject to the following conditions: >> + * >> + * The above copyright notice and this permission notice shall be >> included in >> + * all copies or substantial portions of the Software. >> + * >> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, >> EXPRESS OR >> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF >> MERCHANTABILITY, >> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO >> EVENT SHALL >> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, >> DAMAGES OR >> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR >> OTHERWISE, >> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE >> USE OR >> + * OTHER DEALINGS IN THE SOFTWARE. >> + */ >> + >> +#ifndef KFD_SMI_EVENTS_H_INCLUDED >> +#define KFD_SMI_EVENTS_H_INCLUDED >> + >> +int kfd_smi_event_register(struct kfd_dev *dev, int *fd); >> +int kfd_smi_event_enable(struct kfd_dev *dev, uint64_t events); >> +int kfd_smi_event_disable(struct kfd_dev *dev, uint64_t events); >> +void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid); >> + >> +#endif >> diff --git a/include/uapi/linux/kfd_ioctl.h >> b/include/uapi/linux/kfd_ioctl.h >> index 4f66764..dc9309e 100644 >> --- a/include/uapi/linux/kfd_ioctl.h >> +++ b/include/uapi/linux/kfd_ioctl.h >> @@ -442,6 +442,31 @@ struct kfd_ioctl_import_dmabuf_args { >> __u32 dmabuf_fd; /* to KFD */ >> }; >> +/* >> + * KFD SMI(System Management Interface) events >> + */ >> +enum kfd_smi_events_op { >> + KFD_SMI_EVENTS_REGISTER = 1, >> + KFD_SMI_EVENTS_ENABLE, >> + KFD_SMI_EVENTS_DISABLE >> +}; >> + >> +/* Event type (defined by bitmask) */ >> +#define KFD_SMI_EVENT_VMFAULT 0x00000001 > > Since you can only have 64 events, you only need one byte. You have > plenty of space in the header dword to add the message size. That > would make it easier to parse the events and allow variable size > events in the future. Then you should define the header as a struct. > Something like this: > > struct kfd_smi_msg_header { > __u8 type; > __u8 pad; > __u16 size; > }; > It sounds a good idea, but I don't see how it improves the parsing... The user reads the content based on the event type. My test code on the user space is like this: read(fd, &event, sizeof(uint32_t)); //will change uint64_t switch (event) { case event_A: read(fd, &struct_event_A_variable, sizeof(struct event_A)); case event_B: read(fd, &struct_event_B_variable, sizeof(struct event_B)); Not exactly like that, such as read() is done in a separate function, but you see the idea. This is what I mean by the event type tells the size itself. If I provide the header you suggested, the user still needs to decide which struct to use.... Unless we don't use struct for the message. I can change the output to become event:description_of_the event where description is a string. The header you suggest will apply to this method pretty well. > >> + >> +struct kfd_ioctl_smi_events_args { >> + __u32 op; /* to KFD */ >> + __u32 events; /* to KFD */ > > I thought this should be 64-bit. > > >> + __u32 gpu_id; /* to KFD */ >> + __u32 data; /* from KFD */ >> +}; >> + >> +/* Message to user space for each event */ >> +struct kfd_smi_msg_vmfault { > > If you define the header like suggested above, you could include it > here and at the start of all message structures. Might simplify the > code a little: > > struct kfd_smi_msg_header header; > > Regards, > Felix > >> + uint32_t pid; >> + char task_name[16]; >> +}; >> + >> /* Register offset inside the remapped mmio page >> */ >> enum kfd_mmio_remap { >> @@ -546,7 +571,10 @@ enum kfd_mmio_remap { >> #define AMDKFD_IOC_ALLOC_QUEUE_GWS \ >> AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args) >> +#define AMDKFD_IOC_SMI_EVENTS \ >> + AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args) >> + >> #define AMDKFD_COMMAND_START 0x01 >> -#define AMDKFD_COMMAND_END 0x1F >> +#define AMDKFD_COMMAND_END 0x20 >> #endif _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH v2] drm/amdkfd: Provide SMI events watch 2020-04-03 13:38 ` Amber Lin @ 2020-04-03 15:38 ` Amber Lin 2020-04-03 19:01 ` Kuehling, Felix 0 siblings, 1 reply; 5+ messages in thread From: Amber Lin @ 2020-04-03 15:38 UTC (permalink / raw) To: Felix Kuehling, amd-gfx Further thinking about it, I'll use struct kfd_smi_msg_header. Instead of using struct kfd_smi_msg_vmfault, it's a description about the event. This way we make it generic to all events. On 2020-04-03 9:38 a.m., Amber Lin wrote: > Thanks Felix. I'll make changes accordingly but please pay attention > to my last reply inline. > > On 2020-04-02 7:51 p.m., Felix Kuehling wrote: >> On 2020-04-02 4:46 p.m., Amber Lin wrote: >>> When the compute is malfunctioning or performance drops, the system >>> admin >>> will use SMI (System Management Interface) tool to >>> monitor/diagnostic what >>> went wrong. This patch provides an event watch interface for the user >>> space to register events they are interested. After the event is >>> registered, the user can use annoymous file descriptor's poll function >>> with wait-time specified to wait for the event to happen. Once the >>> event >>> happens, the user can use read() to retrieve information related to the >>> event. >>> >>> VM fault event is done in this patch. >>> >>> v2: - remove UNREGISTER and add event ENABLE/DISABLE >>> - correct kfifo usage >>> - move event message API to kfd_ioctl.h >>> >>> Signed-off-by: Amber Lin <Amber.Lin@amd.com> >>> --- >>> drivers/gpu/drm/amd/amdkfd/Makefile | 3 +- >>> drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 2 + >>> drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 30 ++++ >>> drivers/gpu/drm/amd/amdkfd/kfd_device.c | 1 + >>> drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 2 + >>> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 12 ++ >>> drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 177 >>> +++++++++++++++++++++++ >>> drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 31 ++++ >>> include/uapi/linux/kfd_ioctl.h | 30 +++- >>> 9 files changed, 286 insertions(+), 2 deletions(-) >>> create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c >>> create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h >>> >>> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile >>> b/drivers/gpu/drm/amd/amdkfd/Makefile >>> index 6147462..cc98b4a 100644 >>> --- a/drivers/gpu/drm/amd/amdkfd/Makefile >>> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile >>> @@ -53,7 +53,8 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ >>> $(AMDKFD_PATH)/kfd_int_process_v9.o \ >>> $(AMDKFD_PATH)/kfd_dbgdev.o \ >>> $(AMDKFD_PATH)/kfd_dbgmgr.o \ >>> - $(AMDKFD_PATH)/kfd_crat.o >>> + $(AMDKFD_PATH)/kfd_crat.o \ >>> + $(AMDKFD_PATH)/kfd_smi_events.o >>> ifneq ($(CONFIG_AMD_IOMMU_V2),) >>> AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o >>> diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c >>> b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c >>> index 9f59ba9..24b4717 100644 >>> --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c >>> +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c >>> @@ -24,6 +24,7 @@ >>> #include "kfd_events.h" >>> #include "cik_int.h" >>> #include "amdgpu_amdkfd.h" >>> +#include "kfd_smi_events.h" >>> static bool cik_event_interrupt_isr(struct kfd_dev *dev, >>> const uint32_t *ih_ring_entry, >>> @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct >>> kfd_dev *dev, >>> ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { >>> struct kfd_vm_fault_info info; >>> + kfd_smi_event_update_vmfault(dev, pasid); >>> kfd_process_vm_fault(dev->dqm, pasid); >>> memset(&info, 0, sizeof(info)); >>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c >>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c >>> index f8fa03a..591ac28 100644 >>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c >>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c >>> @@ -39,6 +39,7 @@ >>> #include "kfd_device_queue_manager.h" >>> #include "kfd_dbgmgr.h" >>> #include "amdgpu_amdkfd.h" >>> +#include "kfd_smi_events.h" >>> static long kfd_ioctl(struct file *, unsigned int, unsigned long); >>> static int kfd_open(struct inode *, struct file *); >>> @@ -1243,6 +1244,32 @@ static int kfd_ioctl_acquire_vm(struct file >>> *filep, struct kfd_process *p, >>> return ret; >>> } >>> +/* Handle requests for watching SMI events */ >>> +static int kfd_ioctl_smi_events(struct file *filep, >>> + struct kfd_process *p, void *data) >>> +{ >>> + struct kfd_ioctl_smi_events_args *args = data; >>> + struct kfd_dev *dev; >>> + >>> + dev = kfd_device_by_id(args->gpu_id); >>> + if (!dev) >>> + return -EINVAL; >>> + >>> + switch (args->op) { >>> + case KFD_SMI_EVENTS_REGISTER: >>> + /* register the device */ >>> + return kfd_smi_event_register(dev, &args->data); >>> + case KFD_SMI_EVENTS_ENABLE: >>> + /* subscribe events to the device */ >>> + return kfd_smi_event_enable(dev, args->events); >>> + case KFD_SMI_EVENTS_DISABLE: >>> + /* unsubscribe events */ >>> + return kfd_smi_event_disable(dev, args->events); >>> + } >>> + >>> + return -EINVAL; >>> +} >>> + >>> bool kfd_dev_is_large_bar(struct kfd_dev *dev) >>> { >>> struct kfd_local_mem_info mem_info; >>> @@ -1827,6 +1854,9 @@ static const struct amdkfd_ioctl_desc >>> amdkfd_ioctls[] = { >>> AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS, >>> kfd_ioctl_alloc_queue_gws, 0), >>> + >>> + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS, >>> + kfd_ioctl_smi_events, 0), >>> }; >>> #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) >>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c >>> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c >>> index 0491ab2..6ac6f31 100644 >>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c >>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c >>> @@ -532,6 +532,7 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, >>> kfd->device_info = device_info; >>> kfd->pdev = pdev; >>> kfd->init_complete = false; >>> + kfd->smi_events.registered = false; >>> kfd->kfd2kgd = f2g; >>> atomic_set(&kfd->compute_profile, 0); >>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c >>> b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c >>> index e05d75e..151e83e 100644 >>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c >>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c >>> @@ -24,6 +24,7 @@ >>> #include "kfd_events.h" >>> #include "soc15_int.h" >>> #include "kfd_device_queue_manager.h" >>> +#include "kfd_smi_events.h" >>> static bool event_interrupt_isr_v9(struct kfd_dev *dev, >>> const uint32_t *ih_ring_entry, >>> @@ -117,6 +118,7 @@ static void event_interrupt_wq_v9(struct kfd_dev >>> *dev, >>> info.prot_read = ring_id & 0x10; >>> info.prot_write = ring_id & 0x20; >>> + kfd_smi_event_update_vmfault(dev, pasid); >>> kfd_process_vm_fault(dev->dqm, pasid); >>> kfd_signal_vm_fault_event(dev, pasid, &info); >>> } >>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>> index 43b888b..b10b665 100644 >>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>> @@ -223,6 +223,15 @@ struct kfd_vmid_info { >>> uint32_t vmid_num_kfd; >>> }; >>> +struct kfd_smi_events { >>> + /* device is registered to watch events */ >>> + bool registered; >>> + /* events enabled */ >>> + uint32_t events; >> >> This should be 64-bit to allow for more future expansion. It doesn't >> matter as much here, because you can always change the internal >> header later. But in the ioctl API we can't change it later, so we >> should define it as 64-bit from the start. >> > Ok, I'll change back to 64 bits. I was thinking to use "data" as group > if we have to run out of bits anyway, but with further thinking, we > can squish several events into one since it's event "type", not "ID". > 64 will be quite sufficient and no need to worry about running out of it. Ignore what I wrote about squishing events. I was thinking the output format. >> >>> + DECLARE_KFIFO_PTR(fifo, uint32_t); >>> + wait_queue_head_t wait_queue; >>> +}; >>> + >>> struct kfd_dev { >>> struct kgd_dev *kgd; >>> @@ -309,6 +318,9 @@ struct kfd_dev { >>> /* Global GWS resource shared b/t processes*/ >>> void *gws; >>> + >>> + /* if this device is in SMI events watch */ >>> + struct kfd_smi_events smi_events; >>> }; >>> enum kfd_mempool { >>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c >>> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c >>> new file mode 100644 >>> index 0000000..aab4dac >>> --- /dev/null >>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c >>> @@ -0,0 +1,177 @@ >>> +/* >>> + * Copyright 2020 Advanced Micro Devices, Inc. >>> + * >>> + * Permission is hereby granted, free of charge, to any person >>> obtaining a >>> + * copy of this software and associated documentation files (the >>> "Software"), >>> + * to deal in the Software without restriction, including without >>> limitation >>> + * the rights to use, copy, modify, merge, publish, distribute, >>> sublicense, >>> + * and/or sell copies of the Software, and to permit persons to >>> whom the >>> + * Software is furnished to do so, subject to the following >>> conditions: >>> + * >>> + * The above copyright notice and this permission notice shall be >>> included in >>> + * all copies or substantial portions of the Software. >>> + * >>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, >>> EXPRESS OR >>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF >>> MERCHANTABILITY, >>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO >>> EVENT SHALL >>> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, >>> DAMAGES OR >>> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR >>> OTHERWISE, >>> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE >>> USE OR >>> + * OTHER DEALINGS IN THE SOFTWARE. >>> + */ >>> + >>> +#include <linux/poll.h> >>> +#include <linux/wait.h> >>> +#include <linux/anon_inodes.h> >>> +#include <uapi/linux/kfd_ioctl.h> >>> +#include "amdgpu_vm.h" >>> +#include "kfd_priv.h" >>> +#include "kfd_smi_events.h" >>> + >>> +static DEFINE_MUTEX(kfd_smi_mutex); >>> +struct mutex *kfd_get_smi_mutex(void) >>> +{ >>> + return &kfd_smi_mutex; >>> +} >> >> Why do you need a function for this. Just reference kfd_smi_mutex >> directly below. But since the fifo is per device, should there also >> be one mutex per device? >> > I'm not sure how to create mutex for smi_event so I referenced the > debugger code. >>> + >>> +static __poll_t kfd_smi_ev_poll(struct file *, struct >>> poll_table_struct *); >>> +static ssize_t kfd_smi_ev_read(struct file *, char __user *, >>> size_t, loff_t *); >>> +static int kfd_smi_ev_release(struct inode *, struct file *); >>> + >>> +static uint32_t smi_fifo_len; >> >> This could be statically initialized rather than doing it in a >> function below because this can be calculated at compile time. It's >> also constant: >> >> static const uint32_t smi_fifo_len = 32 * (sizeof(struct >> kfd_smi_msg_vmfault)/sizeof(uint32_t) + 1); >> >> Or you could probably even make is a #define. >> > True. I'll change it. >> >>> +static const char kfd_smi_name[] = "kfd_smi_ev"; >>> + >>> +static const struct file_operations kfd_smi_ev_fops = { >>> + .owner = THIS_MODULE, >>> + .poll = kfd_smi_ev_poll, >>> + .read = kfd_smi_ev_read, >>> + .release = kfd_smi_ev_release >>> +}; >>> + >>> +static __poll_t kfd_smi_ev_poll(struct file *filep, >>> + struct poll_table_struct *wait) >>> +{ >>> + __poll_t mask; >>> + struct kfd_dev *dev = filep->private_data; >>> + >>> + poll_wait(filep, &dev->smi_events.wait_queue, wait); >>> + mask = kfifo_is_empty(&dev->smi_events.fifo) ? 0: POLLIN | >>> POLLRDNORM; >>> + >>> + return mask; >>> +} >>> + >>> +static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user, >>> + size_t size, loff_t *offset) >>> +{ >>> + int ret, copied = 0; >>> + struct kfd_dev *dev = filep->private_data; >>> + >>> + mutex_lock(kfd_get_smi_mutex()); >>> + ret = kfifo_to_user(&dev->smi_events.fifo, user, size, &copied); >>> + mutex_unlock(kfd_get_smi_mutex()); >>> + if (ret || !copied) { >>> + pr_debug("smi-events: fail to send msg (%i) (%i)\n", >>> + ret, copied); >>> + return ret ? ret : -EAGAIN; >>> + } >>> + >>> + return copied; >>> +} >>> + >>> +static int kfd_smi_ev_release(struct inode *inode, struct file *filep) >>> +{ >>> + struct kfd_dev *dev = filep->private_data; >>> + >>> + dev->smi_events.events = 0; >>> + kfifo_free(&dev->smi_events.fifo); >> >> This can cause race conditions. You should do this under the fifo >> mutex. Also set dev->smi_events.registered = false to allow a new >> registration after this. >> > Good catch. I'll add them. >>> + >>> + return 0; >>> +} >>> + >>> +void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid) >>> +{ >>> + struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd; >>> + struct amdgpu_task_info task_info; >>> + struct kfd_smi_msg_vmfault msg; >>> + uint32_t *fifo_in = (uint32_t *)&msg; >>> + uint32_t fifo_in_len = sizeof(msg) / sizeof(uint32_t); >>> + >>> + if (!(dev->smi_events.events & KFD_SMI_EVENT_VMFAULT)) >>> + return; >> >> You need to do this while holding the fifo mutex, otherwise the fifo >> can be destroyed before you access it below. >> > Ok >> >>> + >>> + amdgpu_vm_get_task_info(adev, pasid, &task_info); >>> + msg.pid = task_info.pid; >>> + strncpy(msg.task_name, task_info.task_name, 16); >>> + >>> + mutex_lock(kfd_get_smi_mutex()); >>> + if ((fifo_in_len + 1) > >>> + (smi_fifo_len - kfifo_len(&dev->smi_events.fifo))) { >> >> You could use kfifo_avail to get the space available in the fifo. >> > Ah. Thanks. >> >>> + pr_err("smi_event: no space left for vmfault event\n"); >>> + mutex_unlock(kfd_get_smi_mutex()); >>> + return; >>> + } >>> + /* Always send the event type first to function as a header */ >>> + kfifo_put(&dev->smi_events.fifo, KFD_SMI_EVENT_VMFAULT); >>> + /* Then the msg as the payload. Event type reveals the payload >>> size. */ >>> + kfifo_in(&dev->smi_events.fifo, fifo_in, fifo_in_len); >>> + mutex_unlock(kfd_get_smi_mutex()); >>> + >>> + wake_up_all(&dev->smi_events.wait_queue); >>> +} >>> + >>> +int kfd_smi_event_disable(struct kfd_dev *dev, uint64_t events) >>> +{ >>> + mutex_lock(kfd_get_smi_mutex()); >>> + dev->smi_events.events &= ~events; >>> + mutex_unlock(kfd_get_smi_mutex()); >>> + >>> + return 0; >>> +} >>> + >>> +int kfd_smi_event_enable(struct kfd_dev *dev, uint64_t events) >>> +{ >>> + /* If the user didn't register SMI events for this device, >>> kfifo is not >>> + * created to report events. >>> + */ >>> + if (!dev->smi_events.registered) >>> + return -EINVAL; >>> + >>> + mutex_lock(kfd_get_smi_mutex()); >>> + dev->smi_events.events |= events; >>> + mutex_unlock(kfd_get_smi_mutex()); >>> + >>> + return 0; >>> +} >>> + >>> +static void assign_fifo_len(void) >>> +{ >>> + /* When a new event is added into support and this new event >>> msg uses >>> + * more memory, replace the msg struct here. >>> + */ >>> + uint32_t max_msg = sizeof(struct >>> kfd_smi_msg_vmfault)/sizeof(uint32_t); >>> + /* +1 for the event type in front of event message. up to 32 >>> entries */ >>> + smi_fifo_len = (++max_msg) * 32; >> >> See above. This could be statically initialized. >> >> >>> +} >>> + >>> +int kfd_smi_event_register(struct kfd_dev *dev, int *fd) >>> +{ >>> + int ret = 0; >> >> This function should return failure if the event was already >> registered. You can only allow one FD to be created per device, >> because that FD owns the FIFO. >> >> This entire function must be under the fifo mutex. >> > Ok >> >>> + >>> + if (!smi_fifo_len) >>> + assign_fifo_len(); >>> + >>> + ret = kfifo_alloc(&dev->smi_events.fifo, smi_fifo_len, >>> GFP_KERNEL); >>> + if (ret) { >>> + pr_err("smi_event: fail to allocate kfifo\n"); >>> + return ret; >>> + } >>> + init_waitqueue_head(&dev->smi_events.wait_queue); >>> + dev->smi_events.events = 0; >>> + dev->smi_events.registered = true; >>> + >>> + ret = anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops, >>> + (void *)dev, 0); >>> + *fd = ret; >>> + >>> + return ret; >>> +} >>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h >>> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h >>> new file mode 100644 >>> index 0000000..e10bb5d >>> --- /dev/null >>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h >>> @@ -0,0 +1,31 @@ >>> +/* >>> + * Copyright 2020 Advanced Micro Devices, Inc. >>> + * >>> + * Permission is hereby granted, free of charge, to any person >>> obtaining a >>> + * copy of this software and associated documentation files (the >>> "Software"), >>> + * to deal in the Software without restriction, including without >>> limitation >>> + * the rights to use, copy, modify, merge, publish, distribute, >>> sublicense, >>> + * and/or sell copies of the Software, and to permit persons to >>> whom the >>> + * Software is furnished to do so, subject to the following >>> conditions: >>> + * >>> + * The above copyright notice and this permission notice shall be >>> included in >>> + * all copies or substantial portions of the Software. >>> + * >>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, >>> EXPRESS OR >>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF >>> MERCHANTABILITY, >>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO >>> EVENT SHALL >>> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, >>> DAMAGES OR >>> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR >>> OTHERWISE, >>> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE >>> USE OR >>> + * OTHER DEALINGS IN THE SOFTWARE. >>> + */ >>> + >>> +#ifndef KFD_SMI_EVENTS_H_INCLUDED >>> +#define KFD_SMI_EVENTS_H_INCLUDED >>> + >>> +int kfd_smi_event_register(struct kfd_dev *dev, int *fd); >>> +int kfd_smi_event_enable(struct kfd_dev *dev, uint64_t events); >>> +int kfd_smi_event_disable(struct kfd_dev *dev, uint64_t events); >>> +void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t >>> pasid); >>> + >>> +#endif >>> diff --git a/include/uapi/linux/kfd_ioctl.h >>> b/include/uapi/linux/kfd_ioctl.h >>> index 4f66764..dc9309e 100644 >>> --- a/include/uapi/linux/kfd_ioctl.h >>> +++ b/include/uapi/linux/kfd_ioctl.h >>> @@ -442,6 +442,31 @@ struct kfd_ioctl_import_dmabuf_args { >>> __u32 dmabuf_fd; /* to KFD */ >>> }; >>> +/* >>> + * KFD SMI(System Management Interface) events >>> + */ >>> +enum kfd_smi_events_op { >>> + KFD_SMI_EVENTS_REGISTER = 1, >>> + KFD_SMI_EVENTS_ENABLE, >>> + KFD_SMI_EVENTS_DISABLE >>> +}; >>> + >>> +/* Event type (defined by bitmask) */ >>> +#define KFD_SMI_EVENT_VMFAULT 0x00000001 >> >> Since you can only have 64 events, you only need one byte. You have >> plenty of space in the header dword to add the message size. That >> would make it easier to parse the events and allow variable size >> events in the future. Then you should define the header as a struct. >> Something like this: >> >> struct kfd_smi_msg_header { >> __u8 type; >> __u8 pad; >> __u16 size; >> }; >> > It sounds a good idea, but I don't see how it improves the parsing... > The user reads the content based on the event type. My test code on > the user space is like this: > > read(fd, &event, sizeof(uint32_t)); //will change uint64_t > > switch (event) { > > case event_A: > > read(fd, &struct_event_A_variable, sizeof(struct event_A)); > > case event_B: > > read(fd, &struct_event_B_variable, sizeof(struct event_B)); > > Not exactly like that, such as read() is done in a separate function, > but you see the idea. > > This is what I mean by the event type tells the size itself. If I > provide the header you suggested, the user still needs to decide which > struct to use.... Unless we don't use struct for the message. I can > change the output to become event:description_of_the event where > description is a string. The header you suggest will apply to this > method pretty well. > As stated in the beginning, I'll use header+description where description is a string whose size rounds up to power of 4 (sizeof(uint32_t)). >> >>> + >>> +struct kfd_ioctl_smi_events_args { >>> + __u32 op; /* to KFD */ >>> + __u32 events; /* to KFD */ >> >> I thought this should be 64-bit. >> >> >>> + __u32 gpu_id; /* to KFD */ >>> + __u32 data; /* from KFD */ >>> +}; >>> + >>> +/* Message to user space for each event */ >>> +struct kfd_smi_msg_vmfault { >> >> If you define the header like suggested above, you could include it >> here and at the start of all message structures. Might simplify the >> code a little: >> >> struct kfd_smi_msg_header header; >> >> Regards, >> Felix >> >>> + uint32_t pid; >>> + char task_name[16]; >>> +}; >>> + >>> /* Register offset inside the remapped mmio page >>> */ >>> enum kfd_mmio_remap { >>> @@ -546,7 +571,10 @@ enum kfd_mmio_remap { >>> #define AMDKFD_IOC_ALLOC_QUEUE_GWS \ >>> AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args) >>> +#define AMDKFD_IOC_SMI_EVENTS \ >>> + AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args) >>> + >>> #define AMDKFD_COMMAND_START 0x01 >>> -#define AMDKFD_COMMAND_END 0x1F >>> +#define AMDKFD_COMMAND_END 0x20 >>> #endif _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply [flat|nested] 5+ messages in thread
* RE: [PATCH v2] drm/amdkfd: Provide SMI events watch 2020-04-03 15:38 ` Amber Lin @ 2020-04-03 19:01 ` Kuehling, Felix 0 siblings, 0 replies; 5+ messages in thread From: Kuehling, Felix @ 2020-04-03 19:01 UTC (permalink / raw) To: Lin, Amber, amd-gfx@lists.freedesktop.org [AMD Official Use Only - Internal Distribution Only] So are you saying you'll make the event descriptions text rather than binary? If you switch to a text format, I wouldn't use a binary header. Rather I'd make it a text format completely. You could use one line per event, that makes it easy to use something like fgets to read a line (event) at a time in user mode. Each line could still start with an event identifier, but it would be text rather than a binary. And you don’t need the size if you define "\n" as delimiter between events. Regards, Felix -----Original Message----- From: Lin, Amber <Amber.Lin@amd.com> Sent: Friday, April 3, 2020 11:38 To: Kuehling, Felix <Felix.Kuehling@amd.com>; amd-gfx@lists.freedesktop.org Subject: Re: [PATCH v2] drm/amdkfd: Provide SMI events watch Further thinking about it, I'll use struct kfd_smi_msg_header. Instead of using struct kfd_smi_msg_vmfault, it's a description about the event. This way we make it generic to all events. On 2020-04-03 9:38 a.m., Amber Lin wrote: > Thanks Felix. I'll make changes accordingly but please pay attention > to my last reply inline. > > On 2020-04-02 7:51 p.m., Felix Kuehling wrote: >> On 2020-04-02 4:46 p.m., Amber Lin wrote: >>> When the compute is malfunctioning or performance drops, the system >>> admin will use SMI (System Management Interface) tool to >>> monitor/diagnostic what went wrong. This patch provides an event >>> watch interface for the user space to register events they are >>> interested. After the event is registered, the user can use >>> annoymous file descriptor's poll function with wait-time specified >>> to wait for the event to happen. Once the event happens, the user >>> can use read() to retrieve information related to the event. >>> >>> VM fault event is done in this patch. >>> >>> v2: - remove UNREGISTER and add event ENABLE/DISABLE >>> - correct kfifo usage >>> - move event message API to kfd_ioctl.h >>> >>> Signed-off-by: Amber Lin <Amber.Lin@amd.com> >>> --- >>> drivers/gpu/drm/amd/amdkfd/Makefile | 3 +- >>> drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 2 + >>> drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 30 ++++ >>> drivers/gpu/drm/amd/amdkfd/kfd_device.c | 1 + >>> drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 2 + >>> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 12 ++ >>> drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 177 >>> +++++++++++++++++++++++ >>> drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 31 ++++ >>> include/uapi/linux/kfd_ioctl.h | 30 +++- >>> 9 files changed, 286 insertions(+), 2 deletions(-) >>> create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c >>> create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h >>> >>> diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile >>> b/drivers/gpu/drm/amd/amdkfd/Makefile >>> index 6147462..cc98b4a 100644 >>> --- a/drivers/gpu/drm/amd/amdkfd/Makefile >>> +++ b/drivers/gpu/drm/amd/amdkfd/Makefile >>> @@ -53,7 +53,8 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ >>> $(AMDKFD_PATH)/kfd_int_process_v9.o \ >>> $(AMDKFD_PATH)/kfd_dbgdev.o \ >>> $(AMDKFD_PATH)/kfd_dbgmgr.o \ >>> - $(AMDKFD_PATH)/kfd_crat.o >>> + $(AMDKFD_PATH)/kfd_crat.o \ >>> + $(AMDKFD_PATH)/kfd_smi_events.o >>> ifneq ($(CONFIG_AMD_IOMMU_V2),) >>> AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o diff --git >>> a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c >>> b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c >>> index 9f59ba9..24b4717 100644 >>> --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c >>> +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c >>> @@ -24,6 +24,7 @@ >>> #include "kfd_events.h" >>> #include "cik_int.h" >>> #include "amdgpu_amdkfd.h" >>> +#include "kfd_smi_events.h" >>> static bool cik_event_interrupt_isr(struct kfd_dev *dev, >>> const uint32_t *ih_ring_entry, @@ -107,6 >>> +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, >>> ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { >>> struct kfd_vm_fault_info info; >>> + kfd_smi_event_update_vmfault(dev, pasid); >>> kfd_process_vm_fault(dev->dqm, pasid); >>> memset(&info, 0, sizeof(info)); diff --git >>> a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c >>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c >>> index f8fa03a..591ac28 100644 >>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c >>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c >>> @@ -39,6 +39,7 @@ >>> #include "kfd_device_queue_manager.h" >>> #include "kfd_dbgmgr.h" >>> #include "amdgpu_amdkfd.h" >>> +#include "kfd_smi_events.h" >>> static long kfd_ioctl(struct file *, unsigned int, unsigned >>> long); >>> static int kfd_open(struct inode *, struct file *); @@ -1243,6 >>> +1244,32 @@ static int kfd_ioctl_acquire_vm(struct file *filep, >>> struct kfd_process *p, >>> return ret; >>> } >>> +/* Handle requests for watching SMI events */ >>> +static int kfd_ioctl_smi_events(struct file *filep, >>> + struct kfd_process *p, void *data) { >>> + struct kfd_ioctl_smi_events_args *args = data; >>> + struct kfd_dev *dev; >>> + >>> + dev = kfd_device_by_id(args->gpu_id); >>> + if (!dev) >>> + return -EINVAL; >>> + >>> + switch (args->op) { >>> + case KFD_SMI_EVENTS_REGISTER: >>> + /* register the device */ >>> + return kfd_smi_event_register(dev, &args->data); >>> + case KFD_SMI_EVENTS_ENABLE: >>> + /* subscribe events to the device */ >>> + return kfd_smi_event_enable(dev, args->events); >>> + case KFD_SMI_EVENTS_DISABLE: >>> + /* unsubscribe events */ >>> + return kfd_smi_event_disable(dev, args->events); >>> + } >>> + >>> + return -EINVAL; >>> +} >>> + >>> bool kfd_dev_is_large_bar(struct kfd_dev *dev) >>> { >>> struct kfd_local_mem_info mem_info; @@ -1827,6 +1854,9 @@ >>> static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { >>> AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS, >>> kfd_ioctl_alloc_queue_gws, 0), >>> + >>> + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS, >>> + kfd_ioctl_smi_events, 0), >>> }; >>> #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) diff >>> --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c >>> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c >>> index 0491ab2..6ac6f31 100644 >>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c >>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c >>> @@ -532,6 +532,7 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev >>> *kgd, >>> kfd->device_info = device_info; >>> kfd->pdev = pdev; >>> kfd->init_complete = false; >>> + kfd->smi_events.registered = false; >>> kfd->kfd2kgd = f2g; >>> atomic_set(&kfd->compute_profile, 0); >>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c >>> b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c >>> index e05d75e..151e83e 100644 >>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c >>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c >>> @@ -24,6 +24,7 @@ >>> #include "kfd_events.h" >>> #include "soc15_int.h" >>> #include "kfd_device_queue_manager.h" >>> +#include "kfd_smi_events.h" >>> static bool event_interrupt_isr_v9(struct kfd_dev *dev, >>> const uint32_t *ih_ring_entry, @@ -117,6 >>> +118,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev, >>> info.prot_read = ring_id & 0x10; >>> info.prot_write = ring_id & 0x20; >>> + kfd_smi_event_update_vmfault(dev, pasid); >>> kfd_process_vm_fault(dev->dqm, pasid); >>> kfd_signal_vm_fault_event(dev, pasid, &info); >>> } >>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>> index 43b888b..b10b665 100644 >>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>> @@ -223,6 +223,15 @@ struct kfd_vmid_info { >>> uint32_t vmid_num_kfd; >>> }; >>> +struct kfd_smi_events { >>> + /* device is registered to watch events */ >>> + bool registered; >>> + /* events enabled */ >>> + uint32_t events; >> >> This should be 64-bit to allow for more future expansion. It doesn't >> matter as much here, because you can always change the internal >> header later. But in the ioctl API we can't change it later, so we >> should define it as 64-bit from the start. >> > Ok, I'll change back to 64 bits. I was thinking to use "data" as group > if we have to run out of bits anyway, but with further thinking, we > can squish several events into one since it's event "type", not "ID". > 64 will be quite sufficient and no need to worry about running out of it. Ignore what I wrote about squishing events. I was thinking the output format. >> >>> + DECLARE_KFIFO_PTR(fifo, uint32_t); >>> + wait_queue_head_t wait_queue; >>> +}; >>> + >>> struct kfd_dev { >>> struct kgd_dev *kgd; >>> @@ -309,6 +318,9 @@ struct kfd_dev { >>> /* Global GWS resource shared b/t processes*/ >>> void *gws; >>> + >>> + /* if this device is in SMI events watch */ >>> + struct kfd_smi_events smi_events; >>> }; >>> enum kfd_mempool { >>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c >>> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c >>> new file mode 100644 >>> index 0000000..aab4dac >>> --- /dev/null >>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c >>> @@ -0,0 +1,177 @@ >>> +/* >>> + * Copyright 2020 Advanced Micro Devices, Inc. >>> + * >>> + * Permission is hereby granted, free of charge, to any person >>> obtaining a >>> + * copy of this software and associated documentation files (the >>> "Software"), >>> + * to deal in the Software without restriction, including without >>> limitation >>> + * the rights to use, copy, modify, merge, publish, distribute, >>> sublicense, >>> + * and/or sell copies of the Software, and to permit persons to >>> whom the >>> + * Software is furnished to do so, subject to the following >>> conditions: >>> + * >>> + * The above copyright notice and this permission notice shall be >>> included in >>> + * all copies or substantial portions of the Software. >>> + * >>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, >>> EXPRESS OR >>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF >>> MERCHANTABILITY, >>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO >>> EVENT SHALL >>> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, >>> DAMAGES OR >>> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR >>> OTHERWISE, >>> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE >>> USE OR >>> + * OTHER DEALINGS IN THE SOFTWARE. >>> + */ >>> + >>> +#include <linux/poll.h> >>> +#include <linux/wait.h> >>> +#include <linux/anon_inodes.h> >>> +#include <uapi/linux/kfd_ioctl.h> >>> +#include "amdgpu_vm.h" >>> +#include "kfd_priv.h" >>> +#include "kfd_smi_events.h" >>> + >>> +static DEFINE_MUTEX(kfd_smi_mutex); struct mutex >>> +*kfd_get_smi_mutex(void) { >>> + return &kfd_smi_mutex; >>> +} >> >> Why do you need a function for this. Just reference kfd_smi_mutex >> directly below. But since the fifo is per device, should there also >> be one mutex per device? >> > I'm not sure how to create mutex for smi_event so I referenced the > debugger code. >>> + >>> +static __poll_t kfd_smi_ev_poll(struct file *, struct >>> poll_table_struct *); >>> +static ssize_t kfd_smi_ev_read(struct file *, char __user *, >>> size_t, loff_t *); >>> +static int kfd_smi_ev_release(struct inode *, struct file *); >>> + >>> +static uint32_t smi_fifo_len; >> >> This could be statically initialized rather than doing it in a >> function below because this can be calculated at compile time. It's >> also constant: >> >> static const uint32_t smi_fifo_len = 32 * (sizeof(struct >> kfd_smi_msg_vmfault)/sizeof(uint32_t) + 1); >> >> Or you could probably even make is a #define. >> > True. I'll change it. >> >>> +static const char kfd_smi_name[] = "kfd_smi_ev"; >>> + >>> +static const struct file_operations kfd_smi_ev_fops = { >>> + .owner = THIS_MODULE, >>> + .poll = kfd_smi_ev_poll, >>> + .read = kfd_smi_ev_read, >>> + .release = kfd_smi_ev_release >>> +}; >>> + >>> +static __poll_t kfd_smi_ev_poll(struct file *filep, >>> + struct poll_table_struct *wait) { >>> + __poll_t mask; >>> + struct kfd_dev *dev = filep->private_data; >>> + >>> + poll_wait(filep, &dev->smi_events.wait_queue, wait); >>> + mask = kfifo_is_empty(&dev->smi_events.fifo) ? 0: POLLIN | >>> POLLRDNORM; >>> + >>> + return mask; >>> +} >>> + >>> +static ssize_t kfd_smi_ev_read(struct file *filep, char __user >>> +*user, >>> + size_t size, loff_t *offset) { >>> + int ret, copied = 0; >>> + struct kfd_dev *dev = filep->private_data; >>> + >>> + mutex_lock(kfd_get_smi_mutex()); >>> + ret = kfifo_to_user(&dev->smi_events.fifo, user, size, >>> +&copied); >>> + mutex_unlock(kfd_get_smi_mutex()); >>> + if (ret || !copied) { >>> + pr_debug("smi-events: fail to send msg (%i) (%i)\n", >>> + ret, copied); >>> + return ret ? ret : -EAGAIN; >>> + } >>> + >>> + return copied; >>> +} >>> + >>> +static int kfd_smi_ev_release(struct inode *inode, struct file >>> +*filep) { >>> + struct kfd_dev *dev = filep->private_data; >>> + >>> + dev->smi_events.events = 0; >>> + kfifo_free(&dev->smi_events.fifo); >> >> This can cause race conditions. You should do this under the fifo >> mutex. Also set dev->smi_events.registered = false to allow a new >> registration after this. >> > Good catch. I'll add them. >>> + >>> + return 0; >>> +} >>> + >>> +void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t >>> +pasid) { >>> + struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd; >>> + struct amdgpu_task_info task_info; >>> + struct kfd_smi_msg_vmfault msg; >>> + uint32_t *fifo_in = (uint32_t *)&msg; >>> + uint32_t fifo_in_len = sizeof(msg) / sizeof(uint32_t); >>> + >>> + if (!(dev->smi_events.events & KFD_SMI_EVENT_VMFAULT)) >>> + return; >> >> You need to do this while holding the fifo mutex, otherwise the fifo >> can be destroyed before you access it below. >> > Ok >> >>> + >>> + amdgpu_vm_get_task_info(adev, pasid, &task_info); >>> + msg.pid = task_info.pid; >>> + strncpy(msg.task_name, task_info.task_name, 16); >>> + >>> + mutex_lock(kfd_get_smi_mutex()); >>> + if ((fifo_in_len + 1) > >>> + (smi_fifo_len - kfifo_len(&dev->smi_events.fifo))) { >> >> You could use kfifo_avail to get the space available in the fifo. >> > Ah. Thanks. >> >>> + pr_err("smi_event: no space left for vmfault event\n"); >>> + mutex_unlock(kfd_get_smi_mutex()); >>> + return; >>> + } >>> + /* Always send the event type first to function as a header */ >>> + kfifo_put(&dev->smi_events.fifo, KFD_SMI_EVENT_VMFAULT); >>> + /* Then the msg as the payload. Event type reveals the payload >>> size. */ >>> + kfifo_in(&dev->smi_events.fifo, fifo_in, fifo_in_len); >>> + mutex_unlock(kfd_get_smi_mutex()); >>> + >>> + wake_up_all(&dev->smi_events.wait_queue); >>> +} >>> + >>> +int kfd_smi_event_disable(struct kfd_dev *dev, uint64_t events) { >>> + mutex_lock(kfd_get_smi_mutex()); >>> + dev->smi_events.events &= ~events; >>> + mutex_unlock(kfd_get_smi_mutex()); >>> + >>> + return 0; >>> +} >>> + >>> +int kfd_smi_event_enable(struct kfd_dev *dev, uint64_t events) { >>> + /* If the user didn't register SMI events for this device, >>> kfifo is not >>> + * created to report events. >>> + */ >>> + if (!dev->smi_events.registered) >>> + return -EINVAL; >>> + >>> + mutex_lock(kfd_get_smi_mutex()); >>> + dev->smi_events.events |= events; >>> + mutex_unlock(kfd_get_smi_mutex()); >>> + >>> + return 0; >>> +} >>> + >>> +static void assign_fifo_len(void) >>> +{ >>> + /* When a new event is added into support and this new event >>> msg uses >>> + * more memory, replace the msg struct here. >>> + */ >>> + uint32_t max_msg = sizeof(struct >>> kfd_smi_msg_vmfault)/sizeof(uint32_t); >>> + /* +1 for the event type in front of event message. up to 32 >>> entries */ >>> + smi_fifo_len = (++max_msg) * 32; >> >> See above. This could be statically initialized. >> >> >>> +} >>> + >>> +int kfd_smi_event_register(struct kfd_dev *dev, int *fd) { >>> + int ret = 0; >> >> This function should return failure if the event was already >> registered. You can only allow one FD to be created per device, >> because that FD owns the FIFO. >> >> This entire function must be under the fifo mutex. >> > Ok >> >>> + >>> + if (!smi_fifo_len) >>> + assign_fifo_len(); >>> + >>> + ret = kfifo_alloc(&dev->smi_events.fifo, smi_fifo_len, >>> GFP_KERNEL); >>> + if (ret) { >>> + pr_err("smi_event: fail to allocate kfifo\n"); >>> + return ret; >>> + } >>> + init_waitqueue_head(&dev->smi_events.wait_queue); >>> + dev->smi_events.events = 0; >>> + dev->smi_events.registered = true; >>> + >>> + ret = anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops, >>> + (void *)dev, 0); >>> + *fd = ret; >>> + >>> + return ret; >>> +} >>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h >>> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h >>> new file mode 100644 >>> index 0000000..e10bb5d >>> --- /dev/null >>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h >>> @@ -0,0 +1,31 @@ >>> +/* >>> + * Copyright 2020 Advanced Micro Devices, Inc. >>> + * >>> + * Permission is hereby granted, free of charge, to any person >>> obtaining a >>> + * copy of this software and associated documentation files (the >>> "Software"), >>> + * to deal in the Software without restriction, including without >>> limitation >>> + * the rights to use, copy, modify, merge, publish, distribute, >>> sublicense, >>> + * and/or sell copies of the Software, and to permit persons to >>> whom the >>> + * Software is furnished to do so, subject to the following >>> conditions: >>> + * >>> + * The above copyright notice and this permission notice shall be >>> included in >>> + * all copies or substantial portions of the Software. >>> + * >>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, >>> EXPRESS OR >>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF >>> MERCHANTABILITY, >>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO >>> EVENT SHALL >>> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, >>> DAMAGES OR >>> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR >>> OTHERWISE, >>> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE >>> USE OR >>> + * OTHER DEALINGS IN THE SOFTWARE. >>> + */ >>> + >>> +#ifndef KFD_SMI_EVENTS_H_INCLUDED >>> +#define KFD_SMI_EVENTS_H_INCLUDED >>> + >>> +int kfd_smi_event_register(struct kfd_dev *dev, int *fd); int >>> +kfd_smi_event_enable(struct kfd_dev *dev, uint64_t events); int >>> +kfd_smi_event_disable(struct kfd_dev *dev, uint64_t events); void >>> +kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t >>> pasid); >>> + >>> +#endif >>> diff --git a/include/uapi/linux/kfd_ioctl.h >>> b/include/uapi/linux/kfd_ioctl.h index 4f66764..dc9309e 100644 >>> --- a/include/uapi/linux/kfd_ioctl.h >>> +++ b/include/uapi/linux/kfd_ioctl.h >>> @@ -442,6 +442,31 @@ struct kfd_ioctl_import_dmabuf_args { >>> __u32 dmabuf_fd; /* to KFD */ >>> }; >>> +/* >>> + * KFD SMI(System Management Interface) events */ enum >>> +kfd_smi_events_op { >>> + KFD_SMI_EVENTS_REGISTER = 1, >>> + KFD_SMI_EVENTS_ENABLE, >>> + KFD_SMI_EVENTS_DISABLE >>> +}; >>> + >>> +/* Event type (defined by bitmask) */ #define KFD_SMI_EVENT_VMFAULT >>> +0x00000001 >> >> Since you can only have 64 events, you only need one byte. You have >> plenty of space in the header dword to add the message size. That >> would make it easier to parse the events and allow variable size >> events in the future. Then you should define the header as a struct. >> Something like this: >> >> struct kfd_smi_msg_header { >> __u8 type; >> __u8 pad; >> __u16 size; >> }; >> > It sounds a good idea, but I don't see how it improves the parsing... > The user reads the content based on the event type. My test code on > the user space is like this: > > read(fd, &event, sizeof(uint32_t)); //will change uint64_t > > switch (event) { > > case event_A: > > read(fd, &struct_event_A_variable, sizeof(struct event_A)); > > case event_B: > > read(fd, &struct_event_B_variable, sizeof(struct event_B)); > > Not exactly like that, such as read() is done in a separate function, > but you see the idea. > > This is what I mean by the event type tells the size itself. If I > provide the header you suggested, the user still needs to decide which > struct to use.... Unless we don't use struct for the message. I can > change the output to become event:description_of_the event where > description is a string. The header you suggest will apply to this > method pretty well. > As stated in the beginning, I'll use header+description where description is a string whose size rounds up to power of 4 (sizeof(uint32_t)). >> >>> + >>> +struct kfd_ioctl_smi_events_args { >>> + __u32 op; /* to KFD */ >>> + __u32 events; /* to KFD */ >> >> I thought this should be 64-bit. >> >> >>> + __u32 gpu_id; /* to KFD */ >>> + __u32 data; /* from KFD */ >>> +}; >>> + >>> +/* Message to user space for each event */ struct >>> +kfd_smi_msg_vmfault { >> >> If you define the header like suggested above, you could include it >> here and at the start of all message structures. Might simplify the >> code a little: >> >> struct kfd_smi_msg_header header; >> >> Regards, >> Felix >> >>> + uint32_t pid; >>> + char task_name[16]; >>> +}; >>> + >>> /* Register offset inside the remapped mmio page >>> */ >>> enum kfd_mmio_remap { >>> @@ -546,7 +571,10 @@ enum kfd_mmio_remap { >>> #define AMDKFD_IOC_ALLOC_QUEUE_GWS \ >>> AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args) >>> +#define AMDKFD_IOC_SMI_EVENTS \ >>> + AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args) >>> + >>> #define AMDKFD_COMMAND_START 0x01 -#define >>> AMDKFD_COMMAND_END 0x1F >>> +#define AMDKFD_COMMAND_END 0x20 >>> #endif _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2020-04-03 19:01 UTC | newest] Thread overview: 5+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2020-04-02 20:46 [PATCH v2] drm/amdkfd: Provide SMI events watch Amber Lin 2020-04-02 23:51 ` Felix Kuehling 2020-04-03 13:38 ` Amber Lin 2020-04-03 15:38 ` Amber Lin 2020-04-03 19:01 ` Kuehling, Felix
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.